From da6d5fcd13af2adb6ce7961e06f85cf714fde7f5 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 1 Jul 2026 02:45:07 -0700
Subject: [PATCH] fix(auth): serialize Codex OAuth pool refresh under the
 auth-store lock (#56233)

The credential-pool Codex refresh path synced tokens from auth.json and
then POSTed the refresh_token to OpenAI's token endpoint without holding
the cross-process auth-store lock across the whole read->POST->write-back
sequence. Because Codex refresh tokens are single-use, two concurrent
Hermes processes could both adopt the same on-disk token and both POST
it; the loser got refresh_token_reused / invalid_grant.

Wrap the Codex OAuth branch of _refresh_entry in the existing shared
_auth_store_lock (reentrant, cross-process flock) using the same
extended-timeout pattern resolve_codex_runtime_credentials() already
uses. A waiting process now blocks on the lock and, once inside, the
in-lock re-sync picks up the rotated token the winner persisted and
skips its own POST. Also send User-Agent: hermes-cli/<version> on the
refresh request.

Credit @cooper-oai (#34820) for identifying the concurrent-refresh
reuse race; this ships the narrow lock-serialization fix without the
separate Codex auth-store partition.
---
 agent/credential_pool.py                      | 28 ++++++++
 hermes_cli/auth.py                            | 13 +++-
 ...test_credential_pool_oauth_writethrough.py | 68 +++++++++++++++++++
 3 files changed, 108 insertions(+), 1 deletion(-)
diff --git a/agent/credential_pool.py b/agent/credential_pool.py
index 6ff22e303..39de25202 100644
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -964,6 +964,34 @@ class CredentialPool:
                 self._mark_exhausted(entry, None)
             return None
 
+        # Codex OAuth refresh tokens are single-use.  The sync→POST→write-back
+        # sequence below must run atomically across Hermes processes: otherwise
+        # two processes can both adopt the same on-disk token, both POST it, and
+        # the loser gets ``refresh_token_reused``.  Serialize the whole sequence
+        # through the shared cross-process auth-store flock (the same lock and
+        # extended-timeout pattern used by resolve_codex_runtime_credentials()).
+        # When a waiter finally acquires the lock, the in-lock re-sync below
+        # picks up the rotated token the winner persisted and skips the POST.
+        if self.provider == "openai-codex":
+            refresh_timeout_seconds = auth_mod.env_float(
+                "HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", 20
+            )
+            lock_timeout = max(
+                float(auth_mod.AUTH_LOCK_TIMEOUT_SECONDS),
+                float(refresh_timeout_seconds) + 5.0,
+            )
+            with _auth_store_lock(timeout_seconds=lock_timeout):
+                synced = self._sync_codex_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
+                    if not force and not self._entry_needs_refresh(entry):
+                        return entry
+                return self._refresh_entry_impl(entry, force=force)
+        return self._refresh_entry_impl(entry, force=force)
+
+    def _refresh_entry_impl(
+        self, entry: PooledCredential, *, force: bool
+    ) -> Optional[PooledCredential]:
         try:
             if self.provider == "anthropic":
                 from agent.anthropic_adapter import refresh_anthropic_oauth_pure
diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py
index d53f2cbbf..7c1a0d269 100644
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -96,6 +96,11 @@ STEPFUN_STEP_PLAN_INTL_BASE_URL = "https://api.stepfun.ai/step_plan/v1"
 STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
 CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
 CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
+try:  # Version tag for the Codex token-endpoint User-Agent; fall back if unavailable.
+    from hermes_cli import __version__ as _HERMES_CLI_VERSION
+except Exception:  # pragma: no cover - version import should always succeed
+    _HERMES_CLI_VERSION = "unknown"
+CODEX_OAUTH_USER_AGENT = f"hermes-cli/{_HERMES_CLI_VERSION}"
 CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
 XAI_OAUTH_ISSUER = "https://auth.x.ai"
 XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
@@ -3608,7 +3613,13 @@ def refresh_codex_oauth_pure(
         )
 
     timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
-    with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
+    with httpx.Client(
+        timeout=timeout,
+        headers={
+            "Accept": "application/json",
+            "User-Agent": CODEX_OAUTH_USER_AGENT,
+        },
+    ) as client:
         response = client.post(
             CODEX_OAUTH_TOKEN_URL,
             headers={"Content-Type": "application/x-www-form-urlencoded"},
diff --git a/tests/agent/test_credential_pool_oauth_writethrough.py b/tests/agent/test_credential_pool_oauth_writethrough.py
index 819e304f4..f5379aef2 100644
--- a/tests/agent/test_credential_pool_oauth_writethrough.py
+++ b/tests/agent/test_credential_pool_oauth_writethrough.py
@@ -188,3 +188,71 @@ def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path):
     CP._write_through_provider_state_to_global_root(
         "openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}}
     )
+
+
+def test_codex_pool_refresh_holds_auth_store_lock_across_post(monkeypatch, tmp_path):
+    """The Codex OAuth pool refresh must POST under the cross-process auth lock.
+
+    Codex refresh tokens are single-use. If two Hermes processes both read the
+    same on-disk token and both POST it, the loser gets ``refresh_token_reused``.
+    Serializing the sync -> refresh POST -> write-back sequence through the
+    shared ``_auth_store_lock`` closes that window: a second process blocks on
+    the flock and, once inside, adopts the rotated token instead of re-POSTing.
+
+    This asserts the invariant directly — that ``refresh_codex_oauth_pure`` is
+    only ever called while the auth-store lock is held — rather than snapshotting
+    any token value.
+    """
+    provider = "openai-codex"
+    profile_path = tmp_path / "auth.json"
+    monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path)
+    monkeypatch.setattr(A, "_global_auth_file_path", lambda: None)
+    monkeypatch.setenv("HOME", str(tmp_path / "not-the-root"))
+
+    lock_held: dict = {"during_post": None}
+    real_lock = A._auth_store_lock
+
+    depth = {"n": 0}
+
+    import contextlib
+
+    @contextlib.contextmanager
+    def tracking_lock(*args, **kwargs):
+        depth["n"] += 1
+        try:
+            with real_lock(*args, **kwargs):
+                yield
+        finally:
+            depth["n"] -= 1
+
+    monkeypatch.setattr(A, "_auth_store_lock", tracking_lock)
+    # credential_pool imported _auth_store_lock by name; patch that binding too.
+    monkeypatch.setattr(CP, "_auth_store_lock", tracking_lock)
+
+    def fake_refresh(access_token, refresh_token, **kwargs):
+        # The POST to the token endpoint must happen with the lock held.
+        lock_held["during_post"] = depth["n"] > 0
+        return {
+            "access_token": "rotated-access",
+            "refresh_token": "rotated-refresh",
+            "last_refresh": "2020-01-02T00:00:00Z",
+        }
+
+    monkeypatch.setattr(A, "refresh_codex_oauth_pure", fake_refresh)
+
+    entry = _entry(
+        provider,
+        id="codex-1",
+        access_token="stale-access",
+        refresh_token="stale-refresh",
+    )
+    pool = CredentialPool(provider, [entry])
+
+    refreshed = pool._refresh_entry(entry, force=True)
+
+    assert refreshed is not None
+    assert refreshed.access_token == "rotated-access"
+    assert refreshed.refresh_token == "rotated-refresh"
+    # The invariant: the single-use token POST ran inside the auth-store lock.
+    assert lock_held["during_post"] is True
+