fix(auth): serialize Codex OAuth pool refresh under the auth-store lock (#56233)

The credential-pool Codex refresh path synced tokens from auth.json and then POSTed the refresh_token to OpenAI's token endpoint without holding the cross-process auth-store lock across the whole read->POST->write-back sequence. Because Codex refresh tokens are single-use, two concurrent Hermes processes could both adopt the same on-disk token and both POST it; the loser got refresh_token_reused / invalid_grant. Wrap the Codex OAuth branch of _refresh_entry in the existing shared _auth_store_lock (reentrant, cross-process flock) using the same extended-timeout pattern resolve_codex_runtime_credentials() already uses. A waiting process now blocks on the lock and, once inside, the in-lock re-sync picks up the rotated token the winner persisted and skips its own POST. Also send User-Agent: hermes-cli/<version> on the refresh request. Credit @cooper-oai (#34820) for identifying the concurrent-refresh reuse race; this ships the narrow lock-serialization fix without the separate Codex auth-store partition.
2026-07-01 02:45:07 -07:00 · 2026-07-01 02:45:07 -07:00 · da6d5fcd13
commit da6d5fcd13
parent a8a97c358f
3 changed files with 108 additions and 1 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@ -964,6 +964,34 @@ class CredentialPool:
                self._mark_exhausted(entry, None)
            return None

+        # Codex OAuth refresh tokens are single-use.  The sync→POST→write-back
+        # sequence below must run atomically across Hermes processes: otherwise
+        # two processes can both adopt the same on-disk token, both POST it, and
+        # the loser gets ``refresh_token_reused``.  Serialize the whole sequence
+        # through the shared cross-process auth-store flock (the same lock and
+        # extended-timeout pattern used by resolve_codex_runtime_credentials()).
+        # When a waiter finally acquires the lock, the in-lock re-sync below
+        # picks up the rotated token the winner persisted and skips the POST.
+        if self.provider == "openai-codex":
+            refresh_timeout_seconds = auth_mod.env_float(
+                "HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", 20
+            )
+            lock_timeout = max(
+                float(auth_mod.AUTH_LOCK_TIMEOUT_SECONDS),
+                float(refresh_timeout_seconds) + 5.0,
+            )
+            with _auth_store_lock(timeout_seconds=lock_timeout):
+                synced = self._sync_codex_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
+                    if not force and not self._entry_needs_refresh(entry):
+                        return entry
+                return self._refresh_entry_impl(entry, force=force)
+        return self._refresh_entry_impl(entry, force=force)
+
+    def _refresh_entry_impl(
+        self, entry: PooledCredential, *, force: bool
+    ) -> Optional[PooledCredential]:
        try:
            if self.provider == "anthropic":
                from agent.anthropic_adapter import refresh_anthropic_oauth_pure
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@ -96,6 +96,11 @@ STEPFUN_STEP_PLAN_INTL_BASE_URL = "https://api.stepfun.ai/step_plan/v1"
 STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
 CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
 CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
+try:  # Version tag for the Codex token-endpoint User-Agent; fall back if unavailable.
+    from hermes_cli import __version__ as _HERMES_CLI_VERSION
+except Exception:  # pragma: no cover - version import should always succeed
+    _HERMES_CLI_VERSION = "unknown"
+CODEX_OAUTH_USER_AGENT = f"hermes-cli/{_HERMES_CLI_VERSION}"
 CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
 XAI_OAUTH_ISSUER = "https://auth.x.ai"
 XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
@ -3608,7 +3613,13 @@ def refresh_codex_oauth_pure(
        )

    timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
-    with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
+    with httpx.Client(
+        timeout=timeout,
+        headers={
+            "Accept": "application/json",
+            "User-Agent": CODEX_OAUTH_USER_AGENT,
+        },
+    ) as client:
        response = client.post(
            CODEX_OAUTH_TOKEN_URL,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
--- a/tests/agent/test_credential_pool_oauth_writethrough.py
+++ b/tests/agent/test_credential_pool_oauth_writethrough.py
@ -188,3 +188,71 @@ def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path):
    CP._write_through_provider_state_to_global_root(
        "openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}}
    )
+
+
+def test_codex_pool_refresh_holds_auth_store_lock_across_post(monkeypatch, tmp_path):
+    """The Codex OAuth pool refresh must POST under the cross-process auth lock.
+
+    Codex refresh tokens are single-use. If two Hermes processes both read the
+    same on-disk token and both POST it, the loser gets ``refresh_token_reused``.
+    Serializing the sync -> refresh POST -> write-back sequence through the
+    shared ``_auth_store_lock`` closes that window: a second process blocks on
+    the flock and, once inside, adopts the rotated token instead of re-POSTing.
+
+    This asserts the invariant directly — that ``refresh_codex_oauth_pure`` is
+    only ever called while the auth-store lock is held — rather than snapshotting
+    any token value.
+    """
+    provider = "openai-codex"
+    profile_path = tmp_path / "auth.json"
+    monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path)
+    monkeypatch.setattr(A, "_global_auth_file_path", lambda: None)
+    monkeypatch.setenv("HOME", str(tmp_path / "not-the-root"))
+
+    lock_held: dict = {"during_post": None}
+    real_lock = A._auth_store_lock
+
+    depth = {"n": 0}
+
+    import contextlib
+
+    @contextlib.contextmanager
+    def tracking_lock(*args, **kwargs):
+        depth["n"] += 1
+        try:
+            with real_lock(*args, **kwargs):
+                yield
+        finally:
+            depth["n"] -= 1
+
+    monkeypatch.setattr(A, "_auth_store_lock", tracking_lock)
+    # credential_pool imported _auth_store_lock by name; patch that binding too.
+    monkeypatch.setattr(CP, "_auth_store_lock", tracking_lock)
+
+    def fake_refresh(access_token, refresh_token, **kwargs):
+        # The POST to the token endpoint must happen with the lock held.
+        lock_held["during_post"] = depth["n"] > 0
+        return {
+            "access_token": "rotated-access",
+            "refresh_token": "rotated-refresh",
+            "last_refresh": "2020-01-02T00:00:00Z",
+        }
+
+    monkeypatch.setattr(A, "refresh_codex_oauth_pure", fake_refresh)
+
+    entry = _entry(
+        provider,
+        id="codex-1",
+        access_token="stale-access",
+        refresh_token="stale-refresh",
+    )
+    pool = CredentialPool(provider, [entry])
+
+    refreshed = pool._refresh_entry(entry, force=True)
+
+    assert refreshed is not None
+    assert refreshed.access_token == "rotated-access"
+    assert refreshed.refresh_token == "rotated-refresh"
+    # The invariant: the single-use token POST ran inside the auth-store lock.
+    assert lock_held["during_post"] is True
+