From da6d5fcd13af2adb6ce7961e06f85cf714fde7f5 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 1 Jul 2026 02:45:07 -0700 Subject: [PATCH] fix(auth): serialize Codex OAuth pool refresh under the auth-store lock (#56233) The credential-pool Codex refresh path synced tokens from auth.json and then POSTed the refresh_token to OpenAI's token endpoint without holding the cross-process auth-store lock across the whole read->POST->write-back sequence. Because Codex refresh tokens are single-use, two concurrent Hermes processes could both adopt the same on-disk token and both POST it; the loser got refresh_token_reused / invalid_grant. Wrap the Codex OAuth branch of _refresh_entry in the existing shared _auth_store_lock (reentrant, cross-process flock) using the same extended-timeout pattern resolve_codex_runtime_credentials() already uses. A waiting process now blocks on the lock and, once inside, the in-lock re-sync picks up the rotated token the winner persisted and skips its own POST. Also send User-Agent: hermes-cli/ on the refresh request. Credit @cooper-oai (#34820) for identifying the concurrent-refresh reuse race; this ships the narrow lock-serialization fix without the separate Codex auth-store partition. --- agent/credential_pool.py | 28 ++++++++ hermes_cli/auth.py | 13 +++- ...test_credential_pool_oauth_writethrough.py | 68 +++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) diff --git a/agent/credential_pool.py b/agent/credential_pool.py index 6ff22e303..39de25202 100644 --- a/agent/credential_pool.py +++ b/agent/credential_pool.py @@ -964,6 +964,34 @@ class CredentialPool: self._mark_exhausted(entry, None) return None + # Codex OAuth refresh tokens are single-use. The sync→POST→write-back + # sequence below must run atomically across Hermes processes: otherwise + # two processes can both adopt the same on-disk token, both POST it, and + # the loser gets ``refresh_token_reused``. Serialize the whole sequence + # through the shared cross-process auth-store flock (the same lock and + # extended-timeout pattern used by resolve_codex_runtime_credentials()). + # When a waiter finally acquires the lock, the in-lock re-sync below + # picks up the rotated token the winner persisted and skips the POST. + if self.provider == "openai-codex": + refresh_timeout_seconds = auth_mod.env_float( + "HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", 20 + ) + lock_timeout = max( + float(auth_mod.AUTH_LOCK_TIMEOUT_SECONDS), + float(refresh_timeout_seconds) + 5.0, + ) + with _auth_store_lock(timeout_seconds=lock_timeout): + synced = self._sync_codex_entry_from_auth_store(entry) + if synced is not entry: + entry = synced + if not force and not self._entry_needs_refresh(entry): + return entry + return self._refresh_entry_impl(entry, force=force) + return self._refresh_entry_impl(entry, force=force) + + def _refresh_entry_impl( + self, entry: PooledCredential, *, force: bool + ) -> Optional[PooledCredential]: try: if self.provider == "anthropic": from agent.anthropic_adapter import refresh_anthropic_oauth_pure diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index d53f2cbbf..7c1a0d269 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -96,6 +96,11 @@ STEPFUN_STEP_PLAN_INTL_BASE_URL = "https://api.stepfun.ai/step_plan/v1" STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1" CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann" CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token" +try: # Version tag for the Codex token-endpoint User-Agent; fall back if unavailable. + from hermes_cli import __version__ as _HERMES_CLI_VERSION +except Exception: # pragma: no cover - version import should always succeed + _HERMES_CLI_VERSION = "unknown" +CODEX_OAUTH_USER_AGENT = f"hermes-cli/{_HERMES_CLI_VERSION}" CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120 XAI_OAUTH_ISSUER = "https://auth.x.ai" XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration" @@ -3608,7 +3613,13 @@ def refresh_codex_oauth_pure( ) timeout = httpx.Timeout(max(5.0, float(timeout_seconds))) - with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client: + with httpx.Client( + timeout=timeout, + headers={ + "Accept": "application/json", + "User-Agent": CODEX_OAUTH_USER_AGENT, + }, + ) as client: response = client.post( CODEX_OAUTH_TOKEN_URL, headers={"Content-Type": "application/x-www-form-urlencoded"}, diff --git a/tests/agent/test_credential_pool_oauth_writethrough.py b/tests/agent/test_credential_pool_oauth_writethrough.py index 819e304f4..f5379aef2 100644 --- a/tests/agent/test_credential_pool_oauth_writethrough.py +++ b/tests/agent/test_credential_pool_oauth_writethrough.py @@ -188,3 +188,71 @@ def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path): CP._write_through_provider_state_to_global_root( "openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}} ) + + +def test_codex_pool_refresh_holds_auth_store_lock_across_post(monkeypatch, tmp_path): + """The Codex OAuth pool refresh must POST under the cross-process auth lock. + + Codex refresh tokens are single-use. If two Hermes processes both read the + same on-disk token and both POST it, the loser gets ``refresh_token_reused``. + Serializing the sync -> refresh POST -> write-back sequence through the + shared ``_auth_store_lock`` closes that window: a second process blocks on + the flock and, once inside, adopts the rotated token instead of re-POSTing. + + This asserts the invariant directly — that ``refresh_codex_oauth_pure`` is + only ever called while the auth-store lock is held — rather than snapshotting + any token value. + """ + provider = "openai-codex" + profile_path = tmp_path / "auth.json" + monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path) + monkeypatch.setattr(A, "_global_auth_file_path", lambda: None) + monkeypatch.setenv("HOME", str(tmp_path / "not-the-root")) + + lock_held: dict = {"during_post": None} + real_lock = A._auth_store_lock + + depth = {"n": 0} + + import contextlib + + @contextlib.contextmanager + def tracking_lock(*args, **kwargs): + depth["n"] += 1 + try: + with real_lock(*args, **kwargs): + yield + finally: + depth["n"] -= 1 + + monkeypatch.setattr(A, "_auth_store_lock", tracking_lock) + # credential_pool imported _auth_store_lock by name; patch that binding too. + monkeypatch.setattr(CP, "_auth_store_lock", tracking_lock) + + def fake_refresh(access_token, refresh_token, **kwargs): + # The POST to the token endpoint must happen with the lock held. + lock_held["during_post"] = depth["n"] > 0 + return { + "access_token": "rotated-access", + "refresh_token": "rotated-refresh", + "last_refresh": "2020-01-02T00:00:00Z", + } + + monkeypatch.setattr(A, "refresh_codex_oauth_pure", fake_refresh) + + entry = _entry( + provider, + id="codex-1", + access_token="stale-access", + refresh_token="stale-refresh", + ) + pool = CredentialPool(provider, [entry]) + + refreshed = pool._refresh_entry(entry, force=True) + + assert refreshed is not None + assert refreshed.access_token == "rotated-access" + assert refreshed.refresh_token == "rotated-refresh" + # The invariant: the single-use token POST ran inside the auth-store lock. + assert lock_held["during_post"] is True +