fix(auth): serialize Codex OAuth pool refresh under the auth-store lock (#56233)

The credential-pool Codex refresh path synced tokens from auth.json and
then POSTed the refresh_token to OpenAI's token endpoint without holding
the cross-process auth-store lock across the whole read->POST->write-back
sequence. Because Codex refresh tokens are single-use, two concurrent
Hermes processes could both adopt the same on-disk token and both POST
it; the loser got refresh_token_reused / invalid_grant.

Wrap the Codex OAuth branch of _refresh_entry in the existing shared
_auth_store_lock (reentrant, cross-process flock) using the same
extended-timeout pattern resolve_codex_runtime_credentials() already
uses. A waiting process now blocks on the lock and, once inside, the
in-lock re-sync picks up the rotated token the winner persisted and
skips its own POST. Also send User-Agent: hermes-cli/<version> on the
refresh request.

Credit @cooper-oai (#34820) for identifying the concurrent-refresh
reuse race; this ships the narrow lock-serialization fix without the
separate Codex auth-store partition.
This commit is contained in:
Teknium 2026-07-01 02:45:07 -07:00 committed by GitHub
parent a8a97c358f
commit da6d5fcd13
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 108 additions and 1 deletions

View file

@ -964,6 +964,34 @@ class CredentialPool:
self._mark_exhausted(entry, None)
return None
# Codex OAuth refresh tokens are single-use. The sync→POST→write-back
# sequence below must run atomically across Hermes processes: otherwise
# two processes can both adopt the same on-disk token, both POST it, and
# the loser gets ``refresh_token_reused``. Serialize the whole sequence
# through the shared cross-process auth-store flock (the same lock and
# extended-timeout pattern used by resolve_codex_runtime_credentials()).
# When a waiter finally acquires the lock, the in-lock re-sync below
# picks up the rotated token the winner persisted and skips the POST.
if self.provider == "openai-codex":
refresh_timeout_seconds = auth_mod.env_float(
"HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", 20
)
lock_timeout = max(
float(auth_mod.AUTH_LOCK_TIMEOUT_SECONDS),
float(refresh_timeout_seconds) + 5.0,
)
with _auth_store_lock(timeout_seconds=lock_timeout):
synced = self._sync_codex_entry_from_auth_store(entry)
if synced is not entry:
entry = synced
if not force and not self._entry_needs_refresh(entry):
return entry
return self._refresh_entry_impl(entry, force=force)
return self._refresh_entry_impl(entry, force=force)
def _refresh_entry_impl(
self, entry: PooledCredential, *, force: bool
) -> Optional[PooledCredential]:
try:
if self.provider == "anthropic":
from agent.anthropic_adapter import refresh_anthropic_oauth_pure

View file

@ -96,6 +96,11 @@ STEPFUN_STEP_PLAN_INTL_BASE_URL = "https://api.stepfun.ai/step_plan/v1"
STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
try: # Version tag for the Codex token-endpoint User-Agent; fall back if unavailable.
from hermes_cli import __version__ as _HERMES_CLI_VERSION
except Exception: # pragma: no cover - version import should always succeed
_HERMES_CLI_VERSION = "unknown"
CODEX_OAUTH_USER_AGENT = f"hermes-cli/{_HERMES_CLI_VERSION}"
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
XAI_OAUTH_ISSUER = "https://auth.x.ai"
XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
@ -3608,7 +3613,13 @@ def refresh_codex_oauth_pure(
)
timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
with httpx.Client(
timeout=timeout,
headers={
"Accept": "application/json",
"User-Agent": CODEX_OAUTH_USER_AGENT,
},
) as client:
response = client.post(
CODEX_OAUTH_TOKEN_URL,
headers={"Content-Type": "application/x-www-form-urlencoded"},

View file

@ -188,3 +188,71 @@ def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path):
CP._write_through_provider_state_to_global_root(
"openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}}
)
def test_codex_pool_refresh_holds_auth_store_lock_across_post(monkeypatch, tmp_path):
"""The Codex OAuth pool refresh must POST under the cross-process auth lock.
Codex refresh tokens are single-use. If two Hermes processes both read the
same on-disk token and both POST it, the loser gets ``refresh_token_reused``.
Serializing the sync -> refresh POST -> write-back sequence through the
shared ``_auth_store_lock`` closes that window: a second process blocks on
the flock and, once inside, adopts the rotated token instead of re-POSTing.
This asserts the invariant directly that ``refresh_codex_oauth_pure`` is
only ever called while the auth-store lock is held rather than snapshotting
any token value.
"""
provider = "openai-codex"
profile_path = tmp_path / "auth.json"
monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path)
monkeypatch.setattr(A, "_global_auth_file_path", lambda: None)
monkeypatch.setenv("HOME", str(tmp_path / "not-the-root"))
lock_held: dict = {"during_post": None}
real_lock = A._auth_store_lock
depth = {"n": 0}
import contextlib
@contextlib.contextmanager
def tracking_lock(*args, **kwargs):
depth["n"] += 1
try:
with real_lock(*args, **kwargs):
yield
finally:
depth["n"] -= 1
monkeypatch.setattr(A, "_auth_store_lock", tracking_lock)
# credential_pool imported _auth_store_lock by name; patch that binding too.
monkeypatch.setattr(CP, "_auth_store_lock", tracking_lock)
def fake_refresh(access_token, refresh_token, **kwargs):
# The POST to the token endpoint must happen with the lock held.
lock_held["during_post"] = depth["n"] > 0
return {
"access_token": "rotated-access",
"refresh_token": "rotated-refresh",
"last_refresh": "2020-01-02T00:00:00Z",
}
monkeypatch.setattr(A, "refresh_codex_oauth_pure", fake_refresh)
entry = _entry(
provider,
id="codex-1",
access_token="stale-access",
refresh_token="stale-refresh",
)
pool = CredentialPool(provider, [entry])
refreshed = pool._refresh_entry(entry, force=True)
assert refreshed is not None
assert refreshed.access_token == "rotated-access"
assert refreshed.refresh_token == "rotated-refresh"
# The invariant: the single-use token POST ran inside the auth-store lock.
assert lock_held["during_post"] is True