fix(auth): serialize Codex OAuth pool refresh under the auth-store lock (#56233)
The credential-pool Codex refresh path synced tokens from auth.json and then POSTed the refresh_token to OpenAI's token endpoint without holding the cross-process auth-store lock across the whole read->POST->write-back sequence. Because Codex refresh tokens are single-use, two concurrent Hermes processes could both adopt the same on-disk token and both POST it; the loser got refresh_token_reused / invalid_grant. Wrap the Codex OAuth branch of _refresh_entry in the existing shared _auth_store_lock (reentrant, cross-process flock) using the same extended-timeout pattern resolve_codex_runtime_credentials() already uses. A waiting process now blocks on the lock and, once inside, the in-lock re-sync picks up the rotated token the winner persisted and skips its own POST. Also send User-Agent: hermes-cli/<version> on the refresh request. Credit @cooper-oai (#34820) for identifying the concurrent-refresh reuse race; this ships the narrow lock-serialization fix without the separate Codex auth-store partition.
This commit is contained in:
parent
a8a97c358f
commit
da6d5fcd13
3 changed files with 108 additions and 1 deletions
|
|
@ -964,6 +964,34 @@ class CredentialPool:
|
|||
self._mark_exhausted(entry, None)
|
||||
return None
|
||||
|
||||
# Codex OAuth refresh tokens are single-use. The sync→POST→write-back
|
||||
# sequence below must run atomically across Hermes processes: otherwise
|
||||
# two processes can both adopt the same on-disk token, both POST it, and
|
||||
# the loser gets ``refresh_token_reused``. Serialize the whole sequence
|
||||
# through the shared cross-process auth-store flock (the same lock and
|
||||
# extended-timeout pattern used by resolve_codex_runtime_credentials()).
|
||||
# When a waiter finally acquires the lock, the in-lock re-sync below
|
||||
# picks up the rotated token the winner persisted and skips the POST.
|
||||
if self.provider == "openai-codex":
|
||||
refresh_timeout_seconds = auth_mod.env_float(
|
||||
"HERMES_CODEX_REFRESH_TIMEOUT_SECONDS", 20
|
||||
)
|
||||
lock_timeout = max(
|
||||
float(auth_mod.AUTH_LOCK_TIMEOUT_SECONDS),
|
||||
float(refresh_timeout_seconds) + 5.0,
|
||||
)
|
||||
with _auth_store_lock(timeout_seconds=lock_timeout):
|
||||
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
if not force and not self._entry_needs_refresh(entry):
|
||||
return entry
|
||||
return self._refresh_entry_impl(entry, force=force)
|
||||
return self._refresh_entry_impl(entry, force=force)
|
||||
|
||||
def _refresh_entry_impl(
|
||||
self, entry: PooledCredential, *, force: bool
|
||||
) -> Optional[PooledCredential]:
|
||||
try:
|
||||
if self.provider == "anthropic":
|
||||
from agent.anthropic_adapter import refresh_anthropic_oauth_pure
|
||||
|
|
|
|||
|
|
@ -96,6 +96,11 @@ STEPFUN_STEP_PLAN_INTL_BASE_URL = "https://api.stepfun.ai/step_plan/v1"
|
|||
STEPFUN_STEP_PLAN_CN_BASE_URL = "https://api.stepfun.com/step_plan/v1"
|
||||
CODEX_OAUTH_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
|
||||
CODEX_OAUTH_TOKEN_URL = "https://auth.openai.com/oauth/token"
|
||||
try: # Version tag for the Codex token-endpoint User-Agent; fall back if unavailable.
|
||||
from hermes_cli import __version__ as _HERMES_CLI_VERSION
|
||||
except Exception: # pragma: no cover - version import should always succeed
|
||||
_HERMES_CLI_VERSION = "unknown"
|
||||
CODEX_OAUTH_USER_AGENT = f"hermes-cli/{_HERMES_CLI_VERSION}"
|
||||
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 120
|
||||
XAI_OAUTH_ISSUER = "https://auth.x.ai"
|
||||
XAI_OAUTH_DISCOVERY_URL = f"{XAI_OAUTH_ISSUER}/.well-known/openid-configuration"
|
||||
|
|
@ -3608,7 +3613,13 @@ def refresh_codex_oauth_pure(
|
|||
)
|
||||
|
||||
timeout = httpx.Timeout(max(5.0, float(timeout_seconds)))
|
||||
with httpx.Client(timeout=timeout, headers={"Accept": "application/json"}) as client:
|
||||
with httpx.Client(
|
||||
timeout=timeout,
|
||||
headers={
|
||||
"Accept": "application/json",
|
||||
"User-Agent": CODEX_OAUTH_USER_AGENT,
|
||||
},
|
||||
) as client:
|
||||
response = client.post(
|
||||
CODEX_OAUTH_TOKEN_URL,
|
||||
headers={"Content-Type": "application/x-www-form-urlencoded"},
|
||||
|
|
|
|||
|
|
@ -188,3 +188,71 @@ def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path):
|
|||
CP._write_through_provider_state_to_global_root(
|
||||
"openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}}
|
||||
)
|
||||
|
||||
|
||||
def test_codex_pool_refresh_holds_auth_store_lock_across_post(monkeypatch, tmp_path):
|
||||
"""The Codex OAuth pool refresh must POST under the cross-process auth lock.
|
||||
|
||||
Codex refresh tokens are single-use. If two Hermes processes both read the
|
||||
same on-disk token and both POST it, the loser gets ``refresh_token_reused``.
|
||||
Serializing the sync -> refresh POST -> write-back sequence through the
|
||||
shared ``_auth_store_lock`` closes that window: a second process blocks on
|
||||
the flock and, once inside, adopts the rotated token instead of re-POSTing.
|
||||
|
||||
This asserts the invariant directly — that ``refresh_codex_oauth_pure`` is
|
||||
only ever called while the auth-store lock is held — rather than snapshotting
|
||||
any token value.
|
||||
"""
|
||||
provider = "openai-codex"
|
||||
profile_path = tmp_path / "auth.json"
|
||||
monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path)
|
||||
monkeypatch.setattr(A, "_global_auth_file_path", lambda: None)
|
||||
monkeypatch.setenv("HOME", str(tmp_path / "not-the-root"))
|
||||
|
||||
lock_held: dict = {"during_post": None}
|
||||
real_lock = A._auth_store_lock
|
||||
|
||||
depth = {"n": 0}
|
||||
|
||||
import contextlib
|
||||
|
||||
@contextlib.contextmanager
|
||||
def tracking_lock(*args, **kwargs):
|
||||
depth["n"] += 1
|
||||
try:
|
||||
with real_lock(*args, **kwargs):
|
||||
yield
|
||||
finally:
|
||||
depth["n"] -= 1
|
||||
|
||||
monkeypatch.setattr(A, "_auth_store_lock", tracking_lock)
|
||||
# credential_pool imported _auth_store_lock by name; patch that binding too.
|
||||
monkeypatch.setattr(CP, "_auth_store_lock", tracking_lock)
|
||||
|
||||
def fake_refresh(access_token, refresh_token, **kwargs):
|
||||
# The POST to the token endpoint must happen with the lock held.
|
||||
lock_held["during_post"] = depth["n"] > 0
|
||||
return {
|
||||
"access_token": "rotated-access",
|
||||
"refresh_token": "rotated-refresh",
|
||||
"last_refresh": "2020-01-02T00:00:00Z",
|
||||
}
|
||||
|
||||
monkeypatch.setattr(A, "refresh_codex_oauth_pure", fake_refresh)
|
||||
|
||||
entry = _entry(
|
||||
provider,
|
||||
id="codex-1",
|
||||
access_token="stale-access",
|
||||
refresh_token="stale-refresh",
|
||||
)
|
||||
pool = CredentialPool(provider, [entry])
|
||||
|
||||
refreshed = pool._refresh_entry(entry, force=True)
|
||||
|
||||
assert refreshed is not None
|
||||
assert refreshed.access_token == "rotated-access"
|
||||
assert refreshed.refresh_token == "rotated-refresh"
|
||||
# The invariant: the single-use token POST ran inside the auth-store lock.
|
||||
assert lock_held["during_post"] is True
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue