fix: make Nous Portal access token resolution resilient

- Track auth store source path on Nous state reads and write rotated
  OAuth refresh tokens back to the same store, preventing stale-token
  replays when Hermes falls back to a global/root auth.json.
- Skip Nous fallback entries locally when no access/refresh token is
  present, suppressing repeated failed resolution attempts within a
  session.
- Sync session model metadata after fallback switches so the gateway
  DB reflects the backend that actually served the latest turn.
This commit is contained in:
HODLCLONE 2026-06-19 20:20:01 -04:00 committed by Teknium
parent cfbc7ed1f9
commit 6ed2f5d76f
4 changed files with 276 additions and 32 deletions

View file

@ -1124,6 +1124,35 @@ def rewrite_prompt_model_identity(agent, model: str, provider: str) -> None:
agent._cached_system_prompt = sp
def _fallback_entry_key(fb: dict) -> tuple[str, str, str]:
return (
str(fb.get("provider") or "").strip().lower(),
str(fb.get("model") or "").strip(),
str(fb.get("base_url") or "").strip().rstrip("/"),
)
def _fallback_entry_unavailable_without_network(agent, fb: dict) -> Optional[str]:
"""Return a skip reason for fallback entries known to be unusable locally."""
fb_provider = (fb.get("provider") or "").strip().lower()
if fb_provider != "nous":
return None
try:
from hermes_cli.auth import get_provider_auth_state
state = get_provider_auth_state("nous") or {}
except Exception as exc:
return f"nous_auth_unreadable:{type(exc).__name__}"
access_value = state.get("access_token")
refresh_value = state.get("refresh_token")
has_access = isinstance(access_value, str) and bool(access_value.strip())
has_refresh = isinstance(refresh_value, str) and bool(refresh_value.strip())
if not (has_access or has_refresh):
return "nous_token_missing"
return None
def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool:
"""Switch to the next fallback model/provider in the chain.
@ -1164,10 +1193,29 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
return False
fb = agent._fallback_chain[agent._fallback_index]
agent._fallback_index += 1
fb_key = _fallback_entry_key(fb)
unavailable = getattr(agent, "_unavailable_fallback_keys", None)
if unavailable is None:
unavailable = set()
agent._unavailable_fallback_keys = unavailable
if fb_key in unavailable:
logger.debug("Fallback skip: %s previously marked unavailable", fb_key)
return agent._try_activate_fallback(reason)
fb_provider = (fb.get("provider") or "").strip().lower()
fb_model = (fb.get("model") or "").strip()
if not fb_provider or not fb_model:
return agent._try_activate_fallback() # skip invalid, try next
return agent._try_activate_fallback(reason) # skip invalid, try next
local_skip_reason = _fallback_entry_unavailable_without_network(agent, fb)
if local_skip_reason:
unavailable.add(fb_key)
logger.warning(
"Fallback skip: %s/%s is not locally usable (%s); suppressing for this session",
fb_provider,
fb_model,
local_skip_reason,
)
return agent._try_activate_fallback(reason)
# Skip entries that resolve to the current (provider, model) — falling
# back to the same backend that just failed loops the failure. Compare
@ -1182,7 +1230,7 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
"Fallback skip: chain entry %s/%s matches current provider/model",
fb_provider, fb_model,
)
return agent._try_activate_fallback()
return agent._try_activate_fallback(reason)
if (
fb_base_url_for_dedup
and current_base_url
@ -1193,7 +1241,7 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
"Fallback skip: chain entry base_url %s matches current backend",
fb_base_url_for_dedup,
)
return agent._try_activate_fallback()
return agent._try_activate_fallback(reason)
# Use centralized router for client construction.
# raw_codex=True because the main agent needs direct responses.stream()
@ -1224,7 +1272,8 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
logger.warning(
"Fallback to %s failed: provider not configured",
fb_provider)
return agent._try_activate_fallback() # try next in chain
unavailable.add(fb_key)
return agent._try_activate_fallback(reason) # try next in chain
try:
from hermes_cli.model_normalize import normalize_model_for_provider
@ -1425,8 +1474,10 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
)
return True
except Exception as e:
if fb_provider == "nous":
unavailable.add(fb_key)
logger.error("Failed to activate fallback %s: %s", fb_model, e)
return agent._try_activate_fallback() # try next in chain
return agent._try_activate_fallback(reason) # try next in chain