diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index a45915299..bc2b7b15a 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -2743,7 +2743,7 @@ def _is_connection_error(exc: Exception) -> bool: def _is_transient_transport_error(exc: Exception) -> bool: - """Return True for a one-off transport blip worth retrying ONCE on the + """Return True for a one-off transport blip worth retrying ON the same provider before any provider/model fallback. Covers connection/streaming-close errors (via the canonical @@ -2761,6 +2761,34 @@ def _is_transient_transport_error(exc: Exception) -> bool: return isinstance(status, int) and (status == 408 or 500 <= status < 600) +_DEFAULT_TRANSIENT_RETRIES = 2 +# Base for exponential backoff between transient retries (seconds). Overridable +# so tests can zero it out and not sleep real wall-clock time. +_TRANSIENT_RETRY_BACKOFF_BASE = 1.0 + + +def _transient_retry_count() -> int: + """Number of same-provider retries for a transient transport blip. + + Read from ``auxiliary.transient_retries`` in config.yaml (default 2 → + 3 total attempts). Clamped to [0, 6] to bound worst-case wall time. A + connection blip to a pinned auxiliary target (e.g. a MoA reference + advisor) has no meaningful provider fallback, so a couple of retries with + backoff is the difference between recovering and silently losing the call. + Best-effort: any config-read failure falls back to the default. + """ + try: + from hermes_cli.config import cfg_get, load_config + + val = cfg_get(load_config(), "auxiliary", "transient_retries") + if val is None: + return _DEFAULT_TRANSIENT_RETRIES + n = int(val) + return max(0, min(n, 6)) + except Exception: + return _DEFAULT_TRANSIENT_RETRIES + + def _is_auth_error(exc: Exception) -> bool: """Detect auth failures that should trigger provider-specific refresh.""" status = getattr(exc, "status_code", None) @@ -5036,6 +5064,7 @@ def _client_cache_key( main_runtime: Optional[Dict[str, Any]] = None, is_vision: bool = False, task: Optional[str] = None, + model: Optional[str] = None, ) -> tuple: runtime = _normalize_main_runtime(main_runtime) runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else () @@ -5044,7 +5073,17 @@ def _client_cache_key( # old cache shape because the explicit provider/model tuple is sufficient. task_key = (task or "") if provider == "auto" else "" pool_hint = _pool_cache_hint(provider, main_runtime=main_runtime) - return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key, is_vision, task_key, pool_hint) + # The model MUST participate in the key. Two concurrent auxiliary calls to + # the SAME provider/base_url/key but DIFFERENT models (e.g. a MoA reference + # fan-out running opus + gpt-5.5 in parallel threads) would otherwise share + # one cache entry. On a cache MISS both build a client for the same key; the + # second's _store_cached_client sees the first as the "old" entry and CLOSES + # it — while the first call is still mid-request on it — yielding a spurious + # APIConnectionError that fails the sibling advisor (root cause of the run2 + # double-advisor "Connection error" collapse). Keying on model gives each + # model its own client, so concurrent fan-out calls never cross-close. + model_key = model or "" + return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key, is_vision, task_key, pool_hint, model_key) def _store_cached_client(cache_key: tuple, client: Any, default_model: Optional[str], *, bound_loop: Any = None) -> None: @@ -5100,6 +5139,7 @@ def _refresh_nous_auxiliary_client( api_mode=api_mode, main_runtime=main_runtime, is_vision=is_vision, + model=final_model, ) _store_cached_client(cache_key, client, final_model, bound_loop=current_loop) return client, final_model @@ -5276,6 +5316,7 @@ def _get_cached_client( main_runtime=main_runtime, is_vision=is_vision, task=task, + model=model, ) with _client_cache_lock: if cache_key in _client_cache: @@ -6005,15 +6046,22 @@ def call_llm( # Handle unsupported temperature, max_tokens vs max_completion_tokens retry, # then payment fallback. try: - # Retry ONCE on the same provider for a one-off transient transport - # blip (streaming-close / incomplete chunked read / 5xx / 408) before - # the except-chain below escalates to provider/model fallback. A - # single dropped connection shouldn't abandon an otherwise-healthy - # provider. A second failure (or any non-transient error) falls - # through to ``first_err`` and the existing fallback handling - # unchanged. This is the unified home for the transient retry that - # every auxiliary task (compression, memory flush, title-gen, - # session-search, vision) shares. (PR #16587) + # Retry on the same provider for a transient transport blip + # (connection reset / streaming-close / incomplete chunked read / 5xx / + # 408) before the except-chain below escalates to provider/model + # fallback. A dropped connection shouldn't abandon an otherwise-healthy + # provider — this especially matters for pinned auxiliary calls like MoA + # reference advisors, where "fallback to another provider" is not a + # meaningful recovery (the advisor is a specific model), so a transient + # blip that isn't retried simply loses that advisor for the turn (root + # of the run2 double-advisor "Connection error" collapse — a genuine + # upstream blip hitting both parallel advisors at once). + # + # Attempts are bounded and use exponential backoff. Count is configurable + # via auxiliary.transient_retries (default 2 retries → 3 total attempts); + # a second/third failure or any non-transient error falls through to + # ``first_err`` and the existing fallback handling unchanged. Unified home + # for the transient retry every auxiliary task shares. (PR #16587) try: return _validate_llm_response( client.chat.completions.create(**kwargs), task) @@ -6035,13 +6083,26 @@ def call_llm( transient_err, ) raise - logger.info( - "Auxiliary %s: transient transport error; retrying once on " - "the same provider before fallback: %s", - task or "call", transient_err, - ) - return _validate_llm_response( - client.chat.completions.create(**kwargs), task) + _max_transient_retries = _transient_retry_count() + _last_transient = transient_err + for _attempt in range(1, _max_transient_retries + 1): + _backoff = min(_TRANSIENT_RETRY_BACKOFF_BASE * (2.0 ** (_attempt - 1)), 8.0) + logger.info( + "Auxiliary %s: transient transport error (attempt %d/%d); " + "retrying same provider after %.1fs before fallback: %s", + task or "call", _attempt, _max_transient_retries, _backoff, + _last_transient, + ) + time.sleep(_backoff) + try: + return _validate_llm_response( + client.chat.completions.create(**kwargs), task) + except Exception as retry_transient: + if not _is_transient_transport_error(retry_transient): + raise + _last_transient = retry_transient + # Retries exhausted — fall through to first_err fallback handling. + raise _last_transient except Exception as first_err: if "temperature" in kwargs and _is_unsupported_temperature_error(first_err): retry_kwargs = dict(kwargs) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index d98dd26d4..ac2dedd71 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1465,6 +1465,13 @@ DEFAULT_CONFIG = { # Each aux task is independent — main-agent provider_routing and # openrouter.min_coding_score do NOT propagate to aux calls by design. "auxiliary": { + # Same-provider retries for a transient transport blip (connection + # reset / timeout / 5xx / 408) on ANY auxiliary call before falling + # back. Default 2 (→ 3 total attempts), clamped [0,6]. Matters most for + # pinned calls like MoA reference advisors, where provider fallback is + # not a meaningful recovery, so an unretried blip silently loses the + # call. + "transient_retries": 2, "vision": { "provider": "auto", # auto | openrouter | nous | codex | custom "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index c1a9f0852..b5901391e 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -1523,7 +1523,13 @@ class TestAuxiliaryPoolAwareness: assert client is fake_client assert model == "openai/gpt-5.4-mini" - assert mock_resolve.call_count == 1 + # A DIFFERENT model resolves its own client (model participates in the + # cache key). This isolation is what stops two concurrent advisors on + # the same provider/base_url/key (e.g. a MoA fan-out) from sharing — and + # racing the lifecycle of — one cached client. Same-model reuse is still + # a single resolve (verified elsewhere); distinct models => distinct + # resolves. + assert mock_resolve.call_count == 2 # ── Payment / credit exhaustion fallback ───────────────────────────────── @@ -2581,6 +2587,8 @@ class TestTransientTransportRetry: p1, p2, p3 = self._patches(primary) with ( p1, p2, p3, + patch("agent.auxiliary_client._transient_retry_count", return_value=1), + patch("agent.auxiliary_client._TRANSIENT_RETRY_BACKOFF_BASE", 0.0), patch( "agent.auxiliary_client._try_configured_fallback_chain", return_value=(None, None, ""), @@ -2592,7 +2600,7 @@ class TestTransientTransportRetry: ): result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}]) assert result == {"fallback": True} - # Primary tried twice (initial + same-target retry), then fallback. + # Primary tried twice (initial + one same-target retry), then fallback. assert primary.chat.completions.create.call_count == 2 assert fb_client.chat.completions.create.call_count == 1 diff --git a/tests/agent/test_auxiliary_transient_retry.py b/tests/agent/test_auxiliary_transient_retry.py new file mode 100644 index 000000000..ac46cdbce --- /dev/null +++ b/tests/agent/test_auxiliary_transient_retry.py @@ -0,0 +1,82 @@ +"""Transient-transport retry count + per-model client-cache isolation. + +Two related hardening behaviors for auxiliary calls (which include MoA +reference advisors, a pinned-model path where provider fallback is not a +meaningful recovery): + +1. A transient transport blip (connection reset / timeout / 5xx) is retried + on the SAME provider several times with backoff before giving up — a single + upstream blip should not silently lose a pinned auxiliary call (root of the + run2 double-advisor "Connection error" collapse). +2. Two auxiliary calls to the same provider/base_url/key but DIFFERENT models + get DISTINCT client-cache keys, so a concurrent fan-out (e.g. opus + gpt-5.5 + advisors) never shares one client entry. +""" + +from __future__ import annotations + +import os +import types +from unittest.mock import patch + +import pytest + + +class _ConnErr(Exception): + """Stand-in that the transient detector recognizes as a connection blip.""" + + +def test_transient_retry_count_default(monkeypatch): + from agent import auxiliary_client as ac + + # No config value -> default. + monkeypatch.setattr(ac, "load_config", lambda: {}, raising=False) + with patch("hermes_cli.config.load_config", return_value={}), \ + patch("hermes_cli.config.cfg_get", return_value=None): + assert ac._transient_retry_count() == ac._DEFAULT_TRANSIENT_RETRIES + + +def test_transient_retry_count_configurable_and_clamped(): + from agent import auxiliary_client as ac + + with patch("hermes_cli.config.cfg_get", return_value=4): + assert ac._transient_retry_count() == 4 + with patch("hermes_cli.config.cfg_get", return_value=100): + assert ac._transient_retry_count() == 6 # clamped high + with patch("hermes_cli.config.cfg_get", return_value=-3): + assert ac._transient_retry_count() == 0 # clamped low + with patch("hermes_cli.config.cfg_get", side_effect=RuntimeError): + assert ac._transient_retry_count() == ac._DEFAULT_TRANSIENT_RETRIES + + +def test_model_participates_in_client_cache_key(): + """Same provider/base_url/key, different model -> different cache key. + + This is what stops two concurrent advisors from sharing (and racing on) + one cached client entry.""" + from agent.auxiliary_client import _client_cache_key + + k_opus = _client_cache_key( + "openrouter", async_mode=False, base_url="https://openrouter.ai/api/v1", + api_key="K", model="anthropic/claude-opus-4.8", + ) + k_gpt = _client_cache_key( + "openrouter", async_mode=False, base_url="https://openrouter.ai/api/v1", + api_key="K", model="openai/gpt-5.5", + ) + assert k_opus != k_gpt + # Same model still collides (cache still works for reuse). + k_opus2 = _client_cache_key( + "openrouter", async_mode=False, base_url="https://openrouter.ai/api/v1", + api_key="K", model="anthropic/claude-opus-4.8", + ) + assert k_opus == k_opus2 + + +def test_missing_model_key_is_stable(): + """Omitting model (legacy callers) is still a valid, stable key.""" + from agent.auxiliary_client import _client_cache_key + + a = _client_cache_key("openrouter", async_mode=False, base_url="u", api_key="k") + b = _client_cache_key("openrouter", async_mode=False, base_url="u", api_key="k") + assert a == b