diff --git a/gateway/run.py b/gateway/run.py index f86b5c98c..3db565895 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1734,9 +1734,9 @@ class GatewayRunner: # as a fallback when a fresh config read transiently returns an empty # model (e.g. an mtime-keyed config-cache miss during a post-interrupt # recovery turn). Without this, the agent is built with model="" and - # every API call fails HTTP 400 "No models provided" — the session - # goes silent until the user manually re-sends. See #35314. The "*" - # key holds a process-wide last-known-good for first-seen sessions. + # every API call fails HTTP 400 "No models provided" — the session goes + # silent until the user manually re-sends. See #35314. ``"*"`` holds a + # process-wide last-known-good for sessions seen for the first time. self._last_resolved_model: Dict[str, str] = {} # Overflow buffer for explicit /queue commands. The adapter-level # _pending_messages dict is a single slot per session (designed for @@ -2502,8 +2502,8 @@ class GatewayRunner: # successfully resolved for this session (or, failing that, the most # recent one resolved process-wide). Building an agent with model="" # makes every API call fail HTTP 400 "No models provided" and the - # session goes silent until the user manually re-sends. getattr guards - # against bare test runners built via object.__new__. + # session goes silent until the user manually re-sends. ``getattr`` + # guards against bare test runners built via ``object.__new__``. _last_good = getattr(self, "_last_resolved_model", None) if _last_good is not None: if not model: @@ -2818,10 +2818,12 @@ class GatewayRunner: """Mark a queued platform as paused — keep it in ``_failed_platforms`` but stop the reconnect watcher from hammering it. - Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive - retryable failures, and by ``/platform pause `` for manual - intervention. Paused platforms are surfaced in ``/platform list`` - and resumed with ``/platform resume ``. + Used by ``/platform pause `` for manual operator intervention. + Paused platforms are surfaced in ``/platform list`` and resumed with + ``/platform resume ``. Note: the reconnect watcher does NOT + auto-pause — retryable (network/DNS) failures keep retrying at the + backoff cap indefinitely so a transient outage self-heals without + manual intervention. """ info = getattr(self, "_failed_platforms", {}).get(platform) if info is None: @@ -5899,15 +5901,17 @@ class GatewayRunner: """Background task that periodically retries connecting failed platforms. Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap). - Retryable failures keep retrying at the backoff cap indefinitely - — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row - without ever succeeding, it is *paused*: kept in the retry queue - but no longer hammered. The user surfaces it with ``/platform list`` - and resumes it with ``/platform resume ``. Non-retryable - failures (bad auth, etc.) still drop out of the queue immediately. + Retryable failures (network/DNS blips) keep retrying at the backoff + cap indefinitely — they self-heal once connectivity returns, so a + transient outage never requires manual intervention. Non-retryable + failures (bad auth, etc.) drop out of the queue immediately. The + circuit breaker (``_pause_failed_platform`` / ``/platform pause``) + remains available for manual operator control via ``/platform list`` + and ``/platform resume ``, but is no longer triggered + automatically — auto-pausing a recovered platform was the cause of + bots silently staying dead after a transient DNS failure. """ _BACKOFF_CAP = 300 # 5 minutes max between retries - _PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold await asyncio.sleep(10) # initial delay — let startup finish while self._running: @@ -6002,14 +6006,14 @@ class GatewayRunner: "Reconnect %s failed, next retry in %ds", platform.value, backoff, ) - if attempt >= _PAUSE_AFTER_FAILURES: - self._pause_failed_platform( - platform, - reason=( - adapter.fatal_error_message - or "failed to reconnect" - ), - ) + # Retryable failures (network/DNS blips) keep retrying + # at the backoff cap indefinitely — they self-heal once + # connectivity returns. We do NOT auto-pause them: a + # transient outage must never require manual `/platform + # resume` to recover. Non-retryable failures (bad auth, + # etc.) already drop out of the queue via the + # `not fatal_error_retryable` branch above, so anything + # reaching here is by definition retryable. except Exception as e: self._update_platform_runtime_status( platform.value, @@ -6024,8 +6028,9 @@ class GatewayRunner: "Reconnect %s error: %s, next retry in %ds", platform.value, e, backoff, ) - if attempt >= _PAUSE_AFTER_FAILURES: - self._pause_failed_platform(platform, reason=str(e)) + # A raised exception during reconnect (connect timeout, DNS + # resolution failure, etc.) is inherently transient — keep + # retrying at the backoff cap rather than auto-pausing. # Check every 10 seconds for platforms that need reconnection for _ in range(10): diff --git a/tests/gateway/test_platform_reconnect.py b/tests/gateway/test_platform_reconnect.py index 1a5a35a42..3cd507550 100644 --- a/tests/gateway/test_platform_reconnect.py +++ b/tests/gateway/test_platform_reconnect.py @@ -294,19 +294,20 @@ class TestPlatformReconnectWatcher: assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2 @pytest.mark.asyncio - async def test_reconnect_pauses_after_circuit_breaker_threshold(self): - """After enough consecutive retryable failures, the watcher should - *pause* the platform (keep it in the queue but stop hammering it), - not drop it. The user resumes via /platform resume. + async def test_reconnect_never_auto_pauses_retryable_failures(self): + """Retryable failures (network/DNS) must keep retrying indefinitely — + the watcher must NOT auto-pause them. Auto-pausing a transiently-failed + platform left bots silently dead after a DNS blip (#35284). The pause + circuit breaker remains available for manual /platform pause only. """ runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") - # 9 prior attempts — the next failure will be the 10th and should - # trip the circuit breaker. + # Far past the old circuit-breaker threshold (10): even after many + # consecutive retryable failures the platform must stay unpaused. runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, - "attempts": 9, + "attempts": 25, "next_retry": time.monotonic() - 1, } @@ -332,12 +333,15 @@ class TestPlatformReconnectWatcher: await run_one_iteration() - # Platform stays in queue — paused, not dropped + # Platform stays in queue and keeps retrying — never auto-paused. assert Platform.TELEGRAM in runner._failed_platforms info = runner._failed_platforms[Platform.TELEGRAM] - assert info["paused"] is True - assert info["attempts"] == 10 - assert "pause_reason" in info + assert info.get("paused") is not True + assert "pause_reason" not in info + assert info["attempts"] == 26 + # next_retry is pushed out by the backoff (capped at 300s), not inf. + assert info["next_retry"] != float("inf") + assert info["next_retry"] > time.monotonic() @pytest.mark.asyncio async def test_reconnect_skips_paused_platforms(self):