fix(gateway): never auto-pause platforms on transient network/DNS failures (#35387)

The per-platform reconnect watcher auto-paused a platform after 10 consecutive reconnect failures, setting next_retry=inf and requiring a manual /platform resume to recover. But both pause sites only ever fire on *retryable* failures — non-retryable errors (bad auth) already drop out of the retry queue earlier. So a transient DNS outage that spanned the watcher's backoff window would silently park the bot forever, even after connectivity returned. The watcher's own docstring already promised 'retryable failures keep retrying at the backoff cap indefinitely' — the code contradicted it. Remove the auto-pause from both reconnect-failure branches. Retryable failures now retry at the 5-min backoff cap forever and self-heal once the network recovers. The circuit breaker (_pause_failed_platform / _resume_paused_platform) stays for manual /platform pause|resume. Fixes #35284.
2026-05-30 07:33:34 -07:00 · 2026-05-30 07:33:34 -07:00 · 45465b0d5d
commit 45465b0d5d
parent cddb7283d9
2 changed files with 46 additions and 37 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1734,9 +1734,9 @@ class GatewayRunner:
        # as a fallback when a fresh config read transiently returns an empty
        # model (e.g. an mtime-keyed config-cache miss during a post-interrupt
        # recovery turn). Without this, the agent is built with model="" and
-        # every API call fails HTTP 400 "No models provided" — the session
-        # goes silent until the user manually re-sends. See #35314. The "*"
-        # key holds a process-wide last-known-good for first-seen sessions.
+        # every API call fails HTTP 400 "No models provided" — the session goes
+        # silent until the user manually re-sends. See #35314. ``"*"`` holds a
+        # process-wide last-known-good for sessions seen for the first time.
        self._last_resolved_model: Dict[str, str] = {}
        # Overflow buffer for explicit /queue commands.  The adapter-level
        # _pending_messages dict is a single slot per session (designed for
@ -2502,8 +2502,8 @@ class GatewayRunner:
        # successfully resolved for this session (or, failing that, the most
        # recent one resolved process-wide). Building an agent with model=""
        # makes every API call fail HTTP 400 "No models provided" and the
-        # session goes silent until the user manually re-sends. getattr guards
-        # against bare test runners built via object.__new__.
+        # session goes silent until the user manually re-sends. ``getattr``
+        # guards against bare test runners built via ``object.__new__``.
        _last_good = getattr(self, "_last_resolved_model", None)
        if _last_good is not None:
            if not model:
@ -2818,10 +2818,12 @@ class GatewayRunner:
        """Mark a queued platform as paused — keep it in ``_failed_platforms``
        but stop the reconnect watcher from hammering it.

-        Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
-        retryable failures, and by ``/platform pause <name>`` for manual
-        intervention.  Paused platforms are surfaced in ``/platform list``
-        and resumed with ``/platform resume <name>``.
+        Used by ``/platform pause <name>`` for manual operator intervention.
+        Paused platforms are surfaced in ``/platform list`` and resumed with
+        ``/platform resume <name>``.  Note: the reconnect watcher does NOT
+        auto-pause — retryable (network/DNS) failures keep retrying at the
+        backoff cap indefinitely so a transient outage self-heals without
+        manual intervention.
        """
        info = getattr(self, "_failed_platforms", {}).get(platform)
        if info is None:
@ -5899,15 +5901,17 @@ class GatewayRunner:
        """Background task that periodically retries connecting failed platforms.

        Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
-        Retryable failures keep retrying at the backoff cap indefinitely
-        — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
-        without ever succeeding, it is *paused*: kept in the retry queue
-        but no longer hammered.  The user surfaces it with ``/platform list``
-        and resumes it with ``/platform resume <name>``.  Non-retryable
-        failures (bad auth, etc.) still drop out of the queue immediately.
+        Retryable failures (network/DNS blips) keep retrying at the backoff
+        cap indefinitely — they self-heal once connectivity returns, so a
+        transient outage never requires manual intervention. Non-retryable
+        failures (bad auth, etc.) drop out of the queue immediately. The
+        circuit breaker (``_pause_failed_platform`` / ``/platform pause``)
+        remains available for manual operator control via ``/platform list``
+        and ``/platform resume <name>``, but is no longer triggered
+        automatically — auto-pausing a recovered platform was the cause of
+        bots silently staying dead after a transient DNS failure.
        """
        _BACKOFF_CAP = 300  # 5 minutes max between retries
-        _PAUSE_AFTER_FAILURES = 10  # circuit-breaker threshold

        await asyncio.sleep(10)  # initial delay — let startup finish
        while self._running:
@ -6002,14 +6006,14 @@ class GatewayRunner:
                            "Reconnect %s failed, next retry in %ds",
                            platform.value, backoff,
                        )
-                        if attempt >= _PAUSE_AFTER_FAILURES:
-                            self._pause_failed_platform(
-                                platform,
-                                reason=(
-                                    adapter.fatal_error_message
-                                    or "failed to reconnect"
-                                ),
-                            )
+                        # Retryable failures (network/DNS blips) keep retrying
+                        # at the backoff cap indefinitely — they self-heal once
+                        # connectivity returns. We do NOT auto-pause them: a
+                        # transient outage must never require manual `/platform
+                        # resume` to recover. Non-retryable failures (bad auth,
+                        # etc.) already drop out of the queue via the
+                        # `not fatal_error_retryable` branch above, so anything
+                        # reaching here is by definition retryable.
                except Exception as e:
                    self._update_platform_runtime_status(
                        platform.value,
@ -6024,8 +6028,9 @@ class GatewayRunner:
                        "Reconnect %s error: %s, next retry in %ds",
                        platform.value, e, backoff,
                    )
-                    if attempt >= _PAUSE_AFTER_FAILURES:
-                        self._pause_failed_platform(platform, reason=str(e))
+                    # A raised exception during reconnect (connect timeout, DNS
+                    # resolution failure, etc.) is inherently transient — keep
+                    # retrying at the backoff cap rather than auto-pausing.

            # Check every 10 seconds for platforms that need reconnection
            for _ in range(10):
--- a/tests/gateway/test_platform_reconnect.py
+++ b/tests/gateway/test_platform_reconnect.py
@ -294,19 +294,20 @@ class TestPlatformReconnectWatcher:
        assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2

    @pytest.mark.asyncio
-    async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
-        """After enough consecutive retryable failures, the watcher should
-        *pause* the platform (keep it in the queue but stop hammering it),
-        not drop it. The user resumes via /platform resume.
+    async def test_reconnect_never_auto_pauses_retryable_failures(self):
+        """Retryable failures (network/DNS) must keep retrying indefinitely —
+        the watcher must NOT auto-pause them. Auto-pausing a transiently-failed
+        platform left bots silently dead after a DNS blip (#35284). The pause
+        circuit breaker remains available for manual /platform pause only.
        """
        runner = _make_runner()

        platform_config = PlatformConfig(enabled=True, token="test")
-        # 9 prior attempts — the next failure will be the 10th and should
-        # trip the circuit breaker.
+        # Far past the old circuit-breaker threshold (10): even after many
+        # consecutive retryable failures the platform must stay unpaused.
        runner._failed_platforms[Platform.TELEGRAM] = {
            "config": platform_config,
-            "attempts": 9,
+            "attempts": 25,
            "next_retry": time.monotonic() - 1,
        }

@ -332,12 +333,15 @@ class TestPlatformReconnectWatcher:

            await run_one_iteration()

-        # Platform stays in queue — paused, not dropped
+        # Platform stays in queue and keeps retrying — never auto-paused.
        assert Platform.TELEGRAM in runner._failed_platforms
        info = runner._failed_platforms[Platform.TELEGRAM]
-        assert info["paused"] is True
-        assert info["attempts"] == 10
-        assert "pause_reason" in info
+        assert info.get("paused") is not True
+        assert "pause_reason" not in info
+        assert info["attempts"] == 26
+        # next_retry is pushed out by the backoff (capped at 300s), not inf.
+        assert info["next_retry"] != float("inf")
+        assert info["next_retry"] > time.monotonic()

    @pytest.mark.asyncio
    async def test_reconnect_skips_paused_platforms(self):