fix(gateway): never auto-pause platforms on transient network/DNS failures (#35387)

The per-platform reconnect watcher auto-paused a platform after 10
consecutive reconnect failures, setting next_retry=inf and requiring a
manual /platform resume to recover. But both pause sites only ever fire
on *retryable* failures — non-retryable errors (bad auth) already drop
out of the retry queue earlier. So a transient DNS outage that spanned
the watcher's backoff window would silently park the bot forever, even
after connectivity returned.

The watcher's own docstring already promised 'retryable failures keep
retrying at the backoff cap indefinitely' — the code contradicted it.

Remove the auto-pause from both reconnect-failure branches. Retryable
failures now retry at the 5-min backoff cap forever and self-heal once
the network recovers. The circuit breaker (_pause_failed_platform /
_resume_paused_platform) stays for manual /platform pause|resume.

Fixes #35284.
This commit is contained in:
Teknium 2026-05-30 07:33:34 -07:00 committed by GitHub
parent cddb7283d9
commit 45465b0d5d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 46 additions and 37 deletions

View file

@ -1734,9 +1734,9 @@ class GatewayRunner:
# as a fallback when a fresh config read transiently returns an empty
# model (e.g. an mtime-keyed config-cache miss during a post-interrupt
# recovery turn). Without this, the agent is built with model="" and
# every API call fails HTTP 400 "No models provided" — the session
# goes silent until the user manually re-sends. See #35314. The "*"
# key holds a process-wide last-known-good for first-seen sessions.
# every API call fails HTTP 400 "No models provided" — the session goes
# silent until the user manually re-sends. See #35314. ``"*"`` holds a
# process-wide last-known-good for sessions seen for the first time.
self._last_resolved_model: Dict[str, str] = {}
# Overflow buffer for explicit /queue commands. The adapter-level
# _pending_messages dict is a single slot per session (designed for
@ -2502,8 +2502,8 @@ class GatewayRunner:
# successfully resolved for this session (or, failing that, the most
# recent one resolved process-wide). Building an agent with model=""
# makes every API call fail HTTP 400 "No models provided" and the
# session goes silent until the user manually re-sends. getattr guards
# against bare test runners built via object.__new__.
# session goes silent until the user manually re-sends. ``getattr``
# guards against bare test runners built via ``object.__new__``.
_last_good = getattr(self, "_last_resolved_model", None)
if _last_good is not None:
if not model:
@ -2818,10 +2818,12 @@ class GatewayRunner:
"""Mark a queued platform as paused — keep it in ``_failed_platforms``
but stop the reconnect watcher from hammering it.
Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
retryable failures, and by ``/platform pause <name>`` for manual
intervention. Paused platforms are surfaced in ``/platform list``
and resumed with ``/platform resume <name>``.
Used by ``/platform pause <name>`` for manual operator intervention.
Paused platforms are surfaced in ``/platform list`` and resumed with
``/platform resume <name>``. Note: the reconnect watcher does NOT
auto-pause retryable (network/DNS) failures keep retrying at the
backoff cap indefinitely so a transient outage self-heals without
manual intervention.
"""
info = getattr(self, "_failed_platforms", {}).get(platform)
if info is None:
@ -5899,15 +5901,17 @@ class GatewayRunner:
"""Background task that periodically retries connecting failed platforms.
Uses exponential backoff: 30s 60s 120s 240s 300s (cap).
Retryable failures keep retrying at the backoff cap indefinitely
but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
without ever succeeding, it is *paused*: kept in the retry queue
but no longer hammered. The user surfaces it with ``/platform list``
and resumes it with ``/platform resume <name>``. Non-retryable
failures (bad auth, etc.) still drop out of the queue immediately.
Retryable failures (network/DNS blips) keep retrying at the backoff
cap indefinitely they self-heal once connectivity returns, so a
transient outage never requires manual intervention. Non-retryable
failures (bad auth, etc.) drop out of the queue immediately. The
circuit breaker (``_pause_failed_platform`` / ``/platform pause``)
remains available for manual operator control via ``/platform list``
and ``/platform resume <name>``, but is no longer triggered
automatically auto-pausing a recovered platform was the cause of
bots silently staying dead after a transient DNS failure.
"""
_BACKOFF_CAP = 300 # 5 minutes max between retries
_PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold
await asyncio.sleep(10) # initial delay — let startup finish
while self._running:
@ -6002,14 +6006,14 @@ class GatewayRunner:
"Reconnect %s failed, next retry in %ds",
platform.value, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(
platform,
reason=(
adapter.fatal_error_message
or "failed to reconnect"
),
)
# Retryable failures (network/DNS blips) keep retrying
# at the backoff cap indefinitely — they self-heal once
# connectivity returns. We do NOT auto-pause them: a
# transient outage must never require manual `/platform
# resume` to recover. Non-retryable failures (bad auth,
# etc.) already drop out of the queue via the
# `not fatal_error_retryable` branch above, so anything
# reaching here is by definition retryable.
except Exception as e:
self._update_platform_runtime_status(
platform.value,
@ -6024,8 +6028,9 @@ class GatewayRunner:
"Reconnect %s error: %s, next retry in %ds",
platform.value, e, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(platform, reason=str(e))
# A raised exception during reconnect (connect timeout, DNS
# resolution failure, etc.) is inherently transient — keep
# retrying at the backoff cap rather than auto-pausing.
# Check every 10 seconds for platforms that need reconnection
for _ in range(10):

View file

@ -294,19 +294,20 @@ class TestPlatformReconnectWatcher:
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
@pytest.mark.asyncio
async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
"""After enough consecutive retryable failures, the watcher should
*pause* the platform (keep it in the queue but stop hammering it),
not drop it. The user resumes via /platform resume.
async def test_reconnect_never_auto_pauses_retryable_failures(self):
"""Retryable failures (network/DNS) must keep retrying indefinitely —
the watcher must NOT auto-pause them. Auto-pausing a transiently-failed
platform left bots silently dead after a DNS blip (#35284). The pause
circuit breaker remains available for manual /platform pause only.
"""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
# 9 prior attempts — the next failure will be the 10th and should
# trip the circuit breaker.
# Far past the old circuit-breaker threshold (10): even after many
# consecutive retryable failures the platform must stay unpaused.
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 9,
"attempts": 25,
"next_retry": time.monotonic() - 1,
}
@ -332,12 +333,15 @@ class TestPlatformReconnectWatcher:
await run_one_iteration()
# Platform stays in queue — paused, not dropped
# Platform stays in queue and keeps retrying — never auto-paused.
assert Platform.TELEGRAM in runner._failed_platforms
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["attempts"] == 10
assert "pause_reason" in info
assert info.get("paused") is not True
assert "pause_reason" not in info
assert info["attempts"] == 26
# next_retry is pushed out by the backoff (capped at 300s), not inf.
assert info["next_retry"] != float("inf")
assert info["next_retry"] > time.monotonic()
@pytest.mark.asyncio
async def test_reconnect_skips_paused_platforms(self):