From f374ae4c619c365bc9c56a463ad32ff1ace2d4c3 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 3 Apr 2026 02:16:46 -0700 Subject: [PATCH] fix: prevent compression death spiral from API disconnects (#2153) (#4750) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for long-running gateway sessions that enter a death spiral when API disconnects prevent token data collection, which prevents compression, which causes more disconnects: Layer 1 — Stale token counter fallback (run_agent.py in-loop): When last_prompt_tokens is 0 (stale after API disconnect or provider returned no usage data), fall back to estimate_messages_tokens_rough() instead of passing 0 to should_compress(), which would never fire. Layer 2 — Server disconnect heuristic (run_agent.py error handler): When ReadError/RemoteProtocolError hits a large session (>60% context or >200 messages), treat it as a context-length error and trigger compression rather than burning through retries that all fail the same way. Layer 3 — Hard message count limit (gateway/run.py hygiene): Force compression when a session exceeds 400 messages, regardless of token estimates. This catches runaway growth even when all token-based checks fail due to missing API data. Based on the analysis from PR #2157 by ygd58 — the gateway threshold direction fix (1.4x multiplier) was already resolved on main. --- gateway/run.py | 13 ++++++++++++- run_agent.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 1ab455634..225f82fa1 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2361,7 +2361,18 @@ class GatewayRunner: # 85% * 1.4 = 119% of context — which exceeds the model's limit # and prevented hygiene from ever firing for ~200K models (GLM-5). - _needs_compress = _approx_tokens >= _compress_token_threshold + # Hard safety valve: force compression if message count is + # extreme, regardless of token estimates. This breaks the + # death spiral where API disconnects prevent token data + # collection, which prevents compression, which causes more + # disconnects. 400 messages is well above normal sessions + # but catches runaway growth before it becomes unrecoverable. + # (#2153) + _HARD_MSG_LIMIT = 400 + _needs_compress = ( + _approx_tokens >= _compress_token_threshold + or _msg_count >= _HARD_MSG_LIMIT + ) if _needs_compress: logger.info( diff --git a/run_agent.py b/run_agent.py index 769035925..e18932d36 100644 --- a/run_agent.py +++ b/run_agent.py @@ -7540,7 +7540,33 @@ class AIAgent: f"treating as probable context overflow.", force=True, ) - + + # Server disconnects on large sessions are often caused by + # the request exceeding the provider's context/payload limit + # without a proper HTTP error response. Treat these as + # context-length errors to trigger compression rather than + # burning through retries that will all fail the same way. + # This breaks the death spiral: disconnect → no token data + # → no compression → bigger session → more disconnects. + # (#2153) + if not is_context_length_error and not status_code: + _is_server_disconnect = ( + 'server disconnected' in error_msg + or 'peer closed connection' in error_msg + or error_type in ('ReadError', 'RemoteProtocolError', 'ServerDisconnectedError') + ) + if _is_server_disconnect: + ctx_len = getattr(getattr(self, 'context_compressor', None), 'context_length', 200000) + _is_large = approx_tokens > ctx_len * 0.6 or len(api_messages) > 200 + if _is_large: + is_context_length_error = True + self._vprint( + f"{self.log_prefix}⚠️ Server disconnected with large session " + f"(~{approx_tokens:,} tokens, {len(api_messages)} msgs) — " + f"treating as context-length error, attempting compression.", + force=True, + ) + if is_context_length_error: compressor = self.context_compressor old_ctx = compressor.context_length @@ -8175,11 +8201,20 @@ class AIAgent: # threshold (default 50%) leaves ample headroom; if tool # results push past it, the next API call will report the # real total and trigger compression then. + # + # If last_prompt_tokens is 0 (stale after API disconnect + # or provider returned no usage data), fall back to rough + # estimate to avoid missing compression. Without this, + # a session can grow unbounded after disconnects because + # should_compress(0) never fires. (#2153) _compressor = self.context_compressor - _real_tokens = ( - _compressor.last_prompt_tokens - + _compressor.last_completion_tokens - ) + if _compressor.last_prompt_tokens > 0: + _real_tokens = ( + _compressor.last_prompt_tokens + + _compressor.last_completion_tokens + ) + else: + _real_tokens = estimate_messages_tokens_rough(messages) # ── Context pressure warnings (user-facing only) ────────── # Notify the user (NOT the LLM) as context approaches the