From f374ae4c619c365bc9c56a463ad32ff1ace2d4c3 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 3 Apr 2026 02:16:46 -0700
Subject: [PATCH] fix: prevent compression death spiral from API disconnects
 (#2153) (#4750)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes for long-running gateway sessions that enter a death spiral
when API disconnects prevent token data collection, which prevents
compression, which causes more disconnects:

Layer 1 — Stale token counter fallback (run_agent.py in-loop):
When last_prompt_tokens is 0 (stale after API disconnect or provider
returned no usage data), fall back to estimate_messages_tokens_rough()
instead of passing 0 to should_compress(), which would never fire.

Layer 2 — Server disconnect heuristic (run_agent.py error handler):
When ReadError/RemoteProtocolError hits a large session (>60% context
or >200 messages), treat it as a context-length error and trigger
compression rather than burning through retries that all fail the
same way.

Layer 3 — Hard message count limit (gateway/run.py hygiene):
Force compression when a session exceeds 400 messages, regardless of
token estimates. This catches runaway growth even when all token-based
checks fail due to missing API data.

Based on the analysis from PR #2157 by ygd58 — the gateway threshold
direction fix (1.4x multiplier) was already resolved on main.
---
 gateway/run.py | 13 ++++++++++++-
 run_agent.py   | 45 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index 1ab455634..225f82fa1 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -2361,7 +2361,18 @@ class GatewayRunner:
                     # 85% * 1.4 = 119% of context — which exceeds the model's limit
                     # and prevented hygiene from ever firing for ~200K models (GLM-5).
 
-                _needs_compress = _approx_tokens >= _compress_token_threshold
+                # Hard safety valve: force compression if message count is
+                # extreme, regardless of token estimates.  This breaks the
+                # death spiral where API disconnects prevent token data
+                # collection, which prevents compression, which causes more
+                # disconnects.  400 messages is well above normal sessions
+                # but catches runaway growth before it becomes unrecoverable.
+                # (#2153)
+                _HARD_MSG_LIMIT = 400
+                _needs_compress = (
+                    _approx_tokens >= _compress_token_threshold
+                    or _msg_count >= _HARD_MSG_LIMIT
+                )
 
                 if _needs_compress:
                     logger.info(
diff --git a/run_agent.py b/run_agent.py
index 769035925..e18932d36 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -7540,7 +7540,33 @@ class AIAgent:
                                 f"treating as probable context overflow.",
                                 force=True,
                             )
-                    
+
+                    # Server disconnects on large sessions are often caused by
+                    # the request exceeding the provider's context/payload limit
+                    # without a proper HTTP error response.  Treat these as
+                    # context-length errors to trigger compression rather than
+                    # burning through retries that will all fail the same way.
+                    # This breaks the death spiral: disconnect → no token data
+                    # → no compression → bigger session → more disconnects.
+                    # (#2153)
+                    if not is_context_length_error and not status_code:
+                        _is_server_disconnect = (
+                            'server disconnected' in error_msg
+                            or 'peer closed connection' in error_msg
+                            or error_type in ('ReadError', 'RemoteProtocolError', 'ServerDisconnectedError')
+                        )
+                        if _is_server_disconnect:
+                            ctx_len = getattr(getattr(self, 'context_compressor', None), 'context_length', 200000)
+                            _is_large = approx_tokens > ctx_len * 0.6 or len(api_messages) > 200
+                            if _is_large:
+                                is_context_length_error = True
+                                self._vprint(
+                                    f"{self.log_prefix}⚠️  Server disconnected with large session "
+                                    f"(~{approx_tokens:,} tokens, {len(api_messages)} msgs) — "
+                                    f"treating as context-length error, attempting compression.",
+                                    force=True,
+                                )
+
                     if is_context_length_error:
                         compressor = self.context_compressor
                         old_ctx = compressor.context_length
@@ -8175,11 +8201,20 @@ class AIAgent:
                     # threshold (default 50%) leaves ample headroom; if tool
                     # results push past it, the next API call will report the
                     # real total and trigger compression then.
+                    #
+                    # If last_prompt_tokens is 0 (stale after API disconnect
+                    # or provider returned no usage data), fall back to rough
+                    # estimate to avoid missing compression.  Without this,
+                    # a session can grow unbounded after disconnects because
+                    # should_compress(0) never fires.  (#2153)
                     _compressor = self.context_compressor
-                    _real_tokens = (
-                        _compressor.last_prompt_tokens
-                        + _compressor.last_completion_tokens
-                    )
+                    if _compressor.last_prompt_tokens > 0:
+                        _real_tokens = (
+                            _compressor.last_prompt_tokens
+                            + _compressor.last_completion_tokens
+                        )
+                    else:
+                        _real_tokens = estimate_messages_tokens_rough(messages)
 
                     # ── Context pressure warnings (user-facing only) ──────────
                     # Notify the user (NOT the LLM) as context approaches the