From 122e5bc0373e8df65db751d5d820445aec275a95 Mon Sep 17 00:00:00 2001
From: Tranquil-Flow <66773372+Tranquil-Flow@users.noreply.github.com>
Date: Wed, 1 Jul 2026 03:06:00 -0700
Subject: [PATCH] fix(agent): retry 413 after stripping vision payloads
 (#47339)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When text compression can't reduce a 413 request further, evict base64
image parts from tool messages and retry once instead of dead-ending
with 'Payload too large and cannot compress further.'

A 413 is a request-body byte-size limit, not a token limit. browser_vision
screenshots (2-5MB base64 each) keep the HTTP body oversized even after
aggressive summarization. The strip pass passes remember_model=False so a
413 does not poison _no_list_tool_content_models — that set is for providers
that reject list-type tool content, a distinct failure mode.

Cherry-picked from #47397 by Tranquil-Flow; placed onto main's current
token-aware 413 recovery else branch.
---
 agent/conversation_loop.py              | 10 ++++
 run_agent.py                            | 39 +++++++++-----
 tests/run_agent/test_413_compression.py | 70 +++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index 7c85ae8ff..a60af85c8 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -3244,6 +3244,16 @@ def run_conversation(
                         _retry.restart_with_compressed_messages = True
                         break
                     else:
+                        if agent._try_strip_image_parts_from_tool_messages(
+                            api_messages,
+                            remember_model=False,
+                        ):
+                            agent._buffer_status(
+                                "📐 Compression could not reduce the request further — "
+                                "removed retained vision payloads and retrying..."
+                            )
+                            continue
+
                         # Terminal — surface buffered context so the user
                         # sees what compression attempts were made.
                         agent._flush_status_buffer()
diff --git a/run_agent.py b/run_agent.py
index 9b8078a51..319f13ebb 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -4932,17 +4932,27 @@ class AIAgent:
             max_dimension=max_dimension,
         )
 
-    def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool:
+    def _try_strip_image_parts_from_tool_messages(
+        self,
+        api_messages: list,
+        *,
+        remember_model: bool = True,
+    ) -> bool:
         """Downgrade list-type tool messages to text summaries in-place.
 
         Recovery path for providers that reject list-type tool message content
         (e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344).  Walks
         ``api_messages`` for any ``role: "tool"`` message whose ``content`` is
         a list containing image parts, replaces the content with the existing
-        text part(s) (or a minimal placeholder if none survive), and records
-        the active (provider, model) in ``self._no_list_tool_content_models``
-        so subsequent ``_tool_result_content_for_active_model`` calls in this
-        session preemptively downgrade screenshots without a round-trip.
+        text part(s) (or a minimal placeholder if none survive), and by default
+        records the active (provider, model) in
+        ``self._no_list_tool_content_models`` so subsequent
+        ``_tool_result_content_for_active_model`` calls in this session
+        preemptively downgrade screenshots without a round-trip.
+
+        413 payload-size recovery passes ``remember_model=False`` because that
+        error means this request body was too large, not that the provider/model
+        rejects list-type tool content in general.
 
         Returns True when at least one tool message was downgraded — the
         caller (the 400 recovery branch in ``agent.conversation_loop``) uses
@@ -4952,15 +4962,16 @@ class AIAgent:
         if not isinstance(api_messages, list):
             return False
 
-        # Record (provider, model) so we don't relearn this lesson.
-        key = (
-            (getattr(self, "provider", "") or "").strip().lower(),
-            (getattr(self, "model", "") or "").strip(),
-        )
-        if not hasattr(self, "_no_list_tool_content_models"):
-            self._no_list_tool_content_models = set()
-        if key[1]:  # only record when we actually have a model id
-            self._no_list_tool_content_models.add(key)
+        if remember_model:
+            # Record (provider, model) so we don't relearn this lesson.
+            key = (
+                (getattr(self, "provider", "") or "").strip().lower(),
+                (getattr(self, "model", "") or "").strip(),
+            )
+            if not hasattr(self, "_no_list_tool_content_models"):
+                self._no_list_tool_content_models = set()
+            if key[1]:  # only record when we actually have a model id
+                self._no_list_tool_content_models.add(key)
 
         changed = False
         for msg in api_messages:
diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py
index 48ce2636c..61fe70193 100644
--- a/tests/run_agent/test_413_compression.py
+++ b/tests/run_agent/test_413_compression.py
@@ -227,6 +227,76 @@ class TestHTTP413Compression:
         mock_compress.assert_called_once()
         assert result["completed"] is True
 
+    def test_413_strips_vision_payloads_when_compression_cannot_reduce_messages(self, agent):
+        """If compression leaves image payloads behind, strip them and retry.
+
+        Browser vision tool results can contain base64 image parts. A 413 can
+        persist even after summarisation when the remaining recent tool result
+        still carries binary data; Hermes should evict the image payload and
+        keep the text/placeholder context instead of failing immediately.
+        """
+        err_413 = _make_413_error()
+        ok_resp = _mock_response(content="Recovered after image eviction", finish_reason="stop")
+        request_payloads = []
+
+        def _side_effect(**kwargs):
+            request_payloads.append(kwargs)
+            if len(request_payloads) == 1:
+                raise err_413
+            return ok_resp
+
+        agent.client.chat.completions.create.side_effect = _side_effect
+
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64," + ("a" * 2000)},
+        }
+        prefill = [
+            {"role": "user", "content": "please inspect this page"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_vision",
+                        "type": "function",
+                        "function": {"name": "browser_vision", "arguments": "{}"},
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_vision",
+                "name": "browser_vision",
+                "content": [
+                    {"type": "text", "text": "Screenshot of the dashboard"},
+                    image_part,
+                ],
+            },
+        ]
+
+        with (
+            patch.object(agent, "_compress_context") as mock_compress,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            # Simulate the bad production case: compression ran, but the
+            # recent vision tool message survived so message count did not drop.
+            mock_compress.side_effect = lambda msgs, *_a, **_k: (msgs, "compressed prompt")
+            result = agent.run_conversation("continue", conversation_history=prefill)
+
+        mock_compress.assert_called_once()
+        assert result["completed"] is True
+        assert result["final_response"] == "Recovered after image eviction"
+        assert len(request_payloads) == 2
+        first_tool = next(m for m in request_payloads[0]["messages"] if m.get("role") == "tool")
+        retried_tool = next(m for m in request_payloads[1]["messages"] if m.get("role") == "tool")
+        assert "Screenshot of the dashboard" in str(first_tool["content"])
+        assert "data:image" not in str(retried_tool["content"])
+        assert "Screenshot of the dashboard" in str(retried_tool["content"])
+        assert not getattr(agent, "_no_list_tool_content_models", set())
+
     def test_413_clears_conversation_history_on_persist(self, agent):
         """After 413-triggered compression, _persist_session must receive None history.