fix(agent): retry 413 after stripping vision payloads (#47339)

When text compression can't reduce a 413 request further, evict base64 image parts from tool messages and retry once instead of dead-ending with 'Payload too large and cannot compress further.' A 413 is a request-body byte-size limit, not a token limit. browser_vision screenshots (2-5MB base64 each) keep the HTTP body oversized even after aggressive summarization. The strip pass passes remember_model=False so a 413 does not poison _no_list_tool_content_models — that set is for providers that reject list-type tool content, a distinct failure mode. Cherry-picked from #47397 by Tranquil-Flow; placed onto main's current token-aware 413 recovery else branch.
2026-07-01 03:06:00 -07:00 · 2026-07-01 03:06:00 -07:00 · 122e5bc037
commit 122e5bc037
parent 2b8adb8683
3 changed files with 105 additions and 14 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -3244,6 +3244,16 @@ def run_conversation(
                        _retry.restart_with_compressed_messages = True
                        break
                    else:
+                        if agent._try_strip_image_parts_from_tool_messages(
+                            api_messages,
+                            remember_model=False,
+                        ):
+                            agent._buffer_status(
+                                "📐 Compression could not reduce the request further — "
+                                "removed retained vision payloads and retrying..."
+                            )
+                            continue
+
                        # Terminal — surface buffered context so the user
                        # sees what compression attempts were made.
                        agent._flush_status_buffer()
--- a/run_agent.py
+++ b/run_agent.py
@ -4932,17 +4932,27 @@ class AIAgent:
            max_dimension=max_dimension,
        )

-    def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool:
+    def _try_strip_image_parts_from_tool_messages(
+        self,
+        api_messages: list,
+        *,
+        remember_model: bool = True,
+    ) -> bool:
        """Downgrade list-type tool messages to text summaries in-place.

        Recovery path for providers that reject list-type tool message content
        (e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344).  Walks
        ``api_messages`` for any ``role: "tool"`` message whose ``content`` is
        a list containing image parts, replaces the content with the existing
-        text part(s) (or a minimal placeholder if none survive), and records
-        the active (provider, model) in ``self._no_list_tool_content_models``
-        so subsequent ``_tool_result_content_for_active_model`` calls in this
-        session preemptively downgrade screenshots without a round-trip.
+        text part(s) (or a minimal placeholder if none survive), and by default
+        records the active (provider, model) in
+        ``self._no_list_tool_content_models`` so subsequent
+        ``_tool_result_content_for_active_model`` calls in this session
+        preemptively downgrade screenshots without a round-trip.
+
+        413 payload-size recovery passes ``remember_model=False`` because that
+        error means this request body was too large, not that the provider/model
+        rejects list-type tool content in general.

        Returns True when at least one tool message was downgraded — the
        caller (the 400 recovery branch in ``agent.conversation_loop``) uses
@ -4952,15 +4962,16 @@ class AIAgent:
        if not isinstance(api_messages, list):
            return False

-        # Record (provider, model) so we don't relearn this lesson.
-        key = (
-            (getattr(self, "provider", "") or "").strip().lower(),
-            (getattr(self, "model", "") or "").strip(),
-        )
-        if not hasattr(self, "_no_list_tool_content_models"):
-            self._no_list_tool_content_models = set()
-        if key[1]:  # only record when we actually have a model id
-            self._no_list_tool_content_models.add(key)
+        if remember_model:
+            # Record (provider, model) so we don't relearn this lesson.
+            key = (
+                (getattr(self, "provider", "") or "").strip().lower(),
+                (getattr(self, "model", "") or "").strip(),
+            )
+            if not hasattr(self, "_no_list_tool_content_models"):
+                self._no_list_tool_content_models = set()
+            if key[1]:  # only record when we actually have a model id
+                self._no_list_tool_content_models.add(key)

        changed = False
        for msg in api_messages:
--- a/tests/run_agent/test_413_compression.py
+++ b/tests/run_agent/test_413_compression.py
@ -227,6 +227,76 @@ class TestHTTP413Compression:
        mock_compress.assert_called_once()
        assert result["completed"] is True

+    def test_413_strips_vision_payloads_when_compression_cannot_reduce_messages(self, agent):
+        """If compression leaves image payloads behind, strip them and retry.
+
+        Browser vision tool results can contain base64 image parts. A 413 can
+        persist even after summarisation when the remaining recent tool result
+        still carries binary data; Hermes should evict the image payload and
+        keep the text/placeholder context instead of failing immediately.
+        """
+        err_413 = _make_413_error()
+        ok_resp = _mock_response(content="Recovered after image eviction", finish_reason="stop")
+        request_payloads = []
+
+        def _side_effect(**kwargs):
+            request_payloads.append(kwargs)
+            if len(request_payloads) == 1:
+                raise err_413
+            return ok_resp
+
+        agent.client.chat.completions.create.side_effect = _side_effect
+
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64," + ("a" * 2000)},
+        }
+        prefill = [
+            {"role": "user", "content": "please inspect this page"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_vision",
+                        "type": "function",
+                        "function": {"name": "browser_vision", "arguments": "{}"},
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_vision",
+                "name": "browser_vision",
+                "content": [
+                    {"type": "text", "text": "Screenshot of the dashboard"},
+                    image_part,
+                ],
+            },
+        ]
+
+        with (
+            patch.object(agent, "_compress_context") as mock_compress,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            # Simulate the bad production case: compression ran, but the
+            # recent vision tool message survived so message count did not drop.
+            mock_compress.side_effect = lambda msgs, *_a, **_k: (msgs, "compressed prompt")
+            result = agent.run_conversation("continue", conversation_history=prefill)
+
+        mock_compress.assert_called_once()
+        assert result["completed"] is True
+        assert result["final_response"] == "Recovered after image eviction"
+        assert len(request_payloads) == 2
+        first_tool = next(m for m in request_payloads[0]["messages"] if m.get("role") == "tool")
+        retried_tool = next(m for m in request_payloads[1]["messages"] if m.get("role") == "tool")
+        assert "Screenshot of the dashboard" in str(first_tool["content"])
+        assert "data:image" not in str(retried_tool["content"])
+        assert "Screenshot of the dashboard" in str(retried_tool["content"])
+        assert not getattr(agent, "_no_list_tool_content_models", set())
+
    def test_413_clears_conversation_history_on_persist(self, agent):
        """After 413-triggered compression, _persist_session must receive None history.