From 122e5bc0373e8df65db751d5d820445aec275a95 Mon Sep 17 00:00:00 2001 From: Tranquil-Flow <66773372+Tranquil-Flow@users.noreply.github.com> Date: Wed, 1 Jul 2026 03:06:00 -0700 Subject: [PATCH] fix(agent): retry 413 after stripping vision payloads (#47339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When text compression can't reduce a 413 request further, evict base64 image parts from tool messages and retry once instead of dead-ending with 'Payload too large and cannot compress further.' A 413 is a request-body byte-size limit, not a token limit. browser_vision screenshots (2-5MB base64 each) keep the HTTP body oversized even after aggressive summarization. The strip pass passes remember_model=False so a 413 does not poison _no_list_tool_content_models — that set is for providers that reject list-type tool content, a distinct failure mode. Cherry-picked from #47397 by Tranquil-Flow; placed onto main's current token-aware 413 recovery else branch. --- agent/conversation_loop.py | 10 ++++ run_agent.py | 39 +++++++++----- tests/run_agent/test_413_compression.py | 70 +++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 14 deletions(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 7c85ae8ff..a60af85c8 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -3244,6 +3244,16 @@ def run_conversation( _retry.restart_with_compressed_messages = True break else: + if agent._try_strip_image_parts_from_tool_messages( + api_messages, + remember_model=False, + ): + agent._buffer_status( + "📐 Compression could not reduce the request further — " + "removed retained vision payloads and retrying..." + ) + continue + # Terminal — surface buffered context so the user # sees what compression attempts were made. agent._flush_status_buffer() diff --git a/run_agent.py b/run_agent.py index 9b8078a51..319f13ebb 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4932,17 +4932,27 @@ class AIAgent: max_dimension=max_dimension, ) - def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool: + def _try_strip_image_parts_from_tool_messages( + self, + api_messages: list, + *, + remember_model: bool = True, + ) -> bool: """Downgrade list-type tool messages to text summaries in-place. Recovery path for providers that reject list-type tool message content (e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344). Walks ``api_messages`` for any ``role: "tool"`` message whose ``content`` is a list containing image parts, replaces the content with the existing - text part(s) (or a minimal placeholder if none survive), and records - the active (provider, model) in ``self._no_list_tool_content_models`` - so subsequent ``_tool_result_content_for_active_model`` calls in this - session preemptively downgrade screenshots without a round-trip. + text part(s) (or a minimal placeholder if none survive), and by default + records the active (provider, model) in + ``self._no_list_tool_content_models`` so subsequent + ``_tool_result_content_for_active_model`` calls in this session + preemptively downgrade screenshots without a round-trip. + + 413 payload-size recovery passes ``remember_model=False`` because that + error means this request body was too large, not that the provider/model + rejects list-type tool content in general. Returns True when at least one tool message was downgraded — the caller (the 400 recovery branch in ``agent.conversation_loop``) uses @@ -4952,15 +4962,16 @@ class AIAgent: if not isinstance(api_messages, list): return False - # Record (provider, model) so we don't relearn this lesson. - key = ( - (getattr(self, "provider", "") or "").strip().lower(), - (getattr(self, "model", "") or "").strip(), - ) - if not hasattr(self, "_no_list_tool_content_models"): - self._no_list_tool_content_models = set() - if key[1]: # only record when we actually have a model id - self._no_list_tool_content_models.add(key) + if remember_model: + # Record (provider, model) so we don't relearn this lesson. + key = ( + (getattr(self, "provider", "") or "").strip().lower(), + (getattr(self, "model", "") or "").strip(), + ) + if not hasattr(self, "_no_list_tool_content_models"): + self._no_list_tool_content_models = set() + if key[1]: # only record when we actually have a model id + self._no_list_tool_content_models.add(key) changed = False for msg in api_messages: diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index 48ce2636c..61fe70193 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -227,6 +227,76 @@ class TestHTTP413Compression: mock_compress.assert_called_once() assert result["completed"] is True + def test_413_strips_vision_payloads_when_compression_cannot_reduce_messages(self, agent): + """If compression leaves image payloads behind, strip them and retry. + + Browser vision tool results can contain base64 image parts. A 413 can + persist even after summarisation when the remaining recent tool result + still carries binary data; Hermes should evict the image payload and + keep the text/placeholder context instead of failing immediately. + """ + err_413 = _make_413_error() + ok_resp = _mock_response(content="Recovered after image eviction", finish_reason="stop") + request_payloads = [] + + def _side_effect(**kwargs): + request_payloads.append(kwargs) + if len(request_payloads) == 1: + raise err_413 + return ok_resp + + agent.client.chat.completions.create.side_effect = _side_effect + + image_part = { + "type": "image_url", + "image_url": {"url": "data:image/png;base64," + ("a" * 2000)}, + } + prefill = [ + {"role": "user", "content": "please inspect this page"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_vision", + "type": "function", + "function": {"name": "browser_vision", "arguments": "{}"}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_vision", + "name": "browser_vision", + "content": [ + {"type": "text", "text": "Screenshot of the dashboard"}, + image_part, + ], + }, + ] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + # Simulate the bad production case: compression ran, but the + # recent vision tool message survived so message count did not drop. + mock_compress.side_effect = lambda msgs, *_a, **_k: (msgs, "compressed prompt") + result = agent.run_conversation("continue", conversation_history=prefill) + + mock_compress.assert_called_once() + assert result["completed"] is True + assert result["final_response"] == "Recovered after image eviction" + assert len(request_payloads) == 2 + first_tool = next(m for m in request_payloads[0]["messages"] if m.get("role") == "tool") + retried_tool = next(m for m in request_payloads[1]["messages"] if m.get("role") == "tool") + assert "Screenshot of the dashboard" in str(first_tool["content"]) + assert "data:image" not in str(retried_tool["content"]) + assert "Screenshot of the dashboard" in str(retried_tool["content"]) + assert not getattr(agent, "_no_list_tool_content_models", set()) + def test_413_clears_conversation_history_on_persist(self, agent): """After 413-triggered compression, _persist_session must receive None history.