From 8bf797f1c20f7e4acfafb4457ab3918362dc9673 Mon Sep 17 00:00:00 2001 From: Jiahui-Gu Date: Tue, 26 May 2026 17:19:21 +0800 Subject: [PATCH] fix(agent): prefer native vision over auxiliary fallback in auto mode (#29135) --- agent/image_routing.py | 34 +++++++++++++++++++------------ tests/agent/test_image_routing.py | 34 ++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/agent/image_routing.py b/agent/image_routing.py index 13d39675e..ba6d8da32 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native`` | ``text``, default ``auto``) and the active model's capability metadata. In ``auto`` mode: - - If the user has explicitly configured ``auxiliary.vision.provider`` - (i.e. not ``auto`` and not empty), we assume they want the text pipeline - regardless of the main model — they've opted in to a specific vision - backend for a reason (cost, quality, local-only, etc.). - - Otherwise, if the active model reports ``supports_vision=True`` in its - models.dev metadata, we attach natively. - - Otherwise (non-vision model, no explicit override), we fall back to text. + - If the active model reports ``supports_vision=True`` (via config + override or models.dev metadata), we attach natively — vision-capable + main models should always see the original pixels, even when an + auxiliary vision backend is configured. That auxiliary backend then + acts as a *fallback* for sessions whose main model can't take images. + - Otherwise, if the user has explicitly configured ``auxiliary.vision`` + (provider/model/base_url not ``auto``/empty), we route through the + text pipeline so the auxiliary vision backend can describe the image + for the text-only main model. + - Otherwise (non-vision model, no explicit override), we fall back to + text via the default vision_analyze flow. This keeps ``vision_analyze`` surfaced as a tool in every session — skills and agent flows that chain it (browser screenshots, deeper inspection of @@ -342,8 +346,10 @@ def _coerce_mode(raw: Any) -> str: def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool: """True when the user configured a specific auxiliary vision backend. - An explicit override means the user *wants* the text pipeline (they're - paying for a dedicated vision model), so we don't silently bypass it. + An explicit override means the user has a dedicated vision backend + available; it's used as a *fallback* when the main model can't take + images natively. In ``auto`` mode, native vision on a vision-capable + main model still wins over this fallback — see issue #29135. """ if not isinstance(cfg, dict): return False @@ -432,13 +438,15 @@ def decide_image_input_mode( if mode_cfg == "text": return "text" - # auto - if _explicit_aux_vision_override(cfg): - return "text" - + # auto: prefer native vision when the main model supports it. An + # explicit auxiliary.vision config acts as a *fallback* for text-only + # main models — it should not preempt native vision on a model that + # can natively inspect the pixels (issue #29135). supports = _lookup_supports_vision(provider, model, cfg) if supports is True: return "native" + if _explicit_aux_vision_override(cfg): + return "text" return "text" diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index dfcd45af0..675823112 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -97,11 +97,21 @@ class TestDecideImageInputMode: with patch("agent.image_routing._lookup_supports_vision", return_value=None): assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text" - def test_auto_respects_aux_vision_override_even_for_vision_model(self): - """If the user configured a dedicated vision backend, don't bypass it.""" + def test_auto_prefers_native_for_vision_capable_main_model_even_with_aux_configured(self): + """Regression #29135: vision-capable main model wins over aux fallback. + + Auxiliary.vision is a fallback for text-only main models; it must + not preempt native vision on a vision-capable main model. + """ cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}} with patch("agent.image_routing._lookup_supports_vision", return_value=True): - assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text" + assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native" + + def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self): + """#29135: aux vision still acts as fallback for non-vision main models.""" + cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}} + with patch("agent.image_routing._lookup_supports_vision", return_value=False): + assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text" def test_none_config_is_auto(self): with patch("agent.image_routing._lookup_supports_vision", return_value=True): @@ -325,15 +335,25 @@ class TestAutoModeRespectsOverride: with patch("agent.models_dev.get_model_capabilities", return_value=None): assert decide_image_input_mode("custom", "unknown", {}) == "text" - def test_explicit_aux_vision_override_still_wins(self): - # If the user has configured a dedicated vision aux backend, respect - # it even when supports_vision: true is also set. + def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self): + # #29135: aux.vision is a fallback for text-only main models; it + # must NOT preempt native routing when the main model can take + # images directly (supports_vision: true). cfg = { "model": {"supports_vision": True}, "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}}, } with patch("agent.models_dev.get_model_capabilities", return_value=None): - assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text" + assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native" + + def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self): + # #29135 counterpart: text-only main model + aux fallback → text. + cfg = { + "model": {"supports_vision": False}, + "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}}, + } + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text" # ─── build_native_content_parts ──────────────────────────────────────────────