From 8bf797f1c20f7e4acfafb4457ab3918362dc9673 Mon Sep 17 00:00:00 2001
From: Jiahui-Gu <jiahuigu@users.noreply.github.com>
Date: Tue, 26 May 2026 17:19:21 +0800
Subject: [PATCH] fix(agent): prefer native vision over auxiliary fallback in
 auto mode (#29135)

---
 agent/image_routing.py            | 34 +++++++++++++++++++------------
 tests/agent/test_image_routing.py | 34 ++++++++++++++++++++++++-------
 2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/agent/image_routing.py b/agent/image_routing.py
index 13d39675e..ba6d8da32 100644
--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
 | ``text``, default ``auto``) and the active model's capability metadata.
 
 In ``auto`` mode:
-  - If the user has explicitly configured ``auxiliary.vision.provider``
-    (i.e. not ``auto`` and not empty), we assume they want the text pipeline
-    regardless of the main model — they've opted in to a specific vision
-    backend for a reason (cost, quality, local-only, etc.).
-  - Otherwise, if the active model reports ``supports_vision=True`` in its
-    models.dev metadata, we attach natively.
-  - Otherwise (non-vision model, no explicit override), we fall back to text.
+  - If the active model reports ``supports_vision=True`` (via config
+    override or models.dev metadata), we attach natively — vision-capable
+    main models should always see the original pixels, even when an
+    auxiliary vision backend is configured. That auxiliary backend then
+    acts as a *fallback* for sessions whose main model can't take images.
+  - Otherwise, if the user has explicitly configured ``auxiliary.vision``
+    (provider/model/base_url not ``auto``/empty), we route through the
+    text pipeline so the auxiliary vision backend can describe the image
+    for the text-only main model.
+  - Otherwise (non-vision or unknown-capability model, no explicit
+    override), we fall back to text via the default vision_analyze flow.
 
 This keeps ``vision_analyze`` surfaced as a tool in every session — skills
 and agent flows that chain it (browser screenshots, deeper inspection of
@@ -342,8 +346,10 @@ def _coerce_mode(raw: Any) -> str:
 def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
     """True when the user configured a specific auxiliary vision backend.
 
-    An explicit override means the user *wants* the text pipeline (they're
-    paying for a dedicated vision model), so we don't silently bypass it.
+    An explicit override means the user has a dedicated vision backend
+    available; it's used as a *fallback* when the main model can't take
+    images natively. In ``auto`` mode, native vision on a vision-capable
+    main model still wins over this fallback — see issue #29135.
     """
     if not isinstance(cfg, dict):
         return False
@@ -432,13 +438,15 @@ def decide_image_input_mode(
     if mode_cfg == "text":
         return "text"
 
-    # auto
-    if _explicit_aux_vision_override(cfg):
-        return "text"
-
+    # auto: prefer native vision when the main model supports it. An
+    # explicit auxiliary.vision config acts as a *fallback* for text-only
+    # main models — it should not preempt native vision on a model that
+    # can natively inspect the pixels (issue #29135).
     supports = _lookup_supports_vision(provider, model, cfg)
     if supports is True:
         return "native"
+    if _explicit_aux_vision_override(cfg):
+        return "text"
     return "text"
 
 
diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py
index dfcd45af0..675823112 100644
--- a/tests/agent/test_image_routing.py
+++ b/tests/agent/test_image_routing.py
@@ -97,11 +97,21 @@ class TestDecideImageInputMode:
         with patch("agent.image_routing._lookup_supports_vision", return_value=None):
             assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text"
 
-    def test_auto_respects_aux_vision_override_even_for_vision_model(self):
-        """If the user configured a dedicated vision backend, don't bypass it."""
+    def test_auto_prefers_native_for_vision_capable_main_model_even_with_aux_configured(self):
+        """Regression #29135: vision-capable main model wins over aux fallback.
+
+        The auxiliary.vision backend is a fallback for text-only main models;
+        it must not preempt native vision on a vision-capable main model.
+        """
         cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
         with patch("agent.image_routing._lookup_supports_vision", return_value=True):
-            assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
+            assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
+
+    def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self):
+        """#29135: aux vision still acts as fallback for non-vision main models."""
+        cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
+        with patch("agent.image_routing._lookup_supports_vision", return_value=False):
+            assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text"
 
     def test_none_config_is_auto(self):
         with patch("agent.image_routing._lookup_supports_vision", return_value=True):
@@ -325,15 +335,25 @@ class TestAutoModeRespectsOverride:
         with patch("agent.models_dev.get_model_capabilities", return_value=None):
             assert decide_image_input_mode("custom", "unknown", {}) == "text"
 
-    def test_explicit_aux_vision_override_still_wins(self):
-        # If the user has configured a dedicated vision aux backend, respect
-        # it even when supports_vision: true is also set.
+    def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self):
+        # #29135: aux.vision is a fallback for text-only main models; it
+        # must NOT preempt native routing when the main model can take
+        # images directly (supports_vision: true).
         cfg = {
             "model": {"supports_vision": True},
             "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
         }
         with patch("agent.models_dev.get_model_capabilities", return_value=None):
-            assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text"
+            assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native"
+
+    def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self):
+        # #29135 counterpart: text-only main model + aux fallback → text.
+        cfg = {
+            "model": {"supports_vision": False},
+            "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
+        }
+        with patch("agent.models_dev.get_model_capabilities", return_value=None):
+            assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text"
 
 
 # ─── build_native_content_parts ──────────────────────────────────────────────