fix(agent): prefer native vision over auxiliary fallback in auto mode (#29135)

2026-05-26 17:19:21 +08:00 · 2026-05-26 17:19:21 +08:00 · 8bf797f1c2
commit 8bf797f1c2
parent b19e32c702
2 changed files with 48 additions and 20 deletions
--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
 | ``text``, default ``auto``) and the active model's capability metadata.

 In ``auto`` mode:
-  - If the user has explicitly configured ``auxiliary.vision.provider``
-    (i.e. not ``auto`` and not empty), we assume they want the text pipeline
-    regardless of the main model — they've opted in to a specific vision
-    backend for a reason (cost, quality, local-only, etc.).
-  - Otherwise, if the active model reports ``supports_vision=True`` in its
-    models.dev metadata, we attach natively.
-  - Otherwise (non-vision model, no explicit override), we fall back to text.
+  - If the active model reports ``supports_vision=True`` (via config
+    override or models.dev metadata), we attach natively — vision-capable
+    main models should always see the original pixels, even when an
+    auxiliary vision backend is configured. That auxiliary backend then
+    acts as a *fallback* for sessions whose main model can't take images.
+  - Otherwise, if the user has explicitly configured ``auxiliary.vision``
+    (provider/model/base_url not ``auto``/empty), we route through the
+    text pipeline so the auxiliary vision backend can describe the image
+    for the text-only main model.
+  - Otherwise (non-vision model, no explicit override), we fall back to
+    text via the default vision_analyze flow.

 This keeps ``vision_analyze`` surfaced as a tool in every session — skills
 and agent flows that chain it (browser screenshots, deeper inspection of
@@ -342,8 +346,10 @@ def _coerce_mode(raw: Any) -> str:
 def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
    """True when the user configured a specific auxiliary vision backend.

-    An explicit override means the user *wants* the text pipeline (they're
-    paying for a dedicated vision model), so we don't silently bypass it.
+    An explicit override means the user has a dedicated vision backend
+    available; it's used as a *fallback* when the main model can't take
+    images natively. In ``auto`` mode, native vision on a vision-capable
+    main model still wins over this fallback — see issue #29135.
    """
    if not isinstance(cfg, dict):
        return False
@@ -432,13 +438,15 @@ def decide_image_input_mode(
    if mode_cfg == "text":
        return "text"

-    # auto
-    if _explicit_aux_vision_override(cfg):
-        return "text"
-
+    # auto: prefer native vision when the main model supports it. An
+    # explicit auxiliary.vision config acts as a *fallback* for text-only
+    # main models — it should not preempt native vision on a model that
+    # can natively inspect the pixels (issue #29135).
    supports = _lookup_supports_vision(provider, model, cfg)
    if supports is True:
        return "native"
+    if _explicit_aux_vision_override(cfg):
+        return "text"
    return "text"


--- a/tests/agent/test_image_routing.py
+++ b/tests/agent/test_image_routing.py
@@ -97,11 +97,21 @@ class TestDecideImageInputMode:
        with patch("agent.image_routing._lookup_supports_vision", return_value=None):
            assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text"

-    def test_auto_respects_aux_vision_override_even_for_vision_model(self):
-        """If the user configured a dedicated vision backend, don't bypass it."""
+    def test_auto_prefers_native_for_vision_capable_main_model_even_with_aux_configured(self):
+        """Regression #29135: vision-capable main model wins over aux fallback.
+
+        ``auxiliary.vision`` is a fallback for text-only main models; it must
+        not preempt native vision on a vision-capable main model.
+        """
        cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
        with patch("agent.image_routing._lookup_supports_vision", return_value=True):
-            assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
+            assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
+
+    def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self):
+        """#29135: aux vision still acts as fallback for non-vision main models."""
+        cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
+        with patch("agent.image_routing._lookup_supports_vision", return_value=False):
+            assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text"

    def test_none_config_is_auto(self):
        with patch("agent.image_routing._lookup_supports_vision", return_value=True):
@@ -325,15 +335,25 @@ class TestAutoModeRespectsOverride:
        with patch("agent.models_dev.get_model_capabilities", return_value=None):
            assert decide_image_input_mode("custom", "unknown", {}) == "text"

-    def test_explicit_aux_vision_override_still_wins(self):
-        # If the user has configured a dedicated vision aux backend, respect
-        # it even when supports_vision: true is also set.
+    def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self):
+        # #29135: aux.vision is a fallback for text-only main models; it
+        # must NOT preempt native routing when the main model can take
+        # images directly (supports_vision: true).
        cfg = {
            "model": {"supports_vision": True},
            "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
        }
        with patch("agent.models_dev.get_model_capabilities", return_value=None):
-            assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text"
+            assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native"
+
+    def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self):
+        # #29135 counterpart: text-only main model + aux fallback → text.
+        cfg = {
+            "model": {"supports_vision": False},
+            "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
+        }
+        with patch("agent.models_dev.get_model_capabilities", return_value=None):
+            assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text"


 # ─── build_native_content_parts ──────────────────────────────────────────────