fix(agent): prefer native vision over auxiliary fallback in auto mode (#29135)

This commit is contained in:
Jiahui-Gu 2026-05-26 17:19:21 +08:00 committed by Teknium
parent b19e32c702
commit 8bf797f1c2
2 changed files with 48 additions and 20 deletions

View file

@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
| ``text``, default ``auto``) and the active model's capability metadata.
In ``auto`` mode:
- If the user has explicitly configured ``auxiliary.vision.provider``
(i.e. not ``auto`` and not empty), we assume they want the text pipeline
regardless of the main model — they've opted in to a specific vision
backend for a reason (cost, quality, local-only, etc.).
- Otherwise, if the active model reports ``supports_vision=True`` in its
models.dev metadata, we attach natively.
- Otherwise (non-vision model, no explicit override), we fall back to text.
- If the active model reports ``supports_vision=True`` (via config
override or models.dev metadata), we attach natively — vision-capable
main models should always see the original pixels, even when an
auxiliary vision backend is configured. That auxiliary backend then
acts as a *fallback* for sessions whose main model can't take images.
- Otherwise, if the user has explicitly configured ``auxiliary.vision``
(provider/model/base_url not ``auto``/empty), we route through the
text pipeline so the auxiliary vision backend can describe the image
for the text-only main model.
- Otherwise (non-vision model, no explicit override), we fall back to
text via the default vision_analyze flow.
This keeps ``vision_analyze`` surfaced as a tool in every session — skills
and agent flows that chain it (browser screenshots, deeper inspection of
@ -342,8 +346,10 @@ def _coerce_mode(raw: Any) -> str:
def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
"""True when the user configured a specific auxiliary vision backend.
An explicit override means the user *wants* the text pipeline (they're
paying for a dedicated vision model), so we don't silently bypass it.
An explicit override means the user has a dedicated vision backend
available; it's used as a *fallback* when the main model can't take
images natively. In ``auto`` mode, native vision on a vision-capable
main model still wins over this fallback — see issue #29135.
"""
if not isinstance(cfg, dict):
return False
@ -432,13 +438,15 @@ def decide_image_input_mode(
if mode_cfg == "text":
return "text"
# auto
if _explicit_aux_vision_override(cfg):
return "text"
# auto: prefer native vision when the main model supports it. An
# explicit auxiliary.vision config acts as a *fallback* for text-only
# main models — it should not preempt native vision on a model that
# can natively inspect the pixels (issue #29135).
supports = _lookup_supports_vision(provider, model, cfg)
if supports is True:
return "native"
if _explicit_aux_vision_override(cfg):
return "text"
return "text"

View file

@ -97,11 +97,21 @@ class TestDecideImageInputMode:
with patch("agent.image_routing._lookup_supports_vision", return_value=None):
assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text"
def test_auto_respects_aux_vision_override_even_for_vision_model(self):
"""If the user configured a dedicated vision backend, don't bypass it."""
def test_auto_prefers_native_for_vision_capable_main_model_even_with_aux_configured(self):
"""Regression #29135: vision-capable main model wins over aux fallback.
Auxiliary.vision is a fallback for text-only main models; it must
not preempt native vision on a vision-capable main model.
"""
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self):
"""#29135: aux vision still acts as fallback for non-vision main models."""
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
with patch("agent.image_routing._lookup_supports_vision", return_value=False):
assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text"
def test_none_config_is_auto(self):
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
@ -325,15 +335,25 @@ class TestAutoModeRespectsOverride:
with patch("agent.models_dev.get_model_capabilities", return_value=None):
assert decide_image_input_mode("custom", "unknown", {}) == "text"
def test_explicit_aux_vision_override_still_wins(self):
# If the user has configured a dedicated vision aux backend, respect
# it even when supports_vision: true is also set.
def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self):
# #29135: aux.vision is a fallback for text-only main models; it
# must NOT preempt native routing when the main model can take
# images directly (supports_vision: true).
cfg = {
"model": {"supports_vision": True},
"auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
}
with patch("agent.models_dev.get_model_capabilities", return_value=None):
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text"
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native"
def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self):
# #29135 counterpart: text-only main model + aux fallback → text.
cfg = {
"model": {"supports_vision": False},
"auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
}
with patch("agent.models_dev.get_model_capabilities", return_value=None):
assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text"
# ─── build_native_content_parts ──────────────────────────────────────────────