fix(agent): prefer native vision over auxiliary fallback in auto mode (#29135)
This commit is contained in:
parent
b19e32c702
commit
8bf797f1c2
2 changed files with 48 additions and 20 deletions
|
|
@@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
|
|||
| ``text``, default ``auto``) and the active model's capability metadata.
|
||||
|
||||
In ``auto`` mode:
|
||||
- If the user has explicitly configured ``auxiliary.vision.provider``
|
||||
(i.e. not ``auto`` and not empty), we assume they want the text pipeline
|
||||
regardless of the main model — they've opted in to a specific vision
|
||||
backend for a reason (cost, quality, local-only, etc.).
|
||||
- Otherwise, if the active model reports ``supports_vision=True`` in its
|
||||
models.dev metadata, we attach natively.
|
||||
- Otherwise (non-vision model, no explicit override), we fall back to text.
|
||||
- If the active model reports ``supports_vision=True`` (via config
|
||||
override or models.dev metadata), we attach natively — vision-capable
|
||||
main models should always see the original pixels, even when an
|
||||
auxiliary vision backend is configured. That auxiliary backend then
|
||||
acts as a *fallback* for sessions whose main model can't take images.
|
||||
- Otherwise, if the user has explicitly configured ``auxiliary.vision``
|
||||
(provider/model/base_url not ``auto``/empty), we route through the
|
||||
text pipeline so the auxiliary vision backend can describe the image
|
||||
for the text-only main model.
|
||||
- Otherwise (non-vision model, no explicit override), we fall back to
|
||||
text via the default vision_analyze flow.
|
||||
|
||||
This keeps ``vision_analyze`` surfaced as a tool in every session — skills
|
||||
and agent flows that chain it (browser screenshots, deeper inspection of
|
||||
|
|
@@ -342,8 +346,10 @@ def _coerce_mode(raw: Any) -> str:
|
|||
def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
|
||||
"""True when the user configured a specific auxiliary vision backend.
|
||||
|
||||
An explicit override means the user *wants* the text pipeline (they're
|
||||
paying for a dedicated vision model), so we don't silently bypass it.
|
||||
An explicit override means the user has a dedicated vision backend
|
||||
available; it's used as a *fallback* when the main model can't take
|
||||
images natively. In ``auto`` mode, native vision on a vision-capable
|
||||
main model still wins over this fallback — see issue #29135.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
return False
|
||||
|
|
@@ -432,13 +438,15 @@ def decide_image_input_mode(
|
|||
if mode_cfg == "text":
|
||||
return "text"
|
||||
|
||||
# auto
|
||||
if _explicit_aux_vision_override(cfg):
|
||||
return "text"
|
||||
|
||||
# auto: prefer native vision when the main model supports it. An
|
||||
# explicit auxiliary.vision config acts as a *fallback* for text-only
|
||||
# main models — it should not preempt native vision on a model that
|
||||
# can natively inspect the pixels (issue #29135).
|
||||
supports = _lookup_supports_vision(provider, model, cfg)
|
||||
if supports is True:
|
||||
return "native"
|
||||
if _explicit_aux_vision_override(cfg):
|
||||
return "text"
|
||||
return "text"
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -97,11 +97,21 @@ class TestDecideImageInputMode:
|
|||
with patch("agent.image_routing._lookup_supports_vision", return_value=None):
|
||||
assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text"
|
||||
|
||||
def test_auto_respects_aux_vision_override_even_for_vision_model(self):
|
||||
"""If the user configured a dedicated vision backend, don't bypass it."""
|
||||
def test_auto_prefers_native_for_vision_capable_main_model_even_with_aux_configured(self):
|
||||
"""Regression #29135: vision-capable main model wins over aux fallback.
|
||||
|
||||
``auxiliary.vision`` is a fallback for text-only main models; it must
|
||||
not preempt native vision on a vision-capable main model.
|
||||
"""
|
||||
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
|
||||
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
||||
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
|
||||
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
|
||||
|
||||
def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self):
|
||||
"""#29135: aux vision still acts as fallback for non-vision main models."""
|
||||
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
|
||||
with patch("agent.image_routing._lookup_supports_vision", return_value=False):
|
||||
assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text"
|
||||
|
||||
def test_none_config_is_auto(self):
|
||||
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
||||
|
|
@@ -325,15 +335,25 @@ class TestAutoModeRespectsOverride:
|
|||
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||
assert decide_image_input_mode("custom", "unknown", {}) == "text"
|
||||
|
||||
def test_explicit_aux_vision_override_still_wins(self):
|
||||
# If the user has configured a dedicated vision aux backend, respect
|
||||
# it even when supports_vision: true is also set.
|
||||
def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self):
|
||||
# #29135: aux.vision is a fallback for text-only main models; it
|
||||
# must NOT preempt native routing when the main model can take
|
||||
# images directly (supports_vision: true).
|
||||
cfg = {
|
||||
"model": {"supports_vision": True},
|
||||
"auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
|
||||
}
|
||||
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text"
|
||||
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native"
|
||||
|
||||
def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self):
|
||||
# #29135 counterpart: text-only main model + aux fallback → text.
|
||||
cfg = {
|
||||
"model": {"supports_vision": False},
|
||||
"auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
|
||||
}
|
||||
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||
assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text"
|
||||
|
||||
|
||||
# ─── build_native_content_parts ──────────────────────────────────────────────
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue