From 149641485c7f5bcb33f6acf11544f0a816d8b054 Mon Sep 17 00:00:00 2001
From: liuhao1024 <sunsky.lau@gmail.com>
Date: Sun, 28 Jun 2026 01:41:23 +0800
Subject: [PATCH] fix(vision): read auxiliary model from config.yaml before env
 var

The _handle_vision_analyze and _handle_video_analyze handlers now read
the model name from config.yaml (auxiliary.vision.model and
auxiliary.video.model; the video handler also falls back to
auxiliary.vision.model) before falling back to the
AUXILIARY_VISION_MODEL / AUXILIARY_VIDEO_MODEL env vars.  This matches
the existing config-first pattern for timeout and temperature in the
same file.

Fixes #53749
---
 tests/tools/test_vision_tools.py | 42 ++++++++++++++++++++++++++++++++
 tools/vision_tools.py            | 27 ++++++++++++++++++--
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py
index 98bdd2276..47d4b6481 100644
--- a/tests/tools/test_vision_tools.py
+++ b/tests/tools/test_vision_tools.py
@@ -261,6 +261,48 @@ class TestHandleVisionAnalyze:
             # (the centralized call_llm router picks the default)
             assert model is None
 
+    def test_config_yaml_model_takes_priority_over_env(self):
+        """config.yaml auxiliary.vision.model should be preferred over env var."""
+        import asyncio  # local import: only needed to drive the async handler
+        with (
+            patch(
+                "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
+            ) as mock_tool,
+            patch(
+                "hermes_cli.config.load_config",
+                return_value={"auxiliary": {"vision": {"model": "qwen3.7-plus"}}},
+            ),
+            patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "env-model"}),
+        ):
+            mock_tool.return_value = json.dumps({"result": "ok"})
+            coro = _handle_vision_analyze(
+                {"image_url": "https://example.com/img.png", "question": "test"}
+            )
+            asyncio.run(coro)  # async def handler: body only runs when awaited
+            model = mock_tool.call_args[0][2]  # third positional arg
+            assert model == "qwen3.7-plus"
+
+    def test_env_var_used_when_config_missing_model(self):
+        """Env var should be used when config.yaml has no auxiliary.vision.model."""
+        import asyncio  # local import: only needed to drive the async handler
+        with (
+            patch(
+                "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
+            ) as mock_tool,
+            patch(
+                "hermes_cli.config.load_config",
+                return_value={"auxiliary": {"vision": {}}},
+            ),
+            patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "fallback-model"}),
+        ):
+            mock_tool.return_value = json.dumps({"result": "ok"})
+            coro = _handle_vision_analyze(
+                {"image_url": "https://example.com/img.png", "question": "test"}
+            )
+            asyncio.run(coro)  # async def handler: body only runs when awaited
+            model = mock_tool.call_args[0][2]  # third positional arg
+            assert model == "fallback-model"
+
     def test_empty_args_graceful(self):
         """Missing keys should default to empty strings, not raise."""
         with patch(
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index 23273483e..6b67abec9 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -1356,7 +1356,18 @@ async def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> str:
         "Fully describe and explain everything about this image, then answer the "
         f"following question:\n\n{question}"
     )
-    model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
+    # Prefer config.yaml auxiliary.vision.model; env var is a legacy fallback.
+    model = None
+    try:
+        from hermes_cli.config import cfg_get, load_config
+        _cfg = load_config()
+        _vmodel = cfg_get(_cfg, "auxiliary", "vision", "model")
+        if _vmodel:
+            model = str(_vmodel).strip() or None
+    except Exception:
+        pass
+    if not model:
+        model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
     return await vision_analyze_tool(image_url, full_prompt, model)
 
 
@@ -1718,7 +1729,19 @@ def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
         "including visual content, motion, audio cues, text overlays, and scene "
         f"transitions. Then answer the following question:\n\n{question}"
     )
-    model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
+    # Prefer config.yaml auxiliary.video.model (falling back to vision);
+    # env vars are a legacy fallback used only when config sets no model.
+    model = None
+    try:
+        from hermes_cli.config import cfg_get, load_config
+        _cfg = load_config()
+        _vmodel = cfg_get(_cfg, "auxiliary", "video", "model") or cfg_get(_cfg, "auxiliary", "vision", "model")
+        if _vmodel:
+            model = str(_vmodel).strip() or None
+    except Exception:
+        pass
+    if not model:
+        model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
     return video_analyze_tool(video_url, full_prompt, model)