From 149641485c7f5bcb33f6acf11544f0a816d8b054 Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Sun, 28 Jun 2026 01:41:23 +0800 Subject: [PATCH] fix(vision): read auxiliary model from config.yaml before env var The handlers for vision_analyze and video_analyze now read the model name from config.yaml (auxiliary.vision.model / auxiliary.video.model) before falling back to the AUXILIARY_VISION_MODEL / AUXILIARY_VIDEO_MODEL env vars. For video, auxiliary.video.model falls back to auxiliary.vision.model before any env var is consulted. Matches the existing config-first pattern for timeout and temperature in the same file. Fixes #53749 --- tests/tools/test_vision_tools.py | 42 ++++++++++++++++++++++++++++++++ tools/vision_tools.py | 27 ++++++++++++++++++-- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index 98bdd2276..47d4b6481 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -261,6 +261,48 @@ class TestHandleVisionAnalyze: # (the centralized call_llm router picks the default) assert model is None + def test_config_yaml_model_takes_priority_over_env(self): + """config.yaml auxiliary.vision.model should be preferred over env var.""" + with ( + patch( + "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock + ) as mock_tool, + patch( + "hermes_cli.config.load_config", + return_value={"auxiliary": {"vision": {"model": "qwen3.7-plus"}}}, + ), + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "env-model"}), + ): + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] # third positional arg + assert model == "qwen3.7-plus" + + def test_env_var_used_when_config_missing_model(self): + """Env var should be used when config.yaml has no auxiliary.vision.model.""" + with ( + patch( + "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock + ) as mock_tool, + patch( + 
"hermes_cli.config.load_config", + return_value={"auxiliary": {"vision": {}}}, + ), + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "fallback-model"}), + ): + mock_tool.return_value = json.dumps({"result": "ok"}) + coro = _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + coro.close() + call_args = mock_tool.call_args + model = call_args[0][2] + assert model == "fallback-model" + def test_empty_args_graceful(self): """Missing keys should default to empty strings, not raise.""" with patch( diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 23273483e..6b67abec9 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -1356,7 +1356,18 @@ async def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> str: "Fully describe and explain everything about this image, then answer the " f"following question:\n\n{question}" ) - model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + # Prefer config.yaml auxiliary.vision.model; env var is a legacy fallback. + model = None + try: + from hermes_cli.config import cfg_get, load_config + _cfg = load_config() + _vmodel = cfg_get(_cfg, "auxiliary", "vision", "model") + if _vmodel: + model = str(_vmodel).strip() or None + except Exception: + pass + if not model: + model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return await vision_analyze_tool(image_url, full_prompt, model) @@ -1718,7 +1729,19 @@ def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: "including visual content, motion, audio cues, text overlays, and scene " f"transitions. Then answer the following question:\n\n{question}" ) - model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + # Prefer config.yaml auxiliary.video.model (falling back to vision); + # env vars are a legacy fallback. 
+ model = None + try: + from hermes_cli.config import cfg_get, load_config + _cfg = load_config() + _vmodel = cfg_get(_cfg, "auxiliary", "video", "model") or cfg_get(_cfg, "auxiliary", "vision", "model") + if _vmodel: + model = str(_vmodel).strip() or None + except Exception: + pass + if not model: + model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return video_analyze_tool(video_url, full_prompt, model)