From b9a197ec59393ebd13b897ed342749a9228dd86d Mon Sep 17 00:00:00 2001 From: kshitijk4poor Date: Fri, 3 Jul 2026 03:27:13 +0530 Subject: [PATCH] fix(agent): resolve review findings on vLLM local-context salvage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Salvage review of #56431 surfaced one Critical + two Warning issues; fix them on top of the contributor's cherry-picked commits: 1. Critical — duplicate non-agentic warning on the interactive CLI. The new agent_init warning fires on every platform, but cli.py show_banner() already warns on CLI (richer output + /model hint), so a CLI user saw the warning twice per startup. Guard the agent_init emit to skip platform=="cli" — it now fills exactly the gateway/TUI gap the PR intended, no duplication. 2. Warning — vLLM error-parse regex under-matched. The patterns required a literal space before the number, so "max_model_len: 32768", "=32768", "(32768)", and "... is 32768" all returned None. Broaden both patterns to accept :/=/(/ 'is' delimiters. Add a parametrized test over all delimiter variants. 3. Warning — per-call live probe latency on local endpoints. The new reconcile-on-hit + pre-defaults step-7 probe made every local resolution fire a synchronous network probe (banner + /model switch + compressor update_model each within one startup). Add a 30s in-process TTL cache keyed by (model, base_url) around _query_local_context_length so back-to- back resolutions reuse one round-trip; not persisted to disk, so the reconcile freshness contract (re-probe after restart) is preserved. Add an autouse fixture clearing the cache between tests + TTL coverage. Tests: 148 passed (was 138). ruff clean. --- agent/agent_init.py | 6 +- agent/model_metadata.py | 40 ++++++++++- tests/agent/test_model_metadata.py | 18 +++++ tests/agent/test_model_metadata_local_ctx.py | 70 ++++++++++++++++++++ 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/agent/agent_init.py b/agent/agent_init.py index 64c77dc35..29041308a 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1722,8 +1722,10 @@ def init_agent( ) # Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the - # warning on every platform (CLI already did this; gateway/TUI did not). - if not agent.quiet_mode: + # warning on gateway/TUI. The interactive CLI already warns via + # cli.py show_banner() (richer output + /model switch hint), so skip the + # CLI platform here to avoid emitting the warning twice per startup. + if not agent.quiet_mode and (agent.platform or "cli") != "cli": try: from hermes_cli.model_switch import _check_hermes_model_warning diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 4b87f574c..28d8e1d9c 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -184,6 +184,15 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0] # Sessions, model switches, and cron jobs should reject models below this. MINIMUM_CONTEXT_LENGTH = 64_000 +# Short-lived in-process cache for local-server context probes. Bounds the +# probe rate when the new local-endpoint live-probe paths (reconcile-on-hit + +# pre-defaults step 7) resolve the same model several times during one startup +# (banner, /model switch, compressor update_model). Keyed by (model, base_url); +# values are (result, monotonic_timestamp). Not persisted to disk — cross- +# restart freshness is handled by the reconcile logic re-probing after expiry. +_LOCAL_CTX_PROBE_TTL_SECONDS = 30.0 +_LOCAL_CTX_PROBE_CACHE: Dict[tuple, tuple] = {} + # Thin fallback defaults — only broad model family patterns. # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic # all miss. Replaced the previous 80+ entry dict. @@ -1068,8 +1077,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]: error_lower = error_msg.lower() # Pattern: look for numbers near context-related keywords patterns = [ - r'max_model_len\s+(\d{4,})', # vLLM: "exceeds the max_model_len 32768" - r'maximum model length\s+(\d{4,})', # vLLM alt: "exceeds maximum model length 131072" + r'max_model_len\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM: "max_model_len 32768", "=32768", ": 32768", "(32768)", "is 32768" + r'maximum model length\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM alt: "maximum model length 131072", "... is 131072" r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})', r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})', r'(\d{4,})\s*(?:token)?\s*(?:context|limit)', @@ -1515,6 +1524,33 @@ def _model_name_suggests_grok_4_3(model: str) -> bool: def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]: + """Query a local server for the model's context length (short-TTL cached). + + The live-probe paths added for local endpoints (reconcile-on-hit and the + pre-defaults step-7 probe) can fire this function several times in quick + succession during one startup — banner display, ``/model`` switch, + compressor ``update_model`` all resolve the same model. Each raw probe + issues synchronous ``detect_local_server_type`` + query HTTP calls (bounded + by the 3s httpx timeout), so an unreachable/slow local server would pay + that cost repeatedly. A tiny in-process TTL cache collapses back-to-back + probes for the same (model, base_url) into one network round-trip without + persisting anything to disk (freshness across restarts is still handled by + the reconcile logic, which probes again once the TTL expires). + """ + import time as _time + + cache_key = (_strip_provider_prefix(model), base_url.rstrip("/")) + now = _time.monotonic() + cached = _LOCAL_CTX_PROBE_CACHE.get(cache_key) + if cached is not None and (now - cached[1]) < _LOCAL_CTX_PROBE_TTL_SECONDS: + return cached[0] + + result = _query_local_context_length_uncached(model, base_url, api_key=api_key) + _LOCAL_CTX_PROBE_CACHE[cache_key] = (result, now) + return result + + +def _query_local_context_length_uncached(model: str, base_url: str, api_key: str = "") -> Optional[int]: """Query a local server for the model's context length.""" import httpx diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index 82356260a..d450580cd 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -12,6 +12,7 @@ Coverage levels: import time +import pytest import yaml from unittest.mock import patch, MagicMock @@ -1384,6 +1385,23 @@ class TestParseContextLimitFromError: msg = "prompt length 200000 exceeds maximum model length 131072" assert parse_context_limit_from_error(msg) == 131072 + @pytest.mark.parametrize("msg,expected", [ + ("max_model_len 32768", 32768), + ("max_model_len: 32768", 32768), + ("max_model_len=32768", 32768), + ("max_model_len (32768)", 32768), + ("max_model_len is 32768", 32768), + ("maximum model length 131072", 131072), + ("maximum model length is 131072", 131072), + ("maximum model length: 131072", 131072), + ]) + def test_vllm_delimiter_variants(self, msg, expected): + """vLLM emits the limit with various delimiters (space/colon/equals/ + paren/'is'). The parser must catch all of them — the original + space-only patterns silently missed ':', '=', '(' and 'is' forms and + fell through to None.""" + assert parse_context_limit_from_error(msg) == expected + def test_get_context_length_from_vllm_max_model_len_error(self): from agent.model_metadata import get_context_length_from_provider_error diff --git a/tests/agent/test_model_metadata_local_ctx.py b/tests/agent/test_model_metadata_local_ctx.py index fec729c6a..f2250f204 100644 --- a/tests/agent/test_model_metadata_local_ctx.py +++ b/tests/agent/test_model_metadata_local_ctx.py @@ -8,9 +8,27 @@ import sys import os from unittest.mock import MagicMock, patch +import pytest + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +@pytest.fixture(autouse=True) +def _clear_local_ctx_probe_cache(): + """Reset the in-process local-probe TTL cache around every test. + + _query_local_context_length memoizes probes per (model, base_url) for a + short TTL to bound the probe rate on hot paths. In tests that mock httpx + to return different responses for the same (model, base_url), a stale + cache entry would leak across cases — clear it before and after each test. + """ + import agent.model_metadata as _mm + + _mm._LOCAL_CTX_PROBE_CACHE.clear() + yield + _mm._LOCAL_CTX_PROBE_CACHE.clear() + + # --------------------------------------------------------------------------- # _query_local_context_length — unit tests with mocked httpx @@ -732,3 +750,55 @@ class TestGetModelContextLengthLocalFallback: result = get_model_context_length("unknown-xyz-model", "") mock_query.assert_not_called() + + +class TestLocalContextProbeTTLCache: + """The in-process TTL cache collapses back-to-back probes for the same + (model, base_url) into one network round-trip (bounds probe rate on hot + paths like banner + /model switch + compressor update within one startup), + while a different key still probes.""" + + def _make_resp(self, status_code, body): + resp = MagicMock() + resp.status_code = status_code + resp.json.return_value = body + return resp + + def test_second_call_within_ttl_does_not_reprobe(self): + from agent.model_metadata import _query_local_context_length + + show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}}) + models_resp = self._make_resp(404, {}) + client_mock = MagicMock() + client_mock.__enter__ = lambda s: client_mock + client_mock.__exit__ = MagicMock(return_value=False) + client_mock.post.return_value = show_resp + client_mock.get.return_value = models_resp + + with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \ + patch("httpx.Client", return_value=client_mock): + first = _query_local_context_length("m", "http://localhost:11434/v1") + second = _query_local_context_length("m", "http://localhost:11434/v1") + + assert first == 32768 + assert second == 32768 + # Only the first call hits the network; the second is served from cache. + assert detect.call_count == 1 + + def test_different_key_still_probes(self): + from agent.model_metadata import _query_local_context_length + + show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}}) + models_resp = self._make_resp(404, {}) + client_mock = MagicMock() + client_mock.__enter__ = lambda s: client_mock + client_mock.__exit__ = MagicMock(return_value=False) + client_mock.post.return_value = show_resp + client_mock.get.return_value = models_resp + + with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \ + patch("httpx.Client", return_value=client_mock): + _query_local_context_length("m1", "http://localhost:11434/v1") + _query_local_context_length("m2", "http://localhost:11434/v1") + + assert detect.call_count == 2