fix(agent): resolve review findings on vLLM local-context salvage

Salvage review of #56431 surfaced one Critical + two Warning issues; fix them on top of the contributor's cherry-picked commits: 1. Critical — duplicate non-agentic warning on the interactive CLI. The new agent_init warning fires on every platform, but cli.py show_banner() already warns on CLI (richer output + /model hint), so a CLI user saw the warning twice per startup. Guard the agent_init emit to skip platform=="cli" — it now fills exactly the gateway/TUI gap the PR intended, no duplication. 2. Warning — vLLM error-parse regex under-matched. The patterns required a literal space before the number, so "max_model_len: 32768", "=32768", "(32768)", and "... is 32768" all returned None. Broaden both patterns to accept :/=/(/ 'is' delimiters. Add a parametrized test over all delimiter variants. 3. Warning — per-call live probe latency on local endpoints. The new reconcile-on-hit + pre-defaults step-7 probe made every local resolution fire a synchronous network probe (banner + /model switch + compressor update_model each within one startup). Add a 30s in-process TTL cache keyed by (model, base_url) around _query_local_context_length so back-to- back resolutions reuse one round-trip; not persisted to disk, so the reconcile freshness contract (re-probe after restart) is preserved. Add an autouse fixture clearing the cache between tests + TTL coverage. Tests: 148 passed (was 138). ruff clean.
2026-07-03 03:27:13 +05:30 · 2026-07-03 03:27:13 +05:30 · b9a197ec59
commit b9a197ec59
parent 53063d92b0
4 changed files with 130 additions and 4 deletions
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@ -1722,8 +1722,10 @@ def init_agent(
        )

    # Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
-    # warning on every platform (CLI already did this; gateway/TUI did not).
-    if not agent.quiet_mode:
+    # warning on gateway/TUI. The interactive CLI already warns via
+    # cli.py show_banner() (richer output + /model switch hint), so skip the
+    # CLI platform here to avoid emitting the warning twice per startup.
+    if not agent.quiet_mode and (agent.platform or "cli") != "cli":
        try:
            from hermes_cli.model_switch import _check_hermes_model_warning

--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -184,6 +184,15 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
 # Sessions, model switches, and cron jobs should reject models below this.
 MINIMUM_CONTEXT_LENGTH = 64_000

+# Short-lived in-process cache for local-server context probes. Bounds the
+# probe rate when the new local-endpoint live-probe paths (reconcile-on-hit +
+# pre-defaults step 7) resolve the same model several times during one startup
+# (banner, /model switch, compressor update_model). Keyed by (model, base_url);
+# values are (result, monotonic_timestamp). Not persisted to disk — cross-
+# restart freshness is handled by the reconcile logic re-probing after expiry.
+_LOCAL_CTX_PROBE_TTL_SECONDS = 30.0
+_LOCAL_CTX_PROBE_CACHE: Dict[tuple, tuple] = {}
+
 # Thin fallback defaults — only broad model family patterns.
 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 # all miss. Replaced the previous 80+ entry dict.
@ -1068,8 +1077,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    error_lower = error_msg.lower()
    # Pattern: look for numbers near context-related keywords
    patterns = [
-        r'max_model_len\s+(\d{4,})',  # vLLM: "exceeds the max_model_len 32768"
-        r'maximum model length\s+(\d{4,})',  # vLLM alt: "exceeds maximum model length 131072"
+        r'max_model_len\s*(?:is\s*)?[:=(]?\s*(\d{4,})',  # vLLM: "max_model_len 32768", "=32768", ": 32768", "(32768)", "is 32768"
+        r'maximum model length\s*(?:is\s*)?[:=(]?\s*(\d{4,})',  # vLLM alt: "maximum model length 131072", "... is 131072"
        r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
        r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
        r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
@ -1515,6 +1524,33 @@ def _model_name_suggests_grok_4_3(model: str) -> bool:


 def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
+    """Query a local server for the model's context length (short-TTL cached).
+
+    The live-probe paths added for local endpoints (reconcile-on-hit and the
+    pre-defaults step-7 probe) can fire this function several times in quick
+    succession during one startup — banner display, ``/model`` switch,
+    compressor ``update_model`` all resolve the same model. Each raw probe
+    issues synchronous ``detect_local_server_type`` + query HTTP calls (bounded
+    by the 3s httpx timeout), so an unreachable/slow local server would pay
+    that cost repeatedly. A tiny in-process TTL cache collapses back-to-back
+    probes for the same (model, base_url) into one network round-trip without
+    persisting anything to disk (freshness across restarts is still handled by
+    the reconcile logic, which probes again once the TTL expires).
+    """
+    import time as _time
+
+    cache_key = (_strip_provider_prefix(model), base_url.rstrip("/"))
+    now = _time.monotonic()
+    cached = _LOCAL_CTX_PROBE_CACHE.get(cache_key)
+    if cached is not None and (now - cached[1]) < _LOCAL_CTX_PROBE_TTL_SECONDS:
+        return cached[0]
+
+    result = _query_local_context_length_uncached(model, base_url, api_key=api_key)
+    _LOCAL_CTX_PROBE_CACHE[cache_key] = (result, now)
+    return result
+
+
+def _query_local_context_length_uncached(model: str, base_url: str, api_key: str = "") -> Optional[int]:
    """Query a local server for the model's context length."""
    import httpx

--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -12,6 +12,7 @@ Coverage levels:

 import time

+import pytest
 import yaml
 from unittest.mock import patch, MagicMock

@ -1384,6 +1385,23 @@ class TestParseContextLimitFromError:
        msg = "prompt length 200000 exceeds maximum model length 131072"
        assert parse_context_limit_from_error(msg) == 131072

+    @pytest.mark.parametrize("msg,expected", [
+        ("max_model_len 32768", 32768),
+        ("max_model_len: 32768", 32768),
+        ("max_model_len=32768", 32768),
+        ("max_model_len (32768)", 32768),
+        ("max_model_len is 32768", 32768),
+        ("maximum model length 131072", 131072),
+        ("maximum model length is 131072", 131072),
+        ("maximum model length: 131072", 131072),
+    ])
+    def test_vllm_delimiter_variants(self, msg, expected):
+        """vLLM emits the limit with various delimiters (space/colon/equals/
+        paren/'is'). The parser must catch all of them — the original
+        space-only patterns silently missed ':', '=', '(' and 'is' forms and
+        fell through to None."""
+        assert parse_context_limit_from_error(msg) == expected
+
    def test_get_context_length_from_vllm_max_model_len_error(self):
        from agent.model_metadata import get_context_length_from_provider_error

--- a/tests/agent/test_model_metadata_local_ctx.py
+++ b/tests/agent/test_model_metadata_local_ctx.py
@ -8,9 +8,27 @@ import sys
 import os
 from unittest.mock import MagicMock, patch

+import pytest
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))


+@pytest.fixture(autouse=True)
+def _clear_local_ctx_probe_cache():
+    """Reset the in-process local-probe TTL cache around every test.
+
+    _query_local_context_length memoizes probes per (model, base_url) for a
+    short TTL to bound the probe rate on hot paths. In tests that mock httpx
+    to return different responses for the same (model, base_url), a stale
+    cache entry would leak across cases — clear it before and after each test.
+    """
+    import agent.model_metadata as _mm
+
+    _mm._LOCAL_CTX_PROBE_CACHE.clear()
+    yield
+    _mm._LOCAL_CTX_PROBE_CACHE.clear()
+
+

 # ---------------------------------------------------------------------------
 # _query_local_context_length — unit tests with mocked httpx
@ -732,3 +750,55 @@ class TestGetModelContextLengthLocalFallback:
            result = get_model_context_length("unknown-xyz-model", "")

        mock_query.assert_not_called()
+
+
+class TestLocalContextProbeTTLCache:
+    """The in-process TTL cache collapses back-to-back probes for the same
+    (model, base_url) into one network round-trip (bounds probe rate on hot
+    paths like banner + /model switch + compressor update within one startup),
+    while a different key still probes."""
+
+    def _make_resp(self, status_code, body):
+        resp = MagicMock()
+        resp.status_code = status_code
+        resp.json.return_value = body
+        return resp
+
+    def test_second_call_within_ttl_does_not_reprobe(self):
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
+        models_resp = self._make_resp(404, {})
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
+             patch("httpx.Client", return_value=client_mock):
+            first = _query_local_context_length("m", "http://localhost:11434/v1")
+            second = _query_local_context_length("m", "http://localhost:11434/v1")
+
+        assert first == 32768
+        assert second == 32768
+        # Only the first call hits the network; the second is served from cache.
+        assert detect.call_count == 1
+
+    def test_different_key_still_probes(self):
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
+        models_resp = self._make_resp(404, {})
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
+             patch("httpx.Client", return_value=client_mock):
+            _query_local_context_length("m1", "http://localhost:11434/v1")
+            _query_local_context_length("m2", "http://localhost:11434/v1")
+
+        assert detect.call_count == 2