From b9a197ec59393ebd13b897ed342749a9228dd86d Mon Sep 17 00:00:00 2001
From: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
Date: Fri, 3 Jul 2026 03:27:13 +0530
Subject: [PATCH] fix(agent): resolve review findings on vLLM local-context
 salvage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Salvage review of #56431 surfaced one Critical + two Warning issues; fix
them on top of the contributor's cherry-picked commits:

1. Critical — duplicate non-agentic warning on the interactive CLI. The new
   agent_init warning fires on every platform, but cli.py show_banner()
   already warns on CLI (richer output + /model hint), so a CLI user saw the
   warning twice per startup. Guard the agent_init emit to skip platform=="cli"
   — it now fills exactly the gateway/TUI gap the PR intended, no duplication.

2. Warning — vLLM error-parse regex under-matched. The patterns required a
   literal space before the number, so "max_model_len: 32768", "=32768",
   "(32768)", and "... is 32768" all returned None. Broaden both patterns to
   accept :/=/(/ 'is' delimiters. Add a parametrized test over all delimiter
   variants.

3. Warning — per-call live probe latency on local endpoints. The new
   reconcile-on-hit + pre-defaults step-7 probe made every local resolution
   fire a synchronous network probe (banner + /model switch + compressor
   update_model each within one startup). Add a 30s in-process TTL cache
   keyed by (model, base_url) around _query_local_context_length so back-to-
   back resolutions reuse one round-trip; not persisted to disk, so the
   reconcile freshness contract (re-probe after restart) is preserved. Add an
   autouse fixture clearing the cache between tests + TTL coverage.

Tests: 148 passed (was 138). ruff clean.
---
 agent/agent_init.py                          |  6 +-
 agent/model_metadata.py                      | 40 ++++++++++-
 tests/agent/test_model_metadata.py           | 18 +++++
 tests/agent/test_model_metadata_local_ctx.py | 70 ++++++++++++++++++++
 4 files changed, 130 insertions(+), 4 deletions(-)

diff --git a/agent/agent_init.py b/agent/agent_init.py
index 64c77dc35..29041308a 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -1722,8 +1722,10 @@ def init_agent(
         )
 
     # Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
-    # warning on every platform (CLI already did this; gateway/TUI did not).
-    if not agent.quiet_mode:
+    # warning on gateway/TUI. The interactive CLI already warns via
+    # cli.py show_banner() (richer output + /model switch hint), so skip the
+    # CLI platform here to avoid emitting the warning twice per startup.
+    if not agent.quiet_mode and (agent.platform or "cli") != "cli":
         try:
             from hermes_cli.model_switch import _check_hermes_model_warning
 
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 4b87f574c..28d8e1d9c 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -184,6 +184,15 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
 # Sessions, model switches, and cron jobs should reject models below this.
 MINIMUM_CONTEXT_LENGTH = 64_000
 
+# Short-lived in-process cache for local-server context probes. Bounds the
+# probe rate when the new local-endpoint live-probe paths (reconcile-on-hit +
+# pre-defaults step 7) resolve the same model several times during one startup
+# (banner, /model switch, compressor update_model). Keyed by (model, base_url);
+# values are (result, monotonic_timestamp). Not persisted to disk — cross-
+# restart freshness is handled by the reconcile logic re-probing after expiry.
+_LOCAL_CTX_PROBE_TTL_SECONDS = 30.0
+_LOCAL_CTX_PROBE_CACHE: Dict[tuple, tuple] = {}
+
 # Thin fallback defaults — only broad model family patterns.
 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 # all miss. Replaced the previous 80+ entry dict.
@@ -1068,8 +1077,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     error_lower = error_msg.lower()
     # Pattern: look for numbers near context-related keywords
     patterns = [
-        r'max_model_len\s+(\d{4,})',  # vLLM: "exceeds the max_model_len 32768"
-        r'maximum model length\s+(\d{4,})',  # vLLM alt: "exceeds maximum model length 131072"
+        r'max_model_len\s*(?:is\s*)?[:=(]?\s*(\d{4,})',  # vLLM: "max_model_len 32768", "=32768", ": 32768", "(32768)", "is 32768"
+        r'maximum model length\s*(?:is\s*)?[:=(]?\s*(\d{4,})',  # vLLM alt: "maximum model length 131072", "... is 131072"
         r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
         r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
         r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
@@ -1515,6 +1524,33 @@ def _model_name_suggests_grok_4_3(model: str) -> bool:
 
 
 def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
+    """Query a local server for the model's context length (short-TTL cached).
+
+    The live-probe paths added for local endpoints (reconcile-on-hit and the
+    pre-defaults step-7 probe) can fire this function several times in quick
+    succession during one startup — banner display, ``/model`` switch,
+    compressor ``update_model`` all resolve the same model. Each raw probe
+    issues synchronous ``detect_local_server_type`` + query HTTP calls (bounded
+    by the 3s httpx timeout), so an unreachable/slow local server would pay
+    that cost repeatedly. A tiny in-process TTL cache collapses back-to-back
+    probes for the same (model, base_url) into one network round-trip without
+    persisting anything to disk (freshness across restarts is still handled by
+    the reconcile logic, which probes again once the TTL expires).
+    """
+    import time as _time
+
+    cache_key = (_strip_provider_prefix(model), base_url.rstrip("/"))
+    now = _time.monotonic()
+    cached = _LOCAL_CTX_PROBE_CACHE.get(cache_key)
+    if cached is not None and (now - cached[1]) < _LOCAL_CTX_PROBE_TTL_SECONDS:
+        return cached[0]
+
+    result = _query_local_context_length_uncached(model, base_url, api_key=api_key)
+    _LOCAL_CTX_PROBE_CACHE[cache_key] = (result, now)
+    return result
+
+
+def _query_local_context_length_uncached(model: str, base_url: str, api_key: str = "") -> Optional[int]:
     """Query a local server for the model's context length."""
     import httpx
 
diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py
index 82356260a..d450580cd 100644
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@@ -12,6 +12,7 @@ Coverage levels:
 
 import time
 
+import pytest
 import yaml
 from unittest.mock import patch, MagicMock
 
@@ -1384,6 +1385,23 @@ class TestParseContextLimitFromError:
         msg = "prompt length 200000 exceeds maximum model length 131072"
         assert parse_context_limit_from_error(msg) == 131072
 
+    @pytest.mark.parametrize("msg,expected", [
+        ("max_model_len 32768", 32768),
+        ("max_model_len: 32768", 32768),
+        ("max_model_len=32768", 32768),
+        ("max_model_len (32768)", 32768),
+        ("max_model_len is 32768", 32768),
+        ("maximum model length 131072", 131072),
+        ("maximum model length is 131072", 131072),
+        ("maximum model length: 131072", 131072),
+    ])
+    def test_vllm_delimiter_variants(self, msg, expected):
+        """vLLM emits the limit with various delimiters (space/colon/equals/
+        paren/'is'). The parser must catch all of them — the original
+        space-only patterns silently missed ':', '=', '(' and 'is' forms and
+        fell through to None."""
+        assert parse_context_limit_from_error(msg) == expected
+
     def test_get_context_length_from_vllm_max_model_len_error(self):
         from agent.model_metadata import get_context_length_from_provider_error
 
diff --git a/tests/agent/test_model_metadata_local_ctx.py b/tests/agent/test_model_metadata_local_ctx.py
index fec729c6a..f2250f204 100644
--- a/tests/agent/test_model_metadata_local_ctx.py
+++ b/tests/agent/test_model_metadata_local_ctx.py
@@ -8,9 +8,27 @@ import sys
 import os
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
 
+@pytest.fixture(autouse=True)
+def _clear_local_ctx_probe_cache():
+    """Reset the in-process local-probe TTL cache around every test.
+
+    _query_local_context_length memoizes probes per (model, base_url) for a
+    short TTL to bound the probe rate on hot paths. In tests that mock httpx
+    to return different responses for the same (model, base_url), a stale
+    cache entry would leak across cases — clear it before and after each test.
+    """
+    import agent.model_metadata as _mm
+
+    _mm._LOCAL_CTX_PROBE_CACHE.clear()
+    yield
+    _mm._LOCAL_CTX_PROBE_CACHE.clear()
+
+
 
 # ---------------------------------------------------------------------------
 # _query_local_context_length — unit tests with mocked httpx
@@ -732,3 +750,55 @@ class TestGetModelContextLengthLocalFallback:
             result = get_model_context_length("unknown-xyz-model", "")
 
         mock_query.assert_not_called()
+
+
+class TestLocalContextProbeTTLCache:
+    """The in-process TTL cache collapses back-to-back probes for the same
+    (model, base_url) into one network round-trip (bounds probe rate on hot
+    paths like banner + /model switch + compressor update within one startup),
+    while a different key still probes."""
+
+    def _make_resp(self, status_code, body):
+        resp = MagicMock()
+        resp.status_code = status_code
+        resp.json.return_value = body
+        return resp
+
+    def test_second_call_within_ttl_does_not_reprobe(self):
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
+        models_resp = self._make_resp(404, {})
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
+             patch("httpx.Client", return_value=client_mock):
+            first = _query_local_context_length("m", "http://localhost:11434/v1")
+            second = _query_local_context_length("m", "http://localhost:11434/v1")
+
+        assert first == 32768
+        assert second == 32768
+        # Only the first call hits the network; the second is served from cache.
+        assert detect.call_count == 1
+
+    def test_different_key_still_probes(self):
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
+        models_resp = self._make_resp(404, {})
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
+             patch("httpx.Client", return_value=client_mock):
+            _query_local_context_length("m1", "http://localhost:11434/v1")
+            _query_local_context_length("m2", "http://localhost:11434/v1")
+
+        assert detect.call_count == 2