fix(agent): resolve review findings on vLLM local-context salvage

Salvage review of #56431 surfaced one Critical + two Warning issues; fix
them on top of the contributor's cherry-picked commits:

1. Critical — duplicate non-agentic warning on the interactive CLI. The new
   agent_init warning fires on every platform, but cli.py show_banner()
   already warns on CLI (richer output + /model hint), so a CLI user saw the
   warning twice per startup. Guard the agent_init emit to skip platform=="cli"
   — it now fills exactly the gateway/TUI gap the PR intended, no duplication.

2. Warning — vLLM error-parse regex under-matched. The patterns required a
   literal space before the number, so "max_model_len: 32768", "=32768",
   "(32768)", and "... is 32768" all returned None. Broaden both patterns to
   accept :/=/(/ 'is' delimiters. Add a parametrized test over all delimiter
   variants.

3. Warning — per-call live probe latency on local endpoints. The new
   reconcile-on-hit + pre-defaults step-7 probe made every local resolution
   fire a synchronous network probe (banner + /model switch + compressor
   update_model each within one startup). Add a 30s in-process TTL cache
   keyed by (model, base_url) around _query_local_context_length so back-to-
   back resolutions reuse one round-trip; not persisted to disk, so the
   reconcile freshness contract (re-probe after restart) is preserved. Add an
   autouse fixture clearing the cache between tests + TTL coverage.

Tests: 148 passed (was 138). ruff clean.
This commit is contained in:
kshitijk4poor 2026-07-03 03:27:13 +05:30
parent 53063d92b0
commit b9a197ec59
4 changed files with 130 additions and 4 deletions

View file

@ -1722,8 +1722,10 @@ def init_agent(
)
# Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
# warning on every platform (CLI already did this; gateway/TUI did not).
if not agent.quiet_mode:
# warning on gateway/TUI. The interactive CLI already warns via
# cli.py show_banner() (richer output + /model switch hint), so skip the
# CLI platform here to avoid emitting the warning twice per startup.
if not agent.quiet_mode and (agent.platform or "cli") != "cli":
try:
from hermes_cli.model_switch import _check_hermes_model_warning

View file

@ -184,6 +184,15 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
# Sessions, model switches, and cron jobs should reject models below this.
MINIMUM_CONTEXT_LENGTH = 64_000
# Short-lived in-process cache for local-server context probes. Bounds the
# probe rate when the new local-endpoint live-probe paths (reconcile-on-hit +
# pre-defaults step 7) resolve the same model several times during one startup
# (banner, /model switch, compressor update_model). Keyed by (model, base_url);
# values are (result, monotonic_timestamp). Not persisted to disk — cross-
# restart freshness is handled by the reconcile logic re-probing after expiry.
_LOCAL_CTX_PROBE_TTL_SECONDS = 30.0
_LOCAL_CTX_PROBE_CACHE: Dict[tuple, tuple] = {}
# Thin fallback defaults — only broad model family patterns.
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
# all miss. Replaced the previous 80+ entry dict.
@ -1068,8 +1077,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
error_lower = error_msg.lower()
# Pattern: look for numbers near context-related keywords
patterns = [
r'max_model_len\s+(\d{4,})', # vLLM: "exceeds the max_model_len 32768"
r'maximum model length\s+(\d{4,})', # vLLM alt: "exceeds maximum model length 131072"
r'max_model_len\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM: "max_model_len 32768", "=32768", ": 32768", "(32768)", "is 32768"
r'maximum model length\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM alt: "maximum model length 131072", "... is 131072"
r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
@ -1515,6 +1524,33 @@ def _model_name_suggests_grok_4_3(model: str) -> bool:
def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
"""Query a local server for the model's context length (short-TTL cached).
The live-probe paths added for local endpoints (reconcile-on-hit and the
pre-defaults step-7 probe) can fire this function several times in quick
succession during one startup banner display, ``/model`` switch,
compressor ``update_model`` all resolve the same model. Each raw probe
issues synchronous ``detect_local_server_type`` + query HTTP calls (bounded
by the 3s httpx timeout), so an unreachable/slow local server would pay
that cost repeatedly. A tiny in-process TTL cache collapses back-to-back
probes for the same (model, base_url) into one network round-trip without
persisting anything to disk (freshness across restarts is still handled by
the reconcile logic, which probes again once the TTL expires).
"""
import time as _time
cache_key = (_strip_provider_prefix(model), base_url.rstrip("/"))
now = _time.monotonic()
cached = _LOCAL_CTX_PROBE_CACHE.get(cache_key)
if cached is not None and (now - cached[1]) < _LOCAL_CTX_PROBE_TTL_SECONDS:
return cached[0]
result = _query_local_context_length_uncached(model, base_url, api_key=api_key)
_LOCAL_CTX_PROBE_CACHE[cache_key] = (result, now)
return result
def _query_local_context_length_uncached(model: str, base_url: str, api_key: str = "") -> Optional[int]:
"""Query a local server for the model's context length."""
import httpx

View file

@ -12,6 +12,7 @@ Coverage levels:
import time
import pytest
import yaml
from unittest.mock import patch, MagicMock
@ -1384,6 +1385,23 @@ class TestParseContextLimitFromError:
msg = "prompt length 200000 exceeds maximum model length 131072"
assert parse_context_limit_from_error(msg) == 131072
@pytest.mark.parametrize("msg,expected", [
("max_model_len 32768", 32768),
("max_model_len: 32768", 32768),
("max_model_len=32768", 32768),
("max_model_len (32768)", 32768),
("max_model_len is 32768", 32768),
("maximum model length 131072", 131072),
("maximum model length is 131072", 131072),
("maximum model length: 131072", 131072),
])
def test_vllm_delimiter_variants(self, msg, expected):
"""vLLM emits the limit with various delimiters (space/colon/equals/
paren/'is'). The parser must catch all of them the original
space-only patterns silently missed ':', '=', '(' and 'is' forms and
fell through to None."""
assert parse_context_limit_from_error(msg) == expected
def test_get_context_length_from_vllm_max_model_len_error(self):
from agent.model_metadata import get_context_length_from_provider_error

View file

@ -8,9 +8,27 @@ import sys
import os
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
@pytest.fixture(autouse=True)
def _clear_local_ctx_probe_cache():
"""Reset the in-process local-probe TTL cache around every test.
_query_local_context_length memoizes probes per (model, base_url) for a
short TTL to bound the probe rate on hot paths. In tests that mock httpx
to return different responses for the same (model, base_url), a stale
cache entry would leak across cases clear it before and after each test.
"""
import agent.model_metadata as _mm
_mm._LOCAL_CTX_PROBE_CACHE.clear()
yield
_mm._LOCAL_CTX_PROBE_CACHE.clear()
# ---------------------------------------------------------------------------
# _query_local_context_length — unit tests with mocked httpx
@ -732,3 +750,55 @@ class TestGetModelContextLengthLocalFallback:
result = get_model_context_length("unknown-xyz-model", "")
mock_query.assert_not_called()
class TestLocalContextProbeTTLCache:
"""The in-process TTL cache collapses back-to-back probes for the same
(model, base_url) into one network round-trip (bounds probe rate on hot
paths like banner + /model switch + compressor update within one startup),
while a different key still probes."""
def _make_resp(self, status_code, body):
resp = MagicMock()
resp.status_code = status_code
resp.json.return_value = body
return resp
def test_second_call_within_ttl_does_not_reprobe(self):
from agent.model_metadata import _query_local_context_length
show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
models_resp = self._make_resp(404, {})
client_mock = MagicMock()
client_mock.__enter__ = lambda s: client_mock
client_mock.__exit__ = MagicMock(return_value=False)
client_mock.post.return_value = show_resp
client_mock.get.return_value = models_resp
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
patch("httpx.Client", return_value=client_mock):
first = _query_local_context_length("m", "http://localhost:11434/v1")
second = _query_local_context_length("m", "http://localhost:11434/v1")
assert first == 32768
assert second == 32768
# Only the first call hits the network; the second is served from cache.
assert detect.call_count == 1
def test_different_key_still_probes(self):
from agent.model_metadata import _query_local_context_length
show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
models_resp = self._make_resp(404, {})
client_mock = MagicMock()
client_mock.__enter__ = lambda s: client_mock
client_mock.__exit__ = MagicMock(return_value=False)
client_mock.post.return_value = show_resp
client_mock.get.return_value = models_resp
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
patch("httpx.Client", return_value=client_mock):
_query_local_context_length("m1", "http://localhost:11434/v1")
_query_local_context_length("m2", "http://localhost:11434/v1")
assert detect.call_count == 2