fix(agent): resolve review findings on vLLM local-context salvage
Salvage review of #56431 surfaced one Critical + two Warning issues; fix them on top of the contributor's cherry-picked commits: 1. Critical — duplicate non-agentic warning on the interactive CLI. The new agent_init warning fires on every platform, but cli.py show_banner() already warns on CLI (richer output + /model hint), so a CLI user saw the warning twice per startup. Guard the agent_init emit to skip platform=="cli" — it now fills exactly the gateway/TUI gap the PR intended, no duplication. 2. Warning — vLLM error-parse regex under-matched. The patterns required a literal space before the number, so "max_model_len: 32768", "=32768", "(32768)", and "... is 32768" all returned None. Broaden both patterns to accept :/=/(/ 'is' delimiters. Add a parametrized test over all delimiter variants. 3. Warning — per-call live probe latency on local endpoints. The new reconcile-on-hit + pre-defaults step-7 probe made every local resolution fire a synchronous network probe (banner + /model switch + compressor update_model each within one startup). Add a 30s in-process TTL cache keyed by (model, base_url) around _query_local_context_length so back-to- back resolutions reuse one round-trip; not persisted to disk, so the reconcile freshness contract (re-probe after restart) is preserved. Add an autouse fixture clearing the cache between tests + TTL coverage. Tests: 148 passed (was 138). ruff clean.
This commit is contained in:
parent
53063d92b0
commit
b9a197ec59
4 changed files with 130 additions and 4 deletions
|
|
@ -1722,8 +1722,10 @@ def init_agent(
|
|||
)
|
||||
|
||||
# Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
|
||||
# warning on every platform (CLI already did this; gateway/TUI did not).
|
||||
if not agent.quiet_mode:
|
||||
# warning on gateway/TUI. The interactive CLI already warns via
|
||||
# cli.py show_banner() (richer output + /model switch hint), so skip the
|
||||
# CLI platform here to avoid emitting the warning twice per startup.
|
||||
if not agent.quiet_mode and (agent.platform or "cli") != "cli":
|
||||
try:
|
||||
from hermes_cli.model_switch import _check_hermes_model_warning
|
||||
|
||||
|
|
|
|||
|
|
@ -184,6 +184,15 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
|
|||
# Sessions, model switches, and cron jobs should reject models below this.
|
||||
MINIMUM_CONTEXT_LENGTH = 64_000
|
||||
|
||||
# Short-lived in-process cache for local-server context probes. Bounds the
|
||||
# probe rate when the new local-endpoint live-probe paths (reconcile-on-hit +
|
||||
# pre-defaults step 7) resolve the same model several times during one startup
|
||||
# (banner, /model switch, compressor update_model). Keyed by (model, base_url);
|
||||
# values are (result, monotonic_timestamp). Not persisted to disk — cross-
|
||||
# restart freshness is handled by the reconcile logic re-probing after expiry.
|
||||
_LOCAL_CTX_PROBE_TTL_SECONDS = 30.0
|
||||
_LOCAL_CTX_PROBE_CACHE: Dict[tuple, tuple] = {}
|
||||
|
||||
# Thin fallback defaults — only broad model family patterns.
|
||||
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
||||
# all miss. Replaced the previous 80+ entry dict.
|
||||
|
|
@ -1068,8 +1077,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
|
|||
error_lower = error_msg.lower()
|
||||
# Pattern: look for numbers near context-related keywords
|
||||
patterns = [
|
||||
r'max_model_len\s+(\d{4,})', # vLLM: "exceeds the max_model_len 32768"
|
||||
r'maximum model length\s+(\d{4,})', # vLLM alt: "exceeds maximum model length 131072"
|
||||
r'max_model_len\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM: "max_model_len 32768", "=32768", ": 32768", "(32768)", "is 32768"
|
||||
r'maximum model length\s*(?:is\s*)?[:=(]?\s*(\d{4,})', # vLLM alt: "maximum model length 131072", "... is 131072"
|
||||
r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
|
||||
r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
|
||||
r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
|
||||
|
|
@ -1515,6 +1524,33 @@ def _model_name_suggests_grok_4_3(model: str) -> bool:
|
|||
|
||||
|
||||
def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
||||
"""Query a local server for the model's context length (short-TTL cached).
|
||||
|
||||
The live-probe paths added for local endpoints (reconcile-on-hit and the
|
||||
pre-defaults step-7 probe) can fire this function several times in quick
|
||||
succession during one startup — banner display, ``/model`` switch,
|
||||
compressor ``update_model`` all resolve the same model. Each raw probe
|
||||
issues synchronous ``detect_local_server_type`` + query HTTP calls (bounded
|
||||
by the 3s httpx timeout), so an unreachable/slow local server would pay
|
||||
that cost repeatedly. A tiny in-process TTL cache collapses back-to-back
|
||||
probes for the same (model, base_url) into one network round-trip without
|
||||
persisting anything to disk (freshness across restarts is still handled by
|
||||
the reconcile logic, which probes again once the TTL expires).
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
cache_key = (_strip_provider_prefix(model), base_url.rstrip("/"))
|
||||
now = _time.monotonic()
|
||||
cached = _LOCAL_CTX_PROBE_CACHE.get(cache_key)
|
||||
if cached is not None and (now - cached[1]) < _LOCAL_CTX_PROBE_TTL_SECONDS:
|
||||
return cached[0]
|
||||
|
||||
result = _query_local_context_length_uncached(model, base_url, api_key=api_key)
|
||||
_LOCAL_CTX_PROBE_CACHE[cache_key] = (result, now)
|
||||
return result
|
||||
|
||||
|
||||
def _query_local_context_length_uncached(model: str, base_url: str, api_key: str = "") -> Optional[int]:
|
||||
"""Query a local server for the model's context length."""
|
||||
import httpx
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ Coverage levels:
|
|||
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
|
@ -1384,6 +1385,23 @@ class TestParseContextLimitFromError:
|
|||
msg = "prompt length 200000 exceeds maximum model length 131072"
|
||||
assert parse_context_limit_from_error(msg) == 131072
|
||||
|
||||
@pytest.mark.parametrize("msg,expected", [
|
||||
("max_model_len 32768", 32768),
|
||||
("max_model_len: 32768", 32768),
|
||||
("max_model_len=32768", 32768),
|
||||
("max_model_len (32768)", 32768),
|
||||
("max_model_len is 32768", 32768),
|
||||
("maximum model length 131072", 131072),
|
||||
("maximum model length is 131072", 131072),
|
||||
("maximum model length: 131072", 131072),
|
||||
])
|
||||
def test_vllm_delimiter_variants(self, msg, expected):
|
||||
"""vLLM emits the limit with various delimiters (space/colon/equals/
|
||||
paren/'is'). The parser must catch all of them — the original
|
||||
space-only patterns silently missed ':', '=', '(' and 'is' forms and
|
||||
fell through to None."""
|
||||
assert parse_context_limit_from_error(msg) == expected
|
||||
|
||||
def test_get_context_length_from_vllm_max_model_len_error(self):
|
||||
from agent.model_metadata import get_context_length_from_provider_error
|
||||
|
||||
|
|
|
|||
|
|
@ -8,9 +8,27 @@ import sys
|
|||
import os
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _clear_local_ctx_probe_cache():
|
||||
"""Reset the in-process local-probe TTL cache around every test.
|
||||
|
||||
_query_local_context_length memoizes probes per (model, base_url) for a
|
||||
short TTL to bound the probe rate on hot paths. In tests that mock httpx
|
||||
to return different responses for the same (model, base_url), a stale
|
||||
cache entry would leak across cases — clear it before and after each test.
|
||||
"""
|
||||
import agent.model_metadata as _mm
|
||||
|
||||
_mm._LOCAL_CTX_PROBE_CACHE.clear()
|
||||
yield
|
||||
_mm._LOCAL_CTX_PROBE_CACHE.clear()
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _query_local_context_length — unit tests with mocked httpx
|
||||
|
|
@ -732,3 +750,55 @@ class TestGetModelContextLengthLocalFallback:
|
|||
result = get_model_context_length("unknown-xyz-model", "")
|
||||
|
||||
mock_query.assert_not_called()
|
||||
|
||||
|
||||
class TestLocalContextProbeTTLCache:
|
||||
"""The in-process TTL cache collapses back-to-back probes for the same
|
||||
(model, base_url) into one network round-trip (bounds probe rate on hot
|
||||
paths like banner + /model switch + compressor update within one startup),
|
||||
while a different key still probes."""
|
||||
|
||||
def _make_resp(self, status_code, body):
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = body
|
||||
return resp
|
||||
|
||||
def test_second_call_within_ttl_does_not_reprobe(self):
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
|
||||
models_resp = self._make_resp(404, {})
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = show_resp
|
||||
client_mock.get.return_value = models_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
first = _query_local_context_length("m", "http://localhost:11434/v1")
|
||||
second = _query_local_context_length("m", "http://localhost:11434/v1")
|
||||
|
||||
assert first == 32768
|
||||
assert second == 32768
|
||||
# Only the first call hits the network; the second is served from cache.
|
||||
assert detect.call_count == 1
|
||||
|
||||
def test_different_key_still_probes(self):
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
||||
show_resp = self._make_resp(200, {"model_info": {"llama.context_length": 32768}})
|
||||
models_resp = self._make_resp(404, {})
|
||||
client_mock = MagicMock()
|
||||
client_mock.__enter__ = lambda s: client_mock
|
||||
client_mock.__exit__ = MagicMock(return_value=False)
|
||||
client_mock.post.return_value = show_resp
|
||||
client_mock.get.return_value = models_resp
|
||||
|
||||
with patch("agent.model_metadata.detect_local_server_type", return_value="ollama") as detect, \
|
||||
patch("httpx.Client", return_value=client_mock):
|
||||
_query_local_context_length("m1", "http://localhost:11434/v1")
|
||||
_query_local_context_length("m2", "http://localhost:11434/v1")
|
||||
|
||||
assert detect.call_count == 2
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue