test(agent): cover local vLLM context-length resolution

Add regression tests for vLLM max_model_len error parsing, stale local
cache reconciliation, live probes over llama defaults, and the 64K minimum
guard on persistent cache writes.

(cherry picked from commit 1cb47ef437de7ce289cb358e8d6b89e9194b43ed)
This commit is contained in:
infinitycrew39 2026-07-02 22:21:52 +07:00 committed by kshitijk4poor
parent cecedcddf3
commit 53063d92b0
2 changed files with 88 additions and 1 deletions

View file

@ -1373,6 +1373,26 @@ class TestParseContextLimitFromError:
msg = "Error: context window of 4096 tokens exceeded"
assert parse_context_limit_from_error(msg) == 4096
def test_vllm_max_model_len_format(self):
msg = (
"The engine prompt length 1327246 exceeds the max_model_len 32768. "
"Please reduce prompt."
)
assert parse_context_limit_from_error(msg) == 32768
def test_vllm_maximum_model_length_format(self):
msg = "prompt length 200000 exceeds maximum model length 131072"
assert parse_context_limit_from_error(msg) == 131072
def test_get_context_length_from_vllm_max_model_len_error(self):
from agent.model_metadata import get_context_length_from_provider_error
msg = (
"The engine prompt length 90000 exceeds the max_model_len 32768. "
"Please reduce prompt."
)
assert get_context_length_from_provider_error(msg, 131072) == 32768
def test_minimax_delta_only_message_returns_none(self):
msg = "invalid params, context window exceeds limit (2013)"
assert parse_context_limit_from_error(msg) is None

View file

@ -615,6 +615,70 @@ class TestGetModelContextLengthLocalFallback:
mock_save.assert_called_once_with("omnicoder-9b", "http://localhost:11434/v1", 131072)
def test_local_endpoint_stale_cache_reconciled_from_live_probe(self):
"""Stale disk cache must yield to a live local max_model_len probe."""
from agent.model_metadata import get_model_context_length
model = "NousResearch/Hermes-3-Llama-3.1-70B"
base = "http://192.168.1.50:8000/v1"
with patch("agent.model_metadata.get_cached_context_length", return_value=131072), \
patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
patch("agent.model_metadata.is_local_endpoint", return_value=True), \
patch("agent.model_metadata._query_local_context_length", return_value=32768), \
patch("agent.model_metadata._invalidate_cached_context_length") as mock_invalidate, \
patch("agent.model_metadata.save_context_length") as mock_save:
result = get_model_context_length(model, base, provider="custom")
assert result == 32768
mock_invalidate.assert_called_once_with(model, base)
mock_save.assert_not_called()
def test_local_endpoint_stale_cache_reconciled_to_valid_live_probe(self):
"""Live probes at or above the 64K minimum are persisted."""
from agent.model_metadata import get_model_context_length
model = "NousResearch/Hermes-3-Llama-3.1-70B"
base = "http://192.168.1.50:8000/v1"
with patch("agent.model_metadata.get_cached_context_length", return_value=131072), \
patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
patch("agent.model_metadata.is_local_endpoint", return_value=True), \
patch("agent.model_metadata._query_local_context_length", return_value=65536), \
patch("agent.model_metadata._invalidate_cached_context_length") as mock_invalidate, \
patch("agent.model_metadata.save_context_length") as mock_save:
result = get_model_context_length(model, base, provider="custom")
assert result == 65536
mock_invalidate.assert_called_once_with(model, base)
mock_save.assert_called_once_with(model, base, 65536)
def test_local_endpoint_bypasses_stale_persistent_cache(self):
"""Hermes-3-Llama names must not inherit the generic llama 131072 default."""
from agent.model_metadata import get_model_context_length
model = "NousResearch/Hermes-3-Llama-3.1-70B"
base = "http://spark1:8000/v1"
with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
patch("agent.model_metadata.is_local_endpoint", return_value=True), \
patch("agent.model_metadata._query_local_context_length", return_value=32768), \
patch("agent.model_metadata.save_context_length") as mock_save:
result = get_model_context_length(model, base, provider="custom")
assert result == 32768
mock_save.assert_not_called()
def test_local_endpoint_server_returns_none_falls_back_to_2m(self):
"""When local server returns None, still falls back to 2M probe tier."""
from agent.model_metadata import get_model_context_length, CONTEXT_PROBE_TIERS
@ -648,8 +712,11 @@ class TestGetModelContextLengthLocalFallback:
from agent.model_metadata import get_model_context_length
with patch("agent.model_metadata.get_cached_context_length", return_value=65536), \
patch("agent.model_metadata.is_local_endpoint", return_value=False), \
patch("agent.model_metadata._query_local_context_length") as mock_query:
result = get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")
result = get_model_context_length(
"omnicoder-9b", "https://api.example.com/v1"
)
assert result == 65536
mock_query.assert_not_called()