test(agent): cover local vLLM context-length resolution

Add regression tests for vLLM max_model_len error parsing, stale local cache reconciliation, live probes over llama defaults, and the 64K minimum guard on persistent cache writes. (cherry picked from commit 1cb47ef437de7ce289cb358e8d6b89e9194b43ed)
2026-07-02 22:21:52 +07:00 · 2026-07-02 22:21:52 +07:00 · 53063d92b0
commit 53063d92b0
parent cecedcddf3
2 changed files with 88 additions and 1 deletions
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -1373,6 +1373,26 @@ class TestParseContextLimitFromError:
        msg = "Error: context window of 4096 tokens exceeded"
        assert parse_context_limit_from_error(msg) == 4096

+    def test_vllm_max_model_len_format(self):
+        msg = (
+            "The engine prompt length 1327246 exceeds the max_model_len 32768. "
+            "Please reduce prompt."
+        )
+        assert parse_context_limit_from_error(msg) == 32768
+
+    def test_vllm_maximum_model_length_format(self):
+        msg = "prompt length 200000 exceeds maximum model length 131072"
+        assert parse_context_limit_from_error(msg) == 131072
+
+    def test_get_context_length_from_vllm_max_model_len_error(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+
+        msg = (
+            "The engine prompt length 90000 exceeds the max_model_len 32768. "
+            "Please reduce prompt."
+        )
+        assert get_context_length_from_provider_error(msg, 131072) == 32768
+
    def test_minimax_delta_only_message_returns_none(self):
        msg = "invalid params, context window exceeds limit (2013)"
        assert parse_context_limit_from_error(msg) is None
--- a/tests/agent/test_model_metadata_local_ctx.py
+++ b/tests/agent/test_model_metadata_local_ctx.py
@ -615,6 +615,70 @@ class TestGetModelContextLengthLocalFallback:

        mock_save.assert_called_once_with("omnicoder-9b", "http://localhost:11434/v1", 131072)

+    def test_local_endpoint_stale_cache_reconciled_from_live_probe(self):
+        """Stale disk cache must yield to a live local max_model_len probe."""
+        from agent.model_metadata import get_model_context_length
+
+        model = "NousResearch/Hermes-3-Llama-3.1-70B"
+        base = "http://192.168.1.50:8000/v1"
+
+        with patch("agent.model_metadata.get_cached_context_length", return_value=131072), \
+             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
+             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
+             patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
+             patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
+             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
+             patch("agent.model_metadata._query_local_context_length", return_value=32768), \
+             patch("agent.model_metadata._invalidate_cached_context_length") as mock_invalidate, \
+             patch("agent.model_metadata.save_context_length") as mock_save:
+            result = get_model_context_length(model, base, provider="custom")
+
+        assert result == 32768
+        mock_invalidate.assert_called_once_with(model, base)
+        mock_save.assert_not_called()
+
+    def test_local_endpoint_stale_cache_reconciled_to_valid_live_probe(self):
+        """Live probes at or above the 64K minimum are persisted."""
+        from agent.model_metadata import get_model_context_length
+
+        model = "NousResearch/Hermes-3-Llama-3.1-70B"
+        base = "http://192.168.1.50:8000/v1"
+
+        with patch("agent.model_metadata.get_cached_context_length", return_value=131072), \
+             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
+             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
+             patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
+             patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
+             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
+             patch("agent.model_metadata._query_local_context_length", return_value=65536), \
+             patch("agent.model_metadata._invalidate_cached_context_length") as mock_invalidate, \
+             patch("agent.model_metadata.save_context_length") as mock_save:
+            result = get_model_context_length(model, base, provider="custom")
+
+        assert result == 65536
+        mock_invalidate.assert_called_once_with(model, base)
+        mock_save.assert_called_once_with(model, base, 65536)
+
+    def test_local_endpoint_bypasses_stale_persistent_cache(self):
+        """Hermes-3-Llama names must not inherit the generic llama 131072 default."""
+        from agent.model_metadata import get_model_context_length
+
+        model = "NousResearch/Hermes-3-Llama-3.1-70B"
+        base = "http://spark1:8000/v1"
+
+        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
+             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
+             patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
+             patch("agent.model_metadata._query_ollama_api_show", return_value=None), \
+             patch("agent.model_metadata._is_custom_endpoint", return_value=False), \
+             patch("agent.model_metadata.is_local_endpoint", return_value=True), \
+             patch("agent.model_metadata._query_local_context_length", return_value=32768), \
+             patch("agent.model_metadata.save_context_length") as mock_save:
+            result = get_model_context_length(model, base, provider="custom")
+
+        assert result == 32768
+        mock_save.assert_not_called()
+
    def test_local_endpoint_server_returns_none_falls_back_to_2m(self):
        """When local server returns None, still falls back to 2M probe tier."""
        from agent.model_metadata import get_model_context_length, CONTEXT_PROBE_TIERS
@ -648,8 +712,11 @@ class TestGetModelContextLengthLocalFallback:
        from agent.model_metadata import get_model_context_length

        with patch("agent.model_metadata.get_cached_context_length", return_value=65536), \
+             patch("agent.model_metadata.is_local_endpoint", return_value=False), \
             patch("agent.model_metadata._query_local_context_length") as mock_query:
-            result = get_model_context_length("omnicoder-9b", "http://localhost:11434/v1")
+            result = get_model_context_length(
+                "omnicoder-9b", "https://api.example.com/v1"
+            )

        assert result == 65536
        mock_query.assert_not_called()