From cecedcddf3488e7972b4d40b7860f734835a0835 Mon Sep 17 00:00:00 2001
From: infinitycrew39 <infinitycrew39@gmail.com>
Date: Thu, 2 Jul 2026 22:21:52 +0700
Subject: [PATCH] fix(agent): honor live vLLM context limits on local endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reconcile stale local disk cache against live vLLM/Ollama max_model_len
probes, probe local servers before the llama hardcoded default, parse
vLLM max_model_len overflow errors, and surface the non-agentic Hermes 3/4
warning at agent init on gateway/TUI.

Sub-64K live probes are returned for startup rejection but are not
persisted to the context cache — preserving the 64K minimum-context
contract instead of normalizing undersized windows as valid config.

(cherry picked from commit c3a02db4fd9d57b7b0eb2732de91f8334d311aa5)
---
 agent/agent_init.py     |  22 +++++++++
 agent/model_metadata.py | 104 +++++++++++++++++++++++++++++++++-------
 2 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/agent/agent_init.py b/agent/agent_init.py
index 045fcfc1e..64c77dc35 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -1721,6 +1721,28 @@ def init_agent(
             f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
         )
 
+    # Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
+    # warning on every platform (CLI already did this; gateway/TUI did not).
+    if not agent.quiet_mode:
+        try:
+            from hermes_cli.model_switch import _check_hermes_model_warning
+
+            _hermes_warn = _check_hermes_model_warning(agent.model or "")
+            if _hermes_warn:
+                _user_msg = (
+                    "⚠ Nous Research Hermes 3 & 4 models are NOT agentic — they "
+                    "lack reliable tool-calling for agent workflows (delegation, "
+                    "cron, proactive tools). Consider an agentic model instead "
+                    "(Claude, GPT, Gemini, Qwen-Coder, etc.)."
+                )
+                if hasattr(agent, "_emit_warning"):
+                    agent._emit_warning(_user_msg)
+                else:
+                    print(f"\n{_user_msg}\n", file=sys.stderr)
+                _ra().logger.warning(_hermes_warn)
+        except Exception:
+            pass
+
     # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
     # Skip names that are already present — the _ra().get_tool_definitions()
     # quiet_mode cache returned a shared list pre-#17335, so a stray
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 547fe2f57..4b87f574c 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -496,6 +496,68 @@ def _is_known_provider_base_url(base_url: str) -> bool:
     return _infer_provider_from_url(base_url) is not None
 
 
+def _skip_persistent_context_cache(base_url: str, provider: str) -> bool:
+    """Return True when the on-disk context cache must not short-circuit probing.
+
+    LM Studio is excluded because its loaded context is transient (the user can
+    reload with a different context_length at any time); *base_url* is unused.
+    """
+    return provider == "lmstudio"
+
+
+def _maybe_cache_local_context_length(
+    model: str,
+    base_url: str,
+    length: int,
+) -> None:
+    """Persist a locally probed context length only if it meets the Hermes minimum.
+
+    Saves only when *length* >= ``MINIMUM_CONTEXT_LENGTH``; otherwise a no-op.
+    Sub-minimum live windows (e.g. vLLM ``--max-model-len 32768``) are still
+    returned by callers so ``agent_init`` can fail with its minimum-context
+    guidance — they must not be written to the on-disk cache as valid limits.
+    """
+    if length >= MINIMUM_CONTEXT_LENGTH:
+        save_context_length(model, base_url, length)
+
+
+def _reconcile_local_cached_context_length(
+    model: str,
+    base_url: str,
+    cached: int,
+    api_key: str = "",
+) -> int:
+    """Return *cached* unless a live local probe reports a different limit.
+
+    vLLM/Ollama operators can restart with a new ``--max-model-len`` / ``num_ctx``
+    without changing the model id.  When ``_query_local_context_length`` returns
+    a positive value that differs from *cached*, the stale disk entry is
+    invalidated and the live value is returned; when the probe yields nothing
+    (offline tests, network blip) or agrees, *cached* is returned unchanged.
+
+    A live value >= :data:`MINIMUM_CONTEXT_LENGTH` is re-persisted; a smaller one
+    is not — startup should reject it, not bless a sub-64K window as config.
+    """
+    live_ctx = _query_local_context_length(model, base_url, api_key=api_key)
+    if live_ctx and live_ctx > 0 and live_ctx != cached:
+        if live_ctx < MINIMUM_CONTEXT_LENGTH:
+            logger.info(
+                "Live local probe for %s@%s reports %s (< minimum %s); "
+                "invalidating stale cache — agent init should reject",
+                model, base_url, f"{live_ctx:,}", f"{MINIMUM_CONTEXT_LENGTH:,}",
+            )
+            _invalidate_cached_context_length(model, base_url)
+            return live_ctx
+        logger.info(
+            "Reconciling stale local cache entry %s@%s: %s -> %s (live probe)",
+            model, base_url, f"{cached:,}", f"{live_ctx:,}",
+        )
+        _invalidate_cached_context_length(model, base_url)
+        _maybe_cache_local_context_length(model, base_url, live_ctx)
+        return live_ctx
+    return cached
+
+
 def is_local_endpoint(base_url: str) -> bool:
     """Return True if base_url points to a local machine.
 
@@ -1006,6 +1068,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     error_lower = error_msg.lower()
     # Pattern: look for numbers near context-related keywords
     patterns = [
+        r'max_model_len\s+(\d{4,})',  # vLLM: "exceeds the max_model_len 32768"
+        r'maximum model length\s+(\d{4,})',  # vLLM alt: "exceeds maximum model length 131072"
         r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
         r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
         r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
@@ -1805,8 +1869,8 @@ def get_model_context_length(
        e. Ollama native /api/show probe (any base_url, provider-agnostic)
        f. models.dev registry lookup (with :cloud/-cloud suffix fallback)
     6. OpenRouter live API metadata (Kimi-family 32k guard)
-    7. Hardcoded defaults (broad family patterns, longest-key-first)
-    8. Local server query (last resort)
+    7. Local server query (before hardcoded defaults for local endpoints)
+    8. Hardcoded defaults (broad family patterns, longest-key-first)
     9. Default fallback (256K)"""
     # 0. Explicit config override — user knows best
     if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
@@ -1866,7 +1930,7 @@ def get_model_context_length(
     # LM Studio is excluded — its loaded context length is transient (the
     # user can reload the model with a different context_length at any time
     # via /api/v1/models/load), so a stale cached value would mask reloads.
-    if base_url and provider != "lmstudio":
+    if base_url and not _skip_persistent_context_cache(base_url, provider):
         cached = get_cached_context_length(model, base_url)
         if cached is not None:
             # Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
@@ -1931,6 +1995,10 @@ def get_model_context_length(
                 )
                 # Fall through; step 5b reconciles and overwrites if portal responds.
             else:
+                if is_local_endpoint(base_url):
+                    return _reconcile_local_cached_context_length(
+                        model, base_url, cached, api_key=api_key,
+                    )
                 return cached
 
     # 1b. AWS Bedrock — use static context length table.
@@ -1975,14 +2043,15 @@ def get_model_context_length(
             # 404/405 quickly.  Fall through on failure.
             ctx = _query_ollama_api_show(model, base_url, api_key=api_key)
             if ctx is not None:
-                save_context_length(model, base_url, ctx)
+                if not _skip_persistent_context_cache(base_url, provider):
+                    save_context_length(model, base_url, ctx)
                 return ctx
             # 3. Try querying local server directly
             if is_local_endpoint(base_url):
                 local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
                 if local_ctx and local_ctx > 0:
-                    if provider != "lmstudio":
-                        save_context_length(model, base_url, local_ctx)
+                    if not _skip_persistent_context_cache(base_url, provider):
+                        _maybe_cache_local_context_length(model, base_url, local_ctx)
                     return local_ctx
             logger.info(
                 "Could not detect context length for model %r at %s — "
@@ -2088,7 +2157,8 @@ def get_model_context_length(
     if base_url:
         ctx = _query_ollama_api_show(model, base_url, api_key=api_key)
         if ctx is not None:
-            save_context_length(model, base_url, ctx)
+            if not _skip_persistent_context_cache(base_url, provider):
+                save_context_length(model, base_url, ctx)
             return ctx
     # 5f. OpenRouter live /models metadata — authoritative for OpenRouter-routed
     # models. OpenRouter's catalog carries per-model context_length (e.g.
@@ -2147,7 +2217,15 @@ def get_model_context_length(
             else:
                 return or_ctx
 
-    # 7. (reserved)
+    # 7. Query local server before hardcoded defaults — model names like
+    # ``Hermes-3-Llama-3.1-70B`` substring-match ``llama`` (131072) even when
+    # vLLM is running at a lower ``--max-model-len`` (e.g. 32768 on limited VRAM).
+    if base_url and is_local_endpoint(base_url):
+        local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
+        if local_ctx and local_ctx > 0:
+            if not _skip_persistent_context_cache(base_url, provider):
+                _maybe_cache_local_context_length(model, base_url, local_ctx)
+            return local_ctx
 
     # 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
     # Only check `default_model in model` (is the key a substring of the input).
@@ -2160,15 +2238,7 @@ def get_model_context_length(
         if default_model in model_lower:
             return length
 
-    # 9. Query local server as last resort
-    if base_url and is_local_endpoint(base_url):
-        local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
-        if local_ctx and local_ctx > 0:
-            if provider != "lmstudio":
-                save_context_length(model, base_url, local_ctx)
-            return local_ctx
-
-    # 10. Default fallback — 256K
+    # 9. Default fallback — 256K
     return DEFAULT_FALLBACK_CONTEXT