fix(agent): honor live vLLM context limits on local endpoints

Reconcile stale local disk cache against live vLLM/Ollama max_model_len
probes, probe local servers before the llama hardcoded default, parse
vLLM max_model_len overflow errors, and surface the non-agentic Hermes 3/4
warning at agent init on gateway/TUI.

Sub-64K live probes are returned for startup rejection but are not
persisted to the context cache — preserving the 64K minimum-context
contract instead of normalizing undersized windows as valid config.

(cherry picked from commit c3a02db4fd9d57b7b0eb2732de91f8334d311aa5)
This commit is contained in:
infinitycrew39 2026-07-02 22:21:52 +07:00 committed by kshitijk4poor
parent 30e947e0a0
commit cecedcddf3
2 changed files with 109 additions and 17 deletions

View file

@ -1721,6 +1721,28 @@ def init_agent(
f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
)
# Nous Hermes 3/4 are chat models, not tool-call-tuned — surface the
# warning on every platform (CLI already did this; gateway/TUI did not).
if not agent.quiet_mode:
try:
from hermes_cli.model_switch import _check_hermes_model_warning
_hermes_warn = _check_hermes_model_warning(agent.model or "")
if _hermes_warn:
_user_msg = (
"⚠ Nous Research Hermes 3 & 4 models are NOT agentic — they "
"lack reliable tool-calling for agent workflows (delegation, "
"cron, proactive tools). Consider an agentic model instead "
"(Claude, GPT, Gemini, Qwen-Coder, etc.)."
)
if hasattr(agent, "_emit_warning"):
agent._emit_warning(_user_msg)
else:
print(f"\n{_user_msg}\n", file=sys.stderr)
_ra().logger.warning(_hermes_warn)
except Exception:
pass
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
# Skip names that are already present — the _ra().get_tool_definitions()
# quiet_mode cache returned a shared list pre-#17335, so a stray

View file

@ -496,6 +496,68 @@ def _is_known_provider_base_url(base_url: str) -> bool:
return _infer_provider_from_url(base_url) is not None
def _skip_persistent_context_cache(base_url: str, provider: str) -> bool:
"""Return True when the on-disk context cache must not short-circuit probing.
LM Studio excludes caching because loaded context is transient the user
can reload the model with a different context_length at any time.
"""
return provider == "lmstudio"
def _maybe_cache_local_context_length(
    model: str,
    base_url: str,
    length: int,
) -> None:
    """Store a locally probed context length on disk if it meets the minimum.

    A window below ``MINIMUM_CONTEXT_LENGTH`` (e.g. vLLM started with
    ``--max-model-len 32768``) is deliberately left out of the on-disk cache
    so it is never recorded as if it were a valid operating limit. This
    helper only decides about persistence; it returns ``None`` either way and
    leaves it to the caller to hand the probed value onward.

    Args:
        model: Model identifier used as part of the cache key.
        base_url: Endpoint URL used as part of the cache key.
        length: Context length reported by the live probe.
    """
    # Guard clause: undersized windows are never written to the cache.
    if not (length >= MINIMUM_CONTEXT_LENGTH):
        return
    save_context_length(model, base_url, length)
def _reconcile_local_cached_context_length(
    model: str,
    base_url: str,
    cached: int,
    api_key: str = "",
) -> int:
    """Check a cached context length against a live probe of the local server.

    vLLM/Ollama operators can restart with a new ``--max-model-len`` /
    ``num_ctx`` without changing the model id, which leaves the disk entry
    stale. The server is probed once; a positive result that differs from
    *cached* wins, and the old cache entry is invalidated.

    A differing live value below ``MINIMUM_CONTEXT_LENGTH`` still invalidates
    the stale entry and is returned, but it is not persisted; the log message
    notes that agent init is expected to reject it.

    Args:
        model: Model identifier used for the probe and the cache key.
        base_url: Local endpoint URL used for the probe and the cache key.
        cached: Context length currently stored in the on-disk cache.
        api_key: Optional credential forwarded to the probe.

    Returns:
        The live context length when the probe reports a positive value
        different from *cached*; otherwise *cached* unchanged (including when
        the probe yields nothing usable).
    """
    probed = _query_local_context_length(model, base_url, api_key=api_key)

    # Nothing usable from the probe, or it agrees with disk: keep the cache.
    if not (probed and probed > 0 and probed != cached):
        return cached

    undersized = probed < MINIMUM_CONTEXT_LENGTH
    if undersized:
        logger.info(
            "Live local probe for %s@%s reports %s (< minimum %s); "
            "invalidating stale cache — agent init should reject",
            model, base_url, f"{probed:,}", f"{MINIMUM_CONTEXT_LENGTH:,}",
        )
    else:
        logger.info(
            "Reconciling stale local cache entry %s@%s: %s -> %s (live probe)",
            model, base_url, f"{cached:,}", f"{probed:,}",
        )

    # The disk entry is stale in both cases; only an adequate window is
    # written back in its place.
    _invalidate_cached_context_length(model, base_url)
    if not undersized:
        _maybe_cache_local_context_length(model, base_url, probed)
    return probed
def is_local_endpoint(base_url: str) -> bool:
"""Return True if base_url points to a local machine.
@ -1006,6 +1068,8 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
error_lower = error_msg.lower()
# Pattern: look for numbers near context-related keywords
patterns = [
r'max_model_len\s+(\d{4,})', # vLLM: "exceeds the max_model_len 32768"
r'maximum model length\s+(\d{4,})', # vLLM alt: "exceeds maximum model length 131072"
r'(?:max(?:imum)?|limit)\s*(?:context\s*)?(?:length|size|window)?\s*(?:is|of|:)?\s*(\d{4,})',
r'context\s*(?:length|size|window)\s*(?:is|of|:)?\s*(\d{4,})',
r'(\d{4,})\s*(?:token)?\s*(?:context|limit)',
@ -1805,8 +1869,8 @@ def get_model_context_length(
e. Ollama native /api/show probe (any base_url, provider-agnostic)
f. models.dev registry lookup (with :cloud/-cloud suffix fallback)
6. OpenRouter live API metadata (Kimi-family 32k guard)
7. Hardcoded defaults (broad family patterns, longest-key-first)
8. Local server query (last resort)
7. Local server query (before hardcoded defaults for local endpoints)
8. Hardcoded defaults (broad family patterns, longest-key-first)
9. Default fallback (256K)"""
# 0. Explicit config override — user knows best
if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
@ -1866,7 +1930,7 @@ def get_model_context_length(
# LM Studio is excluded — its loaded context length is transient (the
# user can reload the model with a different context_length at any time
# via /api/v1/models/load), so a stale cached value would mask reloads.
if base_url and provider != "lmstudio":
if base_url and not _skip_persistent_context_cache(base_url, provider):
cached = get_cached_context_length(model, base_url)
if cached is not None:
# Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
@ -1931,6 +1995,10 @@ def get_model_context_length(
)
# Fall through; step 5b reconciles and overwrites if portal responds.
else:
if is_local_endpoint(base_url):
return _reconcile_local_cached_context_length(
model, base_url, cached, api_key=api_key,
)
return cached
# 1b. AWS Bedrock — use static context length table.
@ -1975,14 +2043,15 @@ def get_model_context_length(
# 404/405 quickly. Fall through on failure.
ctx = _query_ollama_api_show(model, base_url, api_key=api_key)
if ctx is not None:
save_context_length(model, base_url, ctx)
if not _skip_persistent_context_cache(base_url, provider):
save_context_length(model, base_url, ctx)
return ctx
# 3. Try querying local server directly
if is_local_endpoint(base_url):
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
if local_ctx and local_ctx > 0:
if provider != "lmstudio":
save_context_length(model, base_url, local_ctx)
if not _skip_persistent_context_cache(base_url, provider):
_maybe_cache_local_context_length(model, base_url, local_ctx)
return local_ctx
logger.info(
"Could not detect context length for model %r at %s"
@ -2088,7 +2157,8 @@ def get_model_context_length(
if base_url:
ctx = _query_ollama_api_show(model, base_url, api_key=api_key)
if ctx is not None:
save_context_length(model, base_url, ctx)
if not _skip_persistent_context_cache(base_url, provider):
save_context_length(model, base_url, ctx)
return ctx
# 5f. OpenRouter live /models metadata — authoritative for OpenRouter-routed
# models. OpenRouter's catalog carries per-model context_length (e.g.
@ -2147,7 +2217,15 @@ def get_model_context_length(
else:
return or_ctx
# 7. (reserved)
# 7. Query local server before hardcoded defaults — model names like
# ``Hermes-3-Llama-3.1-70B`` substring-match ``llama`` (131072) even when
# vLLM is running at a lower ``--max-model-len`` (e.g. 32768 on limited VRAM).
if base_url and is_local_endpoint(base_url):
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
if local_ctx and local_ctx > 0:
if not _skip_persistent_context_cache(base_url, provider):
_maybe_cache_local_context_length(model, base_url, local_ctx)
return local_ctx
# 8. Hardcoded defaults (fuzzy match — longest key first for specificity)
# Only check `default_model in model` (is the key a substring of the input).
@ -2160,15 +2238,7 @@ def get_model_context_length(
if default_model in model_lower:
return length
# 9. Query local server as last resort
if base_url and is_local_endpoint(base_url):
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
if local_ctx and local_ctx > 0:
if provider != "lmstudio":
save_context_length(model, base_url, local_ctx)
return local_ctx
# 10. Default fallback — 256K
# 9. Default fallback — 256K
return DEFAULT_FALLBACK_CONTEXT