feat(agent): add lmstudio integration
This commit is contained in:
parent
7d4648461a
commit
214ca943ac
26 changed files with 1137 additions and 40 deletions
48
agent/lmstudio_reasoning.py
Normal file
48
agent/lmstudio_reasoning.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
"""LM Studio reasoning-effort resolution shared by the chat-completions
|
||||||
|
transport and run_agent's iteration-limit summary path.
|
||||||
|
|
||||||
|
LM Studio publishes per-model ``capabilities.reasoning.allowed_options`` (e.g.
|
||||||
|
``["off","on"]`` for toggle-style models, ``["off","minimal","low"]`` for
|
||||||
|
graduated models). We map the user's ``reasoning_config`` onto LM Studio's
|
||||||
|
OpenAI-compatible vocabulary, then clamp against the model's allowed set so
|
||||||
|
the server doesn't 400 on an unsupported effort.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
# LM Studio accepts these top-level reasoning_effort values via its
# OpenAI-compatible chat.completions endpoint.
_LM_VALID_EFFORTS = {"none", "minimal", "low", "medium", "high", "xhigh"}

# Toggle-style models publish allowed_options as ["off","on"] in /api/v1/models.
# Map them onto the OpenAI-compatible request vocabulary.
_LM_EFFORT_ALIASES = {"off": "none", "on": "medium"}


def _normalize_lm_effort(value: object) -> str:
    """Return *value* stripped, lowercased and alias-mapped.

    Non-string values (``None``, booleans produced by a YAML ``on``/``off``,
    numbers, ...) normalise to ``""``, which is never a valid effort, so
    callers can treat them as "not specified" instead of crashing.
    """
    if not isinstance(value, str):
        return ""
    token = value.strip().lower()
    return _LM_EFFORT_ALIASES.get(token, token)


def resolve_lmstudio_effort(
    reasoning_config: Optional[dict],
    allowed_options: Optional[List[str]],
) -> Optional[str]:
    """Return the ``reasoning_effort`` string to send to LM Studio, or ``None``.

    Args:
        reasoning_config: User reasoning settings. ``{"enabled": False}``
            resolves to ``"none"``; otherwise ``"effort"`` is used when it
            names a valid level (case/whitespace-insensitive, ``"off"`` and
            ``"on"`` accepted as aliases). Anything else, including a missing
            or non-string effort, falls back to ``"medium"``.
        allowed_options: Raw ``allowed_options`` published by the model.
            Entries are normalised the same way as the requested effort
            before comparison.

    Returns:
        The effort to send, or ``None`` meaning "omit the field": the user
        picked a level the model can't honor, so let LM Studio fall back to
        the model's declared default rather than silently substituting a
        different effort. When ``allowed_options`` is falsy (probe failed),
        clamping is skipped and the resolved effort is returned anyway.
    """
    effort = "medium"
    if reasoning_config and isinstance(reasoning_config, dict):
        if reasoning_config.get("enabled") is False:
            effort = "none"
        else:
            requested = _normalize_lm_effort(reasoning_config.get("effort"))
            if requested in _LM_VALID_EFFORTS:
                effort = requested

    if allowed_options:
        # Normalise the server-published options too, so "OFF" / " Low " still
        # match and malformed (non-string) entries are ignored, not raised on.
        allowed = {_normalize_lm_effort(opt) for opt in allowed_options}
        if effort not in allowed:
            return None
    return effort
|
||||||
|
|
@ -1281,7 +1281,10 @@ def get_model_context_length(
|
||||||
model = _strip_provider_prefix(model)
|
model = _strip_provider_prefix(model)
|
||||||
|
|
||||||
# 1. Check persistent cache (model+provider)
|
# 1. Check persistent cache (model+provider)
|
||||||
if base_url:
|
# LM Studio is excluded — its loaded context length is transient (the
|
||||||
|
# user can reload the model with a different context_length at any time
|
||||||
|
# via /api/v1/models/load), so a stale cached value would mask reloads.
|
||||||
|
if base_url and provider != "lmstudio":
|
||||||
cached = get_cached_context_length(model, base_url)
|
cached = get_cached_context_length(model, base_url)
|
||||||
if cached is not None:
|
if cached is not None:
|
||||||
# Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
|
# Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
|
||||||
|
|
@ -1334,7 +1337,8 @@ def get_model_context_length(
|
||||||
if is_local_endpoint(base_url):
|
if is_local_endpoint(base_url):
|
||||||
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
||||||
if local_ctx and local_ctx > 0:
|
if local_ctx and local_ctx > 0:
|
||||||
save_context_length(model, base_url, local_ctx)
|
if provider != "lmstudio":
|
||||||
|
save_context_length(model, base_url, local_ctx)
|
||||||
return local_ctx
|
return local_ctx
|
||||||
logger.info(
|
logger.info(
|
||||||
"Could not detect context length for model %r at %s — "
|
"Could not detect context length for model %r at %s — "
|
||||||
|
|
@ -1424,7 +1428,8 @@ def get_model_context_length(
|
||||||
if base_url and is_local_endpoint(base_url):
|
if base_url and is_local_endpoint(base_url):
|
||||||
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
local_ctx = _query_local_context_length(model, base_url, api_key=api_key)
|
||||||
if local_ctx and local_ctx > 0:
|
if local_ctx and local_ctx > 0:
|
||||||
save_context_length(model, base_url, local_ctx)
|
if provider != "lmstudio":
|
||||||
|
save_context_length(model, base_url, local_ctx)
|
||||||
return local_ctx
|
return local_ctx
|
||||||
|
|
||||||
# 10. Default fallback — 128K
|
# 10. Default fallback — 128K
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ reasoning configuration, temperature handling, and extra_body assembly.
|
||||||
import copy
|
import copy
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from agent.lmstudio_reasoning import resolve_lmstudio_effort
|
||||||
from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
|
from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
|
||||||
from agent.prompt_builder import DEVELOPER_ROLE_MODELS
|
from agent.prompt_builder import DEVELOPER_ROLE_MODELS
|
||||||
from agent.transports.base import ProviderTransport
|
from agent.transports.base import ProviderTransport
|
||||||
|
|
@ -153,6 +154,8 @@ class ChatCompletionsTransport(ProviderTransport):
|
||||||
is_github_models: bool
|
is_github_models: bool
|
||||||
is_nvidia_nim: bool
|
is_nvidia_nim: bool
|
||||||
is_kimi: bool
|
is_kimi: bool
|
||||||
|
is_tokenhub: bool
|
||||||
|
is_lmstudio: bool
|
||||||
is_custom_provider: bool
|
is_custom_provider: bool
|
||||||
ollama_num_ctx: int | None
|
ollama_num_ctx: int | None
|
||||||
# Provider routing
|
# Provider routing
|
||||||
|
|
@ -166,6 +169,7 @@ class ChatCompletionsTransport(ProviderTransport):
|
||||||
# Reasoning
|
# Reasoning
|
||||||
supports_reasoning: bool
|
supports_reasoning: bool
|
||||||
github_reasoning_extra: dict | None
|
github_reasoning_extra: dict | None
|
||||||
|
lmstudio_reasoning_options: list[str] | None # raw allowed_options from /api/v1/models
|
||||||
# Claude on OpenRouter/Nous max output
|
# Claude on OpenRouter/Nous max output
|
||||||
anthropic_max_output: int | None
|
anthropic_max_output: int | None
|
||||||
# Extra
|
# Extra
|
||||||
|
|
@ -287,6 +291,18 @@ class ChatCompletionsTransport(ProviderTransport):
|
||||||
_tokenhub_effort = _e
|
_tokenhub_effort = _e
|
||||||
api_kwargs["reasoning_effort"] = _tokenhub_effort
|
api_kwargs["reasoning_effort"] = _tokenhub_effort
|
||||||
|
|
||||||
|
# LM Studio: top-level reasoning_effort. Only emit when the model
|
||||||
|
# declares reasoning support via /api/v1/models capabilities (gated
|
||||||
|
# upstream by params["supports_reasoning"]). resolve_lmstudio_effort
|
||||||
|
# is shared with run_agent's summary path so both stay in sync.
|
||||||
|
if params.get("is_lmstudio", False) and params.get("supports_reasoning", False):
|
||||||
|
_lm_effort = resolve_lmstudio_effort(
|
||||||
|
reasoning_config,
|
||||||
|
params.get("lmstudio_reasoning_options"),
|
||||||
|
)
|
||||||
|
if _lm_effort is not None:
|
||||||
|
api_kwargs["reasoning_effort"] = _lm_effort
|
||||||
|
|
||||||
# extra_body assembly
|
# extra_body assembly
|
||||||
extra_body: Dict[str, Any] = {}
|
extra_body: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
|
@ -309,8 +325,9 @@ class ChatCompletionsTransport(ProviderTransport):
|
||||||
"type": "enabled" if _kimi_thinking_enabled else "disabled",
|
"type": "enabled" if _kimi_thinking_enabled else "disabled",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Reasoning
|
# Reasoning. LM Studio is handled above via top-level reasoning_effort,
|
||||||
if params.get("supports_reasoning", False):
|
# so skip emitting extra_body.reasoning for it.
|
||||||
|
if params.get("supports_reasoning", False) and not params.get("is_lmstudio", False):
|
||||||
if is_github_models:
|
if is_github_models:
|
||||||
gh_reasoning = params.get("github_reasoning_extra")
|
gh_reasoning = params.get("github_reasoning_extra")
|
||||||
if gh_reasoning is not None:
|
if gh_reasoning is not None:
|
||||||
|
|
|
||||||
|
|
@ -30,14 +30,13 @@ model:
|
||||||
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
|
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
|
||||||
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
|
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
|
||||||
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
|
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
|
||||||
|
# "lmstudio" - LM Studio local server (optional: LM_API_KEY, defaults to http://127.0.0.1:1234/v1)
|
||||||
#
|
#
|
||||||
# Local servers (LM Studio, Ollama, vLLM, llama.cpp):
|
# Local servers (Ollama, vLLM, llama.cpp; LM Studio has its own "lmstudio" provider, listed above):
|
||||||
# "custom" - Any OpenAI-compatible endpoint. Set base_url below.
|
# "custom" - Any other OpenAI-compatible endpoint. Set base_url below.
|
||||||
# Aliases: "lmstudio", "ollama", "vllm", "llamacpp" all map to "custom".
|
# Aliases: "ollama", "vllm", "llamacpp" all map to "custom".
|
||||||
# Example for LM Studio:
|
# LM Studio is first-class and uses provider: "lmstudio".
|
||||||
# provider: "lmstudio"
|
# It works with both no-auth and auth-enabled server modes.
|
||||||
# base_url: "http://localhost:1234/v1"
|
|
||||||
# No API key needed — local servers typically ignore auth.
|
|
||||||
#
|
#
|
||||||
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
|
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
|
||||||
provider: "auto"
|
provider: "auto"
|
||||||
|
|
|
||||||
2
cli.py
2
cli.py
|
|
@ -5459,6 +5459,8 @@ class HermesCLI:
|
||||||
try:
|
try:
|
||||||
providers = list_authenticated_providers(
|
providers = list_authenticated_providers(
|
||||||
current_provider=self.provider or "",
|
current_provider=self.provider or "",
|
||||||
|
current_base_url=self.base_url or "",
|
||||||
|
current_model=self.model or "",
|
||||||
user_providers=user_provs,
|
user_providers=user_provs,
|
||||||
custom_providers=custom_provs,
|
custom_providers=custom_provs,
|
||||||
max_models=50,
|
max_models=50,
|
||||||
|
|
|
||||||
|
|
@ -6169,6 +6169,7 @@ class GatewayRunner:
|
||||||
providers = list_authenticated_providers(
|
providers = list_authenticated_providers(
|
||||||
current_provider=current_provider,
|
current_provider=current_provider,
|
||||||
current_base_url=current_base_url,
|
current_base_url=current_base_url,
|
||||||
|
current_model=current_model,
|
||||||
user_providers=user_provs,
|
user_providers=user_provs,
|
||||||
custom_providers=custom_provs,
|
custom_providers=custom_provs,
|
||||||
max_models=50,
|
max_models=50,
|
||||||
|
|
@ -6290,6 +6291,7 @@ class GatewayRunner:
|
||||||
providers = list_authenticated_providers(
|
providers = list_authenticated_providers(
|
||||||
current_provider=current_provider,
|
current_provider=current_provider,
|
||||||
current_base_url=current_base_url,
|
current_base_url=current_base_url,
|
||||||
|
current_model=current_model,
|
||||||
user_providers=user_provs,
|
user_providers=user_provs,
|
||||||
custom_providers=custom_provs,
|
custom_providers=custom_provs,
|
||||||
max_models=5,
|
max_models=5,
|
||||||
|
|
|
||||||
|
|
@ -110,6 +110,12 @@ SERVICE_PROVIDER_NAMES: Dict[str, str] = {
|
||||||
DEFAULT_GEMINI_CLOUDCODE_BASE_URL = "cloudcode-pa://google"
|
DEFAULT_GEMINI_CLOUDCODE_BASE_URL = "cloudcode-pa://google"
|
||||||
GEMINI_OAUTH_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 60 # refresh 60s before expiry
|
GEMINI_OAUTH_ACCESS_TOKEN_REFRESH_SKEW_SECONDS = 60 # refresh 60s before expiry
|
||||||
|
|
||||||
|
# LM Studio's default no-auth mode still requires *some* non-empty bearer for
|
||||||
|
# the API-key code paths (auxiliary_client, runtime resolver) to treat the
|
||||||
|
# provider as configured. This sentinel is sent only to LM Studio, never to
|
||||||
|
# any remote service.
|
||||||
|
LMSTUDIO_NOAUTH_PLACEHOLDER = "dummy-lm-api-key"
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Provider Registry
|
# Provider Registry
|
||||||
|
|
@ -160,6 +166,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
|
||||||
auth_type="oauth_external",
|
auth_type="oauth_external",
|
||||||
inference_base_url=DEFAULT_GEMINI_CLOUDCODE_BASE_URL,
|
inference_base_url=DEFAULT_GEMINI_CLOUDCODE_BASE_URL,
|
||||||
),
|
),
|
||||||
|
"lmstudio": ProviderConfig(
|
||||||
|
id="lmstudio",
|
||||||
|
name="LM Studio",
|
||||||
|
auth_type="api_key",
|
||||||
|
inference_base_url="http://127.0.0.1:1234/v1",
|
||||||
|
api_key_env_vars=("LM_API_KEY",),
|
||||||
|
base_url_env_var="LM_BASE_URL",
|
||||||
|
),
|
||||||
"copilot": ProviderConfig(
|
"copilot": ProviderConfig(
|
||||||
id="copilot",
|
id="copilot",
|
||||||
name="GitHub Copilot",
|
name="GitHub Copilot",
|
||||||
|
|
@ -1155,8 +1169,8 @@ def resolve_provider(
|
||||||
"aws": "bedrock", "aws-bedrock": "bedrock", "amazon-bedrock": "bedrock", "amazon": "bedrock",
|
"aws": "bedrock", "aws-bedrock": "bedrock", "amazon-bedrock": "bedrock", "amazon": "bedrock",
|
||||||
"go": "opencode-go", "opencode-go-sub": "opencode-go",
|
"go": "opencode-go", "opencode-go-sub": "opencode-go",
|
||||||
"kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
|
"kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
|
||||||
|
"lmstudio": "lmstudio", "lm-studio": "lmstudio", "lm_studio": "lmstudio",
|
||||||
# Local server aliases — route through the generic custom provider
|
# Local server aliases — route through the generic custom provider
|
||||||
"lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom",
|
|
||||||
"ollama": "custom", "ollama_cloud": "ollama-cloud",
|
"ollama": "custom", "ollama_cloud": "ollama-cloud",
|
||||||
"vllm": "custom", "llamacpp": "custom",
|
"vllm": "custom", "llamacpp": "custom",
|
||||||
"llama.cpp": "custom", "llama-cpp": "custom",
|
"llama.cpp": "custom", "llama-cpp": "custom",
|
||||||
|
|
@ -1203,8 +1217,11 @@ def resolve_provider(
|
||||||
continue
|
continue
|
||||||
# GitHub tokens are commonly present for repo/tool access but should not
|
# GitHub tokens are commonly present for repo/tool access but should not
|
||||||
# hijack inference auto-selection unless the user explicitly chooses
|
# hijack inference auto-selection unless the user explicitly chooses
|
||||||
# Copilot/GitHub Models as the provider.
|
# Copilot/GitHub Models as the provider. LM Studio is a local server
|
||||||
if pid == "copilot":
|
# whose availability isn't implied by LM_API_KEY presence (it may be
|
||||||
|
# offline, and the no-auth setup uses a placeholder value), so it
|
||||||
|
# also requires explicit selection.
|
||||||
|
if pid in ("copilot", "lmstudio"):
|
||||||
continue
|
continue
|
||||||
for env_var in pconfig.api_key_env_vars:
|
for env_var in pconfig.api_key_env_vars:
|
||||||
if has_usable_secret(os.getenv(env_var, "")):
|
if has_usable_secret(os.getenv(env_var, "")):
|
||||||
|
|
@ -3482,6 +3499,13 @@ def resolve_api_key_provider_credentials(provider_id: str) -> Dict[str, Any]:
|
||||||
key_source = ""
|
key_source = ""
|
||||||
api_key, key_source = _resolve_api_key_provider_secret(provider_id, pconfig)
|
api_key, key_source = _resolve_api_key_provider_secret(provider_id, pconfig)
|
||||||
|
|
||||||
|
# No-auth LM Studio: substitute a placeholder so runtime / auxiliary_client
|
||||||
|
# see the local server as configured. doctor still reports unconfigured
|
||||||
|
# because get_api_key_provider_status uses the raw secret resolver.
|
||||||
|
if not api_key and provider_id == "lmstudio":
|
||||||
|
api_key = LMSTUDIO_NOAUTH_PLACEHOLDER
|
||||||
|
key_source = key_source or "default"
|
||||||
|
|
||||||
env_url = ""
|
env_url = ""
|
||||||
if pconfig.base_url_env_var:
|
if pconfig.base_url_env_var:
|
||||||
env_url = os.getenv(pconfig.base_url_env_var, "").strip()
|
env_url = os.getenv(pconfig.base_url_env_var, "").strip()
|
||||||
|
|
|
||||||
|
|
@ -946,6 +946,42 @@ def slack_subcommand_map() -> dict[str, str]:
|
||||||
# Autocomplete
|
# Autocomplete
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Per-process cache for /model<space> LM Studio autocomplete. Probing on
|
||||||
|
# every keystroke would block the UI; a short TTL keeps it live without
|
||||||
|
# hammering the server.
|
||||||
|
_LMSTUDIO_COMPLETION_CACHE: tuple[float, list[str]] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _lmstudio_completion_models() -> list[str]:
|
||||||
|
"""Locally-loaded LM Studio models for /model autocomplete (cached, gated)."""
|
||||||
|
global _LMSTUDIO_COMPLETION_CACHE
|
||||||
|
# Gate: don't probe 127.0.0.1 on every keystroke for users who don't use LM Studio.
|
||||||
|
if not (os.environ.get("LM_API_KEY") or os.environ.get("LM_BASE_URL")):
|
||||||
|
try:
|
||||||
|
from hermes_cli.auth import _load_auth_store
|
||||||
|
store = _load_auth_store() or {}
|
||||||
|
if "lmstudio" not in (store.get("providers") or {}) \
|
||||||
|
and "lmstudio" not in (store.get("credential_pool") or {}):
|
||||||
|
return []
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
now = time.time()
|
||||||
|
if _LMSTUDIO_COMPLETION_CACHE and (now - _LMSTUDIO_COMPLETION_CACHE[0]) < 30.0:
|
||||||
|
return _LMSTUDIO_COMPLETION_CACHE[1]
|
||||||
|
try:
|
||||||
|
from hermes_cli.models import fetch_lmstudio_models
|
||||||
|
models = fetch_lmstudio_models(
|
||||||
|
api_key=os.environ.get("LM_API_KEY", ""),
|
||||||
|
base_url=os.environ.get("LM_BASE_URL") or "http://127.0.0.1:1234/v1",
|
||||||
|
timeout=0.8,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
models = []
|
||||||
|
_LMSTUDIO_COMPLETION_CACHE = (now, models)
|
||||||
|
return models
|
||||||
|
|
||||||
|
|
||||||
class SlashCommandCompleter(Completer):
|
class SlashCommandCompleter(Completer):
|
||||||
"""Autocomplete for built-in slash commands, subcommands, and skill commands."""
|
"""Autocomplete for built-in slash commands, subcommands, and skill commands."""
|
||||||
|
|
||||||
|
|
@ -1369,6 +1405,19 @@ class SlashCommandCompleter(Completer):
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
# LM Studio: surface locally-loaded models. Gated on the user actually
|
||||||
|
# having LM Studio configured (env var or auth-store entry) so we
|
||||||
|
# don't probe 127.0.0.1 on every keystroke for users who don't use it.
|
||||||
|
for name in _lmstudio_completion_models():
|
||||||
|
if name in seen:
|
||||||
|
continue
|
||||||
|
if name.startswith(sub_lower) and name != sub_lower:
|
||||||
|
yield Completion(
|
||||||
|
name,
|
||||||
|
start_position=-len(sub_text),
|
||||||
|
display=name,
|
||||||
|
display_meta="LM Studio",
|
||||||
|
)
|
||||||
|
|
||||||
def get_completions(self, document, complete_event):
|
def get_completions(self, document, complete_event):
|
||||||
text = document.text_before_cursor
|
text = document.text_before_cursor
|
||||||
|
|
|
||||||
|
|
@ -1123,7 +1123,7 @@ DEFAULT_CONFIG = {
|
||||||
},
|
},
|
||||||
|
|
||||||
# Config schema version - bump this when adding new required fields
|
# Config schema version - bump this when adding new required fields
|
||||||
"_config_version": 22,
|
"_config_version": 23,
|
||||||
}
|
}
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
@ -1223,6 +1223,22 @@ OPTIONAL_ENV_VARS = {
|
||||||
"category": "provider",
|
"category": "provider",
|
||||||
"advanced": True,
|
"advanced": True,
|
||||||
},
|
},
|
||||||
|
"LM_API_KEY": {
|
||||||
|
"description": "LM Studio bearer token for auth-enabled local servers",
|
||||||
|
"prompt": "LM Studio API key / bearer token",
|
||||||
|
"url": None,
|
||||||
|
"password": True,
|
||||||
|
"category": "provider",
|
||||||
|
"advanced": True,
|
||||||
|
},
|
||||||
|
"LM_BASE_URL": {
|
||||||
|
"description": "LM Studio base URL override",
|
||||||
|
"prompt": "LM Studio base URL (leave empty for default)",
|
||||||
|
"url": None,
|
||||||
|
"password": False,
|
||||||
|
"category": "provider",
|
||||||
|
"advanced": True,
|
||||||
|
},
|
||||||
"GLM_API_KEY": {
|
"GLM_API_KEY": {
|
||||||
"description": "Z.AI / GLM API key (also recognized as ZAI_API_KEY / Z_AI_API_KEY)",
|
"description": "Z.AI / GLM API key (also recognized as ZAI_API_KEY / Z_AI_API_KEY)",
|
||||||
"prompt": "Z.AI / GLM API key",
|
"prompt": "Z.AI / GLM API key",
|
||||||
|
|
@ -3107,6 +3123,28 @@ def migrate_config(interactive: bool = True, quiet: bool = False) -> Dict[str, A
|
||||||
"Use `hermes plugins enable <name>` to activate."
|
"Use `hermes plugins enable <name>` to activate."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── Version 22 → 23: ensure LM_API_KEY is set when provider is lmstudio ──
|
||||||
|
# LM Studio's documented default is no-auth, but our API-key registry
|
||||||
|
# path needs *some* non-empty value to satisfy auxiliary_client and
|
||||||
|
# runtime resolution. Self-heal users whose config.yaml has
|
||||||
|
# provider:lmstudio but no LM_API_KEY in .env (cross-machine sync,
|
||||||
|
# manual edit, profile move).
|
||||||
|
if current_ver < 23:
|
||||||
|
try:
|
||||||
|
from hermes_cli.auth import LMSTUDIO_NOAUTH_PLACEHOLDER
|
||||||
|
config = load_config()
|
||||||
|
model_cfg = config.get("model")
|
||||||
|
if isinstance(model_cfg, dict) and str(model_cfg.get("provider") or "").strip().lower() == "lmstudio":
|
||||||
|
if not get_env_value("LM_API_KEY"):
|
||||||
|
save_env_value("LM_API_KEY", LMSTUDIO_NOAUTH_PLACEHOLDER)
|
||||||
|
results["env_added"].append(
|
||||||
|
f"LM_API_KEY={LMSTUDIO_NOAUTH_PLACEHOLDER} (placeholder for no-auth LM Studio)"
|
||||||
|
)
|
||||||
|
if not quiet:
|
||||||
|
print(" ✓ Added placeholder LM_API_KEY for LM Studio (no-auth default)")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if current_ver < latest_ver and not quiet:
|
if current_ver < latest_ver and not quiet:
|
||||||
print(f"Config version: {current_ver} → {latest_ver}")
|
print(f"Config version: {current_ver} → {latest_ver}")
|
||||||
|
|
||||||
|
|
@ -3806,7 +3844,7 @@ def save_env_value(key: str, value: str):
|
||||||
value = _check_non_ascii_credential(key, value)
|
value = _check_non_ascii_credential(key, value)
|
||||||
ensure_hermes_home()
|
ensure_hermes_home()
|
||||||
env_path = get_env_path()
|
env_path = get_env_path()
|
||||||
|
|
||||||
# On Windows, open() defaults to the system locale (cp1252) which can
|
# On Windows, open() defaults to the system locale (cp1252) which can
|
||||||
# cause OSError errno 22 on UTF-8 .env files.
|
# cause OSError errno 22 on UTF-8 .env files.
|
||||||
read_kw = {"encoding": "utf-8", "errors": "replace"} if _IS_WINDOWS else {}
|
read_kw = {"encoding": "utf-8", "errors": "replace"} if _IS_WINDOWS else {}
|
||||||
|
|
@ -3818,7 +3856,7 @@ def save_env_value(key: str, value: str):
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
# Sanitize on every read: split concatenated keys, drop stale placeholders
|
# Sanitize on every read: split concatenated keys, drop stale placeholders
|
||||||
lines = _sanitize_env_lines(lines)
|
lines = _sanitize_env_lines(lines)
|
||||||
|
|
||||||
# Find and update or append
|
# Find and update or append
|
||||||
found = False
|
found = False
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
|
|
@ -3826,7 +3864,7 @@ def save_env_value(key: str, value: str):
|
||||||
lines[i] = f"{key}={value}\n"
|
lines[i] = f"{key}={value}\n"
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not found:
|
if not found:
|
||||||
# Ensure there's a newline at the end of the file before appending
|
# Ensure there's a newline at the end of the file before appending
|
||||||
if lines and not lines[-1].endswith("\n"):
|
if lines and not lines[-1].endswith("\n"):
|
||||||
|
|
|
||||||
|
|
@ -344,7 +344,7 @@ def run_doctor(args):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Warn if model is set to a provider-prefixed name on a provider that doesn't use them
|
# Warn if model is set to a provider-prefixed name on a provider that doesn't use them
|
||||||
if default_model and "/" in default_model and canonical_provider and canonical_provider not in ("openrouter", "custom", "auto", "ai-gateway", "kilocode", "opencode-zen", "huggingface", "nous"):
|
if default_model and "/" in default_model and canonical_provider and canonical_provider not in ("openrouter", "custom", "auto", "ai-gateway", "kilocode", "opencode-zen", "huggingface", "nous", "lmstudio"):
|
||||||
check_warn(
|
check_warn(
|
||||||
f"model.default '{default_model}' uses a vendor/model slug but provider is '{provider_raw}'",
|
f"model.default '{default_model}' uses a vendor/model slug but provider is '{provider_raw}'",
|
||||||
"(vendor-prefixed slugs belong to aggregators like openrouter)",
|
"(vendor-prefixed slugs belong to aggregators like openrouter)",
|
||||||
|
|
|
||||||
|
|
@ -1821,6 +1821,7 @@ def select_provider_and_model(args=None):
|
||||||
"nvidia",
|
"nvidia",
|
||||||
"ollama-cloud",
|
"ollama-cloud",
|
||||||
"tencent-tokenhub",
|
"tencent-tokenhub",
|
||||||
|
"lmstudio",
|
||||||
):
|
):
|
||||||
_model_flow_api_key_provider(config, selected_provider, current_model)
|
_model_flow_api_key_provider(config, selected_provider, current_model)
|
||||||
|
|
||||||
|
|
@ -2047,7 +2048,11 @@ def _aux_select_for_task(task: str) -> None:
|
||||||
|
|
||||||
# Gather authenticated providers (has credentials + curated model list)
|
# Gather authenticated providers (has credentials + curated model list)
|
||||||
try:
|
try:
|
||||||
providers = list_authenticated_providers(current_provider=current_provider)
|
providers = list_authenticated_providers(
|
||||||
|
current_provider=current_provider,
|
||||||
|
current_model=current_model,
|
||||||
|
current_base_url=current_base_url,
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"Could not detect authenticated providers: {exc}")
|
print(f"Could not detect authenticated providers: {exc}")
|
||||||
providers = []
|
providers = []
|
||||||
|
|
@ -4377,6 +4382,7 @@ def _model_flow_bedrock(config, current_model=""):
|
||||||
def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
||||||
"""Generic flow for API-key providers (z.ai, MiniMax, OpenCode, etc.)."""
|
"""Generic flow for API-key providers (z.ai, MiniMax, OpenCode, etc.)."""
|
||||||
from hermes_cli.auth import (
|
from hermes_cli.auth import (
|
||||||
|
LMSTUDIO_NOAUTH_PLACEHOLDER,
|
||||||
PROVIDER_REGISTRY,
|
PROVIDER_REGISTRY,
|
||||||
_prompt_model_selection,
|
_prompt_model_selection,
|
||||||
_save_model_choice,
|
_save_model_choice,
|
||||||
|
|
@ -4411,13 +4417,20 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
||||||
try:
|
try:
|
||||||
import getpass
|
import getpass
|
||||||
|
|
||||||
new_key = getpass.getpass(f"{key_env} (or Enter to cancel): ").strip()
|
if provider_id == "lmstudio":
|
||||||
|
prompt = f"{key_env} (Enter for no-auth default {LMSTUDIO_NOAUTH_PLACEHOLDER!r}): "
|
||||||
|
else:
|
||||||
|
prompt = f"{key_env} (or Enter to cancel): "
|
||||||
|
new_key = getpass.getpass(prompt).strip()
|
||||||
except (KeyboardInterrupt, EOFError):
|
except (KeyboardInterrupt, EOFError):
|
||||||
print()
|
print()
|
||||||
return
|
return
|
||||||
if not new_key:
|
if not new_key:
|
||||||
print("Cancelled.")
|
if provider_id == "lmstudio":
|
||||||
return
|
new_key = LMSTUDIO_NOAUTH_PLACEHOLDER
|
||||||
|
else:
|
||||||
|
print("Cancelled.")
|
||||||
|
return
|
||||||
save_env_value(key_env, new_key)
|
save_env_value(key_env, new_key)
|
||||||
existing_key = new_key
|
existing_key = new_key
|
||||||
print("API key saved.")
|
print("API key saved.")
|
||||||
|
|
@ -4484,10 +4497,21 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
||||||
print(" Tier check: could not verify (proceeding anyway).")
|
print(" Tier check: could not verify (proceeding anyway).")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Optional base URL override
|
# Optional base URL override.
|
||||||
|
# Precedence: env var → config.yaml model.base_url → registry default.
|
||||||
|
# Reading config.yaml prevents silently overwriting a saved remote URL
|
||||||
|
# (e.g. a remote LM Studio endpoint) with localhost when the user just
|
||||||
|
# presses Enter at the prompt below.
|
||||||
current_base = ""
|
current_base = ""
|
||||||
if base_url_env:
|
if base_url_env:
|
||||||
current_base = get_env_value(base_url_env) or os.getenv(base_url_env, "")
|
current_base = get_env_value(base_url_env) or os.getenv(base_url_env, "")
|
||||||
|
if not current_base:
|
||||||
|
try:
|
||||||
|
_m = load_config().get("model") or {}
|
||||||
|
if str(_m.get("provider") or "").strip().lower() == provider_id:
|
||||||
|
current_base = str(_m.get("base_url") or "").strip()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
effective_base = current_base or pconfig.inference_base_url
|
effective_base = current_base or pconfig.inference_base_url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -4509,8 +4533,22 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
||||||
# 2. Curated static fallback list (offline insurance)
|
# 2. Curated static fallback list (offline insurance)
|
||||||
# 3. Live /models endpoint probe (small providers without models.dev data)
|
# 3. Live /models endpoint probe (small providers without models.dev data)
|
||||||
#
|
#
|
||||||
# Ollama Cloud: dedicated merged discovery (live API + models.dev + disk cache)
|
# LM Studio: live /api/v1/models probe (no models.dev catalog).
|
||||||
if provider_id == "ollama-cloud":
|
# Ollama Cloud: merged discovery (live API + models.dev + disk cache).
|
||||||
|
if provider_id == "lmstudio":
|
||||||
|
from hermes_cli.auth import AuthError
|
||||||
|
from hermes_cli.models import fetch_lmstudio_models
|
||||||
|
|
||||||
|
api_key_for_probe = existing_key or (get_env_value(key_env) if key_env else "")
|
||||||
|
try:
|
||||||
|
model_list = fetch_lmstudio_models(api_key=api_key_for_probe, base_url=effective_base)
|
||||||
|
except AuthError as exc:
|
||||||
|
print(f" LM Studio rejected the request: {exc}")
|
||||||
|
print(" Set LM_API_KEY (or update it) to match the server's bearer token.")
|
||||||
|
model_list = []
|
||||||
|
if model_list:
|
||||||
|
print(f" Found {len(model_list)} model(s) from LM Studio")
|
||||||
|
elif provider_id == "ollama-cloud":
|
||||||
from hermes_cli.models import fetch_ollama_cloud_models
|
from hermes_cli.models import fetch_ollama_cloud_models
|
||||||
|
|
||||||
api_key_for_probe = existing_key or (get_env_value(key_env) if key_env else "")
|
api_key_for_probe = existing_key or (get_env_value(key_env) if key_env else "")
|
||||||
|
|
|
||||||
|
|
@ -984,6 +984,7 @@ def list_authenticated_providers(
|
||||||
user_providers: dict = None,
|
user_providers: dict = None,
|
||||||
custom_providers: list | None = None,
|
custom_providers: list | None = None,
|
||||||
max_models: int = 8,
|
max_models: int = 8,
|
||||||
|
current_model: str = "",
|
||||||
) -> List[dict]:
|
) -> List[dict]:
|
||||||
"""Detect which providers have credentials and list their curated models.
|
"""Detect which providers have credentials and list their curated models.
|
||||||
|
|
||||||
|
|
@ -1030,6 +1031,34 @@ def list_authenticated_providers(
|
||||||
if "ollama-cloud" not in curated:
|
if "ollama-cloud" not in curated:
|
||||||
from hermes_cli.models import fetch_ollama_cloud_models
|
from hermes_cli.models import fetch_ollama_cloud_models
|
||||||
curated["ollama-cloud"] = fetch_ollama_cloud_models()
|
curated["ollama-cloud"] = fetch_ollama_cloud_models()
|
||||||
|
# LM Studio has no static catalog — probe its native /api/v1/models
|
||||||
|
# endpoint live so the picker reflects whatever the user has loaded.
|
||||||
|
# Base URL precedence: LM_BASE_URL env var > active config's base_url
|
||||||
|
# (when current provider is lmstudio) > 127.0.0.1 default.
|
||||||
|
# On auth rejection or unreachable server, fall back to the caller-supplied
|
||||||
|
# current model so the picker still shows something when offline / mis-keyed.
|
||||||
|
if "lmstudio" not in curated and (
|
||||||
|
os.environ.get("LM_API_KEY") or os.environ.get("LM_BASE_URL") or current_provider.strip().lower() == "lmstudio"
|
||||||
|
):
|
||||||
|
from hermes_cli.models import fetch_lmstudio_models
|
||||||
|
from hermes_cli.auth import AuthError
|
||||||
|
is_current_lmstudio = current_provider.strip().lower() == "lmstudio"
|
||||||
|
lm_base = (
|
||||||
|
os.environ.get("LM_BASE_URL")
|
||||||
|
or (current_base_url if is_current_lmstudio and current_base_url else None)
|
||||||
|
or "http://127.0.0.1:1234/v1"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
live = fetch_lmstudio_models(
|
||||||
|
api_key=os.environ.get("LM_API_KEY", ""),
|
||||||
|
base_url=lm_base,
|
||||||
|
timeout=1.5, # Smaller timeout for picker
|
||||||
|
)
|
||||||
|
except AuthError:
|
||||||
|
live = []
|
||||||
|
if not live and is_current_lmstudio and current_model:
|
||||||
|
live = [current_model]
|
||||||
|
curated["lmstudio"] = live
|
||||||
|
|
||||||
# --- 1. Check Hermes-mapped providers ---
|
# --- 1. Check Hermes-mapped providers ---
|
||||||
for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items():
|
for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items():
|
||||||
|
|
|
||||||
|
|
@ -768,6 +768,7 @@ class ProviderEntry(NamedTuple):
|
||||||
CANONICAL_PROVIDERS: list[ProviderEntry] = [
|
CANONICAL_PROVIDERS: list[ProviderEntry] = [
|
||||||
ProviderEntry("nous", "Nous Portal", "Nous Portal (Nous Research subscription)"),
|
ProviderEntry("nous", "Nous Portal", "Nous Portal (Nous Research subscription)"),
|
||||||
ProviderEntry("openrouter", "OpenRouter", "OpenRouter (100+ models, pay-per-use)"),
|
ProviderEntry("openrouter", "OpenRouter", "OpenRouter (100+ models, pay-per-use)"),
|
||||||
|
ProviderEntry("lmstudio", "LM Studio", "LM Studio (local desktop app with built-in model server)"),
|
||||||
ProviderEntry("ai-gateway", "Vercel AI Gateway", "Vercel AI Gateway (200+ models, $5 free credit, no markup)"),
|
ProviderEntry("ai-gateway", "Vercel AI Gateway", "Vercel AI Gateway (200+ models, $5 free credit, no markup)"),
|
||||||
ProviderEntry("anthropic", "Anthropic", "Anthropic (Claude models — API key or Claude Code)"),
|
ProviderEntry("anthropic", "Anthropic", "Anthropic (Claude models — API key or Claude Code)"),
|
||||||
ProviderEntry("openai-codex", "OpenAI Codex", "OpenAI Codex"),
|
ProviderEntry("openai-codex", "OpenAI Codex", "OpenAI Codex"),
|
||||||
|
|
@ -870,6 +871,9 @@ _PROVIDER_ALIASES = {
|
||||||
"nvidia-nim": "nvidia",
|
"nvidia-nim": "nvidia",
|
||||||
"build-nvidia": "nvidia",
|
"build-nvidia": "nvidia",
|
||||||
"nemotron": "nvidia",
|
"nemotron": "nvidia",
|
||||||
|
"lmstudio": "lmstudio",
|
||||||
|
"lm-studio": "lmstudio",
|
||||||
|
"lm_studio": "lmstudio",
|
||||||
"ollama": "custom", # bare "ollama" = local; use "ollama-cloud" for cloud
|
"ollama": "custom", # bare "ollama" = local; use "ollama-cloud" for cloud
|
||||||
"ollama_cloud": "ollama-cloud",
|
"ollama_cloud": "ollama-cloud",
|
||||||
}
|
}
|
||||||
|
|
@ -2195,6 +2199,225 @@ def _is_github_models_base_url(base_url: Optional[str]) -> bool:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def probe_lmstudio_models(
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    timeout: float = 5.0,
) -> Optional[list[str]]:
    """List the chat-capable models an LM Studio server advertises.

    Sends a GET to ``/api/v1/models`` under ``base_url`` (surrounding
    whitespace, trailing slashes and a trailing ``/v1`` are stripped first)
    and collects each entry's ``key`` (falling back to ``id``). Entries whose
    ``type`` is ``embedding`` are skipped and duplicates are dropped while
    first-seen order is kept.

    Args:
        api_key: Optional bearer token; sent as ``Authorization`` when non-blank.
        base_url: Server base URL, with or without a trailing ``/v1``.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The model keys (possibly an empty list when the server answers but
        lists nothing usable), or ``None`` when the base URL is blank, the
        request fails, or the payload carries no ``models`` list.

    Raises:
        AuthError: The server answered HTTP 401 or 403, so callers can tell
            token problems apart from reachability problems.
    """
    import logging

    log = logging.getLogger(__name__)

    root = (base_url or "").strip().rstrip("/")
    if root.endswith("/v1"):
        root = root[: -len("/v1")].rstrip("/")
    if not root:
        return None

    request_headers = {"User-Agent": _HERMES_USER_AGENT}
    bearer = str(api_key or "").strip()
    if bearer:
        request_headers["Authorization"] = f"Bearer {bearer}"

    listing_request = urllib.request.Request(
        root + "/api/v1/models", headers=request_headers
    )
    try:
        with urllib.request.urlopen(listing_request, timeout=timeout) as response:
            payload = json.loads(response.read().decode())
    except urllib.error.HTTPError as exc:
        if exc.code not in (401, 403):
            log.debug("LM Studio probe at %s failed with HTTP %s", root, exc.code)
            return None
        # Auth rejections are raised (not swallowed) so the caller can prompt
        # for a token instead of reporting the server as unreachable.
        from hermes_cli.auth import AuthError

        raise AuthError(
            f"LM Studio rejected the request with HTTP {exc.code}.",
            provider="lmstudio",
            code="auth_rejected",
        ) from exc
    except Exception as exc:
        log.debug("LM Studio probe at %s failed: %s", root, exc)
        return None

    listing = payload.get("models") if isinstance(payload, dict) else None
    if not isinstance(listing, list):
        log.debug(
            "LM Studio probe at %s returned malformed payload (no `models` list)",
            root,
        )
        return None

    found: list[str] = []
    for entry in listing:
        if not isinstance(entry, dict):
            continue
        kind = str(entry.get("type") or "").strip().lower()
        if kind == "embedding":
            continue
        name = str(entry.get("key") or entry.get("id") or "").strip()
        if not name or name in found:
            continue
        found.append(name)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_lmstudio_models(
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    timeout: float = 5.0,
) -> list[str]:
    """Return LM Studio's chat-capable model keys, never ``None``.

    Thin convenience wrapper around ``probe_lmstudio_models``: the arguments
    are forwarded unchanged and any falsy probe result (``None`` or an empty
    list) is collapsed to ``[]``. Callers that must tell "unreachable" apart
    from "reachable but empty" should call ``probe_lmstudio_models`` directly.

    Any exception raised by the probe (its documentation names ``AuthError``
    for HTTP 401/403) propagates to the caller untouched.
    """
    probed = probe_lmstudio_models(
        api_key=api_key,
        base_url=base_url,
        timeout=timeout,
    )
    if not probed:
        return []
    return probed
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_lmstudio_model_loaded(
    model: str,
    base_url: Optional[str],
    api_key: Optional[str],
    target_context_length: int,
    timeout: float = 120.0,
) -> Optional[int]:
    """Ensure LM Studio has ``model`` loaded with at least ``target_context_length``.

    Steps:

    1. GET ``/api/v1/models`` and find the entry whose ``key`` or ``id``
       equals ``model``.
    2. Cap the requested context at the entry's ``max_context_length`` when
       that field is a positive integer.
    3. If any ``loaded_instances[*].config.context_length`` already meets the
       (capped) target, return that value without touching the server.
    4. Otherwise POST ``/api/v1/models/load`` with the model and the target
       context and return the target on success.

    Args:
        model: Model key/id as listed by LM Studio.
        base_url: Server base URL, with or without a trailing ``/v1``.
        api_key: Optional bearer token; sent when non-blank.
        target_context_length: Desired minimum context window.
        timeout: Timeout in seconds for the load POST (the listing request
            uses a fixed 10 second timeout).

    Returns:
        The context length of an already-sufficient instance, or the (capped)
        target after a successful load request, or ``None`` when the base URL
        is blank, the listing could not be read, the model is not listed, or
        the load request failed. Failures are logged at debug level and never
        raised.
    """
    import logging

    log = logging.getLogger(__name__)

    server_root = (base_url or "").strip().rstrip("/")
    if server_root.endswith("/v1"):
        server_root = server_root[:-3].rstrip("/")
    if not server_root:
        return None

    headers = {"User-Agent": _HERMES_USER_AGENT}
    token = str(api_key or "").strip()
    if token:
        headers["Authorization"] = f"Bearer {token}"

    try:
        with urllib.request.urlopen(
            urllib.request.Request(server_root + "/api/v1/models", headers=headers),
            timeout=10,
        ) as resp:
            payload = json.loads(resp.read().decode())
    except Exception as exc:
        # Best-effort preload: report why it was skipped, but never raise.
        log.debug("LM Studio preload: listing at %s failed: %s", server_root, exc)
        return None

    raw_models = payload.get("models") if isinstance(payload, dict) else None
    if not isinstance(raw_models, list):
        log.debug(
            "LM Studio preload: listing at %s has no `models` list", server_root,
        )
        return None

    target_entry = None
    for raw in raw_models:
        if not isinstance(raw, dict):
            continue
        if raw.get("key") == model or raw.get("id") == model:
            target_entry = raw
            break
    if target_entry is None:
        log.debug(
            "LM Studio preload: model %s not listed at %s", model, server_root,
        )
        return None

    # Never ask for more context than the model declares it can support.
    max_ctx = target_entry.get("max_context_length")
    if isinstance(max_ctx, int) and max_ctx > 0:
        target_context_length = min(target_context_length, max_ctx)

    # An instance that is already big enough makes the load request unnecessary.
    for inst in target_entry.get("loaded_instances") or []:
        cfg = inst.get("config") if isinstance(inst, dict) else None
        loaded_ctx = cfg.get("context_length") if isinstance(cfg, dict) else None
        if isinstance(loaded_ctx, int) and loaded_ctx >= target_context_length:
            return loaded_ctx

    body = json.dumps({
        "model": model,
        "context_length": target_context_length,
    }).encode()
    load_headers = dict(headers)
    load_headers["Content-Type"] = "application/json"
    try:
        with urllib.request.urlopen(
            urllib.request.Request(
                server_root + "/api/v1/models/load",
                data=body,
                headers=load_headers,
                method="POST",
            ),
            timeout=timeout,
        ) as resp:
            resp.read()
    except Exception as exc:
        log.debug(
            "LM Studio preload: loading %s at %s (context %s) failed: %s",
            model, server_root, target_context_length, exc,
        )
        return None
    return target_context_length
|
||||||
|
|
||||||
|
|
||||||
|
def lmstudio_model_reasoning_options(
    model: str,
    base_url: Optional[str],
    api_key: Optional[str] = None,
    timeout: float = 5.0,
) -> list[str]:
    """Look up the reasoning ``allowed_options`` LM Studio lists for ``model``.

    Reads ``/api/v1/models`` under ``base_url`` (a trailing ``/v1`` is
    stripped), finds the first entry whose ``key`` or ``id`` equals ``model``
    and returns its ``capabilities.reasoning.allowed_options`` with every
    string entry stripped and lower-cased; non-string entries are dropped.

    Returns ``[]`` when the base URL is blank, the request or JSON decoding
    fails for any reason, the payload has no ``models`` list, the model is not
    listed, or the matching entry declares no ``allowed_options`` list.
    """
    root = (base_url or "").strip().rstrip("/")
    if root.endswith("/v1"):
        root = root[: -len("/v1")].rstrip("/")
    if not root:
        return []

    request_headers = {"User-Agent": _HERMES_USER_AGENT}
    bearer = str(api_key or "").strip()
    if bearer:
        request_headers["Authorization"] = f"Bearer {bearer}"

    try:
        listing_request = urllib.request.Request(
            root + "/api/v1/models", headers=request_headers
        )
        with urllib.request.urlopen(listing_request, timeout=timeout) as response:
            payload = json.loads(response.read().decode())
    except Exception:
        return []

    if not isinstance(payload, dict):
        return []
    entries = payload.get("models")
    if not isinstance(entries, list):
        return []

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if not (entry.get("key") == model or entry.get("id") == model):
            continue
        # Only the first matching entry is consulted; anything short of a
        # proper allowed_options list means "no reasoning options".
        capabilities = entry.get("capabilities")
        if not isinstance(capabilities, dict):
            return []
        reasoning = capabilities.get("reasoning")
        if not isinstance(reasoning, dict):
            return []
        allowed = reasoning.get("allowed_options")
        if not isinstance(allowed, list):
            return []
        return [
            str(option).strip().lower()
            for option in allowed
            if isinstance(option, str)
        ]
    return []
|
||||||
|
|
||||||
|
|
||||||
def _fetch_github_models(api_key: Optional[str] = None, timeout: float = 5.0) -> Optional[list[str]]:
|
def _fetch_github_models(api_key: Optional[str] = None, timeout: float = 5.0) -> Optional[list[str]]:
|
||||||
catalog = fetch_github_model_catalog(api_key=api_key, timeout=timeout)
|
catalog = fetch_github_model_catalog(api_key=api_key, timeout=timeout)
|
||||||
if not catalog:
|
if not catalog:
|
||||||
|
|
@ -2790,6 +3013,40 @@ def validate_requested_model(
|
||||||
"message": "Model names cannot contain spaces.",
|
"message": "Model names cannot contain spaces.",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if normalized == "lmstudio":
|
||||||
|
from hermes_cli.auth import AuthError
|
||||||
|
# Use probe_lmstudio_models so we can distinguish None (unreachable
|
||||||
|
# / malformed response) from [] (reachable, but no chat-capable models
|
||||||
|
# are loaded). fetch_lmstudio_models collapses both to [].
|
||||||
|
try:
|
||||||
|
models = probe_lmstudio_models(api_key=api_key, base_url=base_url)
|
||||||
|
except AuthError as exc:
|
||||||
|
return {
|
||||||
|
"accepted": False, "persist": False, "recognized": False,
|
||||||
|
"message": (
|
||||||
|
f"{exc} Set `LM_API_KEY` (or update it) to match the server's bearer token."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
if models is None:
|
||||||
|
return {
|
||||||
|
"accepted": False, "persist": False, "recognized": False,
|
||||||
|
"message": f"Could not reach LM Studio's `/api/v1/models` to validate `{requested}`.",
|
||||||
|
}
|
||||||
|
if not models:
|
||||||
|
return {
|
||||||
|
"accepted": False, "persist": False, "recognized": False,
|
||||||
|
"message": (
|
||||||
|
f"LM Studio is reachable but no chat-capable models are loaded. "
|
||||||
|
f"Load `{requested}` in LM Studio (Developer tab → Load Model) and try again."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
if requested_for_lookup in set(models):
|
||||||
|
return {"accepted": True, "persist": True, "recognized": True, "message": None}
|
||||||
|
return {
|
||||||
|
"accepted": False, "persist": False, "recognized": False,
|
||||||
|
"message": f"Model `{requested}` was not found in LM Studio's model listing.",
|
||||||
|
}
|
||||||
|
|
||||||
if normalized == "custom":
|
if normalized == "custom":
|
||||||
# Try probing with correct auth for the api_mode.
|
# Try probing with correct auth for the api_mode.
|
||||||
if api_mode == "anthropic_messages":
|
if api_mode == "anthropic_messages":
|
||||||
|
|
|
||||||
|
|
@ -71,6 +71,13 @@ HERMES_OVERLAYS: Dict[str, HermesOverlay] = {
|
||||||
auth_type="oauth_external",
|
auth_type="oauth_external",
|
||||||
base_url_override="cloudcode-pa://google",
|
base_url_override="cloudcode-pa://google",
|
||||||
),
|
),
|
||||||
|
"lmstudio": HermesOverlay(
|
||||||
|
transport="openai_chat",
|
||||||
|
auth_type="api_key",
|
||||||
|
extra_env_vars=("LM_API_KEY",),
|
||||||
|
base_url_override="http://127.0.0.1:1234/v1",
|
||||||
|
base_url_env_var="LM_BASE_URL",
|
||||||
|
),
|
||||||
"copilot-acp": HermesOverlay(
|
"copilot-acp": HermesOverlay(
|
||||||
transport="codex_responses",
|
transport="codex_responses",
|
||||||
auth_type="external_process",
|
auth_type="external_process",
|
||||||
|
|
@ -345,6 +352,7 @@ _LABEL_OVERRIDES: Dict[str, str] = {
|
||||||
"xiaomi": "Xiaomi MiMo",
|
"xiaomi": "Xiaomi MiMo",
|
||||||
"gmi": "GMI Cloud",
|
"gmi": "GMI Cloud",
|
||||||
"tencent-tokenhub": "Tencent TokenHub",
|
"tencent-tokenhub": "Tencent TokenHub",
|
||||||
|
"lmstudio": "LM Studio",
|
||||||
"local": "Local endpoint",
|
"local": "Local endpoint",
|
||||||
"bedrock": "AWS Bedrock",
|
"bedrock": "AWS Bedrock",
|
||||||
"ollama-cloud": "Ollama Cloud",
|
"ollama-cloud": "Ollama Cloud",
|
||||||
|
|
|
||||||
|
|
@ -1245,14 +1245,20 @@ def resolve_runtime_provider(
|
||||||
if pconfig and pconfig.auth_type == "api_key":
|
if pconfig and pconfig.auth_type == "api_key":
|
||||||
creds = resolve_api_key_provider_credentials(provider)
|
creds = resolve_api_key_provider_credentials(provider)
|
||||||
# Honour model.base_url from config.yaml when the configured provider
|
# Honour model.base_url from config.yaml when the configured provider
|
||||||
# matches this provider — mirrors the Anthropic path above. Without
|
# matches this provider, unless the provider-specific BASE_URL env var
|
||||||
# this, users who set model.base_url to e.g. api.minimaxi.com/anthropic
|
# is set. That keeps temporary env overrides (e.g. LM_BASE_URL) in sync
|
||||||
# (China endpoint) still get the hardcoded api.minimax.io default (#6039).
|
# with picker-time probing while still preserving saved config URLs when
|
||||||
|
# no override is present.
|
||||||
cfg_provider = str(model_cfg.get("provider") or "").strip().lower()
|
cfg_provider = str(model_cfg.get("provider") or "").strip().lower()
|
||||||
cfg_base_url = ""
|
cfg_base_url = ""
|
||||||
if cfg_provider == provider:
|
if cfg_provider == provider:
|
||||||
cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/")
|
cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/")
|
||||||
base_url = cfg_base_url or creds.get("base_url", "").rstrip("/")
|
env_base_url = ""
|
||||||
|
if pconfig.base_url_env_var:
|
||||||
|
env_base_url = os.getenv(pconfig.base_url_env_var, "").strip().rstrip("/")
|
||||||
|
base_url = creds.get("base_url", "").rstrip("/")
|
||||||
|
if cfg_base_url and not env_base_url:
|
||||||
|
base_url = cfg_base_url
|
||||||
api_mode = "chat_completions"
|
api_mode = "chat_completions"
|
||||||
if provider == "copilot":
|
if provider == "copilot":
|
||||||
api_mode = _copilot_runtime_api_mode(model_cfg, creds.get("api_key", ""))
|
api_mode = _copilot_runtime_api_mode(model_cfg, creds.get("api_key", ""))
|
||||||
|
|
|
||||||
|
|
@ -274,6 +274,23 @@ def show_status(args):
|
||||||
label = "configured" if configured else "not configured (run: hermes model)"
|
label = "configured" if configured else "not configured (run: hermes model)"
|
||||||
print(f" {pname:<16} {check_mark(configured)} {label}")
|
print(f" {pname:<16} {check_mark(configured)} {label}")
|
||||||
|
|
||||||
|
# LM Studio reachability — only probe when it's the active provider so
|
||||||
|
# users with foreign configs don't see noise. Auth rejection vs. silent
|
||||||
|
# empty list is the most common LM Studio support case.
|
||||||
|
if _effective_provider_label() == "LM Studio":
|
||||||
|
from hermes_cli.models import probe_lmstudio_models
|
||||||
|
model_cfg = config.get("model")
|
||||||
|
base = (model_cfg.get("base_url") if isinstance(model_cfg, dict) else None) or get_env_value("LM_BASE_URL") or "http://127.0.0.1:1234/v1"
|
||||||
|
try:
|
||||||
|
models = probe_lmstudio_models(api_key=get_env_value("LM_API_KEY") or "", base_url=base, timeout=1.5)
|
||||||
|
if models is None:
|
||||||
|
ok, msg = False, f"unreachable at {base}"
|
||||||
|
else:
|
||||||
|
ok, msg = True, f"reachable ({len(models)} model(s)) at {base}"
|
||||||
|
except AuthError:
|
||||||
|
ok, msg = False, "auth rejected — set LM_API_KEY"
|
||||||
|
print(f" {'LM Studio':<16} {check_mark(ok)} {msg}")
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Terminal Configuration
|
# Terminal Configuration
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|
|
||||||
103
run_agent.py
103
run_agent.py
|
|
@ -1826,9 +1826,6 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
_config_context_length = None
|
_config_context_length = None
|
||||||
|
|
||||||
# Store for reuse in switch_model (so config override persists across model switches)
|
|
||||||
self._config_context_length = _config_context_length
|
|
||||||
|
|
||||||
# Resolve custom_providers list once for reuse below (startup
|
# Resolve custom_providers list once for reuse below (startup
|
||||||
# context-length override and plugin context-engine init).
|
# context-length override and plugin context-engine init).
|
||||||
try:
|
try:
|
||||||
|
|
@ -1887,7 +1884,14 @@ class AIAgent:
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Persist for reuse on switch_model / fallback activation. Must come
|
||||||
|
# AFTER the custom_providers branch so per-model overrides aren't lost.
|
||||||
|
self._config_context_length = _config_context_length
|
||||||
|
|
||||||
|
self._ensure_lmstudio_runtime_loaded(_config_context_length)
|
||||||
|
|
||||||
|
|
||||||
# Select context engine: config-driven (like memory providers).
|
# Select context engine: config-driven (like memory providers).
|
||||||
# 1. Check config.yaml context.engine setting
|
# 1. Check config.yaml context.engine setting
|
||||||
# 2. Check plugins/context_engine/<name>/ directory (repo-shipped)
|
# 2. Check plugins/context_engine/<name>/ directory (repo-shipped)
|
||||||
|
|
@ -2129,6 +2133,24 @@ class AIAgent:
|
||||||
if hasattr(self, "context_compressor") and self.context_compressor:
|
if hasattr(self, "context_compressor") and self.context_compressor:
|
||||||
self.context_compressor.on_session_reset()
|
self.context_compressor.on_session_reset()
|
||||||
|
|
||||||
|
def _ensure_lmstudio_runtime_loaded(self, config_context_length: Optional[int] = None) -> None:
|
||||||
|
"""
|
||||||
|
Preload the LM Studio model with at least Hermes' minimum context.
|
||||||
|
"""
|
||||||
|
if (self.provider or "").strip().lower() != "lmstudio":
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
|
||||||
|
from hermes_cli.models import ensure_lmstudio_model_loaded
|
||||||
|
if config_context_length is None:
|
||||||
|
config_context_length = getattr(self, "_config_context_length", None)
|
||||||
|
target_ctx = max(config_context_length or 0, MINIMUM_CONTEXT_LENGTH)
|
||||||
|
ensure_lmstudio_model_loaded(
|
||||||
|
self.model, self.base_url, getattr(self, "api_key", ""), target_ctx,
|
||||||
|
)
|
||||||
|
except Exception as err:
|
||||||
|
logger.debug("LM Studio preload skipped: %s", err)
|
||||||
|
|
||||||
def switch_model(self, new_model, new_provider, api_key='', base_url='', api_mode=''):
|
def switch_model(self, new_model, new_provider, api_key='', base_url='', api_mode=''):
|
||||||
"""Switch the model/provider in-place for a live agent.
|
"""Switch the model/provider in-place for a live agent.
|
||||||
|
|
||||||
|
|
@ -2224,6 +2246,9 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── LM Studio: preload before probing context length ──
|
||||||
|
self._ensure_lmstudio_runtime_loaded()
|
||||||
|
|
||||||
# ── Update context compressor ──
|
# ── Update context compressor ──
|
||||||
if hasattr(self, "context_compressor") and self.context_compressor:
|
if hasattr(self, "context_compressor") and self.context_compressor:
|
||||||
from agent.model_metadata import get_model_context_length
|
from agent.model_metadata import get_model_context_length
|
||||||
|
|
@ -7327,6 +7352,9 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# LM Studio: preload before probing the fallback's context length.
|
||||||
|
self._ensure_lmstudio_runtime_loaded()
|
||||||
|
|
||||||
# Update context compressor limits for the fallback model.
|
# Update context compressor limits for the fallback model.
|
||||||
# Without this, compression decisions use the primary model's
|
# Without this, compression decisions use the primary model's
|
||||||
# context window (e.g. 200K) instead of the fallback's (e.g. 32K),
|
# context window (e.g. 200K) instead of the fallback's (e.g. 32K),
|
||||||
|
|
@ -8047,6 +8075,7 @@ class AIAgent:
|
||||||
or base_url_host_matches(self.base_url, "moonshot.cn")
|
or base_url_host_matches(self.base_url, "moonshot.cn")
|
||||||
)
|
)
|
||||||
_is_tokenhub = base_url_host_matches(self._base_url_lower, "tokenhub.tencentmaas.com")
|
_is_tokenhub = base_url_host_matches(self._base_url_lower, "tokenhub.tencentmaas.com")
|
||||||
|
_is_lmstudio = (self.provider or "").strip().lower() == "lmstudio"
|
||||||
|
|
||||||
# Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE
|
# Temperature: _fixed_temperature_for_model may return OMIT_TEMPERATURE
|
||||||
# sentinel (temperature omitted entirely), a numeric override, or None.
|
# sentinel (temperature omitted entirely), a numeric override, or None.
|
||||||
|
|
@ -8119,6 +8148,7 @@ class AIAgent:
|
||||||
is_nvidia_nim=_is_nvidia,
|
is_nvidia_nim=_is_nvidia,
|
||||||
is_kimi=_is_kimi,
|
is_kimi=_is_kimi,
|
||||||
is_tokenhub=_is_tokenhub,
|
is_tokenhub=_is_tokenhub,
|
||||||
|
is_lmstudio=_is_lmstudio,
|
||||||
is_custom_provider=self.provider == "custom",
|
is_custom_provider=self.provider == "custom",
|
||||||
ollama_num_ctx=self._ollama_num_ctx,
|
ollama_num_ctx=self._ollama_num_ctx,
|
||||||
provider_preferences=_prefs or None,
|
provider_preferences=_prefs or None,
|
||||||
|
|
@ -8129,6 +8159,7 @@ class AIAgent:
|
||||||
omit_temperature=_omit_temp,
|
omit_temperature=_omit_temp,
|
||||||
supports_reasoning=self._supports_reasoning_extra_body(),
|
supports_reasoning=self._supports_reasoning_extra_body(),
|
||||||
github_reasoning_extra=self._github_models_reasoning_extra_body() if _is_gh else None,
|
github_reasoning_extra=self._github_models_reasoning_extra_body() if _is_gh else None,
|
||||||
|
lmstudio_reasoning_options=self._lmstudio_reasoning_options_cached() if _is_lmstudio else None,
|
||||||
anthropic_max_output=_ant_max,
|
anthropic_max_output=_ant_max,
|
||||||
provider_name=self.provider,
|
provider_name=self.provider,
|
||||||
)
|
)
|
||||||
|
|
@ -8154,6 +8185,10 @@ class AIAgent:
|
||||||
return bool(github_model_reasoning_efforts(self.model))
|
return bool(github_model_reasoning_efforts(self.model))
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
if (self.provider or "").strip().lower() == "lmstudio":
|
||||||
|
opts = self._lmstudio_reasoning_options_cached()
|
||||||
|
# "off-only" (or absent) means no real reasoning capability.
|
||||||
|
return any(opt and opt != "off" for opt in opts)
|
||||||
if "openrouter" not in self._base_url_lower:
|
if "openrouter" not in self._base_url_lower:
|
||||||
return False
|
return False
|
||||||
if "api.mistral.ai" in self._base_url_lower:
|
if "api.mistral.ai" in self._base_url_lower:
|
||||||
|
|
@ -8171,6 +8206,48 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
return any(model.startswith(prefix) for prefix in reasoning_model_prefixes)
|
return any(model.startswith(prefix) for prefix in reasoning_model_prefixes)
|
||||||
|
|
||||||
|
def _lmstudio_reasoning_options_cached(self) -> list[str]:
|
||||||
|
"""Probe LM Studio's published reasoning ``allowed_options`` once per
|
||||||
|
(model, base_url). The list (e.g. ``["off","on"]`` or
|
||||||
|
``["off","minimal","low"]``) is needed both for the supports-reasoning
|
||||||
|
gate and for clamping the emitted ``reasoning_effort`` so toggle-style
|
||||||
|
models don't 400 on ``high``. Cache is keyed on (model, base_url) so
|
||||||
|
``/model`` swaps and base-URL changes don't reuse a stale list, and an
|
||||||
|
empty result (transient probe failure) is *not* cached so the next call
|
||||||
|
retries instead of silently disabling reasoning for the rest of the
|
||||||
|
session.
|
||||||
|
"""
|
||||||
|
cache = getattr(self, "_lm_reasoning_opts_cache", None)
|
||||||
|
if cache is None:
|
||||||
|
cache = self._lm_reasoning_opts_cache = {}
|
||||||
|
key = (self.model, self.base_url)
|
||||||
|
cached = cache.get(key)
|
||||||
|
if cached:
|
||||||
|
return cached
|
||||||
|
try:
|
||||||
|
from hermes_cli.models import lmstudio_model_reasoning_options
|
||||||
|
opts = lmstudio_model_reasoning_options(
|
||||||
|
self.model, self.base_url, getattr(self, "api_key", ""),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
opts = []
|
||||||
|
if opts:
|
||||||
|
cache[key] = opts
|
||||||
|
return opts
|
||||||
|
|
||||||
|
def _resolve_lmstudio_summary_reasoning_effort(self) -> Optional[str]:
    """Pick the top-level ``reasoning_effort`` for an LM Studio summary call.

    Delegates to ``agent.lmstudio_reasoning.resolve_lmstudio_effort`` with
    this instance's ``reasoning_config`` and the cached per-model allowed
    options, and returns whatever that helper returns. Going through the
    shared helper keeps effort resolution in one place for callers that do
    not build their request via the transport.
    """
    from agent.lmstudio_reasoning import resolve_lmstudio_effort

    config = self.reasoning_config
    allowed = self._lmstudio_reasoning_options_cached()
    return resolve_lmstudio_effort(config, allowed)
|
||||||
|
|
||||||
def _github_models_reasoning_extra_body(self) -> dict | None:
|
def _github_models_reasoning_extra_body(self) -> dict | None:
|
||||||
"""Format reasoning payload for GitHub Models/OpenAI-compatible routes."""
|
"""Format reasoning payload for GitHub Models/OpenAI-compatible routes."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -9692,7 +9769,19 @@ class AIAgent:
|
||||||
_omit_summary_temperature = _raw_summary_temp is _OMIT_TEMP
|
_omit_summary_temperature = _raw_summary_temp is _OMIT_TEMP
|
||||||
_summary_temperature = None if _omit_summary_temperature else _raw_summary_temp
|
_summary_temperature = None if _omit_summary_temperature else _raw_summary_temp
|
||||||
_is_nous = "nousresearch" in self._base_url_lower
|
_is_nous = "nousresearch" in self._base_url_lower
|
||||||
if self._supports_reasoning_extra_body():
|
# LM Studio uses top-level `reasoning_effort` (not extra_body.reasoning).
|
||||||
|
# Mirror ChatCompletionsTransport.build_kwargs() so the summary path
|
||||||
|
# — which calls chat.completions.create() directly without going
|
||||||
|
# through the transport — sends the same shape the transport does.
|
||||||
|
_is_lmstudio_summary = (
|
||||||
|
(self.provider or "").strip().lower() == "lmstudio"
|
||||||
|
and self._supports_reasoning_extra_body()
|
||||||
|
)
|
||||||
|
_lm_reasoning_effort: str | None = (
|
||||||
|
self._resolve_lmstudio_summary_reasoning_effort()
|
||||||
|
if _is_lmstudio_summary else None
|
||||||
|
)
|
||||||
|
if not _is_lmstudio_summary and self._supports_reasoning_extra_body():
|
||||||
if self.reasoning_config is not None:
|
if self.reasoning_config is not None:
|
||||||
summary_extra_body["reasoning"] = self.reasoning_config
|
summary_extra_body["reasoning"] = self.reasoning_config
|
||||||
else:
|
else:
|
||||||
|
|
@ -9719,6 +9808,8 @@ class AIAgent:
|
||||||
summary_kwargs["temperature"] = _summary_temperature
|
summary_kwargs["temperature"] = _summary_temperature
|
||||||
if self.max_tokens is not None:
|
if self.max_tokens is not None:
|
||||||
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
|
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||||
|
if _lm_reasoning_effort is not None:
|
||||||
|
summary_kwargs["reasoning_effort"] = _lm_reasoning_effort
|
||||||
|
|
||||||
# Include provider routing preferences
|
# Include provider routing preferences
|
||||||
provider_preferences = {}
|
provider_preferences = {}
|
||||||
|
|
@ -9784,6 +9875,8 @@ class AIAgent:
|
||||||
summary_kwargs["temperature"] = _summary_temperature
|
summary_kwargs["temperature"] = _summary_temperature
|
||||||
if self.max_tokens is not None:
|
if self.max_tokens is not None:
|
||||||
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
|
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||||
|
if _lm_reasoning_effort is not None:
|
||||||
|
summary_kwargs["reasoning_effort"] = _lm_reasoning_effort
|
||||||
if summary_extra_body:
|
if summary_extra_body:
|
||||||
summary_kwargs["extra_body"] = summary_extra_body
|
summary_kwargs["extra_body"] = summary_extra_body
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -376,6 +376,80 @@ class TestChatCompletionsKimi:
|
||||||
assert "type" not in kw["tools"][0]["function"]["parameters"]["properties"]["q"]
|
assert "type" not in kw["tools"][0]["function"]["parameters"]["properties"]["q"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestChatCompletionsLmStudioReasoning:
    """Clamp ``reasoning_effort`` against LM Studio's per-model ``allowed_options``.

    If the requested effort is outside what the model publishes (for example
    ``high`` on a toggle-style ``["off","on"]`` model), the transport leaves
    ``reasoning_effort`` out of the request so LM Studio uses the model's
    default; quietly swapping "high" for "low" would mislead the user.
    """

    @staticmethod
    def _build(transport, reasoning_config, allowed_options):
        """Invoke ``build_kwargs`` with the arguments common to every case here."""
        return transport.build_kwargs(
            model="gpt-oss",
            messages=[{"role": "user", "content": "Hi"}],
            is_lmstudio=True,
            supports_reasoning=True,
            reasoning_config=reasoning_config,
            lmstudio_reasoning_options=allowed_options,
        )

    def test_omits_effort_when_high_not_allowed_toggle(self, transport):
        built = self._build(transport, {"effort": "high"}, ["off", "on"])
        assert "reasoning_effort" not in built

    def test_omits_effort_when_high_not_allowed_minimal_low(self, transport):
        built = self._build(
            transport, {"effort": "high"}, ["off", "minimal", "low"]
        )
        assert "reasoning_effort" not in built

    def test_passes_through_when_effort_allowed(self, transport):
        built = self._build(
            transport, {"effort": "high"}, ["off", "low", "medium", "high"]
        )
        assert built["reasoning_effort"] == "high"

    def test_passes_through_aliased_on_for_toggle(self, transport):
        # Reasoning is enabled at the default "medium". A toggle model
        # publishes ["off","on"], which aliases to {"none","medium"}, so the
        # default request can be honored and is sent as-is.
        built = self._build(transport, {"effort": "medium"}, ["off", "on"])
        assert built["reasoning_effort"] == "medium"

    def test_disabled_keeps_none_when_off_allowed(self, transport):
        built = self._build(transport, {"enabled": False}, ["off", "on"])
        assert built["reasoning_effort"] == "none"

    def test_no_options_falls_back_to_legacy_behavior(self, transport):
        # A failed or empty probe leaves allowed_options unknown; forward the
        # user's choice instead of blocking the request.
        built = self._build(transport, {"effort": "high"}, None)
        assert built["reasoning_effort"] == "high"
|
||||||
|
|
||||||
|
|
||||||
class TestChatCompletionsValidate:
|
class TestChatCompletionsValidate:
|
||||||
|
|
||||||
def test_none(self, transport):
|
def test_none(self, transport):
|
||||||
|
|
|
||||||
|
|
@ -145,6 +145,7 @@ class TestProviderRegistry:
|
||||||
PROVIDER_ENV_VARS = (
|
PROVIDER_ENV_VARS = (
|
||||||
"OPENROUTER_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN",
|
"OPENROUTER_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN",
|
||||||
"CLAUDE_CODE_OAUTH_TOKEN",
|
"CLAUDE_CODE_OAUTH_TOKEN",
|
||||||
|
"LM_API_KEY", "LM_BASE_URL",
|
||||||
"GLM_API_KEY", "ZAI_API_KEY", "Z_AI_API_KEY",
|
"GLM_API_KEY", "ZAI_API_KEY", "Z_AI_API_KEY",
|
||||||
"KIMI_API_KEY", "KIMI_BASE_URL", "STEPFUN_API_KEY", "STEPFUN_BASE_URL",
|
"KIMI_API_KEY", "KIMI_BASE_URL", "STEPFUN_API_KEY", "STEPFUN_BASE_URL",
|
||||||
"MINIMAX_API_KEY", "MINIMAX_CN_API_KEY",
|
"MINIMAX_API_KEY", "MINIMAX_CN_API_KEY",
|
||||||
|
|
@ -428,6 +429,29 @@ class TestResolveApiKeyProviderCredentials:
|
||||||
assert creds["base_url"] == "https://api.githubcopilot.com"
|
assert creds["base_url"] == "https://api.githubcopilot.com"
|
||||||
assert creds["source"] == "gh auth token"
|
assert creds["source"] == "gh auth token"
|
||||||
|
|
||||||
|
def test_resolve_lmstudio_uses_token_and_base_url_from_env(self, monkeypatch):
    """LM_API_KEY and LM_BASE_URL set in the environment appear in the credentials."""
    env = {
        "LM_API_KEY": "lm-token",
        "LM_BASE_URL": "http://lmstudio.remote:4321/v1",
    }
    for name, value in env.items():
        monkeypatch.setenv(name, value)

    creds = resolve_api_key_provider_credentials("lmstudio")

    expected = {
        "provider": "lmstudio",
        "api_key": "lm-token",
        "base_url": "http://lmstudio.remote:4321/v1",
    }
    for field, value in expected.items():
        assert creds[field] == value
|
||||||
|
|
||||||
|
def test_resolve_lmstudio_no_api_key_substitutes_placeholder(self, monkeypatch):
    """Without LM_API_KEY the resolved credentials carry a placeholder key."""
    # No-auth LM Studio: with LM_API_KEY unset, runtime credentials carry a
    # placeholder so gateway/TUI/cron paths treat the local server as
    # configured. get_api_key_provider_status still reports unconfigured.
    for name in ("LM_API_KEY", "LM_BASE_URL"):
        monkeypatch.delenv(name, raising=False)

    creds = resolve_api_key_provider_credentials("lmstudio")

    expected = {
        "provider": "lmstudio",
        "api_key": "dummy-lm-api-key",
        "base_url": "http://127.0.0.1:1234/v1",
    }
    for field, value in expected.items():
        assert creds[field] == value
|
||||||
|
|
||||||
def test_try_gh_cli_token_uses_homebrew_path_when_not_on_path(self, monkeypatch):
|
def test_try_gh_cli_token_uses_homebrew_path_when_not_on_path(self, monkeypatch):
|
||||||
monkeypatch.setattr("hermes_cli.copilot_auth.shutil.which", lambda command: None)
|
monkeypatch.setattr("hermes_cli.copilot_auth.shutil.which", lambda command: None)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|
|
||||||
|
|
@ -260,6 +260,33 @@ class TestProviderPersistsAfterModelSave:
|
||||||
assert model.get("default") == "minimax-m2.5"
|
assert model.get("default") == "minimax-m2.5"
|
||||||
assert model.get("api_mode") == "anthropic_messages"
|
assert model.get("api_mode") == "anthropic_messages"
|
||||||
|
|
||||||
|
def test_lmstudio_provider_saved_when_selected(self, config_home, monkeypatch):
    """Choosing LM Studio in the model flow writes provider, base_url and model to config.yaml."""
    import yaml

    from hermes_cli.config import load_config
    from hermes_cli.main import _model_flow_api_key_provider

    def _pick_model(models, current_model=""):
        return "publisher/model-a"

    def _list_models(api_key=None, base_url=None, timeout=5.0):
        return ["publisher/model-a"]

    monkeypatch.setenv("LM_API_KEY", "lm-token")
    monkeypatch.setattr("hermes_cli.auth._prompt_model_selection", _pick_model)
    monkeypatch.setattr("hermes_cli.auth.deactivate_provider", lambda: None)
    monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _list_models)

    # The single empty answer feeds the one interactive prompt in the flow.
    with patch("builtins.input", side_effect=[""]):
        _model_flow_api_key_provider(load_config(), "lmstudio", "old-model")

    saved = yaml.safe_load((config_home / "config.yaml").read_text()) or {}
    model = saved.get("model")
    assert isinstance(model, dict)
    assert model.get("provider") == "lmstudio"
    assert model.get("base_url") == "http://127.0.0.1:1234/v1"
    assert model.get("default") == "publisher/model-a"
|
||||||
|
|
||||||
|
|
||||||
class TestBaseUrlValidation:
|
class TestBaseUrlValidation:
|
||||||
"""Reject non-URL values in the base URL prompt (e.g. shell commands)."""
|
"""Reject non-URL values in the base URL prompt (e.g. shell commands)."""
|
||||||
|
|
|
||||||
|
|
@ -398,3 +398,84 @@ def test_list_authenticated_providers_total_models_reflects_grouped_count(monkey
|
||||||
assert group["total_models"] == 6
|
assert group["total_models"] == 6
|
||||||
# All six models are preserved in the grouped row.
|
# All six models are preserved in the grouped row.
|
||||||
assert sorted(group["models"]) == sorted(f"model-{i}" for i in range(6))
|
assert sorted(group["models"]) == sorted(f"model-{i}" for i in range(6))
|
||||||
|
|
||||||
|
|
||||||
|
def test_lmstudio_picker_probes_active_config_base_url(monkeypatch):
    """The picker probes the saved remote base_url when LM_BASE_URL is unset.

    Regression: the picker used to probe 127.0.0.1 unconditionally, so a
    `provider: lmstudio` config pointing at a lab box produced the wrong
    (or an empty) model list.
    """
    seen: dict = {}

    def _recording_fetch(api_key=None, base_url=None, timeout=5.0):
        seen.update(base_url=base_url, api_key=api_key)
        return ["qwen/qwen3-coder-30b"]

    monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {})
    monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {})
    for name in ("LM_BASE_URL", "LM_API_KEY"):
        monkeypatch.delenv(name, raising=False)
    monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _recording_fetch)

    list_authenticated_providers(
        current_provider="lmstudio",
        current_base_url="http://192.168.1.10:1234/v1",
        current_model="qwen/qwen3-coder-30b",
    )

    assert seen["base_url"] == "http://192.168.1.10:1234/v1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lmstudio_picker_lm_base_url_env_wins_over_active_config(monkeypatch):
    """LM_BASE_URL outranks the saved base_url when the picker probes.

    This lets a user redirect the picker temporarily without editing
    config.yaml.
    """
    seen: dict = {}

    def _recording_fetch(api_key=None, base_url=None, timeout=5.0):
        seen.update(base_url=base_url)
        return []

    monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {})
    monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {})
    monkeypatch.setenv("LM_BASE_URL", "http://override.local:9999/v1")
    monkeypatch.delenv("LM_API_KEY", raising=False)
    monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _recording_fetch)

    list_authenticated_providers(
        current_provider="lmstudio",
        current_base_url="http://192.168.1.10:1234/v1",
    )

    assert seen["base_url"] == "http://override.local:9999/v1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lmstudio_picker_skips_probe_when_not_configured(monkeypatch):
    """No LM Studio probe happens when LM Studio was never configured.

    With neither LM_API_KEY nor LM_BASE_URL set and another provider active,
    the picker must not pay the localhost probe cost just to learn that
    LM Studio is unavailable.
    """
    seen: dict = {}

    def _recording_fetch(api_key=None, base_url=None, timeout=5.0):
        seen.update(base_url=base_url)
        return []

    monkeypatch.setattr("agent.models_dev.fetch_models_dev", lambda: {})
    monkeypatch.setattr(providers_mod, "HERMES_OVERLAYS", {})
    for name in ("LM_BASE_URL", "LM_API_KEY"):
        monkeypatch.delenv(name, raising=False)
    monkeypatch.setattr("hermes_cli.models.fetch_lmstudio_models", _recording_fetch)

    list_authenticated_providers(
        current_provider="openrouter",
        current_base_url="https://openrouter.ai/api/v1",
    )

    # The fake records base_url on every call, so its absence means no call.
    assert "base_url" not in seen
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""Tests for provider-aware `/model` validation in hermes_cli.models."""
|
"""Tests for provider-aware `/model` validation in hermes_cli.models."""
|
||||||
|
|
||||||
from unittest.mock import patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
from hermes_cli.models import (
|
from hermes_cli.models import (
|
||||||
azure_foundry_model_api_mode,
|
azure_foundry_model_api_mode,
|
||||||
|
|
@ -8,6 +8,7 @@ from hermes_cli.models import (
|
||||||
fetch_github_model_catalog,
|
fetch_github_model_catalog,
|
||||||
curated_models_for_provider,
|
curated_models_for_provider,
|
||||||
fetch_api_models,
|
fetch_api_models,
|
||||||
|
fetch_lmstudio_models,
|
||||||
github_model_reasoning_efforts,
|
github_model_reasoning_efforts,
|
||||||
normalize_copilot_model_id,
|
normalize_copilot_model_id,
|
||||||
normalize_opencode_model_id,
|
normalize_opencode_model_id,
|
||||||
|
|
@ -638,6 +639,110 @@ class TestValidateApiFallback:
|
||||||
assert "http://localhost:8000/v1/models" in result["message"]
|
assert "http://localhost:8000/v1/models" in result["message"]
|
||||||
assert "http://localhost:8000/v1" in result["message"]
|
assert "http://localhost:8000/v1" in result["message"]
|
||||||
|
|
||||||
|
def test_fetch_lmstudio_models_filters_embedding_type(self):
    """Entries whose type is "embedding" are dropped from the fetched listing."""
    payload = (
        b'{"models":['
        b'{"key":"publisher/chat-model","id":"publisher/chat-model","type":"llm"},'
        b'{"key":"publisher/embed-model","id":"publisher/embed-model","type":"embedding"}'
        b']}'
    )
    # The response doubles as its own context manager, as urlopen's does.
    response = MagicMock()
    response.read.return_value = payload
    response.__enter__.return_value = response
    response.__exit__.return_value = False

    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen", return_value=response
    )
    with urlopen_patch:
        listed = fetch_lmstudio_models(base_url="http://localhost:1234/v1")

    assert listed == ["publisher/chat-model"]
|
||||||
|
|
||||||
|
def test_validate_lmstudio_rejects_embedding_models(self):
    """Requesting a model listed only as an embedding model is rejected."""
    payload = (
        b'{"models":['
        b'{"key":"publisher/chat-model","id":"publisher/chat-model","type":"llm"},'
        b'{"key":"publisher/embed-model","id":"publisher/embed-model","type":"embedding"}'
        b']}'
    )
    # The response doubles as its own context manager, as urlopen's does.
    response = MagicMock()
    response.read.return_value = payload
    response.__enter__.return_value = response
    response.__exit__.return_value = False

    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen", return_value=response
    )
    with urlopen_patch:
        verdict = validate_requested_model(
            "publisher/embed-model",
            "lmstudio",
            base_url="http://localhost:1234/v1",
        )

    assert verdict["accepted"] is False
    assert verdict["recognized"] is False
    assert "not found in LM Studio's model listing" in verdict["message"]
|
||||||
|
|
||||||
|
def test_fetch_lmstudio_models_raises_auth_error_on_401(self):
    """An HTTP 401 from the listing endpoint is raised as an AuthError."""
    import urllib.error

    import pytest

    from hermes_cli.auth import AuthError

    unauthorized = urllib.error.HTTPError(
        url="http://localhost:1234/api/v1/models",
        code=401,
        msg="Unauthorized",
        hdrs=None,
        fp=None,
    )
    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen", side_effect=unauthorized
    )

    with urlopen_patch, pytest.raises(AuthError) as excinfo:
        fetch_lmstudio_models(base_url="http://localhost:1234/v1")

    raised = excinfo.value
    assert raised.provider == "lmstudio"
    assert raised.code == "auth_rejected"
    assert "401" in str(raised)
|
||||||
|
|
||||||
|
def test_fetch_lmstudio_models_returns_empty_on_network_error(self):
    """A refused connection yields an empty model list."""
    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen",
        side_effect=ConnectionRefusedError(),
    )
    with urlopen_patch:
        listed = fetch_lmstudio_models(base_url="http://localhost:1234/v1")

    assert listed == []
|
||||||
|
|
||||||
|
def test_validate_lmstudio_distinguishes_auth_failure(self):
    """A 401 during validation produces a message naming the status and LM_API_KEY."""
    import urllib.error

    unauthorized = urllib.error.HTTPError(
        url="http://localhost:1234/api/v1/models",
        code=401,
        msg="Unauthorized",
        hdrs=None,
        fp=None,
    )
    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen", side_effect=unauthorized
    )

    with urlopen_patch:
        verdict = validate_requested_model(
            "publisher/chat-model",
            "lmstudio",
            base_url="http://localhost:1234/v1",
        )

    assert verdict["accepted"] is False
    message = verdict["message"]
    assert "401" in message
    assert "LM_API_KEY" in message
|
||||||
|
|
||||||
|
def test_validate_lmstudio_distinguishes_unreachable(self):
    """A refused connection during validation is reported as an unreachable server."""
    urlopen_patch = patch(
        "hermes_cli.models.urllib.request.urlopen",
        side_effect=ConnectionRefusedError(),
    )
    with urlopen_patch:
        verdict = validate_requested_model(
            "publisher/chat-model",
            "lmstudio",
            base_url="http://localhost:1234/v1",
        )

    assert verdict["accepted"] is False
    assert "Could not reach LM Studio" in verdict["message"]
|
||||||
|
|
||||||
|
|
||||||
# -- validate — Codex auto-correction ------------------------------------------
|
# -- validate — Codex auto-correction ------------------------------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -240,6 +240,110 @@ def test_resolve_runtime_provider_ai_gateway(monkeypatch):
|
||||||
assert resolved["requested_provider"] == "ai-gateway"
|
assert resolved["requested_provider"] == "ai-gateway"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_runtime_provider_lmstudio_uses_token_when_present(monkeypatch):
    """A resolved LM_API_KEY credential is carried into the runtime provider."""
    model_cfg = {
        "provider": "lmstudio",
        "base_url": "http://127.0.0.1:1234/v1",
        "default": "publisher/model-a",
    }
    credentials = {
        "provider": "lmstudio",
        "api_key": "lm-token",
        "base_url": "http://127.0.0.1:1234/v1",
        "source": "LM_API_KEY",
    }

    class Pool:
        """Credential pool stub that never holds credentials."""

        def has_credentials(self):
            return False

    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio")
    monkeypatch.setattr(rp, "_get_model_config", lambda: model_cfg)
    monkeypatch.setattr(rp, "load_pool", lambda provider: Pool())
    monkeypatch.setattr(
        rp, "resolve_api_key_provider_credentials", lambda provider: credentials
    )

    resolved = rp.resolve_runtime_provider(requested="lmstudio")

    assert resolved["provider"] == "lmstudio"
    assert resolved["api_key"] == "lm-token"
    assert resolved["api_mode"] == "chat_completions"
    assert resolved["base_url"] == "http://127.0.0.1:1234/v1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_runtime_provider_lmstudio_honors_saved_base_url(monkeypatch):
    """Existing `provider: lmstudio` configs with a custom base_url keep working.

    `lmstudio` used to alias to `custom`, so a user running LM Studio on a
    remote machine (e.g. a lab box) could set `provider: "lmstudio"` with
    `base_url: "http://192.168.1.10:1234/v1"` and the custom path honored it.
    With `lmstudio` now first-class and `inference_base_url=127.0.0.1`, the
    saved `base_url` from `model_cfg` must still take precedence; otherwise
    the change silently breaks those users.
    """
    model_cfg = {
        "provider": "lmstudio",
        "base_url": "http://192.168.1.10:1234/v1",
        "default": "qwen/qwen3-coder-30b",
    }

    class Pool:
        """Credential pool stub that never holds credentials."""

        def has_credentials(self):
            return False

    for name in ("LM_API_KEY", "LM_BASE_URL"):
        monkeypatch.delenv(name, raising=False)
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio")
    monkeypatch.setattr(rp, "_get_model_config", lambda: model_cfg)
    monkeypatch.setattr(rp, "load_pool", lambda provider: Pool())
    # resolve_api_key_provider_credentials is deliberately left unmocked so
    # the real function exercises the end-to-end precedence between
    # model_cfg and the pconfig default.

    resolved = rp.resolve_runtime_provider(requested="lmstudio")

    assert resolved["provider"] == "lmstudio"
    assert resolved["api_mode"] == "chat_completions"
    # The 127.0.0.1 default must NOT shadow the saved base_url.
    assert resolved["base_url"] == "http://192.168.1.10:1234/v1"
    # No-auth LM Studio: a missing LM_API_KEY is replaced by the placeholder.
    assert resolved["api_key"] == "dummy-lm-api-key"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_runtime_provider_lmstudio_base_url_env_wins_over_saved_base_url(monkeypatch):
    """LM_BASE_URL overrides the saved lmstudio base_url for temporary redirects."""
    model_cfg = {
        "provider": "lmstudio",
        "base_url": "http://192.168.1.10:1234/v1",
        "default": "qwen/qwen3-coder-30b",
    }

    class Pool:
        """Credential pool stub that never holds credentials."""

        def has_credentials(self):
            return False

    monkeypatch.delenv("LM_API_KEY", raising=False)
    monkeypatch.setenv("LM_BASE_URL", "http://override.local:9999/v1")
    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "lmstudio")
    monkeypatch.setattr(rp, "_get_model_config", lambda: model_cfg)
    monkeypatch.setattr(rp, "load_pool", lambda provider: Pool())

    resolved = rp.resolve_runtime_provider(requested="lmstudio")

    assert resolved["provider"] == "lmstudio"
    assert resolved["api_mode"] == "chat_completions"
    assert resolved["base_url"] == "http://override.local:9999/v1"
    assert resolved["api_key"] == "dummy-lm-api-key"
|
||||||
|
|
||||||
|
|
||||||
def test_resolve_runtime_provider_ai_gateway_explicit_override_skips_pool(monkeypatch):
|
def test_resolve_runtime_provider_ai_gateway_explicit_override_skips_pool(monkeypatch):
|
||||||
def _unexpected_pool(provider):
|
def _unexpected_pool(provider):
|
||||||
raise AssertionError(f"load_pool should not be called for {provider}")
|
raise AssertionError(f"load_pool should not be called for {provider}")
|
||||||
|
|
@ -1237,6 +1341,21 @@ def test_resolve_provider_openrouter_unchanged():
|
||||||
assert resolve_provider("openrouter") == "openrouter"
|
assert resolve_provider("openrouter") == "openrouter"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_provider_lmstudio_returns_lmstudio(monkeypatch):
    """resolve_provider maps every LM Studio spelling to 'lmstudio', not 'custom'.

    Regression for the alias-map bug in which 'lmstudio' was rewritten to
    'custom' ahead of the PROVIDER_REGISTRY lookup, so the first-class
    LM Studio provider was bypassed entirely at runtime.
    """
    from hermes_cli.auth import resolve_provider

    for name in ("OPENAI_API_KEY", "OPENROUTER_API_KEY"):
        monkeypatch.delenv(name, raising=False)

    for spelling in ("lmstudio", "lm-studio", "lm_studio"):
        assert resolve_provider(spelling) == "lmstudio"
|
||||||
|
|
||||||
|
|
||||||
def test_custom_provider_runtime_preserves_provider_name(monkeypatch):
|
def test_custom_provider_runtime_preserves_provider_name(monkeypatch):
|
||||||
"""resolve_runtime_provider with provider='custom' must return provider='custom'."""
|
"""resolve_runtime_provider with provider='custom' must return provider='custom'."""
|
||||||
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||||
|
|
|
||||||
|
|
@ -122,3 +122,34 @@ def test_show_status_hides_nous_subscription_section_when_feature_flag_is_off(mo
|
||||||
|
|
||||||
out = capsys.readouterr().out
|
out = capsys.readouterr().out
|
||||||
assert "Nous Tool Gateway" not in out
|
assert "Nous Tool Gateway" not in out
|
||||||
|
|
||||||
|
|
||||||
|
def test_show_status_reports_empty_lmstudio_listing_as_reachable(monkeypatch, capsys, tmp_path):
    """An empty model listing from the probe is still shown as a reachable server."""
    from hermes_cli import status as status_mod

    config = {
        "model": {
            "default": "qwen/qwen3-coder-30b",
            "provider": "lmstudio",
            "base_url": "http://127.0.0.1:1234/v1",
        }
    }

    def _empty_probe(api_key=None, base_url=None, timeout=5.0):
        return []

    _patch_common_status_deps(monkeypatch, status_mod, tmp_path)
    # raising=False: do not fail if status_mod lacks one of these attributes.
    replacements = {
        "load_config": lambda: config,
        "resolve_requested_provider": lambda requested=None: "lmstudio",
        "resolve_provider": lambda requested=None, **kwargs: "lmstudio",
        "provider_label": lambda provider: "LM Studio",
    }
    for attr, stub in replacements.items():
        monkeypatch.setattr(status_mod, attr, stub, raising=False)
    monkeypatch.setattr("hermes_cli.models.probe_lmstudio_models", _empty_probe)

    status_mod.show_status(SimpleNamespace(all=False, deep=False))

    printed = capsys.readouterr().out
    assert "LM Studio" in printed
    assert "reachable (0 model(s)) at http://127.0.0.1:1234/v1" in printed
|
||||||
|
|
|
||||||
|
|
@ -4168,6 +4168,7 @@ def _(rid, params: dict) -> dict:
|
||||||
cfg = _load_cfg()
|
cfg = _load_cfg()
|
||||||
current_provider = getattr(agent, "provider", "") or ""
|
current_provider = getattr(agent, "provider", "") or ""
|
||||||
current_model = getattr(agent, "model", "") or _resolve_model()
|
current_model = getattr(agent, "model", "") or _resolve_model()
|
||||||
|
current_base_url = getattr(agent, "base_url", "") or ""
|
||||||
# list_authenticated_providers already populates each provider's
|
# list_authenticated_providers already populates each provider's
|
||||||
# "models" with the curated list (same source as `hermes model` and
|
# "models" with the curated list (same source as `hermes model` and
|
||||||
# classic CLI's /model picker). Do NOT overwrite with live
|
# classic CLI's /model picker). Do NOT overwrite with live
|
||||||
|
|
@ -4176,6 +4177,8 @@ def _(rid, params: dict) -> dict:
|
||||||
# TTS, embeddings, rerankers, image/video generators).
|
# TTS, embeddings, rerankers, image/video generators).
|
||||||
providers = list_authenticated_providers(
|
providers = list_authenticated_providers(
|
||||||
current_provider=current_provider,
|
current_provider=current_provider,
|
||||||
|
current_base_url=current_base_url,
|
||||||
|
current_model=current_model,
|
||||||
user_providers=(
|
user_providers=(
|
||||||
cfg.get("providers") if isinstance(cfg.get("providers"), dict) else {}
|
cfg.get("providers") if isinstance(cfg.get("providers"), dict) else {}
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ You need at least one way to connect to an LLM. Use `hermes model` to switch pro
|
||||||
| **Hugging Face** | `HF_TOKEN` in `~/.hermes/.env` (provider: `huggingface`, aliases: `hf`) |
|
| **Hugging Face** | `HF_TOKEN` in `~/.hermes/.env` (provider: `huggingface`, aliases: `hf`) |
|
||||||
| **Google / Gemini** | `GOOGLE_API_KEY` (or `GEMINI_API_KEY`) in `~/.hermes/.env` (provider: `gemini`) |
|
| **Google / Gemini** | `GOOGLE_API_KEY` (or `GEMINI_API_KEY`) in `~/.hermes/.env` (provider: `gemini`) |
|
||||||
| **Google Gemini (OAuth)** | `hermes model` → "Google Gemini (OAuth)" (provider: `google-gemini-cli`, free tier supported, browser PKCE login) |
|
| **Google Gemini (OAuth)** | `hermes model` → "Google Gemini (OAuth)" (provider: `google-gemini-cli`, free tier supported, browser PKCE login) |
|
||||||
|
| **LM Studio** | `hermes model` → "LM Studio" (provider: `lmstudio`, optional `LM_API_KEY`) |
|
||||||
| **Custom Endpoint** | `hermes model` → choose "Custom endpoint" (saved in `config.yaml`) |
|
| **Custom Endpoint** | `hermes model` → choose "Custom endpoint" (saved in `config.yaml`) |
|
||||||
|
|
||||||
:::tip Model key alias
|
:::tip Model key alias
|
||||||
|
|
@ -725,10 +726,10 @@ Then configure Hermes:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
hermes model
|
hermes model
|
||||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
# Select "LM Studio"
|
||||||
# Enter URL: http://localhost:1234/v1
|
# Press Enter to use http://127.0.0.1:1234/v1
|
||||||
# Skip API key (LM Studio doesn't require one)
|
# Pick one of the discovered models
|
||||||
# Enter model name
|
# If LM Studio server auth is enabled, enter LM_API_KEY when prompted
|
||||||
```
|
```
|
||||||
|
|
||||||
:::caution Context length often defaults to 2048
|
:::caution Context length often defaults to 2048
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue