diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index 54567ac02..98bdd2276 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -1085,28 +1085,29 @@ class TestDownloadRetryClassification: # --------------------------------------------------------------------------- -# Fan-out concurrency cap — a single turn (or several concurrent sessions in -# one process) can launch dozens of vision_analyze calls at once. The -# process-global semaphore must bound how many run simultaneously so a video- -# frame storm can't pin a worker thread and starve the dashboard event loop. +# CPU-burst concurrency cap — a single turn (or several concurrent sessions in +# one process) can launch dozens of vision_analyze calls at once. Only the +# CPU-bound encode/resize is bounded (to host cores), so a video-frame storm +# can't saturate every core and starve the dashboard event loop — while the +# network-bound LLM calls stay fully concurrent for legitimate multi-image work. # --------------------------------------------------------------------------- -class TestVisionFanoutConcurrencyCap: - """The process-global semaphore bounds concurrent vision analyses.""" +class TestVisionCpuBurstCap: + """The bounded CPU executor caps concurrent encode/resize, not LLM calls.""" - def test_resolver_defaults_to_min_cpus_and_ceiling(self): + def test_resolver_defaults_to_host_cpus_no_ceiling(self): from tools import vision_tools as vt with ( patch.dict(os.environ, {}, clear=False), patch("tools.vision_tools._detect_host_cpus", return_value=64), + patch("hermes_cli.config.load_config", side_effect=Exception), ): os.environ.pop("HERMES_VISION_MAX_CONCURRENCY", None) - # No config override available in the test env → falls to default, - # which is clamped to the ceiling even on a 64-core host. 
- with patch("hermes_cli.config.load_config", side_effect=Exception): - assert vt._resolve_vision_max_concurrency() == vt._VISION_DEFAULT_CONCURRENCY_CEILING + # No fixed ceiling: a 64-core host gets 64 encode workers. The cap + # tracks the actual resource (cores), not a magic number. + assert vt._resolve_vision_cpu_workers() == 64 def test_resolver_respects_low_host_cpu_count(self): from tools import vision_tools as vt @@ -1117,14 +1118,15 @@ class TestVisionFanoutConcurrencyCap: patch("hermes_cli.config.load_config", side_effect=Exception), ): os.environ.pop("HERMES_VISION_MAX_CONCURRENCY", None) - # 2-core host → cap is 2 (host limit, below the ceiling of 4). - assert vt._resolve_vision_max_concurrency() == 2 + assert vt._resolve_vision_cpu_workers() == 2 def test_resolver_env_override(self): from tools import vision_tools as vt - with patch.dict(os.environ, {"HERMES_VISION_MAX_CONCURRENCY": "1"}): - assert vt._resolve_vision_max_concurrency() == 1 + with patch.dict(os.environ, {"HERMES_VISION_MAX_CONCURRENCY": "16"}): + # Explicit override is honored verbatim — including ABOVE core count, + # so operators can raise it for heavy multi-image workloads. + assert vt._resolve_vision_cpu_workers() == 16 def test_resolver_rejects_sub_one_override(self): from tools import vision_tools as vt @@ -1134,55 +1136,95 @@ class TestVisionFanoutConcurrencyCap: patch("tools.vision_tools._detect_host_cpus", return_value=2), patch("hermes_cli.config.load_config", side_effect=Exception), ): - # 0 is ignored (cap can never be disabled) → falls back to default. - assert vt._resolve_vision_max_concurrency() == 2 + # 0 is ignored (cap can never be disabled) → falls back to host cores. 
+ assert vt._resolve_vision_cpu_workers() == 2 + + def test_cpu_executor_is_dedicated_and_sized_to_workers(self): + """The encode executor must be dedicated, not the shared default pool.""" + import importlib + from concurrent.futures import ThreadPoolExecutor + + vt = importlib.import_module("tools.vision_tools") + assert isinstance(vt._vision_cpu_executor, ThreadPoolExecutor) + assert vt._vision_cpu_executor._max_workers == vt._VISION_CPU_WORKERS @pytest.mark.asyncio - async def test_fanout_is_bounded_by_semaphore(self): - """Firing many concurrent vision calls must never exceed the cap in flight. + async def test_encode_runs_on_dedicated_cpu_executor(self): + """Encode/resize must execute on a ``vision-encode`` thread, off the loop. - This is the regression guard for the prod incident: an unbounded - fan-out pinned the event loop. With the cap, peak concurrency is - clamped to the semaphore value regardless of how many calls launch. + Regression guard: the CPU burst is what saturated cores and starved the + loop. It must run on the bounded vision executor, not the caller's loop + thread nor the shared default pool. + """ + import importlib + import threading + + vt = importlib.import_module("tools.vision_tools") + + seen_threads = [] + + def fake_encode(path, mime_type=None): + seen_threads.append(threading.current_thread().name) + return "data:image/jpeg;base64,AAAA" + + result = await vt._run_encode_on_cpu_executor(fake_encode, "p", mime_type="image/jpeg") + assert result == "data:image/jpeg;base64,AAAA" + assert len(seen_threads) == 1 + assert seen_threads[0].startswith("vision-encode"), seen_threads + + @pytest.mark.asyncio + async def test_encode_bursts_bounded_but_llm_stays_concurrent(self): + """Encode concurrency is clamped to the cap; the LLM call is not. + + Drives many native-path calls whose encode step is the only thing on + the CPU executor. 
With the executor sized to CAP, no more than CAP + encodes ever run at once — even though all N calls are in flight + simultaneously (proving the analyses themselves are NOT serialized). """ import asyncio import importlib - import threading - # Resolve the module fresh and drive BOTH the handler and the patch - # targets through that SAME module object. Sibling suites - # (test_vision_routing_31179) delete tools.vision_tools from - # sys.modules, so the top-level ``_handle_vision_analyze`` import can - # be bound to a stale module while ``patch`` hits the current one — - # patching the wrong object lets the real function run (peak stays 0). + from concurrent.futures import ThreadPoolExecutor + vt = importlib.import_module("tools.vision_tools") CAP = 3 - in_flight = 0 - peak = 0 - lock = asyncio.Lock() + N = 12 + enc_inflight = 0 + enc_peak = 0 + calls_inflight = 0 + calls_peak = 0 + import threading as _t + enc_lock = _t.Lock() + + def slow_encode(path, mime_type=None): + nonlocal enc_inflight, enc_peak + with enc_lock: + enc_inflight += 1 + enc_peak = max(enc_peak, enc_inflight) + try: + _t.Event().wait(0.04) # simulate CPU burst + finally: + with enc_lock: + enc_inflight -= 1 + return "data:image/jpeg;base64,AAAA" async def fake_native(image_url, question): - nonlocal in_flight, peak - async with lock: - in_flight += 1 - peak = max(peak, in_flight) + nonlocal calls_inflight, calls_peak + calls_inflight += 1 + calls_peak = max(calls_peak, calls_inflight) try: - # Hold the slot long enough that, without a cap, all callers - # would overlap and drive peak up to N. - await asyncio.sleep(0.05) + # The encode is the capped CPU step. + await vt._run_encode_on_cpu_executor(slow_encode, "p", mime_type="image/jpeg") + # The "LLM call" is NOT capped — overlaps freely. 
+ await asyncio.sleep(0.02) finally: - async with lock: - in_flight -= 1 + calls_inflight -= 1 return json.dumps({"ok": True}) - N = 12 - # Install a fresh semaphore at the test cap so the assertion is - # deterministic regardless of the host's core count. with ( - patch.object(vt, "_vision_concurrency_semaphore", - threading.BoundedSemaphore(CAP)), - patch.object(vt, "_should_use_native_vision_fast_path", - return_value=True), + patch.object(vt, "_vision_cpu_executor", + ThreadPoolExecutor(max_workers=CAP, thread_name_prefix="vision-encode")), + patch.object(vt, "_should_use_native_vision_fast_path", return_value=True), patch.object(vt, "_vision_analyze_native", side_effect=fake_native), ): await asyncio.gather(*[ @@ -1193,58 +1235,12 @@ class TestVisionFanoutConcurrencyCap: for i in range(N) ]) - assert peak <= CAP, f"peak concurrency {peak} exceeded cap {CAP}" - # Sanity: with N > CAP and a real wait, we should have actually - # saturated the cap (otherwise the test proves nothing). - assert peak == CAP, f"expected to saturate cap {CAP}, only reached {peak}" - - @pytest.mark.asyncio - async def test_unbounded_fanout_would_exceed_cap_without_semaphore(self): - """Control: with a no-op (effectively unbounded) semaphore, peak blows past CAP. - - Proves the guard above would fail if the semaphore weren't enforcing - the limit — i.e. the test is actually exercising the cap. - """ - import asyncio - import importlib - import threading - vt = importlib.import_module("tools.vision_tools") - - CAP = 3 - in_flight = 0 - peak = 0 - lock = asyncio.Lock() - - async def fake_native(image_url, question): - nonlocal in_flight, peak - async with lock: - in_flight += 1 - peak = max(peak, in_flight) - try: - await asyncio.sleep(0.05) - finally: - async with lock: - in_flight -= 1 - return json.dumps({"ok": True}) - - N = 12 - # A semaphore sized to N imposes no real limit for this workload. 
- with ( - patch.object(vt, "_vision_concurrency_semaphore", - threading.BoundedSemaphore(N)), - patch.object(vt, "_should_use_native_vision_fast_path", - return_value=True), - patch.object(vt, "_vision_analyze_native", side_effect=fake_native), - ): - await asyncio.gather(*[ - vt._handle_vision_analyze( - {"image_url": f"https://example.com/frame_{i}.png", - "question": "what is this"} - ) - for i in range(N) - ]) - - assert peak > CAP, ( - "control failed: peak did not exceed CAP even without a real cap " - f"(peak={peak})" + assert enc_peak <= CAP, f"encode peak {enc_peak} exceeded cap {CAP}" + assert enc_peak == CAP, f"expected to saturate encode cap {CAP}, got {enc_peak}" + # The analyses themselves were NOT serialized to the cap — all N ran + # concurrently, which is the whole point (multi-image workflows keep + # their concurrency; only the CPU burst is bounded). + assert calls_peak > CAP, ( + f"analyses were serialized to the cap (peak={calls_peak}); only the " + "encode burst should be bounded, not the whole call" ) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 69bbc3832..b6a05e01b 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -32,6 +32,7 @@ import base64 import contextlib import asyncio import json +from concurrent.futures import ThreadPoolExecutor import logging import os import uuid @@ -77,36 +78,35 @@ _VISION_MAX_DOWNLOAD_BYTES = 50 * 1024 * 1024 # --------------------------------------------------------------------------- -# Fan-out concurrency cap +# CPU-burst concurrency cap (vision encode/resize) # --------------------------------------------------------------------------- # A single agent turn can fan out N vision_analyze calls at once (the classic # trigger is "analyze every frame of this video" — ffmpeg explodes a clip into -# dozens of frames, the model then calls vision_analyze on each). Every call -# does a CPU-heavy base64-encode/resize burst AND holds a long-lived LLM stream -# open. 
The tool executor runs concurrent tool calls on a ThreadPoolExecutor -# (agent.tool_executor._MAX_TOOL_WORKERS = 8) PER SESSION, and several agent -# sessions share one process (the dashboard runs the agent in-process). With no -# global ceiling, a video-frame fan-out across one or more sessions pins a -# worker thread at ~100% CPU and starves the shared asyncio event loop that also -# serves the dashboard's /api/status liveness probe — so the instance flaps to -# UNHEALTHY even though nothing has actually crashed (observed in prod, June -# 2026). +# dozens of frames, the model then calls vision_analyze on each). Each call does +# a CPU-heavy base64-encode + (sometimes) Pillow resize. The tool executor runs +# concurrent tool calls on a ThreadPoolExecutor (agent.tool_executor = +# 8 workers) PER SESSION, and several agent sessions share one process (the +# dashboard runs the agent in-process). Unbounded, a video-frame fan-out across +# one or more sessions runs *every* encode at once, saturates all cores, and +# leaves no CPU to service the shared asyncio event loop that serves the +# dashboard's /api/status liveness probe — so the instance flaps to UNHEALTHY +# even though nothing has crashed (observed in prod, June 2026). # -# This semaphore bounds the number of vision analyses running concurrently -# across the WHOLE process, regardless of how many sessions or worker threads -# issue them. It is a threading.Semaphore (NOT asyncio.Semaphore): each vision -# call is dispatched through model_tools._run_async on a PER-THREAD event loop, -# so an asyncio primitive bound to one loop cannot coordinate across them. A -# threading semaphore is loop- and thread-agnostic, which is exactly what we -# need here. +# The fix is NOT to cap how many vision analyses run — multi-image workflows +# ("compare these 6 screenshots", "read this 10-page scan") legitimately want +# high concurrency, and the slow part (the LLM stream) is network-bound and +# harmless to the loop. 
We cap ONLY the CPU burst: the encode/resize is offloaded +# to a dedicated, bounded executor sized to the host's usable core count. That +# is the resource the incident actually exhausted (cores), so bounding it to +# cores is *correct*, not an arbitrary number — excess encodes queue on the +# executor instead of all running at once, the LLM calls stay fully concurrent, +# and by default encodes never outnumber cores. No fixed ceiling: the limit tracks the host. # -# Default: min(host CPU count, 4), floored at 1 — "respect the host's -# concurrency, or lower". 4 is a conservative ceiling: vision work is a mix of -# CPU (encode/resize) and network (LLM stream), and we would rather under- -# subscribe than let a frame storm wedge the loop. Override with -# HERMES_VISION_MAX_CONCURRENCY (env) or auxiliary.vision.max_concurrency -# (config.yaml). 0 / negative / unparseable falls back to the default. -import threading +# A threading primitive (NOT asyncio) is required: each vision call is dispatched +# through model_tools._run_async on a PER-THREAD event loop, so an asyncio +# primitive bound to one loop cannot coordinate across them. A +# ThreadPoolExecutor is loop- and thread-agnostic. +import threading # noqa: F401 (kept for downstream importers / patch targets) def _detect_host_cpus() -> int: @@ -122,19 +122,19 @@ def _detect_host_cpus() -> int: return max(1, os.cpu_count() or 1) -# Absolute ceiling for the default (not for explicit overrides): even on a -# many-core host, more than this many simultaneous in-process vision analyses -# is rarely worth the event-loop pressure. -_VISION_DEFAULT_CONCURRENCY_CEILING = 4 +def _resolve_vision_cpu_workers() -> int: + """Resolve how many vision encode/resize bursts may run concurrently. + Defaults to the host's usable core count (``_detect_host_cpus``) — no fixed + ceiling, because the cap tracks the actual exhausted resource (CPU cores), + not a magic number.
The LLM call is NOT covered by this limit, so legitimate + multi-image fan-out keeps full request concurrency; only the simultaneous + CPU bursts are bounded so the event loop always keeps a core. -def _resolve_vision_max_concurrency() -> int: - """Resolve the max concurrent vision analyses for this process. - - Resolution order: HERMES_VISION_MAX_CONCURRENCY env → config.yaml - auxiliary.vision.max_concurrency → default ``min(host_cpus, 4)``. Any - value that parses to < 1 is ignored in favor of the next source so the - cap can never be disabled into an unbounded fan-out. + Resolution order: HERMES_VISION_MAX_CONCURRENCY env → + config.yaml auxiliary.vision.max_concurrency → host core count. Any value + that parses to < 1 is ignored in favor of the next source so the cap can + never be disabled into an unbounded encode storm. """ env_val = os.getenv("HERMES_VISION_MAX_CONCURRENCY", "").strip() if env_val: @@ -154,11 +154,39 @@ def _resolve_vision_max_concurrency() -> int: return parsed except Exception: pass - return max(1, min(_detect_host_cpus(), _VISION_DEFAULT_CONCURRENCY_CEILING)) + return _detect_host_cpus() -_VISION_MAX_CONCURRENCY = _resolve_vision_max_concurrency() -_vision_concurrency_semaphore = threading.BoundedSemaphore(_VISION_MAX_CONCURRENCY) +_VISION_CPU_WORKERS = _resolve_vision_cpu_workers() + +# Dedicated, bounded executor for the CPU-bound encode/resize burst ONLY. We do +# NOT use the default executor (run_in_executor(None, ...)) — that pool is shared +# with the gateway and web server, so a fan-out would park encode work there and +# starve those callers. Sizing it to the usable core count means at most +# _VISION_CPU_WORKERS encodes run at once; further encodes queue on this +# executor's work queue, leaving cores free for the event loop. The LLM call is +# deliberately left OUTSIDE this executor so multi-image workflows keep full +# request concurrency. 
+_vision_cpu_executor = ThreadPoolExecutor( + max_workers=_VISION_CPU_WORKERS, + thread_name_prefix="vision-encode", +) + + +async def _run_encode_on_cpu_executor(fn, *args, **kwargs): + """Run a sync encode/resize callable on the bounded vision CPU executor. + + Offloads CPU-bound image work to :data:`_vision_cpu_executor` so it (a) + never runs on the caller's event-loop thread and (b) is bounded to the + host's usable core count process-wide. Excess encodes queue on the + executor instead of all running at once, leaving cores free for the loop. + The LLM call must NOT be routed through here — only the encode/resize. + """ + import functools + loop = asyncio.get_running_loop() + return await loop.run_in_executor( + _vision_cpu_executor, functools.partial(fn, *args, **kwargs) + ) def _image_url_shape_ok(url: str) -> bool: @@ -774,22 +802,17 @@ def _build_native_vision_tool_result( @contextlib.asynccontextmanager async def _vision_concurrency_slot(): - """Hold one process-global vision-concurrency slot for the duration. + """Deprecated no-op shim kept for backward compatibility. - Acquires :data:`_vision_concurrency_semaphore` before yielding and always - releases it on exit. The blocking acquire is offloaded to a worker thread - via ``run_in_executor`` so that waiting for a slot never blocks the calling - event loop (callers run on per-thread loops; blocking the acquire on the - loop thread would freeze that loop's other tasks while we wait). The - semaphore is a ``BoundedSemaphore`` so a double-release would raise rather - than silently inflate the limit. + The fan-out cap was narrowed to the CPU-bound encode/resize burst only + (see :data:`_vision_cpu_executor` / :func:`_run_encode_on_cpu_executor`). + Holding a slot across the whole analysis serialized legitimate multi-image + workflows behind the slow LLM call, which is exactly what we don't want. + This context manager no longer gates anything; encode/resize is bounded + where it actually runs. 
Retained only so any external caller importing it + keeps working. """ - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, _vision_concurrency_semaphore.acquire) - try: - yield - finally: - _vision_concurrency_semaphore.release() + yield async def _vision_analyze_native( @@ -851,7 +874,8 @@ async def _vision_analyze_native( success=False, ) - image_data_url = _image_to_base64_data_url( + image_data_url = await _run_encode_on_cpu_executor( + _image_to_base64_data_url, temp_image_path, mime_type=detected_mime_type, ) @@ -864,9 +888,12 @@ async def _vision_analyze_native( # target (4 MB / 7900px, headroom under both ceilings) whenever the # payload exceeds either limit, not just at the 20 MB hard ceiling. _over_bytes = len(image_data_url) > _EMBED_TARGET_BYTES - _over_dims = _image_exceeds_dimension(temp_image_path, _EMBED_MAX_DIMENSION) + _over_dims = await _run_encode_on_cpu_executor( + _image_exceeds_dimension, temp_image_path, _EMBED_MAX_DIMENSION, + ) if _over_bytes or _over_dims: - image_data_url = _resize_image_for_vision( + image_data_url = await _run_encode_on_cpu_executor( + _resize_image_for_vision, temp_image_path, mime_type=detected_mime_type, max_base64_bytes=_EMBED_TARGET_BYTES, max_dimension=_EMBED_MAX_DIMENSION, @@ -1008,15 +1035,19 @@ async def vision_analyze_tool( # Convert image to base64 — send at full resolution first. # If the provider rejects it as too large, we auto-resize and retry. + # Offloaded to the bounded vision CPU executor so a fan-out of encodes + # can't saturate every core and starve the event loop. 
logger.info("Converting image to base64...") - image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type) + image_data_url = await _run_encode_on_cpu_executor( + _image_to_base64_data_url, temp_image_path, mime_type=detected_mime_type) data_size_kb = len(image_data_url) / 1024 logger.info("Image converted to base64 (%.1f KB)", data_size_kb) # Hard limit (20 MB) — no provider accepts payloads this large. if len(image_data_url) > _MAX_BASE64_BYTES: # Try to resize down to 5 MB before giving up. - image_data_url = _resize_image_for_vision( + image_data_url = await _run_encode_on_cpu_executor( + _resize_image_for_vision, temp_image_path, mime_type=detected_mime_type) if len(image_data_url) > _MAX_BASE64_BYTES: raise ValueError( @@ -1092,7 +1123,8 @@ async def vision_analyze_tool( len(image_data_url) / (1024 * 1024), _RESIZE_TARGET_BYTES / (1024 * 1024), ) - image_data_url = _resize_image_for_vision( + image_data_url = await _run_encode_on_cpu_executor( + _resize_image_for_vision, temp_image_path, mime_type=detected_mime_type) messages[0]["content"][1]["image_url"]["url"] = image_data_url response = await async_call_llm(**call_kwargs) @@ -1305,32 +1337,28 @@ async def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> str: image_url = args.get("image_url", "") question = args.get("question", "") - # Bound process-wide vision fan-out: a single turn (or several concurrent - # sessions sharing this process) can launch dozens of vision_analyze calls - # at once — e.g. "analyze every frame of this video". Each one is a - # CPU-heavy encode/resize plus a long LLM stream; unbounded, they pin a - # worker thread and starve the shared event loop that serves /api/status, - # flapping the instance to UNHEALTHY. The slot is held across the WHOLE - # analysis (image load + encode + LLM call), and acquiring it waits off the - # event loop, so excess calls queue instead of piling on simultaneously. 
- async with _vision_concurrency_slot(): - # Fast path: when native image routing is in effect for the active main - # model (provider accepts images in tool results, or the user set the - # model.supports_vision override), short-circuit the auxiliary LLM and - # return the image bytes as a multimodal tool-result envelope. The main - # model sees the pixels directly on its next turn — no aux call, no - # information loss, no extra latency. - if _should_use_native_vision_fast_path(): - logger.info("vision_analyze: native fast path") - return await _vision_analyze_native(image_url, question) + # The fan-out cap lives inside the encode/resize step (offloaded to the + # bounded _vision_cpu_executor), NOT around the whole analysis — so a + # legitimate multi-image workflow keeps full request concurrency while the + # CPU bursts that actually starve the loop are bounded to host cores. + # + # Fast path: when native image routing is in effect for the active main + # model (provider accepts images in tool results, or the user set the + # model.supports_vision override), short-circuit the auxiliary LLM and + # return the image bytes as a multimodal tool-result envelope. The main + # model sees the pixels directly on its next turn — no aux call, no + # information loss, no extra latency. + if _should_use_native_vision_fast_path(): + logger.info("vision_analyze: native fast path") + return await _vision_analyze_native(image_url, question) - # Legacy path: aux LLM describes the image and we return its text. - full_prompt = ( - "Fully describe and explain everything about this image, then answer the " - f"following question:\n\n{question}" - ) - model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None - return await vision_analyze_tool(image_url, full_prompt, model) + # Legacy path: aux LLM describes the image and we return its text. 
+ full_prompt = ( + "Fully describe and explain everything about this image, then answer the " + f"following question:\n\n{question}" + ) + model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + return await vision_analyze_tool(image_url, full_prompt, model) registry.register( diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index 83c08397b..1b45de6c1 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -684,7 +684,7 @@ Advanced per-platform knobs for throttling the outbound message batcher. Most us | `HERMES_FEISHU_DEDUP_CACHE_SIZE` | Size of the Feishu webhook dedup cache (default: `1024`). | | `HERMES_WECOM_TEXT_BATCH_DELAY_SECONDS` / `_SPLIT_DELAY_SECONDS` | WeCom batcher tuning. | | `HERMES_VISION_DOWNLOAD_TIMEOUT` | Timeout in seconds for downloading an image before handing it to vision models (default: `30`). | -| `HERMES_VISION_MAX_CONCURRENCY` | Max vision analyses running concurrently across the whole process (override for `auxiliary.vision.max_concurrency`; default `min(host CPUs, 4)`). Bounds video-frame fan-out so it can't saturate the event loop. Values `< 1` are ignored. | +| `HERMES_VISION_MAX_CONCURRENCY` | Max concurrent image **encode/resize** bursts across the whole process (override for `auxiliary.vision.max_concurrency`; default: host CPU core count, no ceiling). Bounds only the CPU-bound encode step — excess encodes queue instead of all running at once, so a video-frame fan-out can't starve the event loop — while the LLM calls stay fully concurrent. Values `< 1` are ignored. | | `HERMES_RESTART_DRAIN_TIMEOUT` | Gateway: seconds to wait for active runs to drain on `/restart` before forcing the restart (default: `900`). | | `HERMES_GATEWAY_PLATFORM_CONNECT_TIMEOUT` | Per-platform connect timeout during gateway startup (seconds). | | `HERMES_GATEWAY_BUSY_INPUT_MODE` | Default gateway busy-input behavior: `queue`, `steer`, or `interrupt`.
Can be overridden per chat with `/busy`. | diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index e1ba4c6b5..0bcda2138 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -1005,9 +1005,11 @@ auxiliary: api_key: "" # API key for base_url (falls back to OPENAI_API_KEY) timeout: 120 # seconds — LLM API call timeout; vision payloads need generous timeout download_timeout: 30 # seconds — image HTTP download; increase for slow connections - max_concurrency: 4 # max vision analyses running at once across the whole process - # (default: min(host CPUs, 4)) — bounds video-frame fan-out so it - # can't saturate the event loop. Minimum 1; values < 1 are ignored. + max_concurrency: 8 # max concurrent image encode/resize bursts across the process + # (default: host CPU core count, no ceiling) — bounds only the + # CPU-bound encode step: excess encodes queue instead of all + # running at once, so a video-frame fan-out can't starve the + # event loop; LLM calls stay fully concurrent. Minimum 1; values < 1 are ignored. # Web page summarization + browser page text extraction web_extract: