Desktop/dashboard WebSocket connections drop during long agent operations (delegate_task subagents, large model outputs) when the uvicorn event loop is GIL-starved for minutes. Root cause: uvicorn's ws keepalive ping runs on the SAME event loop as agent turns. A single synchronous GIL-holding call on a worker thread (a regex/scrub over a large output, or a long subagent turn) freezes the loop, so it cannot process the incoming pong within ws_ping_timeout, and uvicorn closes an otherwise-healthy connection (#53773: 'event loop stalled 226.3s'; #48445/#50005). Loosening the timeout only raises the threshold — a multi-minute stall sails past any finite window. The keepalive ping exists to detect half-open connections (reverse-proxy 524, dropped tunnels), which cannot happen on loopback: there is no network or proxy in the path, and a dead local client tears the socket down with a real FIN/RST that starlette surfaces as WebSocketDisconnect regardless of the ping. So on loopback the ping provides essentially no liveness value while actively killing connections during recoverable stalls — disable it entirely (ws_ping_interval=None, ws_ping_timeout=None). Non-loopback (public) binds sit behind a Cloudflare Tunnel where half-open IS a real failure mode, so the ping stays at 20/20 (interval/timeout, in seconds) to detect it. Empirically verified (real uvicorn + websockets peer): with the ping disabled (ws_ping_interval/ws_ping_timeout=None) the server never closes a silent peer during an 8s window; with the pre-fix 2s/2s window uvicorn closes it. A genuinely-dead client still fires the WebSocketDisconnect reap path regardless of the ping. Note: this fixes the local Desktop case (the OP's scenario). A remote Desktop over an authenticated public dashboard route (McCalebTheSecond's comment) keeps the ping and needs the deeper GIL-hotspot fix — tracked separately. Closes #53773.
212 lines
7.8 KiB
Python
212 lines
7.8 KiB
Python
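"""Test that start_server configures ws-ping keepalive and picks the right
event-loop runner.

The server now uses uvicorn.Server directly (not uvicorn.run), so we stub
uvicorn.Config and uvicorn.Server to capture the Config kwargs without serving
real traffic. The runner-selection tests additionally stub asyncio.run and
uvicorn._compat.asyncio_run so no event loop is started.
"""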
|
|
|
|
import asyncio
|
|
import contextlib
|
|
|
|
import uvicorn
|
|
|
|
from hermes_cli import web_server
|
|
|
|
|
|
def _stub_uvicorn(monkeypatch):
    """Swap ``uvicorn.Config`` and ``uvicorn.Server`` for inert fakes.

    With the fakes installed, start_server has nothing real to serve and
    returns immediately.

    Returns:
        A dict that accumulates every keyword argument passed to the fake
        Config constructor.
    """
    config_kwargs: dict = {}

    class _FakeConfig:
        # Class-level attributes the fake exposes in place of a real Config.
        loaded = True
        host = "127.0.0.1"
        port = 8000
        _loop_factory = None

        class lifespan_class:
            """Lifespan stand-in whose startup and shutdown are no-ops."""

            should_exit = False
            state: dict = {}

            def __init__(self, *args, **kwargs):
                """Accept and discard any constructor arguments."""

            async def startup(self):
                """Do nothing."""

            async def shutdown(self):
                """Do nothing."""

        def __init__(self, *args, **kwargs):
            # Positional arguments are ignored; only keywords are recorded.
            config_kwargs.update(kwargs)

        def load(self):
            """Do nothing."""

        def get_loop_factory(self):
            # Tests may override ``_loop_factory`` on the class to pin a value.
            return self._loop_factory

    class _FakeServer:
        should_exit = False
        started = True
        servers: list = []
        lifespan = None

        @staticmethod
        def capture_signals():
            # A context manager that does nothing on enter or exit.
            return contextlib.nullcontext()

        async def startup(self, sockets=None):
            """Do nothing."""

        async def main_loop(self):
            """Do nothing."""

        async def shutdown(self, sockets=None):
            """Do nothing."""

    def _make_server(config):
        # The config handed to the Server constructor is intentionally unused.
        return _FakeServer()

    monkeypatch.setattr(uvicorn, "Config", _FakeConfig)
    monkeypatch.setattr(uvicorn, "Server", _make_server)
    return config_kwargs
|
|
|
|
|
|
def test_start_server_disables_ws_ping_on_loopback(monkeypatch):
    """A loopback bind (the Desktop case) must turn uvicorn's ws ping off.

    The protocol-level keepalive ping shares an event loop with agent turns.
    When one synchronous, GIL-holding call on a worker thread starves that
    loop for minutes, the pong goes unprocessed and uvicorn drops a local
    connection that was actually healthy (#53773 "event loop stalled 226.3s",
    #48445/#50005). Loopback has no network or proxy hop on which a half-open
    connection could arise, and a dead local client closes the socket with a
    real FIN/RST that surfaces as WebSocketDisconnect anyway. The ping
    therefore adds no liveness value there and only causes false disconnects,
    so both ping settings must be None.
    """
    config_kwargs = _stub_uvicorn(monkeypatch)

    # A loopback bind has no auth gate, so the call reaches the Config
    # constructor without extra setup.
    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    for setting in ("ws_ping_interval", "ws_ping_timeout"):
        assert config_kwargs[setting] is None
|
|
|
|
|
|
def test_start_server_enables_ws_ping_for_half_open_detection(monkeypatch):
    """A non-loopback (public) bind must leave the ws ping switched on.

    Half-open connections (reverse-proxy 524, dropped Cloudflare Tunnel) only
    raise WebSocketDisconnect into the reaping path when the ping is active
    (#32377).

    The test pins an invariant rather than a literal value, because a frozen
    number would churn on every retune of the window: both settings are set
    and positive, and the timeout is at least as long as the interval. The
    loopback counterpart is test_start_server_disables_ws_ping_on_loopback;
    this test covers the public-bind case, where the auth gate is active.
    """
    config_kwargs = _stub_uvicorn(monkeypatch)

    def _auth_not_required(*args, **kwargs):
        return False

    # Bind to a non-loopback host so the _is_loopback branch picks the
    # enabled-ping window. The auth gate is neutralized so start_server reaches
    # uvicorn.Config without a registered provider (a real public bind would
    # raise SystemExit here). The ping window keys off the host, not the auth
    # flag.
    monkeypatch.setattr(web_server, "should_require_auth", _auth_not_required)
    web_server.start_server(host="0.0.0.0", port=0, open_browser=False)

    interval = config_kwargs["ws_ping_interval"]
    assert interval and interval > 0
    timeout = config_kwargs["ws_ping_timeout"]
    assert timeout and timeout > 0
    assert timeout >= interval
|
|
|
|
|
|
def test_start_server_runs_on_uvicorns_loop_factory(monkeypatch):
    """On Windows the backend must serve on the loop that *uvicorn* chooses.

    ``asyncio.run`` defaults to a ProactorEventLoop on Windows, whereas
    uvicorn's socket-serving stack forces a SelectorEventLoop on win32
    (``uvicorn/loops/asyncio.py``). Serving on the proactor loop binds a
    socket that never accepts: the backend prints "Skipping web UI build" and
    then hangs with the port LISTENING but no TCP handshake (#50641). The fix
    routes the serve call through ``uvicorn._compat.asyncio_run`` with
    ``config.get_loop_factory()``, which is what ``uvicorn.Server.run`` does.

    Contract checked here: with the platform reported as win32, the runner
    receives exactly the loop factory produced by uvicorn's Config, and bare
    ``asyncio.run`` is not used while the loop-factory runner is available.
    """
    _stub_uvicorn(monkeypatch)

    # The fix is win32-only. Report win32 so the Windows branch is exercised
    # even when CI runs on a POSIX host.
    monkeypatch.setattr(web_server.sys, "platform", "win32")

    # _stub_uvicorn's fake Config returns ``_loop_factory`` from
    # get_loop_factory(). Plant a unique marker there so the test can verify
    # it reaches the runner untouched.
    loop_factory_marker = object()
    monkeypatch.setattr(
        uvicorn.Config, "_loop_factory", loop_factory_marker, raising=False
    )

    runner_loop_factory = None
    bare_run_used = False

    def _recording_runner(coro, *, loop_factory=None):
        nonlocal runner_loop_factory
        runner_loop_factory = loop_factory
        coro.close()  # dispose of the coroutine without an event loop

    def _bare_run_tripwire(coro):
        nonlocal bare_run_used
        bare_run_used = True
        coro.close()

    monkeypatch.setattr(
        "uvicorn._compat.asyncio_run", _recording_runner, raising=False
    )
    # Bare asyncio.run must NOT be the serve path on Windows when the
    # loop-factory runner is importable, so make any call to it observable.
    monkeypatch.setattr(asyncio, "run", _bare_run_tripwire)

    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    assert runner_loop_factory is loop_factory_marker, (
        "start_server must pass uvicorn's get_loop_factory() result to the "
        "runner so Windows serves on a SelectorEventLoop"
    )
    assert bare_run_used is False, (
        "start_server must not fall back to bare asyncio.run when uvicorn's "
        "loop-factory runner is available"
    )
|
|
|
|
|
|
def test_start_server_keeps_bare_asyncio_run_on_posix(monkeypatch):
    """On POSIX the serve path must remain plain ``asyncio.run(_serve())``.

    The #50641 fix is deliberately limited to win32 to keep its blast radius
    small. Python's default loop on POSIX is already a SelectorEventLoop (or
    uvloop), which is what uvicorn serves on, so nothing needs fixing and the
    Windows loop-factory branch must never run.
    """
    _stub_uvicorn(monkeypatch)
    monkeypatch.setattr(web_server.sys, "platform", "linux")

    loop_factory_runner_used = False
    bare_run_used = False

    def _loop_factory_tripwire(coro, *, loop_factory=None):
        # Fires only if the Windows branch were taken.
        nonlocal loop_factory_runner_used
        loop_factory_runner_used = True
        coro.close()

    def _recording_asyncio_run(coro):
        nonlocal bare_run_used
        bare_run_used = True
        coro.close()

    monkeypatch.setattr(
        "uvicorn._compat.asyncio_run", _loop_factory_tripwire, raising=False
    )
    monkeypatch.setattr(asyncio, "run", _recording_asyncio_run)

    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    assert bare_run_used is True, "POSIX must serve via bare asyncio.run"
    assert loop_factory_runner_used is False, (
        "POSIX must not take the Windows loop-factory branch"
    )
|