hermes-agent/tests/test_web_server.py
kshitijk4poor e73adb5043 fix(dashboard): disable ws keepalive ping on loopback to survive event-loop stalls
Desktop/dashboard WebSocket connections drop during long agent operations
(delegate_task subagents, large model outputs) when the uvicorn event loop is
GIL-starved for minutes. Root cause: uvicorn's ws keepalive ping runs on the
SAME event loop as agent turns. A single synchronous GIL-holding call on a
worker thread (a regex/scrub over a large output, or a long subagent turn)
freezes the loop, so it cannot process the incoming pong within ws_ping_timeout
and uvicorn closes an otherwise-healthy connection (#53773: 'event loop stalled
226.3s'; #48445/#50005). Loosening the timeout only raises the threshold — a
multi-minute stall sails past any finite window.

The keepalive ping exists to detect half-open connections (reverse-proxy 524,
dropped tunnels), which cannot happen on loopback: there is no network or proxy
in the path, and a dead local client tears the socket down with a real FIN/RST
that starlette surfaces as WebSocketDisconnect regardless of the ping. So on
loopback the ping provides ~no liveness value while actively killing
recoverable stalls — disable it entirely (ws_ping_interval/timeout=None).

Non-loopback (public) binds sit behind a Cloudflare Tunnel where half-open IS a
real failure mode, so the ping stays at 20/20 to detect it.

Empirically verified (real uvicorn + websockets peer): with ws_ping=None the
server never closes a silent peer during an 8s window; with the pre-fix 2s/2s
window uvicorn closes it. A genuinely-dead client still fires the
WebSocketDisconnect reap path regardless of the ping.

Note: this fixes the local Desktop case (the OP's scenario). A remote Desktop
over an authenticated public dashboard route (McCalebTheSecond's comment) keeps
the ping and needs the deeper GIL-hotspot fix — tracked separately.

Closes #53773
2026-07-03 03:33:22 +05:30

212 lines
7.8 KiB
Python

"""Test that start_server configures ws-ping keepalive.
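The server now uses uvicorn.Server directly (not uvicorn.run), so we stub
uvicorn.Config and uvicorn.Server to capture the Config kwargs without serving
real traffic. The loop-runner tests additionally stub asyncio.run (and
uvicorn._compat.asyncio_run) so no event loop is started there.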
"""
import asyncio
import contextlib
import uvicorn
from hermes_cli import web_server
def _stub_uvicorn(monkeypatch):
    """Swap ``uvicorn.Config`` and ``uvicorn.Server`` for inert stand-ins.

    With the fakes installed, ``start_server`` can construct its config and
    drive its serve coroutine without any of the real uvicorn machinery doing
    work, so the call returns immediately.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; the patches are undone
            automatically when the requesting test finishes.

    Returns:
        A dict that accumulates every keyword argument handed to the fake
        ``Config`` constructor.
    """
    config_kwargs: dict = {}

    class _FakeConfig:
        # Class-level attributes so they are readable without any setup.
        loaded = True
        host = "127.0.0.1"
        port = 8000
        # Tests may overwrite this on the class to pin a sentinel factory.
        _loop_factory = None

        def __init__(self, *args, **kwargs):
            # Positional arguments are ignored; only keywords are recorded.
            config_kwargs.update(kwargs)

        def load(self):
            return None

        def get_loop_factory(self):
            return self._loop_factory

        class lifespan_class:
            should_exit = False
            state: dict = {}

            def __init__(self, *_args, **_kwargs):
                return None

            async def startup(self):
                return None

            async def shutdown(self):
                return None

    class _FakeServer:
        should_exit = False
        started = True
        servers: list = []
        lifespan = None

        @staticmethod
        def capture_signals():
            # No signal handlers are installed; hand back a no-op context.
            return contextlib.nullcontext()

        async def startup(self, sockets=None):
            return None

        async def main_loop(self):
            return None

        async def shutdown(self, sockets=None):
            return None

    def _make_server(config):
        # The config is accepted for signature compatibility and discarded.
        return _FakeServer()

    monkeypatch.setattr(uvicorn, "Config", _FakeConfig)
    monkeypatch.setattr(uvicorn, "Server", _make_server)
    return config_kwargs
def test_start_server_disables_ws_ping_on_loopback(monkeypatch):
    """A loopback bind (the Desktop case) must turn uvicorn's ws ping off.

    uvicorn's protocol-level keepalive ping shares the event loop with agent
    turns. One synchronous, GIL-holding call on a worker thread can starve
    that loop for minutes; the pong then goes unprocessed and uvicorn closes
    a connection that was otherwise healthy (#53773 "event loop stalled
    226.3s", #48445/#50005). Loopback has no network or proxy hop on which a
    half-open connection could arise, and a dead local client still produces
    a real FIN/RST that surfaces as WebSocketDisconnect, so the ping offers
    no liveness value there and can only cause false disconnects. Both ping
    settings must therefore be passed as ``None``.
    """
    config_kwargs = _stub_uvicorn(monkeypatch)

    # A loopback host is not subject to the auth gate, so start_server gets
    # as far as constructing uvicorn.Config.
    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    for setting in ("ws_ping_interval", "ws_ping_timeout"):
        assert config_kwargs[setting] is None
def test_start_server_enables_ws_ping_for_half_open_detection(monkeypatch):
    """A non-loopback (public) bind must leave the ws ping switched on.

    The ping is what turns a half-open connection (reverse-proxy 524, dropped
    Cloudflare Tunnel) into a WebSocketDisconnect that reaches the reaping
    path (#32377).

    Only invariants are checked: both values are set and positive, and the
    timeout is at least as long as the interval. Exact literals are avoided
    on purpose, because they would need editing every time the window is
    retuned. The loopback counterpart is
    test_start_server_disables_ws_ping_on_loopback.
    """
    config_kwargs = _stub_uvicorn(monkeypatch)

    def _auth_not_required(*_args, **_kwargs):
        return False

    # A public bind normally goes through the auth gate, and without a
    # registered provider that would raise SystemExit before uvicorn.Config
    # is built. Switch the gate off; the ping window is chosen from the host,
    # not from the auth flag.
    monkeypatch.setattr(web_server, "should_require_auth", _auth_not_required)
    web_server.start_server(host="0.0.0.0", port=0, open_browser=False)

    interval = config_kwargs["ws_ping_interval"]
    assert interval and interval > 0
    timeout = config_kwargs["ws_ping_timeout"]
    assert timeout and timeout > 0
    assert timeout >= interval
def test_start_server_runs_on_uvicorns_loop_factory(monkeypatch):
    """On Windows, uvicorn must be served on the loop uvicorn itself picks.

    ``asyncio.run`` defaults to a ProactorEventLoop on Windows, whereas
    uvicorn's socket-serving stack forces a SelectorEventLoop on win32
    (``uvicorn/loops/asyncio.py``). Serving on the proactor loop binds a
    socket that never accepts: the backend prints "Skipping web UI build" and
    then hangs with the port LISTENING but no TCP handshake (#50641). The fix
    routes the serve call through ``uvicorn._compat.asyncio_run`` with
    ``config.get_loop_factory()``, mirroring ``uvicorn.Server.run``.

    Contract checked here: with the platform reported as win32, the runner
    receives exactly the loop factory produced by uvicorn's Config, and bare
    ``asyncio.run`` is not used while the loop-factory runner is available.
    """
    _stub_uvicorn(monkeypatch)

    # The behavior is win32-only, so fake the platform to exercise the
    # Windows branch even when CI runs on POSIX.
    monkeypatch.setattr(web_server.sys, "platform", "win32")

    # uvicorn.Config is the fake at this point, and its get_loop_factory()
    # returns ``_loop_factory``. Pin a unique object there so identity can be
    # checked at the runner.
    expected_factory = object()
    monkeypatch.setattr(
        uvicorn.Config, "_loop_factory", expected_factory, raising=False
    )

    observed: dict = {}

    def _recording_runner(coro, *, loop_factory=None):
        observed["loop_factory"] = loop_factory
        coro.close()  # dispose of the coroutine; no loop is running

    monkeypatch.setattr(
        "uvicorn._compat.asyncio_run", _recording_runner, raising=False
    )

    # Tripwire: any call to plain asyncio.run is recorded as a failure.
    bare_run_calls = []

    def _tripwire_asyncio_run(coro):
        bare_run_calls.append(True)
        coro.close()

    monkeypatch.setattr(asyncio, "run", _tripwire_asyncio_run)

    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    assert observed.get("loop_factory") is expected_factory, (
        "start_server must pass uvicorn's get_loop_factory() result to the "
        "runner so Windows serves on a SelectorEventLoop"
    )
    assert len(bare_run_calls) == 0, (
        "start_server must not fall back to bare asyncio.run when uvicorn's "
        "loop-factory runner is available"
    )
def test_start_server_keeps_bare_asyncio_run_on_posix(monkeypatch):
    """On POSIX the serve path must remain plain ``asyncio.run(_serve())``.

    The #50641 fix is deliberately limited to win32 to keep its blast radius
    small. Python's default loop on POSIX is already a SelectorEventLoop (or
    uvloop), which is what uvicorn serves on, so the Windows loop-factory
    branch must never be taken there.
    """
    _stub_uvicorn(monkeypatch)
    monkeypatch.setattr(web_server.sys, "platform", "linux")

    # Record which entry point start_server uses to run the serve coroutine.
    invoked = []

    def _loop_factory_runner(coro, *, loop_factory=None):
        # Reached only if the Windows branch is (wrongly) selected.
        invoked.append("loop_factory_runner")
        coro.close()

    def _plain_asyncio_run(coro):
        invoked.append("asyncio.run")
        coro.close()

    monkeypatch.setattr(
        "uvicorn._compat.asyncio_run", _loop_factory_runner, raising=False
    )
    monkeypatch.setattr(asyncio, "run", _plain_asyncio_run)

    web_server.start_server(host="127.0.0.1", port=0, open_browser=False)

    assert "asyncio.run" in invoked, "POSIX must serve via bare asyncio.run"
    assert "loop_factory_runner" not in invoked, (
        "POSIX must not take the Windows loop-factory branch"
    )