Three CLI reliability fixes: 1. Interrupt reliability: chat() only re-queued the user's interrupt message when the turn result carried interrupted=True. When the agent thread raced past its last interrupt check (or finished) before the interrupt landed, the message was silently dropped — and the stale _interrupt_requested flag left on the agent instantly aborted the NEXT turn. Un-acknowledged interrupt messages are now re-queued as the next turn and the stale flag is cleared (only when the agent thread actually exited). The clarify-race path also parks the message in _pending_input instead of dropping it. 2. Slow exit (5+ min): stdlib ThreadPoolExecutor workers are non-daemon and joined unconditionally by concurrent.futures' atexit hook — even after shutdown(wait=False). One wedged tool worker (abandoned after interrupt/timeout) held the process open forever. Promoted async_delegation's daemon executor to a shared tools/daemon_pool module and adopted it in tool_executor (concurrent tool batches), memory_manager (background sync), delegate_tool (child timeout wrapper + batch fan-out), and skills_hub (source fan-out). Added a 30s exit watchdog (HERMES_EXIT_WATCHDOG_S) armed at _run_cleanup start as a backstop for wedged cleanup steps. 3. Exit jank: after prompt_toolkit tears down the input/status bars the terminal sat silent for the whole cleanup window, looking hung. Print 'Shutting down… (finalizing session)' immediately at exit start. E2E: live PTY interrupt of a foreground 'sleep 120' terminal tool now aborts in ~1s and the typed message runs as the next turn; wedged-worker + wedged-cleanup subprocess exits in 5.8s (watchdog) instead of hanging.
89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
"""Tests for tools.daemon_pool.DaemonThreadPoolExecutor.
|
|
|
|
The daemon pool exists so abandoned workers (interrupted/timed-out tool
batches, wedged memory-provider syncs) can never block interpreter exit:
stdlib ThreadPoolExecutor workers are non-daemon AND registered in
concurrent.futures.thread._threads_queues, whose atexit hook joins every
worker unconditionally — even after shutdown(wait=False).
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
import time
|
|
|
|
from concurrent.futures.thread import _threads_queues
|
|
|
|
from tools.daemon_pool import DaemonThreadPoolExecutor
|
|
|
|
|
|
def test_workers_are_daemon_threads():
    """Pool workers are daemon threads and are absent from ``_threads_queues``."""

    def _describe_current_thread():
        # Executed inside the pool, so this reports on the worker thread itself.
        me = threading.current_thread()
        return me.daemon, me

    executor = DaemonThreadPoolExecutor(max_workers=2)
    try:
        future = executor.submit(_describe_current_thread)
        daemon_flag, worker_thread = future.result(timeout=10)
        assert daemon_flag is True
        # Not registered with concurrent.futures' atexit join hook.
        assert worker_thread not in _threads_queues
    finally:
        executor.shutdown(wait=True)
|
|
|
|
|
|
def test_results_and_initializer_work_like_stdlib():
    """Futures carry the callable's result and the initializer receives initargs."""
    init_tags = []

    def _answer():
        return 41 + 1

    executor = DaemonThreadPoolExecutor(
        max_workers=1,
        # Appending straight to the list records each initializer call.
        initializer=init_tags.append,
        initargs=("t",),
    )
    try:
        future = executor.submit(_answer)
        assert future.result(timeout=10) == 42
        # A single task on a single-worker pool: the initializer ran once.
        assert init_tags == ["t"]
    finally:
        executor.shutdown(wait=True)
|
|
|
|
|
|
def test_idle_worker_reuse():
    """A task submitted after the previous one finished runs on the same thread."""
    executor = DaemonThreadPoolExecutor(max_workers=4)
    try:
        first = executor.submit(threading.get_ident)
        first_ident = first.result(timeout=10)
        # Let the worker park on the idle semaphore before submitting again.
        time.sleep(0.05)
        second = executor.submit(threading.get_ident)
        second_ident = second.result(timeout=10)
        assert first_ident == second_ident
    finally:
        executor.shutdown(wait=True)
|
|
|
|
|
|
def test_wedged_worker_does_not_block_interpreter_exit():
    """A worker stuck in a long sleep must not hold the process open.

    With stdlib ThreadPoolExecutor this subprocess hangs until the sleep
    finishes (the atexit hook joins the worker); with the daemon pool it
    exits as soon as the main thread returns.

    The child runs in a fresh interpreter so real interpreter shutdown is
    exercised. Its stderr is attached to every failure so that a broken
    import or a crash in the child can be diagnosed from the test report.
    """
    script = (
        "import sys; sys.path.insert(0, %r)\n"
        "from tools.daemon_pool import DaemonThreadPoolExecutor\n"
        "import time\n"
        "pool = DaemonThreadPoolExecutor(max_workers=1)\n"
        "pool.submit(time.sleep, 120)\n"
        "time.sleep(0.3)\n"
        "pool.shutdown(wait=False)\n"
        "print('main-done', flush=True)\n"
    ) % (str(_repo_root()),)
    try:
        proc = subprocess.run(
            [sys.executable, "-c", script],
            capture_output=True,
            text=True,
            # Far below the worker's 120s sleep: hitting this means the
            # child waited on the wedged worker instead of exiting.
            timeout=30,
        )
    except subprocess.TimeoutExpired as exc:
        # Report a hang as a test failure with context, not a raw error.
        raise AssertionError(
            "child interpreter did not exit within 30s "
            f"(stderr: {exc.stderr!r})"
        ) from exc
    assert proc.returncode == 0, proc.stderr
    assert "main-done" in proc.stdout, proc.stderr
|
|
|
|
|
|
def _repo_root():
    """Return the grandparent of the directory containing this file.

    NOTE(review): presumably the repository root, assuming this module lives
    two directories below it — confirm against the actual test layout.
    """
    from pathlib import Path

    this_file = Path(__file__).resolve()
    # parents[0] is the containing directory; parents[2] is two levels above it.
    return this_file.parents[2]
|