From df27267ed72cf495e2cbd7c3f19c9e46fb3cb128 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:59:37 +0530 Subject: [PATCH] fix(gateway): release PID file + runtime lock in the force-exit backstop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #54111. That PR routed the early SystemExit exit paths (clean-fatal-config #51228, startup-aborted-before-running) through _exit_after_graceful_shutdown / os._exit. Those paths raise right after runner.start() without going through _stop_impl, so they relied on atexit to release the PID file + runtime lock — and os._exit bypasses atexit, leaking both. Release them explicitly in the backstop (the single guaranteed cleanup chokepoint). Both calls are idempotent: no-op on the normal _stop_impl path, actual cleanup on the early-exit paths. Corrects the now-inaccurate docstring claim that teardown always ran first. Adds a guard test plus the missing str-code->1 coverage. E2E: real PID file written + lock acquired, _exit_after_graceful_shutdown(78) exits code 78 AND removes the PID file (leak confirmed closed). --- gateway/run.py | 29 +++++++++++---- tests/gateway/test_gateway_process_exit.py | 42 ++++++++++++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index e341292b5..d9d31acf8 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -19326,7 +19326,7 @@ def main(): def _exit_after_graceful_shutdown(exit_code: int) -> None: - """Flush stdio + logging, then hard-exit with ``exit_code``. + """Flush stdio, release the PID file + runtime lock, then hard-exit. Graceful teardown is already complete by the time this runs, so there is nothing left that needs a clean interpreter shutdown. We deliberately use @@ -19334,12 +19334,19 @@ def _exit_after_graceful_shutdown(exit_code: int) -> None: triggers ``Py_FinalizeEx`` → ``wait_for_thread_shutdown`` and joins every non-daemon thread — exactly the hang (#53107) a wedged tool-worker causes. - ``os._exit`` also bypasses ``atexit`` handlers. That is safe here: the - gateway's atexit-registered cleanup (``remove_pid_file``, - ``release_gateway_runtime_lock``) is also performed explicitly inside - ``start_gateway``'s teardown, so nothing is leaked. Any NEW atexit handler - added to this process would be skipped — perform such cleanup in the - teardown path, not via atexit. + ``os._exit`` bypasses ``atexit`` handlers, so we cannot rely on the + ``atexit``-registered ``remove_pid_file`` / ``release_gateway_runtime_lock`` + (registered in ``start_gateway``) to run. The full-shutdown path releases + both explicitly in ``_stop_impl``, but the EARLY exit paths — + clean-fatal-config (#51228) and startup-aborted-before-running — raise + ``SystemExit`` right after ``runner.start()`` without going through + ``_stop_impl``, so on those paths ``atexit`` was the only thing releasing + them. Now that those paths are routed through this backstop (#53107), + release both here explicitly. Both calls are idempotent — + ``remove_pid_file`` only unlinks a PID file that belongs to this process, + and ``release_gateway_runtime_lock`` no-ops when the lock is already + released — so this is a no-op on the normal shutdown path and the actual + cleanup on the early-exit paths. Logging is not flushed here: the gateway's handlers are synchronous ``RotatingFileHandler``s that write each record immediately (no @@ -19351,6 +19358,14 @@ def _exit_after_graceful_shutdown(exit_code: int) -> None: stream.flush() except Exception: pass + # Guaranteed cleanup chokepoint: os._exit skips atexit, and the early + # SystemExit exit paths never run _stop_impl, so release here (idempotent). + try: + from gateway.status import remove_pid_file, release_gateway_runtime_lock + remove_pid_file() + release_gateway_runtime_lock() + except Exception: + pass os._exit(exit_code) diff --git a/tests/gateway/test_gateway_process_exit.py b/tests/gateway/test_gateway_process_exit.py index 4eb6e2437..b9020a5d9 100644 --- a/tests/gateway/test_gateway_process_exit.py +++ b/tests/gateway/test_gateway_process_exit.py @@ -125,3 +125,45 @@ def test_main_systemexit_none_code_maps_to_zero(monkeypatch): gateway_run.main() assert exc_info.value.code == 0 + + +def test_main_systemexit_str_code_maps_to_one(monkeypatch): + """SystemExit with a str code (CPython prints it to stderr then exits 1). + We can't print during os._exit, but the code must still map to 1 — matching + CPython's handle_system_exit semantics for a non-int, non-None code.""" + async def fake_start_gateway(config=None): + raise SystemExit("fatal: something went wrong") + + monkeypatch.setattr(gateway_run, "start_gateway", fake_start_gateway) + monkeypatch.setattr(gateway_run.os, "_exit", _raise_exit) + monkeypatch.setattr(gateway_run.sys, "argv", ["gateway.run"]) + monkeypatch.setattr(gateway_run.sys, "stdout", SimpleNamespace(flush=Mock())) + monkeypatch.setattr(gateway_run.sys, "stderr", SimpleNamespace(flush=Mock())) + + with pytest.raises(_ExitCalled) as exc_info: + gateway_run.main() + + assert exc_info.value.code == 1 + + +def test_exit_backstop_releases_pid_file_and_runtime_lock(monkeypatch): + """os._exit bypasses atexit, and the early SystemExit exit paths never run + _stop_impl — so the force-exit backstop itself must release the PID file and + runtime lock, or those early paths (#51228 fatal-config) would leak them. + Both releases are idempotent, so this is safe on every exit path.""" + from gateway import status as gateway_status + + remove_pid = Mock() + release_lock = Mock() + monkeypatch.setattr(gateway_status, "remove_pid_file", remove_pid) + monkeypatch.setattr(gateway_status, "release_gateway_runtime_lock", release_lock) + monkeypatch.setattr(gateway_run.os, "_exit", _raise_exit) + monkeypatch.setattr(gateway_run.sys, "stdout", SimpleNamespace(flush=Mock())) + monkeypatch.setattr(gateway_run.sys, "stderr", SimpleNamespace(flush=Mock())) + + with pytest.raises(_ExitCalled) as exc_info: + gateway_run._exit_after_graceful_shutdown(78) + + assert exc_info.value.code == 78 + remove_pid.assert_called_once_with() + release_lock.assert_called_once_with()