fix(gateway): release PID file + runtime lock in the force-exit backstop

Follow-up to #54111. That PR routed the early SystemExit exit paths
(clean-fatal-config #51228, startup-aborted-before-running) through
_exit_after_graceful_shutdown / os._exit. Those paths raise right after
runner.start() without going through _stop_impl, so they relied on atexit
to release the PID file + runtime lock — and os._exit bypasses atexit,
leaking both.

Release them explicitly in the backstop (the single guaranteed cleanup
chokepoint). Both calls are idempotent: no-op on the normal _stop_impl
path, actual cleanup on the early-exit paths. Corrects the now-inaccurate
docstring claim that teardown always ran first. Adds a guard test plus the
missing str-code->1 coverage.

E2E: real PID file written + lock acquired, _exit_after_graceful_shutdown(78)
exits code 78 AND removes the PID file (leak confirmed closed).
This commit is contained in:
kshitijk4poor 2026-07-01 15:59:37 +05:30
parent e23f723389
commit df27267ed7
2 changed files with 64 additions and 7 deletions

View file

@ -19326,7 +19326,7 @@ def main():
def _exit_after_graceful_shutdown(exit_code: int) -> None:
"""Flush stdio + logging, then hard-exit with ``exit_code``.
"""Flush stdio, release the PID file + runtime lock, then hard-exit.
Graceful teardown is already complete by the time this runs, so there is
nothing left that needs a clean interpreter shutdown. We deliberately use
@ -19334,12 +19334,19 @@ def _exit_after_graceful_shutdown(exit_code: int) -> None:
triggers ``Py_FinalizeEx`` ``wait_for_thread_shutdown`` and joins every
non-daemon thread exactly the hang (#53107) a wedged tool-worker causes.
``os._exit`` also bypasses ``atexit`` handlers. That is safe here: the
gateway's atexit-registered cleanup (``remove_pid_file``,
``release_gateway_runtime_lock``) is also performed explicitly inside
``start_gateway``'s teardown, so nothing is leaked. Any NEW atexit handler
added to this process would be skipped perform such cleanup in the
teardown path, not via atexit.
``os._exit`` bypasses ``atexit`` handlers, so we cannot rely on the
``atexit``-registered ``remove_pid_file`` / ``release_gateway_runtime_lock``
(registered in ``start_gateway``) to run. The full-shutdown path releases
both explicitly in ``_stop_impl``, but the EARLY exit paths
clean-fatal-config (#51228) and startup-aborted-before-running — raise
``SystemExit`` right after ``runner.start()`` without going through
``_stop_impl``, so on those paths ``atexit`` was the only thing releasing
them. Now that those paths are routed through this backstop (#53107),
release both here explicitly. Both calls are idempotent
``remove_pid_file`` only unlinks a PID file that belongs to this process,
and ``release_gateway_runtime_lock`` no-ops when the lock is already
released so this is a no-op on the normal shutdown path and the actual
cleanup on the early-exit paths.
Logging is not flushed here: the gateway's handlers are synchronous
``RotatingFileHandler``s that write each record immediately (no
@ -19351,6 +19358,14 @@ def _exit_after_graceful_shutdown(exit_code: int) -> None:
stream.flush()
except Exception:
pass
# Guaranteed cleanup chokepoint: os._exit skips atexit, and the early
# SystemExit exit paths never run _stop_impl, so release here (idempotent).
try:
from gateway.status import remove_pid_file, release_gateway_runtime_lock
remove_pid_file()
release_gateway_runtime_lock()
except Exception:
pass
os._exit(exit_code)

View file

@ -125,3 +125,45 @@ def test_main_systemexit_none_code_maps_to_zero(monkeypatch):
gateway_run.main()
assert exc_info.value.code == 0
def test_main_systemexit_str_code_maps_to_one(monkeypatch):
"""SystemExit with a str code (CPython prints it to stderr then exits 1).
We can't print during os._exit, but the code must still map to 1 — matching
CPython's handle_system_exit semantics for a non-int, non-None code."""
async def fake_start_gateway(config=None):
raise SystemExit("fatal: something went wrong")
monkeypatch.setattr(gateway_run, "start_gateway", fake_start_gateway)
monkeypatch.setattr(gateway_run.os, "_exit", _raise_exit)
monkeypatch.setattr(gateway_run.sys, "argv", ["gateway.run"])
monkeypatch.setattr(gateway_run.sys, "stdout", SimpleNamespace(flush=Mock()))
monkeypatch.setattr(gateway_run.sys, "stderr", SimpleNamespace(flush=Mock()))
with pytest.raises(_ExitCalled) as exc_info:
gateway_run.main()
assert exc_info.value.code == 1
def test_exit_backstop_releases_pid_file_and_runtime_lock(monkeypatch):
"""os._exit bypasses atexit, and the early SystemExit exit paths never run
_stop_impl so the force-exit backstop itself must release the PID file and
runtime lock, or those early paths (#51228 fatal-config) would leak them.
Both releases are idempotent, so this is safe on every exit path."""
from gateway import status as gateway_status
remove_pid = Mock()
release_lock = Mock()
monkeypatch.setattr(gateway_status, "remove_pid_file", remove_pid)
monkeypatch.setattr(gateway_status, "release_gateway_runtime_lock", release_lock)
monkeypatch.setattr(gateway_run.os, "_exit", _raise_exit)
monkeypatch.setattr(gateway_run.sys, "stdout", SimpleNamespace(flush=Mock()))
monkeypatch.setattr(gateway_run.sys, "stderr", SimpleNamespace(flush=Mock()))
with pytest.raises(_ExitCalled) as exc_info:
gateway_run._exit_after_graceful_shutdown(78)
assert exc_info.value.code == 78
remove_pid.assert_called_once_with()
release_lock.assert_called_once_with()