fix(macos): compose launchd reload retry with _launchctl_bootstrap + drain-aware window

Reworks @valenteff's #53277 fix per review (Teknium's 3 findings):
- Route refresh_launchd_plist_if_needed's bootstrap through the existing
  _launchctl_bootstrap() EIO-recovery helper (canonical since #56256),
  wrapped in a wall-clock retry loop, instead of an ad-hoc 5x2s loop.
- Window sized to agent.restart_drain_timeout (default 180s), not a fixed
  ~10s: the failure happens while the old gateway is still draining (finding 1).
- Retry on subprocess.TimeoutExpired too, not just CalledProcessError — a
  bootstrap timeout after bootout otherwise escapes and leaves the service
  unloaded (finding 2).
- Confirm success with launchctl list, not a bare bootstrap exit 0 (finding 3);
  mirror verify+drain-window in the detached-helper bash path.
- Shared helpers _launchd_reload_log_path / _append_launchd_reload_log /
  _launchctl_label_registered / _retry_launchctl_bootstrap_until_registered.

3 new tests cover retry-until-listed, TimeoutExpired-retried, deadline-exhaust.
E2E: real reload log + mocked launchctl — retries CalledProcessError+TimeoutExpired,
verifies via launchctl list, logs failures.
This commit is contained in:
kshitijk4poor 2026-07-01 16:38:31 +05:30 committed by kshitij
parent 7a7d19e73b
commit fb7a38ad21
2 changed files with 199 additions and 48 deletions

View file

@ -3588,6 +3588,86 @@ def _launchctl_bootstrap(
)
def _launchd_reload_log_path() -> Path:
"""Path the launchd reload watchdog tails for persistent-orphan detection."""
return get_hermes_home() / "logs" / "launchd-reload.log"
def _append_launchd_reload_log(message: str) -> None:
"""Append a timestamped line to the launchd reload log (best-effort)."""
path = _launchd_reload_log_path()
try:
path.parent.mkdir(parents=True, exist_ok=True)
from datetime import datetime as _dt
stamp = _dt.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
with path.open("a", encoding="utf-8") as fh:
fh.write(f"[{stamp}] {message}\n")
except OSError:
pass
def _launchctl_label_registered(label: str) -> bool:
"""True when ``launchctl list <label>`` reports the job as registered."""
try:
result = subprocess.run(
["launchctl", "list", label],
check=False,
timeout=10,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
return result.returncode == 0
except (subprocess.TimeoutExpired, OSError):
return False
def _retry_launchctl_bootstrap_until_registered(
domain: str, plist_path, label: str, *, deadline: float
) -> bool:
"""Bootstrap with retry until the label is registered or ``deadline`` passes.
Wraps :func:`_launchctl_bootstrap` (which already recovers the EIO
"already loaded" case) in a wall-clock retry loop for the *transient*
failure mode: under high load or a launchd race the bootstrap can fail
even after ``bootout`` already tore down the prior registration, leaving
the service orphaned from ``KeepAlive`` supervision. The reported incident
happened during a graceful drain (default ``agent.restart_drain_timeout``
= 180s), so a fixed ~10s window is too short retry until ``deadline``.
Both ``CalledProcessError`` and ``TimeoutExpired`` are treated as
retryable: a ``bootstrap`` that times out after ``bootout`` still leaves
the service unloaded, so it must be retried, not allowed to escape. On
each failure a timestamped line is appended to the reload log; success is
confirmed with ``launchctl list`` (not merely a zero bootstrap exit).
Returns True once the label is registered, False if the deadline is hit.
"""
attempt = 0
while True:
attempt += 1
try:
_launchctl_bootstrap(domain, plist_path, label, timeout=30)
if _launchctl_label_registered(label):
return True
_append_launchd_reload_log(
f"bootstrap attempt {attempt} exited 0 but {domain}/{label} "
f"is not registered (launchctl list) — retrying"
)
except subprocess.CalledProcessError as exc:
_append_launchd_reload_log(
f"bootstrap attempt {attempt} failed (rc={exc.returncode}) "
f"for {domain}/{label} — retrying"
)
except subprocess.TimeoutExpired:
_append_launchd_reload_log(
f"bootstrap attempt {attempt} timed out for {domain}/{label} "
f"— retrying"
)
if time.monotonic() >= deadline:
return False
time.sleep(2)
# ── launchd unsupported marker ─────────────────────────────────────────────
# When launchd can't manage the domain on this host (error 5/125, macOS 26+),
# we write a persistent marker so `launchd_status()` can explain that launchd
@ -3871,18 +3951,25 @@ def refresh_launchd_plist_if_needed() -> bool:
reload_log_path.parent.mkdir(parents=True, exist_ok=True)
except OSError:
pass
# Retry until launchctl LISTS the label (not merely a zero bootstrap
# exit) or the drain window elapses. The failure happens while the old
# gateway is still draining (default agent.restart_drain_timeout=180s),
# so a fixed ~10s window is too short — bound by that budget instead.
_reload_budget = int(max(30.0, _get_restart_drain_timeout()))
reload_script = (
f"sleep 2; "
f"launchctl bootout {shlex.quote(target)} 2>/dev/null; "
f"sleep 1; "
f"for _i in 1 2 3 4 5; do "
f" launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null && break; "
f" rc=$?; "
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] bootstrap attempt $_i failed (rc=$rc) for {shlex.quote(target)}\" >> {shlex.quote(str(reload_log_path))}; "
f"_deadline=$(($(date +%s) + {_reload_budget})); "
f"while :; do "
f" launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null; "
f" if launchctl list {shlex.quote(label)} >/dev/null 2>&1; then break; fi; "
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] bootstrap not yet registered for {shlex.quote(target)} — retrying\" >> {shlex.quote(str(reload_log_path))}; "
f" if [ $(date +%s) -ge $_deadline ]; then break; fi; "
f" sleep 2; "
f"done; "
f"if ! launchctl list {shlex.quote(label)} >/dev/null 2>&1; then "
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] FAILED launchd reload for {shlex.quote(target)} — service NOT registered after retries\" >> {shlex.quote(str(reload_log_path))}; "
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] FAILED launchd reload for {shlex.quote(target)} — service NOT registered after {_reload_budget}s of retries\" >> {shlex.quote(str(reload_log_path))}; "
f"fi"
)
try:
@ -3901,54 +3988,39 @@ def refresh_launchd_plist_if_needed() -> bool:
)
return True
# Bootout/bootstrap so launchd picks up the new definition. Retry the
# bootstrap on transient failure — under load (high loadavg, launchd
# busy), bootstrap can return non-zero even though bootout already tore
# down the prior registration, leaving the service orphan from KeepAlive
# supervision. Mirrors the retry+verify+log contract of the detached
# helper above. See hermes-restart rootcause handoff (2026-06-26).
# Bootout/bootstrap so launchd picks up the new definition. The reported
# incident (2026-06-26) happened when bootout succeeded but bootstrap
# failed silently under load (loadavg 9.48) during a graceful /restart
# drain, leaving the service unregistered — KeepAlive can't revive a job
# launchd no longer knows about. Retry the bootstrap (via the shared
# _launchctl_bootstrap EIO-recovery helper) until the label is actually
# registered or the drain window elapses, verify with `launchctl list`,
# and log exhaustion so the reload watchdog can detect a persistent orphan.
subprocess.run(
["launchctl", "bootout", target],
check=False,
timeout=90,
)
_bootstrap_ok = False
for _attempt in range(1, 6):
try:
subprocess.run(
["launchctl", "bootstrap", domain, str(plist_path)],
check=True,
timeout=30,
)
_bootstrap_ok = True
break
except subprocess.CalledProcessError as _e:
logger.warning(
"launchctl bootstrap attempt %d/5 failed (rc=%s) for %s%s",
_attempt,
_e.returncode,
target,
"; retrying in 2s" if _attempt < 5 else "; no retries left",
)
if _attempt < 5:
import time as _bootstrap_sleep
_bootstrap_sleep.sleep(2)
if not _bootstrap_ok:
_reload_log_path = get_hermes_home() / "logs" / "launchd-reload.log"
try:
_reload_log_path.parent.mkdir(parents=True, exist_ok=True)
from datetime import datetime as _dt
with _reload_log_path.open("a", encoding="utf-8") as _f:
_f.write(
f"[{_dt.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %z')}] "
f"FAILED launchd reload of {target} — bootstrap did not "
f"succeed after 5 attempts (refresh ran outside gateway "
f"process tree)\n"
)
except OSError:
pass
# Size the retry window to the restart drain timeout (default 180s), not a
# fixed ~10s: the failure mode occurs while the old gateway is still
# draining, so a short window can exhaust before launchd settles.
_reload_budget = max(30.0, _get_restart_drain_timeout())
_deadline = time.monotonic() + _reload_budget
if not _retry_launchctl_bootstrap_until_registered(
domain, plist_path, label, deadline=_deadline
):
_append_launchd_reload_log(
f"FAILED launchd reload of {target} — service NOT registered after "
f"retrying for {int(_reload_budget)}s (refresh ran outside gateway "
f"process tree)"
)
logger.error(
"launchd reload of %s failed — service not registered after %ds of "
"retries; see %s",
target,
int(_reload_budget),
_launchd_reload_log_path(),
)
print(
"↻ Updated gateway launchd service definition to match the current Hermes install"
)

View file

@ -3469,3 +3469,82 @@ class TestLaunchctlBootstrapEioRetry:
assert excinfo.value.returncode == 125
# A non-EIO failure is not the already-loaded case: no bootout/retry.
assert calls == [["launchctl", "bootstrap", self.DOMAIN, self.PLIST]]
class TestRetryLaunchctlBootstrapUntilRegistered:
"""`_retry_launchctl_bootstrap_until_registered` — salvage of #53277.
Covers the three review findings the salvage hardens: retry until the
label is actually LISTED (not just a zero bootstrap exit), TimeoutExpired
is retried (not escaped leaving the service unloaded), and the retry is
bounded by a wall-clock deadline rather than a fixed short window.
"""
DOMAIN = "gui/501"
PLIST = "/tmp/ai.hermes.gateway.plist"
LABEL = "ai.hermes.gateway"
def test_returns_true_once_label_is_registered(self, monkeypatch):
"""Success requires launchctl list to confirm registration, not just
a zero bootstrap exit."""
list_results = iter([1, 0]) # first check: not registered, second: registered
def fake_run(cmd, check=False, **kwargs):
if cmd[:2] == ["launchctl", "list"]:
return SimpleNamespace(returncode=next(list_results))
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
self.DOMAIN, self.PLIST, self.LABEL,
deadline=gateway_cli.time.monotonic() + 60,
)
assert ok is True
def test_timeout_expired_is_retried_not_escaped(self, monkeypatch):
"""A bootstrap that times out must be retried — it leaves the service
unloaded, so it must not escape the retry/log path (finding #2)."""
attempts = {"bootstrap": 0}
def fake_run(cmd, check=False, **kwargs):
if cmd[1] == "bootstrap":
attempts["bootstrap"] += 1
if attempts["bootstrap"] == 1:
raise subprocess.TimeoutExpired(cmd, kwargs.get("timeout", 30))
return SimpleNamespace(returncode=0, stdout="", stderr="")
if cmd[:2] == ["launchctl", "list"]:
# registered only after the second (successful) bootstrap
return SimpleNamespace(returncode=0 if attempts["bootstrap"] >= 2 else 1)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
self.DOMAIN, self.PLIST, self.LABEL,
deadline=gateway_cli.time.monotonic() + 60,
)
assert ok is True
assert attempts["bootstrap"] >= 2 # the timeout was retried, not raised
def test_returns_false_when_deadline_exhausts(self, monkeypatch):
"""When the label never registers, the loop stops at the deadline and
returns False (so the caller logs the persistent orphan)."""
def fake_run(cmd, check=False, **kwargs):
if cmd[:2] == ["launchctl", "list"]:
return SimpleNamespace(returncode=1) # never registered
if cmd[1] == "bootstrap":
raise subprocess.CalledProcessError(1, cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
# Deadline already in the past → exactly one attempt, then give up.
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
self.DOMAIN, self.PLIST, self.LABEL,
deadline=gateway_cli.time.monotonic() - 1,
)
assert ok is False