fix(macos): compose launchd reload retry with _launchctl_bootstrap + drain-aware window
Reworks @valenteff's #53277 fix per review (Teknium's 3 findings): - Route refresh_launchd_plist_if_needed's bootstrap through the existing _launchctl_bootstrap() EIO-recovery helper (canonical since #56256), wrapped in a wall-clock retry loop, instead of an ad-hoc 5x2s loop. - Window sized to agent.restart_drain_timeout (default 180s), not a fixed ~10s: the failure happens while the old gateway is still draining (finding 1). - Retry on subprocess.TimeoutExpired too, not just CalledProcessError — a bootstrap timeout after bootout otherwise escapes and leaves the service unloaded (finding 2). - Confirm success with launchctl list, not a bare bootstrap exit 0 (finding 3); mirror verify+drain-window in the detached-helper bash path. - Shared helpers _launchd_reload_log_path / _append_launchd_reload_log / _launchctl_label_registered / _retry_launchctl_bootstrap_until_registered. 3 new tests cover retry-until-listed, TimeoutExpired-retried, deadline-exhaust. E2E: real reload log + mocked launchctl — retries CalledProcessError+TimeoutExpired, verifies via launchctl list, logs failures.
This commit is contained in:
parent
7a7d19e73b
commit
fb7a38ad21
2 changed files with 199 additions and 48 deletions
|
|
@ -3588,6 +3588,86 @@ def _launchctl_bootstrap(
|
|||
)
|
||||
|
||||
|
||||
def _launchd_reload_log_path() -> Path:
|
||||
"""Path the launchd reload watchdog tails for persistent-orphan detection."""
|
||||
return get_hermes_home() / "logs" / "launchd-reload.log"
|
||||
|
||||
|
||||
def _append_launchd_reload_log(message: str) -> None:
|
||||
"""Append a timestamped line to the launchd reload log (best-effort)."""
|
||||
path = _launchd_reload_log_path()
|
||||
try:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
from datetime import datetime as _dt
|
||||
|
||||
stamp = _dt.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
|
||||
with path.open("a", encoding="utf-8") as fh:
|
||||
fh.write(f"[{stamp}] {message}\n")
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _launchctl_label_registered(label: str) -> bool:
|
||||
"""True when ``launchctl list <label>`` reports the job as registered."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["launchctl", "list", label],
|
||||
check=False,
|
||||
timeout=10,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
return result.returncode == 0
|
||||
except (subprocess.TimeoutExpired, OSError):
|
||||
return False
|
||||
|
||||
|
||||
def _retry_launchctl_bootstrap_until_registered(
|
||||
domain: str, plist_path, label: str, *, deadline: float
|
||||
) -> bool:
|
||||
"""Bootstrap with retry until the label is registered or ``deadline`` passes.
|
||||
|
||||
Wraps :func:`_launchctl_bootstrap` (which already recovers the EIO
|
||||
"already loaded" case) in a wall-clock retry loop for the *transient*
|
||||
failure mode: under high load or a launchd race the bootstrap can fail
|
||||
even after ``bootout`` already tore down the prior registration, leaving
|
||||
the service orphaned from ``KeepAlive`` supervision. The reported incident
|
||||
happened during a graceful drain (default ``agent.restart_drain_timeout``
|
||||
= 180s), so a fixed ~10s window is too short — retry until ``deadline``.
|
||||
|
||||
Both ``CalledProcessError`` and ``TimeoutExpired`` are treated as
|
||||
retryable: a ``bootstrap`` that times out after ``bootout`` still leaves
|
||||
the service unloaded, so it must be retried, not allowed to escape. On
|
||||
each failure a timestamped line is appended to the reload log; success is
|
||||
confirmed with ``launchctl list`` (not merely a zero bootstrap exit).
|
||||
Returns True once the label is registered, False if the deadline is hit.
|
||||
"""
|
||||
attempt = 0
|
||||
while True:
|
||||
attempt += 1
|
||||
try:
|
||||
_launchctl_bootstrap(domain, plist_path, label, timeout=30)
|
||||
if _launchctl_label_registered(label):
|
||||
return True
|
||||
_append_launchd_reload_log(
|
||||
f"bootstrap attempt {attempt} exited 0 but {domain}/{label} "
|
||||
f"is not registered (launchctl list) — retrying"
|
||||
)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
_append_launchd_reload_log(
|
||||
f"bootstrap attempt {attempt} failed (rc={exc.returncode}) "
|
||||
f"for {domain}/{label} — retrying"
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
_append_launchd_reload_log(
|
||||
f"bootstrap attempt {attempt} timed out for {domain}/{label} "
|
||||
f"— retrying"
|
||||
)
|
||||
if time.monotonic() >= deadline:
|
||||
return False
|
||||
time.sleep(2)
|
||||
|
||||
|
||||
# ── launchd unsupported marker ─────────────────────────────────────────────
|
||||
# When launchd can't manage the domain on this host (error 5/125, macOS 26+),
|
||||
# we write a persistent marker so `launchd_status()` can explain that launchd
|
||||
|
|
@ -3871,18 +3951,25 @@ def refresh_launchd_plist_if_needed() -> bool:
|
|||
reload_log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
# Retry until launchctl LISTS the label (not merely a zero bootstrap
|
||||
# exit) or the drain window elapses. The failure happens while the old
|
||||
# gateway is still draining (default agent.restart_drain_timeout=180s),
|
||||
# so a fixed ~10s window is too short — bound by that budget instead.
|
||||
_reload_budget = int(max(30.0, _get_restart_drain_timeout()))
|
||||
reload_script = (
|
||||
f"sleep 2; "
|
||||
f"launchctl bootout {shlex.quote(target)} 2>/dev/null; "
|
||||
f"sleep 1; "
|
||||
f"for _i in 1 2 3 4 5; do "
|
||||
f" launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null && break; "
|
||||
f" rc=$?; "
|
||||
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] bootstrap attempt $_i failed (rc=$rc) for {shlex.quote(target)}\" >> {shlex.quote(str(reload_log_path))}; "
|
||||
f"_deadline=$(($(date +%s) + {_reload_budget})); "
|
||||
f"while :; do "
|
||||
f" launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null; "
|
||||
f" if launchctl list {shlex.quote(label)} >/dev/null 2>&1; then break; fi; "
|
||||
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] bootstrap not yet registered for {shlex.quote(target)} — retrying\" >> {shlex.quote(str(reload_log_path))}; "
|
||||
f" if [ $(date +%s) -ge $_deadline ]; then break; fi; "
|
||||
f" sleep 2; "
|
||||
f"done; "
|
||||
f"if ! launchctl list {shlex.quote(label)} >/dev/null 2>&1; then "
|
||||
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] FAILED launchd reload for {shlex.quote(target)} — service NOT registered after retries\" >> {shlex.quote(str(reload_log_path))}; "
|
||||
f" echo \"[$(date '+%Y-%m-%d %H:%M:%S %z')] FAILED launchd reload for {shlex.quote(target)} — service NOT registered after {_reload_budget}s of retries\" >> {shlex.quote(str(reload_log_path))}; "
|
||||
f"fi"
|
||||
)
|
||||
try:
|
||||
|
|
@ -3901,54 +3988,39 @@ def refresh_launchd_plist_if_needed() -> bool:
|
|||
)
|
||||
return True
|
||||
|
||||
# Bootout/bootstrap so launchd picks up the new definition. Retry the
|
||||
# bootstrap on transient failure — under load (high loadavg, launchd
|
||||
# busy), bootstrap can return non-zero even though bootout already tore
|
||||
# down the prior registration, leaving the service orphan from KeepAlive
|
||||
# supervision. Mirrors the retry+verify+log contract of the detached
|
||||
# helper above. See hermes-restart rootcause handoff (2026-06-26).
|
||||
# Bootout/bootstrap so launchd picks up the new definition. The reported
|
||||
# incident (2026-06-26) happened when bootout succeeded but bootstrap
|
||||
# failed silently under load (loadavg 9.48) during a graceful /restart
|
||||
# drain, leaving the service unregistered — KeepAlive can't revive a job
|
||||
# launchd no longer knows about. Retry the bootstrap (via the shared
|
||||
# _launchctl_bootstrap EIO-recovery helper) until the label is actually
|
||||
# registered or the drain window elapses, verify with `launchctl list`,
|
||||
# and log exhaustion so the reload watchdog can detect a persistent orphan.
|
||||
subprocess.run(
|
||||
["launchctl", "bootout", target],
|
||||
check=False,
|
||||
timeout=90,
|
||||
)
|
||||
_bootstrap_ok = False
|
||||
for _attempt in range(1, 6):
|
||||
try:
|
||||
subprocess.run(
|
||||
["launchctl", "bootstrap", domain, str(plist_path)],
|
||||
check=True,
|
||||
timeout=30,
|
||||
)
|
||||
_bootstrap_ok = True
|
||||
break
|
||||
except subprocess.CalledProcessError as _e:
|
||||
logger.warning(
|
||||
"launchctl bootstrap attempt %d/5 failed (rc=%s) for %s%s",
|
||||
_attempt,
|
||||
_e.returncode,
|
||||
target,
|
||||
"; retrying in 2s" if _attempt < 5 else "; no retries left",
|
||||
)
|
||||
if _attempt < 5:
|
||||
import time as _bootstrap_sleep
|
||||
|
||||
_bootstrap_sleep.sleep(2)
|
||||
if not _bootstrap_ok:
|
||||
_reload_log_path = get_hermes_home() / "logs" / "launchd-reload.log"
|
||||
try:
|
||||
_reload_log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
from datetime import datetime as _dt
|
||||
|
||||
with _reload_log_path.open("a", encoding="utf-8") as _f:
|
||||
_f.write(
|
||||
f"[{_dt.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %z')}] "
|
||||
f"FAILED launchd reload of {target} — bootstrap did not "
|
||||
f"succeed after 5 attempts (refresh ran outside gateway "
|
||||
f"process tree)\n"
|
||||
)
|
||||
except OSError:
|
||||
pass
|
||||
# Size the retry window to the restart drain timeout (default 180s), not a
|
||||
# fixed ~10s: the failure mode occurs while the old gateway is still
|
||||
# draining, so a short window can exhaust before launchd settles.
|
||||
_reload_budget = max(30.0, _get_restart_drain_timeout())
|
||||
_deadline = time.monotonic() + _reload_budget
|
||||
if not _retry_launchctl_bootstrap_until_registered(
|
||||
domain, plist_path, label, deadline=_deadline
|
||||
):
|
||||
_append_launchd_reload_log(
|
||||
f"FAILED launchd reload of {target} — service NOT registered after "
|
||||
f"retrying for {int(_reload_budget)}s (refresh ran outside gateway "
|
||||
f"process tree)"
|
||||
)
|
||||
logger.error(
|
||||
"launchd reload of %s failed — service not registered after %ds of "
|
||||
"retries; see %s",
|
||||
target,
|
||||
int(_reload_budget),
|
||||
_launchd_reload_log_path(),
|
||||
)
|
||||
print(
|
||||
"↻ Updated gateway launchd service definition to match the current Hermes install"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3469,3 +3469,82 @@ class TestLaunchctlBootstrapEioRetry:
|
|||
assert excinfo.value.returncode == 125
|
||||
# A non-EIO failure is not the already-loaded case: no bootout/retry.
|
||||
assert calls == [["launchctl", "bootstrap", self.DOMAIN, self.PLIST]]
|
||||
|
||||
|
||||
class TestRetryLaunchctlBootstrapUntilRegistered:
|
||||
"""`_retry_launchctl_bootstrap_until_registered` — salvage of #53277.
|
||||
|
||||
Covers the three review findings the salvage hardens: retry until the
|
||||
label is actually LISTED (not just a zero bootstrap exit), TimeoutExpired
|
||||
is retried (not escaped leaving the service unloaded), and the retry is
|
||||
bounded by a wall-clock deadline rather than a fixed short window.
|
||||
"""
|
||||
|
||||
DOMAIN = "gui/501"
|
||||
PLIST = "/tmp/ai.hermes.gateway.plist"
|
||||
LABEL = "ai.hermes.gateway"
|
||||
|
||||
def test_returns_true_once_label_is_registered(self, monkeypatch):
|
||||
"""Success requires launchctl list to confirm registration, not just
|
||||
a zero bootstrap exit."""
|
||||
list_results = iter([1, 0]) # first check: not registered, second: registered
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd[:2] == ["launchctl", "list"]:
|
||||
return SimpleNamespace(returncode=next(list_results))
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
|
||||
|
||||
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
|
||||
self.DOMAIN, self.PLIST, self.LABEL,
|
||||
deadline=gateway_cli.time.monotonic() + 60,
|
||||
)
|
||||
assert ok is True
|
||||
|
||||
def test_timeout_expired_is_retried_not_escaped(self, monkeypatch):
|
||||
"""A bootstrap that times out must be retried — it leaves the service
|
||||
unloaded, so it must not escape the retry/log path (finding #2)."""
|
||||
attempts = {"bootstrap": 0}
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd[1] == "bootstrap":
|
||||
attempts["bootstrap"] += 1
|
||||
if attempts["bootstrap"] == 1:
|
||||
raise subprocess.TimeoutExpired(cmd, kwargs.get("timeout", 30))
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
if cmd[:2] == ["launchctl", "list"]:
|
||||
# registered only after the second (successful) bootstrap
|
||||
return SimpleNamespace(returncode=0 if attempts["bootstrap"] >= 2 else 1)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
|
||||
|
||||
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
|
||||
self.DOMAIN, self.PLIST, self.LABEL,
|
||||
deadline=gateway_cli.time.monotonic() + 60,
|
||||
)
|
||||
assert ok is True
|
||||
assert attempts["bootstrap"] >= 2 # the timeout was retried, not raised
|
||||
|
||||
def test_returns_false_when_deadline_exhausts(self, monkeypatch):
|
||||
"""When the label never registers, the loop stops at the deadline and
|
||||
returns False (so the caller logs the persistent orphan)."""
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd[:2] == ["launchctl", "list"]:
|
||||
return SimpleNamespace(returncode=1) # never registered
|
||||
if cmd[1] == "bootstrap":
|
||||
raise subprocess.CalledProcessError(1, cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr(gateway_cli.time, "sleep", lambda *_a, **_k: None)
|
||||
|
||||
# Deadline already in the past → exactly one attempt, then give up.
|
||||
ok = gateway_cli._retry_launchctl_bootstrap_until_registered(
|
||||
self.DOMAIN, self.PLIST, self.LABEL,
|
||||
deadline=gateway_cli.time.monotonic() - 1,
|
||||
)
|
||||
assert ok is False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue