fix(cli): stop profile-bound backends before deleting so rmtree converges
delete_profile stopped only the process named in gateway.pid, but a Desktop app spawns a headless `serve`/`dashboard` backend per profile that holds the profile's SQLite connection open and keeps writing sessions/WAL/sandbox files. That backend is never in gateway.pid, so a CLI `hermes profile delete` run while the Desktop app is up left it writing into the tree — rmtree's final rmdir then failed with ENOTEMPTY (#47368 "Bug 2"), and pre-guard it also resurrected the directory. - _profile_bound_backend_pids(): find running Hermes backends bound to this profile via a `--profile <name>` selector or a HERMES_HOME env resolving to the profile dir. Tightly scoped — current-user only, backend subcommands (serve/dashboard/gateway) only so an interactive chat is never killed, and never this process or its ancestors. - _stop_profile_backends(): terminate them (graceful, then force), best-effort so it can never make delete worse. - _rmtree_with_retry(): a few spaced retries absorb the ENOTEMPTY / Windows file-lock race from a just-terminated writer's in-flight -wal/-shm/sandbox writes instead of failing the whole delete on a race the next attempt wins. Complements the recreation guard (deleted profiles no longer reappear) and the Desktop teardown-before-delete flow; this is the CLI-side convergence fix for a delete run while a Desktop-managed backend is live. Part of #47368.
This commit is contained in:
parent
5a6720b884
commit
1501a338c3
2 changed files with 275 additions and 5 deletions
|
|
@ -1257,6 +1257,189 @@ def backfill_profile_envs(quiet: bool = False) -> List[str]:
|
|||
return backfilled
|
||||
|
||||
|
||||
def _profile_bound_backend_pids(canon: str, profile_dir: Path) -> list[int]:
|
||||
"""PIDs of running Hermes *backends* bound to this profile.
|
||||
|
||||
The ``gateway.pid`` file only tracks the messaging gateway. A Desktop app
|
||||
spawns a headless ``serve`` (or legacy ``dashboard --no-open``) backend per
|
||||
profile that holds the profile's SQLite connection open and keeps writing
|
||||
sessions/WAL/sandbox files — the writer that makes ``rmtree`` hit
|
||||
``ENOTEMPTY`` (and, pre-fix, resurrected the tree). ``gateway.pid`` never
|
||||
names it, so find it by inspection: a Hermes backend subcommand
|
||||
(``serve``/``dashboard``/``gateway``) that is bound to *this* profile either
|
||||
by a ``--profile <canon>`` / ``-p <canon>`` selector or by a ``HERMES_HOME``
|
||||
that resolves to ``profile_dir``.
|
||||
|
||||
Best-effort and tightly scoped: current-user processes only, backend
|
||||
subcommands only (never an interactive ``chat``/``tui``), and never this
|
||||
process or its ancestors. Returns an empty list if ``psutil`` can't
|
||||
inspect anything.
|
||||
"""
|
||||
try:
|
||||
import psutil # type: ignore
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
try:
|
||||
resolved_dir = profile_dir.resolve()
|
||||
except OSError:
|
||||
resolved_dir = profile_dir
|
||||
|
||||
# Never terminate ourselves or a parent (e.g. `hermes -p <canon> profile
|
||||
# delete` runs under the very profile it's deleting).
|
||||
skip: set[int] = {os.getpid()}
|
||||
try:
|
||||
parent = psutil.Process(os.getpid()).parent()
|
||||
while parent is not None:
|
||||
skip.add(parent.pid)
|
||||
parent = parent.parent()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
current_user = psutil.Process(os.getpid()).username()
|
||||
except Exception:
|
||||
current_user = None
|
||||
|
||||
backend_tokens = {"serve", "dashboard", "gateway"}
|
||||
hermes_markers = ("hermes_cli.main", "hermes-gateway", "tui_gateway")
|
||||
pids: list[int] = []
|
||||
|
||||
for proc in psutil.process_iter(["pid", "name", "username", "cmdline"]):
|
||||
try:
|
||||
info = proc.info
|
||||
pid = info.get("pid")
|
||||
if pid is None or pid in skip:
|
||||
continue
|
||||
if current_user is not None and info.get("username") != current_user:
|
||||
continue
|
||||
|
||||
argv = info.get("cmdline") or []
|
||||
if not argv:
|
||||
continue
|
||||
|
||||
# Must be a Hermes process: either an entrypoint marker in argv, or
|
||||
# a resolved executable named `hermes`.
|
||||
joined = " ".join(argv)
|
||||
exe_name = os.path.basename(argv[0]).lower()
|
||||
is_hermes = (
|
||||
any(marker in joined for marker in hermes_markers)
|
||||
or exe_name == "hermes"
|
||||
or exe_name.startswith("hermes")
|
||||
)
|
||||
if not is_hermes:
|
||||
continue
|
||||
|
||||
# Restrict to backend subcommands so we never kill an interactive
|
||||
# session the user is deliberately running.
|
||||
tokens = {tok.lower() for tok in argv}
|
||||
if not (tokens & backend_tokens):
|
||||
continue
|
||||
|
||||
# Bound to THIS profile — by selector flag in argv...
|
||||
bound = False
|
||||
for i, tok in enumerate(argv):
|
||||
if tok in {"--profile", "-p"} and i + 1 < len(argv):
|
||||
if normalize_profile_name(argv[i + 1]) == canon:
|
||||
bound = True
|
||||
break
|
||||
elif tok.startswith("--profile="):
|
||||
if normalize_profile_name(tok.split("=", 1)[1]) == canon:
|
||||
bound = True
|
||||
break
|
||||
|
||||
# ...or by HERMES_HOME env pointing at this profile dir.
|
||||
if not bound:
|
||||
try:
|
||||
env_home = (proc.environ() or {}).get("HERMES_HOME", "")
|
||||
if env_home and Path(env_home).resolve() == resolved_dir:
|
||||
bound = True
|
||||
except Exception:
|
||||
# environ() can raise AccessDenied even same-user on some
|
||||
# platforms; fall back to the argv signal only.
|
||||
pass
|
||||
|
||||
if bound:
|
||||
pids.append(pid)
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
||||
continue
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return pids
|
||||
|
||||
|
||||
def _stop_profile_backends(canon: str, profile_dir: Path) -> None:
|
||||
"""Terminate any Desktop-spawned / stray backends bound to this profile.
|
||||
|
||||
Complements ``_stop_gateway_process`` (which only knows ``gateway.pid``):
|
||||
without this, a live ``serve``/``dashboard`` backend keeps creating files
|
||||
under the profile dir while ``rmtree`` walks it, so the final ``rmdir``
|
||||
fails with ``ENOTEMPTY`` and the delete doesn't converge. Best-effort:
|
||||
any failure is reported and swallowed so it never makes delete worse.
|
||||
"""
|
||||
pids = _profile_bound_backend_pids(canon, profile_dir)
|
||||
if not pids:
|
||||
return
|
||||
|
||||
try:
|
||||
from gateway.status import _pid_exists, terminate_pid as _terminate_pid
|
||||
except Exception:
|
||||
return
|
||||
|
||||
for pid in pids:
|
||||
try:
|
||||
_terminate_pid(pid) # graceful first
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
continue
|
||||
|
||||
# Wait up to 10s for graceful exit, then force-kill stragglers.
|
||||
deadline = time.time() + 10.0
|
||||
while time.time() < deadline:
|
||||
if not any(_pid_exists(pid) for pid in pids):
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
for pid in pids:
|
||||
if _pid_exists(pid):
|
||||
try:
|
||||
_terminate_pid(pid, force=True)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
|
||||
print(f"✓ Stopped {len(pids)} profile backend process(es)")
|
||||
|
||||
|
||||
def _rmtree_with_retry(profile_dir: Path, onexc_handler) -> None:
|
||||
"""``shutil.rmtree`` with a short retry loop for transient races.
|
||||
|
||||
Even after stopping the gateway and profile backends, a just-terminated
|
||||
process can leave in-flight writes (SQLite ``-wal``/``-shm`` checkpoints,
|
||||
sandbox temp files) that land after ``rmtree`` has walked past a directory,
|
||||
surfacing as ``ENOTEMPTY`` (POSIX) or a transient ``PermissionError``
|
||||
(Windows file lock still releasing). A few spaced retries let those settle
|
||||
instead of failing the whole delete on a race the next attempt would win.
|
||||
"""
|
||||
attempts = 3
|
||||
last_exc: OSError | None = None
|
||||
for attempt in range(attempts):
|
||||
try:
|
||||
# ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
|
||||
try:
|
||||
shutil.rmtree(profile_dir, onexc=onexc_handler)
|
||||
except TypeError:
|
||||
shutil.rmtree(profile_dir, onerror=onexc_handler)
|
||||
return
|
||||
except OSError as e:
|
||||
last_exc = e
|
||||
if not profile_dir.exists():
|
||||
return
|
||||
if attempt < attempts - 1:
|
||||
time.sleep(0.3 * (attempt + 1))
|
||||
if last_exc is not None:
|
||||
raise last_exc
|
||||
|
||||
|
||||
def delete_profile(name: str, yes: bool = False) -> Path:
|
||||
"""Delete a profile, its wrapper script, and its gateway service.
|
||||
|
||||
|
|
@ -1334,6 +1517,13 @@ def delete_profile(name: str, yes: bool = False) -> Path:
|
|||
if gw_running:
|
||||
_stop_gateway_process(profile_dir)
|
||||
|
||||
# 2b. Stop any other backends bound to this profile (Desktop-spawned
|
||||
# serve/dashboard processes the gateway.pid file never names). They hold
|
||||
# the profile's SQLite connection open and keep writing files, which makes
|
||||
# the rmtree below fail with ENOTEMPTY and — before the ensure_hermes_home
|
||||
# guard — resurrected the deleted tree.
|
||||
_stop_profile_backends(canon, profile_dir)
|
||||
|
||||
# 3. Remove wrapper script
|
||||
if has_wrapper:
|
||||
if remove_wrapper_script(canon):
|
||||
|
|
@ -1379,11 +1569,7 @@ def delete_profile(name: str, yes: bool = False) -> Path:
|
|||
else:
|
||||
raise
|
||||
|
||||
# ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
|
||||
try:
|
||||
shutil.rmtree(profile_dir, onexc=_make_writable)
|
||||
except TypeError:
|
||||
shutil.rmtree(profile_dir, onerror=_make_writable)
|
||||
_rmtree_with_retry(profile_dir, _make_writable)
|
||||
print(f"✓ Removed {profile_dir}")
|
||||
except Exception as e:
|
||||
print(f"⚠ Could not remove {profile_dir}: {e}")
|
||||
|
|
|
|||
|
|
@ -7,13 +7,18 @@ and shell completion generation.
|
|||
|
||||
import json
|
||||
import io
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tarfile
|
||||
import types
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from hermes_cli import profiles
|
||||
from hermes_cli.profiles import (
|
||||
normalize_profile_name,
|
||||
validate_profile_name,
|
||||
|
|
@ -578,6 +583,7 @@ class TestDeleteProfile:
|
|||
set_active_profile("coder")
|
||||
|
||||
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
|
||||
patch("hermes_cli.profiles.time.sleep"), \
|
||||
patch("hermes_cli.profiles.shutil.rmtree", side_effect=PermissionError("locked")):
|
||||
with pytest.raises(RuntimeError, match="Could not remove profile directory"):
|
||||
delete_profile("coder", yes=True)
|
||||
|
|
@ -585,6 +591,84 @@ class TestDeleteProfile:
|
|||
assert profile_dir.is_dir()
|
||||
assert get_active_profile() == "default"
|
||||
|
||||
def test_stops_profile_bound_backends_before_removal(self, profile_env):
|
||||
"""A Desktop-spawned backend (not in gateway.pid) is stopped first."""
|
||||
profile_dir = create_profile("coder", no_alias=True)
|
||||
|
||||
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
|
||||
patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[4242]) as pids, \
|
||||
patch("gateway.status.terminate_pid") as terminate, \
|
||||
patch("gateway.status._pid_exists", return_value=False):
|
||||
delete_profile("coder", yes=True)
|
||||
|
||||
pids.assert_called_once()
|
||||
terminate.assert_any_call(4242)
|
||||
assert not profile_dir.is_dir()
|
||||
|
||||
def test_rmtree_retries_transient_enotempty_then_succeeds(self, profile_env):
|
||||
"""A live writer racing rmtree (ENOTEMPTY) is absorbed by a retry."""
|
||||
profile_dir = create_profile("coder", no_alias=True)
|
||||
real_rmtree = shutil.rmtree
|
||||
calls = {"n": 0}
|
||||
|
||||
def flaky_rmtree(path, **kwargs):
|
||||
calls["n"] += 1
|
||||
if calls["n"] == 1:
|
||||
raise OSError(66, "Directory not empty")
|
||||
return real_rmtree(path)
|
||||
|
||||
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
|
||||
patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[]), \
|
||||
patch("hermes_cli.profiles.time.sleep"), \
|
||||
patch("hermes_cli.profiles.shutil.rmtree", side_effect=flaky_rmtree):
|
||||
delete_profile("coder", yes=True)
|
||||
|
||||
assert calls["n"] == 2
|
||||
assert not profile_dir.is_dir()
|
||||
|
||||
def test_backend_scan_only_matches_this_profile(self, profile_env, monkeypatch):
|
||||
"""The backend PID scan binds by --profile selector and skips self."""
|
||||
create_profile("coder", no_alias=True)
|
||||
profile_dir = get_profile_dir("coder")
|
||||
|
||||
class FakeProc:
|
||||
def __init__(self, pid, cmdline, username="me"):
|
||||
self.pid = pid
|
||||
self.info = {"pid": pid, "name": "python", "username": username, "cmdline": cmdline}
|
||||
|
||||
def parent(self):
|
||||
return None
|
||||
|
||||
def username(self):
|
||||
return "me"
|
||||
|
||||
def environ(self):
|
||||
return {}
|
||||
|
||||
self_pid = os.getpid()
|
||||
procs = [
|
||||
# Backend bound to coder → matched.
|
||||
FakeProc(101, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]),
|
||||
# Interactive chat for coder → NOT a backend subcommand, skipped.
|
||||
FakeProc(102, ["python", "-m", "hermes_cli.main", "--profile", "coder", "chat"]),
|
||||
# Backend for a different profile → skipped.
|
||||
FakeProc(103, ["python", "-m", "hermes_cli.main", "--profile", "other", "serve"]),
|
||||
# This very process → skipped even if it matched.
|
||||
FakeProc(self_pid, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]),
|
||||
]
|
||||
|
||||
fake_psutil = types.SimpleNamespace(
|
||||
process_iter=lambda attrs=None: iter(procs),
|
||||
Process=lambda pid=None: FakeProc(self_pid, []),
|
||||
NoSuchProcess=Exception,
|
||||
AccessDenied=Exception,
|
||||
ZombieProcess=Exception,
|
||||
)
|
||||
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
|
||||
|
||||
pids = profiles._profile_bound_backend_pids("coder", profile_dir)
|
||||
assert pids == [101]
|
||||
|
||||
|
||||
# ===================================================================
|
||||
# TestListProfiles
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue