fix(cli): stop profile-bound backends before deleting so rmtree converges

delete_profile stopped only the process named in gateway.pid, but a Desktop
app spawns a headless `serve`/`dashboard` backend per profile that holds the
profile's SQLite connection open and keeps writing sessions/WAL/sandbox files.
That backend is never in gateway.pid, so a CLI `hermes profile delete` run
while the Desktop app is up left it writing into the tree — rmtree's final
rmdir then failed with ENOTEMPTY (#47368 "Bug 2"), and pre-guard it also
resurrected the directory.

- _profile_bound_backend_pids(): find running Hermes backends bound to this
  profile via a `--profile <name>` selector or a HERMES_HOME env resolving to
  the profile dir. Tightly scoped — current-user only, backend subcommands
  (serve/dashboard/gateway) only so an interactive chat is never killed, and
  never this process or its ancestors.
- _stop_profile_backends(): terminate them (graceful, then force), best-effort
  so it can never make delete worse.
- _rmtree_with_retry(): a few spaced retries absorb the ENOTEMPTY / Windows
  file-lock race from a just-terminated writer's in-flight -wal/-shm/sandbox
  writes instead of failing the whole delete on a race the next attempt wins.

Complements the recreation guard (deleted profiles no longer reappear) and the
Desktop teardown-before-delete flow; this is the CLI-side convergence fix for a
delete run while a Desktop-managed backend is live.

Part of #47368.
This commit is contained in:
Brooklyn Nicholson 2026-07-02 15:26:17 -05:00 committed by brooklyn!
parent 5a6720b884
commit 1501a338c3
2 changed files with 275 additions and 5 deletions

View file

@ -1257,6 +1257,189 @@ def backfill_profile_envs(quiet: bool = False) -> List[str]:
return backfilled
def _profile_bound_backend_pids(canon: str, profile_dir: Path) -> list[int]:
"""PIDs of running Hermes *backends* bound to this profile.
The ``gateway.pid`` file only tracks the messaging gateway. A Desktop app
spawns a headless ``serve`` (or legacy ``dashboard --no-open``) backend per
profile that holds the profile's SQLite connection open and keeps writing
sessions/WAL/sandbox files the writer that makes ``rmtree`` hit
``ENOTEMPTY`` (and, pre-fix, resurrected the tree). ``gateway.pid`` never
names it, so find it by inspection: a Hermes backend subcommand
(``serve``/``dashboard``/``gateway``) that is bound to *this* profile either
by a ``--profile <canon>`` / ``-p <canon>`` selector or by a ``HERMES_HOME``
that resolves to ``profile_dir``.
Best-effort and tightly scoped: current-user processes only, backend
subcommands only (never an interactive ``chat``/``tui``), and never this
process or its ancestors. Returns an empty list if ``psutil`` can't
inspect anything.
"""
try:
import psutil # type: ignore
except Exception:
return []
try:
resolved_dir = profile_dir.resolve()
except OSError:
resolved_dir = profile_dir
# Never terminate ourselves or a parent (e.g. `hermes -p <canon> profile
# delete` runs under the very profile it's deleting).
skip: set[int] = {os.getpid()}
try:
parent = psutil.Process(os.getpid()).parent()
while parent is not None:
skip.add(parent.pid)
parent = parent.parent()
except Exception:
pass
try:
current_user = psutil.Process(os.getpid()).username()
except Exception:
current_user = None
backend_tokens = {"serve", "dashboard", "gateway"}
hermes_markers = ("hermes_cli.main", "hermes-gateway", "tui_gateway")
pids: list[int] = []
for proc in psutil.process_iter(["pid", "name", "username", "cmdline"]):
try:
info = proc.info
pid = info.get("pid")
if pid is None or pid in skip:
continue
if current_user is not None and info.get("username") != current_user:
continue
argv = info.get("cmdline") or []
if not argv:
continue
# Must be a Hermes process: either an entrypoint marker in argv, or
# a resolved executable named `hermes`.
joined = " ".join(argv)
exe_name = os.path.basename(argv[0]).lower()
is_hermes = (
any(marker in joined for marker in hermes_markers)
or exe_name == "hermes"
or exe_name.startswith("hermes")
)
if not is_hermes:
continue
# Restrict to backend subcommands so we never kill an interactive
# session the user is deliberately running.
tokens = {tok.lower() for tok in argv}
if not (tokens & backend_tokens):
continue
# Bound to THIS profile — by selector flag in argv...
bound = False
for i, tok in enumerate(argv):
if tok in {"--profile", "-p"} and i + 1 < len(argv):
if normalize_profile_name(argv[i + 1]) == canon:
bound = True
break
elif tok.startswith("--profile="):
if normalize_profile_name(tok.split("=", 1)[1]) == canon:
bound = True
break
# ...or by HERMES_HOME env pointing at this profile dir.
if not bound:
try:
env_home = (proc.environ() or {}).get("HERMES_HOME", "")
if env_home and Path(env_home).resolve() == resolved_dir:
bound = True
except Exception:
# environ() can raise AccessDenied even same-user on some
# platforms; fall back to the argv signal only.
pass
if bound:
pids.append(pid)
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
continue
except Exception:
continue
return pids
def _stop_profile_backends(canon: str, profile_dir: Path) -> None:
"""Terminate any Desktop-spawned / stray backends bound to this profile.
Complements ``_stop_gateway_process`` (which only knows ``gateway.pid``):
without this, a live ``serve``/``dashboard`` backend keeps creating files
under the profile dir while ``rmtree`` walks it, so the final ``rmdir``
fails with ``ENOTEMPTY`` and the delete doesn't converge. Best-effort:
any failure is reported and swallowed so it never makes delete worse.
"""
pids = _profile_bound_backend_pids(canon, profile_dir)
if not pids:
return
try:
from gateway.status import _pid_exists, terminate_pid as _terminate_pid
except Exception:
return
for pid in pids:
try:
_terminate_pid(pid) # graceful first
except (ProcessLookupError, PermissionError, OSError):
continue
# Wait up to 10s for graceful exit, then force-kill stragglers.
deadline = time.time() + 10.0
while time.time() < deadline:
if not any(_pid_exists(pid) for pid in pids):
break
time.sleep(0.5)
for pid in pids:
if _pid_exists(pid):
try:
_terminate_pid(pid, force=True)
except (ProcessLookupError, PermissionError, OSError):
pass
print(f"✓ Stopped {len(pids)} profile backend process(es)")
def _rmtree_with_retry(profile_dir: Path, onexc_handler) -> None:
"""``shutil.rmtree`` with a short retry loop for transient races.
Even after stopping the gateway and profile backends, a just-terminated
process can leave in-flight writes (SQLite ``-wal``/``-shm`` checkpoints,
sandbox temp files) that land after ``rmtree`` has walked past a directory,
surfacing as ``ENOTEMPTY`` (POSIX) or a transient ``PermissionError``
(Windows file lock still releasing). A few spaced retries let those settle
instead of failing the whole delete on a race the next attempt would win.
"""
attempts = 3
last_exc: OSError | None = None
for attempt in range(attempts):
try:
# ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
try:
shutil.rmtree(profile_dir, onexc=onexc_handler)
except TypeError:
shutil.rmtree(profile_dir, onerror=onexc_handler)
return
except OSError as e:
last_exc = e
if not profile_dir.exists():
return
if attempt < attempts - 1:
time.sleep(0.3 * (attempt + 1))
if last_exc is not None:
raise last_exc
def delete_profile(name: str, yes: bool = False) -> Path:
"""Delete a profile, its wrapper script, and its gateway service.
@ -1334,6 +1517,13 @@ def delete_profile(name: str, yes: bool = False) -> Path:
if gw_running:
_stop_gateway_process(profile_dir)
# 2b. Stop any other backends bound to this profile (Desktop-spawned
# serve/dashboard processes the gateway.pid file never names). They hold
# the profile's SQLite connection open and keep writing files, which makes
# the rmtree below fail with ENOTEMPTY and — before the ensure_hermes_home
# guard — resurrected the deleted tree.
_stop_profile_backends(canon, profile_dir)
# 3. Remove wrapper script
if has_wrapper:
if remove_wrapper_script(canon):
@ -1379,11 +1569,7 @@ def delete_profile(name: str, yes: bool = False) -> Path:
else:
raise
# ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11.
try:
shutil.rmtree(profile_dir, onexc=_make_writable)
except TypeError:
shutil.rmtree(profile_dir, onerror=_make_writable)
_rmtree_with_retry(profile_dir, _make_writable)
print(f"✓ Removed {profile_dir}")
except Exception as e:
print(f"⚠ Could not remove {profile_dir}: {e}")

View file

@ -7,13 +7,18 @@ and shell completion generation.
import json
import io
import os
import shutil
import sys
import tarfile
import types
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
import yaml
from hermes_cli import profiles
from hermes_cli.profiles import (
normalize_profile_name,
validate_profile_name,
@ -578,6 +583,7 @@ class TestDeleteProfile:
set_active_profile("coder")
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
patch("hermes_cli.profiles.time.sleep"), \
patch("hermes_cli.profiles.shutil.rmtree", side_effect=PermissionError("locked")):
with pytest.raises(RuntimeError, match="Could not remove profile directory"):
delete_profile("coder", yes=True)
@ -585,6 +591,84 @@ class TestDeleteProfile:
assert profile_dir.is_dir()
assert get_active_profile() == "default"
def test_stops_profile_bound_backends_before_removal(self, profile_env):
"""A Desktop-spawned backend (not in gateway.pid) is stopped first."""
profile_dir = create_profile("coder", no_alias=True)
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[4242]) as pids, \
patch("gateway.status.terminate_pid") as terminate, \
patch("gateway.status._pid_exists", return_value=False):
delete_profile("coder", yes=True)
pids.assert_called_once()
terminate.assert_any_call(4242)
assert not profile_dir.is_dir()
def test_rmtree_retries_transient_enotempty_then_succeeds(self, profile_env):
"""A live writer racing rmtree (ENOTEMPTY) is absorbed by a retry."""
profile_dir = create_profile("coder", no_alias=True)
real_rmtree = shutil.rmtree
calls = {"n": 0}
def flaky_rmtree(path, **kwargs):
calls["n"] += 1
if calls["n"] == 1:
raise OSError(66, "Directory not empty")
return real_rmtree(path)
with patch("hermes_cli.profiles._cleanup_gateway_service"), \
patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[]), \
patch("hermes_cli.profiles.time.sleep"), \
patch("hermes_cli.profiles.shutil.rmtree", side_effect=flaky_rmtree):
delete_profile("coder", yes=True)
assert calls["n"] == 2
assert not profile_dir.is_dir()
def test_backend_scan_only_matches_this_profile(self, profile_env, monkeypatch):
"""The backend PID scan binds by --profile selector and skips self."""
create_profile("coder", no_alias=True)
profile_dir = get_profile_dir("coder")
class FakeProc:
def __init__(self, pid, cmdline, username="me"):
self.pid = pid
self.info = {"pid": pid, "name": "python", "username": username, "cmdline": cmdline}
def parent(self):
return None
def username(self):
return "me"
def environ(self):
return {}
self_pid = os.getpid()
procs = [
# Backend bound to coder → matched.
FakeProc(101, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]),
# Interactive chat for coder → NOT a backend subcommand, skipped.
FakeProc(102, ["python", "-m", "hermes_cli.main", "--profile", "coder", "chat"]),
# Backend for a different profile → skipped.
FakeProc(103, ["python", "-m", "hermes_cli.main", "--profile", "other", "serve"]),
# This very process → skipped even if it matched.
FakeProc(self_pid, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]),
]
fake_psutil = types.SimpleNamespace(
process_iter=lambda attrs=None: iter(procs),
Process=lambda pid=None: FakeProc(self_pid, []),
NoSuchProcess=Exception,
AccessDenied=Exception,
ZombieProcess=Exception,
)
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
pids = profiles._profile_bound_backend_pids("coder", profile_dir)
assert pids == [101]
# ===================================================================
# TestListProfiles