diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index 7f7f3b262..5e64e768b 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -1257,6 +1257,189 @@ def backfill_profile_envs(quiet: bool = False) -> List[str]: return backfilled +def _profile_bound_backend_pids(canon: str, profile_dir: Path) -> list[int]: + """PIDs of running Hermes *backends* bound to this profile. + + The ``gateway.pid`` file only tracks the messaging gateway. A Desktop app + spawns a headless ``serve`` (or legacy ``dashboard --no-open``) backend per + profile that holds the profile's SQLite connection open and keeps writing + sessions/WAL/sandbox files — the writer that makes ``rmtree`` hit + ``ENOTEMPTY`` (and, pre-fix, resurrected the tree). ``gateway.pid`` never + names it, so find it by inspection: a Hermes backend subcommand + (``serve``/``dashboard``/``gateway``) that is bound to *this* profile either + by a ``--profile `` / ``-p `` selector or by a ``HERMES_HOME`` + that resolves to ``profile_dir``. + + Best-effort and tightly scoped: current-user processes only, backend + subcommands only (never an interactive ``chat``/``tui``), and never this + process or its ancestors. Returns an empty list if ``psutil`` can't + inspect anything. + """ + try: + import psutil # type: ignore + except Exception: + return [] + + try: + resolved_dir = profile_dir.resolve() + except OSError: + resolved_dir = profile_dir + + # Never terminate ourselves or a parent (e.g. `hermes -p profile + # delete` runs under the very profile it's deleting). + skip: set[int] = {os.getpid()} + try: + parent = psutil.Process(os.getpid()).parent() + while parent is not None: + skip.add(parent.pid) + parent = parent.parent() + except Exception: + pass + + try: + current_user = psutil.Process(os.getpid()).username() + except Exception: + current_user = None + + backend_tokens = {"serve", "dashboard", "gateway"} + hermes_markers = ("hermes_cli.main", "hermes-gateway", "tui_gateway") + pids: list[int] = [] + + for proc in psutil.process_iter(["pid", "name", "username", "cmdline"]): + try: + info = proc.info + pid = info.get("pid") + if pid is None or pid in skip: + continue + if current_user is not None and info.get("username") != current_user: + continue + + argv = info.get("cmdline") or [] + if not argv: + continue + + # Must be a Hermes process: either an entrypoint marker in argv, or + # a resolved executable named `hermes`. + joined = " ".join(argv) + exe_name = os.path.basename(argv[0]).lower() + is_hermes = ( + any(marker in joined for marker in hermes_markers) + or exe_name == "hermes" + or exe_name.startswith("hermes") + ) + if not is_hermes: + continue + + # Restrict to backend subcommands so we never kill an interactive + # session the user is deliberately running. + tokens = {tok.lower() for tok in argv} + if not (tokens & backend_tokens): + continue + + # Bound to THIS profile — by selector flag in argv... + bound = False + for i, tok in enumerate(argv): + if tok in {"--profile", "-p"} and i + 1 < len(argv): + if normalize_profile_name(argv[i + 1]) == canon: + bound = True + break + elif tok.startswith("--profile="): + if normalize_profile_name(tok.split("=", 1)[1]) == canon: + bound = True + break + + # ...or by HERMES_HOME env pointing at this profile dir. + if not bound: + try: + env_home = (proc.environ() or {}).get("HERMES_HOME", "") + if env_home and Path(env_home).resolve() == resolved_dir: + bound = True + except Exception: + # environ() can raise AccessDenied even same-user on some + # platforms; fall back to the argv signal only. + pass + + if bound: + pids.append(pid) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + continue + except Exception: + continue + + return pids + + +def _stop_profile_backends(canon: str, profile_dir: Path) -> None: + """Terminate any Desktop-spawned / stray backends bound to this profile. + + Complements ``_stop_gateway_process`` (which only knows ``gateway.pid``): + without this, a live ``serve``/``dashboard`` backend keeps creating files + under the profile dir while ``rmtree`` walks it, so the final ``rmdir`` + fails with ``ENOTEMPTY`` and the delete doesn't converge. Best-effort: + any failure is reported and swallowed so it never makes delete worse. + """ + pids = _profile_bound_backend_pids(canon, profile_dir) + if not pids: + return + + try: + from gateway.status import _pid_exists, terminate_pid as _terminate_pid + except Exception: + return + + for pid in pids: + try: + _terminate_pid(pid) # graceful first + except (ProcessLookupError, PermissionError, OSError): + continue + + # Wait up to 10s for graceful exit, then force-kill stragglers. + deadline = time.time() + 10.0 + while time.time() < deadline: + if not any(_pid_exists(pid) for pid in pids): + break + time.sleep(0.5) + + for pid in pids: + if _pid_exists(pid): + try: + _terminate_pid(pid, force=True) + except (ProcessLookupError, PermissionError, OSError): + pass + + print(f"✓ Stopped {len(pids)} profile backend process(es)") + + +def _rmtree_with_retry(profile_dir: Path, onexc_handler) -> None: + """``shutil.rmtree`` with a short retry loop for transient races. + + Even after stopping the gateway and profile backends, a just-terminated + process can leave in-flight writes (SQLite ``-wal``/``-shm`` checkpoints, + sandbox temp files) that land after ``rmtree`` has walked past a directory, + surfacing as ``ENOTEMPTY`` (POSIX) or a transient ``PermissionError`` + (Windows file lock still releasing). A few spaced retries let those settle + instead of failing the whole delete on a race the next attempt would win. + """ + attempts = 3 + last_exc: OSError | None = None + for attempt in range(attempts): + try: + # ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11. + try: + shutil.rmtree(profile_dir, onexc=onexc_handler) + except TypeError: + shutil.rmtree(profile_dir, onerror=onexc_handler) + return + except OSError as e: + last_exc = e + if not profile_dir.exists(): + return + if attempt < attempts - 1: + time.sleep(0.3 * (attempt + 1)) + if last_exc is not None: + raise last_exc + + def delete_profile(name: str, yes: bool = False) -> Path: """Delete a profile, its wrapper script, and its gateway service. @@ -1334,6 +1517,13 @@ def delete_profile(name: str, yes: bool = False) -> Path: if gw_running: _stop_gateway_process(profile_dir) + # 2b. Stop any other backends bound to this profile (Desktop-spawned + # serve/dashboard processes the gateway.pid file never names). They hold + # the profile's SQLite connection open and keep writing files, which makes + # the rmtree below fail with ENOTEMPTY and — before the ensure_hermes_home + # guard — resurrected the deleted tree. + _stop_profile_backends(canon, profile_dir) + # 3. Remove wrapper script if has_wrapper: if remove_wrapper_script(canon): @@ -1379,11 +1569,7 @@ def delete_profile(name: str, yes: bool = False) -> Path: else: raise - # ``onexc`` was added in 3.12; fall back to ``onerror`` on 3.11. - try: - shutil.rmtree(profile_dir, onexc=_make_writable) - except TypeError: - shutil.rmtree(profile_dir, onerror=_make_writable) + _rmtree_with_retry(profile_dir, _make_writable) print(f"✓ Removed {profile_dir}") except Exception as e: print(f"⚠ Could not remove {profile_dir}: {e}") diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py index 608a10eab..91a13dd76 100644 --- a/tests/hermes_cli/test_profiles.py +++ b/tests/hermes_cli/test_profiles.py @@ -7,13 +7,18 @@ and shell completion generation. import json import io +import os +import shutil +import sys import tarfile +import types from pathlib import Path from unittest.mock import patch, MagicMock import pytest import yaml +from hermes_cli import profiles from hermes_cli.profiles import ( normalize_profile_name, validate_profile_name, @@ -578,6 +583,7 @@ class TestDeleteProfile: set_active_profile("coder") with patch("hermes_cli.profiles._cleanup_gateway_service"), \ + patch("hermes_cli.profiles.time.sleep"), \ patch("hermes_cli.profiles.shutil.rmtree", side_effect=PermissionError("locked")): with pytest.raises(RuntimeError, match="Could not remove profile directory"): delete_profile("coder", yes=True) @@ -585,6 +591,84 @@ class TestDeleteProfile: assert profile_dir.is_dir() assert get_active_profile() == "default" + def test_stops_profile_bound_backends_before_removal(self, profile_env): + """A Desktop-spawned backend (not in gateway.pid) is stopped first.""" + profile_dir = create_profile("coder", no_alias=True) + + with patch("hermes_cli.profiles._cleanup_gateway_service"), \ + patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[4242]) as pids, \ + patch("gateway.status.terminate_pid") as terminate, \ + patch("gateway.status._pid_exists", return_value=False): + delete_profile("coder", yes=True) + + pids.assert_called_once() + terminate.assert_any_call(4242) + assert not profile_dir.is_dir() + + def test_rmtree_retries_transient_enotempty_then_succeeds(self, profile_env): + """A live writer racing rmtree (ENOTEMPTY) is absorbed by a retry.""" + profile_dir = create_profile("coder", no_alias=True) + real_rmtree = shutil.rmtree + calls = {"n": 0} + + def flaky_rmtree(path, **kwargs): + calls["n"] += 1 + if calls["n"] == 1: + raise OSError(66, "Directory not empty") + return real_rmtree(path) + + with patch("hermes_cli.profiles._cleanup_gateway_service"), \ + patch("hermes_cli.profiles._profile_bound_backend_pids", return_value=[]), \ + patch("hermes_cli.profiles.time.sleep"), \ + patch("hermes_cli.profiles.shutil.rmtree", side_effect=flaky_rmtree): + delete_profile("coder", yes=True) + + assert calls["n"] == 2 + assert not profile_dir.is_dir() + + def test_backend_scan_only_matches_this_profile(self, profile_env, monkeypatch): + """The backend PID scan binds by --profile selector and skips self.""" + create_profile("coder", no_alias=True) + profile_dir = get_profile_dir("coder") + + class FakeProc: + def __init__(self, pid, cmdline, username="me"): + self.pid = pid + self.info = {"pid": pid, "name": "python", "username": username, "cmdline": cmdline} + + def parent(self): + return None + + def username(self): + return "me" + + def environ(self): + return {} + + self_pid = os.getpid() + procs = [ + # Backend bound to coder → matched. + FakeProc(101, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]), + # Interactive chat for coder → NOT a backend subcommand, skipped. + FakeProc(102, ["python", "-m", "hermes_cli.main", "--profile", "coder", "chat"]), + # Backend for a different profile → skipped. + FakeProc(103, ["python", "-m", "hermes_cli.main", "--profile", "other", "serve"]), + # This very process → skipped even if it matched. + FakeProc(self_pid, ["python", "-m", "hermes_cli.main", "--profile", "coder", "serve"]), + ] + + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs=None: iter(procs), + Process=lambda pid=None: FakeProc(self_pid, []), + NoSuchProcess=Exception, + AccessDenied=Exception, + ZombieProcess=Exception, + ) + monkeypatch.setitem(sys.modules, "psutil", fake_psutil) + + pids = profiles._profile_bound_backend_pids("coder", profile_dir) + assert pids == [101] + # =================================================================== # TestListProfiles