From 74809b4e9465a6e576634ba0fcf8e9c97e6f026e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 1 Jul 2026 03:43:20 -0700 Subject: [PATCH] fix(cli): reap dead-locked worktrees so .worktrees/ can't grow unbounded (#56288) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hermes -w locks each worktree (reason 'hermes pid='). git worktree remove --force (single -f) refuses a locked tree, so a crashed session's lock was never released and its worktree accumulated forever — a real contributor to .worktrees/ bloat. _prune_stale_worktrees now classifies each lock via _worktree_lock_is_live: a live-owner pid is skipped at any age; a dead-owner (or foreign) lock is unlocked first so the aggressive age-based cleanup can actually reap it. The >72h reap tier is kept (that cleanup is intentional) but now guarded so dirty/unpushed work is preserved, and branch deletion is gated on git worktree remove succeeding. New fail-safe helpers _worktree_is_dirty and _worktree_lock_is_live (pid liveness via gateway.status._pid_exists, Windows-safe). --- cli.py | 140 +++++++++++++++++++++++++++++++++++-- tests/cli/test_worktree.py | 135 +++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+), 6 deletions(-) diff --git a/cli.py b/cli.py index 2ceac1059..e2069486f 100644 --- a/cli.py +++ b/cli.py @@ -1529,6 +1529,89 @@ def _worktree_has_unpushed_commits(worktree_path: str, timeout: int = 10) -> boo return True +def _worktree_is_dirty(worktree_path: str, timeout: int = 10) -> bool: + """Return whether a worktree has uncommitted changes (staged, unstaged, or + untracked). + + Fails SAFE: on any error returns True so callers do not delete a worktree + whose state they cannot determine. + """ + import subprocess + + try: + result = subprocess.run( + ["git", "status", "--porcelain"], + capture_output=True, text=True, timeout=timeout, cwd=worktree_path, + ) + if result.returncode != 0: + return True + return bool(result.stdout.strip()) + except Exception: + return True + + +def _worktree_lock_is_live(repo_root: str, worktree_path: str, timeout: int = 10): + """Classify a worktree's git lock as live, dead, or absent. + + ``hermes -w`` locks each worktree with reason ``hermes pid=`` so a + concurrent hermes process' startup prune leaves an in-use worktree alone. + But a *crashed* session leaves the lock behind forever, and + ``git worktree remove --force`` (single ``-f``) refuses to remove a locked + worktree — so dead-locked worktrees accumulate indefinitely. This lets the + pruner tell the two apart: + + - ``"live"`` — locked and the owning pid is still running (skip it). + - ``"dead"`` — locked but the owning pid is gone, or the reason isn't a + parseable hermes lock (safe to unlock + reap). + - ``None`` — not locked at all. + + Fails SAFE toward ``"live"``: if git can't be queried at all we cannot + prove the worktree is safe to touch, so we report it as live. + """ + import re + import subprocess + + try: + result = subprocess.run( + ["git", "worktree", "list", "--porcelain"], + capture_output=True, text=True, timeout=timeout, cwd=repo_root, + ) + if result.returncode != 0: + return "live" + except Exception: + return "live" + + target = Path(worktree_path).resolve() + current: Optional[Path] = None + for line in result.stdout.splitlines(): + if line.startswith("worktree "): + try: + current = Path(line[len("worktree "):].strip()).resolve() + except Exception: + current = None + elif line == "locked" or line.startswith("locked "): + if current != target: + continue + reason = line[len("locked"):].strip() + m = re.search(r"hermes pid=(\d+)", reason) + if not m: + # Locked by something we don't recognize as a hermes session + # (or lock reason unavailable). Treat as dead — a foreign lock + # on a hermes -w worktree is almost certainly a leftover, and + # the age/dirty/unpushed gates already ran before we got here. + return "dead" + pid = int(m.group(1)) + if pid == os.getpid(): + return "live" + try: + from gateway.status import _pid_exists + return "live" if _pid_exists(pid) else "dead" + except Exception: + # Can't determine liveness — fail safe toward keeping it. + return "live" + return None + + def _cleanup_worktree(info: Dict[str, str] = None) -> None: """Remove a worktree and its branch on exit. @@ -1673,11 +1756,23 @@ def _run_checkpoint_auto_maintenance() -> None: def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None: """Remove stale worktrees and orphaned branches on startup. - Age-based tiers: + Age-based tiers (aggressive cleanup keeps ``.worktrees/`` from growing + unbounded): - Under max_age_hours (24h): skip — session may still be active. - 24h–72h: remove if no unpushed commits. - Over 72h: force remove regardless (nothing should sit this long). + Lock handling (orthogonal to age): ``hermes -w`` locks each worktree with + reason ``hermes pid=`` so a concurrent hermes process leaves an in-use + worktree alone. A *live*-locked worktree is skipped at any age; a + *dead*-locked one (owning pid gone — a crashed session) is unlocked first + so ``git worktree remove --force`` can actually reap it, otherwise those + leftovers accumulate forever (``remove --force`` refuses a locked tree). + + Branch deletion is gated on ``git worktree remove`` succeeding, so a failed + removal never orphans the branch (which would drop easy reachability of any + commits still in the worktree). + Also prunes orphaned ``hermes/*`` and ``pr-*`` local branches that have no corresponding worktree. """ @@ -1705,12 +1800,37 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None: except Exception: continue - force = mtime <= hard_cutoff # Over 72h — force remove + force = mtime <= hard_cutoff # Over 72h — reap aggressively + # Never delete real work, regardless of age. Unpushed commits and + # uncommitted changes may be a crashed session's in-flight work; the + # >72h tier reaps only abandoned *clean, fully-pushed* worktrees (the + # scratch trees that actually cause .worktrees/ bloat). + if _worktree_has_unpushed_commits(str(entry), timeout=5): + continue # Has unpushed commits or can't check — skip if not force: - # 24h–72h tier: only remove if no unpushed commits - if _worktree_has_unpushed_commits(str(entry), timeout=5): - continue # Has unpushed commits or can't check — skip + # 24h–72h tier is conservative: unpushed check above is enough. + pass + elif _worktree_is_dirty(str(entry), timeout=5): + continue # >72h but dirty — preserve uncommitted work + + # Respect git-native session locks. A lock owned by a still-running + # hermes process means the worktree is actively in use — never touch + # it. A lock whose owning pid is gone is a crashed session's leftover: + # unlock it so `git worktree remove --force` (single -f) can reap it, + # otherwise dead-locked worktrees pile up indefinitely. + lock_state = _worktree_lock_is_live(repo_root, str(entry), timeout=5) + if lock_state == "live": + logger.debug("Skipping live-locked worktree: %s", entry.name) + continue + if lock_state == "dead": + try: + subprocess.run( + ["git", "worktree", "unlock", str(entry)], + capture_output=True, text=True, timeout=10, cwd=repo_root, + ) + except Exception as e: + logger.debug("Failed to unlock dead worktree %s: %s", entry.name, e) # Safe to remove try: @@ -1720,10 +1840,18 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None: ) branch = branch_result.stdout.strip() - subprocess.run( + remove_result = subprocess.run( ["git", "worktree", "remove", str(entry), "--force"], capture_output=True, text=True, timeout=15, cwd=repo_root, ) + if remove_result.returncode != 0: + # Removal failed — keep the branch so any commits stay + # reachable rather than orphaning it. + logger.debug( + "Failed to remove worktree %s: %s", + entry.name, remove_result.stderr.strip(), + ) + continue if branch: subprocess.run( ["git", "branch", "-D", branch], diff --git a/tests/cli/test_worktree.py b/tests/cli/test_worktree.py index 221903e0e..220e6219c 100644 --- a/tests/cli/test_worktree.py +++ b/tests/cli/test_worktree.py @@ -1011,3 +1011,138 @@ class TestSystemPromptInjection: assert info["repo_root"] in wt_note assert "isolated git worktree" in wt_note assert "commit and push" in wt_note + + +class TestWorktreeLockReaping: + """Exercise the REAL cli._prune_stale_worktrees lock/dirty/unpushed logic. + + Unlike the reimplementation-based tests above, these import the actual + production functions so the behavior contract is enforced against the + shipped code: + + - live-locked (owning pid running) -> never reaped, any age + - dead-locked clean (owning pid gone) -> unlocked + reaped (fixes the + accumulation bug: `git worktree remove --force` refuses a locked tree) + - dirty (uncommitted) at >72h -> preserved + - unpushed commits at any age -> preserved + - clean/unlocked stale -> reaped (aggressive cleanup intact) + """ + + @staticmethod + def _age(path, hours): + import time + t = time.time() - (hours * 3600) + os.utime(path, (t, t)) + + @staticmethod + def _mk(cli, repo, name, pid=None, dirty=False, unpushed=False, age_h=100): + p = repo / ".worktrees" / name + (repo / ".worktrees").mkdir(exist_ok=True) + subprocess.run( + ["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"], + cwd=repo, capture_output=True, + ) + if pid is not None: + subprocess.run( + ["git", "worktree", "lock", "--reason", f"hermes pid={pid}", str(p)], + cwd=repo, capture_output=True, + ) + if unpushed: + (p / "work.txt").write_text("x") + subprocess.run(["git", "add", "work.txt"], cwd=p, capture_output=True) + subprocess.run(["git", "commit", "-m", "wip"], cwd=p, capture_output=True) + if dirty: + (p / "dirty.txt").write_text("uncommitted") + TestWorktreeLockReaping._age(p, age_h) + return p + + def test_live_locked_survives_at_any_age(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-live", pid=os.getpid()) + cli._prune_stale_worktrees(str(git_repo)) + assert wt.exists(), "live-locked worktree (this pid) must never be reaped" + + def test_dead_locked_clean_is_reaped(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-dead", pid=999999) + # sanity: this is the accumulation bug — remove --force alone can't do it + assert cli._worktree_lock_is_live(str(git_repo), str(wt)) == "dead" + cli._prune_stale_worktrees(str(git_repo)) + assert not wt.exists(), "dead-locked clean worktree should be unlocked + reaped" + + def test_dead_locked_dirty_survives(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-deaddirty", pid=999999, dirty=True) + cli._prune_stale_worktrees(str(git_repo)) + assert wt.exists(), "dead-locked worktree with uncommitted work must survive" + + def test_dead_locked_unpushed_survives(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-deadunp", pid=999999, unpushed=True) + cli._prune_stale_worktrees(str(git_repo)) + assert wt.exists(), "dead-locked worktree with unpushed commits must survive" + + def test_unlocked_clean_stale_is_reaped(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-nolock", pid=None) + cli._prune_stale_worktrees(str(git_repo)) + assert not wt.exists(), "clean unlocked stale worktree should be reaped" + + def test_dirty_survives_over_72h(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-dirty72", pid=None, dirty=True, age_h=100) + cli._prune_stale_worktrees(str(git_repo)) + assert wt.exists(), "dirty worktree must survive even past the 72h tier" + + def test_recent_worktree_untouched(self, git_repo): + import cli + wt = self._mk(cli, git_repo, "hermes-fresh", pid=None, age_h=1) + cli._prune_stale_worktrees(str(git_repo)) + assert wt.exists(), "worktree under 24h must never be pruned" + + +class TestWorktreeLockPredicate: + """_worktree_lock_is_live classification (real cli helper).""" + + def _mk_locked(self, repo, name, reason): + p = repo / ".worktrees" / name + (repo / ".worktrees").mkdir(exist_ok=True) + subprocess.run( + ["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"], + cwd=repo, capture_output=True, + ) + subprocess.run( + ["git", "worktree", "lock", "--reason", reason, str(p)], + cwd=repo, capture_output=True, + ) + return p + + def test_unlocked_returns_none(self, git_repo): + import cli + p = git_repo / ".worktrees" / "hermes-x" + (git_repo / ".worktrees").mkdir(exist_ok=True) + subprocess.run( + ["git", "worktree", "add", str(p), "-b", "hermes/hermes-x", "HEAD"], + cwd=git_repo, capture_output=True, + ) + assert cli._worktree_lock_is_live(str(git_repo), str(p)) is None + + def test_live_pid_returns_live(self, git_repo): + import cli + p = self._mk_locked(git_repo, "hermes-live", f"hermes pid={os.getpid()}") + assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "live" + + def test_dead_pid_returns_dead(self, git_repo): + import cli + p = self._mk_locked(git_repo, "hermes-dead", "hermes pid=999999") + assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead" + + def test_foreign_lock_reason_returns_dead(self, git_repo): + import cli + p = self._mk_locked(git_repo, "hermes-foreign", "some other tool") + assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead" + + def test_bad_repo_root_fails_safe_to_live(self, tmp_path): + import cli + # Not a git repo -> git query fails -> must report "live" (never delete) + assert cli._worktree_lock_is_live(str(tmp_path), str(tmp_path / "x")) == "live"