fix(cli): reap dead-locked worktrees so .worktrees/ can't grow unbounded (#56288)

hermes -w locks each worktree (reason 'hermes pid=<pid>'). git worktree
remove --force (single -f) refuses a locked tree, so a crashed session's
lock was never released and its worktree accumulated forever — a real
contributor to .worktrees/ bloat.

_prune_stale_worktrees now classifies each lock via _worktree_lock_is_live:
a live-owner pid is skipped at any age; a dead-owner (or foreign) lock is
unlocked first so the aggressive age-based cleanup can actually reap it.
The >72h reap tier is kept (that cleanup is intentional) but now guarded so
dirty/unpushed work is preserved, and branch deletion is gated on
git worktree remove succeeding. New fail-safe helpers _worktree_is_dirty
and _worktree_lock_is_live (pid liveness via gateway.status._pid_exists,
Windows-safe).
This commit is contained in:
Teknium 2026-07-01 03:43:20 -07:00 committed by GitHub
parent 5c2dccd06f
commit 74809b4e94
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 269 additions and 6 deletions

140
cli.py
View file

@ -1529,6 +1529,89 @@ def _worktree_has_unpushed_commits(worktree_path: str, timeout: int = 10) -> boo
return True
def _worktree_is_dirty(worktree_path: str, timeout: int = 10) -> bool:
"""Return whether a worktree has uncommitted changes (staged, unstaged, or
untracked).
Fails SAFE: on any error returns True so callers do not delete a worktree
whose state they cannot determine.
"""
import subprocess
try:
result = subprocess.run(
["git", "status", "--porcelain"],
capture_output=True, text=True, timeout=timeout, cwd=worktree_path,
)
if result.returncode != 0:
return True
return bool(result.stdout.strip())
except Exception:
return True
def _worktree_lock_is_live(repo_root: str, worktree_path: str, timeout: int = 10):
"""Classify a worktree's git lock as live, dead, or absent.
``hermes -w`` locks each worktree with reason ``hermes pid=<pid>`` so a
concurrent hermes process' startup prune leaves an in-use worktree alone.
But a *crashed* session leaves the lock behind forever, and
``git worktree remove --force`` (single ``-f``) refuses to remove a locked
worktree so dead-locked worktrees accumulate indefinitely. This lets the
pruner tell the two apart:
- ``"live"`` locked and the owning pid is still running (skip it).
- ``"dead"`` locked but the owning pid is gone, or the reason isn't a
parseable hermes lock (safe to unlock + reap).
- ``None`` not locked at all.
Fails SAFE toward ``"live"``: if git can't be queried at all we cannot
prove the worktree is safe to touch, so we report it as live.
"""
import re
import subprocess
try:
result = subprocess.run(
["git", "worktree", "list", "--porcelain"],
capture_output=True, text=True, timeout=timeout, cwd=repo_root,
)
if result.returncode != 0:
return "live"
except Exception:
return "live"
target = Path(worktree_path).resolve()
current: Optional[Path] = None
for line in result.stdout.splitlines():
if line.startswith("worktree "):
try:
current = Path(line[len("worktree "):].strip()).resolve()
except Exception:
current = None
elif line == "locked" or line.startswith("locked "):
if current != target:
continue
reason = line[len("locked"):].strip()
m = re.search(r"hermes pid=(\d+)", reason)
if not m:
# Locked by something we don't recognize as a hermes session
# (or lock reason unavailable). Treat as dead — a foreign lock
# on a hermes -w worktree is almost certainly a leftover, and
# the age/dirty/unpushed gates already ran before we got here.
return "dead"
pid = int(m.group(1))
if pid == os.getpid():
return "live"
try:
from gateway.status import _pid_exists
return "live" if _pid_exists(pid) else "dead"
except Exception:
# Can't determine liveness — fail safe toward keeping it.
return "live"
return None
def _cleanup_worktree(info: Dict[str, str] = None) -> None:
"""Remove a worktree and its branch on exit.
@ -1673,11 +1756,23 @@ def _run_checkpoint_auto_maintenance() -> None:
def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
"""Remove stale worktrees and orphaned branches on startup.
Age-based tiers:
Age-based tiers (aggressive cleanup keeps ``.worktrees/`` from growing
unbounded):
- Under max_age_hours (24h): skip session may still be active.
- 24h72h: remove if no unpushed commits.
- Over 72h: force remove regardless (nothing should sit this long).
Lock handling (orthogonal to age): ``hermes -w`` locks each worktree with
reason ``hermes pid=<pid>`` so a concurrent hermes process leaves an in-use
worktree alone. A *live*-locked worktree is skipped at any age; a
*dead*-locked one (owning pid gone a crashed session) is unlocked first
so ``git worktree remove --force`` can actually reap it, otherwise those
leftovers accumulate forever (``remove --force`` refuses a locked tree).
Branch deletion is gated on ``git worktree remove`` succeeding, so a failed
removal never orphans the branch (which would drop easy reachability of any
commits still in the worktree).
Also prunes orphaned ``hermes/*`` and ``pr-*`` local branches that
have no corresponding worktree.
"""
@ -1705,12 +1800,37 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
except Exception:
continue
force = mtime <= hard_cutoff # Over 72h — force remove
force = mtime <= hard_cutoff # Over 72h — reap aggressively
# Never delete real work, regardless of age. Unpushed commits and
# uncommitted changes may be a crashed session's in-flight work; the
# >72h tier reaps only abandoned *clean, fully-pushed* worktrees (the
# scratch trees that actually cause .worktrees/ bloat).
if _worktree_has_unpushed_commits(str(entry), timeout=5):
continue # Has unpushed commits or can't check — skip
if not force:
# 24h72h tier: only remove if no unpushed commits
if _worktree_has_unpushed_commits(str(entry), timeout=5):
continue # Has unpushed commits or can't check — skip
# 24h72h tier is conservative: unpushed check above is enough.
pass
elif _worktree_is_dirty(str(entry), timeout=5):
continue # >72h but dirty — preserve uncommitted work
# Respect git-native session locks. A lock owned by a still-running
# hermes process means the worktree is actively in use — never touch
# it. A lock whose owning pid is gone is a crashed session's leftover:
# unlock it so `git worktree remove --force` (single -f) can reap it,
# otherwise dead-locked worktrees pile up indefinitely.
lock_state = _worktree_lock_is_live(repo_root, str(entry), timeout=5)
if lock_state == "live":
logger.debug("Skipping live-locked worktree: %s", entry.name)
continue
if lock_state == "dead":
try:
subprocess.run(
["git", "worktree", "unlock", str(entry)],
capture_output=True, text=True, timeout=10, cwd=repo_root,
)
except Exception as e:
logger.debug("Failed to unlock dead worktree %s: %s", entry.name, e)
# Safe to remove
try:
@ -1720,10 +1840,18 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
)
branch = branch_result.stdout.strip()
subprocess.run(
remove_result = subprocess.run(
["git", "worktree", "remove", str(entry), "--force"],
capture_output=True, text=True, timeout=15, cwd=repo_root,
)
if remove_result.returncode != 0:
# Removal failed — keep the branch so any commits stay
# reachable rather than orphaning it.
logger.debug(
"Failed to remove worktree %s: %s",
entry.name, remove_result.stderr.strip(),
)
continue
if branch:
subprocess.run(
["git", "branch", "-D", branch],

View file

@ -1011,3 +1011,138 @@ class TestSystemPromptInjection:
assert info["repo_root"] in wt_note
assert "isolated git worktree" in wt_note
assert "commit and push" in wt_note
class TestWorktreeLockReaping:
"""Exercise the REAL cli._prune_stale_worktrees lock/dirty/unpushed logic.
Unlike the reimplementation-based tests above, these import the actual
production functions so the behavior contract is enforced against the
shipped code:
- live-locked (owning pid running) -> never reaped, any age
- dead-locked clean (owning pid gone) -> unlocked + reaped (fixes the
accumulation bug: `git worktree remove --force` refuses a locked tree)
- dirty (uncommitted) at >72h -> preserved
- unpushed commits at any age -> preserved
- clean/unlocked stale -> reaped (aggressive cleanup intact)
"""
@staticmethod
def _age(path, hours):
import time
t = time.time() - (hours * 3600)
os.utime(path, (t, t))
@staticmethod
def _mk(cli, repo, name, pid=None, dirty=False, unpushed=False, age_h=100):
p = repo / ".worktrees" / name
(repo / ".worktrees").mkdir(exist_ok=True)
subprocess.run(
["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"],
cwd=repo, capture_output=True,
)
if pid is not None:
subprocess.run(
["git", "worktree", "lock", "--reason", f"hermes pid={pid}", str(p)],
cwd=repo, capture_output=True,
)
if unpushed:
(p / "work.txt").write_text("x")
subprocess.run(["git", "add", "work.txt"], cwd=p, capture_output=True)
subprocess.run(["git", "commit", "-m", "wip"], cwd=p, capture_output=True)
if dirty:
(p / "dirty.txt").write_text("uncommitted")
TestWorktreeLockReaping._age(p, age_h)
return p
def test_live_locked_survives_at_any_age(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-live", pid=os.getpid())
cli._prune_stale_worktrees(str(git_repo))
assert wt.exists(), "live-locked worktree (this pid) must never be reaped"
def test_dead_locked_clean_is_reaped(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-dead", pid=999999)
# sanity: this is the accumulation bug — remove --force alone can't do it
assert cli._worktree_lock_is_live(str(git_repo), str(wt)) == "dead"
cli._prune_stale_worktrees(str(git_repo))
assert not wt.exists(), "dead-locked clean worktree should be unlocked + reaped"
def test_dead_locked_dirty_survives(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-deaddirty", pid=999999, dirty=True)
cli._prune_stale_worktrees(str(git_repo))
assert wt.exists(), "dead-locked worktree with uncommitted work must survive"
def test_dead_locked_unpushed_survives(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-deadunp", pid=999999, unpushed=True)
cli._prune_stale_worktrees(str(git_repo))
assert wt.exists(), "dead-locked worktree with unpushed commits must survive"
def test_unlocked_clean_stale_is_reaped(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-nolock", pid=None)
cli._prune_stale_worktrees(str(git_repo))
assert not wt.exists(), "clean unlocked stale worktree should be reaped"
def test_dirty_survives_over_72h(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-dirty72", pid=None, dirty=True, age_h=100)
cli._prune_stale_worktrees(str(git_repo))
assert wt.exists(), "dirty worktree must survive even past the 72h tier"
def test_recent_worktree_untouched(self, git_repo):
import cli
wt = self._mk(cli, git_repo, "hermes-fresh", pid=None, age_h=1)
cli._prune_stale_worktrees(str(git_repo))
assert wt.exists(), "worktree under 24h must never be pruned"
class TestWorktreeLockPredicate:
"""_worktree_lock_is_live classification (real cli helper)."""
def _mk_locked(self, repo, name, reason):
p = repo / ".worktrees" / name
(repo / ".worktrees").mkdir(exist_ok=True)
subprocess.run(
["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"],
cwd=repo, capture_output=True,
)
subprocess.run(
["git", "worktree", "lock", "--reason", reason, str(p)],
cwd=repo, capture_output=True,
)
return p
def test_unlocked_returns_none(self, git_repo):
import cli
p = git_repo / ".worktrees" / "hermes-x"
(git_repo / ".worktrees").mkdir(exist_ok=True)
subprocess.run(
["git", "worktree", "add", str(p), "-b", "hermes/hermes-x", "HEAD"],
cwd=git_repo, capture_output=True,
)
assert cli._worktree_lock_is_live(str(git_repo), str(p)) is None
def test_live_pid_returns_live(self, git_repo):
import cli
p = self._mk_locked(git_repo, "hermes-live", f"hermes pid={os.getpid()}")
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "live"
def test_dead_pid_returns_dead(self, git_repo):
import cli
p = self._mk_locked(git_repo, "hermes-dead", "hermes pid=999999")
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead"
def test_foreign_lock_reason_returns_dead(self, git_repo):
import cli
p = self._mk_locked(git_repo, "hermes-foreign", "some other tool")
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead"
def test_bad_repo_root_fails_safe_to_live(self, tmp_path):
import cli
# Not a git repo -> git query fails -> must report "live" (never delete)
assert cli._worktree_lock_is_live(str(tmp_path), str(tmp_path / "x")) == "live"