fix(cli): reap dead-locked worktrees so .worktrees/ can't grow unbounded (#56288)
hermes -w locks each worktree (reason 'hermes pid=<pid>'). git worktree remove --force (single -f) refuses a locked tree, so a crashed session's lock was never released and its worktree accumulated forever — a real contributor to .worktrees/ bloat. _prune_stale_worktrees now classifies each lock via _worktree_lock_is_live: a live-owner pid is skipped at any age; a dead-owner (or foreign) lock is unlocked first so the aggressive age-based cleanup can actually reap it. The >72h reap tier is kept (that cleanup is intentional) but now guarded so dirty/unpushed work is preserved, and branch deletion is gated on git worktree remove succeeding. New fail-safe helpers _worktree_is_dirty and _worktree_lock_is_live (pid liveness via gateway.status._pid_exists, Windows-safe).
This commit is contained in:
parent
5c2dccd06f
commit
74809b4e94
2 changed files with 269 additions and 6 deletions
140
cli.py
140
cli.py
|
|
@ -1529,6 +1529,89 @@ def _worktree_has_unpushed_commits(worktree_path: str, timeout: int = 10) -> boo
|
|||
return True
|
||||
|
||||
|
||||
def _worktree_is_dirty(worktree_path: str, timeout: int = 10) -> bool:
|
||||
"""Return whether a worktree has uncommitted changes (staged, unstaged, or
|
||||
untracked).
|
||||
|
||||
Fails SAFE: on any error returns True so callers do not delete a worktree
|
||||
whose state they cannot determine.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "status", "--porcelain"],
|
||||
capture_output=True, text=True, timeout=timeout, cwd=worktree_path,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
return True
|
||||
return bool(result.stdout.strip())
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
|
||||
def _worktree_lock_is_live(repo_root: str, worktree_path: str, timeout: int = 10):
|
||||
"""Classify a worktree's git lock as live, dead, or absent.
|
||||
|
||||
``hermes -w`` locks each worktree with reason ``hermes pid=<pid>`` so a
|
||||
concurrent hermes process' startup prune leaves an in-use worktree alone.
|
||||
But a *crashed* session leaves the lock behind forever, and
|
||||
``git worktree remove --force`` (single ``-f``) refuses to remove a locked
|
||||
worktree — so dead-locked worktrees accumulate indefinitely. This lets the
|
||||
pruner tell the two apart:
|
||||
|
||||
- ``"live"`` — locked and the owning pid is still running (skip it).
|
||||
- ``"dead"`` — locked but the owning pid is gone, or the reason isn't a
|
||||
parseable hermes lock (safe to unlock + reap).
|
||||
- ``None`` — not locked at all.
|
||||
|
||||
Fails SAFE toward ``"live"``: if git can't be queried at all we cannot
|
||||
prove the worktree is safe to touch, so we report it as live.
|
||||
"""
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["git", "worktree", "list", "--porcelain"],
|
||||
capture_output=True, text=True, timeout=timeout, cwd=repo_root,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
return "live"
|
||||
except Exception:
|
||||
return "live"
|
||||
|
||||
target = Path(worktree_path).resolve()
|
||||
current: Optional[Path] = None
|
||||
for line in result.stdout.splitlines():
|
||||
if line.startswith("worktree "):
|
||||
try:
|
||||
current = Path(line[len("worktree "):].strip()).resolve()
|
||||
except Exception:
|
||||
current = None
|
||||
elif line == "locked" or line.startswith("locked "):
|
||||
if current != target:
|
||||
continue
|
||||
reason = line[len("locked"):].strip()
|
||||
m = re.search(r"hermes pid=(\d+)", reason)
|
||||
if not m:
|
||||
# Locked by something we don't recognize as a hermes session
|
||||
# (or lock reason unavailable). Treat as dead — a foreign lock
|
||||
# on a hermes -w worktree is almost certainly a leftover, and
|
||||
# the age/dirty/unpushed gates already ran before we got here.
|
||||
return "dead"
|
||||
pid = int(m.group(1))
|
||||
if pid == os.getpid():
|
||||
return "live"
|
||||
try:
|
||||
from gateway.status import _pid_exists
|
||||
return "live" if _pid_exists(pid) else "dead"
|
||||
except Exception:
|
||||
# Can't determine liveness — fail safe toward keeping it.
|
||||
return "live"
|
||||
return None
|
||||
|
||||
|
||||
def _cleanup_worktree(info: Dict[str, str] = None) -> None:
|
||||
"""Remove a worktree and its branch on exit.
|
||||
|
||||
|
|
@ -1673,11 +1756,23 @@ def _run_checkpoint_auto_maintenance() -> None:
|
|||
def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
|
||||
"""Remove stale worktrees and orphaned branches on startup.
|
||||
|
||||
Age-based tiers:
|
||||
Age-based tiers (aggressive cleanup keeps ``.worktrees/`` from growing
|
||||
unbounded):
|
||||
- Under max_age_hours (24h): skip — session may still be active.
|
||||
- 24h–72h: remove if no unpushed commits.
|
||||
- Over 72h: force remove regardless (nothing should sit this long).
|
||||
|
||||
Lock handling (orthogonal to age): ``hermes -w`` locks each worktree with
|
||||
reason ``hermes pid=<pid>`` so a concurrent hermes process leaves an in-use
|
||||
worktree alone. A *live*-locked worktree is skipped at any age; a
|
||||
*dead*-locked one (owning pid gone — a crashed session) is unlocked first
|
||||
so ``git worktree remove --force`` can actually reap it, otherwise those
|
||||
leftovers accumulate forever (``remove --force`` refuses a locked tree).
|
||||
|
||||
Branch deletion is gated on ``git worktree remove`` succeeding, so a failed
|
||||
removal never orphans the branch (which would drop easy reachability of any
|
||||
commits still in the worktree).
|
||||
|
||||
Also prunes orphaned ``hermes/*`` and ``pr-*`` local branches that
|
||||
have no corresponding worktree.
|
||||
"""
|
||||
|
|
@ -1705,12 +1800,37 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
|
|||
except Exception:
|
||||
continue
|
||||
|
||||
force = mtime <= hard_cutoff # Over 72h — force remove
|
||||
force = mtime <= hard_cutoff # Over 72h — reap aggressively
|
||||
|
||||
# Never delete real work, regardless of age. Unpushed commits and
|
||||
# uncommitted changes may be a crashed session's in-flight work; the
|
||||
# >72h tier reaps only abandoned *clean, fully-pushed* worktrees (the
|
||||
# scratch trees that actually cause .worktrees/ bloat).
|
||||
if _worktree_has_unpushed_commits(str(entry), timeout=5):
|
||||
continue # Has unpushed commits or can't check — skip
|
||||
if not force:
|
||||
# 24h–72h tier: only remove if no unpushed commits
|
||||
if _worktree_has_unpushed_commits(str(entry), timeout=5):
|
||||
continue # Has unpushed commits or can't check — skip
|
||||
# 24h–72h tier is conservative: unpushed check above is enough.
|
||||
pass
|
||||
elif _worktree_is_dirty(str(entry), timeout=5):
|
||||
continue # >72h but dirty — preserve uncommitted work
|
||||
|
||||
# Respect git-native session locks. A lock owned by a still-running
|
||||
# hermes process means the worktree is actively in use — never touch
|
||||
# it. A lock whose owning pid is gone is a crashed session's leftover:
|
||||
# unlock it so `git worktree remove --force` (single -f) can reap it,
|
||||
# otherwise dead-locked worktrees pile up indefinitely.
|
||||
lock_state = _worktree_lock_is_live(repo_root, str(entry), timeout=5)
|
||||
if lock_state == "live":
|
||||
logger.debug("Skipping live-locked worktree: %s", entry.name)
|
||||
continue
|
||||
if lock_state == "dead":
|
||||
try:
|
||||
subprocess.run(
|
||||
["git", "worktree", "unlock", str(entry)],
|
||||
capture_output=True, text=True, timeout=10, cwd=repo_root,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Failed to unlock dead worktree %s: %s", entry.name, e)
|
||||
|
||||
# Safe to remove
|
||||
try:
|
||||
|
|
@ -1720,10 +1840,18 @@ def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
|
|||
)
|
||||
branch = branch_result.stdout.strip()
|
||||
|
||||
subprocess.run(
|
||||
remove_result = subprocess.run(
|
||||
["git", "worktree", "remove", str(entry), "--force"],
|
||||
capture_output=True, text=True, timeout=15, cwd=repo_root,
|
||||
)
|
||||
if remove_result.returncode != 0:
|
||||
# Removal failed — keep the branch so any commits stay
|
||||
# reachable rather than orphaning it.
|
||||
logger.debug(
|
||||
"Failed to remove worktree %s: %s",
|
||||
entry.name, remove_result.stderr.strip(),
|
||||
)
|
||||
continue
|
||||
if branch:
|
||||
subprocess.run(
|
||||
["git", "branch", "-D", branch],
|
||||
|
|
|
|||
|
|
@ -1011,3 +1011,138 @@ class TestSystemPromptInjection:
|
|||
assert info["repo_root"] in wt_note
|
||||
assert "isolated git worktree" in wt_note
|
||||
assert "commit and push" in wt_note
|
||||
|
||||
|
||||
class TestWorktreeLockReaping:
|
||||
"""Exercise the REAL cli._prune_stale_worktrees lock/dirty/unpushed logic.
|
||||
|
||||
Unlike the reimplementation-based tests above, these import the actual
|
||||
production functions so the behavior contract is enforced against the
|
||||
shipped code:
|
||||
|
||||
- live-locked (owning pid running) -> never reaped, any age
|
||||
- dead-locked clean (owning pid gone) -> unlocked + reaped (fixes the
|
||||
accumulation bug: `git worktree remove --force` refuses a locked tree)
|
||||
- dirty (uncommitted) at >72h -> preserved
|
||||
- unpushed commits at any age -> preserved
|
||||
- clean/unlocked stale -> reaped (aggressive cleanup intact)
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _age(path, hours):
|
||||
import time
|
||||
t = time.time() - (hours * 3600)
|
||||
os.utime(path, (t, t))
|
||||
|
||||
@staticmethod
|
||||
def _mk(cli, repo, name, pid=None, dirty=False, unpushed=False, age_h=100):
|
||||
p = repo / ".worktrees" / name
|
||||
(repo / ".worktrees").mkdir(exist_ok=True)
|
||||
subprocess.run(
|
||||
["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"],
|
||||
cwd=repo, capture_output=True,
|
||||
)
|
||||
if pid is not None:
|
||||
subprocess.run(
|
||||
["git", "worktree", "lock", "--reason", f"hermes pid={pid}", str(p)],
|
||||
cwd=repo, capture_output=True,
|
||||
)
|
||||
if unpushed:
|
||||
(p / "work.txt").write_text("x")
|
||||
subprocess.run(["git", "add", "work.txt"], cwd=p, capture_output=True)
|
||||
subprocess.run(["git", "commit", "-m", "wip"], cwd=p, capture_output=True)
|
||||
if dirty:
|
||||
(p / "dirty.txt").write_text("uncommitted")
|
||||
TestWorktreeLockReaping._age(p, age_h)
|
||||
return p
|
||||
|
||||
def test_live_locked_survives_at_any_age(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-live", pid=os.getpid())
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert wt.exists(), "live-locked worktree (this pid) must never be reaped"
|
||||
|
||||
def test_dead_locked_clean_is_reaped(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-dead", pid=999999)
|
||||
# sanity: this is the accumulation bug — remove --force alone can't do it
|
||||
assert cli._worktree_lock_is_live(str(git_repo), str(wt)) == "dead"
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert not wt.exists(), "dead-locked clean worktree should be unlocked + reaped"
|
||||
|
||||
def test_dead_locked_dirty_survives(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-deaddirty", pid=999999, dirty=True)
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert wt.exists(), "dead-locked worktree with uncommitted work must survive"
|
||||
|
||||
def test_dead_locked_unpushed_survives(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-deadunp", pid=999999, unpushed=True)
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert wt.exists(), "dead-locked worktree with unpushed commits must survive"
|
||||
|
||||
def test_unlocked_clean_stale_is_reaped(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-nolock", pid=None)
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert not wt.exists(), "clean unlocked stale worktree should be reaped"
|
||||
|
||||
def test_dirty_survives_over_72h(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-dirty72", pid=None, dirty=True, age_h=100)
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert wt.exists(), "dirty worktree must survive even past the 72h tier"
|
||||
|
||||
def test_recent_worktree_untouched(self, git_repo):
|
||||
import cli
|
||||
wt = self._mk(cli, git_repo, "hermes-fresh", pid=None, age_h=1)
|
||||
cli._prune_stale_worktrees(str(git_repo))
|
||||
assert wt.exists(), "worktree under 24h must never be pruned"
|
||||
|
||||
|
||||
class TestWorktreeLockPredicate:
|
||||
"""_worktree_lock_is_live classification (real cli helper)."""
|
||||
|
||||
def _mk_locked(self, repo, name, reason):
|
||||
p = repo / ".worktrees" / name
|
||||
(repo / ".worktrees").mkdir(exist_ok=True)
|
||||
subprocess.run(
|
||||
["git", "worktree", "add", str(p), "-b", f"hermes/{name}", "HEAD"],
|
||||
cwd=repo, capture_output=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "worktree", "lock", "--reason", reason, str(p)],
|
||||
cwd=repo, capture_output=True,
|
||||
)
|
||||
return p
|
||||
|
||||
def test_unlocked_returns_none(self, git_repo):
|
||||
import cli
|
||||
p = git_repo / ".worktrees" / "hermes-x"
|
||||
(git_repo / ".worktrees").mkdir(exist_ok=True)
|
||||
subprocess.run(
|
||||
["git", "worktree", "add", str(p), "-b", "hermes/hermes-x", "HEAD"],
|
||||
cwd=git_repo, capture_output=True,
|
||||
)
|
||||
assert cli._worktree_lock_is_live(str(git_repo), str(p)) is None
|
||||
|
||||
def test_live_pid_returns_live(self, git_repo):
|
||||
import cli
|
||||
p = self._mk_locked(git_repo, "hermes-live", f"hermes pid={os.getpid()}")
|
||||
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "live"
|
||||
|
||||
def test_dead_pid_returns_dead(self, git_repo):
|
||||
import cli
|
||||
p = self._mk_locked(git_repo, "hermes-dead", "hermes pid=999999")
|
||||
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead"
|
||||
|
||||
def test_foreign_lock_reason_returns_dead(self, git_repo):
|
||||
import cli
|
||||
p = self._mk_locked(git_repo, "hermes-foreign", "some other tool")
|
||||
assert cli._worktree_lock_is_live(str(git_repo), str(p)) == "dead"
|
||||
|
||||
def test_bad_repo_root_fails_safe_to_live(self, tmp_path):
|
||||
import cli
|
||||
# Not a git repo -> git query fails -> must report "live" (never delete)
|
||||
assert cli._worktree_lock_is_live(str(tmp_path), str(tmp_path / "x")) == "live"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue