fix: protect cron output root from cleanup

Only classify files below cron/output/ as disposable cron output.
The cron/output directory itself is a durable container for retained
job history and should not be tracked or deleted wholesale.

Add regression coverage for both category detection and cleanup of a
stale tracked entry pointing at the output root.
This commit is contained in:
martinramos002-bot 2026-06-19 16:23:28 -05:00 committed by kshitij
parent 7f71a48a3a
commit d173e8c3a7
2 changed files with 37 additions and 4 deletions

View file

@ -166,9 +166,11 @@ def _is_protected_cron_path(p: Path) -> bool:
"""Return True if *p* is a cron control-plane file/directory that must
never be deleted.
This only matches the directory itself and known control-plane files
(``jobs.json``, ``.tick.lock``) it does NOT blanket-protect
everything under ``cron/`` because ``cron/output/`` is disposable.
This only matches the directory itself, known control-plane files
(``jobs.json``, ``.tick.lock``), and the ``output/`` root directory.
It does NOT blanket-protect files under ``cron/output/`` because those
run artifacts are disposable, but deleting the output root itself is too
broad and can erase every job's retained run history at once.
"""
# Lazily build the set once per process so HERMES_HOME is resolved
# exactly once.
@ -177,6 +179,7 @@ def _is_protected_cron_path(p: Path) -> bool:
for parent in ("cron", "cronjobs"):
base = hermes_home / parent
_PROTECTED_CRON_PATHS.add(str(base))
_PROTECTED_CRON_PATHS.add(str(base / "output"))
_PROTECTED_CRON_PATHS.add(str(base / "jobs.json"))
_PROTECTED_CRON_PATHS.add(str(base / ".tick.lock"))
resolved = str(p.resolve())
@ -566,7 +569,7 @@ def guess_category(path: Path) -> Optional[str]:
# (e.g. ``jobs.json``, ``.tick.lock``) must never be
# auto-tracked — deleting it wipes the live scheduler
# registry. See issue #32164.
if len(rel.parts) >= 2 and rel.parts[1] == "output":
if len(rel.parts) >= 3 and rel.parts[1] == "output":
return "cron-output"
return None
if top == "cache":

View file

@ -136,6 +136,13 @@ class TestGuessCategory:
p.write_text("x")
assert dg.guess_category(p) == "cron-output"
def test_cron_output_root_not_tracked(self, _isolate_env):
"""The cron/output root is durable container state, not an artifact."""
dg = _load_lib()
output_root = _isolate_env / "cron" / "output"
output_root.mkdir(parents=True)
assert dg.guess_category(output_root) is None
def test_cron_jobs_json_not_tracked(self, _isolate_env):
"""Regression for #32164: the cron registry must never be tracked."""
dg = _load_lib()
@ -228,6 +235,29 @@ class TestStaleCronEntryMigration:
assert summary["deleted"] == 0, "cron/ dir must not be deleted"
assert cron_dir.exists()
def test_quick_skips_stale_cron_output_for_output_root(self, _isolate_env):
"""Stale entry for cron/output itself must not delete all job output."""
dg = _load_lib()
output_root = _isolate_env / "cron" / "output"
job_dir = output_root / "job_1"
job_dir.mkdir(parents=True)
run_md = job_dir / "run.md"
run_md.write_text("x")
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(output_root),
"category": "cron-output",
"timestamp": "2025-01-01T00:00:00+00:00",
"size": 0,
}]))
summary = dg.quick()
assert summary["deleted"] == 0, "cron/output root must not be deleted"
assert output_root.exists()
assert run_md.exists()
def test_quick_skips_protected_cron_paths_defense_in_depth(self, _isolate_env):
"""Defense-in-depth: even if guess_category returned cron-output
(hypothetically), protected cron paths are never deleted."""