fix(update): prevent and self-heal half-updated venvs on Windows (#57659)

Root-causes the July 2026 Windows incident chain (locked _brotlicffi.pyd /
_sodium.pyd during install, then 'No module named annotated_doc' with
'hermes update' insisting 'Already up to date!'):

- hermes update: probe venv core imports even when the checkout is current;
  a half-updated venv (dep sync killed mid-flight by a locked .pyd) is now
  detected and repaired instead of being reported as up to date
- hermes update (Windows): after pausing gateways, refuse to mutate the venv
  while other processes run from the venv interpreter (the Desktop backend
  runs as python.exe so the hermes.exe shim guard never saw it); --force
  keeps the old behavior
- install.ps1 venv stage: disarm gateway autostart Scheduled Tasks before
  the kill sweep (they respawn the gateway inside the kill->delete window),
  make the sweep a bounded loop requiring 3 clean passes, and rename-then-
  delete the old venv (a rename succeeds even with mapped DLLs) with stale-
  dir cleanup on the next run
- desktop updater: 'venv shim still locked after 15s' now ABORTS the update
  hand-off (restarting our backend, surfacing the holder to the user)
  instead of 'proceeding anyway (force)' into guaranteed venv corruption;
  the unlock wait also re-kills respawned backends each poll tick
This commit is contained in:
Teknium 2026-07-03 03:24:08 -07:00 committed by GitHub
parent 741bd9ba42
commit b14d75f8af
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 507 additions and 20 deletions

View file

@ -2168,9 +2168,25 @@ async function releaseBackendLock(updateRoot, tag) {
rememberLog(`[${tag}] venv shim unlocked; safe to proceed`)
return { unlocked: true }
}
// A supervised backend can respawn between kill and check (grandchildren,
// pool entries registered mid-teardown). Re-collect and re-kill each pass
// instead of trusting the initial sweep.
const stragglers = []
if (hermesProcess && Number.isInteger(hermesProcess.pid)) stragglers.push(hermesProcess.pid)
for (const entry of backendPool.values()) {
if (entry.process && Number.isInteger(entry.process.pid)) stragglers.push(entry.process.pid)
}
for (const pid of stragglers) forceKillProcessTree(pid)
await new Promise(r => setTimeout(r, 300))
}
rememberLog(`[${tag}] venv shim still locked after 15s; proceeding anyway (force)`)
// Do NOT proceed past a held lock: handing off to the updater while another
// process (a second desktop window, a user terminal, an unkillable child)
// still maps the venv's files guarantees a half-updated venv — the updater's
// dependency sync dies on access-denied partway through uninstalls, leaving
// imports broken (the July 2026 brotlicffi/_sodium.pyd incidents). Failing
// the update loudly and keeping the app running is strictly better than a
// bricked install that needs manual venv surgery.
rememberLog(`[${tag}] venv shim still locked after 15s; aborting hand-off (something outside this app holds the venv)`)
return { unlocked: false }
}
@ -2250,7 +2266,20 @@ async function applyUpdates(opts = {}) {
// spawn the updater. Without this the updater races a still-locked
// hermes.exe (held by the backend child / its grandchildren) and the update
// bricks. See releaseBackendLockForUpdate for the full failure analysis.
await releaseBackendLockForUpdate(updateRoot)
const lock = await releaseBackendLockForUpdate(updateRoot)
if (!lock.unlocked) {
// Something OUTSIDE this app holds the venv (a second window, a user
// terminal running hermes, an unkillable child). Handing off anyway
// guarantees a half-updated venv — abort loudly instead and let the
// user close the holder and retry. Restart our own backend so the app
// keeps working after the failed attempt.
const message =
'Update aborted: another process is holding the Hermes install open ' +
'(a second Hermes window or a terminal running hermes?). Close it and retry.'
emitUpdateProgress({ stage: 'error', message, percent: null })
startHermes().catch(() => {})
return { ok: false, error: message }
}
// Detached so the updater outlives this process — it needs us GONE before
// `hermes update` will run (the venv shim is locked while we live).

View file

@ -8776,6 +8776,159 @@ def _wait_for_windows_update_gateway_exit(
return survivors
def _venv_core_imports_healthy() -> tuple[bool, str]:
"""Probe the project venv for the core imports the backend needs to boot.
Runs a tiny import check inside the venv interpreter (NOT this process
``hermes update`` may be driven by a different Python). Catches the
half-updated-venv state: git checkout current but a dependency sync that
failed or was killed partway (e.g. Windows access-denied on a loaded
.pyd), leaving imports like ``fastapi``'s new transitive deps missing.
Without this probe, ``hermes update`` on a current checkout prints
"Already up to date!" and returns without ever re-syncing dependencies
the user's install stays broken no matter how many times they update
(ryanc's incident, July 2026).
Returns ``(healthy, detail)``. Never raises; unknown states report
healthy so a probe failure can't force needless reinstalls.
"""
venv_dir = PROJECT_ROOT / "venv"
python_name = "python.exe" if _is_windows() else "python"
bin_dir = "Scripts" if _is_windows() else "bin"
venv_python = venv_dir / bin_dir / python_name
if not venv_python.exists():
return True, ""
# Core web/serve imports plus their newest transitive deps. Import (not
# just metadata) — a package can have intact dist-info but a missing
# module after an interrupted uninstall/install cycle.
check = (
"import importlib\n"
"mods = ['fastapi', 'uvicorn', 'pydantic', 'openai', 'yaml']\n"
"missing = []\n"
"for m in mods:\n"
" try: importlib.import_module(m)\n"
" except Exception as e: missing.append(f'{m}: {e}')\n"
"print('\\n'.join(missing))\n"
)
try:
result = subprocess.run(
[str(venv_python), "-c", check],
capture_output=True,
text=True,
timeout=60,
cwd=PROJECT_ROOT,
)
except Exception as exc:
logger.debug("venv health probe failed to run: %s", exc)
return True, ""
missing = [line.strip() for line in (result.stdout or "").splitlines() if line.strip()]
if result.returncode != 0 and not missing:
# Interpreter itself is broken (e.g. deleted stdlib) — that IS unhealthy.
detail = (result.stderr or "").strip().splitlines()
return False, detail[0] if detail else "venv python failed to run"
if missing:
return False, "; ".join(missing[:4])
return True, ""
def _detect_venv_python_processes(
*, exclude_pids: set[int] | None = None
) -> list[tuple[int, str, str]]:
"""Find live processes running from the project venv's interpreter.
The hermes.exe shim guard misses the biggest lock-holder class on
Windows: the Desktop app's backend (``python.exe -m hermes_cli.main
serve``) and anything else running straight off ``venv\\Scripts\\python
(w).exe``. Those processes keep native ``.pyd`` extensions mapped, so a
dependency sync mid-update dies with access-denied and strands the venv
half-updated (ryanc's brotlicffi/_sodium.pyd incidents, July 2026).
Killing them from here is pointless the Desktop app supervises its
backend and respawns it within seconds so the caller should refuse and
tell the user to close the app instead. Returns ``(pid, name, cmdline)``
tuples; empty off-Windows / without psutil / when nothing matches. The
calling process and its ancestors are always excluded (a CLI ``hermes
update`` itself runs from the venv python). Never raises.
"""
if not _is_windows():
return []
try:
import psutil
except Exception:
return []
venv_dir = PROJECT_ROOT / "venv"
try:
venv_prefix = str(venv_dir.resolve()).lower().rstrip(os.sep) + os.sep
except OSError:
venv_prefix = str(venv_dir).lower().rstrip(os.sep) + os.sep
skip: set[int] = set(exclude_pids or set())
skip.add(os.getpid())
try:
for anc in psutil.Process().parents():
skip.add(int(anc.pid))
except Exception:
pass
matches: list[tuple[int, str, str]] = []
try:
proc_iter = psutil.process_iter(["pid", "exe", "name", "cmdline"])
except Exception:
return []
for proc in proc_iter:
try:
info = proc.info
except Exception:
continue
pid = info.get("pid")
exe = info.get("exe")
if not exe or pid is None or int(pid) in skip:
continue
try:
exe_norm = str(Path(exe).resolve()).lower()
except (OSError, ValueError):
exe_norm = str(exe).lower()
if not exe_norm.startswith(venv_prefix):
continue
name = info.get("name") or Path(exe).name
cmdline = " ".join(info.get("cmdline") or [])[:120]
matches.append((int(pid), str(name), cmdline))
return matches
def _format_venv_python_holders_message(matches: list[tuple[int, str, str]]) -> str:
"""Explain which venv processes block the update and how to clear them."""
lines = [
"✗ Other Hermes processes are running from this install's venv:",
]
for pid, name, cmdline in matches[:6]:
hint = ""
low = cmdline.lower()
if "serve" in low or "dashboard" in low:
hint = " ← Hermes Desktop backend (close the desktop app)"
elif "gateway" in low:
hint = " ← gateway"
lines.append(f" PID {pid} {name} {cmdline}{hint}")
if len(matches) > 6:
lines.append(f" ... and {len(matches) - 6} more")
lines.append("")
lines.append(
" On Windows these keep native extension files (.pyd) locked, so the"
)
lines.append(
" dependency update would fail partway and leave a broken install."
)
lines.append(
" Close the Hermes desktop app / other Hermes terminals, then re-run:"
)
lines.append(" hermes update")
lines.append(" (or use `hermes update --force` to proceed anyway at your own risk)")
return "\n".join(lines)
def _pause_windows_gateways_for_update() -> dict | None:
"""Stop running Windows gateways before mutating the checkout or venv.
@ -9235,6 +9388,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
_windows_gateway_resume,
)
# With gateways paused, anything still running from the venv interpreter
# (most commonly the Desktop app's `hermes serve` backend) will keep .pyd
# files locked and corrupt the dependency sync below. Refuse rather than
# race: killing the desktop backend is futile (the app supervises and
# respawns it), so the user must close the app. --force preserves the old
# behavior for users who know what they're doing.
if _is_windows() and not getattr(args, "force", False):
_venv_holders = _detect_venv_python_processes()
if _venv_holders:
print(_format_venv_python_holders_message(_venv_holders))
_resume_windows_gateways_after_update(_windows_gateway_resume)
sys.exit(2)
# Try git-based update first, fall back to ZIP download on Windows
# when git file I/O is broken (antivirus, NTFS filter drivers, etc.)
use_zip_update = False
@ -9436,7 +9602,41 @@ def _cmd_update_impl(args, gateway_mode: bool):
text=True,
check=False,
)
print("✓ Already up to date!")
# A current checkout does NOT imply a healthy install: a previous
# dependency sync may have failed partway (classic on Windows,
# where a running gateway/desktop backend keeps .pyd files locked
# and uv/pip dies with access-denied, stranding the venv between
# versions). Probe the venv's core imports and repair if broken —
# otherwise "Already up to date!" gaslights the user while their
# install stays bricked.
healthy, detail = _venv_core_imports_healthy()
if not healthy:
print("⚠ Checkout is current, but the venv is unhealthy:")
print(f" {detail}")
print("→ Repairing Python dependencies...")
_write_update_incomplete_marker()
from hermes_cli.managed_uv import ensure_uv
repair_uv = ensure_uv()
if repair_uv:
repair_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
_install_python_dependencies_with_optional_fallback(
[repair_uv, "pip"], env=repair_env, group="all"
)
else:
_install_python_dependencies_with_optional_fallback(
[sys.executable, "-m", "pip"], group="all"
)
_clear_update_incomplete_marker()
healthy_after, detail_after = _venv_core_imports_healthy()
if healthy_after:
print("✓ Dependencies repaired!")
else:
print(f"⚠ Venv still unhealthy after repair: {detail_after}")
print(" Close all Hermes windows/gateways and re-run: hermes update")
else:
print("✓ Already up to date!")
_resume_windows_gateways_after_update(_windows_gateway_resume)
return

View file

@ -1614,6 +1614,26 @@ function Install-Venv {
if ($env:OS -eq "Windows_NT") {
$myPid = $PID
Write-Info "Stopping any running hermes processes before recreating venv..."
# Disarm respawners FIRST: the gateway autostart Scheduled Task and
# the Startup-folder entry both relaunch a killed gateway within
# seconds, and losing that race re-locks the venv's .pyd files
# between our kill sweep and Remove-Item (the July 2026
# _brotlicffi.pyd incident). schtasks /End stops a running task
# instance; /Change /DISABLE stops it from re-firing mid-install.
# Re-enabled after the venv is recreated (below). Best-effort: a
# missing task just errors quietly.
$gatewayTasksDisabled = @()
try {
schtasks /Query /FO CSV 2>$null | ConvertFrom-Csv | Where-Object { $_.TaskName -like '*Hermes_Gateway*' } | ForEach-Object {
$tn = $_.TaskName
schtasks /End /TN $tn 2>$null | Out-Null
schtasks /Change /TN $tn /DISABLE 2>$null | Out-Null
$gatewayTasksDisabled += $tn
Write-Info " disabled gateway autostart task $tn for the duration of the install"
}
} catch {
Write-Warn "Could not enumerate gateway scheduled tasks: $($_.Exception.Message)"
}
# The launcher CLI (hermes.exe) plus its child tree.
& taskkill /F /T /IM hermes.exe /FI "PID ne $myPid" 2>$null | Out-Null
# taskkill /IM hermes.exe is NOT enough: the gateway/agent that a
@ -1632,27 +1652,68 @@ function Install-Venv {
# ExecutablePath for a process it cannot inspect (a different session)
# instead of throwing, so an unreadable process is skipped rather than
# aborting the whole sweep.
#
# The sweep is a bounded LOOP, not single-shot: supervised processes
# (the Desktop app's backend, a watchdog-managed gateway) respawn in
# the window between one kill pass and the delete. Each pass re-
# enumerates; three consecutive clean passes (or the attempt cap)
# ends the loop.
$venvPrefix = [System.IO.Path]::GetFullPath((Join-Path $InstallDir "venv")).TrimEnd('\') + '\'
try {
Get-CimInstance Win32_Process -ErrorAction Stop |
Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } |
ForEach-Object {
Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv"
Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue
}
} catch {
Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)"
$cleanPasses = 0
for ($sweep = 0; $sweep -lt 10 -and $cleanPasses -lt 3; $sweep++) {
$found = 0
try {
Get-CimInstance Win32_Process -ErrorAction Stop |
Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } |
ForEach-Object {
$found++
Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv"
Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue
}
} catch {
Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)"
break
}
if ($found -eq 0) { $cleanPasses++ } else { $cleanPasses = 0 }
Start-Sleep -Milliseconds 400
}
Start-Sleep -Milliseconds 800
}
Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue
# A killed process can take a moment to release its file handles, so a
# first Remove-Item may still hit a locked .pyd. Retry once after a short
# pause before giving up and letting the stage fail loudly.
if (Test-Path "venv") {
Start-Sleep -Seconds 2
Remove-Item -Recurse -Force "venv"
# Rename-then-delete: on Windows a directory RENAME succeeds even while
# files inside it are mapped as DLLs (only in-place delete/replace of
# the mapped file is denied, and only same-volume renames are atomic
# moves). Moving the old venv aside means `uv venv` can create a fresh
# one immediately even if some straggler still holds a .pyd from the
# old tree; the renamed dir is deleted best-effort (now, and by the
# cleanup pass below on the NEXT install if a handle outlives this one).
$staleName = "venv.stale.{0}" -f (Get-Date -Format "yyyyMMddHHmmss")
$renamed = $false
try {
Rename-Item -Path "venv" -NewName $staleName -ErrorAction Stop
$renamed = $true
} catch {
Write-Warn "Could not rename venv aside ($($_.Exception.Message)); falling back to in-place delete"
}
if ($renamed) {
Remove-Item -Recurse -Force $staleName -ErrorAction SilentlyContinue
if (Test-Path $staleName) {
Write-Warn "Old venv parked at $staleName (a process still holds files in it); it will be cleaned up on the next install"
}
} else {
Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue
# A killed process can take a moment to release its file handles, so a
# first Remove-Item may still hit a locked .pyd. Retry once after a short
# pause before giving up and letting the stage fail loudly.
if (Test-Path "venv") {
Start-Sleep -Seconds 2
Remove-Item -Recurse -Force "venv"
}
}
}
# Clean up parked venvs from previous installs whose handles have since
# been released. Best-effort — a still-held tree just stays for next time.
Get-ChildItem -Directory -Filter "venv.stale.*" -ErrorAction SilentlyContinue | ForEach-Object {
Remove-Item -Recurse -Force $_.FullName -ErrorAction SilentlyContinue
}
# uv creates the venv and pins the Python version in one step. uv emits
@ -1684,6 +1745,18 @@ function Install-Venv {
Pop-Location
# Re-arm the gateway autostart tasks disabled during the venv teardown.
# Same function scope, so the list survives even under the stage-per-
# process bootstrap. Deliberately NOT started here — dependencies aren't
# installed yet; the task fires normally on next logon and `hermes update`
# / the gateway resume path handles the immediate restart.
if ($gatewayTasksDisabled -and $gatewayTasksDisabled.Count -gt 0) {
foreach ($tn in $gatewayTasksDisabled) {
schtasks /Change /TN $tn /ENABLE 2>$null | Out-Null
}
Write-Info "Re-enabled gateway autostart task(s): $($gatewayTasksDisabled -join ', ')"
}
Write-Success "Virtual environment ready (Python $PythonVersion)"
}

View file

@ -0,0 +1,185 @@
"""Tests for the Windows half-updated-venv hardening (July 2026 incident).
Covers three additions to ``hermes update``:
1. ``_venv_core_imports_healthy`` the venv health probe that lets an
"Already up to date" checkout still repair a broken dependency install.
2. ``_detect_venv_python_processes`` the venv-interpreter process guard
that refuses to mutate the venv while a desktop backend / stray python
holds .pyd files mapped.
3. The commit_count == 0 repair branch wiring in ``_cmd_update_impl``.
All Windows-specific paths are exercised via ``_is_windows`` patching so
they run on any host (same approach as test_update_concurrent_quarantine).
"""
from __future__ import annotations
import subprocess
import sys
import types
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import pytest
from hermes_cli import main as cli_main
# ---------------------------------------------------------------------------
# _venv_core_imports_healthy
# ---------------------------------------------------------------------------
def test_venv_health_reports_healthy_when_no_venv(tmp_path):
"""No venv python → nothing to probe → healthy (never blocks update)."""
with patch.object(cli_main, "PROJECT_ROOT", tmp_path):
healthy, detail = cli_main._venv_core_imports_healthy()
assert healthy is True
assert detail == ""
def _fake_venv_python(tmp_path, *, windows: bool = False):
bin_dir = tmp_path / "venv" / ("Scripts" if windows else "bin")
bin_dir.mkdir(parents=True)
py = bin_dir / ("python.exe" if windows else "python")
py.write_bytes(b"")
return py
def test_venv_health_reports_missing_imports(tmp_path):
"""Probe output lines are surfaced as the unhealthy detail."""
_fake_venv_python(tmp_path)
fake = SimpleNamespace(
returncode=0,
stdout="fastapi: No module named 'annotated_doc'\n",
stderr="",
)
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object(
cli_main.subprocess, "run", return_value=fake
):
healthy, detail = cli_main._venv_core_imports_healthy()
assert healthy is False
assert "annotated_doc" in detail
def test_venv_health_healthy_when_probe_clean(tmp_path):
_fake_venv_python(tmp_path)
fake = SimpleNamespace(returncode=0, stdout="", stderr="")
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object(
cli_main.subprocess, "run", return_value=fake
):
healthy, detail = cli_main._venv_core_imports_healthy()
assert healthy is True
def test_venv_health_broken_interpreter_is_unhealthy(tmp_path):
"""Nonzero exit with no module list = interpreter itself is broken."""
_fake_venv_python(tmp_path)
fake = SimpleNamespace(returncode=1, stdout="", stderr="Fatal Python error: init failed\n")
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object(
cli_main.subprocess, "run", return_value=fake
):
healthy, detail = cli_main._venv_core_imports_healthy()
assert healthy is False
assert "Fatal Python error" in detail
def test_venv_health_probe_failure_reports_healthy(tmp_path):
"""A probe that can't run must NOT force needless reinstalls."""
_fake_venv_python(tmp_path)
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object(
cli_main.subprocess,
"run",
side_effect=subprocess.TimeoutExpired(cmd="python", timeout=60),
):
healthy, _detail = cli_main._venv_core_imports_healthy()
assert healthy is True
# ---------------------------------------------------------------------------
# _detect_venv_python_processes
# ---------------------------------------------------------------------------
def _proc(pid: int, exe: str, name: str, cmdline: list[str] | None = None):
proc = MagicMock()
proc.info = {"pid": pid, "exe": exe, "name": name, "cmdline": cmdline or []}
return proc
def test_detect_venv_python_off_windows_is_empty():
with patch.object(cli_main, "_is_windows", return_value=False):
assert cli_main._detect_venv_python_processes() == []
@patch.object(cli_main, "_is_windows", return_value=True)
def test_detect_venv_python_finds_backend(_winp, tmp_path):
venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe")
other_py = "C:\\Python311\\python.exe"
me = MagicMock()
me.parents.return_value = []
fake_psutil = types.SimpleNamespace(
process_iter=lambda attrs: iter(
[
_proc(101, venv_py, "python.exe", ["python.exe", "-m", "hermes_cli.main", "serve"]),
_proc(102, other_py, "python.exe", ["python.exe", "somescript.py"]),
]
),
Process=lambda *a, **k: me,
)
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict(
sys.modules, {"psutil": fake_psutil}
):
matches = cli_main._detect_venv_python_processes()
assert [m[0] for m in matches] == [101]
assert "serve" in matches[0][2]
@patch.object(cli_main, "_is_windows", return_value=True)
def test_detect_venv_python_excludes_self_and_ancestors(_winp, tmp_path):
import os as _os
venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe")
parent = MagicMock()
parent.pid = 555
me = MagicMock()
me.parents.return_value = [parent]
fake_psutil = types.SimpleNamespace(
process_iter=lambda attrs: iter(
[
_proc(_os.getpid(), venv_py, "python.exe"),
_proc(555, venv_py, "hermes.exe"),
]
),
Process=lambda *a, **k: me,
)
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict(
sys.modules, {"psutil": fake_psutil}
):
assert cli_main._detect_venv_python_processes() == []
@patch.object(cli_main, "_is_windows", return_value=True)
def test_detect_venv_python_no_psutil_is_empty(_winp, tmp_path):
with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict(
sys.modules, {"psutil": None}
):
assert cli_main._detect_venv_python_processes() == []
def test_format_venv_holders_message_flags_desktop_backend(tmp_path):
matches = [
(101, "python.exe", "python.exe -m hermes_cli.main serve --host 127.0.0.1"),
(102, "pythonw.exe", "pythonw.exe -m hermes_cli.main gateway run"),
]
msg = cli_main._format_venv_python_holders_message(matches)
assert "101" in msg
assert "desktop app" in msg.lower()
assert "gateway" in msg
assert "hermes update" in msg
assert "--force" in msg