diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index da642b33a..eac26c0af 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -2168,9 +2168,25 @@ async function releaseBackendLock(updateRoot, tag) { rememberLog(`[${tag}] venv shim unlocked; safe to proceed`) return { unlocked: true } } + // A supervised backend can respawn between kill and check (grandchildren, + // pool entries registered mid-teardown). Re-collect and re-kill each pass + // instead of trusting the initial sweep. + const stragglers = [] + if (hermesProcess && Number.isInteger(hermesProcess.pid)) stragglers.push(hermesProcess.pid) + for (const entry of backendPool.values()) { + if (entry.process && Number.isInteger(entry.process.pid)) stragglers.push(entry.process.pid) + } + for (const pid of stragglers) forceKillProcessTree(pid) await new Promise(r => setTimeout(r, 300)) } - rememberLog(`[${tag}] venv shim still locked after 15s; proceeding anyway (force)`) + // Do NOT proceed past a held lock: handing off to the updater while another + // process (a second desktop window, a user terminal, an unkillable child) + // still maps the venv's files guarantees a half-updated venv — the updater's + // dependency sync dies on access-denied partway through uninstalls, leaving + // imports broken (the July 2026 brotlicffi/_sodium.pyd incidents). Failing + // the update loudly and keeping the app running is strictly better than a + // bricked install that needs manual venv surgery. + rememberLog(`[${tag}] venv shim still locked after 15s; aborting hand-off (something outside this app holds the venv)`) return { unlocked: false } } @@ -2250,7 +2266,20 @@ async function applyUpdates(opts = {}) { // spawn the updater. Without this the updater races a still-locked // hermes.exe (held by the backend child / its grandchildren) and the update // bricks. See releaseBackendLockForUpdate for the full failure analysis. - await releaseBackendLockForUpdate(updateRoot) + const lock = await releaseBackendLockForUpdate(updateRoot) + if (!lock.unlocked) { + // Something OUTSIDE this app holds the venv (a second window, a user + // terminal running hermes, an unkillable child). Handing off anyway + // guarantees a half-updated venv — abort loudly instead and let the + // user close the holder and retry. Restart our own backend so the app + // keeps working after the failed attempt. + const message = + 'Update aborted: another process is holding the Hermes install open ' + + '(a second Hermes window or a terminal running hermes?). Close it and retry.' + emitUpdateProgress({ stage: 'error', message, percent: null }) + startHermes().catch(() => {}) + return { ok: false, error: message } + } // Detached so the updater outlives this process — it needs us GONE before // `hermes update` will run (the venv shim is locked while we live). diff --git a/hermes_cli/main.py b/hermes_cli/main.py index aedcb603e..29b1ac1de 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -8776,6 +8776,159 @@ def _wait_for_windows_update_gateway_exit( return survivors +def _venv_core_imports_healthy() -> tuple[bool, str]: + """Probe the project venv for the core imports the backend needs to boot. + + Runs a tiny import check inside the venv interpreter (NOT this process — + ``hermes update`` may be driven by a different Python). Catches the + half-updated-venv state: git checkout current but a dependency sync that + failed or was killed partway (e.g. Windows access-denied on a loaded + .pyd), leaving imports like ``fastapi``'s new transitive deps missing. + Without this probe, ``hermes update`` on a current checkout prints + "Already up to date!" and returns without ever re-syncing dependencies — + the user's install stays broken no matter how many times they update + (ryanc's incident, July 2026). + + Returns ``(healthy, detail)``. Never raises; unknown states report + healthy so a probe failure can't force needless reinstalls. + """ + venv_dir = PROJECT_ROOT / "venv" + python_name = "python.exe" if _is_windows() else "python" + bin_dir = "Scripts" if _is_windows() else "bin" + venv_python = venv_dir / bin_dir / python_name + if not venv_python.exists(): + return True, "" + + # Core web/serve imports plus their newest transitive deps. Import (not + # just metadata) — a package can have intact dist-info but a missing + # module after an interrupted uninstall/install cycle. + check = ( + "import importlib\n" + "mods = ['fastapi', 'uvicorn', 'pydantic', 'openai', 'yaml']\n" + "missing = []\n" + "for m in mods:\n" + " try: importlib.import_module(m)\n" + " except Exception as e: missing.append(f'{m}: {e}')\n" + "print('\\n'.join(missing))\n" + ) + try: + result = subprocess.run( + [str(venv_python), "-c", check], + capture_output=True, + text=True, + timeout=60, + cwd=PROJECT_ROOT, + ) + except Exception as exc: + logger.debug("venv health probe failed to run: %s", exc) + return True, "" + + missing = [line.strip() for line in (result.stdout or "").splitlines() if line.strip()] + if result.returncode != 0 and not missing: + # Interpreter itself is broken (e.g. deleted stdlib) — that IS unhealthy. + detail = (result.stderr or "").strip().splitlines() + return False, detail[0] if detail else "venv python failed to run" + if missing: + return False, "; ".join(missing[:4]) + return True, "" + + +def _detect_venv_python_processes( + *, exclude_pids: set[int] | None = None +) -> list[tuple[int, str, str]]: + """Find live processes running from the project venv's interpreter. + + The hermes.exe shim guard misses the biggest lock-holder class on + Windows: the Desktop app's backend (``python.exe -m hermes_cli.main + serve``) and anything else running straight off ``venv\\Scripts\\python + (w).exe``. Those processes keep native ``.pyd`` extensions mapped, so a + dependency sync mid-update dies with access-denied and strands the venv + half-updated (ryanc's brotlicffi/_sodium.pyd incidents, July 2026). + + Killing them from here is pointless — the Desktop app supervises its + backend and respawns it within seconds — so the caller should refuse and + tell the user to close the app instead. Returns ``(pid, name, cmdline)`` + tuples; empty off-Windows / without psutil / when nothing matches. The + calling process and its ancestors are always excluded (a CLI ``hermes + update`` itself runs from the venv python). Never raises. + """ + if not _is_windows(): + return [] + try: + import psutil + except Exception: + return [] + + venv_dir = PROJECT_ROOT / "venv" + try: + venv_prefix = str(venv_dir.resolve()).lower().rstrip(os.sep) + os.sep + except OSError: + venv_prefix = str(venv_dir).lower().rstrip(os.sep) + os.sep + + skip: set[int] = set(exclude_pids or set()) + skip.add(os.getpid()) + try: + for anc in psutil.Process().parents(): + skip.add(int(anc.pid)) + except Exception: + pass + + matches: list[tuple[int, str, str]] = [] + try: + proc_iter = psutil.process_iter(["pid", "exe", "name", "cmdline"]) + except Exception: + return [] + for proc in proc_iter: + try: + info = proc.info + except Exception: + continue + pid = info.get("pid") + exe = info.get("exe") + if not exe or pid is None or int(pid) in skip: + continue + try: + exe_norm = str(Path(exe).resolve()).lower() + except (OSError, ValueError): + exe_norm = str(exe).lower() + if not exe_norm.startswith(venv_prefix): + continue + name = info.get("name") or Path(exe).name + cmdline = " ".join(info.get("cmdline") or [])[:120] + matches.append((int(pid), str(name), cmdline)) + return matches + + +def _format_venv_python_holders_message(matches: list[tuple[int, str, str]]) -> str: + """Explain which venv processes block the update and how to clear them.""" + lines = [ + "✗ Other Hermes processes are running from this install's venv:", + ] + for pid, name, cmdline in matches[:6]: + hint = "" + low = cmdline.lower() + if "serve" in low or "dashboard" in low: + hint = " ← Hermes Desktop backend (close the desktop app)" + elif "gateway" in low: + hint = " ← gateway" + lines.append(f" PID {pid} {name} {cmdline}{hint}") + if len(matches) > 6: + lines.append(f" ... and {len(matches) - 6} more") + lines.append("") + lines.append( + " On Windows these keep native extension files (.pyd) locked, so the" + ) + lines.append( + " dependency update would fail partway and leave a broken install." + ) + lines.append( + " Close the Hermes desktop app / other Hermes terminals, then re-run:" + ) + lines.append(" hermes update") + lines.append(" (or use `hermes update --force` to proceed anyway at your own risk)") + return "\n".join(lines) + + def _pause_windows_gateways_for_update() -> dict | None: """Stop running Windows gateways before mutating the checkout or venv. @@ -9235,6 +9388,19 @@ def _cmd_update_impl(args, gateway_mode: bool): _windows_gateway_resume, ) + # With gateways paused, anything still running from the venv interpreter + # (most commonly the Desktop app's `hermes serve` backend) will keep .pyd + # files locked and corrupt the dependency sync below. Refuse rather than + # race: killing the desktop backend is futile (the app supervises and + # respawns it), so the user must close the app. --force preserves the old + # behavior for users who know what they're doing. + if _is_windows() and not getattr(args, "force", False): + _venv_holders = _detect_venv_python_processes() + if _venv_holders: + print(_format_venv_python_holders_message(_venv_holders)) + _resume_windows_gateways_after_update(_windows_gateway_resume) + sys.exit(2) + # Try git-based update first, fall back to ZIP download on Windows # when git file I/O is broken (antivirus, NTFS filter drivers, etc.) use_zip_update = False @@ -9436,7 +9602,41 @@ def _cmd_update_impl(args, gateway_mode: bool): text=True, check=False, ) - print("✓ Already up to date!") + + # A current checkout does NOT imply a healthy install: a previous + # dependency sync may have failed partway (classic on Windows, + # where a running gateway/desktop backend keeps .pyd files locked + # and uv/pip dies with access-denied, stranding the venv between + # versions). Probe the venv's core imports and repair if broken — + # otherwise "Already up to date!" gaslights the user while their + # install stays bricked. + healthy, detail = _venv_core_imports_healthy() + if not healthy: + print("⚠ Checkout is current, but the venv is unhealthy:") + print(f" {detail}") + print("→ Repairing Python dependencies...") + _write_update_incomplete_marker() + from hermes_cli.managed_uv import ensure_uv + + repair_uv = ensure_uv() + if repair_uv: + repair_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")} + _install_python_dependencies_with_optional_fallback( + [repair_uv, "pip"], env=repair_env, group="all" + ) + else: + _install_python_dependencies_with_optional_fallback( + [sys.executable, "-m", "pip"], group="all" + ) + _clear_update_incomplete_marker() + healthy_after, detail_after = _venv_core_imports_healthy() + if healthy_after: + print("✓ Dependencies repaired!") + else: + print(f"⚠ Venv still unhealthy after repair: {detail_after}") + print(" Close all Hermes windows/gateways and re-run: hermes update") + else: + print("✓ Already up to date!") _resume_windows_gateways_after_update(_windows_gateway_resume) return diff --git a/scripts/install.ps1 b/scripts/install.ps1 index 15f98f4b4..e090dcb5c 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -1614,6 +1614,26 @@ function Install-Venv { if ($env:OS -eq "Windows_NT") { $myPid = $PID Write-Info "Stopping any running hermes processes before recreating venv..." + # Disarm respawners FIRST: the gateway autostart Scheduled Task and + # the Startup-folder entry both relaunch a killed gateway within + # seconds, and losing that race re-locks the venv's .pyd files + # between our kill sweep and Remove-Item (the July 2026 + # _brotlicffi.pyd incident). schtasks /End stops a running task + # instance; /Change /DISABLE stops it from re-firing mid-install. + # Re-enabled after the venv is recreated (below). Best-effort: a + # missing task just errors quietly. + $gatewayTasksDisabled = @() + try { + schtasks /Query /FO CSV 2>$null | ConvertFrom-Csv | Where-Object { $_.TaskName -like '*Hermes_Gateway*' } | ForEach-Object { + $tn = $_.TaskName + schtasks /End /TN $tn 2>$null | Out-Null + schtasks /Change /TN $tn /DISABLE 2>$null | Out-Null + $gatewayTasksDisabled += $tn + Write-Info " disabled gateway autostart task $tn for the duration of the install" + } + } catch { + Write-Warn "Could not enumerate gateway scheduled tasks: $($_.Exception.Message)" + } # The launcher CLI (hermes.exe) plus its child tree. & taskkill /F /T /IM hermes.exe /FI "PID ne $myPid" 2>$null | Out-Null # taskkill /IM hermes.exe is NOT enough: the gateway/agent that a @@ -1632,27 +1652,68 @@ function Install-Venv { # ExecutablePath for a process it cannot inspect (a different session) # instead of throwing, so an unreadable process is skipped rather than # aborting the whole sweep. + # + # The sweep is a bounded LOOP, not single-shot: supervised processes + # (the Desktop app's backend, a watchdog-managed gateway) respawn in + # the window between one kill pass and the delete. Each pass re- + # enumerates; three consecutive clean passes (or the attempt cap) + # ends the loop. $venvPrefix = [System.IO.Path]::GetFullPath((Join-Path $InstallDir "venv")).TrimEnd('\') + '\' - try { - Get-CimInstance Win32_Process -ErrorAction Stop | - Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } | - ForEach-Object { - Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv" - Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue - } - } catch { - Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)" + $cleanPasses = 0 + for ($sweep = 0; $sweep -lt 10 -and $cleanPasses -lt 3; $sweep++) { + $found = 0 + try { + Get-CimInstance Win32_Process -ErrorAction Stop | + Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } | + ForEach-Object { + $found++ + Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv" + Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue + } + } catch { + Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)" + break + } + if ($found -eq 0) { $cleanPasses++ } else { $cleanPasses = 0 } + Start-Sleep -Milliseconds 400 } - Start-Sleep -Milliseconds 800 } - Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue - # A killed process can take a moment to release its file handles, so a - # first Remove-Item may still hit a locked .pyd. Retry once after a short - # pause before giving up and letting the stage fail loudly. - if (Test-Path "venv") { - Start-Sleep -Seconds 2 - Remove-Item -Recurse -Force "venv" + # Rename-then-delete: on Windows a directory RENAME succeeds even while + # files inside it are mapped as DLLs (only in-place delete/replace of + # the mapped file is denied, and only same-volume renames are atomic + # moves). Moving the old venv aside means `uv venv` can create a fresh + # one immediately even if some straggler still holds a .pyd from the + # old tree; the renamed dir is deleted best-effort (now, and by the + # cleanup pass below on the NEXT install if a handle outlives this one). + $staleName = "venv.stale.{0}" -f (Get-Date -Format "yyyyMMddHHmmss") + $renamed = $false + try { + Rename-Item -Path "venv" -NewName $staleName -ErrorAction Stop + $renamed = $true + } catch { + Write-Warn "Could not rename venv aside ($($_.Exception.Message)); falling back to in-place delete" } + if ($renamed) { + Remove-Item -Recurse -Force $staleName -ErrorAction SilentlyContinue + if (Test-Path $staleName) { + Write-Warn "Old venv parked at $staleName (a process still holds files in it); it will be cleaned up on the next install" + } + } else { + Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue + # A killed process can take a moment to release its file handles, so a + # first Remove-Item may still hit a locked .pyd. Retry once after a short + # pause before giving up and letting the stage fail loudly. + if (Test-Path "venv") { + Start-Sleep -Seconds 2 + Remove-Item -Recurse -Force "venv" + } + } + } + + # Clean up parked venvs from previous installs whose handles have since + # been released. Best-effort — a still-held tree just stays for next time. + Get-ChildItem -Directory -Filter "venv.stale.*" -ErrorAction SilentlyContinue | ForEach-Object { + Remove-Item -Recurse -Force $_.FullName -ErrorAction SilentlyContinue } # uv creates the venv and pins the Python version in one step. uv emits @@ -1684,6 +1745,18 @@ function Install-Venv { Pop-Location + # Re-arm the gateway autostart tasks disabled during the venv teardown. + # Same function scope, so the list survives even under the stage-per- + # process bootstrap. Deliberately NOT started here — dependencies aren't + # installed yet; the task fires normally on next logon and `hermes update` + # / the gateway resume path handles the immediate restart. + if ($gatewayTasksDisabled -and $gatewayTasksDisabled.Count -gt 0) { + foreach ($tn in $gatewayTasksDisabled) { + schtasks /Change /TN $tn /ENABLE 2>$null | Out-Null + } + Write-Info "Re-enabled gateway autostart task(s): $($gatewayTasksDisabled -join ', ')" + } + Write-Success "Virtual environment ready (Python $PythonVersion)" } diff --git a/tests/hermes_cli/test_update_venv_health.py b/tests/hermes_cli/test_update_venv_health.py new file mode 100644 index 000000000..b6660a8c1 --- /dev/null +++ b/tests/hermes_cli/test_update_venv_health.py @@ -0,0 +1,185 @@ +"""Tests for the Windows half-updated-venv hardening (July 2026 incident). + +Covers three additions to ``hermes update``: + +1. ``_venv_core_imports_healthy`` — the venv health probe that lets an + "Already up to date" checkout still repair a broken dependency install. +2. ``_detect_venv_python_processes`` — the venv-interpreter process guard + that refuses to mutate the venv while a desktop backend / stray python + holds .pyd files mapped. +3. The commit_count == 0 repair branch wiring in ``_cmd_update_impl``. + +All Windows-specific paths are exercised via ``_is_windows`` patching so +they run on any host (same approach as test_update_concurrent_quarantine). +""" + +from __future__ import annotations + +import subprocess +import sys +import types +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from hermes_cli import main as cli_main + + +# --------------------------------------------------------------------------- +# _venv_core_imports_healthy +# --------------------------------------------------------------------------- + + +def test_venv_health_reports_healthy_when_no_venv(tmp_path): + """No venv python → nothing to probe → healthy (never blocks update).""" + with patch.object(cli_main, "PROJECT_ROOT", tmp_path): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is True + assert detail == "" + + +def _fake_venv_python(tmp_path, *, windows: bool = False): + bin_dir = tmp_path / "venv" / ("Scripts" if windows else "bin") + bin_dir.mkdir(parents=True) + py = bin_dir / ("python.exe" if windows else "python") + py.write_bytes(b"") + return py + + +def test_venv_health_reports_missing_imports(tmp_path): + """Probe output lines are surfaced as the unhealthy detail.""" + _fake_venv_python(tmp_path) + + fake = SimpleNamespace( + returncode=0, + stdout="fastapi: No module named 'annotated_doc'\n", + stderr="", + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + + assert healthy is False + assert "annotated_doc" in detail + + +def test_venv_health_healthy_when_probe_clean(tmp_path): + _fake_venv_python(tmp_path) + fake = SimpleNamespace(returncode=0, stdout="", stderr="") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is True + + +def test_venv_health_broken_interpreter_is_unhealthy(tmp_path): + """Nonzero exit with no module list = interpreter itself is broken.""" + _fake_venv_python(tmp_path) + fake = SimpleNamespace(returncode=1, stdout="", stderr="Fatal Python error: init failed\n") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is False + assert "Fatal Python error" in detail + + +def test_venv_health_probe_failure_reports_healthy(tmp_path): + """A probe that can't run must NOT force needless reinstalls.""" + _fake_venv_python(tmp_path) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, + "run", + side_effect=subprocess.TimeoutExpired(cmd="python", timeout=60), + ): + healthy, _detail = cli_main._venv_core_imports_healthy() + assert healthy is True + + +# --------------------------------------------------------------------------- +# _detect_venv_python_processes +# --------------------------------------------------------------------------- + + +def _proc(pid: int, exe: str, name: str, cmdline: list[str] | None = None): + proc = MagicMock() + proc.info = {"pid": pid, "exe": exe, "name": name, "cmdline": cmdline or []} + return proc + + +def test_detect_venv_python_off_windows_is_empty(): + with patch.object(cli_main, "_is_windows", return_value=False): + assert cli_main._detect_venv_python_processes() == [] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_finds_backend(_winp, tmp_path): + venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe") + other_py = "C:\\Python311\\python.exe" + + me = MagicMock() + me.parents.return_value = [] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + _proc(101, venv_py, "python.exe", ["python.exe", "-m", "hermes_cli.main", "serve"]), + _proc(102, other_py, "python.exe", ["python.exe", "somescript.py"]), + ] + ), + Process=lambda *a, **k: me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + matches = cli_main._detect_venv_python_processes() + + assert [m[0] for m in matches] == [101] + assert "serve" in matches[0][2] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_excludes_self_and_ancestors(_winp, tmp_path): + import os as _os + + venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe") + parent = MagicMock() + parent.pid = 555 + me = MagicMock() + me.parents.return_value = [parent] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + _proc(_os.getpid(), venv_py, "python.exe"), + _proc(555, venv_py, "hermes.exe"), + ] + ), + Process=lambda *a, **k: me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + assert cli_main._detect_venv_python_processes() == [] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_no_psutil_is_empty(_winp, tmp_path): + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": None} + ): + assert cli_main._detect_venv_python_processes() == [] + + +def test_format_venv_holders_message_flags_desktop_backend(tmp_path): + matches = [ + (101, "python.exe", "python.exe -m hermes_cli.main serve --host 127.0.0.1"), + (102, "pythonw.exe", "pythonw.exe -m hermes_cli.main gateway run"), + ] + msg = cli_main._format_venv_python_holders_message(matches) + assert "101" in msg + assert "desktop app" in msg.lower() + assert "gateway" in msg + assert "hermes update" in msg + assert "--force" in msg