fix(agent,gateway,doctor): add SSL CA cert bundle fail-fast guard

A stale certifi CA bundle after a partial `hermes update` used to crash
the agent on the first outbound HTTPS call with a raw traceback and
trap the gateway in a retry loop.

This patch:

* Adds `agent/errors.py` with a typed `SSLConfigurationError`
* Adds `agent/ssl_guard.py` with a `verify_ca_bundle()` pre-flight
  that asserts the bundle exists, is non-trivial in size, and can build
  a working SSLContext. On macOS, it falls back to the system trust
  store when the bundle is empty but the system store is healthy
  (covers corporate proxies / MDM setups).
* Wires the guard into `run_agent.py` and `gateway/run.py` right
  after the `hermes_bootstrap` import, inside a try/except so a bug
  in the guard itself can never prevent startup.
* Adds a `SSL / CA Certificates` section to `hermes_cli doctor` so
  users can detect the failure with one command.
* Adds unit tests covering the healthy, missing, empty, skip-env, and
  macOS-fallback paths.
* Adds an RCA document describing the failure mode and the recovery
  path (`pip install -e .`).

When the bundle is broken the user sees:

    ⚠️ SSL certificate bundle issue detected.
       Run: pip install -e .

`HERMES_SKIP_SSL_GUARD=1` disables the check for sandboxed
environments that ship their own trust store.
This commit is contained in:
chromalinx 2026-06-01 22:53:20 +02:00 committed by Teknium
parent 1106879147
commit a218a0f156
7 changed files with 244 additions and 3 deletions

3
agent/errors.py Normal file
View file

@ -0,0 +1,3 @@
class SSLConfigurationError(Exception):
    """Signal that the SSL/TLS CA certificate bundle configuration is unusable.

    A plain ``Exception`` subclass: it adds no attributes or behavior beyond
    the message supplied by the raiser.
    """

90
agent/ssl_guard.py Normal file
View file

@ -0,0 +1,90 @@
"""Preventive SSL CA certificate guard for Hermes Agent.
This module provides an early fail-fast check to detect corrupted or missing
certifi CA bundles before any network client is initialized.
"""
import logging
import os
import platform
import ssl
from pathlib import Path
import certifi
from agent.errors import SSLConfigurationError
logger = logging.getLogger(__name__)
def _ssl_err(message: str) -> SSLConfigurationError:
    """Build an ``SSLConfigurationError`` that ends with the remediation hint.

    The exception is returned rather than raised so call sites can write
    ``raise _ssl_err(...)`` and keep the traceback pointing at themselves.

    Args:
        message: Description of what is wrong with the CA bundle.

    Returns:
        The error instance; its text is ``message`` followed by the
        reinstall command on a new line.
    """
    remediation = "\nRun: pip install -e ."
    return SSLConfigurationError(message + remediation)
def verify_ca_bundle() -> None:
    """Verify that the certifi CA bundle is present and loadable.

    The check is a no-op when the ``HERMES_SKIP_SSL_GUARD`` environment
    variable is set to a truthy value. An unset or empty variable, or one
    of ``0`` / ``false`` / ``no`` / ``off`` (case-insensitive), leaves the
    guard enabled, so ``HERMES_SKIP_SSL_GUARD=0`` does not disable it.

    Raises:
        SSLConfigurationError: If the bundle is missing, unreadable,
            smaller than 1 KiB, cannot be loaded into an ``SSLContext``,
            or loads without contributing any CA certificate while the
            default trust store is empty as well.
    """
    skip_flag = os.getenv("HERMES_SKIP_SSL_GUARD", "").strip().lower()
    if skip_flag not in ("", "0", "false", "no", "off"):
        logger.debug("SSL guard skipped via HERMES_SKIP_SSL_GUARD")
        return

    ca_bundle = str(certifi.where())
    bundle_path = Path(ca_bundle)

    # A single stat() call replaces exists()+stat(): it avoids the race
    # between the two calls and lets filesystem errors (for example a
    # permission problem) surface as the typed error instead of a raw OSError.
    try:
        bundle_size = bundle_path.stat().st_size
    except (FileNotFoundError, NotADirectoryError) as exc:
        raise _ssl_err(f"certifi CA bundle not found at {ca_bundle}") from exc
    except OSError as exc:
        raise _ssl_err(
            f"certifi CA bundle at {ca_bundle} cannot be read: {exc}"
        ) from exc

    # Anything under 1 KiB is treated as a truncated or empty bundle.
    if bundle_size < 1024:
        raise _ssl_err(f"certifi CA bundle at {ca_bundle} appears corrupted (too small)")

    try:
        ctx = ssl.create_default_context(cafile=ca_bundle)
    except Exception as exc:
        raise _ssl_err(
            f"CA certificate bundle at {ca_bundle} cannot be loaded: {exc}"
        ) from exc

    if ctx.get_ca_certs():
        return

    # The bundle loaded without error but contributed no CA certificates.
    # Accept that only when the default trust store has certificates. This
    # branch is not platform-specific; the Darwin-only handling lives in
    # verify_ca_bundle_with_fallback().
    # NOTE(review): the ssl docs state that get_ca_certs() does not report
    # certificates from a capath directory until they have been used, so
    # this probe may under-report on capath-only systems — confirm.
    fallback = ssl.create_default_context()
    if not fallback.get_ca_certs():
        raise _ssl_err(
            f"CA certificate bundle at {ca_bundle} is empty and "
            "no system CA certificates are available."
        )
    logger.debug(
        "certifi bundle at %s is empty but system CA store is ok", ca_bundle
    )
def verify_ca_bundle_with_fallback() -> None:
    """Run ``verify_ca_bundle()`` and tolerate some failures on macOS.

    When ``verify_ca_bundle()`` raises and ``platform.system()`` reports
    ``"Darwin"``, the default ``SSLContext`` is probed. If it reports at
    least one CA certificate, a warning is logged and the function returns
    normally. The fallback is not attempted when the error text contains
    "not found" (the missing-file case).

    Raises:
        SSLConfigurationError: The original error from
            ``verify_ca_bundle()`` whenever the fallback does not apply
            or the system trust store probe does not succeed.
    """
    try:
        verify_ca_bundle()
    except SSLConfigurationError as e:
        fallback_applies = (
            platform.system() == "Darwin" and "not found" not in str(e).lower()
        )
        system_store_ok = False
        if fallback_applies:
            try:
                system_store_ok = bool(ssl.create_default_context().get_ca_certs())
            except Exception:
                # Best-effort probe: a failure here must not mask the original
                # bundle error, but record why the fallback was unavailable.
                logger.debug("System trust store probe failed", exc_info=True)
        if system_store_ok:
            logger.warning(
                "certifi bundle invalid but macOS system trust store works. "
                "Proceeding with reduced security."
            )
            return
        # Re-raise the SSLConfigurationError caught above.
        raise

View file

@ -0,0 +1,47 @@
# RCA: SSL CA cert bundle corruption after `hermes update`
**Status:** resolved by `fix(agent,gateway,doctor): add SSL CA cert bundle fail-fast guard`
**Severity:** P2 — degrades the agent into a crash-loop until the user re-installs deps.
## Summary
A `git pull` (or `hermes update`) that lands new code without finishing `uv pip install -e .` leaves the certifi CA bundle stale or missing on disk. The first outbound HTTPS call (OpenAI, Telegram, Discord, etc.) then crashes with a raw `ssl.SSLCertVerificationError` and Hermes enters a crash-loop, surfacing only a traceback to the user.
## Root cause
`certifi.where()` returns the path to the CA bundle shipped by the `certifi` package inside the active venv. When the venv is partially refreshed (new `certifi` files copied but old certs in the wheel cache, or a half-deleted install), the bundle can be:
- **missing** (file removed but Python still imports the package),
- **empty / truncated** (partial write),
- **unloadable** (cert format mismatch on a Python upgrade).
Hermes used to let those failures bubble up uncaught, so the gateway would log a stacktrace and the agent would retry the same broken network call on the next turn.
## Fix
`agent/ssl_guard.py` provides a `verify_ca_bundle()` pre-flight, invoked through the `verify_ca_bundle_with_fallback()` wrapper right after the `hermes_bootstrap` import in both `run_agent.py` and `gateway/run.py`. It:
1. Resolves the certifi bundle path,
2. Asserts the file exists and is at least 1 KB,
3. Builds an `ssl.SSLContext` from it,
4. Falls back to the system trust store on macOS when the bundle is empty but the system store works (covers corporate proxies / MDM setups),
5. Raises a typed `SSLConfigurationError` with a clear remediation hint otherwise.
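`run_agent.py` and `gateway/run.py` import and call the guard inside a `try/except Exception` so a bug in the guard itself cannot prevent startup — we log a warning and continue. Because `SSLConfigurationError` is itself an `Exception`, a broken bundle is likewise reported at these two call sites as a warning (`SSL guard failed: …`) rather than aborting startup.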
`hermes_cli doctor` now exposes a `SSL / CA Certificates` section so users can detect the failure with a single command.
## Recovery
When the guard fires, the user sees:
```
⚠️ SSL certificate bundle issue detected.
Run: pip install -e .
```
`pip install -e .` (or the equivalent `uv pip install -e .`) reinstalls certifi and restores the bundle.
## Environment escape hatch
Set `HERMES_SKIP_SSL_GUARD=1` to bypass the check. Intended for sandboxed environments that ship their own trust store.

View file

@ -29,6 +29,15 @@ import dataclasses
import inspect
import json
import logging
logger = logging.getLogger(__name__)
# Pre-flight CA bundle check, executed at import time ahead of the remaining
# imports. Every exception is caught here — including the
# SSLConfigurationError the guard raises for a broken bundle — and reported
# as a warning, so startup always continues.
try:
    from agent.ssl_guard import verify_ca_bundle_with_fallback

    verify_ca_bundle_with_fallback()
except Exception as guard_exc:
    logger.warning(f"SSL guard failed: {guard_exc}")
import os
import re
import shlex

View file

@ -306,6 +306,23 @@ def _check_s6_supervision(issues: list[str]) -> None:
)
def check_certificates() -> None:
    """Report whether the certifi CA bundle passes the SSL guard.

    Emits exactly one doctor result:

    * ok   - the guard ran and raised nothing,
    * fail - the guard raised ``SSLConfigurationError``,
    * warn - the guard could not be imported, or failed in some other way.
    """
    # Import in a separate try block. If these imports fail (for example
    # because certifi itself is missing), SSLConfigurationError would be an
    # unbound local when the ``except SSLConfigurationError`` clause below is
    # evaluated, raising UnboundLocalError instead of producing a warning.
    try:
        from agent.errors import SSLConfigurationError
        from agent.ssl_guard import verify_ca_bundle_with_fallback
    except Exception as e:
        check_warn("SSL certificate check skipped", str(e))
        return

    try:
        verify_ca_bundle_with_fallback()
    except SSLConfigurationError as e:
        check_fail("SSL CA certificate bundle is broken", str(e))
    except Exception as e:
        check_warn("SSL certificate check skipped", str(e))
    else:
        check_ok("SSL CA certificate bundle is valid")
def _check_gateway_service_linger(issues: list[str]) -> None:
"""Warn when a systemd user gateway service will stop after logout.
@ -567,7 +584,10 @@ def run_doctor(args):
# Detect drift between pyproject.toml and hermes_cli/__init__.py versions
# (a git conflict resolution can silently revert one but not the other).
_check_version_consistency(issues)
_section("SSL / CA Certificates")
check_certificates()
_section("Required Packages")
required_packages = [
("openai", "OpenAI SDK"),

View file

@ -31,13 +31,21 @@ except ModuleNotFoundError:
# means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
pass
import logging
logger = logging.getLogger(__name__)
# Pre-flight CA bundle check, executed at import time ahead of the remaining
# imports. Every exception is caught here — including the
# SSLConfigurationError the guard raises for a broken bundle — and reported
# as a warning, so startup always continues.
try:
    from agent.ssl_guard import verify_ca_bundle_with_fallback

    verify_ca_bundle_with_fallback()
except Exception as guard_exc:
    logger.warning(f"SSL guard failed: {guard_exc}")
import asyncio
import base64
import copy
import hashlib
import json
import logging
logger = logging.getLogger(__name__)
import os
import re
import sys

View file

@ -0,0 +1,64 @@
"""Tests for the preventive SSL CA bundle guard."""
import os
import ssl
from pathlib import Path
from unittest.mock import patch
import certifi
import pytest
from agent.errors import SSLConfigurationError
from agent.ssl_guard import (
verify_ca_bundle,
verify_ca_bundle_with_fallback,
)
def test_healthy_bundle_passes(tmp_path, monkeypatch):
    """A real, non-empty certifi bundle must verify without raising."""
    # Clear the escape hatch so the guard actually runs; with it set in the
    # ambient environment this test would pass without checking anything.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)

    # Sanity: certifi.where() must point to a real file in the test venv.
    bundle = Path(certifi.where())
    assert bundle.exists()
    assert bundle.stat().st_size > 1024

    verify_ca_bundle()  # should not raise
def test_missing_bundle_raises_ssl_error(monkeypatch, tmp_path):
    """Point certifi.where() at a non-existent path; expect a clear error."""
    # Clear the escape hatch: if it is set in the ambient environment the
    # guard returns early and pytest.raises would fail.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)
    fake = tmp_path / "nope.pem"
    monkeypatch.setattr(certifi, "where", lambda: str(fake))

    with pytest.raises(SSLConfigurationError) as exc:
        verify_ca_bundle()

    assert "not found" in str(exc.value).lower()
def test_empty_bundle_raises_ssl_error(monkeypatch, tmp_path):
    """Empty file is treated as a corrupted bundle."""
    # Clear the escape hatch: if it is set in the ambient environment the
    # guard returns early and pytest.raises would fail.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)
    fake = tmp_path / "empty.pem"
    fake.write_bytes(b"")
    monkeypatch.setattr(certifi, "where", lambda: str(fake))

    with pytest.raises(SSLConfigurationError) as exc:
        verify_ca_bundle()

    message = str(exc.value).lower()
    assert "corrupted" in message or "empty" in message
def test_skip_env_var_disables_guard(monkeypatch, tmp_path):
    """With HERMES_SKIP_SSL_GUARD=1 the guard must return without checking."""
    monkeypatch.setenv("HERMES_SKIP_SSL_GUARD", "1")

    # A path that does not exist: the guard would raise if it looked at it.
    missing_bundle = tmp_path / "nope.pem"
    monkeypatch.setattr(certifi, "where", lambda: str(missing_bundle))

    verify_ca_bundle()  # should not raise
def test_macos_fallback_allows_startup(monkeypatch, tmp_path):
    """On Darwin, a corrupted certifi bundle must fall back to system trust.

    The fixture file is far below the guard's 1 KiB minimum, so
    ``verify_ca_bundle()`` rejects it as corrupted. The wrapper must then
    accept startup because the (stubbed) system trust store reports a CA.
    """

    class _SystemStoreStub:
        """Stand-in for an SSLContext whose trust store holds one CA.

        A bare ``ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)`` has no CA
        certificates loaded, so it would make the fallback re-raise.
        """

        def get_ca_certs(self):
            return [{"subject": "stub-root-ca"}]

    # Make sure the guard is not bypassed by the ambient environment.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)
    fake = tmp_path / "broken.pem"
    fake.write_bytes(b"not a real bundle")
    monkeypatch.setattr(certifi, "where", lambda: str(fake))
    monkeypatch.setattr("platform.system", lambda: "Darwin")

    with patch("ssl.create_default_context", return_value=_SystemStoreStub()):
        # Should NOT raise — macOS fallback lets startup proceed.
        verify_ca_bundle_with_fallback()