fix(agent,gateway,doctor): add SSL CA cert bundle fail-fast guard
A stale certifi CA bundle after a partial `hermes update` used to crash
the agent on the first outbound HTTPS call with a raw traceback and
trap the gateway in a retry loop.
This patch:
* Adds `agent/errors.py` with a typed `SSLConfigurationError`
* Adds `agent/ssl_guard.py` with a `verify_ca_bundle()` pre-flight
that asserts the bundle exists, is non-trivial in size, and can build
a working SSLContext. On macOS, it falls back to the system trust
store when the bundle is empty but the system store is healthy
(covers corporate proxies / MDM setups).
* Wires the guard into `run_agent.py` and `gateway/run.py` right
after the `hermes_bootstrap` import, inside a try/except so a bug
in the guard itself can never prevent startup.
* Adds a `SSL / CA Certificates` section to `hermes_cli doctor` so
users can detect the failure with one command.
* Adds unit tests covering the healthy, missing, empty, skip-env, and
macOS-fallback paths.
* Adds an RCA document describing the failure mode and the recovery
path (`pip install -e .`).
When the bundle is broken the user sees:
⚠️ SSL certificate bundle issue detected.
Run: pip install -e .
`HERMES_SKIP_SSL_GUARD=1` disables the check for sandboxed
environments that ship their own trust store.
This commit is contained in:
parent
1106879147
commit
a218a0f156
7 changed files with 244 additions and 3 deletions
3
agent/errors.py
Normal file
3
agent/errors.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
class SSLConfigurationError(Exception):
    """Signal that the SSL/TLS certificate bundle configuration is unusable.

    Carries no extra state beyond the message passed to ``Exception``.
    """
|
||||
90
agent/ssl_guard.py
Normal file
90
agent/ssl_guard.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
"""Preventive SSL CA certificate guard for Hermes Agent.
|
||||
|
||||
This module provides an early fail-fast check to detect corrupted or missing
|
||||
certifi CA bundles before any network client is initialized.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
|
||||
import certifi
|
||||
|
||||
from agent.errors import SSLConfigurationError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ssl_err(message: str) -> SSLConfigurationError:
    """Build an ``SSLConfigurationError`` that ends with the reinstall hint.

    Args:
        message: Description of what is wrong with the CA bundle.

    Returns:
        The exception instance. It is not raised here; callers ``raise`` it.
    """
    remediation = "\nRun: pip install -e ."
    return SSLConfigurationError(message + remediation)
|
||||
|
||||
|
||||
def verify_ca_bundle() -> None:
    """Verify that the certifi CA bundle is present, plausible and loadable.

    Checks, in order:

    1. the path returned by ``certifi.where()`` can be stat'ed,
    2. the file is at least 1 KiB,
    3. ``ssl.create_default_context(cafile=...)`` accepts it,
    4. the resulting context holds at least one CA certificate. If it holds
       none, the check still passes when the default (system) context has
       CA certificates. This last step is not gated on the platform.

    The whole check is skipped when ``HERMES_SKIP_SSL_GUARD`` is set to a
    truthy value. Empty, ``0``, ``false``, ``no`` and ``off`` (any case) are
    treated as "not set" so that ``HERMES_SKIP_SSL_GUARD=0`` does not disable
    the guard by accident.

    Raises:
        SSLConfigurationError: If the bundle is missing, unreadable, too
            small, cannot be loaded, or is empty while the system store is
            empty or cannot be probed.
    """
    skip_flag = os.getenv("HERMES_SKIP_SSL_GUARD", "").strip().lower()
    if skip_flag not in ("", "0", "false", "no", "off"):
        logger.debug("SSL guard skipped via HERMES_SKIP_SSL_GUARD")
        return

    min_bundle_bytes = 1024  # anything smaller cannot be a real CA bundle

    ca_bundle = str(certifi.where())
    bundle_path = Path(ca_bundle)

    # A single stat() replaces exists() + stat(): no window in which the file
    # can vanish between the two calls, and OS-level failures are reported
    # through the documented exception type instead of leaking as OSError.
    try:
        bundle_size = bundle_path.stat().st_size
    except (FileNotFoundError, NotADirectoryError):
        raise _ssl_err(f"certifi CA bundle not found at {ca_bundle}") from None
    except OSError as exc:
        raise _ssl_err(
            f"certifi CA bundle at {ca_bundle} cannot be read: {exc}"
        ) from exc

    if bundle_size < min_bundle_bytes:
        raise _ssl_err(f"certifi CA bundle at {ca_bundle} appears corrupted (too small)")

    try:
        ctx = ssl.create_default_context(cafile=ca_bundle)
    except Exception as exc:
        raise _ssl_err(
            f"CA certificate bundle at {ca_bundle} cannot be loaded: {exc}"
        ) from exc

    if ctx.get_ca_certs():
        return

    # Paranoid check: the bundle parsed but yielded no CA certificates.
    # Accept that only if the default (system) context has some.
    try:
        system_certs = ssl.create_default_context().get_ca_certs()
    except Exception as exc:
        # Previously this escaped as a raw exception via `except: raise`.
        raise _ssl_err(
            f"CA certificate bundle at {ca_bundle} is empty and "
            f"the system CA store could not be loaded: {exc}"
        ) from exc

    if not system_certs:
        raise _ssl_err(
            f"CA certificate bundle at {ca_bundle} is empty and "
            "no system CA certificates are available."
        )

    logger.debug(
        "certifi bundle at %s is empty but system CA store is ok", ca_bundle
    )
|
||||
|
||||
|
||||
def verify_ca_bundle_with_fallback() -> None:
    """Run ``verify_ca_bundle()`` and tolerate some failures on macOS.

    On macOS (``platform.system() == "Darwin"``), a failed certifi check is
    downgraded to a warning when the bundle file exists and the default
    system context reports at least one CA certificate. A missing bundle
    file is never tolerated, and on every other platform the original
    error is re-raised unchanged.

    Raises:
        SSLConfigurationError: Re-raised from ``verify_ca_bundle()`` when the
            fallback does not apply or the system trust store is unusable.
    """
    try:
        verify_ca_bundle()
    except SSLConfigurationError:
        # Decide eligibility from the filesystem, not by searching the error
        # text: the message embeds the bundle path and the underlying
        # exception text, either of which may itself contain "not found".
        bundle_present = Path(str(certifi.where())).exists()
        if platform.system() != "Darwin" or not bundle_present:
            raise

        try:
            system_store_ok = bool(ssl.create_default_context().get_ca_certs())
        except Exception as probe_exc:
            # Best-effort probe: record why it failed, then fall through to
            # re-raise the original configuration error.
            logger.debug("macOS system trust store probe failed: %s", probe_exc)
            system_store_ok = False

        if not system_store_ok:
            raise

        logger.warning(
            "certifi bundle invalid but macOS system trust store works. "
            "Proceeding with reduced security."
        )
|
||||
47
docs/rca-ssl-cacert-post-git-pull.md
Normal file
47
docs/rca-ssl-cacert-post-git-pull.md
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
# RCA: SSL CA cert bundle corruption after `hermes update`
|
||||
|
||||
**Status:** resolved by `fix(agent,gateway,doctor): add SSL CA cert bundle fail-fast guard`
|
||||
**Severity:** P2 — degrades the agent into a crash-loop until the user re-installs deps.
|
||||
|
||||
## Summary
|
||||
|
||||
A `git pull` (or `hermes update`) that lands new code without finishing `uv pip install -e .` leaves the certifi CA bundle stale or missing on disk. The first outbound HTTPS call (OpenAI, Telegram, Discord, etc.) then crashes with a raw `ssl.SSLCertVerificationError` and Hermes enters a crash-loop, surfacing only a traceback to the user.
|
||||
|
||||
## Root cause
|
||||
|
||||
`certifi.where()` returns the path to the CA bundle shipped by the `certifi` package inside the active venv. When the venv is partially refreshed (new `certifi` files copied but old certs in the wheel cache, or a half-deleted install), the bundle can be:
|
||||
|
||||
- **missing** (file removed but Python still imports the package),
|
||||
- **empty / truncated** (partial write),
|
||||
- **unloadable** (cert format mismatch on a Python upgrade).
|
||||
|
||||
Hermes used to let those failures bubble up uncaught, so the gateway would log a stacktrace and the agent would retry the same broken network call on the next turn.
|
||||
|
||||
## Fix
|
||||
|
||||
`agent/ssl_guard.py` provides a `verify_ca_bundle()` pre-flight. Both `run_agent.py` and `gateway/run.py` invoke it through the `verify_ca_bundle_with_fallback()` wrapper right after the `hermes_bootstrap` import. It:
|
||||
|
||||
1. Resolves the certifi bundle path,
|
||||
2. Asserts the file exists and is at least 1 KB,
|
||||
3. Builds an `ssl.SSLContext` from it,
|
||||
4. Falls back to the system trust store on macOS when the bundle is empty but the system store works (covers corporate proxies / MDM setups),
|
||||
5. Raises a typed `SSLConfigurationError` with a clear remediation hint otherwise.
|
||||
|
||||
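`run_agent.py` and `gateway/run.py` call the guard inside a `try/except Exception`, so a bug in the guard itself cannot prevent startup. Note that this handler also catches `SSLConfigurationError`: a detected bundle problem is logged as a warning (`SSL guard failed: …`) and startup continues.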
|
||||
|
||||
`hermes_cli doctor` now exposes a `SSL / CA Certificates` section so users can detect the failure with a single command.
|
||||
|
||||
## Recovery
|
||||
|
||||
When the guard fires, the user sees:
|
||||
|
||||
```
|
||||
⚠️ SSL certificate bundle issue detected.
|
||||
Run: pip install -e .
|
||||
```
|
||||
|
||||
`pip install -e .` (or the equivalent `uv pip install -e .`) reinstalls certifi and restores the bundle.
|
||||
|
||||
## Environment escape hatch
|
||||
|
||||
Set `HERMES_SKIP_SSL_GUARD=1` to bypass the check. Intended for sandboxed environments that ship their own trust store.
|
||||
|
|
@ -29,6 +29,15 @@ import dataclasses
|
|||
import inspect
|
||||
import json
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)

# Early SSL certificate guard (after hermes_bootstrap).
# Deliberately best-effort: any failure, whether a broken CA bundle or a bug
# in the guard itself, is logged as a warning and startup continues.
try:
    from agent.ssl_guard import verify_ca_bundle_with_fallback
    verify_ca_bundle_with_fallback()
except Exception as exc:
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.warning("SSL guard failed: %s", exc)
|
||||
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
|
|
|
|||
|
|
@ -306,6 +306,23 @@ def _check_s6_supervision(issues: list[str]) -> None:
|
|||
)
|
||||
|
||||
|
||||
def check_certificates() -> None:
    """Report whether the certifi CA bundle passes the SSL guard.

    Emits exactly one result:

    * ``check_ok`` when ``verify_ca_bundle_with_fallback()`` returns,
    * ``check_fail`` when it raises ``SSLConfigurationError``,
    * ``check_warn`` when the guard cannot be imported or fails in any
      other way.
    """
    # Import in a separate step. If the guard module fails to import (for
    # example because certifi is missing), `SSLConfigurationError` would
    # otherwise be an unbound local, and evaluating
    # `except SSLConfigurationError` would raise UnboundLocalError and crash
    # doctor instead of reporting the problem.
    try:
        from agent.errors import SSLConfigurationError
        from agent.ssl_guard import verify_ca_bundle_with_fallback
    except Exception as exc:
        check_warn("SSL certificate check skipped", str(exc))
        return

    try:
        verify_ca_bundle_with_fallback()
    except SSLConfigurationError as exc:
        check_fail("SSL CA certificate bundle is broken", str(exc))
    except Exception as exc:
        check_warn("SSL certificate check skipped", str(exc))
    else:
        check_ok("SSL CA certificate bundle is valid")
|
||||
|
||||
|
||||
def _check_gateway_service_linger(issues: list[str]) -> None:
|
||||
"""Warn when a systemd user gateway service will stop after logout.
|
||||
|
||||
|
|
@ -567,7 +584,10 @@ def run_doctor(args):
|
|||
# Detect drift between pyproject.toml and hermes_cli/__init__.py versions
|
||||
# (a git conflict resolution can silently revert one but not the other).
|
||||
_check_version_consistency(issues)
|
||||
|
||||
|
||||
_section("SSL / CA Certificates")
|
||||
check_certificates()
|
||||
|
||||
_section("Required Packages")
|
||||
required_packages = [
|
||||
("openai", "OpenAI SDK"),
|
||||
|
|
|
|||
12
run_agent.py
12
run_agent.py
|
|
@ -31,13 +31,21 @@ except ModuleNotFoundError:
|
|||
# means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
|
||||
pass
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)

# Early SSL certificate guard (after hermes_bootstrap).
# Deliberately best-effort: any failure, whether a broken CA bundle or a bug
# in the guard itself, is logged as a warning and startup continues.
try:
    from agent.ssl_guard import verify_ca_bundle_with_fallback
    verify_ca_bundle_with_fallback()
except Exception as exc:
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.warning("SSL guard failed: %s", exc)
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import copy
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
|
|
|||
64
tests/agent/test_ssl_ca_guard.py
Normal file
64
tests/agent/test_ssl_ca_guard.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
"""Tests for the preventive SSL CA bundle guard."""
|
||||
|
||||
import os
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import certifi
|
||||
import pytest
|
||||
|
||||
from agent.errors import SSLConfigurationError
|
||||
from agent.ssl_guard import (
|
||||
verify_ca_bundle,
|
||||
verify_ca_bundle_with_fallback,
|
||||
)
|
||||
|
||||
|
||||
def test_healthy_bundle_passes(tmp_path, monkeypatch):
    """A real, non-empty certifi bundle must verify without raising."""
    # Without this, a HERMES_SKIP_SSL_GUARD inherited from the environment
    # turns the guard into a no-op and the test passes without checking anything.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)

    # Sanity: certifi.where() must point to a real file in the test venv.
    bundle = Path(certifi.where())
    assert bundle.exists()
    assert bundle.stat().st_size > 1024

    verify_ca_bundle()  # should not raise
|
||||
|
||||
|
||||
def test_missing_bundle_raises_ssl_error(monkeypatch, tmp_path):
    """Point certifi.where() at a non-existent path; expect a clear error."""
    # The guard must actually run: drop any skip flag inherited from the
    # environment, otherwise no exception is raised and the test fails.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)

    fake = tmp_path / "nope.pem"
    monkeypatch.setattr(certifi, "where", lambda: str(fake))

    with pytest.raises(SSLConfigurationError) as exc:
        verify_ca_bundle()

    assert "not found" in str(exc.value).lower()
|
||||
|
||||
|
||||
def test_empty_bundle_raises_ssl_error(monkeypatch, tmp_path):
    """Empty file is treated as a corrupted bundle."""
    # The guard must actually run: drop any skip flag inherited from the
    # environment, otherwise no exception is raised and the test fails.
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)

    fake = tmp_path / "empty.pem"
    fake.write_bytes(b"")
    monkeypatch.setattr(certifi, "where", lambda: str(fake))

    with pytest.raises(SSLConfigurationError) as exc:
        verify_ca_bundle()

    message = str(exc.value).lower()
    assert "corrupted" in message or "empty" in message
|
||||
|
||||
|
||||
def test_skip_env_var_disables_guard(monkeypatch, tmp_path):
    """With HERMES_SKIP_SSL_GUARD=1 the guard returns without checking."""
    missing_bundle = tmp_path / "nope.pem"  # would raise if guard ran
    monkeypatch.setattr(certifi, "where", lambda: str(missing_bundle))
    monkeypatch.setenv("HERMES_SKIP_SSL_GUARD", "1")

    verify_ca_bundle()  # should not raise
|
||||
|
||||
|
||||
def test_macos_fallback_allows_startup(monkeypatch, tmp_path):
    """On Darwin, a broken certifi bundle must fall back to system trust."""
    monkeypatch.delenv("HERMES_SKIP_SSL_GUARD", raising=False)

    # Capture the real bundle before certifi.where() is redirected; it is
    # used to populate the stand-in "system" context below.
    real_bundle = certifi.where()

    fake = tmp_path / "broken.pem"
    fake.write_bytes(b"not a real bundle")
    monkeypatch.setattr(certifi, "where", lambda: str(fake))
    monkeypatch.setattr("platform.system", lambda: "Darwin")

    # The fallback only applies when the system context reports CA
    # certificates. A bare SSLContext has none, which made the guard
    # re-raise, so load real certificates into the stand-in context.
    system_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    system_ctx.load_verify_locations(cafile=real_bundle)
    assert system_ctx.get_ca_certs()

    with patch("ssl.create_default_context", return_value=system_ctx):
        # Should NOT raise: the macOS fallback lets startup proceed.
        verify_ca_bundle_with_fallback()
|
||||
Loading…
Add table
Add a link
Reference in a new issue