diff --git a/agent/errors.py b/agent/errors.py
new file mode 100644
index 000000000..abedd83d2
--- /dev/null
+++ b/agent/errors.py
@@ -0,0 +1,3 @@
+class SSLConfigurationError(Exception):
+    """Raised when SSL/TLS certificate bundle configuration fails."""
+    pass
diff --git a/agent/ssl_guard.py b/agent/ssl_guard.py
new file mode 100644
index 000000000..85229f76d
--- /dev/null
+++ b/agent/ssl_guard.py
@@ -0,0 +1,123 @@
+"""Preventive SSL CA certificate guard for Hermes Agent.
+
+This module provides an early fail-fast check to detect corrupted or missing
+certifi CA bundles before any network client is initialized.
+"""
+
+import logging
+import os
+import platform
+import ssl
+from pathlib import Path
+
+import certifi
+
+from agent.errors import SSLConfigurationError
+
+logger = logging.getLogger(__name__)
+
+# Bundles smaller than this many bytes are treated as truncated/corrupted.
+_MIN_BUNDLE_BYTES = 1024
+
+# HERMES_SKIP_SSL_GUARD values that leave the guard enabled, so that
+# HERMES_SKIP_SSL_GUARD=0 does not silently disable it.
+_SKIP_DISABLED_VALUES = frozenset({"", "0", "false", "no", "off"})
+
+
+class _CABundleMissingError(SSLConfigurationError):
+    """The bundle file does not exist; never eligible for the macOS fallback."""
+
+
+def _ssl_err(
+    message: str,
+    cls: type[SSLConfigurationError] = SSLConfigurationError,
+) -> SSLConfigurationError:
+    """Build ``cls(message)`` with the remediation hint appended."""
+    return cls(message + "\nRun: pip install -e .")
+
+
+def _system_trust_store_has_certs() -> bool:
+    """Return True when the default SSL context reports at least one CA cert."""
+    try:
+        return bool(ssl.create_default_context().get_ca_certs())
+    except Exception as exc:
+        # Best-effort probe: any failure just means "no usable fallback".
+        logger.debug("System trust store probe failed: %s", exc)
+        return False
+
+
+def verify_ca_bundle() -> None:
+    """Verify that the certifi CA bundle is valid and loadable.
+
+    The check is skipped when ``HERMES_SKIP_SSL_GUARD`` is set to anything
+    other than an empty string, ``0``, ``false``, ``no`` or ``off``.
+
+    Raises:
+        SSLConfigurationError: If the bundle is missing, smaller than 1 KB,
+            or cannot be used to create a working SSLContext.
+    """
+    skip_flag = os.getenv("HERMES_SKIP_SSL_GUARD", "").strip().lower()
+    if skip_flag not in _SKIP_DISABLED_VALUES:
+        logger.debug("SSL guard skipped via HERMES_SKIP_SSL_GUARD")
+        return
+
+    ca_bundle = str(certifi.where())
+
+    # One stat() call instead of exists() + stat(): no window in which the
+    # file can disappear between the two checks.
+    try:
+        bundle_size = Path(ca_bundle).stat().st_size
+    except (FileNotFoundError, NotADirectoryError) as exc:
+        raise _ssl_err(
+            f"certifi CA bundle not found at {ca_bundle}", _CABundleMissingError
+        ) from exc
+
+    if bundle_size < _MIN_BUNDLE_BYTES:
+        raise _ssl_err(
+            f"certifi CA bundle at {ca_bundle} appears corrupted (too small)"
+        )
+
+    try:
+        ctx = ssl.create_default_context(cafile=ca_bundle)
+    except Exception as exc:
+        raise _ssl_err(
+            f"CA certificate bundle at {ca_bundle} cannot be loaded: {exc}"
+        ) from exc
+
+    # Paranoid check: a bundle that loads but yields no CA certificates is
+    # tolerated only when the system trust store can stand in for it.
+    if not ctx.get_ca_certs():
+        if not ssl.create_default_context().get_ca_certs():
+            raise _ssl_err(
+                f"CA certificate bundle at {ca_bundle} is empty and "
+                "no system CA certificates are available."
+            )
+        logger.debug(
+            "certifi bundle at %s is empty but system CA store is ok", ca_bundle
+        )
+
+
+def verify_ca_bundle_with_fallback() -> None:
+    """Verify CA bundle with macOS paranoid fallback.
+
+    On macOS, if certifi fails but the system trust store works,
+    we allow startup (some corporate proxies / MDM setups break certifi).
+    The fallback only applies to "empty/unloadable" cases, not to
+    completely missing files.
+
+    Raises:
+        SSLConfigurationError: If verification fails and the fallback does
+            not apply (non-macOS, missing bundle, or no system CA certs).
+    """
+    try:
+        verify_ca_bundle()
+    except _CABundleMissingError:
+        # A missing bundle means a broken install; never paper over it.
+        raise
+    except SSLConfigurationError:
+        if platform.system() != "Darwin" or not _system_trust_store_has_certs():
+            raise
+        logger.warning(
+            "certifi bundle invalid but macOS system trust store works. "
+            "Proceeding with reduced security."
+        )
diff --git a/docs/rca-ssl-cacert-post-git-pull.md b/docs/rca-ssl-cacert-post-git-pull.md
new file mode 100644
index 000000000..6076cc7a5
--- /dev/null
+++ b/docs/rca-ssl-cacert-post-git-pull.md
@@ -0,0 +1,47 @@
+# RCA: SSL CA cert bundle corruption after `hermes update`
+
+**Status:** resolved by `fix(agent,gateway): add SSL CA cert bundle fail-fast guard`
+**Severity:** P2 — degrades the agent into a crash-loop until the user re-installs deps.
+
+## Summary
+
+A `git pull` (or `hermes update`) that lands new code without finishing `uv pip install -e .` leaves the certifi CA bundle stale or missing on disk. The first outbound HTTPS call (OpenAI, Telegram, Discord, etc.) then crashes with a raw `ssl.SSLCertVerificationError` and Hermes enters a crash-loop, surfacing only a traceback to the user.
+
+## Root cause
+
+`certifi.where()` returns the path to the CA bundle shipped by the `certifi` package inside the active venv. When the venv is partially refreshed (new `certifi` files copied but old certs in the wheel cache, or a half-deleted install), the bundle can be:
+
+- **missing** (file removed but Python still imports the package),
+- **empty / truncated** (partial write),
+- **unloadable** (cert format mismatch on a Python upgrade).
+
+Hermes used to let those failures bubble up uncaught, so the gateway would log a stacktrace and the agent would retry the same broken network call on the next turn.
+
+## Fix
+
+`agent/ssl_guard.py` runs a `verify_ca_bundle_with_fallback()` pre-flight (a wrapper around `verify_ca_bundle()`) right after the `hermes_bootstrap` import in both `run_agent.py` and `gateway/run.py`. It:
+
+1. Resolves the certifi bundle path,
+2. Asserts the file exists and is at least 1 KB,
+3. Builds an `ssl.SSLContext` from it,
+4. Falls back to the system trust store on macOS when the bundle is present but truncated, unloadable, or empty and the system store works (covers corporate proxies / MDM setups); a missing bundle never falls back,
+5. Raises a typed `SSLConfigurationError` with a clear remediation hint otherwise.
+
+`run_agent.py` and `gateway/run.py` import the guard in a guarded `try/except` so a bug in the guard itself cannot prevent startup — we log a warning and continue. Because that handler catches `Exception`, an `SSLConfigurationError` raised by the guard is also logged as a warning rather than aborting startup.
+
+`hermes_cli doctor` now exposes a `SSL / CA Certificates` section so users can detect the failure with a single command.
+
+## Recovery
+
+When the guard fires at startup, `run_agent.py` / `gateway/run.py` log a warning such as:
+
+```
+SSL guard failed: certifi CA bundle not found at /path/to/site-packages/certifi/cacert.pem
+Run: pip install -e .
+```
+
+`pip install -e .` (or the equivalent `uv pip install -e .`) reinstalls certifi and restores the bundle.
+
+## Environment escape hatch
+
+Set `HERMES_SKIP_SSL_GUARD=1` to bypass the check. Intended for sandboxed environments that ship their own trust store.
diff --git a/gateway/run.py b/gateway/run.py
index f95a535be..f0a6923c1 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -29,6 +29,15 @@ import dataclasses
 import inspect
 import json
 import logging
+logger = logging.getLogger(__name__)
+
+# Early SSL certificate guard (after hermes_bootstrap); failures only warn.
+try:
+    from agent.ssl_guard import verify_ca_bundle_with_fallback
+    verify_ca_bundle_with_fallback()
+except Exception as e:
+    logger.warning("SSL guard failed: %s", e)
+
 import os
 import re
 import shlex
diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py
index cd0fcb9c5..4d9d7bf38 100644
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@@ -306,6 +306,33 @@ def _check_s6_supervision(issues: list[str]) -> None:
         )
 
 
+def check_certificates() -> None:
+    """Verify the certifi CA bundle is loadable.
+
+    Surfaces the SSLConfigurationError user-friendly path before they hit
+    a wall of tracebacks on the first outbound HTTPS call. If the guard
+    cannot even be imported, the check is reported as skipped.
+    """
+    # Bind both names before the guarded call. Importing them inside the
+    # same ``try`` whose ``except`` names SSLConfigurationError raises
+    # UnboundLocalError when the ssl_guard import itself fails.
+    try:
+        from agent.errors import SSLConfigurationError
+        from agent.ssl_guard import verify_ca_bundle_with_fallback
+    except Exception as e:
+        check_warn("SSL certificate check skipped", str(e))
+        return
+
+    try:
+        verify_ca_bundle_with_fallback()
+    except SSLConfigurationError as e:
+        check_fail("SSL CA certificate bundle is broken", str(e))
+    except Exception as e:
+        check_warn("SSL certificate check skipped", str(e))
+    else:
+        check_ok("SSL CA certificate bundle is valid")
+
+
 def _check_gateway_service_linger(issues: list[str]) -> None:
     """Warn when a systemd user gateway service will stop after logout.
 
@@ -567,7 +594,10 @@ def run_doctor(args):
     # Detect drift between pyproject.toml and hermes_cli/__init__.py versions
     # (a git conflict resolution can silently revert one but not the other).
     _check_version_consistency(issues)
-    
+
+    _section("SSL / CA Certificates")
+    check_certificates()
+
     _section("Required Packages")
     required_packages = [
         ("openai", "OpenAI SDK"),
diff --git a/run_agent.py b/run_agent.py
index ffcf2abf1..5467b3b52 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -31,13 +31,21 @@ except ModuleNotFoundError:
     # means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
pass +import logging +logger = logging.getLogger(__name__) + +# Early SSL certificate guard (after hermes_bootstrap) +try: + from agent.ssl_guard import verify_ca_bundle_with_fallback + verify_ca_bundle_with_fallback() +except Exception as e: + logger.warning(f"SSL guard failed: {e}") + import asyncio import base64 import copy import hashlib import json -import logging -logger = logging.getLogger(__name__) import os import re import sys diff --git a/tests/agent/test_ssl_ca_guard.py b/tests/agent/test_ssl_ca_guard.py new file mode 100644 index 000000000..8b5ead3e0 --- /dev/null +++ b/tests/agent/test_ssl_ca_guard.py @@ -0,0 +1,64 @@ +"""Tests for the preventive SSL CA bundle guard.""" + +import os +import ssl +from pathlib import Path +from unittest.mock import patch + +import certifi +import pytest + +from agent.errors import SSLConfigurationError +from agent.ssl_guard import ( + verify_ca_bundle, + verify_ca_bundle_with_fallback, +) + + +def test_healthy_bundle_passes(tmp_path, monkeypatch): + """A real, non-empty certifi bundle must verify without raising.""" + # Sanity: certifi.where() must point to a real file in the test venv. 
+ bundle = Path(certifi.where()) + assert bundle.exists() + assert bundle.stat().st_size > 1024 + verify_ca_bundle() # should not raise + + +def test_missing_bundle_raises_ssl_error(monkeypatch, tmp_path): + """Point certifi.where() at a non-existent path; expect a clear error.""" + fake = tmp_path / "nope.pem" + monkeypatch.setattr(certifi, "where", lambda: str(fake)) + with pytest.raises(SSLConfigurationError) as exc: + verify_ca_bundle() + assert "not found" in str(exc.value).lower() + + +def test_empty_bundle_raises_ssl_error(monkeypatch, tmp_path): + """Empty file is treated as a corrupted bundle.""" + fake = tmp_path / "empty.pem" + fake.write_bytes(b"") + monkeypatch.setattr(certifi, "where", lambda: str(fake)) + with pytest.raises(SSLConfigurationError) as exc: + verify_ca_bundle() + assert "corrupted" in str(exc.value).lower() or "empty" in str(exc.value).lower() + + +def test_skip_env_var_disables_guard(monkeypatch, tmp_path): + """HERMES_SKIP_SSL_GUARD=1 must make the guard a no-op.""" + monkeypatch.setenv("HERMES_SKIP_SSL_GUARD", "1") + fake = tmp_path / "nope.pem" # would raise if guard ran + monkeypatch.setattr(certifi, "where", lambda: str(fake)) + verify_ca_bundle() # should not raise + + +def test_macos_fallback_allows_startup(monkeypatch, tmp_path): + """On Darwin, an unloadable certifi bundle must fall back to system trust.""" + fake = tmp_path / "broken.pem" + fake.write_bytes(b"not a real bundle") + monkeypatch.setattr(certifi, "where", lambda: str(fake)) + monkeypatch.setattr("platform.system", lambda: "Darwin") + + fake_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + with patch("ssl.create_default_context", return_value=fake_ctx): + # Should NOT raise — macOS fallback lets startup proceed. + verify_ca_bundle_with_fallback()