fix(auth): make HERMES_PORTAL_BASE_URL/NOUS_PORTAL_BASE_URL bypass the Portal host allowlist (#56864)

Ben caught that the initial approach (widening _NOUS_PORTAL_ALLOWED_HOSTS to
include the staging host) was the wrong fix -- env vars are supposed to
override the allowlist, mirroring how NOUS_INFERENCE_BASE_URL already
bypasses _ALLOWED_NOUS_INFERENCE_HOSTS via _nous_inference_env_override().

The actual bug: both resolve_nous_access_token and
resolve_nous_runtime_credentials read
`_optional_base_url(state.get("portal_base_url")) or os.getenv(...) or ...`
-- a plain `or` chain where the STORED state value wins first (short-circuits
before the env vars are even read), and then whichever value won gets run
through the same _NOUS_PORTAL_ALLOWED_HOSTS gate regardless of its source.
So a hosted agent stamped with HERMES_PORTAL_BASE_URL=<staging> in its env
AND a staging portal_base_url already persisted to auth.json would still
get silently rewritten to prod on every refresh, because the env var never
even got a chance to be consulted.

Revert the previous _NOUS_PORTAL_ALLOWED_HOSTS widening entirely --
staying prod-only preserves the allowlist's actual job (rejecting an
untrusted network-provided portal_base_url persisted to auth.json by a
compromised Portal response).

Add _nous_portal_env_override() (mirrors _nous_inference_env_override())
and restructure both call sites so the env override is checked FIRST and,
when set, wins outright and skips the allowlist gate entirely -- the
allowlist only ever runs against the fallback (stored-state-or-default)
path now.

Rewrote tests/hermes_cli/test_nous_portal_staging_allowlist.py to test the
actual fix: the helper function, and an end-to-end
resolve_nous_access_token proof that the env override wins even when state
ALSO has the staging host stored (the exact incident shape), that it wins
over a stored PROD host too, and that the allowlist's heal-to-prod
behaviour for an untrusted stored value is preserved when no override is
set.
This commit is contained in:
Ben Barclay 2026-07-02 16:52:46 +10:00 committed by GitHub
parent 88d1d6206f
commit 9be39de0f2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 258 additions and 20 deletions

View file

@ -1926,6 +1926,28 @@ def _nous_inference_env_override() -> Optional[str]:
return _optional_base_url(os.getenv("NOUS_INFERENCE_BASE_URL"))
def _nous_portal_env_override() -> Optional[str]:
"""Return the user/deployment-set Portal base URL override, if any.
Mirrors ``_nous_inference_env_override()``: ``HERMES_PORTAL_BASE_URL`` /
``NOUS_PORTAL_BASE_URL`` are the documented dev/staging escape hatch for
pointing Hermes at a non-production Nous Portal (e.g. a hosted agent
provisioned on nous-account-service's `staging` environment, which stamps
``HERMES_PORTAL_BASE_URL=https://portal.staging-nousresearch.com`` into
the container env). The env source is trusted (the OS user/deployment
set it themselves), so like the inference override it must NOT be
gated by ``_NOUS_PORTAL_ALLOWED_HOSTS``: that allowlist exists to reject
an untrusted NETWORK-provided value (a poisoned portal_base_url
persisted to auth.json), not a value the operator explicitly configured.
Returns a trailing-slash-stripped non-empty string, or ``None`` when
neither env var is set/blank.
"""
return _optional_base_url(
os.getenv("HERMES_PORTAL_BASE_URL") or os.getenv("NOUS_PORTAL_BASE_URL")
)
def _decode_jwt_claims(token: Any) -> Dict[str, Any]:
if not isinstance(token, str) or token.count(".") != 2:
return {}
@ -5386,20 +5408,29 @@ def resolve_nous_access_token(
relogin_required=True,
)
portal_base_url = (
_optional_base_url(state.get("portal_base_url"))
or os.getenv("HERMES_PORTAL_BASE_URL")
or os.getenv("NOUS_PORTAL_BASE_URL")
or DEFAULT_NOUS_PORTAL_URL
).rstrip("/")
# HERMES_PORTAL_BASE_URL / NOUS_PORTAL_BASE_URL is the trusted
# operator/deployment override (mirrors NOUS_INFERENCE_BASE_URL) and
# must win OUTRIGHT — including over a stored value — and bypass the
# host allowlist entirely, since the allowlist exists to reject an
# untrusted network-provided value, not one the operator configured.
# Only fall through to the stored/default value + allowlist gate when
# no override is set.
env_portal_override = _nous_portal_env_override()
if env_portal_override:
portal_base_url = env_portal_override.rstrip("/")
else:
portal_base_url = (
_optional_base_url(state.get("portal_base_url"))
or DEFAULT_NOUS_PORTAL_URL
).rstrip("/")
parsed_portal_url = urlparse(portal_base_url)
if parsed_portal_url.hostname and parsed_portal_url.hostname not in _NOUS_PORTAL_ALLOWED_HOSTS:
logger.warning(
"auth: ignoring invalid portal_base_url %r (host %r not in allowlist), using default",
portal_base_url, parsed_portal_url.hostname,
)
portal_base_url = DEFAULT_NOUS_PORTAL_URL
parsed_portal_url = urlparse(portal_base_url)
if parsed_portal_url.hostname and parsed_portal_url.hostname not in _NOUS_PORTAL_ALLOWED_HOSTS:
logger.warning(
"auth: ignoring invalid portal_base_url %r (host %r not in allowlist), using default",
portal_base_url, parsed_portal_url.hostname,
)
portal_base_url = DEFAULT_NOUS_PORTAL_URL
client_id = str(state.get("client_id") or DEFAULT_NOUS_CLIENT_ID)
verify = _resolve_verify(insecure=insecure, ca_bundle=ca_bundle, auth_state=state)
@ -5718,13 +5749,22 @@ def resolve_nous_runtime_credentials(
# A persisted/stale portal_base_url is where the refresh token gets
# POSTed on refresh — reject any host outside the allowlist so a
# poisoned value can't exfiltrate the bearer, healing to the default.
parsed_portal_url = urlparse(portal_base_url)
if parsed_portal_url.hostname and parsed_portal_url.hostname not in _NOUS_PORTAL_ALLOWED_HOSTS:
logger.warning(
"auth: ignoring invalid portal_base_url %r (host %r not in allowlist), using default",
portal_base_url, parsed_portal_url.hostname,
)
portal_base_url = DEFAULT_NOUS_PORTAL_URL
# The trusted operator/deployment env override (HERMES_PORTAL_BASE_URL /
# NOUS_PORTAL_BASE_URL) bypasses this gate entirely — mirrors
# NOUS_INFERENCE_BASE_URL's treatment below; the allowlist exists to
# reject an untrusted NETWORK-provided value, not one the operator
# explicitly configured.
env_portal_override = _nous_portal_env_override()
if env_portal_override:
portal_base_url = env_portal_override.rstrip("/")
else:
parsed_portal_url = urlparse(portal_base_url)
if parsed_portal_url.hostname and parsed_portal_url.hostname not in _NOUS_PORTAL_ALLOWED_HOSTS:
logger.warning(
"auth: ignoring invalid portal_base_url %r (host %r not in allowlist), using default",
portal_base_url, parsed_portal_url.hostname,
)
portal_base_url = DEFAULT_NOUS_PORTAL_URL
# Persisted value: validated network-provenance only. The stored
# inference_base_url is re-validated on read so a poisoned/stale

View file

@ -0,0 +1,198 @@
"""Regression tests for the Nous Portal env-override bypassing the host
allowlist, mirroring the existing NOUS_INFERENCE_BASE_URL /
_ALLOWED_NOUS_INFERENCE_HOSTS treatment.
Real incident (2026-07): a hosted agent provisioned by nous-account-service
on the `staging` Vercel environment is stamped with
``HERMES_PORTAL_BASE_URL=https://portal.staging-nousresearch.com`` in its
container env (the documented dev/staging override), while its bootstrap
``auth.json`` ALSO persists ``portal_base_url`` to the same staging host.
Before this fix, ``resolve_nous_access_token`` / ``resolve_nous_runtime_
credentials`` read ``state.get("portal_base_url")`` FIRST via a plain ``or``
chain, so whenever the stored state had ANY value the env vars were never
even consulted and whichever value won (state or env) was then run through
``_NOUS_PORTAL_ALLOWED_HOSTS``, which only recognised the production host.
The staging host was silently rewritten back to prod on every refresh, so a
staging-issued refresh token got replayed against the PROD token endpoint.
Prod correctly rejected that with ``invalid_grant``, which triggered
``_quarantine_nous_oauth_state`` and wiped the entire credential pool.
The correct fix (mirroring ``_nous_inference_env_override()``): the env
override is a TRUSTED value the operator/deployment set themselves it must
win outright (even over a stored value) and bypass the allowlist entirely.
The allowlist exists only to reject an untrusted NETWORK-provided value
(a poisoned portal_base_url written to auth.json by a compromised Portal
response), never a value the operator explicitly configured.
"""
from __future__ import annotations
import json
import logging
from hermes_cli.auth import (
DEFAULT_NOUS_PORTAL_URL,
_NOUS_PORTAL_ALLOWED_HOSTS,
_nous_portal_env_override,
)
class TestPortalEnvOverrideHelper:
def test_none_when_unset(self, monkeypatch):
monkeypatch.delenv("HERMES_PORTAL_BASE_URL", raising=False)
monkeypatch.delenv("NOUS_PORTAL_BASE_URL", raising=False)
assert _nous_portal_env_override() is None
def test_hermes_portal_base_url_wins(self, monkeypatch):
monkeypatch.setenv(
"HERMES_PORTAL_BASE_URL", "https://portal.staging-nousresearch.com/"
)
monkeypatch.delenv("NOUS_PORTAL_BASE_URL", raising=False)
assert (
_nous_portal_env_override() == "https://portal.staging-nousresearch.com"
)
def test_nous_portal_base_url_used_as_fallback(self, monkeypatch):
monkeypatch.delenv("HERMES_PORTAL_BASE_URL", raising=False)
monkeypatch.setenv(
"NOUS_PORTAL_BASE_URL", "https://portal.staging-nousresearch.com"
)
assert (
_nous_portal_env_override() == "https://portal.staging-nousresearch.com"
)
def test_env_override_not_gated_by_allowlist(self, monkeypatch):
"""The whole point: an env-set staging host is NOT in
_NOUS_PORTAL_ALLOWED_HOSTS, and the helper must return it anyway
gating happens only for network-provenance values."""
monkeypatch.setenv(
"HERMES_PORTAL_BASE_URL", "https://portal.staging-nousresearch.com"
)
assert "portal.staging-nousresearch.com" not in _NOUS_PORTAL_ALLOWED_HOSTS
assert (
_nous_portal_env_override() == "https://portal.staging-nousresearch.com"
)
class TestResolveAccessTokenEnvOverrideWins:
"""End-to-end: resolve_nous_access_token must use the env override for
the refresh call, bypassing the allowlist, even when state also has a
portal_base_url set (the exact incident shape)."""
def _write_auth_file(self, tmp_path, *, stored_portal_url):
auth_file = tmp_path / "auth.json"
auth_file.write_text(
json.dumps(
{
"version": 1,
"active_provider": "nous",
"providers": {
"nous": {
"portal_base_url": stored_portal_url,
"access_token": "expired-access",
"refresh_token": "staging-refresh",
"client_id": "hermes-cli-vps",
"expires_at": "2000-01-01T00:00:00+00:00",
}
},
}
)
)
return auth_file
def _run_and_capture(self, monkeypatch, auth):
seen_portal_urls = []
def _fake_refresh(*, client, portal_base_url, client_id, refresh_token):
seen_portal_urls.append(portal_base_url)
return {
"access_token": "new-access",
"refresh_token": "new-refresh",
"expires_in": 3600,
}
monkeypatch.setattr(auth, "_refresh_access_token", _fake_refresh)
caplog_records = []
logger = logging.getLogger("hermes_cli.auth")
handler = logging.Handler()
handler.emit = lambda record: caplog_records.append(record.getMessage())
logger.addHandler(handler)
try:
auth.resolve_nous_access_token()
finally:
logger.removeHandler(handler)
return seen_portal_urls, caplog_records
def test_env_override_wins_even_with_staging_state_stored(
self, monkeypatch, tmp_path
):
"""The real incident: state ALSO has the staging host stored (from
a prior HERMES_AUTH_JSON_BOOTSTRAP seed), and the env var is set to
the same staging host. Both must resolve to staging, and the
allowlist-rejection warning must never fire."""
import hermes_cli.auth as auth
staging_portal = "https://portal.staging-nousresearch.com"
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("HERMES_PORTAL_BASE_URL", staging_portal)
self._write_auth_file(tmp_path, stored_portal_url=staging_portal)
seen_portal_urls, records = self._run_and_capture(monkeypatch, auth)
assert seen_portal_urls == [staging_portal]
assert not any(
"ignoring invalid portal_base_url" in msg for msg in records
), "env override must bypass the allowlist gate entirely"
def test_env_override_wins_over_prod_state(self, monkeypatch, tmp_path):
"""Even when the STORED state is the prod host (e.g. a stale/healed
value from before the env var was set), the env override must still
win for the actual refresh call."""
import hermes_cli.auth as auth
staging_portal = "https://portal.staging-nousresearch.com"
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("HERMES_PORTAL_BASE_URL", staging_portal)
self._write_auth_file(tmp_path, stored_portal_url=DEFAULT_NOUS_PORTAL_URL)
seen_portal_urls, _records = self._run_and_capture(monkeypatch, auth)
assert seen_portal_urls == [staging_portal]
def test_no_env_override_stored_staging_host_heals_to_prod(
self, monkeypatch, tmp_path
):
"""Without the env override set, a stored staging host is untrusted
network provenance and correctly heals to prod (this is the
allowlist's actual job — preserved, not regressed, by this fix)."""
import hermes_cli.auth as auth
staging_portal = "https://portal.staging-nousresearch.com"
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.delenv("HERMES_PORTAL_BASE_URL", raising=False)
monkeypatch.delenv("NOUS_PORTAL_BASE_URL", raising=False)
self._write_auth_file(tmp_path, stored_portal_url=staging_portal)
seen_portal_urls, records = self._run_and_capture(monkeypatch, auth)
assert seen_portal_urls == [DEFAULT_NOUS_PORTAL_URL]
assert any("ignoring invalid portal_base_url" in msg for msg in records)
def test_no_env_no_staging_state_prod_url_used_unmodified(
self, monkeypatch, tmp_path
):
"""Baseline: no override, no staging state — prod is used and the
allowlist never even logs a warning (nothing was rejected)."""
import hermes_cli.auth as auth
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.delenv("HERMES_PORTAL_BASE_URL", raising=False)
monkeypatch.delenv("NOUS_PORTAL_BASE_URL", raising=False)
self._write_auth_file(tmp_path, stored_portal_url=DEFAULT_NOUS_PORTAL_URL)
seen_portal_urls, records = self._run_and_capture(monkeypatch, auth)
assert seen_portal_urls == [DEFAULT_NOUS_PORTAL_URL]
assert not any("ignoring invalid portal_base_url" in msg for msg in records)