diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 8030b889e..e19894c96 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -178,6 +178,9 @@ jobs: - name: Create manifest list and push working-directory: /tmp/digests + env: + IMAGE_NAME: ${{ env.IMAGE_NAME }} + RELEASE_TAG: ${{ github.event.release.tag_name }} run: | set -euo pipefail args=() @@ -185,9 +188,8 @@ jobs: args+=("${IMAGE_NAME}@sha256:${digest_file}") done if [ "${{ github.event_name }}" = "release" ]; then - TAG="${{ github.event.release.tag_name }}" docker buildx imagetools create \ - -t "${IMAGE_NAME}:${TAG}" \ + -t "${IMAGE_NAME}:${RELEASE_TAG}" \ "${args[@]}" else docker buildx imagetools create \ @@ -195,15 +197,14 @@ jobs: -t "${IMAGE_NAME}:latest" \ "${args[@]}" fi - env: - IMAGE_NAME: ${{ env.IMAGE_NAME }} - name: Inspect image + env: + IMAGE_NAME: ${{ env.IMAGE_NAME }} + RELEASE_TAG: ${{ github.event.release.tag_name }} run: | if [ "${{ github.event_name }}" = "release" ]; then - docker buildx imagetools inspect "${IMAGE_NAME}:${{ github.event.release.tag_name }}" + docker buildx imagetools inspect "${IMAGE_NAME}:${RELEASE_TAG}" else docker buildx imagetools inspect "${IMAGE_NAME}:main" fi - env: - IMAGE_NAME: ${{ env.IMAGE_NAME }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index fcee2c1b8..beb3a07ab 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -98,6 +98,8 @@ jobs: echo "base ty: $(wc -c < .lint-reports/base/ty.json) bytes" - name: Generate diff summary + env: + HEAD_REF: ${{ inputs.event_name == 'pull_request' && github.head_ref || github.ref_name }} run: | python scripts/lint_diff.py \ --base-ruff .lint-reports/base/ruff.json \ @@ -105,7 +107,7 @@ jobs: --base-ty .lint-reports/base/ty.json \ --head-ty .lint-reports/head/ty.json \ --base-ref "${{ steps.base.outputs.ref }}" \ - --head-ref "${{ inputs.event_name == 'pull_request' && github.head_ref || github.ref_name }}" \ + 
--head-ref "$HEAD_REF" \ --output .lint-reports/summary.md cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY" diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index 18ed3102c..08c0052bc 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy( eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "") eff_model = (model if model is not None else agent.model) or "" + # MoA virtual provider: the agent's model/provider are the preset name and + # "moa" — neither matches any caching branch, so the ACTING AGGREGATOR + # (often Claude on OpenRouter) silently lost prompt caching entirely + # (measured: 85% cache share solo vs 2% on the identical model via MoA — + # tens of millions of re-billed input tokens per benchmark run). Resolve + # the policy from the preset's real aggregator slot instead. + if eff_provider.strip().lower() == "moa": + try: + from hermes_cli.config import load_config as _load_moa_cfg + from hermes_cli.moa_config import resolve_moa_preset + from hermes_cli.runtime_provider import resolve_runtime_provider + + _preset = resolve_moa_preset( + _load_moa_cfg().get("moa") or {}, eff_model or None + ) + _agg = _preset.get("aggregator") or {} + _agg_provider = str(_agg.get("provider") or "").strip() + _agg_model = str(_agg.get("model") or "").strip() + if _agg_provider and _agg_model: + _agg_base_url = "" + _agg_api_mode = "" + try: + _rt = resolve_runtime_provider( + requested=_agg_provider, target_model=_agg_model + ) + _agg_base_url = _rt.get("base_url") or "" + _agg_api_mode = _rt.get("api_mode") or "" + except Exception: + pass + return anthropic_prompt_cache_policy( + agent, + provider=_agg_provider, + base_url=_agg_base_url, + api_mode=_agg_api_mode, + model=_agg_model, + ) + except Exception as _moa_exc: # pragma: no cover - defensive + logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc) + return 
False, False + model_lower = eff_model.lower() provider_lower = eff_provider.lower() is_claude = "claude" in model_lower diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 64768963e..1eb352496 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -4940,9 +4940,35 @@ def resolve_vision_provider_client( main_provider, ) else: + # Custom endpoints (``custom`` / ``custom:``) carry no + # built-in base_url/api_key — resolve_provider_client("custom") + # would return None ("no endpoint credentials found") and the + # whole chain would fall through to the aggregators, breaking + # vision for every user on a custom provider that has no + # separate ``auxiliary.vision`` block. Recover the live main + # endpoint that ``set_runtime_main()`` recorded for this turn so + # Step 1 can build a working client. + rpc_base_url = None + rpc_api_key = None + rpc_api_mode = resolved_api_mode + if main_provider == "custom" or main_provider.startswith("custom:"): + if _RUNTIME_MAIN_BASE_URL: + rpc_base_url = _RUNTIME_MAIN_BASE_URL + rpc_api_key = _RUNTIME_MAIN_API_KEY or None + rpc_api_mode = resolved_api_mode or _RUNTIME_MAIN_API_MODE or None + else: + # No live runtime recorded (non-gateway caller): fall + # back to resolving the configured custom endpoint. 
+ custom_base, custom_key, custom_mode = _resolve_custom_runtime() + if custom_base: + rpc_base_url = custom_base + rpc_api_key = custom_key + rpc_api_mode = resolved_api_mode or custom_mode or None rpc_client, rpc_model = resolve_provider_client( main_provider, vision_model, - api_mode=resolved_api_mode, + api_mode=rpc_api_mode, + explicit_base_url=rpc_base_url, + explicit_api_key=rpc_api_key, is_vision=True) if rpc_client is not None: logger.info( diff --git a/agent/file_safety.py b/agent/file_safety.py index 02e1eba2a..d7e20ee5f 100644 --- a/agent/file_safety.py +++ b/agent/file_safety.py @@ -304,6 +304,30 @@ def get_read_block_error(path: str) -> Optional[str]: return None +def raise_if_read_blocked(path: str) -> None: + """Raise ``ValueError`` if ``path`` is a denied Hermes read (see + :func:`get_read_block_error`), else return. + + Shared chokepoint for provider input-loading sites that read a local + file the model/tool supplied (e.g. image-gen ``image_url`` / + ``reference_image_urls`` paths). Centralizes the guard so every provider + enforces the same read boundary with identical semantics instead of each + open-coding the try/except block (#57698). + + Best-effort by design: if ``agent.file_safety`` machinery is somehow + unavailable at the call site the guard no-ops rather than breaking local + image loading — consistent with the defense-in-depth (not security + boundary) framing of the denylist itself. The blocking ``ValueError`` from + a real hit still propagates; only unexpected internal errors are swallowed. 
+ """ + try: + blocked = get_read_block_error(path) + except Exception: # noqa: BLE001 - guard must never break local-file loading + return + if blocked: + raise ValueError(blocked) + + # --------------------------------------------------------------------------- # Cross-profile write guard (#TBD) # diff --git a/agent/image_routing.py b/agent/image_routing.py index acd66fea8..ba6d8da32 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -17,13 +17,17 @@ It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native`` | ``text``, default ``auto``) and the active model's capability metadata. In ``auto`` mode: - - If the user has explicitly configured ``auxiliary.vision.provider`` - (i.e. not ``auto`` and not empty), we assume they want the text pipeline - regardless of the main model — they've opted in to a specific vision - backend for a reason (cost, quality, local-only, etc.). - - Otherwise, if the active model reports ``supports_vision=True`` in its - models.dev metadata, we attach natively. - - Otherwise (non-vision model, no explicit override), we fall back to text. + - If the active model reports ``supports_vision=True`` (via config + override or models.dev metadata), we attach natively — vision-capable + main models should always see the original pixels, even when an + auxiliary vision backend is configured. That auxiliary backend then + acts as a *fallback* for sessions whose main model can't take images. + - Otherwise, if the user has explicitly configured ``auxiliary.vision`` + (provider/model/base_url not ``auto``/empty), we route through the + text pipeline so the auxiliary vision backend can describe the image + for the text-only main model. + - Otherwise (non-vision model, no explicit override), we fall back to + text via the default vision_analyze flow. 
This keeps ``vision_analyze`` surfaced as a tool in every session — skills and agent flows that chain it (browser screenshots, deeper inspection of @@ -185,7 +189,8 @@ def _supports_vision_override( 2. ``providers..models..supports_vision`` (named custom providers — ``provider`` may be the runtime-resolved value ``"custom"`` and/or the user-declared name under - ``model.provider``; both are tried) + ``model.provider``; both are tried. For ``custom:`` syntax, + the stripped ```` is also tried as a provider key.) Returns None when no override is set, so the caller falls through to models.dev. Returns False explicitly only when the user wrote a @@ -205,11 +210,16 @@ def _supports_vision_override( # get rewritten to provider="custom" at runtime # (hermes_cli/runtime_provider.py:_resolve_named_custom_runtime), so the # config still holds the user-declared name under model.provider. Try - # both as candidate provider keys. + # both as candidate provider keys, plus the stripped suffix from + # "custom:" (where is the key under providers:). config_provider = str(model_cfg.get("provider") or "").strip() + # Extract the stripped name from "custom:" if present + stripped_suffix = "" + if config_provider.startswith("custom:"): + stripped_suffix = config_provider[len("custom:"):] providers_raw = cfg.get("providers") providers_cfg: Dict[str, Any] = providers_raw if isinstance(providers_raw, dict) else {} - for p in dict.fromkeys(filter(None, (provider, config_provider))): + for p in dict.fromkeys(filter(None, (provider, config_provider, stripped_suffix))): entry_raw = providers_cfg.get(p) entry: Dict[str, Any] = entry_raw if isinstance(entry_raw, dict) else {} models_raw = entry.get("models") @@ -336,8 +346,10 @@ def _coerce_mode(raw: Any) -> str: def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool: """True when the user configured a specific auxiliary vision backend. 
- An explicit override means the user *wants* the text pipeline (they're - paying for a dedicated vision model), so we don't silently bypass it. + An explicit override means the user has a dedicated vision backend + available; it's used as a *fallback* when the main model can't take + images natively. In ``auto`` mode, native vision on a vision-capable + main model still wins over this fallback — see issue #29135. """ if not isinstance(cfg, dict): return False @@ -426,13 +438,15 @@ def decide_image_input_mode( if mode_cfg == "text": return "text" - # auto - if _explicit_aux_vision_override(cfg): - return "text" - + # auto: prefer native vision when the main model supports it. An + # explicit auxiliary.vision config acts as a *fallback* for text-only + # main models — it should not preempt native vision on a model that + # can natively inspect the pixels (issue #29135). supports = _lookup_supports_vision(provider, model, cfg) if supports is True: return "native" + if _explicit_aux_vision_override(cfg): + return "text" return "text" diff --git a/agent/moa_loop.py b/agent/moa_loop.py index 439698444..ccaebda8f 100644 --- a/agent/moa_loop.py +++ b/agent/moa_loop.py @@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]: return out +def _maybe_apply_advisor_cache_control( + messages: list[dict[str, Any]], + runtime: dict[str, Any], +) -> list[dict[str, Any]]: + """Decorate an advisor request with cache_control when its route honors it. + + Reuses the SAME policy function as the main agent loop + (``anthropic_prompt_cache_policy``) resolved against the advisor slot's + own provider/base_url/api_mode/model, and the SAME breakpoint layout + (``apply_anthropic_cache_control``, system_and_3). This keeps advisor + calls decorated exactly like an acting agent on that provider would be — + no MoA-specific caching logic to drift. + + Returns the messages unchanged on any resolution error or when the + policy says the route doesn't honor markers. 
+ """ + try: + from types import SimpleNamespace + + from agent.agent_runtime_helpers import anthropic_prompt_cache_policy + from agent.prompt_caching import apply_anthropic_cache_control + + # The policy function reads agent.* only as fallbacks for kwargs we + # don't pass; provide a stub so an advisor slot is judged purely on + # its own resolved runtime. + stub = SimpleNamespace(provider="", base_url="", api_mode="", model="") + should_cache, native_layout = anthropic_prompt_cache_policy( + stub, + provider=runtime.get("provider") or "", + base_url=runtime.get("base_url") or "", + api_mode=runtime.get("api_mode") or "", + model=runtime.get("model") or "", + ) + if not should_cache: + return messages + return apply_anthropic_cache_control( + messages, native_anthropic=native_layout + ) + except Exception as exc: # pragma: no cover - decoration must never break a call + logger.debug("advisor cache_control decoration skipped: %s", exc) + return messages + + def _run_reference( slot: dict[str, str], ref_messages: list[dict[str, Any]], @@ -214,6 +257,18 @@ def _run_reference( # trimmed view (_reference_messages) already strips the agent's own # system prompt, so this is the only system message the reference sees. messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages] + # Apply the same Anthropic-style prompt-caching decoration the main + # agent loop applies (system_and_3 breakpoints). The advisory view is + # append-only across iterations (new turns append before the trailing + # synthetic marker), so on cache-honoring routes (Claude via + # OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix + # replays iteration N's cached prefix. Without this, Claude advisors + # served ZERO cache reads across an entire benchmark run (measured: + # 0/1227 calls, 11.5M re-billed input tokens) because Anthropic + # caching is opt-in per request. 
OpenAI-family advisors are untouched + # (their caching is automatic; markers are ignored harmlessly, but we + # only decorate when the policy says the route honors them). + messages = _maybe_apply_advisor_cache_control(messages, runtime) response = call_llm( task="moa_reference", messages=messages, diff --git a/apps/bootstrap-installer/src-tauri/src/update.rs b/apps/bootstrap-installer/src-tauri/src/update.rs index c085ef60a..28597600e 100644 --- a/apps/bootstrap-installer/src-tauri/src/update.rs +++ b/apps/bootstrap-installer/src-tauri/src/update.rs @@ -230,6 +230,14 @@ async fn run_update(app: AppHandle) -> Result<()> { // us, and wait_for_install_locks_free below force-kills any straggler — so by the // time `hermes update` runs there is no legitimate hermes.exe to protect, // and the guard would only produce a false "Hermes is still running" stop. + // + // NOTE: --force does NOT bypass the venv-python holder guard (that needs + // an explicit `--force-venv`, which we deliberately do not pass). Our lock + // probe only checks the hermes.exe shim and app.asar, so an external venv + // python holding a native .pyd (a user terminal, an unmanaged gateway) + // could still be alive here — mutating the venv under it would strand the + // install half-updated. If that guard fires, it exits 2 and the match arm + // below surfaces the correct "close all Hermes windows" message. update_args.push("--force".into()); update_args.push("--branch".into()); update_args.push(update_branch); diff --git a/apps/desktop/electron/link-title-window.cjs b/apps/desktop/electron/link-title-window.cjs index 3aeabcfe6..c6792bf98 100644 --- a/apps/desktop/electron/link-title-window.cjs +++ b/apps/desktop/electron/link-title-window.cjs @@ -49,4 +49,23 @@ function guardLinkTitleSession(partitionSession) { } } -module.exports = { createLinkTitleWindow, guardLinkTitleSession, linkTitleWindowOptions } +// Read the page title from a title-fetch window. 
Callers schedule this from +// timers that can fire after finish() destroys the window, so every access must +// guard isDestroyed and swallow Electron's "Object has been destroyed" throws. +function readLinkTitleWindowTitle(window) { + try { + if (!window || window.isDestroyed()) return '' + const contents = window.webContents + if (!contents || contents.isDestroyed()) return '' + return contents.getTitle() || '' + } catch { + return '' + } +} + +module.exports = { + createLinkTitleWindow, + guardLinkTitleSession, + linkTitleWindowOptions, + readLinkTitleWindowTitle +} diff --git a/apps/desktop/electron/link-title-window.test.cjs b/apps/desktop/electron/link-title-window.test.cjs index 1c482a77d..468c646a0 100644 --- a/apps/desktop/electron/link-title-window.test.cjs +++ b/apps/desktop/electron/link-title-window.test.cjs @@ -1,7 +1,12 @@ const assert = require('node:assert/strict') const test = require('node:test') -const { createLinkTitleWindow, guardLinkTitleSession, linkTitleWindowOptions } = require('./link-title-window.cjs') +const { + createLinkTitleWindow, + guardLinkTitleSession, + linkTitleWindowOptions, + readLinkTitleWindowTitle +} = require('./link-title-window.cjs') function makeFakeBrowserWindow() { const calls = { audioMuted: [] } @@ -80,3 +85,44 @@ test('guardLinkTitleSession is a no-op when session.on throws', () => { }) ) }) + +test('readLinkTitleWindowTitle returns empty for missing or destroyed windows', () => { + assert.equal(readLinkTitleWindowTitle(null), '') + assert.equal(readLinkTitleWindowTitle(undefined), '') + assert.equal(readLinkTitleWindowTitle({ isDestroyed: () => true }), '') +}) + +test('readLinkTitleWindowTitle returns empty when webContents is destroyed', () => { + const window = { + isDestroyed: () => false, + webContents: { isDestroyed: () => true, getTitle: () => 'Should Not Read' } + } + + assert.equal(readLinkTitleWindowTitle(window), '') +}) + +test('readLinkTitleWindowTitle swallows getTitle throws after teardown', () => 
{ + const window = { + isDestroyed: () => false, + webContents: { + isDestroyed: () => false, + getTitle: () => { + throw new Error('Object has been destroyed') + } + } + } + + assert.equal(readLinkTitleWindowTitle(window), '') +}) + +test('readLinkTitleWindowTitle returns trimmed page title', () => { + const window = { + isDestroyed: () => false, + webContents: { + isDestroyed: () => false, + getTitle: () => 'Example Domain' + } + } + + assert.equal(readLinkTitleWindowTitle(window), 'Example Domain') +}) diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index e3a0ae5d6..bd6b867fa 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -36,7 +36,11 @@ const { SESSION_WINDOW_MIN_WIDTH } = require('./session-windows.cjs') const { canImportHermesCli, verifyHermesCli } = require('./backend-probes.cjs') -const { createLinkTitleWindow, guardLinkTitleSession } = require('./link-title-window.cjs') +const { + createLinkTitleWindow, + guardLinkTitleSession, + readLinkTitleWindowTitle +} = require('./link-title-window.cjs') const { probeGatewayWebSocket } = require('./gateway-ws-probe.cjs') const { adoptServedDashboardToken } = require('./dashboard-token.cjs') const { waitForDashboardPortAnnouncement } = require('./backend-ready.cjs') @@ -2161,9 +2165,25 @@ async function releaseBackendLock(updateRoot, tag) { rememberLog(`[${tag}] venv shim unlocked; safe to proceed`) return { unlocked: true } } + // A supervised backend can respawn between kill and check (grandchildren, + // pool entries registered mid-teardown). Re-collect and re-kill each pass + // instead of trusting the initial sweep. 
+ const stragglers = [] + if (hermesProcess && Number.isInteger(hermesProcess.pid)) stragglers.push(hermesProcess.pid) + for (const entry of backendPool.values()) { + if (entry.process && Number.isInteger(entry.process.pid)) stragglers.push(entry.process.pid) + } + for (const pid of stragglers) forceKillProcessTree(pid) await new Promise(r => setTimeout(r, 300)) } - rememberLog(`[${tag}] venv shim still locked after 15s; proceeding anyway (force)`) + // Do NOT proceed past a held lock: handing off to the updater while another + // process (a second desktop window, a user terminal, an unkillable child) + // still maps the venv's files guarantees a half-updated venv — the updater's + // dependency sync dies on access-denied partway through uninstalls, leaving + // imports broken (the July 2026 brotlicffi/_sodium.pyd incidents). Failing + // the update loudly and keeping the app running is strictly better than a + // bricked install that needs manual venv surgery. + rememberLog(`[${tag}] venv shim still locked after 15s; aborting hand-off (something outside this app holds the venv)`) return { unlocked: false } } @@ -2243,7 +2263,20 @@ async function applyUpdates(opts = {}) { // spawn the updater. Without this the updater races a still-locked // hermes.exe (held by the backend child / its grandchildren) and the update // bricks. See releaseBackendLockForUpdate for the full failure analysis. - await releaseBackendLockForUpdate(updateRoot) + const lock = await releaseBackendLockForUpdate(updateRoot) + if (!lock.unlocked) { + // Something OUTSIDE this app holds the venv (a second window, a user + // terminal running hermes, an unkillable child). Handing off anyway + // guarantees a half-updated venv — abort loudly instead and let the + // user close the holder and retry. Restart our own backend so the app + // keeps working after the failed attempt. 
+ const message = + 'Update aborted: another process is holding the Hermes install open ' + + '(a second Hermes window or a terminal running hermes?). Close it and retry.' + emitUpdateProgress({ stage: 'error', message, percent: null }) + startHermes().catch(() => {}) + return { ok: false, error: message } + } // Detached so the updater outlives this process — it needs us GONE before // `hermes update` will run (the venv shim is locked while we live). @@ -3552,13 +3585,13 @@ function runRenderTitleJob(rawUrl) { return finish('') } - const readTitle = () => window?.webContents?.getTitle?.() || '' + const finishWithTitle = () => finish(readLinkTitleWindowTitle(window)) const scheduleGrace = () => { if (graceTimer) clearTimeout(graceTimer) - graceTimer = setTimeout(() => finish(readTitle()), RENDER_TITLE_GRACE_MS) + graceTimer = setTimeout(finishWithTitle, RENDER_TITLE_GRACE_MS) } - hardTimer = setTimeout(() => finish(readTitle()), RENDER_TITLE_TIMEOUT_MS) + hardTimer = setTimeout(finishWithTitle, RENDER_TITLE_TIMEOUT_MS) window.webContents.setUserAgent(TITLE_USER_AGENT) window.webContents.on('page-title-updated', scheduleGrace) diff --git a/cron/scheduler.py b/cron/scheduler.py index e072fce7f..6c26efb86 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -3114,7 +3114,26 @@ def run_one_job(job: dict, *, adapters=None, loop=None, verbose: bool = False) - ) return True # not an error — already handled/removed - success, output, final_response, error = run_job(job) + # Run the job under the profile's secret scope. get_secret() fails + # closed outside a scope once profile isolation is in play (multiple + # gateway profiles / room→profile multiplexing), and cron fires from + # the ticker thread where no per-turn scope is installed — so + # resolve_runtime_provider() raised UnscopedSecretError before model + # selection, breaking every cron job. Mirrors the per-turn pattern in + # gateway/run.py (_profile_runtime_scope). 
+ from agent.secret_scope import ( + build_profile_secret_scope, + reset_secret_scope, + set_secret_scope, + ) + + _scope_token = set_secret_scope( + build_profile_secret_scope(_get_hermes_home()) + ) + try: + success, output, final_response, error = run_job(job) + finally: + reset_secret_scope(_scope_token) output_file = save_job_output(job["id"], output) if verbose: diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py index 3cf11f335..a02959684 100644 --- a/gateway/platforms/api_server.py +++ b/gateway/platforms/api_server.py @@ -54,9 +54,11 @@ except ImportError: from gateway.config import Platform, PlatformConfig from gateway.platforms.base import ( + MEDIA_TAG_CLEANUP_RE, BasePlatformAdapter, SendResult, is_network_accessible, + validate_media_delivery_path, ) from agent.redact import redact_sensitive_text @@ -581,9 +583,6 @@ _MEDIA_MIME = { ".webp": "image/webp", ".bmp": "image/bmp", } -_MEDIA_TAG_RE = re.compile( - r"[`\"']?MEDIA:\s*(`[^`\n]+`|\"[^\"\n]+\"|'[^'\n]+'|\S+)[`\"']?" -) _MEDIA_DATA_URL_MAX_BYTES = 5 * 1024 * 1024 # skip images larger than 5MB @@ -594,18 +593,35 @@ def _resolve_media_to_data_urls(text: str) -> str: ``MEDIA:`` tags referencing images on the server are useless to them. Inline small local images as markdown data URLs; non-image or unreadable paths are left untouched. + + Uses the same anchored ``MEDIA_TAG_CLEANUP_RE`` matcher and + ``validate_media_delivery_path`` safety check every other platform + adapter's media delivery already goes through (gateway/platforms/base.py) + — an absolute-path anchor plus a known-extension requirement, and a + resolved-path check against the credential/system-path denylist. 
The + prior pattern here matched any bare token after ``MEDIA:`` (including a + relative/traversal path like ``../../etc/passwd.png``) and read the file + directly with no denylist, so any image-suffixed, readable file the + process could see was base64-exfiltrated to the API caller if its path + merely appeared in the model's own final reply text. """ if not text or "MEDIA:" not in text: return text import base64 def _to_data_url(path_str: str) -> Optional[str]: - p = Path(path_str.strip().strip("`\"'")).expanduser() + # validate_media_delivery_path() strips wrapping quotes/backticks + # and trailing punctuation internally, same as MEDIA_TAG_CLEANUP_RE's + # other callers (extract_media / _strip_media_tag_directives) rely on. + safe_path = validate_media_delivery_path(path_str) + if not safe_path: + return None + p = Path(safe_path) suffix = p.suffix.lower() if suffix not in _MEDIA_IMG_EXT: return None try: - if not p.is_file() or p.stat().st_size > _MEDIA_DATA_URL_MAX_BYTES: + if p.stat().st_size > _MEDIA_DATA_URL_MAX_BYTES: return None b64 = base64.b64encode(p.read_bytes()).decode() except OSError: @@ -613,10 +629,10 @@ def _resolve_media_to_data_urls(text: str) -> str: return f"![image](data:{_MEDIA_MIME[suffix]};base64,{b64})" def _repl(m: "re.Match[str]") -> str: - return _to_data_url(m.group(1)) or m.group(0) + return _to_data_url(m.group("path")) or m.group(0) try: - return _MEDIA_TAG_RE.sub(_repl, text) + return MEDIA_TAG_CLEANUP_RE.sub(_repl, text) except Exception: return text diff --git a/gateway/run.py b/gateway/run.py index d3992a760..646a48569 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9962,6 +9962,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew event: MessageEvent, source: SessionSource, history: List[Dict[str, Any]], + session_key: Optional[str] = None, ) -> Optional[str]: """Prepare inbound event text for the agent. 
@@ -9980,10 +9981,10 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew message_text = event.text or "" _group_sessions_per_user = getattr(self.config, "group_sessions_per_user", True) _thread_sessions_per_user = getattr(self.config, "thread_sessions_per_user", False) - # Use the same helper every other call site uses so the write key here - # matches the consume key at the run_conversation site — even if the - # session store overrides build_session_key's default behavior. - session_key = self._session_key_for_source(source) + # Prefer the already resolved session key from the caller so this write + # key matches the consume key at the run_conversation site. Fall back + # to deriving it here for tests and legacy standalone callers. + session_key = session_key or self._session_key_for_source(source) # Reset only this session's per-call buffer; other sessions may be # concurrently preparing multimodal turns on the same runner. self._consume_pending_native_image_paths(session_key) @@ -10034,7 +10035,10 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew if image_paths: # Decide routing: native (attach pixels) vs text (vision_analyze # pre-run + prepend description). See agent/image_routing.py. - _img_mode = self._decide_image_input_mode() + _img_mode = self._decide_image_input_mode( + source=source, + session_key=session_key, + ) if _img_mode == "native": # Defer attachment to the run_conversation call site. 
pending_native = getattr(self, "_pending_native_image_paths_by_session", None) @@ -10997,6 +11001,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew event=event, source=source, history=history, + session_key=session_key, ) if message_text is None: return @@ -14379,25 +14384,62 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew except TypeError: executor.shutdown(wait=False) - def _decide_image_input_mode(self) -> str: - """Resolve the image-input routing for the currently active model. + def _decide_image_input_mode( + self, + *, + source: Optional[SessionSource] = None, + session_key: Optional[str] = None, + user_config: Optional[dict] = None, + provider: Optional[str] = None, + model: Optional[str] = None, + ) -> str: + """Resolve image-input routing for the effective model this turn. Returns ``"native"`` (attach pixels on the user turn) or ``"text"`` (pre-analyze with vision_analyze and prepend the description). See agent/image_routing.py for the full decision table. - The active provider/model are read from config.yaml so the decision - tracks ``/model`` switches automatically on the next message. + Gateway sessions can have /model overrides that live outside + config.yaml. Image preprocessing runs before AIAgent sets the + auxiliary_client runtime globals, so resolve the same per-session + runtime bundle the upcoming agent turn will use instead of consulting + only the persisted default model. 
""" try: from agent.image_routing import decide_image_input_mode from agent.auxiliary_client import _read_main_model, _read_main_provider from hermes_cli.config import load_config - cfg = load_config() - provider = _read_main_provider() - model = _read_main_model() - return decide_image_input_mode(provider, model, cfg) + cfg = user_config if isinstance(user_config, dict) else load_config() + resolved_provider = (provider or "").strip() + resolved_model = (model or "").strip() + + needs_session_runtime = not resolved_provider or not resolved_model + has_session_identity = source is not None or session_key + if needs_session_runtime and has_session_identity: + try: + turn_model, runtime_kwargs = self._resolve_session_agent_runtime( + source=source, + session_key=session_key, + user_config=cfg, + ) + if not resolved_model and isinstance(turn_model, str): + resolved_model = turn_model.strip() + runtime_provider = runtime_kwargs.get("provider") if isinstance(runtime_kwargs, dict) else None + if not resolved_provider and isinstance(runtime_provider, str): + resolved_provider = runtime_provider.strip() + except Exception as exc: + logger.debug( + "image_routing: session runtime resolution failed, falling back to config — %s", + exc, + ) + + if not resolved_provider: + resolved_provider = _read_main_provider() + if not resolved_model: + resolved_model = _read_main_model() + + return decide_image_input_mode(resolved_provider, resolved_model, cfg) except Exception as exc: logger.debug("image_routing: decision failed, falling back to text — %s", exc) return "text" @@ -19061,6 +19103,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew next_message = pending next_message_id = None next_channel_prompt = None + next_session_key = session_key if pending_event is not None: next_source = getattr(pending_event, "source", None) or source if self._is_goal_continuation_event(pending_event) and not self._goal_still_active_for_session(session_id): @@ 
-19069,10 +19112,24 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew session_key or "?", ) return result + # Resolve the follow-up's session key BEFORE preparing the + # inbound text: _prepare_inbound_message_text buffers native + # image paths under the key it is given, and the recursive + # _run_agent below consumes them under next_session_key. + # The write and consume keys must match or the images drop. + try: + next_session_key = self._session_key_for_source(next_source) + except Exception: + logger.debug( + "Queued follow-up session-key resolution failed; reusing %s", + session_key or "?", + exc_info=True, + ) next_message = await self._prepare_inbound_message_text( event=pending_event, source=next_source, history=updated_history, + session_key=next_session_key, ) if next_message is None: return result @@ -19114,7 +19171,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew history=updated_history, source=next_source, session_id=session_id, - session_key=session_key, + session_key=next_session_key, run_generation=run_generation, _interrupt_depth=_interrupt_depth + 1, event_message_id=next_message_id, diff --git a/hermes_cli/_parser.py b/hermes_cli/_parser.py index 521c5fcf9..d06a3d4ac 100644 --- a/hermes_cli/_parser.py +++ b/hermes_cli/_parser.py @@ -71,6 +71,7 @@ Examples: hermes logs errors View errors.log hermes logs --since 1h Lines from the last hour hermes debug share Upload debug report for support + hermes console Open the safe Hermes command console hermes update Update to latest version hermes dashboard Start web UI dashboard (port 9119) hermes dashboard --stop Stop running dashboard processes diff --git a/hermes_cli/console_engine.py b/hermes_cli/console_engine.py new file mode 100644 index 000000000..7bfa13fbf --- /dev/null +++ b/hermes_cli/console_engine.py @@ -0,0 +1,1876 @@ +"""Safe Hermes Console command engine. 
+ +This module backs ``hermes console`` and is intentionally narrower than the +full Hermes CLI. It exposes a curated set of native adapters that can later be +shared by the dashboard console websocket without becoming a raw shell. +""" + +from __future__ import annotations + +import argparse +import contextlib +import difflib +import functools +import importlib +import io +import json +import shlex +import sys +from dataclasses import dataclass, replace +from pathlib import Path +from typing import Callable, Iterable, Literal, NoReturn, Sequence +from urllib.parse import urlparse + +from tools.ansi_strip import strip_ansi as _strip_ansi + + +ConsoleStatus = Literal["ok", "error", "confirm_required", "exit", "clear"] +ConsoleContext = Literal["local", "hosted"] +ALL_CONTEXTS: frozenset[ConsoleContext] = frozenset({"local", "hosted"}) +LOCAL_CONTEXTS: frozenset[ConsoleContext] = frozenset({"local"}) + + +class ConsoleCommandError(RuntimeError): + """User-facing console command failure.""" + + +@dataclass(frozen=True) +class ConsoleResult: + status: ConsoleStatus + output: str = "" + command: str = "" + confirmation_message: str = "" + + +@dataclass(frozen=True) +class ConsoleCommand: + path: tuple[str, ...] 
+ usage: str + summary: str + handler: Callable[["HermesConsoleEngine", list[str]], str] + mutating: bool = False + confirmation: str = "" + contexts: frozenset[ConsoleContext] = LOCAL_CONTEXTS + + +class _ArgumentParser(argparse.ArgumentParser): + def error(self, message: str) -> NoReturn: # pragma: no cover - argparse hook + raise ConsoleCommandError(f"{self.prog}: {message}") + + +def _capture_output(fn: Callable[[], object]) -> str: + stdout = io.StringIO() + stderr = io.StringIO() + code = 0 + with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr): + try: + result = fn() + if isinstance(result, int) and result: + raise SystemExit(result) + except SystemExit as exc: + code = int(exc.code or 0) + text = stdout.getvalue() + stderr.getvalue() + if code: + raise ConsoleCommandError(text.strip() or f"Command exited with status {code}") + return text.rstrip() + + +def _is_status_footer_rule(line: str) -> bool: + stripped = _strip_ansi(line).strip() + if len(stripped) < 8: + return False + normalized = stripped.replace("\u2500", "-") + return set(normalized) <= {"-"} + + +def _strip_console_status_footer(text: str) -> str: + lines = text.splitlines() + while lines and not _strip_ansi(lines[-1]).strip(): + lines.pop() + if len(lines) < 2: + return text.rstrip() + + last = _strip_ansi(lines[-1]).strip() + prev = _strip_ansi(lines[-2]).strip() + if not ( + prev.startswith("Run 'hermes doctor'") + and last.startswith("Run 'hermes setup'") + ): + return text.rstrip() + + lines = lines[:-2] + while lines and not _strip_ansi(lines[-1]).strip(): + lines.pop() + if lines and _is_status_footer_rule(lines[-1]): + lines.pop() + return "\n".join(lines).rstrip() + + +def _table_summary(summary: str, *, limit: int = 76) -> str: + summary = " ".join(summary.split()) + if len(summary) <= limit: + return summary + return f"{summary[: limit - 3].rstrip()}..." 
+ + +def _split_line(line: str) -> list[str]: + try: + return shlex.split(line, comments=False, posix=True) + except ValueError as exc: + raise ConsoleCommandError(f"Could not parse command: {exc}") from exc + + +def _contains_shell_syntax(line: str, tokens: Sequence[str]) -> bool: + if "$(" in line or "`" in line: + return True + shell_tokens = {"|", "||", "&", "&&", ";", ">", ">>", "<", "<<", "2>", "2>>"} + if any(token in shell_tokens for token in tokens): + return True + return any(ch in line for ch in "|<>;") + + +def _format_sessions(sessions: Sequence[dict]) -> str: + if not sessions: + return "No sessions found." + lines = [f"{'ID':<32} {'Source':<12} {'Msgs':>5} Title / Preview"] + lines.append("-" * 82) + for session in sessions: + sid = str(session.get("id") or "")[:32] + source = str(session.get("source") or "-")[:12] + messages = session.get("message_count") or 0 + title = session.get("title") or session.get("preview") or "" + title = str(title).replace("\n", " ")[:60] + lines.append(f"{sid:<32} {source:<12} {messages:>5} {title}") + return "\n".join(lines) + + +def _format_job(job: dict, action: str) -> str: + job_id = job.get("id") or job.get("job_id") or "?" + name = job.get("name") or "(unnamed)" + state = job.get("state") or ("scheduled" if job.get("enabled", True) else "paused") + return f"{action} job: {name} ({job_id}) [{state}]" + + +EXPECTED_HOSTED_PATHS: tuple[tuple[str, ...], ...] 
= ( + ("status",), + ("doctor",), + ("logs",), + ("version",), + ("prompt-size",), + ("insights",), + ("security", "audit"), + ("portal", "info"), + ("portal", "tools"), + ("send",), + ("config", "show"), + ("config", "path"), + ("config", "env-path"), + ("config", "check"), + ("config", "migrate"), + ("config", "set"), + ("sessions", "list"), + ("sessions", "stats"), + ("sessions", "export"), + ("sessions", "rename"), + ("sessions", "optimize"), + ("sessions", "repair"), + ("cron", "list"), + ("cron", "status"), + ("cron", "create"), + ("cron", "edit"), + ("cron", "pause"), + ("cron", "resume"), + ("cron", "run"), + ("cron", "remove"), + ("cron", "tick"), + ("profile",), + ("profile", "list"), + ("profile", "show"), + ("profile", "info"), + ("tools", "list"), + ("tools", "enable"), + ("tools", "disable"), + ("tools", "post-setup"), + ("skills", "browse"), + ("skills", "search"), + ("skills", "inspect"), + ("skills", "list"), + ("skills", "check"), + ("skills", "list-modified"), + ("skills", "diff"), + ("skills", "install"), + ("skills", "update"), + ("skills", "audit"), + ("skills", "uninstall"), + ("skills", "reset"), + ("skills", "opt-in"), + ("skills", "opt-out"), + ("skills", "repair-official"), + ("skills", "snapshot", "export"), + ("skills", "tap", "list"), + ("mcp", "list"), + ("mcp", "catalog"), + ("mcp", "test"), + ("mcp", "add"), + ("mcp", "remove"), + ("mcp", "install"), + ("mcp", "login"), + ("mcp", "reauth"), + ("mcp", "configure"), + ("mcp", "picker"), + ("memory", "status"), + ("auth", "list"), + ("auth", "status"), + ("auth", "reset"), + ("auth", "spotify", "status"), + ("pairing", "list"), + ("pairing", "approve"), + ("pairing", "revoke"), + ("pairing", "clear-pending"), + ("webhook", "list"), + ("webhook", "subscribe"), + ("webhook", "remove"), + ("webhook", "test"), +) + + +def _parser_root() -> tuple[_ArgumentParser, argparse._SubParsersAction]: + parser = _ArgumentParser(prog="hermes", add_help=False) + subparsers = 
parser.add_subparsers(dest="_console_command") + return parser, subparsers + + +def _subparser_actions(parser: argparse.ArgumentParser) -> list[argparse._SubParsersAction]: + return [ + action + for action in parser._actions + if isinstance(action, argparse._SubParsersAction) + ] + + +def _choice_help(action: argparse._SubParsersAction, name: str) -> str: + for choice in action._choices_actions: + if getattr(choice, "dest", None) == name or getattr(choice, "metavar", None) == name: + help_text = getattr(choice, "help", None) + if help_text and help_text is not argparse.SUPPRESS: + return str(help_text) + return "" + + +def _clean_summary(text: str | None) -> str: + if not text: + return "" + if text is argparse.SUPPRESS: + return "" + summary = " ".join(str(text).split()) + if not summary: + return "" + if summary.startswith("Run `hermes "): + return "" + return summary + + +def _summaries_from_parser(parser: argparse.ArgumentParser) -> dict[tuple[str, ...], str]: + summaries: dict[tuple[str, ...], str] = {} + + def walk(current: argparse.ArgumentParser, path: tuple[str, ...]) -> None: + for action in _subparser_actions(current): + for name, child in action.choices.items(): + child_path = (*path, name) + summary = _clean_summary(_choice_help(action, name)) or _clean_summary( + child.description + ) + if summary: + summaries.setdefault(child_path, summary) + walk(child, child_path) + + walk(parser, ()) + return summaries + + +def _noop_console_command(_args: argparse.Namespace) -> None: + return None + + +# The CLI surface these helpers reflect is process-static: they import a +# subcommand module and build a throwaway argparse tree purely to extract help +# summaries. Nothing about the result changes across engine instances, but the +# dashboard opens a fresh HermesConsoleEngine per /api/console connection, so +# without memoization every reconnect re-imports + re-parses the whole surface. +# Cache by args (all hashable strings); callers only read the returned map. 
+@functools.lru_cache(maxsize=None) +def _extracted_summaries( + module_name: str, + builder_name: str, + main_handler_name: str, +) -> dict[tuple[str, ...], str]: + try: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + builder = getattr(module, builder_name) + builder(subparsers, **{main_handler_name: _noop_console_command}) + return _summaries_from_parser(parser) + except Exception: + return {} + + +@functools.lru_cache(maxsize=None) +def _registered_summaries( + root: str, + module_name: str, + register_name: str, +) -> dict[tuple[str, ...], str]: + try: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + top_parser = subparsers.add_parser(root) + register = getattr(module, register_name) + register(top_parser) + return _summaries_from_parser(parser) + except Exception: + return {} + + +@functools.lru_cache(maxsize=None) +def _builder_summaries( + module_name: str, + builder_name: str, +) -> dict[tuple[str, ...], str]: + try: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + getattr(module, builder_name)(subparsers) + return _summaries_from_parser(parser) + except Exception: + return {} + + +@functools.lru_cache(maxsize=None) +def _adder_summaries(module_name: str, add_name: str) -> dict[tuple[str, ...], str]: + try: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + getattr(module, add_name)(subparsers) + return _summaries_from_parser(parser) + except Exception: + return {} + + +def _invoke_namespace(args: argparse.Namespace) -> object: + func = getattr(args, "func", None) + if not callable(func): + raise ConsoleCommandError("No handler is available for that console command.") + return func(args) + + +def _set_attrs(args: argparse.Namespace, **attrs: object) -> argparse.Namespace: + for name, value in attrs.items(): + setattr(args, name, value) + return args + + +def _dispatch_extracted_subcommand( + *, + root: 
str, + fixed: Sequence[str], + args: Sequence[str], + module_name: str, + builder_name: str, + main_handler_name: str, + console_context: ConsoleContext, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> str: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + main_module = importlib.import_module("hermes_cli.main") + builder = getattr(module, builder_name) + main_handler = getattr(main_module, main_handler_name) + builder(subparsers, **{main_handler_name: main_handler}) + namespace = parser.parse_args([root, *fixed, *args]) + if namespace_update: + namespace_update(namespace, console_context) + return _capture_output(lambda: _invoke_namespace(namespace)) + + +def _dispatch_registered_subcommand( + *, + root: str, + fixed: Sequence[str], + args: Sequence[str], + module_name: str, + register_name: str, + handler_name: str | None = None, + console_context: ConsoleContext, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> str: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + top_parser = subparsers.add_parser(root) + register = getattr(module, register_name) + register(top_parser) + if handler_name: + top_parser.set_defaults(func=getattr(module, handler_name)) + namespace = parser.parse_args([root, *fixed, *args]) + if namespace_update: + namespace_update(namespace, console_context) + return _capture_output(lambda: _invoke_namespace(namespace)) + + +def _dispatch_builder_subcommand( + *, + root: str, + fixed: Sequence[str], + args: Sequence[str], + module_name: str, + builder_name: str, + main_handler_name: str, + console_context: ConsoleContext, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> str: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + main_module = importlib.import_module("hermes_cli.main") + top_parser = 
getattr(module, builder_name)(subparsers) + top_parser.set_defaults(func=getattr(main_module, main_handler_name)) + namespace = parser.parse_args([root, *fixed, *args]) + if namespace_update: + namespace_update(namespace, console_context) + return _capture_output(lambda: _invoke_namespace(namespace)) + + +def _dispatch_adder_subcommand( + *, + root: str, + fixed: Sequence[str], + args: Sequence[str], + module_name: str, + add_name: str, + console_context: ConsoleContext, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> str: + parser, subparsers = _parser_root() + module = importlib.import_module(module_name) + getattr(module, add_name)(subparsers) + namespace = parser.parse_args([root, *fixed, *args]) + if namespace_update: + namespace_update(namespace, console_context) + return _capture_output(lambda: _invoke_namespace(namespace)) + + +def _extracted_handler( + root: str, + fixed: Sequence[str], + module_name: str, + builder_name: str, + main_handler_name: str, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> Callable[["HermesConsoleEngine", list[str]], str]: + def handler(_engine: HermesConsoleEngine, args: list[str]) -> str: + return _dispatch_extracted_subcommand( + root=root, + fixed=fixed, + args=args, + module_name=module_name, + builder_name=builder_name, + main_handler_name=main_handler_name, + console_context=_engine.context, + namespace_update=namespace_update, + ) + + return handler + + +def _registered_handler( + root: str, + fixed: Sequence[str], + module_name: str, + register_name: str, + handler_name: str | None = None, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> Callable[["HermesConsoleEngine", list[str]], str]: + def handler(_engine: HermesConsoleEngine, args: list[str]) -> str: + return _dispatch_registered_subcommand( + root=root, + fixed=fixed, + args=args, + module_name=module_name, + 
register_name=register_name, + handler_name=handler_name, + console_context=_engine.context, + namespace_update=namespace_update, + ) + + return handler + + +def _builder_handler( + root: str, + fixed: Sequence[str], + module_name: str, + builder_name: str, + main_handler_name: str, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> Callable[["HermesConsoleEngine", list[str]], str]: + def handler(_engine: HermesConsoleEngine, args: list[str]) -> str: + return _dispatch_builder_subcommand( + root=root, + fixed=fixed, + args=args, + module_name=module_name, + builder_name=builder_name, + main_handler_name=main_handler_name, + console_context=_engine.context, + namespace_update=namespace_update, + ) + + return handler + + +def _adder_handler( + root: str, + fixed: Sequence[str], + module_name: str, + add_name: str, + namespace_update: Callable[[argparse.Namespace, ConsoleContext], None] | None = None, +) -> Callable[["HermesConsoleEngine", list[str]], str]: + def handler(_engine: HermesConsoleEngine, args: list[str]) -> str: + return _dispatch_adder_subcommand( + root=root, + fixed=fixed, + args=args, + module_name=module_name, + add_name=add_name, + console_context=_engine.context, + namespace_update=namespace_update, + ) + + return handler + + +def _register_command_family( + engine: "HermesConsoleEngine", + *, + root: str, + paths: Iterable[Sequence[str]], + handler_factory: Callable[[Sequence[str]], Callable[["HermesConsoleEngine", list[str]], str]], + mutating: Iterable[Sequence[str]] = (), + hosted: Iterable[Sequence[str]] = (), + summary: str = "", + summaries: dict[tuple[str, ...], str] | None = None, + confirmation: str = "", +) -> None: + mutating_paths = {tuple(path) for path in mutating} + hosted_paths = {tuple(path) for path in hosted} + for child_path in paths: + child_key = tuple(child_path) + full_path = (root, *tuple(child_path)) + usage = " ".join(full_path) + command_summary = summary or (summaries or 
{}).get(full_path) or f"Run `hermes {usage}`." + engine.register( + full_path, + usage, + command_summary, + handler_factory(tuple(child_path)), + mutating=child_key in mutating_paths, + confirmation=confirmation or f"Run `hermes {usage}`?", + contexts=ALL_CONTEXTS if child_key in hosted_paths else LOCAL_CONTEXTS, + ) + + +class HermesConsoleEngine: + """Curated line-command executor for Hermes Console.""" + + def __init__(self, *, output_limit: int = 20000, context: ConsoleContext = "local"): + if context not in ALL_CONTEXTS: + raise ValueError(f"Unknown console context: {context}") + self.context = context + self.output_limit = output_limit + self.history: list[str] = [] + self.commands: dict[tuple[str, ...], ConsoleCommand] = {} + self._register_defaults() + + def execute(self, line: str, *, confirmed: bool = False) -> ConsoleResult: + raw_line = line.strip() + if not raw_line: + return ConsoleResult("ok") + + try: + tokens = _split_line(raw_line) + if tokens and tokens[0] == "hermes": + tokens = tokens[1:] + if not tokens: + return self._help_result() + + if _contains_shell_syntax(raw_line, tokens): + raise ConsoleCommandError( + "Hermes Console does not run shell syntax. Use one supported " + "Hermes command at a time." 
+ ) + + builtin = self._execute_builtin(tokens) + if builtin is not None: + if raw_line not in {"history", "clear"}: + self.history.append(raw_line) + return builtin + + command, args = self._resolve_command(tokens) + if command.mutating and not confirmed: + return ConsoleResult( + "confirm_required", + command=raw_line, + confirmation_message=command.confirmation + or f"Run `{command.usage}`?", + ) + + output = command.handler(self, args).rstrip() + output = self._cap_output(output) + self.history.append(raw_line) + return ConsoleResult("ok", output=output, command=raw_line) + except ConsoleCommandError as exc: + return ConsoleResult("error", output=str(exc).strip(), command=raw_line) + + def help_text(self, subject: str | None = None) -> str: + if subject: + tokens = subject.split() + command, _args = self._resolve_command(tokens) + return f"{command.usage}\n{command.summary}" + + lines = [ + "Hermes Console", + "", + "Supported commands:", + ] + for command in sorted(self.commands.values(), key=lambda c: c.usage): + if self.context not in command.contexts: + continue + marker = " *" if command.mutating else " " + lines.append(f"{marker} {command.usage:<32} {_table_summary(command.summary)}") + lines.extend( + [ + "", + "* requires confirmation", + "Built-ins: help, help , history, clear, exit, quit", + ] + ) + return "\n".join(lines) + + def _register_defaults(self) -> None: + self.register(("status",), "status", "Show Hermes component status.", _status, contexts=ALL_CONTEXTS) + self.register(("doctor",), "doctor", "Run diagnostics without auto-fix.", _doctor, contexts=ALL_CONTEXTS) + self.register(("logs",), "logs [name] [-n N]", "Show recent Hermes logs.", _logs, contexts=ALL_CONTEXTS) + self.register(("sessions", "list"), "sessions list [--limit N]", "List recent sessions.", _sessions_list, contexts=ALL_CONTEXTS) + self.register(("sessions", "stats"), "sessions stats", "Show session store statistics.", _sessions_stats, contexts=ALL_CONTEXTS) + 
self.register(("config", "show"), "config show", "Show current configuration.", _config_show, contexts=ALL_CONTEXTS) + self.register(("config", "path"), "config path", "Print config.yaml path.", _config_path, contexts=ALL_CONTEXTS) + self.register( + ("config", "set"), + "config set ", + "Set a configuration value.", + _config_set, + mutating=True, + confirmation="Update Hermes configuration?", + contexts=ALL_CONTEXTS, + ) + self.register(("cron", "list"), "cron list [--all]", "List scheduled jobs.", _cron_list, contexts=ALL_CONTEXTS) + self.register(("cron", "status"), "cron status", "Show cron scheduler status.", _cron_status, contexts=ALL_CONTEXTS) + self.register( + ("cron", "pause"), + "cron pause ", + "Pause a scheduled job.", + _cron_pause, + mutating=True, + confirmation="Pause this cron job?", + contexts=ALL_CONTEXTS, + ) + self.register( + ("cron", "resume"), + "cron resume ", + "Resume a paused cron job.", + _cron_resume, + mutating=True, + confirmation="Resume this cron job?", + contexts=ALL_CONTEXTS, + ) + self.register( + ("cron", "run"), + "cron run ", + "Run a job on the next scheduler tick.", + _cron_run, + mutating=True, + confirmation="Trigger this cron job?", + contexts=ALL_CONTEXTS, + ) + self._register_broad_cli_surface() + + def _register_broad_cli_surface(self) -> None: + """Register non-admin CLI commands that are safe for Hermes Console.""" + + extracted = { + "version": ( + "hermes_cli.subcommands.version", + "build_version_parser", + "cmd_version", + [()], + set(), + ), + "dump": ( + "hermes_cli.subcommands.dump", + "build_dump_parser", + "cmd_dump", + [()], + set(), + ), + "debug": ( + "hermes_cli.subcommands.debug", + "build_debug_parser", + "cmd_debug", + [("share",), ("delete",)], + {("share",), ("delete",)}, + ), + "prompt-size": ( + "hermes_cli.subcommands.prompt_size", + "build_prompt_size_parser", + "cmd_prompt_size", + [()], + set(), + ), + "insights": ( + "hermes_cli.subcommands.insights", + "build_insights_parser", + 
"cmd_insights", + [()], + set(), + ), + "security": ( + "hermes_cli.subcommands.security", + "build_security_parser", + "cmd_security", + [("audit",)], + set(), + ), + "backup": ( + "hermes_cli.subcommands.backup", + "build_backup_parser", + "cmd_backup", + [()], + {()}, + ), + "import": ( + "hermes_cli.subcommands.import_cmd", + "build_import_cmd_parser", + "cmd_import", + [()], + {()}, + ), + "config": ( + "hermes_cli.subcommands.config", + "build_config_parser", + "cmd_config", + [("env-path",), ("check",)], + set(), + ), + "tools": ( + "hermes_cli.subcommands.tools", + "build_tools_parser", + "cmd_tools", + [("list",), ("enable",), ("disable",), ("post-setup",)], + {("enable",), ("disable",), ("post-setup",)}, + ), + "plugins": ( + "hermes_cli.subcommands.plugins", + "build_plugins_parser", + "cmd_plugins", + [("list",), ("enable",), ("disable",), ("install",), ("update",), ("remove",)], + {("enable",), ("disable",), ("install",), ("update",), ("remove",)}, + ), + "skills": ( + "hermes_cli.subcommands.skills", + "build_skills_parser", + "cmd_skills", + [ + ("browse",), + ("search",), + ("inspect",), + ("list",), + ("check",), + ("list-modified",), + ("diff",), + ("install",), + ("update",), + ("audit",), + ("uninstall",), + ("reset",), + ("opt-in",), + ("opt-out",), + ("repair-official",), + ("snapshot", "export"), + ("snapshot", "import"), + ("tap", "list"), + ("tap", "add"), + ("tap", "remove"), + ], + { + ("install",), + ("update",), + ("audit",), + ("uninstall",), + ("reset",), + ("opt-in",), + ("opt-out",), + ("repair-official",), + ("snapshot", "export"), + ("snapshot", "import"), + ("tap", "add"), + ("tap", "remove"), + }, + ), + "mcp": ( + "hermes_cli.subcommands.mcp", + "build_mcp_parser", + "cmd_mcp", + [ + ("list",), + ("catalog",), + ("test",), + ("add",), + ("remove",), + ("install",), + ("login",), + ("reauth",), + ("configure",), + ("picker",), + ], + { + ("add",), + ("remove",), + ("install",), + ("login",), + ("reauth",), + ("configure",), + 
("picker",), + }, + ), + "memory": ( + "hermes_cli.subcommands.memory", + "build_memory_parser", + "cmd_memory", + [("status",), ("off",), ("reset",)], + {("off",), ("reset",)}, + ), + "auth": ( + "hermes_cli.subcommands.auth", + "build_auth_parser", + "cmd_auth", + [ + ("list",), + ("status",), + ("reset",), + ("add",), + ("remove",), + ("logout",), + ("spotify", "status"), + ("spotify", "login"), + ("spotify", "logout"), + ], + { + ("reset",), + ("add",), + ("remove",), + ("logout",), + ("spotify", "login"), + ("spotify", "logout"), + }, + ), + "pairing": ( + "hermes_cli.subcommands.pairing", + "build_pairing_parser", + "cmd_pairing", + [("list",), ("approve",), ("revoke",), ("clear-pending",)], + {("approve",), ("revoke",), ("clear-pending",)}, + ), + "webhook": ( + "hermes_cli.subcommands.webhook", + "build_webhook_parser", + "cmd_webhook", + [("list",), ("subscribe",), ("remove",), ("test",)], + {("subscribe",), ("remove",)}, + ), + "hooks": ( + "hermes_cli.subcommands.hooks", + "build_hooks_parser", + "cmd_hooks", + [("list",), ("test",), ("doctor",), ("revoke",)], + {("test",), ("doctor",), ("revoke",)}, + ), + "slack": ( + "hermes_cli.subcommands.slack", + "build_slack_parser", + "cmd_slack", + [("manifest",)], + set(), + ), + "profile": ( + "hermes_cli.subcommands.profile", + "build_profile_parser", + "cmd_profile", + [ + ("list",), + ("show",), + ("info",), + ("create",), + ("use",), + ("describe",), + ("rename",), + ("delete",), + ("export",), + ("import",), + ("install",), + ("update",), + ], + { + ("create",), + ("use",), + ("describe",), + ("rename",), + ("delete",), + ("export",), + ("import",), + ("install",), + ("update",), + }, + ), + "cron": ( + "hermes_cli.subcommands.cron", + "build_cron_parser", + "cmd_cron", + [("create",), ("edit",), ("remove",), ("tick",)], + {("create",), ("edit",), ("remove",), ("tick",)}, + ), + } + + for root, (module, builder, main_handler, paths, mutating) in extracted.items(): + summaries = 
_extracted_summaries(module, builder, main_handler) + _register_command_family( + self, + root=root, + paths=paths, + mutating=mutating, + summaries=summaries, + handler_factory=lambda fixed, root=root, module=module, builder=builder, main_handler=main_handler: _extracted_handler( + root, + fixed, + module, + builder, + main_handler, + namespace_update=_apply_confirmed_defaults, + ), + ) + + self.register( + ("config", "migrate"), + "config migrate", + "Update config with new options.", + _config_migrate, + mutating=True, + confirmation="Update Hermes configuration with missing defaults?", + ) + self.register( + ("sessions", "export"), + "sessions export [--source SOURCE] [--session-id ID]", + "Export sessions to JSONL.", + _sessions_export, + mutating=True, + confirmation="Export session data?", + ) + self.register( + ("sessions", "rename"), + "sessions rename ", + "Rename a session.", + _sessions_rename, + mutating=True, + confirmation="Rename this session?", + ) + self.register( + ("sessions", "optimize"), + "sessions optimize", + "Optimize the session store.", + _sessions_optimize, + mutating=True, + confirmation="Optimize the session database?", + ) + self.register( + ("sessions", "repair"), + "sessions repair [--check-only] [--no-backup]", + "Repair a malformed session database schema.", + _sessions_repair, + mutating=True, + confirmation="Repair the session database?", + ) + + self.register( + ("profile",), + "profile", + "Show active profile status.", + _profile_status, + ) + self.register( + ("send",), + "send --to <target> <message>", + "Send a message to a configured platform.", + _adder_handler("send", (), "hermes_cli.send_cmd", "register_send_subparser"), + mutating=True, + confirmation="Send this message?", + ) + + portal_paths = [("info",), ("tools",)] + _register_command_family( + self, + root="portal", + paths=portal_paths, + summaries=_adder_summaries("hermes_cli.portal_cli", "add_parser"), + handler_factory=lambda fixed: _adder_handler( + 
"portal", + fixed, + "hermes_cli.portal_cli", + "add_parser", + ), + ) + + _register_command_family( + self, + root="project", + paths=[ + ("list",), + ("show",), + ("create",), + ("add-folder",), + ("remove-folder",), + ("rename",), + ("set-primary",), + ("use",), + ("archive",), + ("restore",), + ("bind-board",), + ], + summaries=_builder_summaries("hermes_cli.projects_cmd", "build_parser"), + mutating=[ + ("create",), + ("add-folder",), + ("remove-folder",), + ("rename",), + ("set-primary",), + ("use",), + ("archive",), + ("restore",), + ("bind-board",), + ], + handler_factory=lambda fixed: _builder_handler( + "project", + fixed, + "hermes_cli.projects_cmd", + "build_parser", + "cmd_project", + ), + ) + + _register_command_family( + self, + root="kanban", + paths=[ + ("init",), + ("boards", "list"), + ("boards", "create"), + ("boards", "rm"), + ("boards", "switch"), + ("boards", "current"), + ("boards", "rename"), + ("boards", "set-workdir"), + ("create",), + ("list",), + ("show",), + ("assign",), + ("reclaim",), + ("reassign",), + ("diagnose",), + ("link",), + ("unlink",), + ("claim",), + ("comment",), + ("complete",), + ("edit",), + ("block",), + ("schedule",), + ("unblock",), + ("promote",), + ("archive",), + ("stats",), + ("runs",), + ("heartbeat",), + ("assignments",), + ("context",), + ], + summaries=_builder_summaries("hermes_cli.kanban", "build_parser"), + mutating=[ + ("init",), + ("boards", "create"), + ("boards", "rm"), + ("boards", "switch"), + ("boards", "rename"), + ("boards", "set-workdir"), + ("create",), + ("assign",), + ("reclaim",), + ("reassign",), + ("link",), + ("unlink",), + ("claim",), + ("comment",), + ("complete",), + ("edit",), + ("block",), + ("schedule",), + ("unblock",), + ("promote",), + ("archive",), + ], + handler_factory=lambda fixed: _builder_handler( + "kanban", + fixed, + "hermes_cli.kanban", + "build_parser", + "cmd_kanban", + ), + ) + + registered = { + "bundles": ( + "hermes_cli.bundles", + "register_cli", + 
"bundles_command", + [("list",), ("show",), ("create",), ("delete",), ("reload",)], + {("create",), ("delete",), ("reload",)}, + ), + "checkpoints": ( + "hermes_cli.checkpoints", + "register_cli", + None, + [("status",), ("list",), ("prune",), ("clear",), ("clear-legacy",)], + {("prune",), ("clear",), ("clear-legacy",)}, + ), + "curator": ( + "hermes_cli.curator", + "register_cli", + None, + [ + ("status",), + ("run",), + ("pause",), + ("resume",), + ("pin",), + ("unpin",), + ("restore",), + ("list-archived",), + ("archive",), + ("prune",), + ("backup",), + ("rollback",), + ], + { + ("run",), + ("pause",), + ("resume",), + ("pin",), + ("unpin",), + ("restore",), + ("archive",), + ("prune",), + ("backup",), + ("rollback",), + }, + ), + "pets": ( + "hermes_cli.pets", + "register_cli", + None, + [("list",), ("install",), ("select",), ("show",), ("off",), ("scale",), ("remove",), ("doctor",)], + {("install",), ("select",), ("off",), ("scale",), ("remove",)}, + ), + } + for root, (module, register, handler_name, paths, mutating) in registered.items(): + summaries = _registered_summaries(root, module, register) + _register_command_family( + self, + root=root, + paths=paths, + mutating=mutating, + summaries=summaries, + handler_factory=lambda fixed, root=root, module=module, register=register, handler_name=handler_name: _registered_handler( + root, + fixed, + module, + register, + handler_name=handler_name, + namespace_update=_apply_confirmed_defaults, + ), + ) + + self._mark_hosted(EXPECTED_HOSTED_PATHS) + + def register( + self, + path: Iterable[str], + usage: str, + summary: str, + handler: Callable[["HermesConsoleEngine", list[str]], str], + *, + mutating: bool = False, + confirmation: str = "", + contexts: Iterable[ConsoleContext] = LOCAL_CONTEXTS, + ) -> None: + key = tuple(path) + self.commands[key] = ConsoleCommand( + path=key, + usage=usage, + summary=summary, + handler=handler, + mutating=mutating, + confirmation=confirmation, + contexts=frozenset(contexts), + ) 
+ + def _mark_hosted(self, paths: Iterable[Sequence[str]]) -> None: + for path in paths: + key = tuple(path) + command = self.commands.get(key) + if command is None: + raise RuntimeError(f"Hosted console policy references unknown command: {' '.join(key)}") + self.commands[key] = replace( + command, + contexts=command.contexts | frozenset({"hosted"}), + ) + + def _execute_builtin(self, tokens: list[str]) -> ConsoleResult | None: + head = tokens[0] + if head == "help": + subject = " ".join(tokens[1:]).strip() or None + try: + return ConsoleResult("ok", output=self.help_text(subject)) + except ConsoleCommandError as exc: + return ConsoleResult("error", output=str(exc)) + if head == "history": + output = "\n".join(f"{idx + 1}: {cmd}" for idx, cmd in enumerate(self.history)) + return ConsoleResult("ok", output=output or "No history yet.") + if head == "clear": + return ConsoleResult("clear", output="\033[2J\033[H") + if head in {"exit", "quit"}: + return ConsoleResult("exit") + return None + + def _resolve_command(self, tokens: Sequence[str]) -> tuple[ConsoleCommand, list[str]]: + rejected = self._rejection_for(tokens) + if rejected: + raise ConsoleCommandError(rejected) + + for size in range(min(len(tokens), 3), 0, -1): + key = tuple(tokens[:size]) + command = self.commands.get(key) + if command: + if self.context not in command.contexts: + raise ConsoleCommandError( + f"`hermes {command.usage}` is not available in " + f"{self.context} Hermes Console." + ) + self._enforce_context_policy(command, list(tokens[size:])) + return command, list(tokens[size:]) + + available = [ + " ".join(path) + for path, command in self.commands.items() + if self.context in command.contexts + ] + probe = " ".join(tokens[:2]) if len(tokens) > 1 else tokens[0] + suggestions = difflib.get_close_matches(probe, available, n=3, cutoff=0.45) + suffix = f" Did you mean: {', '.join(suggestions)}?" 
if suggestions else "" + raise ConsoleCommandError(f"Unsupported Hermes Console command: {probe}.{suffix}") + + def _enforce_context_policy(self, command: ConsoleCommand, args: list[str]) -> None: + if self.context != "hosted": + return + _enforce_hosted_line_policy(command.path, args) + + def _rejection_for(self, tokens: Sequence[str]) -> str: + first = tokens[0] + if first.startswith("-"): + return f"{first} is not available in Hermes Console." + blocked_top = { + "acp", + "chat", + "claw", + "completion", + "dashboard", + "desktop", + "fallback", + "gateway", + "gui", + "login", + "logout", + "model", + "moa", + "oneshot", + "postinstall", + "proxy", + "serve", + "setup", + "uninstall", + "update", + "whatsapp", + "whatsapp-cloud", + } + if first in blocked_top: + return f"`hermes {first}` is not available in Hermes Console." + blocked_pairs = { + ("config", "edit"): "`config edit` opens an editor and is not available in Hermes Console.", + ("mcp", "serve"): "`mcp serve` starts a server and is not available in Hermes Console.", + ("profile", "alias"): "`profile alias` creates shell wrappers and is not available in Hermes Console.", + ("skills", "config"): "`skills config` is interactive and is not available in Hermes Console.", + ("skills", "publish"): "`skills publish` is not available in Hermes Console.", + ("portal", "login"): "`portal login` is interactive and is not available in Hermes Console.", + ("portal", "open"): "`portal open` opens a browser and is not available in Hermes Console.", + ("kanban", "tail"): "`kanban tail` streams output and is not available in Hermes Console.", + ("kanban", "watch"): "`kanban watch` streams output and is not available in Hermes Console.", + ("kanban", "daemon"): "`kanban daemon` starts a service and is not available in Hermes Console.", + ("kanban", "dispatcher"): "`kanban dispatcher` starts a worker and is not available in Hermes Console.", + ("kanban", "swarm"): "`kanban swarm` starts agent work and is not available 
in Hermes Console.", + ("kanban", "decompose"): "`kanban decompose` starts agent work and is not available in Hermes Console.", + ("kanban", "specify"): "`kanban specify` starts agent work and is not available in Hermes Console.", + ("kanban", "gc"): "`kanban gc` is not available in Hermes Console.", + } + if len(tokens) >= 2: + pair = (tokens[0], tokens[1]) + if pair in blocked_pairs: + return blocked_pairs[pair] + if tuple(tokens[:2]) in {("sessions", "delete"), ("sessions", "prune")}: + return "`sessions delete` and `sessions prune` are not available in Hermes Console." + return "" + + def _help_result(self) -> ConsoleResult: + return ConsoleResult("ok", output=self.help_text()) + + def _cap_output(self, output: str) -> str: + if len(output) <= self.output_limit: + return output + omitted = len(output) - self.output_limit + return f"{output[:self.output_limit]}\n... output truncated ({omitted} bytes omitted)" + + +def _expect_no_args(args: Sequence[str], usage: str) -> None: + if args: + raise ConsoleCommandError(f"Usage: {usage}") + + +HOSTED_CONFIG_ALLOWED_PREFIXES = ( + "display.", + "ui.", + "tts.", + "voice.", + "speech.", + "sessions.", + "cron.", +) +HOSTED_CONFIG_ALLOWED_KEYS = { + "display.interface", +} +HOSTED_CONFIG_BLOCKED_PREFIXES = ( + "auth.", + "dashboard.", + "gateway.", + "managed.", + "model.", + "portal.", + "provider.", + "providers.", + "tool_gateway.", + "custom_providers.", + "mcp_servers.", +) +HOSTED_CONFIG_BLOCKED_NAMES = { + "portal_url", + "portal.url", + "portal.base_url", + "inference_url", + "inference.url", + "inference.base_url", + "nous.portal_url", + "nous.inference_url", + "openrouter_api_key", + "openai_api_key", + "anthropic_api_key", +} + + +def _flag_present(args: Sequence[str], flag: str) -> bool: + return any(arg == flag or arg.startswith(f"{flag}=") for arg in args) + + +def _flag_value(args: Sequence[str], flag: str) -> str | None: + for index, arg in enumerate(args): + if arg == flag: + if index + 1 < len(args): + 
return args[index + 1] + return "" + prefix = f"{flag}=" + if arg.startswith(prefix): + return arg[len(prefix) :] + return None + + +def _hosted_config_key_allowed(key: str) -> bool: + normalized = key.strip().lower() + if normalized in HOSTED_CONFIG_BLOCKED_NAMES: + return False + if normalized.startswith(HOSTED_CONFIG_BLOCKED_PREFIXES): + return False + return normalized in HOSTED_CONFIG_ALLOWED_KEYS or normalized.startswith( + HOSTED_CONFIG_ALLOWED_PREFIXES + ) + + +def _enforce_hosted_line_policy(path: tuple[str, ...], args: Sequence[str]) -> None: + if path == ("config", "set"): + key = args[0] if args else "" + if key and not _hosted_config_key_allowed(key): + raise ConsoleCommandError( + f"`config set {key}` is not available in hosted Hermes Console. " + "Use the dashboard setting for hosted account/provider changes." + ) + return + + if path == ("mcp", "add"): + if _flag_present(args, "--command") or _flag_present(args, "--args"): + raise ConsoleCommandError( + "Hosted Hermes Console does not add stdio MCP servers. " + "Use catalog install or an HTTP/SSE URL." + ) + if _flag_present(args, "--preset"): + raise ConsoleCommandError( + "Hosted Hermes Console does not add MCP presets directly. " + "Use `mcp install <catalog-name>`." + ) + url = _flag_value(args, "--url") + if not url: + raise ConsoleCommandError( + "Hosted Hermes Console requires `mcp add` to use --url with " + "an HTTP/SSE endpoint." + ) + scheme = urlparse(url).scheme.lower() + if scheme not in {"http", "https"}: + raise ConsoleCommandError( + "Hosted Hermes Console only accepts http:// or https:// MCP URLs." + ) + return + + if path in {("cron", "create"), ("cron", "edit")}: + for flag in ("--script", "--no-agent", "--workdir"): + if _flag_present(args, flag): + raise ConsoleCommandError( + f"`cron {' '.join(path[1:])} {flag}` is not available in " + "hosted Hermes Console." 
+ ) + + +def _apply_confirmed_defaults(args: argparse.Namespace, context: ConsoleContext) -> None: + """Skip nested prompts after the console-level confirmation has happened.""" + + for attr in ("yes",): + if hasattr(args, attr): + setattr(args, attr, True) + if getattr(args, "_console_command", None) == "import": + setattr(args, "force", True) + if getattr(args, "checkpoints_command", None) in {"clear", "clear-legacy"}: + setattr(args, "force", True) + if getattr(args, "plugins_action", None) == "install": + if not getattr(args, "enable", False) and not getattr(args, "no_enable", False): + setattr(args, "no_enable", True) + if getattr(args, "auth_action", None) == "add": + auth_type = getattr(args, "auth_type", None) + if auth_type in {"api-key", "api_key"} and not getattr(args, "api_key", None): + raise ConsoleCommandError("auth add --type api-key requires --api-key in Hermes Console.") + if getattr(args, "import_name", None) is not None: + # profile import has no prompt flag; leave it alone. 
+ return + if getattr(args, "skills_action", None) in { + "install", + "reset", + "opt-out", + "repair-official", + }: + setattr(args, "yes", True) + if getattr(args, "memory_command", None) == "reset": + setattr(args, "yes", True) + + +def _status(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "status") + from types import SimpleNamespace + + from hermes_cli.status import show_status + + output = _capture_output(lambda: show_status(SimpleNamespace(all=False, deep=False))) + return _strip_console_status_footer(output) + + +def _doctor(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "doctor") + from types import SimpleNamespace + + from hermes_cli.doctor import run_doctor + + return _capture_output(lambda: run_doctor(SimpleNamespace(fix=False, ack=None))) + + +def _logs(_engine: HermesConsoleEngine, args: list[str]) -> str: + if "-f" in args or "--follow" in args: + raise ConsoleCommandError("`logs -f` is not available in Hermes Console.") + parser = _ArgumentParser(prog="logs", add_help=False) + parser.add_argument("log_name", nargs="?", default="agent") + parser.add_argument("-n", "--lines", type=int, default=50) + parser.add_argument("--level") + parser.add_argument("--session") + parser.add_argument("--since") + parser.add_argument("--component") + ns = parser.parse_args(args) + if ns.lines < 1 or ns.lines > 500: + raise ConsoleCommandError("logs --lines must be between 1 and 500") + + from hermes_cli.logs import list_logs, tail_log + + if ns.log_name == "list": + return _capture_output(list_logs) + return _capture_output( + lambda: tail_log( + ns.log_name, + num_lines=ns.lines, + follow=False, + level=ns.level, + session=ns.session, + since=ns.since, + component=ns.component, + ) + ) + + +def _sessions_list(_engine: HermesConsoleEngine, args: list[str]) -> str: + parser = _ArgumentParser(prog="sessions list", add_help=False) + parser.add_argument("--limit", type=int, default=20) + ns = 
parser.parse_args(args) + if ns.limit < 1 or ns.limit > 200: + raise ConsoleCommandError("sessions list --limit must be between 1 and 200") + + from hermes_state import SessionDB + + db = SessionDB() + try: + sessions = db.list_sessions_rich( + exclude_sources=["tool"], + limit=ns.limit, + order_by_last_active=True, + ) + finally: + db.close() + return _format_sessions(sessions) + + +def _sessions_stats(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "sessions stats") + from hermes_state import SessionDB + + db = SessionDB() + try: + total = db.session_count() + listable = db.session_count(exclude_children=True, exclude_sources=["tool"]) + messages = db.message_count() + lines = [ + f"Total sessions: {total}", + f"Listable sessions: {listable}", + f"Total messages: {messages}", + ] + for source in ["cli", "tui", "telegram", "discord", "slack", "cron"]: + count = db.session_count(source=source) + if count: + lines.append(f" {source}: {count}") + return "\n".join(lines) + finally: + db.close() + + +def _config_show(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "config show") + from hermes_cli.config import show_config + + return _capture_output(show_config) + + +def _config_path(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "config path") + from hermes_cli.config import get_config_path + + return str(get_config_path()) + + +def _config_set(_engine: HermesConsoleEngine, args: list[str]) -> str: + if len(args) < 2: + raise ConsoleCommandError("Usage: config set <key> <value>") + key = args[0] + value = " ".join(args[1:]) + from hermes_cli.config import set_config_value + + return _capture_output(lambda: set_config_value(key, value)) + + +def _config_migrate(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "config migrate") + + def _run() -> None: + from hermes_cli.config import migrate_config + + results = migrate_config(interactive=False, 
quiet=False) + if results.get("env_added") or results.get("config_added"): + print("Configuration updated.") + else: + print("Configuration is up to date.") + warnings = results.get("warnings") or [] + for warning in warnings: + print(f"Warning: {warning}") + + return _capture_output(_run) + + +def _sessions_export(_engine: HermesConsoleEngine, args: list[str]) -> str: + parser = _ArgumentParser(prog="sessions export", add_help=False) + parser.add_argument("output") + parser.add_argument("--source") + parser.add_argument("--session-id") + ns = parser.parse_args(args) + + def _run() -> None: + from hermes_state import SessionDB + + db = SessionDB() + try: + if ns.session_id: + resolved_session_id = db.resolve_session_id(ns.session_id) + if not resolved_session_id: + raise ConsoleCommandError(f"Session '{ns.session_id}' not found.") + data = db.export_session(resolved_session_id) + if not data: + raise ConsoleCommandError(f"Session '{ns.session_id}' not found.") + rows = [data] + else: + rows = db.export_all(source=ns.source) + + lines = [json.dumps(row, ensure_ascii=False) for row in rows] + text = "\n".join(lines) + if text: + text += "\n" + if ns.output == "-": + sys.stdout.write(text) + else: + Path(ns.output).expanduser().write_text(text, encoding="utf-8") + print(f"Exported {len(rows)} session(s) to {ns.output}") + finally: + db.close() + + return _capture_output(_run) + + +def _sessions_rename(_engine: HermesConsoleEngine, args: list[str]) -> str: + parser = _ArgumentParser(prog="sessions rename", add_help=False) + parser.add_argument("session_id") + parser.add_argument("title", nargs="+") + ns = parser.parse_args(args) + + def _run() -> None: + from hermes_state import SessionDB + + db = SessionDB() + try: + resolved_session_id = db.resolve_session_id(ns.session_id) + if not resolved_session_id: + raise ConsoleCommandError(f"Session '{ns.session_id}' not found.") + title = " ".join(ns.title) + if not db.set_session_title(resolved_session_id, title): + raise 
ConsoleCommandError(f"Session '{ns.session_id}' not found.") + print(f"Session '{resolved_session_id}' renamed to: {title}") + finally: + db.close() + + return _capture_output(_run) + + +def _sessions_optimize(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "sessions optimize") + + def _run() -> None: + from hermes_state import SessionDB + + db = SessionDB() + try: + count = db.vacuum() + print(f"Optimized {count} FTS index(es).") + finally: + db.close() + + return _capture_output(_run) + + +def _sessions_repair(_engine: HermesConsoleEngine, args: list[str]) -> str: + parser = _ArgumentParser(prog="sessions repair", add_help=False) + parser.add_argument("--check-only", action="store_true") + parser.add_argument("--no-backup", action="store_true") + ns = parser.parse_args(args) + + def _run() -> None: + from hermes_state import DEFAULT_DB_PATH, _db_opens_cleanly, repair_state_db_schema + + db_path = DEFAULT_DB_PATH + if not db_path.exists(): + print(f"No session database at {db_path} (nothing to repair).") + return + reason = _db_opens_cleanly(db_path) + if reason is None: + print(f"{db_path} opens cleanly; no repair needed.") + return + print(f"{db_path} does not open cleanly: {reason}") + if ns.check_only: + return + report = repair_state_db_schema(db_path, backup=not ns.no_backup) + if report.get("repaired"): + if report.get("backup_path"): + print(f"backup: {report['backup_path']}") + print(f"strategy: {report.get('strategy')}") + print("Repaired session database.") + return + raise ConsoleCommandError(f"Repair failed: {report.get('error')}") + + return _capture_output(_run) + + +def _profile_status(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "profile") + return _dispatch_extracted_subcommand( + root="profile", + fixed=(), + args=(), + module_name="hermes_cli.subcommands.profile", + builder_name="build_profile_parser", + main_handler_name="cmd_profile", + console_context=_engine.context, + ) + + 
+def _cron_list(_engine: HermesConsoleEngine, args: list[str]) -> str: + parser = _ArgumentParser(prog="cron list", add_help=False) + parser.add_argument("--all", action="store_true") + ns = parser.parse_args(args) + from hermes_cli.cron import cron_list + + return _capture_output(lambda: cron_list(show_all=ns.all)) + + +def _cron_status(_engine: HermesConsoleEngine, args: list[str]) -> str: + _expect_no_args(args, "cron status") + from hermes_cli.cron import cron_status + + return _capture_output(cron_status) + + +def _cron_pause(_engine: HermesConsoleEngine, args: list[str]) -> str: + if len(args) != 1: + raise ConsoleCommandError("Usage: cron pause <job>") + from cron.jobs import AmbiguousJobReference, pause_job + + try: + job = pause_job(args[0], reason="paused from hermes console") + except AmbiguousJobReference as exc: + raise ConsoleCommandError(str(exc)) from exc + if not job: + raise ConsoleCommandError(f"Job not found: {args[0]}") + return _format_job(job, "Paused") + + +def _cron_resume(_engine: HermesConsoleEngine, args: list[str]) -> str: + if len(args) != 1: + raise ConsoleCommandError("Usage: cron resume <job>") + from cron.jobs import AmbiguousJobReference, resume_job + + try: + job = resume_job(args[0]) + except AmbiguousJobReference as exc: + raise ConsoleCommandError(str(exc)) from exc + if not job: + raise ConsoleCommandError(f"Job not found: {args[0]}") + return _format_job(job, "Resumed") + + +def _cron_run(_engine: HermesConsoleEngine, args: list[str]) -> str: + if len(args) != 1: + raise ConsoleCommandError("Usage: cron run <job>") + from cron.jobs import AmbiguousJobReference, trigger_job + + try: + job = trigger_job(args[0]) + except AmbiguousJobReference as exc: + raise ConsoleCommandError(str(exc)) from exc + if not job: + raise ConsoleCommandError(f"Job not found: {args[0]}") + return _format_job(job, "Triggered") + + +def run_console_repl( + *, + stdin=None, + stdout=None, + stderr=None, + interactive: bool | None = None, +) -> int: + 
"""Run the local ``hermes console`` REPL.""" + + stdin = stdin or sys.stdin + stdout = stdout or sys.stdout + stderr = stderr or sys.stderr + if interactive is None: + interactive = bool(getattr(stdin, "isatty", lambda: False)()) + + engine = HermesConsoleEngine() + if interactive: + print("Hermes Console. Type `help` for commands, `exit` to quit.", file=stdout) + + while True: + if interactive: + print("hermes> ", end="", file=stdout, flush=True) + line = stdin.readline() + if line == "": + if interactive: + print(file=stdout) + return 0 + + result = engine.execute(line) + if result.status == "confirm_required": + if not interactive: + print( + f"Confirmation required: {result.confirmation_message}", + file=stderr, + ) + return 1 + print(f"{result.confirmation_message} [y/N] ", end="", file=stdout, flush=True) + answer = stdin.readline() + if answer.strip().lower() not in {"y", "yes"}: + print("Cancelled.", file=stdout) + continue + result = engine.execute(result.command, confirmed=True) + + if result.output: + stream = stderr if result.status == "error" else stdout + print(result.output, file=stream) + if result.status == "exit": + return 0 diff --git a/hermes_cli/dashboard_auth/routes.py b/hermes_cli/dashboard_auth/routes.py index 568a11957..9e80f2583 100644 --- a/hermes_cli/dashboard_auth/routes.py +++ b/hermes_cli/dashboard_auth/routes.py @@ -608,7 +608,8 @@ async def api_auth_ws_ticket(request: Request): Browsers cannot set ``Authorization`` on a WebSocket upgrade, so in gated mode the SPA POSTs this endpoint to get a ``?ticket=`` value to - append to ``/api/pty``, ``/api/ws``, ``/api/pub``, or ``/api/events``. + append to ``/api/pty``, ``/api/console``, ``/api/ws``, ``/api/pub``, or + ``/api/events``. The ticket has a 30-second TTL and is single-use. Calling this endpoint multiple times in quick succession (e.g. 
one ticket per WS) is the diff --git a/hermes_cli/main.py b/hermes_cli/main.py index aedcb603e..4e483d318 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -287,6 +287,7 @@ from hermes_cli.subcommands.debug import build_debug_parser from hermes_cli.subcommands.backup import build_backup_parser from hermes_cli.subcommands.import_cmd import build_import_cmd_parser from hermes_cli.subcommands.config import build_config_parser +from hermes_cli.subcommands.console import build_console_parser from hermes_cli.subcommands.version import build_version_parser from hermes_cli.subcommands.update import build_update_parser from hermes_cli.subcommands.uninstall import build_uninstall_parser @@ -8776,6 +8777,193 @@ def _wait_for_windows_update_gateway_exit( return survivors +def _venv_core_imports_healthy() -> tuple[bool, str]: + """Probe the project venv for the core imports the backend needs to boot. + + Runs a tiny import check inside the venv interpreter (NOT this process — + ``hermes update`` may be driven by a different Python). Catches the + half-updated-venv state: git checkout current but a dependency sync that + failed or was killed partway (e.g. Windows access-denied on a loaded + .pyd), leaving imports like ``fastapi``'s new transitive deps missing. + Without this probe, ``hermes update`` on a current checkout prints + "Already up to date!" and returns without ever re-syncing dependencies — + the user's install stays broken no matter how many times they update + (ryanc's incident, July 2026). + + Returns ``(healthy, detail)``. Never raises; unknown states report + healthy so a probe failure can't force needless reinstalls. + """ + venv_dir = PROJECT_ROOT / "venv" + python_name = "python.exe" if _is_windows() else "python" + bin_dir = "Scripts" if _is_windows() else "bin" + venv_python = venv_dir / bin_dir / python_name + if not venv_python.exists(): + # No venv interpreter at all. 
In a dev checkout that's normal (the + # dev may run hermes from any interpreter), so report healthy to + # avoid forcing reinstalls. But on a MANAGED install (the Windows + # installer / desktop bootstrap stamps `.hermes-bootstrap-complete`, + # and an interrupted update leaves `.update-incomplete`), the venv + # IS the install — its absence means a repair got interrupted after + # the old venv was moved aside, and "Already up to date!" would + # gaslight the user while nothing can run. + managed_markers = ( + PROJECT_ROOT / ".hermes-bootstrap-complete", + _update_marker_path(), + ) + if any(m.exists() for m in managed_markers): + return False, f"venv python missing ({venv_python})" + return True, "" + + # Core web/serve imports plus their newest transitive deps. Import (not + # just metadata) — a package can have intact dist-info but a missing + # module after an interrupted uninstall/install cycle. + check = ( + "import importlib\n" + "mods = ['fastapi', 'uvicorn', 'pydantic', 'openai', 'yaml']\n" + "missing = []\n" + "for m in mods:\n" + " try: importlib.import_module(m)\n" + " except Exception as e: missing.append(f'{m}: {e}')\n" + "print('\\n'.join(missing))\n" + ) + try: + result = subprocess.run( + [str(venv_python), "-c", check], + capture_output=True, + text=True, + timeout=60, + cwd=PROJECT_ROOT, + ) + except Exception as exc: + logger.debug("venv health probe failed to run: %s", exc) + return True, "" + + missing = [line.strip() for line in (result.stdout or "").splitlines() if line.strip()] + if result.returncode != 0 and not missing: + # Interpreter itself is broken (e.g. deleted stdlib) — that IS unhealthy. 
+ detail = (result.stderr or "").strip().splitlines() + return False, detail[0] if detail else "venv python failed to run" + if missing: + return False, "; ".join(missing[:4]) + return True, "" + + +def _detect_venv_python_processes( + *, exclude_pids: set[int] | None = None +) -> list[tuple[int, str, str]]: + """Find live processes running from the project venv's interpreter. + + The hermes.exe shim guard misses the biggest lock-holder class on + Windows: the Desktop app's backend (``python.exe -m hermes_cli.main + serve``) and anything else running straight off ``venv\\Scripts\\python + (w).exe``. Those processes keep native ``.pyd`` extensions mapped, so a + dependency sync mid-update dies with access-denied and strands the venv + half-updated (ryanc's brotlicffi/_sodium.pyd incidents, July 2026). + + Killing them from here is pointless — the Desktop app supervises its + backend and respawns it within seconds — so the caller should refuse and + tell the user to close the app instead. Returns ``(pid, name, cmdline)`` + tuples; empty off-Windows / without psutil / when nothing matches. The + calling process and its ancestors are always excluded (a CLI ``hermes + update`` itself runs from the venv python). Never raises. 
+ """ + if not _is_windows(): + return [] + try: + import psutil + except Exception: + return [] + + venv_dir = PROJECT_ROOT / "venv" + try: + venv_prefix = str(venv_dir.resolve()).lower().rstrip(os.sep) + os.sep + except OSError: + venv_prefix = str(venv_dir).lower().rstrip(os.sep) + os.sep + try: + root_prefix = str(PROJECT_ROOT.resolve()).lower().rstrip(os.sep) + os.sep + except OSError: + root_prefix = str(PROJECT_ROOT).lower().rstrip(os.sep) + os.sep + + skip: set[int] = set(exclude_pids or set()) + skip.add(os.getpid()) + try: + for anc in psutil.Process().parents(): + skip.add(int(anc.pid)) + except Exception: + pass + + matches: list[tuple[int, str, str]] = [] + try: + proc_iter = psutil.process_iter(["pid", "exe", "name", "cmdline", "cwd"]) + except Exception: + return [] + for proc in proc_iter: + try: + info = proc.info + except Exception: + continue + pid = info.get("pid") + exe = info.get("exe") + if not exe or pid is None or int(pid) in skip: + continue + try: + exe_norm = str(Path(exe).resolve()).lower() + except (OSError, ValueError): + exe_norm = str(exe).lower() + cmdline_raw = " ".join(info.get("cmdline") or []) + cmdline_low = cmdline_raw.lower() + cwd_low = str(info.get("cwd") or "").lower().rstrip(os.sep) + os.sep + + # Primary match: the executable itself lives under this venv + # (venv\Scripts\python(w).exe — the desktop backend / gateway case). + is_holder = exe_norm.startswith(venv_prefix) + # Fallback: uv/base-interpreter trampolines run a python whose exe is + # OUTSIDE the venv but which still imports from it and holds its .pyd + # files. Catch those by what they're running: a cmdline that references + # this venv's path, or a `-m hermes_cli.main ...` invocation tied to + # this install (install root in the cmdline or as the working dir). 
def _format_venv_python_holders_message(matches: list[tuple[int, str, str]]) -> str:
    """Build the refusal text shown when venv processes block an update.

    Lists at most six ``(pid, name, cmdline)`` holders, tags the ones whose
    command line looks like the desktop backend or a gateway, summarises any
    overflow, and ends with instructions for clearing the blockers.
    """
    shown_limit = 6
    report = ["✗ Other Hermes processes are running from this install's venv:"]

    for pid, name, cmdline in matches[:shown_limit]:
        lowered = cmdline.lower()
        if any(word in lowered for word in ("serve", "dashboard")):
            suffix = " ← Hermes Desktop backend (close the desktop app)"
        elif "gateway" in lowered:
            suffix = " ← gateway"
        else:
            suffix = ""
        report.append(f" PID {pid} {name} {cmdline}{suffix}")

    overflow = len(matches) - shown_limit
    if overflow > 0:
        report.append(f" ... and {overflow} more")

    report.extend(
        [
            "",
            " On Windows these keep native extension files (.pyd) locked, so the",
            " dependency update would fail partway and leave a broken install.",
            " Close the Hermes desktop app / other Hermes terminals, then re-run:",
            " hermes update",
            " (or use `hermes update --force-venv` to proceed anyway at your own risk)",
        ]
    )
    return "\n".join(report)
Deliberately NOT bypassed + # by plain --force: the desktop bootstrap updater passes --force to skip + # the hermes.exe shim guard above, but its lock probe only checks the shim + # and app.asar — a non-desktop venv python holding a .pyd would sail + # through and corrupt the sync (the exact failure this guard exists for). + # --force-venv is the explicit escape hatch. + if _is_windows() and not getattr(args, "force_venv", False): + _venv_holders = _detect_venv_python_processes() + if _venv_holders: + print(_format_venv_python_holders_message(_venv_holders)) + _resume_windows_gateways_after_update(_windows_gateway_resume) + sys.exit(2) + # Try git-based update first, fall back to ZIP download on Windows # when git file I/O is broken (antivirus, NTFS filter drivers, etc.) use_zip_update = False @@ -9436,7 +9641,57 @@ def _cmd_update_impl(args, gateway_mode: bool): text=True, check=False, ) - print("✓ Already up to date!") + + # A current checkout does NOT imply a healthy install: a previous + # dependency sync may have failed partway (classic on Windows, + # where a running gateway/desktop backend keeps .pyd files locked + # and uv/pip dies with access-denied, stranding the venv between + # versions). Probe the venv's core imports and repair if broken — + # otherwise "Already up to date!" gaslights the user while their + # install stays bricked. + healthy, detail = _venv_core_imports_healthy() + if not healthy: + print("⚠ Checkout is current, but the venv is unhealthy:") + print(f" {detail}") + print("→ Repairing Python dependencies...") + _write_update_incomplete_marker() + from hermes_cli.managed_uv import ensure_uv + + repair_uv = ensure_uv() + # A managed install whose venv is gone entirely (interrupted + # repair after the old venv was moved aside) needs the venv + # recreated before dependencies can be installed into it. 
def cmd_console(args):
    """Run the curated Hermes console REPL and return whatever it returns.

    ``args`` is accepted so the function fits the subcommand-handler
    signature; this handler does not read it.
    """
    # Imported inside the handler, not at module import time.
    import hermes_cli.console_engine as console_engine

    return console_engine.run_console_repl()
def build_console_parser(subparsers, *, cmd_console: Callable) -> None:
    """Register the ``console`` subcommand on ``subparsers``.

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers()``.
        cmd_console: Handler stored as the parser's ``func`` default, so the
            dispatcher can call it once ``console`` is selected.
    """
    description = (
        "Open a curated Hermes command REPL. This is not a raw shell and "
        "does not expose the full Hermes CLI."
    )
    parser = subparsers.add_parser(
        "console",
        help="Open the safe Hermes command console",
        description=description,
    )
    parser.set_defaults(func=cmd_console)
# Filenames that must never be listed, read, or downloaded through the
# managed-files API. These typically contain credentials (API keys, tokens)
# and exposing them through the dashboard file browser is a security leak —
# see issue #57505.
def _is_sensitive_filename(name: str) -> bool:
    """Return True for ``.env``, any ``.env.<suffix>`` variant, and their aliases.

    The name comes from a client-supplied path, so it is normalised before
    matching:

    * Case-insensitive, so ``.ENV`` / ``.Env.local`` on case-insensitive
      filesystems (macOS/Windows mounts) can't slip past the guard.
    * Anything from the first ``:`` onward is dropped, so an NTFS
      alternate-data-stream spelling such as ``.env::$DATA`` is treated as
      ``.env``.
    * Trailing dots and spaces are dropped, because Windows path handling
      ignores them and ``".env "`` would otherwise open ``.env``.

    Every name the plain ``.env`` / ``.env.*`` check matched still matches;
    the normalisation only widens the set.
    """
    lowered = name.lower()
    # Strip a stream suffix first, then the trailing characters Windows ignores.
    base = lowered.split(":", 1)[0].rstrip(". ")
    return base == ".env" or base.startswith(".env.")
# ---------------------------------------------------------------------------
# /api/console — safe Hermes Console command WebSocket.
#
# Unlike /api/pty, this endpoint never spawns a PTY, shell, or full Hermes CLI
# subprocess. It runs the curated console engine in-process and exchanges
# structured JSON frames with the dashboard xterm overlay.
# ---------------------------------------------------------------------------

_CONSOLE_PROMPT = "hermes> "
_CONSOLE_COMMAND_TIMEOUT_SECONDS = 60.0
_CONSOLE_OUTPUT_LIMIT = 50000

# Console commands run in a worker thread. A timeout cancels only the awaiting
# coroutine; the thread itself cannot be pre-empted and runs to completion.
# A small dedicated pool keeps such leaked workers from exhausting the shared
# default thread pool and caps concurrent console execution at a fixed number
# of threads regardless of reconnects.
_CONSOLE_EXECUTOR_MAX_WORKERS = 4
_console_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
_console_executor_lock = threading.Lock()


def _get_console_executor() -> concurrent.futures.ThreadPoolExecutor:
    """Return the process-wide console worker pool, creating it on first use.

    Creation is double-checked under ``_console_executor_lock`` so concurrent
    first callers end up sharing one pool. An ``atexit`` hook is registered
    together with the pool.
    """
    global _console_executor
    if _console_executor is not None:
        return _console_executor

    with _console_executor_lock:
        # Re-check: another thread may have built the pool while we waited.
        if _console_executor is None:
            _console_executor = concurrent.futures.ThreadPoolExecutor(
                max_workers=_CONSOLE_EXECUTOR_MAX_WORKERS,
                thread_name_prefix="hermes-console",
            )

            def _teardown():
                # Don't wait on in-flight workers: a stuck console command
                # must not block interpreter shutdown. cancel_futures drops
                # anything that has not started yet.
                return _console_executor and _console_executor.shutdown(
                    wait=False, cancel_futures=True
                )

            atexit.register(_teardown)
    return _console_executor
def _dashboard_console_context() -> str:
    """Return the console command policy name: ``"hosted"`` or ``"local"``."""
    if _default_hermes_root_is_opt_data():
        return "hosted"
    return "local"


def _console_profile_from_ws(ws: WebSocket) -> Optional[str]:
    """Read the ``profile`` query parameter; missing or blank yields ``None``."""
    requested = ws.query_params.get("profile") or ""
    cleaned = requested.strip()
    return cleaned if cleaned else None


def _execute_console_line(
    engine: Any,
    line: str,
    *,
    confirmed: bool,
    profile: Optional[str],
) -> Any:
    """Run one console line through ``engine`` inside the profile scope."""
    # _profile_scope swaps process-global skill module paths; keep it inside
    # the worker thread and never hold it across awaits.
    with _profile_scope(profile):
        return engine.execute(line, confirmed=confirmed)


async def _console_send(
    ws: WebSocket,
    send_lock: asyncio.Lock,
    payload: Dict[str, Any],
) -> None:
    """Send one JSON frame while holding ``send_lock``."""
    async with send_lock:
        await ws.send_json(payload)


async def _console_send_result(
    ws: WebSocket,
    send_lock: asyncio.Lock,
    result: Any,
    *,
    command_id: int,
) -> None:
    """Translate an engine result into console frames and send them.

    Every known status (``ok``, ``error``, ``confirm_required``, ``clear``,
    ``exit``) sends zero or one leading frame and then a ``complete`` frame.
    An unknown status sends a single ``error`` frame and no ``complete``.
    """
    command = result.command or ""
    status = result.status

    if status == "ok":
        # Output is optional: an empty result goes straight to "complete".
        leading = (
            [
                {
                    "type": "output",
                    "id": command_id,
                    "stream": "stdout",
                    "data": result.output,
                    "command": command,
                }
            ]
            if result.output
            else []
        )
    elif status == "error":
        leading = [
            {
                "type": "error",
                "id": command_id,
                "message": result.output or "Command failed.",
                "command": command,
            }
        ]
    elif status == "confirm_required":
        leading = [
            {
                "type": "confirm_required",
                "id": command_id,
                "command": command,
                "message": result.confirmation_message or f"Run `{command}`?",
                "prompt": _CONSOLE_PROMPT,
            }
        ]
    elif status == "clear":
        leading = [{"type": "clear", "id": command_id}]
    elif status == "exit":
        leading = []
    else:
        await _console_send(
            ws,
            send_lock,
            {
                "type": "error",
                "id": command_id,
                "message": f"Unknown console result status: {status}",
                "command": command,
            },
        )
        return

    for frame in leading:
        await _console_send(ws, send_lock, frame)

    # "exit" closes the session, so it is the one status without a prompt.
    await _console_send(
        ws,
        send_lock,
        {
            "type": "complete",
            "id": command_id,
            "status": status,
            "command": command,
            "prompt": "" if status == "exit" else _CONSOLE_PROMPT,
        },
    )
def _console_json_payload(msg: Any) -> tuple[Optional[dict[str, Any]], Optional[str]]:
    """Decode one received WebSocket message into a console frame.

    Returns ``(payload, error)``:

    * ``(dict, None)`` when the ``text`` (or, failing that, ``bytes``) field
      holds a JSON object;
    * ``(None, None)`` when the message carries neither field;
    * ``(None, message)`` when the body is not UTF-8, not JSON, or not a
      JSON object.
    """
    not_object = "Console frames must be JSON objects."

    text = msg.get("text")
    body = text if text is not None else msg.get("bytes")
    if body is None:
        return None, None

    if isinstance(body, bytes):
        try:
            body = body.decode("utf-8")
        except UnicodeDecodeError:
            return None, "Console frames must be UTF-8 JSON."

    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return None, not_object

    if isinstance(decoded, dict):
        return decoded, None
    return None, not_object
@app.websocket("/api/console")
async def console_ws(ws: WebSocket) -> None:
    """Serve the Hermes Console over a JSON-frame WebSocket.

    The handler runs four admission checks before accepting the socket, each
    with its own close code (4404 embedded chat disabled, 4401 auth, 4403
    host/origin, 4408 client). After accepting it builds a console engine,
    sends a ``ready`` frame, and loops over incoming frames of type ``ping``,
    ``cancel``, ``confirm``, and ``input``/``command``.

    At most one command runs at a time. Each command gets an id from
    ``command_generation``; bumping that counter is how a cancelled command
    is marked stale.
    """
    peer = ws.client.host if ws.client else "?"

    if not _DASHBOARD_EMBEDDED_CHAT_ENABLED:
        _log.info("console refused: embedded chat disabled peer=%s", peer)
        await ws.close(code=4404, reason="embedded chat disabled")
        return

    auth_reason, cred = _ws_auth_reason(ws)
    mode = _ws_auth_mode()
    if auth_reason is not None:
        _log.warning(
            "console auth rejected reason=%s mode=%s cred=%s peer=%s",
            auth_reason, mode, cred, peer,
        )
        await ws.close(code=4401, reason=_ws_close_reason(f"auth: {auth_reason}"))
        return

    host_origin_reason = _ws_host_origin_reason(ws)
    if host_origin_reason is not None:
        _log.warning("console refused: %s peer=%s", host_origin_reason, peer)
        await ws.close(code=4403, reason=_ws_close_reason(host_origin_reason))
        return

    client_reason = _ws_client_reason(ws)
    if client_reason is not None:
        _log.warning("console refused: %s", client_reason)
        await ws.close(code=4408, reason=_ws_close_reason(client_reason))
        return

    await ws.accept()

    profile = _console_profile_from_ws(ws)
    context = _dashboard_console_context()
    # Every outgoing frame is sent under this lock (see _console_send calls).
    send_lock = asyncio.Lock()

    try:
        from hermes_cli.console_engine import HermesConsoleEngine

        engine = HermesConsoleEngine(
            output_limit=_CONSOLE_OUTPUT_LIMIT,
            context=context,  # type: ignore[arg-type]
        )
        # Validate a non-default profile up front; the return value is unused,
        # so the call is made only for the exception it may raise.
        if profile and profile.lower() != "current":
            _resolve_profile_dir(profile)
    except HTTPException as exc:
        await _console_send(
            ws,
            send_lock,
            {
                "type": "error",
                "message": str(exc.detail),
                "prompt": "",
            },
        )
        await ws.close(code=4400, reason=_ws_close_reason(str(exc.detail)))
        return
    except Exception as exc:
        _log.exception("console failed to initialize")
        await _console_send(
            ws,
            send_lock,
            {
                "type": "error",
                "message": f"Console unavailable: {exc}",
                "prompt": "",
            },
        )
        await ws.close(code=1011)
        return

    _log.info(
        "console accepted peer=%s mode=%s cred=%s context=%s profile=%s",
        peer,
        mode,
        cred,
        context,
        profile or "current",
    )
    await _console_send(
        ws,
        send_lock,
        {
            "type": "ready",
            "context": context,
            "profile": profile or "current",
            "prompt": _CONSOLE_PROMPT,
        },
    )

    # Session state shared between the receive loop and the command task.
    active_task: asyncio.Task | None = None  # the command currently running
    pending_confirmation: Optional[str] = None  # command awaiting "confirm"
    command_generation = 0  # id of the newest command; older ids are stale

    async def run_command(line: str, *, confirmed: bool, command_id: int) -> None:
        """Execute one line on the console pool and send its result frames."""
        nonlocal active_task, pending_confirmation, command_generation
        try:
            loop = asyncio.get_running_loop()
            # wait_for bounds only the await; the worker thread is not stopped.
            result = await asyncio.wait_for(
                loop.run_in_executor(
                    _get_console_executor(),
                    functools.partial(
                        _execute_console_line,
                        engine,
                        line,
                        confirmed=confirmed,
                        profile=profile,
                    ),
                ),
                timeout=_CONSOLE_COMMAND_TIMEOUT_SECONDS,
            )
        except asyncio.CancelledError:
            # Listed first so cancellation is re-raised untouched.
            raise
        except asyncio.TimeoutError:
            if command_id == command_generation:
                pending_confirmation = None
            # NOTE(review): the frames below appear to be sent even when this
            # command_id is stale — confirm against the original indentation.
            await _console_send(
                ws,
                send_lock,
                {
                    "type": "error",
                    "id": command_id,
                    "message": (
                        "Command timed out. Hermes Console returned to the prompt."
                    ),
                    "command": line,
                },
            )
            await _console_send(
                ws,
                send_lock,
                {
                    "type": "complete",
                    "id": command_id,
                    "status": "timeout",
                    "command": line,
                    "prompt": _CONSOLE_PROMPT,
                },
            )
        except Exception as exc:
            if command_id == command_generation:
                pending_confirmation = None
            _log.exception("console command failed")
            await _console_send(
                ws,
                send_lock,
                {
                    "type": "error",
                    "id": command_id,
                    "message": str(exc) or exc.__class__.__name__,
                    "command": line,
                },
            )
            await _console_send(
                ws,
                send_lock,
                {
                    "type": "complete",
                    "id": command_id,
                    "status": "error",
                    "command": line,
                    "prompt": _CONSOLE_PROMPT,
                },
            )
        else:
            # A newer command (or a cancel) superseded this one: drop the result.
            if command_id != command_generation:
                return
            pending_confirmation = (
                result.command if result.status == "confirm_required" else None
            )
            await _console_send_result(
                ws,
                send_lock,
                result,
                command_id=command_id,
            )
            if result.status == "exit":
                await ws.close(code=1000)
        finally:
            # Only the newest command may clear the slot.
            if command_id == command_generation:
                active_task = None

    async def start_command(line: str, *, confirmed: bool = False) -> None:
        """Allocate the next command id and launch ``run_command`` as a task."""
        nonlocal active_task, command_generation
        command_generation += 1
        command_id = command_generation
        active_task = asyncio.create_task(
            run_command(line, confirmed=confirmed, command_id=command_id)
        )

    try:
        while True:
            try:
                msg = await ws.receive()
            except RuntimeError:
                # receive() raised; treat it as the end of the session.
                break
            msg_type = msg.get("type")
            if msg_type == "websocket.disconnect":
                break

            payload, error = _console_json_payload(msg)
            if error:
                await _console_send(
                    ws,
                    send_lock,
                    {
                        "type": "error",
                        "message": error,
                        "prompt": _CONSOLE_PROMPT,
                    },
                )
                continue
            if payload is None:
                continue

            frame_type = str(payload.get("type") or "").strip().lower()
            if frame_type == "ping":
                await _console_send(
                    ws,
                    send_lock,
                    {
                        "type": "pong",
                        "prompt": _CONSOLE_PROMPT,
                    },
                )
                continue

            if frame_type == "cancel":
                if active_task and not active_task.done():
                    # Bump the generation first so the cancelled command's id
                    # no longer matches, then cancel its task.
                    command_generation += 1
                    active_task.cancel()
                    active_task = None
                    pending_confirmation = None
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "complete",
                            "status": "cancelled",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                elif pending_confirmation:
                    # Nothing running, but a confirmation prompt is dismissed.
                    pending_confirmation = None
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "complete",
                            "status": "cancelled",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                else:
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "complete",
                            "status": "idle",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                continue

            # Everything below starts a command, so refuse while one is running.
            if active_task and not active_task.done():
                await _console_send(
                    ws,
                    send_lock,
                    {
                        "type": "error",
                        "message": "A console command is already running.",
                        "prompt": _CONSOLE_PROMPT,
                    },
                )
                continue

            if frame_type == "confirm":
                command = str(payload.get("command") or pending_confirmation or "").strip()
                if not pending_confirmation:
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "error",
                            "message": "No command is waiting for confirmation.",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                    continue
                # The confirmed text must equal the command the engine asked about.
                if command != pending_confirmation:
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "error",
                            "message": "Confirmation does not match the pending command.",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                    continue
                pending_confirmation = None
                await start_command(command, confirmed=True)
                continue

            if frame_type in {"input", "command"}:
                line = str(payload.get("line") or payload.get("command") or "").strip()
                if not line:
                    # Blank input: just hand the prompt back.
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "complete",
                            "status": "ok",
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                    continue
                if pending_confirmation:
                    await _console_send(
                        ws,
                        send_lock,
                        {
                            "type": "error",
                            "message": (
                                "Confirm or cancel the pending command before "
                                "running another one."
                            ),
                            "prompt": _CONSOLE_PROMPT,
                        },
                    )
                    continue
                await start_command(line)
                continue

            await _console_send(
                ws,
                send_lock,
                {
                    "type": "error",
                    "message": f"Unsupported console frame: {frame_type or '?'}",
                    "prompt": _CONSOLE_PROMPT,
                },
            )
    except WebSocketDisconnect:
        pass
    finally:
        # Cancel any in-flight command and wait for its task to finish,
        # ignoring whatever it raises.
        if active_task and not active_task.done():
            active_task.cancel()
            try:
                await active_task
            except (asyncio.CancelledError, Exception):
                pass
for confirmation.", + "prompt": _CONSOLE_PROMPT, + }, + ) + continue + if command != pending_confirmation: + await _console_send( + ws, + send_lock, + { + "type": "error", + "message": "Confirmation does not match the pending command.", + "prompt": _CONSOLE_PROMPT, + }, + ) + continue + pending_confirmation = None + await start_command(command, confirmed=True) + continue + + if frame_type in {"input", "command"}: + line = str(payload.get("line") or payload.get("command") or "").strip() + if not line: + await _console_send( + ws, + send_lock, + { + "type": "complete", + "status": "ok", + "prompt": _CONSOLE_PROMPT, + }, + ) + continue + if pending_confirmation: + await _console_send( + ws, + send_lock, + { + "type": "error", + "message": ( + "Confirm or cancel the pending command before " + "running another one." + ), + "prompt": _CONSOLE_PROMPT, + }, + ) + continue + await start_command(line) + continue + + await _console_send( + ws, + send_lock, + { + "type": "error", + "message": f"Unsupported console frame: {frame_type or '?'}", + "prompt": _CONSOLE_PROMPT, + }, + ) + except WebSocketDisconnect: + pass + finally: + if active_task and not active_task.done(): + active_task.cancel() + try: + await active_task + except (asyncio.CancelledError, Exception): + pass + + @app.websocket("/api/pty") async def pty_ws(ws: WebSocket) -> None: peer = ws.client.host if ws.client else "?" diff --git a/optional-skills/security/unbroker/README.md b/optional-skills/security/unbroker/README.md new file mode 100644 index 000000000..d249293cd --- /dev/null +++ b/optional-skills/security/unbroker/README.md @@ -0,0 +1,164 @@ +# unbroker + +An agent-native skill that finds a consenting person's exposed personal information across data +brokers and people-search sites and removes it. It runs automatically wherever it can, and hands off +to a human only where a site demands a CAPTCHA it cannot clear, a government ID, a phone call, or a +fax. 
+ +<p align="center"> + <img src="assets/unbroker.png" + alt="unbroker: autonomous removal pipeline (exposure field, the loop, ledger, re-scan horizon)" + width="720"> +</p> + +## About + +Hundreds of data brokers publish people's names, current and prior addresses, phone numbers, emails, +relatives, and property records. That exposure fuels doxxing, stalking, harassment, and identity +theft. Removing the data is the documented antidote, but it is high-volume work, full of dark +patterns, and perishable (brokers re-list you). Commercial services such as EasyOptOuts, Incogni, and +DeleteMe solve this for a fee, but they are closed, and you hand a company you know nothing about the +exact data you are trying to erase. + +unbroker brings those core capabilities together (EasyOptOuts' automation breadth, Incogni's +legal-request engine, DeleteMe's verification and reporting) as a transparent, auditable, +self-hosted skill that the user's own agent runs. It is **multi-tenant** (manage yourself, family, or +clients, each isolated), **consent-gated**, and built for **maximum automation with a human +fallback**. Scope is **US-first**, with EU/UK (GDPR) and global coverage on the roadmap. + +The design is **Hermes-native**: a small deterministic Python CLI (`scripts/pdd.py`) owns the state +(config, dossiers, broker DB, tier planning, ledger, drafts, reports), while the agent does the +scanning and submitting with native tools (`web_extract`, `browser_*`, email, `cronjob`, +`delegate_task`). [`SKILL.md`](SKILL.md) is the authoritative reference. + +## Install + +```bash +hermes skills install official/security/unbroker +``` + +Then start a new Hermes session and drive it (below). The skill works zero-config; a few optional +env vars unlock more automation (all documented in `SKILL.md` under Prerequisites): + +- `BROWSERBASE_API_KEY`: the recommended default browser. 
A real residential-IP cloud browser that + clears soft/managed CAPTCHAs (Turnstile, hCaptcha/reCAPTCHA checkbox) as normal operation, so + those brokers stay automated. It is not a solver and does not defeat hard challenges. +- Hands-off email, two ways: **browser mode** (`pdd.py setup --email-mode browser`, no stored + password; the agent sends opt-outs and opens verification links through your logged-in webmail), + or **`EMAIL_ADDRESS` + `EMAIL_PASSWORD`** for SMTP send + IMAP verification. Without either, it + falls back to writing drafts for you to send. +- the `age` binary: at-rest encryption of dossiers and ledgers. +- the `google-workspace` skill: a shared Google Sheets status tracker. + +## Usage + +Drive it from a Hermes session: + +> "Use the unbroker skill to remove my data from data brokers. Here is my consent. Run it hands-off +> and show me the human-task digest at the end." + +The agent configures itself (`setup --auto` selects programmatic email if `EMAIL_*` creds exist, the +cloud browser if available, and encryption if `age` is installed), records your consent, then drains +the autonomous queue: scan, opt out (parents first), send and verify emails, schedule re-checks. You +hear from it twice: at intake, and with one digest of anything only a human can do. 
+ +The underlying CLI (run via `terminal`, as `python3 scripts/pdd.py <cmd>`): + +| Command | Purpose | +|---|---| +| `pdd.py setup --auto` / `doctor` | Self-configure (most-autonomous valid config) and readiness check | +| `pdd.py intake` | Create a consenting subject (captures aliases, multiple emails/phones, prior addresses) | +| `pdd.py next` | The loop driver: ordered agent actions right now, the human digest, and the next wake time | +| `pdd.py brokers` / `refresh-brokers` | List people-search brokers, or pull the latest BADBOOL list plus the CA registry | +| `pdd.py registry` | State data-broker registry coverage (CA ~545 ingested; VT/OR/TX portals); `--search` to find one | +| `pdd.py drop` | The CA DROP one-shot: delete from all registered brokers in a single request | +| `pdd.py plan` | Per-broker tier, method, search vectors, and the exact fields to disclose | +| `pdd.py fanout` | Batch brokers into parallel `delegate_task` subagents | +| `pdd.py record` | Update the ledger (validated state machine); auto-stamps recheck dates | +| `pdd.py send-email` | Render and send an opt-out / CCPA / GDPR request (recipient locked to the broker's own address) | +| `pdd.py poll-verification` / `verify-link` | Resolve email-verification links (IMAP poll, or browser-mode from pasted text) | +| `pdd.py render-email` | Draft-only fallback (least-disclosure) | +| `pdd.py due` / `tasks` | Recheck queue for cron, and the consolidated human-task digest | +| `pdd.py status` / `report` | Per-subject status, plus optional Google Sheets rows | + +## How it works + +- **Autonomous by default.** After one human conversation (intake plus consent), the agent drains a + deterministic action queue (`pdd.py next`): scan, opt out parents-first, send and verify emails, + re-check on schedule, all without pausing to ask. Human-only work (gov-ID sites, phone callbacks, + hard-CAPTCHA sites) accumulates silently into a single end-of-run digest (`pdd.py tasks`). 
+- **Tiered automation (T0 to T3).** Every broker opt-out is classified from fully automated, to + automated with verification, to human-verified, to human-only. The agent always takes the highest + viable tier and escalates to a human task only when genuinely blocked. +- **Cluster parents first.** Many brokers are resold shells of a few parents, so one removal can + clear a dozen child sites. The planner orders parents ahead of standalone listings and ships + field-verified, per-parent playbooks that usually prefer the **right-to-delete** lane over mere + suppression (for example Whitepages' privacy email, which sidesteps the phone-callback tool), with + per-broker exceptions where the record says otherwise (PeopleConnect: deleting your user data wipes + your suppressions and does not stop public-records re-listing, so suppress-and-maintain instead). +- **Multi-identifier fan-out.** A person is indexed under every name/alias, phone, email, and + address. The planner expands all of them (filtered by what each broker supports) so listings under + a maiden name or an old address are found, not just "primary name plus current city". +- **Verify before you disclose.** Nothing is submitted until a real listing is confirmed, the match + is confirmed as the subject and not a namesake or relative, and only the exact fields a broker + requires are sent (least-disclosure; SSN and ID numbers are never volunteered). +- **Jurisdiction-aware.** Requests file under the framework that applies where the subject lives: + CCPA/CPRA in California, GDPR in the EU/UK, a general right-to-delete request otherwise. It never + cites a right the subject cannot invoke. 
+- **Coverage that matches or exceeds commercial services.** Two lanes: (1) people-search sites with + per-site opt-out mechanics (19 curated records, including FamilyTreeNow, Radaris, and Nuwber, plus + a live pull from [BADBOOL](https://github.com/yaelwrites/Big-Ass-Data-Broker-Opt-Out-List)), and + (2) the **state data-broker registries** as a distinct legal-coverage lane: the **California Data + Broker Registry** (~545 registered brokers, the authoritative universe the commercial services draw + from) is ingested, with Vermont, Oregon, and Texas surfaced as search portals. +- **The DROP one-shot.** California's Delete Request and Opt-out Platform is live: for a CA resident, + a single verified request deletes their data from **every registered broker at once**, and + `pdd.py next` surfaces it as the highest-leverage action. +- **Ledger, audit, and re-scan.** Every case is a validated state machine, every PII disclosure is + logged (field names only), and confirmed removals are re-scanned on a schedule so a re-listing is + caught and re-filed. Ledger writes are file-locked for safe concurrent runs. +- **Privacy by default.** Opaque subject ids (no name in ids, paths, or logs), optional `age` at-rest + encryption of dossiers, and everything local. The skill ships placeholder data only. + +## Tests + +85 hermetic tests (no network, browser, or email; SMTP and IMAP are exercised through injected +fakes): + +```bash +scripts/run_tests.sh tests/skills/test_unbroker_skill.py # CI-parity harness +python3 tests/skills/test_unbroker_skill.py # dependency-free fallback runner +``` + +## Safety and ethics + +- **Consent-gated.** The engine refuses to scan or act on a subject without a recorded + authorization. It is a removal tool, not a people-search aggregator. +- **Sanctioned browser only, no solver farms.** The default cloud browser clears soft/managed + CAPTCHAs the way any real browser would, but there is no CAPTCHA-solving service and no fingerprint + spoofing. 
Hard interactive challenges escalate to a human task. +- **Least-disclosure and honest reporting.** The skill submits only what a broker requires. "Hidden + from free search" is never reported as "deleted", and residual exposure (public records, paid-tier + retention) is disclosed. +- **PII handling.** Dossiers live under the Hermes home directory (`0600`, optionally + `age`-encrypted), with opaque ids. + +## Status + +**v1.0.** The deterministic engine, the autonomous loop, the verified cluster-parent deletion lanes, +and full broker-registry coverage (the CA Data Broker Registry plus the DROP one-shot) are built and +covered by 85 hermetic tests. The skill ships placeholder data only. Live agent-driven submission +against broker sites is the active field-testing frontier. + +## Credits and license + +- Broker dataset adapted from the **Big-Ass Data Broker Opt-Out List (BADBOOL)** by **Yael Grauer**, + licensed [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) (attribution + required, non-commercial). See [yaelwrites.com](https://yaelwrites.com/). +- Code: MIT. + +## Disclaimer + +This is not legal advice. Only operate on people who have authorized removal of their own data. +Removing data from brokers reduces exposure but does not guarantee total erasure. Public records +(voter, property, court) and offline vectors are out of scope. diff --git a/optional-skills/security/unbroker/SKILL.md b/optional-skills/security/unbroker/SKILL.md new file mode 100644 index 000000000..8cbb9c83e --- /dev/null +++ b/optional-skills/security/unbroker/SKILL.md @@ -0,0 +1,300 @@ +--- +name: unbroker +description: Autonomously remove your info from data-broker sites. 
+version: 1.0.0 +author: SHL0MS (github.com/SHL0MS) +license: MIT +platforms: [linux, macos, windows] +prerequisites: + commands: [python3] +metadata: + hermes: + tags: [privacy, data-broker, opt-out, ccpa, gdpr, security, doxxing] + category: security + related_skills: [google-workspace, agentmail, himalaya, scrapling, osint-investigation] + homepage: https://github.com/NousResearch/hermes-agent +--- + +# unbroker + +Find where a person's personal information (name, addresses, phone, email, relatives) is exposed on +data brokers and people-search sites, then remove it - automatically where possible, with guided +human steps only where a site demands a CAPTCHA, government ID, phone call, or fax. Manages multiple +people independently. It does **not** defeat anti-bot systems, does **not** act on anyone without +recorded consent, and does **not** remove public records (voter/property/court) or accounts the +person controls. + +The Python CLI (`scripts/pdd.py`) owns the deterministic state - config, dossiers + consent, the +broker database, tier planning, the ledger, drafts, reports, **email sending (SMTP), verification-link +polling (IMAP), and the autonomous action queue (`next`)**. You (the agent) do the scanning and +form-driving with native tools: `web_extract` and `browser_navigate` for searching and web forms, and +`cronjob` for recurring re-scans. + +## Autonomy contract + +This skill is designed to run **hands-off**. After intake (+ recorded consent) there are exactly TWO +legitimate human touchpoints: (1) the intake conversation itself, and (2) ONE consolidated human-task +digest at the end of the run (`$PDD tasks`). Between those: + +- **Never ask the operator to choose configuration.** `$PDD setup --auto` detects capabilities and + picks the most autonomous valid config itself. +- **Never pause before individual submissions** when `autonomy=full` (the default): the consent + recorded at intake is standing authorization for T0-T2 opt-outs. 
(`autonomy=assisted` restores + per-submission confirmation for cautious operators - honor `confirm_first` flags in `next` output.) +- **Never interrupt the run for human-only work.** Record it (`record ... human_task_queued + --reason "..."`) and keep going; it all surfaces once in the final digest. +- **Drive the whole run as a loop over `$PDD next <subject>`** - it returns the exact ordered actions + to take right now (scan, poll verification, re-check, opt out parents-first, requeue blocked), plus + the human digest. Execute every action, record outcomes, re-run `next`, repeat until + `done_for_now`. Then present the digest, report, and schedule the cron. + +The hard limits that autonomy never overrides: no acting without recorded consent, no disclosure +beyond `disclosure_fields`, no CAPTCHA/anti-bot bypass, and `confirmed_removed` only after a +verifying re-scan. + +## When to Use + +- "Remove my (or my family member's) data from data brokers / people-search sites." +- "Opt me out", "delete me from Spokeo/Whitepages/etc.", "clean up after a doxxing." +- "Set up recurring privacy monitoring" (brokers re-list people). +- Checking which brokers still expose someone and why. + +## Prerequisites + +- `python3` (stdlib only; no extra packages needed for the core engine). 
+- **Optional upgrades** (the skill works zero-config without these; `setup --auto` turns on every + one it detects, reading credentials from the shell env **and from `$HERMES_HOME/.env`** so keys + Hermes already loads for its own tools are picked up without re-exporting - each one converts a + class of human tasks into agent actions): + - **Cloud browser (recommended default): `BROWSERBASE_API_KEY`.** `setup --auto` selects it + whenever the key is present, and it is the intended baseline: a real residential-IP cloud + browser **clears soft/managed CAPTCHAs (Cloudflare Turnstile, hCaptcha/reCAPTCHA checkbox) as + normal operation**, so those brokers stay automated (T1) instead of becoming human tasks. This + is not CAPTCHA "solving" - no solver service, no fingerprint spoofing; only interactive/behavioral + ("hard") challenges the browser genuinely cannot pass fall back to a human task. Without the key, + the plain agent browser is used and soft-CAPTCHA brokers drop to T2 (human). + - Email automation, two options (one credential-free, one using stored credentials): + - **Browser mode (no password): `setup --email-mode browser`.** The agent sends opt-out/CCPA + emails and opens verification links through the operator's **logged-in webmail** using + `browser_*` tools. Nothing is stored. This requires Hermes to be pointed at the operator's own + logged-in browser, **NOT** a cloud browser: a headless cloud browser (Browserbase) holds no + webmail session and is itself Cloudflare/DataDome-gated on webmail and on session-bound broker + gates (e.g. PeopleConnect guided-mode). Drive the operator's real Chrome over CDP - launch + `chrome --remote-debugging-port=9222 --user-data-dir="$HOME/.hermes/chrome-debug"` (a dedicated + debug profile signed into the webmail once, not the Default profile) and connect the browser + tools to `127.0.0.1:9222`.
**`$PDD cdp` launches this for you** (finds Chrome/Chromium/Brave/Edge, + starts it detached on the dedicated profile, prints the CDP endpoint; `--check` to test, `--print` + for the command). See `references/methods.md` -> "Browser backends: scan vs execute". + Falls back to drafts for an email if the inbox isn't reachable. + - **SMTP/IMAP (stored creds): `EMAIL_ADDRESS` + `EMAIL_PASSWORD`** (+ `EMAIL_SMTP_HOST` / + `EMAIL_IMAP_HOST` for non-mainstream providers; gmail/outlook/yahoo/icloud/fastmail inferred). + The CLI sends via `send-email` and reads verify links via `poll-verification`. The `agentmail` + skill (per-broker aliases) also counts. + - Google Sheets tracker: the `google-workspace` skill. + - The `scrapling` skill for stealth/Cloudflare-protected pages. + +## How to Run + +Run everything through the `terminal` tool. From this skill's directory: + +```bash +PDD="python3 scripts/pdd.py" +``` + +The engine stores data under `$PDD_DATA_DIR` (default `$HERMES_HOME/unbroker`), written +`0600`. Run via `terminal`, **not** `execute_code` (that sandbox scrubs env and redacts output, which +breaks reading the dossier). + +## Quick Reference + +| Command | Purpose | +|---|---| +| `$PDD setup --auto` | **Autonomous setup**: detect capabilities, pick the most autonomous valid config (no questions) | +| `$PDD doctor` | Readiness check: config, broker count, and which upgrades are on/available | +| `$PDD cdp [--check] [--print] [--port N]` | Launch/detect the operator's Chrome over CDP for Phase-2 browser + webmail (dedicated debug profile; the reliable way to send webmail and clear session-bound gates) | +| `$PDD intake --full-name "..." [--alias ...] [--email ... --phone ...] 
[--city --state] [--prior-location "City,ST"] --consent` | Create a consenting subject; captures aliases + multiple emails/phones + prior locations; prints `subject_id` | +| `$PDD next <subject>` | **The autonomous loop driver**: ordered agent actions right now + human digest + `next_wake_at` | +| `$PDD brokers [--priority crucial]` | List the people-search broker database (curated + live) | +| `$PDD refresh-brokers` | Pull the latest BADBOOL people-search list **and the CA Data Broker Registry** (`next` requeues this automatically when the cache is stale) | +| `$PDD registry [--search NAME]` | State registry coverage (CA ~545 ingested; VT/OR/TX portals surfaced); the DROP/email lane, not scanned | +| `$PDD drop <subject> [--filed]` | **The one-shot legal lever**: one CA DROP request deletes from ALL registered brokers; `--filed` records it | +| `$PDD plan <subject> [--priority crucial]` | Per-broker tier + method + `search_vectors` + the exact fields to disclose | +| `$PDD plan <subject> --batch` | **Reduce view**: overlays ledger state, groups brokers by next action (unscanned/found/indirect/blocked/in_progress/done), collapses ownership clusters, **orders `found` cluster-parents-first + emits a tailored `parent_playbook`**, prints `next_actions` | +| `$PDD fanout <subject> [--priority crucial] [--size 5]` | Batch brokers into parallel `delegate_task` subagents (auto for large runs; default batch size is 5, because batches of 8+ time out) | +| `$PDD record <subject> <broker> <state> [--found true] [--evidence JSON] [--disclosed F --channel C] [--reason "..."]` | Update the ledger (validated state machine); **auto-stamps `next_recheck_at`** | +| `$PDD show <subject> <broker>` | Read back a case's recorded state + evidence + disclosure log (so the parent re-verifies a subagent's `found` without re-deriving the listing URL) | +| `$PDD send-email <subject> <broker> --listing <url> [--kind ccpa_indirect ...]` | Render + record the request (recipient locked to the broker's own address).
**browser** mode returns a `compose` payload to send via webmail (no password); **programmatic** mode SMTP-sends | +| `$PDD verify-link <subject> <broker> --text '<body>'` | **browser mode**: extract a broker's verification link from webmail text you read (anti-phishing scored) | +| `$PDD poll-verification <subject> [--broker <id>]` | **programmatic mode**: poll IMAP for verification links (anti-phishing scored); auto-advances `submitted → verification_pending` | +| `$PDD render-email <subject> <broker> --listing <url>` | Draft only (fallback when no email mode is configured) | +| `$PDD due <subject>` | Cases whose recheck window arrived (the cron re-scan queue) | +| `$PDD tasks <subject>` | ONE consolidated human-task digest (present at END of run) | +| `$PDD status <subject>` | Markdown status report | +| `$PDD report <subject> --sheets` | Rows for the Google Sheets tracker | + +## Batch operation (two-phase: crawl-all, then delete) + +For anything past a couple of brokers, run this as **map → reduce → act**, not broker-by-broker: + +- **Phase 1 - DISCOVER (read-only, parallel, idempotent).** Crawl *every* broker first and record a + verdict for each (`found` / `not_found` / `indirect_exposure` / `blocked`). Scanning has no side + effects, so it is safe to parallelize and retry. Getting the full exposure map *before* acting is + what unlocks cluster dedup and prioritization below. **Default: the parent drives `web_extract` + probes directly** - most people-search sites render name/phone/address results as static HTML that + `web_extract` reads in seconds. Escalate to `browser_*` only for the few JS-only sites, and to + `delegate_task` subagents only for genuinely *reasoning*-heavy work (large-scale namesake/relative + disambiguation). 
**Do NOT hand a browser-toolset subagent a big list of brokers to crawl** - in the + field this timed out repeatedly (600s, ~5-6 brokers each, no summary) because browser navigation is + heavy; the ledger writes that survived came at 10x the cost of parent `web_extract`. A `blocked` + (DataDome/Cloudflare/`antibot`) site is *not* a subagent job either: record `blocked` and requeue it + for a stealth/cloud browser (Browserbase) pass. Subagent reports are self-reports - the parent + re-fetches key URLs to confirm a `found` before trusting it (this cuts both ways: it caught a real + listing the parent had wrongly assumed was a false positive). +- **REDUCE - `$PDD plan <subject> --batch`.** Collapses the crawl into a phase-oriented plan: groups by + next action, **collapses ownership clusters** (a parent removal that clears children is ONE action, + not N - e.g. one Intelius/PeopleConnect suppression covers Truthfinder/Instant Checkmate/US Search/…), + and prints `next_actions`. `phase` is `discover` while anything is unscanned, else `delete`. +- **Phase 2 - DELETE (sequential, irreversible).** Work the reduced groups **parents first**: + `plan --batch` orders the `found` group cluster-parents-first (most children first) and emits a + `parent_playbook` with tailored, ordered steps per parent - follow that order and those steps + (full recipes in `references/methods.md` → "Ownership clusters - DO PARENTS FIRST"). Do the + cluster parents (skipping the covered children), **re-scan each parent's children after it confirms** + (they usually drop out), then the standalone listings; send the `indirect_exposure` cases as + CCPA/GDPR delete-my-PII emails (`send-email --kind ccpa_indirect`), and defer `blocked` to the + stealth-browser pass. 
Opt-outs hit CAPTCHAs, email-verification loops, and session binding - work + them **one at a time, carefully** (this is the opposite of fan-out), but do NOT stop to ask + permission per submission in `autonomy=full`; in `assisted`, confirm each one. **Usually prefer + deletion over suppression** where a broker offers both (Spokeo/BeenVerified) - but follow the + record's `deletion.prefer`: **PeopleConnect is the exception** (`prefer: false`), where deleting + your user data removes your suppressions and does not stop public-records re-listing, so you + suppress-and-maintain instead. + +Subagent reports are self-reports: the parent re-verifies key claims (listing URLs, match basis) before +recording `found` and before any deletion. + +## Procedure (the autonomous loop) + +1. **Setup (once, no questions).** Run `$PDD setup --auto` - it detects capabilities and configures + the most autonomous valid combination itself (programmatic email when `EMAIL_*` creds exist, + Browserbase when its key exists, `age` encryption when the binary exists, `autonomy=full`). Then + `$PDD doctor` and show the operator the readiness output **for information, not as a question** - + proceed immediately. Mention what would unlock more automation (e.g. email creds) but do not wait. +2. **Intake + consent (the ONE human conversation).** `$PDD intake ...` with `--consent` (and + `--consent-method`). Without consent the engine refuses to plan or act. Collect everything in one + pass - names/aliases, current + prior cities, emails, phones - so you never have to come back with + questions. For California subjects, also read `references/legal/drop.md`: `next` will surface a + `drop_submit` one-shot that deletes from every registered broker (~545) at once, which is the + single highest-leverage action. File it, then `drop <subject> --filed`. 
For non-CA subjects the + registry is covered by targeted CCPA/GDPR emails (`registry --search`, then `send-email`); the + people-search sites are worked directly in either case. +3. **Drain the queue.** Loop: + + ``` + while true: + q = $PDD next <subject> + if q.actions is empty: break + execute EVERY action in order; record each outcome via $PDD record + ``` + + `next` emits, in order: `refresh_brokers` (stale cache), `fanout_scan`/`scan_inline` (Phase 1 + crawl - see step 4), `poll_verification` (in-flight email confirmations), `verify_removal` (due + re-checks), `optout_web_form`/`optout_email_send` (Phase 2, parents-first with playbook steps), + `indirect_email_send`, and `stealth_rescan`. Human-only work never appears as an action - it + accumulates in `q.human_digest`. In `autonomy=full`, execute actions without pausing; honor + `confirm_first` in `assisted` mode. +4. **Scanning (when `next` says so).** For `fanout_scan`: run `$PDD fanout <subject>` and **spawn one + `delegate_task` subagent per `batch`, in parallel, passing that batch's ready-made `brief`** - do + not scan all brokers yourself sequentially. For `scan_inline`: scan the few brokers yourself. + Either way, each broker gets **every** `search_vectors` entry via the `references/methods.md` + ladder (`web_extract` → `site:` probe → `browser_navigate` → `scrapling`), a 404 is INCONCLUSIVE + (not `not_found`), `blocked` is recorded when `antibot` is set and no stealth browser is available, + and subject vs namesake/relative is confirmed before recording: + `$PDD record <subject> <broker> <found|not_found|indirect_exposure|blocked> --found <bool> --evidence '{"listing_urls":[...]}'`. + The parent re-verifies key `found` claims from subagents before trusting them. +5. 
**Opt-outs (when `next` says so).** Actions come pre-ordered parents-first with `steps` from each + broker record's own `optout.playbook` (field-verified; cluster parents like PeopleConnect, + Whitepages, BeenVerified, Spokeo have exact, live-checked recipes). **Deletion usually beats + suppression**: when an action carries `prefer_deletion`, complete the record's DELETION lane, not + just the hide-my-listing flow. When it carries `prefer_suppression` instead (**PeopleConnect** - + deleting removes your suppressions and does not stop re-listing), do the suppression flow and keep + it maintained; use their Delete button only for a deliberate data-purge. Per method: + - **web_form** → drive `optout_url` with `browser_navigate`/`browser_type`/`browser_click`, submit + only `disclosure_fields`, screenshot the confirmation, then the action's `after` record command. + Playbooks may end with a right-to-delete `send-email` follow-up - do it (full erasure, not just + listing suppression). + - **email** → `$PDD send-email <subject> <broker> --kind <ccpa|gdpr|generic> --to <addr> + --listing <url>` records + discloses in one step (recipient locked to addresses the broker + record declares; `next` picks the kind from residency - never claim CCPA/GDPR for someone who + can't). In **browser** mode it returns a recipient-locked `compose` payload: compose a new + message to `compose.to` with `compose.subject`/`compose.body` exactly in the operator's webmail + via `browser_*` and send (no password); in **programmatic** mode it SMTP-sends. `next` also + routes human-gated forms (phone-callback/gov-ID) through a broker's deletion email when one + exists - the **rescue lane** (verified Whitepages pattern). Draft-only falls back to + `render-email` + a digest entry. 
+ - **captcha** → soft/managed challenges clear automatically on the default cloud browser (proceed + as normal); only a hard interactive/behavioral challenge it can't pass is recorded `blocked` + (requeued for the stealth/operator-browser pass). Never a solver service. + - **phone_callback / account / gov_id / fax / mail / voice (T3)** *without a deletion email* → + never an agent action; `next` already routed these to the digest. Record them: + `$PDD record <subject> <broker> human_task_queued --reason "..."`. +6. **Verification (when `next` says so).** In **programmatic** mode `$PDD poll-verification <subject>` + finds arrived confirmation links via IMAP (anti-phishing scored, auto-advances state). In + **browser** mode, open the broker's confirmation email in the operator's webmail and run + `$PDD verify-link <subject> <broker> --text '<body>'` to score the link. Either way **open the + link in the same browser** (several brokers bind the verification session to the browser that + opens it), finish the flow, then record `awaiting_processing`. `confirmed_removed` ONLY after a + verifying re-scan shows the listing gone - never off the submission flow's own confirmation page. +7. **Wrap up (once per run).** When `next` returns no actions: present `$PDD tasks <subject>` (the + consolidated human digest) if non-empty, then `$PDD status <subject>`; if the Sheets tracker is + on, append `$PDD report <subject> --sheets` rows via the `google-workspace` skill. +8. **Schedule the next wake-up.** `next` returns `next_wake_at` (earliest due re-check). Create ONE + `cronjob` that re-runs this skill's loop for the subject (a prompt like: *"run the + unbroker loop for <subject_id>: `$PDD next` and execute all actions"*). Processing + windows, verification polls, and reappearance sweeps all flow through the same queue, so the case + keeps advancing with zero human attention.
+ +## Pitfalls + +- **Never disclose more than the broker already shows.** Submit only `disclosure_fields`. The engine + never volunteers SSN/ID numbers; you must not either. +- **No consent, no action.** The engine enforces this; do not work around it to "research" a third party. +- **`send-email` is idempotent + rate-limited.** It refuses to re-send a case already `submitted` + or beyond (use `--force` only if a genuine re-send is needed), and SMTP sends are paced by + `email_min_interval_seconds` (default 20s) with retry/backoff. Do not loop it to "make sure" - + a successful SMTP handoff is not proof of delivery; the due-queue re-scan is the real confirmation. +- **Ledger writes are locked.** Concurrent runs (cron + manual) serialize safely; if you ever see a + lock timeout, another run is mid-write - let it finish, don't delete the `.lock` by hand. +- **Autonomy ≠ improvisation.** Full autonomy means not *asking* between steps; it does not loosen any + gate. If a broker demands MORE than the planned `disclosure_fields` mid-flow, stop that case and + queue it (`human_task_queued --reason`) rather than deciding alone to disclose extra PII. +- **Don't interrupt the run with questions.** Config choices are `setup --auto`'s job; human-only work + goes to the digest. The only mid-run question that's ever warranted is a missing-identity fact that + blocks scanning (e.g. no city at all) - and that should have been collected at intake. +- **Use `terminal`, not `execute_code`** for `pdd.py` (secret scrubbing + output redaction break it). +- **Dossiers are plaintext by default** (JSON, `0600` under `HERMES_HOME`). For at-rest encryption run + `$PDD setup --encryption age` - it generates a local `age` key and encrypts dossiers + ledgers (the + audit log holds field names only and stays plaintext). It guards casual/backup/commit exposure, not + a full-`HERMES_HOME` read; set `PDD_AGE_IDENTITY` to a separate volume for real key separation. 
+ `$PDD doctor` shows whether encryption is *actually* engaged (not just whether `age` is installed). +- **"Hidden from free search" ≠ deleted.** Only mark `confirmed_removed` after verifying the record is + actually gone; note paid-tier retention in the report. +- **Soft CAPTCHAs clear by default; don't fight the hard ones.** The default cloud browser passes + managed/soft challenges as normal operation (those brokers stay T1). For a hard interactive one it + genuinely can't pass, record `blocked` and let the stealth/operator-browser pass take it - never a + third-party solver service or fingerprint spoofing. +- **Broker pages change.** If a flow breaks, `$PDD record ... blocked` and flag the broker file in + `references/brokers/` for re-verification instead of guessing. +- **Verify non-field-verified records before submitting.** `confidence: auto` records came from + parsing BADBOOL (read `optout.notes`/`optout.links`, confirm the real opt-out URL). `confidence: + documented` records (several people-search sites) carry the correct published opt-out URL but have + **not** been field-verified (they 403 datacenter IPs), so confirm the live flow via the operator's + residential browser on first use, then set `last_verified`. Field-verified curated records (no + `confidence`, e.g. the cluster parents) have checked mechanics and take precedence. + +## Verification + +- `scripts/run_tests.sh tests/skills/test_unbroker_skill.py` (hermetic; no network), or the + dependency-free runner `python3 tests/skills/test_unbroker_skill.py`. +- Dry run: `$PDD setup --auto && $PDD doctor && SID=$($PDD intake --full-name "Test Person" + --email t@example.com --consent | python3 -c 'import sys,json;print(json.load(sys.stdin)["subject_id"])') + && $PDD next "$SID"` and confirm a readiness summary plus an ordered action queue. 
diff --git a/optional-skills/security/unbroker/assets/unbroker.png b/optional-skills/security/unbroker/assets/unbroker.png new file mode 100644 index 000000000..3ad0c28d1 Binary files /dev/null and b/optional-skills/security/unbroker/assets/unbroker.png differ diff --git a/optional-skills/security/unbroker/references/brokers/advancedbackgroundchecks.json b/optional-skills/security/unbroker/references/brokers/advancedbackgroundchecks.json new file mode 100644 index 000000000..ecc3932a6 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/advancedbackgroundchecks.json @@ -0,0 +1,50 @@ +{ + "id": "advancedbackgroundchecks", + "name": "AdvancedBackgroundChecks", + "category": "people_search", + "priority": "high", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://www.advancedbackgroundchecks.com", + "fetch": "web_extract", + "match_signal": "result", + "by": ["name", "phone", "address"], + "url_patterns": { + "name": "https://www.advancedbackgroundchecks.com/name/{first}-{last}", + "name_state": "https://www.advancedbackgroundchecks.com/name/{first}-{last}/in/{ST}", + "phone": "https://www.advancedbackgroundchecks.com/phone/{digits10}", + "address": "https://www.advancedbackgroundchecks.com/address/{num}-{street-with-type}-{city}-{ST}-{zip}", + "person_profile": "https://www.advancedbackgroundchecks.com/find/person/{first}-{last}-{22charID}" + }, + "url_format_quirks": [ + "All path segments lowercase, spaces -> hyphens. Name: /name/jane-public ; name+state: /name/jane-public/in/NY (state 2-letter UPPERCASE).", + "Phone: /phone/5551234567 (10 digits, NO punctuation).", + "Address: /address/123-main-st-anytown-ny-12345 (all hyphen-joined incl. ZIP; abbreviations like 'S' and 'Ave' kept verbatim, not expanded).", + "Removable unit is the person profile: /find/person/{first}-{last}-{22charID} (opaque mixed-case ID). 
Also /find/name/{first}-{last} and /find/name/{first}-{last}/in/{ST}.", + "NO 404s for plausible patterns: an empty search returns a soft-200 page with 'similar' fuzzy rows. The real 'not found' signal is the ABSENCE of an exact-match block in the results heading, NOT an HTTP error. Do not record not_found off a 404 here; read the heading.", + "No anti-bot gating on search/result/phone/address pages (web_extract reads them fine). Site self-brands as 'ActualPeopleSearch' in FAQ text.", + "Search tabs: /?tab=phone /?tab=email /?tab=address. Directories: /lastnames/{surname} /firstnames/{first} /people/{state-name}/{city-name} /people/zip/{zip} /people/areacode/{code}." + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.advancedbackgroundchecks.com/opt-out", + "requires": { + "profile_url": false, + "email_verification": true, + "captcha": true, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "contact_email"], + "notes": "Two-step email-verification flow: initial form (radio 'I am: The subject of the request / An authorized agent of the subject', First name*, Middle name, Last name*, Email*, consent) -> emailed link -> full form -> confirmation (processed within 45 days). CAPTCHA-gated: Google reCAPTCHA ('Recaptcha requires verification'). 
Verified read-only 2026-06-30; not submitted.", + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": "2026-06-30", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/beenverified.json b/optional-skills/security/unbroker/references/brokers/beenverified.json new file mode 100644 index 000000000..8fad47fe3 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/beenverified.json @@ -0,0 +1,56 @@ +{ + "id": "beenverified", + "name": "BeenVerified", + "category": "people_search", + "priority": "crucial", + "jurisdictions": ["US"], + "parent": "beenverified", + "owns": ["peoplelooker", "peoplesmart"], + "search": { + "method": "url_pattern", + "url": "https://www.beenverified.com/app/optout/search", + "fetch": "browser", + "match_signal": "result", + "by": ["name", "phone", "email", "address"] + }, + "optout": { + "tier": "T1", + "method": "web_form", + "url": "https://www.beenverified.com/svc/optout/search/optouts", + "email": "privacy@beenverified.com", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "contact_email", "profile_url"], + "deletion": { + "via": "email_followup", + "email": "privacy@beenverified.com", + "kinds": ["ccpa", "generic"], + "notes": "privacy@beenverified.com is the documented address for opt-out, access, deletion, and correction requests (verified from the live privacy policy 2026-07-01). Controller is The Lifetime Value Co. -- a deletion request here can name their other properties too. DPO: dpo@beenverified.com. CCPA window: respond within 45 days (extendable to 90)." 
+ }, + "playbook": [ + "Opt-out tool at beenverified.com/svc/optout/search/optouts ('Do Not Sell or Share' footer link; the legacy /app/optout/search path serves the same search) -- clears PeopleLooker + PeopleSmart.", + "Search the subject, open the matching listing, and submit with the confirmed profile URL + contact email. Email verification: poll-verification picks up the confirmation link; open it in the agent's own browser.", + "ONE opt-out per email address via the tool; for additional listings email support@beenverified.com or privacy@beenverified.com.", + "FULL DELETION follow-up (right-to-delete, beyond listing suppression): send-email --kind ccpa (CA) / generic to privacy@beenverified.com naming the listing URL(s); as the controller is The Lifetime Value Co., ask that the deletion cover affiliated LTV properties (NeighborWho, Ownerly, NumberGuru, Bumper) in the same request.", + "A separate property-search opt-out exists (NeighborWho/Ownerly are property-focused sisters); note residual exposure if relevant and re-scan the children after the parent confirms." + ], + "notes": "Opt-out form + deletion email both verified from the live privacy policy 2026-07-01 (policy dated 2025-10-21). Authorized agents: signed authorization letter or CA POA emailed to privacy@beenverified.com; agent-submitted delete/know requests must include the consumer's name, valid email, age, and address.", + "quirks": [ + "The 'Do Not Sell or Share My Personal Information' footer link now points to /svc/optout/search/optouts (observed 2026-07-01); records citing /app/optout/search are the same tool's older entry.", + "One opt-out per email address through the tool; email support for additional removals. The same browser/inbox must open the confirmation link.", + "Sister LTV Co. 
properties (NeighborWho, Ownerly, NumberGuru, Bumper) keep separate opt-out tools -- do NOT assume the BV tool cleared them; the privacy@ deletion request can name them, then verify by scanning each.", + "Right-to-know requests get an initial response within 10 days; deletion/access within 45 days (CCPA)." + ], + "est_processing_days": 7, + "reappearance_risk": "medium" + }, + "last_verified": "2026-07-01", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/clustal.json b/optional-skills/security/unbroker/references/brokers/clustal.json new file mode 100644 index 000000000..c90a6fdfd --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/clustal.json @@ -0,0 +1,47 @@ +{ + "id": "clustal", + "name": "Clustal", + "category": "people_search", + "priority": "high", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://www.clustal.org/", + "fetch": "web_extract", + "match_signal": "record", + "by": ["name"], + "url_patterns": { + "name_index": "https://www.clustal.org/people-search/{letter}/{first-last}/", + "name_state": "https://www.clustal.org/people-search/{letter}/{first-last}/{st}/", + "record": "https://www.clustal.org/record/{first-last}-{8charID}/" + }, + "url_format_quirks": [ + "Name index: /people-search/{letter}/{first-last}/ e.g. /people-search/p/jane-public/ (lowercase, first letter of LAST name as the {letter} segment, hyphen-joined name). Optional state refine appends /{st}/ lowercase 2-letter.", + "Detail (removable unit): /record/{first-last}-{8charID}/ e.g. /record/jane-public-a1b2c3d4/ (opaque mixed-case 8-char id). The index page links each match to its /record/ URL.", + "Readable via web_extract (no hard bot gate as of 2026-07-01). Rich profile: age/DOB, current+prior addresses, phones, emails, relatives, neighbors, associates.", + "Name only: search is by name(+state); no reverse phone/email/address path observed.
NOTE: clustal.org squats the 'Clustal' bioinformatics brand but IS a real people-search/data-broker site ('Find Your DNA Relatives'), not the sequence-alignment tool - do not dismiss it as a false positive." + ] + }, + "optout": { + "tier": "T0", + "method": "web_form", + "url": "https://www.clustal.org/privacy-control/", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["profile_url", "contact_email"], + "notes": "Opt-out at /privacy-control/ using the /record/ profile URL; support help@clustal.org. Verify the exact form fields + any CAPTCHA live before submitting (requires flags above are provisional from the site's opt-out copy, not yet a live form walk-through).", + "email": "help@clustal.org", + "est_processing_days": 7, + "reappearance_risk": "medium" + }, + "last_verified": "2026-07-01", + "source": "BADBOOL", + "confidence": "curated" +} diff --git a/optional-skills/security/unbroker/references/brokers/clustrmaps.json b/optional-skills/security/unbroker/references/brokers/clustrmaps.json new file mode 100644 index 000000000..988aeddd6 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/clustrmaps.json @@ -0,0 +1,52 @@ +{ + "id": "clustrmaps", + "name": "ClustrMaps", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "clustrmaps", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://clustrmaps.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://clustrmaps.com/bl/opt-out", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "contact_email", 
+ "profile_url" + ], + "notes": "Address/resident aggregator. Opt-out at /bl/opt-out: submit the profile URL + email, confirm the link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url (the address/person page).", + "Email verification required." + ], + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/cyberbackgroundchecks.json b/optional-skills/security/unbroker/references/brokers/cyberbackgroundchecks.json new file mode 100644 index 000000000..a592504f7 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/cyberbackgroundchecks.json @@ -0,0 +1,53 @@ +{ + "id": "cyberbackgroundchecks", + "name": "CyberBackgroundChecks", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "cyberbackgroundchecks", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.cyberbackgroundchecks.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.cyberbackgroundchecks.com/removal", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name", + "contact_email", + "profile_url" + ], + "notes": "Free people-search. 
Opt-out at /removal: find the record, submit email, confirm link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url.", + "Email verification required." + ], + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/familytreenow.json b/optional-skills/security/unbroker/references/brokers/familytreenow.json new file mode 100644 index 000000000..5700d57d0 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/familytreenow.json @@ -0,0 +1,50 @@ +{ + "id": "familytreenow", + "name": "FamilyTreeNow", + "category": "people_search", + "priority": "crucial", + "jurisdictions": [ + "US" + ], + "parent": "familytreenow", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.familytreenow.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.familytreenow.com/optout", + "requires": { + "profile_url": false, + "email_verification": false, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name" + ], + "notes": "Notorious free people-search / doxxing site (no registry). Opt-out: search the record, select it, confirm removal on-site. 
No account, free.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Select the exact record from search results, then confirm removal; historically no email step, but re-lists periodically so keep the re-scan scheduled." + ], + "est_processing_days": 2, + "reappearance_risk": "high" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json b/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json new file mode 100644 index 000000000..389464f34 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json @@ -0,0 +1,44 @@ +{ + "id": "fastpeoplesearch", + "name": "FastPeopleSearch", + "category": "people_search", + "priority": "high", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://www.fastpeoplesearch.com/", + "fetch": "browser", + "antibot": "datadome", + "match_signal": "result", + "match_signal_notes": "SEO TRAP: title/H1/intro echoes the query ('Over 100+ FREE public records found for {Name}') with no real match behind it, and /name/{first}-{last} list pages are fuzzy-SURNAME namesakes (different states, no address overlap). Record `found` ONLY on a result CARD corroborated by the subject's address or DOB. 
Ignore templated title/intro/H1 text.", + "by": ["name", "phone", "address"], + "url_patterns": { + "name": "https://www.fastpeoplesearch.com/name/{first}-{last}" + }, + "url_format_quirks": [ + "Name path /name/jane-public (lowercase, hyphen join) was ACCEPTED by the router under challenge, so the name pattern is confirmed even though content never rendered.", + "Sibling of truepeoplesearch (near-identical removal flow/branding); phone pattern is LIKELY /{digits} or /phone/{digits} and address /address/... but UNCONFIRMED - do not assume.", + "MOST aggressively gated of the people-search trio (2026-06-30): both server-side scraping (Firecrawl 504) AND headless browser (Cloudflare -> DataDome) fail on search AND /removal. Treat as fully blocked; needs residential-proxy/stealth browser to scan or opt out." + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.fastpeoplesearch.com/removal", + "requires": { + "profile_url": false, + "email_verification": true, + "captcha": true, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "contact_email"], + "notes": "Opt-out NOT directly observed (2026-06-30): /removal is itself behind DataDome. By sibling-site pattern almost certainly mirrors truepeoplesearch (email-link flow, subject/agent toggle, name+email fields, hCaptcha) - INFERRED, not verified. 
Confirm before acting.", + "est_processing_days": 3, + "reappearance_risk": "high" + }, + "last_verified": "2026-06-30", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/intelius.json b/optional-skills/security/unbroker/references/brokers/intelius.json new file mode 100644 index 000000000..78482811f --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/intelius.json @@ -0,0 +1,95 @@ +{ + "id": "intelius", + "name": "Intelius", + "category": "people_search", + "priority": "crucial", + "jurisdictions": [ + "US" + ], + "parent": "peopleconnect", + "owns": [ + "truthfinder", + "instantcheckmate", + "ussearch", + "zabasearch", + "classmates", + "peoplefinder", + "peoplelookup", + "addresses", + "anywho", + "publicrecords" + ], + "search": { + "method": "url_pattern", + "url": "https://www.intelius.com/", + "fetch": "web_extract", + "match_signal": "result", + "by": [ + "name", + "phone", + "address" + ], + "url_patterns": { + "name": "https://www.intelius.com/people-search/{First}-{Last}/", + "name_state": "https://www.intelius.com/people-search/{first}-{last}/{state-name}/", + "full_report": "https://www.intelius.com/search/?firstName={First}&lastName={Last}&city={City}&state={ST}&traffic%5Bsource%5D=INTSEO" + }, + "url_format_quirks": [ + "Name summary page: /people-search/{First}-{Last}/ (hyphen join; case-insensitive, both Jane-Public and jane-public work). Readable via web_extract (no hard bot gate as of 2026-06-30).", + "State refine: /people-search/{first}-{last}/{state-name}/ with the state SPELLED OUT lowercase, e.g. /jane-public/new-york/ (NOT the 2-letter code).", + "The summary shows last-known address + a 'possible relatives' list + a FAQ ('Where does X live?'). 
Corroborate the subject by address before acting; the 'Top People with the Last Name {Last}' block is unrelated namesakes.", + "Part of PeopleConnect: same data surfaces on Truthfinder/InstantCheckmate/USSearch; one suppression at suppression.peopleconnect.us covers the cluster." + ] + }, + "optout": { + "tier": "T1", + "method": "web_form", + "url": "https://suppression.peopleconnect.us/login", + "email": "privacy@peopleconnect.us", + "requires": { + "profile_url": false, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false, + "dob": true + }, + "inputs": [ + "contact_email", + "full_name", + "date_of_birth" + ], + "deletion": { + "via": "in_flow", + "prefer": false, + "url": "https://suppression.peopleconnect.us/guided-mode", + "email": "privacy@peopleconnect.us", + "kinds": ["ccpa", "generic"], + "notes": "INVERTED for PeopleConnect: do NOT use 'Right to Delete / DELETE MY USER DATA' if the goal is staying out of search results. Per their privacy-center, deleting your user data ALSO deletes any suppressions you have, and deletion does NOT stop the people-search sites from showing you (public-records-sourced data re-lists). Suppression is the do-not-display list, so it is the effective lever and must be maintained. Use deletion ONLY if the goal is purging the account data they hold, accepting that you will re-list and must re-suppress. privacy@peopleconnect.us is the rights-request address for that data-purge path." + }, + "playbook": [ + "PeopleConnect portal (suppression.peopleconnect.us/login, privacy-center entry at /privacy-center) -- ONE flow here covers Truthfinder, Instant Checkmate, US Search, ZabaSearch, Classmates and ~15 more. DO THIS PARENT FIRST.", + "Step 1 asks ONLY for an email + consent checkbox (no name/DOB, no CAPTCHA) -> sends a verification email. Least-disclosure entry: just the contact email.", + "poll-verification will pick up the verify link. 
The link is a JWT (aud PeopleConnect-email-login then -registration), carries a deviceId, has a ~15-min TTL, and is Cloudflare-gated; it authenticates a SESSION bound to the browser that OPENS it. The SAME agent browser that submitted step 1 must open the link and drive guided-mode straight through. Do NOT hard-navigate to /guided-mode after auth -- that drops the in-memory session and bounces to /login. If the session is lost, re-request a fresh verify email and follow it through without navigating away.", + "guided-mode is a 5-STEP IDENTITY GATE, not a one-click suppress: (1) enter contact email + consent -> verify email; (2) open the verify link in the SAME browser (session/device-bound); (3) enter identity details -- this HARD-REQUIRES date of birth (immutable once saved, no skip) plus legal name; (4) Matching Records -- select the record that describes you, corroborating by address/email/phone, NOT name+DOB alone (namesakes exist); the matched record often aggregates MORE identifiers than the public listing showed (extra emails/addresses) -- expected, not alarming; (5) complete the SUPPRESSION action. So this opt-out discloses DOB + legal name + alias beyond the contact email -- collect DOB at intake (requires.dob=true) or expect a mid-flow pause.", + "SUPPRESS, do NOT delete (this cluster is the exception to 'deletion beats suppression'). In guided-mode, complete the SUPPRESSION flow -- it puts you on the do-not-display list, which is what actually removes you from Intelius/TruthFinder/etc. Their privacy-center states: deleting your user data 'must delete any and all suppressions associated with your user', and 'Deleting your user information will NOT prevent other users from searching for your information through the people search websites. To suppress your information ... 
you must maintain your user information on file with the Suppression Center.'", + "Therefore do NOT press 'Right to Delete / DELETE MY USER DATA' if the goal is search-visibility removal: it wipes your suppression and the public-records listing re-appears. Use the delete button ONLY if the operator's explicit goal is purging held account data (accept re-listing + re-suppression).", + "Keep the account/suppression on file; do not delete it later. If the portal breaks: sister addresses privacy@intelius.com / privacy@truthfinder.com / privacy@instantcheckmate.com / support@ussearch.com / privacy@classmates.com; phone 1-888-245-1655.", + "After suppression confirms, re-scan the covered children (they normally drop out) before submitting any duplicate child opt-out." + ], + "notes": "PeopleConnect portal covers the cluster via SUPPRESSION (maintained), not deletion (see the deletion lane note: delete removes suppressions and does not stop public-records re-listing). Authorized-agent requests: signed written authorization (full name, address, phone, the email the consumer uses) or POA; for Right-to-Delete they verify agent authority with the consumer by email. Verified from the live privacy policy + suppression privacy-center 2026-07-02.", + "quirks": [ + "Step 1 (suppression.peopleconnect.us/login) asks ONLY for an email + a consent checkbox, then 'Continue' -> a verification email with a link. No CAPTCHA, no name/DOB at step 1. Least-disclosure entry: just the contact email. Verified live 2026-06-30.", + "The verification link authenticates a SESSION and lands on /guided-mode. That session is bound to the browser that OPENED it; a different browser hitting /guided-mode is redirected back to /login. So for hands-off automation the SAME agent browser must open the verify link (Mode B: read inbox -> agent browser navigates the link -> drive guided-mode). Link is a JWT (aud PeopleConnect-email-login -> -registration) carrying a deviceId, ~15-min TTL, Cloudflare-gated. 
Do NOT hard-navigate to /guided-mode after auth (drops the in-memory session -> /login); if lost, re-request a fresh verify email and follow it straight through.", + "DOB GATE: guided-mode hard-requires date of birth (immutable once saved, no skip) to match records, so requires.dob=true. DOB is not collected at intake by default (sensitive, unneeded for scanning). If absent, the planner pre-warns (needs_operator_input) that this broker needs a human touchpoint; collect it with `intake --dob` up front to run hands-off. The matching step discloses DOB + legal name + alias beyond the contact email -- corroborate the record by address/email/phone, never name+DOB alone.", + "INVERTED delete/suppress: SUPPRESSION is the do-not-display list and is what removes you from the people-search sites; it requires keeping your identifiers on file. 'DELETE MY USER DATA' deletes those suppressions and does NOT stop the sites showing you (public records re-list). Verbatim from the privacy-center: deleting user data 'must delete any and all suppressions associated with your user'; and 'Deleting your user information will NOT prevent other users from searching for your information ... To suppress your information ... you must maintain your user information on file with the Suppression Center.' So prefer suppression; use delete only for a deliberate data-purge. Verified live 2026-07-02.", + "Their published request metrics (2025): 33,513 deletion requests, median response < 1 day -- deletion is fast, but per above it is the wrong lever for search-visibility on this cluster." 
+ ], + "est_processing_days": 7, + "reappearance_risk": "medium" + }, + "last_verified": "2026-07-02", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/mylife.json b/optional-skills/security/unbroker/references/brokers/mylife.json new file mode 100644 index 000000000..cdb739c19 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/mylife.json @@ -0,0 +1,35 @@ +{ + "id": "mylife", + "name": "MyLife", + "category": "people_search", + "priority": "crucial", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://www.mylife.com", + "fetch": "browser", + "match_signal": "profile", + "by": ["name"] + }, + "optout": { + "tier": "T3", + "method": "phone", + "url": "https://www.mylife.com/privacyrequest", + "requires": { + "profile_url": true, + "email_verification": false, + "captcha": false, + "gov_id": true, + "account": false, + "phone_callback": false, + "phone_voice": true, + "payment": false + }, + "inputs": ["full_name", "profile_url"], + "notes": "Often pushes a driver's-license upload or a call to (888) 704-1900. Emailing privacy@mylife.com with name + profile link is an alternative. 
Also covers Wink.com.", + "est_processing_days": 14, + "reappearance_risk": "high" + }, + "last_verified": "2026-06-28", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/nuwber.json b/optional-skills/security/unbroker/references/brokers/nuwber.json new file mode 100644 index 000000000..f8d4424a3 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/nuwber.json @@ -0,0 +1,52 @@ +{ + "id": "nuwber", + "name": "Nuwber", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "nuwber", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://nuwber.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://nuwber.com/removal/link", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "contact_email", + "profile_url" + ], + "notes": "People-search. Opt-out: submit the profile URL + email at /removal/link, confirm via the emailed link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url (paste the listing URL you recorded).", + "Email verification required." 
+ ], + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/peekyou.json b/optional-skills/security/unbroker/references/brokers/peekyou.json new file mode 100644 index 000000000..0c46810dc --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/peekyou.json @@ -0,0 +1,53 @@ +{ + "id": "peekyou", + "name": "PeekYou", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "peekyou", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.peekyou.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.peekyou.com/about/contact/optout", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name", + "contact_email", + "profile_url" + ], + "notes": "Aggregates social/web profiles. Opt-out: paste your PeekYou profile URL(s), pick a reason, provide email, confirm the link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed PeekYou profile_url.", + "Email verification required." 
+ ], + "est_processing_days": 7, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/peoplefinders.json b/optional-skills/security/unbroker/references/brokers/peoplefinders.json new file mode 100644 index 000000000..6187f0c14 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/peoplefinders.json @@ -0,0 +1,53 @@ +{ + "id": "peoplefinders", + "name": "PeopleFinders", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "peoplefinders", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.peoplefinders.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.peoplefinders.com/opt-out", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name", + "contact_email", + "profile_url" + ], + "notes": "Standalone people-search (Confi-Chek family). Opt-out: find the listing, submit with a contact email, confirm via the emailed link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url from evidence.", + "Email verification: poll-verification picks up the link; open it in the agent browser." 
+ ], + "est_processing_days": 7, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/radaris.json b/optional-skills/security/unbroker/references/brokers/radaris.json new file mode 100644 index 000000000..dbfb29a9a --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/radaris.json @@ -0,0 +1,58 @@ +{ + "id": "radaris", + "name": "Radaris", + "category": "people_search", + "priority": "crucial", + "jurisdictions": [ + "US" + ], + "search": { + "method": "url_pattern", + "url": "https://radaris.com/", + "fetch": "web_extract", + "match_signal": "View Profile", + "by": [ + "name", + "phone", + "address" + ], + "url_patterns": { + "name": "https://radaris.com/p/{First}/{Last}/" + }, + "url_format_quirks": [ + "Name profile page: /p/{First}/{Last}/ with First/Last Capitalized and a TRAILING slash, e.g. /p/Jane/Public/ . Readable via web_extract (no hard bot gate as of 2026-06-30).", + "The page aggregates: a 'phone numbers & home addresses' table (the DIRECT hit for the subject), a relatives/namesakes carousel ('Review the potential relatives'), and resume/CV records. The subject's removable row is in the address table; the carousel entries are OTHER people - disambiguate by address/age before acting.", + "Page is noisy with testimonials/reviews boilerplate; the signal blocks are 'phone numbers & home addresses N' and 'resumes & CV records N'." 
+ ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://radaris.com/control-privacy", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": true, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "profile_url", + "contact_email" + ], + "notes": "Multi-step control-privacy wizard: step 1 NEXT -> step 2 'identify your personal page' (paste the NUMBERED profile URL into the 'Or Enter URL of your page' field; typing reveals a NEXT submit button) -> then user_email + Google reCAPTCHA -> emailed verification link. If there is no View Profile button for the subject, email customer-service@radaris.com and reply to the auto-response until removed.", + "email": "customer-service@radaris.com", + "quirks": [ + "CAPTCHA: the form carries a Google reCAPTCHA (hidden field g-recaptcha-response) plus anti-bot fingerprint tokens (jfp, token). Tier is T2 without a captcha-clearing browser backend (T1 with Browserbase). (Record previously mis-declared captcha:false.)", + "The /control-privacy form REQUIRES the per-person NUMBERED profile URL. Pasting the aggregate /p/{First}/{Last}/ URL is rejected with the validation error: 'The URL must include unique number at the end'. The real removable URL looks like https://{region}.radaris.com/person/~First-Last/1234567890 (see the field placeholder).", + "The subject's own record may appear ONLY as a static row in the 'phone numbers & home addresses' table with NO 'View Profile' link, so no numbered profile URL is exposed for them (the /p/{First}/{Last}/ page deep-links only to relatives/namesakes via JS onclick, not static hrefs). When that happens the /control-privacy form cannot be used -- do NOT fabricate or submit a relative's or namesake's profile URL. 
Fall back to email: write customer-service@radaris.com with the subject's own listed details and request removal, replying to the auto-response until confirmed.", + "Pressing Enter in the URL field reloads the wizard to step 1 (does not submit); type into the field and click the revealed NEXT button instead." + ], + "est_processing_days": 14, + "reappearance_risk": "high" + }, + "last_verified": "2026-06-30", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/rehold.json b/optional-skills/security/unbroker/references/brokers/rehold.json new file mode 100644 index 000000000..c8e633957 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/rehold.json @@ -0,0 +1,48 @@ +{ + "id": "rehold", + "name": "Rehold", + "category": "property_records", + "priority": "long_tail", + "jurisdictions": [ + "US" + ], + "search": { + "method": "url_pattern", + "url": "https://rehold.com/", + "fetch": "browser", + "match_signal": "result", + "match_signal_notes": "PROPERTY-RECORD, NOT PII. An address match here shows only PUBLIC PROPERTY RECORDS (build year, beds/baths, last sale price, incident history). Resident/owner NAMES sit behind 'View full report', which leads to a paywall/signup, so no personal PII is publicly exposed. Public property records are NOT removable. Record `found` ONLY if a resident NAME matching the subject is publicly displayed on the free page; an address-only match is `not_found` (nothing to opt out of).", + "access": "paywall", + "by": [ + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://rehold.com/optout", + "requires": { + "profile_url": true, + "email_verification": false, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "profile_url" + ], + "notes": "Address-anchored property/reverse-address site. 
Only pursue an opt-out if the scan found a publicly displayed resident NAME for the subject (see match_signal_notes); a bare public property record is not personal PII and is not removable. If the subject's personal profile IS shown, submit the profile URL to the opt-out endpoint and confirm the live flow in a residential browser before the first submission, then set last_verified.", + "quirks": [ + "Distinguish 'address exists in a public property DB' (non-removable) from 'the subject's personal profile is displayed' (removable). Only the latter is an actionable exposure.", + "'View full report' is a paywall/signup, not proof of a public listing.", + "Opt-out endpoint UNVERIFIED: confirm the live flow before the first submission." + ], + "est_processing_days": 3, + "reappearance_risk": "low" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/searchpeoplefree.json b/optional-skills/security/unbroker/references/brokers/searchpeoplefree.json new file mode 100644 index 000000000..47be96535 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/searchpeoplefree.json @@ -0,0 +1,53 @@ +{ + "id": "searchpeoplefree", + "name": "SearchPeopleFree", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "searchpeoplefree", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.searchpeoplefree.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.searchpeoplefree.com/opt-out", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name", + "contact_email", + "profile_url" + ], + "notes": "Free 
people-search. Opt-out: find the listing, submit with an email, confirm the link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url.", + "Email verification required." + ], + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/spokeo.json b/optional-skills/security/unbroker/references/brokers/spokeo.json new file mode 100644 index 000000000..d63c0f101 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/spokeo.json @@ -0,0 +1,56 @@ +{ + "id": "spokeo", + "name": "Spokeo", + "category": "people_search", + "priority": "crucial", + "jurisdictions": ["US"], + "parent": "spokeo", + "owns": ["freepeopledirectory"], + "search": { + "method": "url_pattern", + "url": "https://www.spokeo.com/search", + "fetch": "browser", + "match_signal": "profile", + "by": ["name", "phone", "email", "address"] + }, + "optout": { + "tier": "T1", + "method": "web_form", + "url": "https://www.spokeo.com/optout", + "email": "privacy@spokeo.com", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["profile_url", "contact_email"], + "deletion": { + "via": "email_followup", + "email": "privacy@spokeo.com", + "kinds": ["ccpa", "generic"], + "notes": "privacy@spokeo.com is the documented direct privacy contact (verified live on /optout 2026-07-01). Use for full CCPA deletion beyond listing opt-out, for listings the form rejects, and when more listings keep surfacing." + }, + "playbook": [ + "Opt-out form at spokeo.com/optout -- clears FreePeopleDirectory. 
Inputs: the confirmed profile URL + contact email; the form emails a confirmation link (poll-verification picks it up; open it in the agent's own browser).", + "EVERY LISTING SEPARATELY: 'you may have multiple listings on Spokeo. Each one is identified by a unique URL and must be opted out individually' (their own wording). Run ALL search vectors first, collect every listing URL, and submit one opt-out per URL.", + "Fast: requests process in 24-48 hours per their page -- re-scan after 2 days and record the real outcome.", + "FULL DELETION follow-up: send-email --kind ccpa (CA) / generic to privacy@spokeo.com naming all listing URLs -- covers data retained beyond the free-search suppression.", + "Paid/account data may be retained even when hidden from free search -- verify actual removal via re-scan; never mark confirmed_removed off the free-search view alone." + ], + "notes": "Form + privacy@spokeo.com verified live 2026-07-01. Their page: opting out does not remove data from original sources and listings may reappear as new public records arrive -- keep the reappearance re-scan scheduled.", + "quirks": [ + "Profile URL formats the form accepts: listing URL like spokeo.com/{First}-{Last}/{City}/{ST}/p{id} or a purchase URL spokeo.com/purchase?q=... (examples shown on /optout).", + "Multiple listings per person are NORMAL (one per name x location x record cluster); each unique URL must be opted out individually -- feed every found listing URL through the form.", + "Processing is 24-48h ('depending on the nature of your request and the amount of data').", + "Paid accounts may retain data even when hidden from free search - verify actual removal, don't mark confirmed_removed off the free-search view alone." 
+ ], + "est_processing_days": 2, + "reappearance_risk": "medium" + }, + "last_verified": "2026-07-01", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/thatsthem.json b/optional-skills/security/unbroker/references/brokers/thatsthem.json new file mode 100644 index 000000000..dd6113313 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/thatsthem.json @@ -0,0 +1,46 @@ +{ + "id": "thatsthem", + "name": "That's Them", + "category": "people_search", + "priority": "crucial", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://thatsthem.com/", + "fetch": "browser", + "match_signal": "result", + "by": ["name", "phone", "email", "address"], + "url_patterns": { + "name": "https://thatsthem.com/name/{First}-{Last}/{City}-{ST}", + "phone": "https://thatsthem.com/phone/{areacode}-{prefix}-{line}", + "email": "https://thatsthem.com/email/{email}", + "address": "https://thatsthem.com/address/{Street}-{City}-{ST}-{ZIP}" + }, + "url_format_quirks": [ + "ADDRESS path is fully hyphen-joined: street+city+state+ZIP are all joined with hyphens (e.g. /address/123-Main-St-Anytown-NY-12345), and it REQUIRES the ZIP. The older slash form (/address/Street/City-ST) and any ZIP-less form 404.", + "NAME, PHONE, and EMAIL paths do NOT need a ZIP; only the NAME path takes a city/state segment, separated from the name by a slash: /name/First-Last/City-ST, /phone/AAA-PPP-LLLL, /email/addr@host.", + "Street abbreviations are kept verbatim (Ave, Pl, St) - do NOT expand to Avenue/Place/Street; expanding 404s.", + "A 404 on a constructed URL means WRONG PATTERN, not 'no record present'. Treat as INCONCLUSIVE: fall back to the on-site search box (browser_type into the address/name field + submit) and read the resulting canonical URL."
+ ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://thatsthem.com/optout", + "requires": { + "profile_url": false, + "email_verification": false, + "captcha": true, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "street", "city", "state", "postal", "contact_email", "phone"], + "notes": "Opt-out form is gated by a Cloudflare Turnstile CAPTCHA (so tier is T2 without a captcha-clearing browser backend; T1 with Browserbase). All 7 fields are marked required by the form (Full Name, Street Address, City, State, ZIP, Email, Phone). Site sends a confirmation email and states ~72h processing (this is a completion notice, not a click-to-verify link). Least-disclosure tension: the form DEMANDS email+phone even though a public record may show less - disclose only to remove a record that is actually about the subject. Do not click the Spokeo identity-theft-protection link (paid product).", + "est_processing_days": 3, + "reappearance_risk": "low" + }, + "last_verified": "2026-06-30", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json b/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json new file mode 100644 index 000000000..1ce9bc7b1 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json @@ -0,0 +1,44 @@ +{ + "id": "truepeoplesearch", + "name": "TruePeopleSearch", + "category": "people_search", + "priority": "high", + "jurisdictions": ["US"], + "search": { + "method": "url_pattern", + "url": "https://www.truepeoplesearch.com/", + "fetch": "browser", + "antibot": "datadome", + "match_signal": "result", + "match_signal_notes": "SEO TRAP: the page title/H1/intro auto-inserts the query ('FREE public records found for {Name} in {City}') even with ZERO real matches. That templated echo is NOT a result. 
Record `found` ONLY on an actual result CARD corroborated by the subject's address or DOB; unrelated same-name cards in other states are namesakes. Ignore the title/intro/H1 text entirely.", + "by": ["name", "phone", "address", "email"], + "url_patterns": { + "name": "https://www.truepeoplesearch.com/results?name={First%20Last}&citystatezip={City,%20ST}" + }, + "url_format_quirks": [ + "Search results URL is QUERY-PARAM, not path: /results?name=Jane%20Public&citystatezip=Anytown,%20NY (space-encoded; comma between city and state). Confirmed as the canonical form the site's own search box generates.", + "On-site search tabs: Name / Phone / Address / Email / Neighbors (so name-, phone-, address-, and email-searchable).", + "HARD-BLOCKED for automated reads (2026-06-30): Cloudflare 'Just a moment...' interstitial -> DataDome device-check CAPTCHA (geo.captcha-delivery.com) on every results navigation. Page chrome (header/footer) may render while the result body stays blocked; submitting any search bounces to DataDome. web_extract/Firecrawl 504-timed out. Needs a residential-proxy/stealth browser (e.g. Browserbase) to scan; otherwise record 'blocked'." + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.truepeoplesearch.com/removal", + "requires": { + "profile_url": false, + "email_verification": true, + "captcha": true, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "contact_email"], + "notes": "Removal page renders even when results are blocked. Flow: name+email+captcha -> emailed link -> fill opt-out form matching the record -> confirmation ('allow 3 days'). Fields: dropdown 'Exercise my rights as a: The subject of the request / An authorized agent of the subject', First Name*, Middle Name, Last Name*, Email Address*, consent checkbox. CAPTCHA-gated: hCaptcha ('I am human'). Contact support@truepeoplesearch.com; PO Box 7775 PMB 29296, San Francisco CA 94120-7775. 
Verified read-only 2026-06-30; not submitted. (Record previously mis-declared email_verification:false.)", + "est_processing_days": 3, + "reappearance_risk": "high" + }, + "last_verified": "2026-06-30", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/brokers/usphonebook.json b/optional-skills/security/unbroker/references/brokers/usphonebook.json new file mode 100644 index 000000000..ff1d0082b --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/usphonebook.json @@ -0,0 +1,53 @@ +{ + "id": "usphonebook", + "name": "USPhoneBook", + "category": "people_search", + "priority": "high", + "jurisdictions": [ + "US" + ], + "parent": "usphonebook", + "owns": [], + "search": { + "method": "url_pattern", + "url": "https://www.usphonebook.com/", + "fetch": "browser", + "match_signal": "result", + "antibot": "cloudflare", + "by": [ + "name", + "phone", + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://www.usphonebook.com/opt-out", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "full_name", + "contact_email", + "profile_url" + ], + "notes": "Reverse-phone + people-search. Opt-out: paste the listing URL, provide an email, confirm via the emailed link.", + "quirks": [ + "Opt-out URL is the documented public endpoint; datacenter IPs get 403 (anti-bot), so confirm the live flow via the operator's residential browser before the first submission, then set last_verified.", + "Needs the confirmed profile_url.", + "Email verification required." 
+ ], + "est_processing_days": 3, + "reappearance_risk": "medium" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/whitepages.json b/optional-skills/security/unbroker/references/brokers/whitepages.json new file mode 100644 index 000000000..ddc6a0362 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/whitepages.json @@ -0,0 +1,57 @@ +{ + "id": "whitepages", + "name": "Whitepages", + "category": "people_search", + "priority": "crucial", + "jurisdictions": ["US"], + "parent": "whitepages", + "owns": ["411"], + "search": { + "method": "url_pattern", + "url": "https://www.whitepages.com/", + "fetch": "browser", + "match_signal": "listing", + "by": ["name", "phone", "address"] + }, + "optout": { + "tier": "T1", + "method": "email", + "url": "https://www.whitepages.com/suppression_requests", + "email": "privacyrequest@whitepages.com", + "requires": { + "profile_url": true, + "email_verification": true, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": ["full_name", "contact_email", "profile_url"], + "deletion": { + "via": "email", + "email": "privacyrequest@whitepages.com", + "url": "https://whitepagesprivacy.zendesk.com/hc/en-us/requests/new", + "kinds": ["ccpa", "generic"], + "notes": "privacyrequest@whitepages.com handles BOTH opt-out (removal from the site) and CCPA deletion requests, explicitly offered for people who do not want to provide a phone number for the automated tool. ~2 business day reply; opt-outs processed within 15 days. Verified from whitepages.com/privacy/consumer-rights 2026-07-01 (page dated 2026-06-22)." + }, + "playbook": [ + "EMAIL LANE (fully autonomous -- use this): send the removal + deletion request to privacyrequest@whitepages.com including the confirmed listing URL. 
This is Whitepages' own documented alternative 'if you would prefer not to provide a phone number'. Expect a reply within ~2 business days; they may ask identity-verification questions (name, email) -- answer with least-disclosure, never an ID number.", + "Include in the email: the listing URL(s), the subject's full name as listed, and the request to (a) remove the listing(s), (b) opt out of sale/sharing, and (c) delete personal data (CCPA 1798.105 for CA residents; they honor requests from other states per their consumer-rights page).", + "'Once a listing is removed, all known connected listings are also removed and the requester's information will not be sold by Whitepages in any capacity' -- one request covers connected listings; still re-scan afterward, and check Whitepages Premium + 411.com separately.", + "Alternative 1: webform at whitepagesprivacy.zendesk.com/hc/en-us/requests/new (same ~2-day handling; good fallback if the mailbox bounces).", + "Alternative 2 (only with an operator on the phone): the automated tool at whitepages.com/suppression_requests -- paste the listing URL, provide a phone number, answer the automated voice call and enter the 4-digit code. Fastest (est 1 day) but NOT autonomous.", + "Opt-out requests may take up to 15 days; verify with a re-scan before recording confirmed_removed." + ], + "notes": "Email/webform lane verified 2026-07-01: privacyrequest@whitepages.com or the Zendesk form replace the phone-callback tool ('if you would prefer not to provide a phone number... submit a request to a customer service agent'). Authorized agents: written signed permission required. General privacy contact: support@whitepages.com.", + "quirks": [ + "The automated suppression tool (whitepages.com/suppression_requests) REQUIRES a phone number + automated voice call with a 4-digit code + agreeing to their ToS -- that lane is phone_callback/T2. 
The email/webform lane exists precisely to avoid it; prefer email for autonomy.", + "Deletion exceptions they state: public records (property, court), fraud-prevention data, active-subscription data, legal holds. 'Hidden from search' vs 'deleted' distinction applies -- their opt-out removes the listing; the CCPA deletion covers non-public account data.", + "CCPA opt-out requests: up to 15 days to process. Right-to-know/access requests also via privacyrequest@whitepages.com or the Zendesk form." + ], + "est_processing_days": 15, + "reappearance_risk": "medium" + }, + "last_verified": "2026-07-01", + "source": "BADBOOL" +} diff --git a/optional-skills/security/unbroker/references/legal/ccpa.md b/optional-skills/security/unbroker/references/legal/ccpa.md new file mode 100644 index 000000000..475a7ce01 --- /dev/null +++ b/optional-skills/security/unbroker/references/legal/ccpa.md @@ -0,0 +1,27 @@ +# CCPA / CPRA (California) + +Use for California residents (`residency_jurisdiction` starts with `US-CA`) and, in practice, many US +brokers that honor CCPA-style requests nationwide. + +## Rights invoked + +- **Delete** personal information (Cal. Civ. Code 1798.105). +- **Opt out** of sale/sharing of personal information (1798.120). + +## Request content + +Render with `legal.render_request("ccpa", broker, fields)` -> `templates/emails/ccpa-deletion.txt`. +Include only: full legal name, the contact email for correspondence, and the confirmed listing +URL(s). Do **not** include SSN or government IDs. + +## Authorized agent + +When acting for another consenting subject, use `render_request("ccpa_agent", ...)` +(`templates/emails/ccpa-authorized-agent.txt`) and attach the authorization artifact recorded in the +dossier (`consent.authorization_artifact`). The broker may separately verify the consumer's identity. + +## Notes + +- Brokers must respond within 45 days (extendable). Track as `awaiting_processing` until confirmed. 
+- "Hidden from free search" is not deletion - verify the record is actually gone before + `confirmed_removed`. diff --git a/optional-skills/security/unbroker/references/legal/drop.md b/optional-skills/security/unbroker/references/legal/drop.md new file mode 100644 index 000000000..3b96333fc --- /dev/null +++ b/optional-skills/security/unbroker/references/legal/drop.md @@ -0,0 +1,34 @@ +# California DROP portal (highest-leverage lever) + +The California **Delete Request and Opt-out Platform** (`privacy.ca.gov/drop`) lets a California +resident demand deletion from **every registered data broker** with a single verified request, for +free. DROP is **live** (as of 2026); registered brokers must begin processing requests on +**2026-08-01**. The registered universe is the **California Data Broker Registry** (~545 brokers in +2025), which this skill ingests as its own coverage lane (`pdd.py registry`); one DROP request covers +all of them, which is how this skill reaches (and exceeds) the breadth of commercial services. + +## When to use + +For any subject with `residency_jurisdiction` starting `US-CA`, sequence DROP **first**: `pdd.py next` +surfaces a single `drop_submit` action covering the whole registry. Then handle the individual +people-search sites (which are also worked directly because they hold free, indexed listings). After +filing, run `pdd.py drop <subject> --filed` so the loop stops re-surfacing it. For non-CA subjects +DROP does not apply; cover the registry brokers with targeted CCPA/GDPR deletion emails +(`pdd.py registry --search`, then `pdd.py send-email`). + +## Flow (agent-assisted, mostly human verification) + +1. The operator creates/verifies a DROP account (identity verification is required by the state; this + is a human step - `human_task_queued`). +2. Submit one deletion request covering all registered brokers. +3. Record a single ledger case `case_<subject>_drop` to track it; mark `submitted` -> + `awaiting_processing`. 
Registered brokers must process deletions on the state's schedule. +4. After the DROP cycle, re-scan the people-search long tail and only act on sites still showing data. + +## Caveats + +- DROP covers **registered data brokers**, not every people-search site. Keep doing the individual + opt-outs for non-registered sites. +- Identity verification means parts of this cannot (and should not) be fully automated. +- FCRA-regulated brokers (flagged in the registry, `optout.fcra`) hold consumer-report data with + separate rules; deletion may be limited and a dispute or security-freeze may apply instead. diff --git a/optional-skills/security/unbroker/references/legal/gdpr.md b/optional-skills/security/unbroker/references/legal/gdpr.md new file mode 100644 index 000000000..d00d54693 --- /dev/null +++ b/optional-skills/security/unbroker/references/legal/gdpr.md @@ -0,0 +1,20 @@ +# GDPR / UK-GDPR (roadmap - Phase 3) + +For EU/UK subjects. Not part of the P0 US-first scope; templates and routing land in Phase 3. + +## Rights invoked + +- **Erasure** ("right to be forgotten") - Article 17. +- **Object** to processing - Article 21. + +## Request content + +Render with `legal.render_request("gdpr", broker, fields)` -> +`templates/emails/gdpr-erasure.txt`. Address the controller's privacy/DPO contact. Include the data +subject's name, the contact email, and the listing URL(s); cite Article 17. + +## Notes + +- Controllers must respond within one month (Article 12(3)). +- EU-specific brokers and portals (e.g. Acxiom's EU consumer portals) are added in Phase 3 with + `jurisdictions: ["EU"]` records and residency-aware routing. 
diff --git a/optional-skills/security/unbroker/references/methods.md b/optional-skills/security/unbroker/references/methods.md new file mode 100644 index 000000000..1eb167483 --- /dev/null +++ b/optional-skills/security/unbroker/references/methods.md @@ -0,0 +1,304 @@ +# Opt-out method playbooks + +How the agent executes each broker `optout.method` using native Hermes tools. Always obey the +**verify-before-disclose** rule: confirm a real listing exists, then submit only the fields the broker +requires (`pdd.py plan` lists them per broker). + +**Autonomy:** `pdd.py next <subject>` sequences all of this - it decides which method applies, orders +parents first, and routes human-only work to the digest. In `autonomy=full` (default), execute its +actions without pausing per submission; the consent recorded at intake is the authorization. These +playbooks are the HOW for each action type. + +## Scan ladder (all methods) + +Confirm exposure before acting, cheapest first. Run **every** `search_vectors` entry from `pdd.py +plan` (each name x location, phone, email, and address the broker's `search.by` supports) - different +vectors surface different listings for the same person; dedupe found URLs. + +1. `web_extract` on the broker `search.url` (fast HTML -> markdown). Look for `search.match_signal`. + Build per-vector URLs from `search.url_patterns` and heed `search.url_format_quirks` (see below). +1b. **`site:` search-engine probe (cheap, do it early and in parallel).** `web_search` with + `site:<broker-domain> "First Last"` (add a city/ZIP or a unique phone/address to cut namesake + noise) often returns the **exact profile-slug URL** in one shot - which both confirms the listing + exists AND hands you the opaque `/find/person/<id>` or `/p/<slug>` URL you'd otherwise have to + derive. 
Two big wins seen in the field: (a) it disambiguates namesakes fast - the SERP snippet + shows age/city so you can tell the subject from a same-name relative before fetching anything; and + (b) a broad `"First Last" <ZIP OR unique-address>` search (no `site:`) surfaces **brokers not yet in + your DB** (e.g. information.com, peoplefinders.com) - record those as bonus exposures. Note: empty + `site:` results are INCONCLUSIVE (many broker pages aren't indexed / are `noindex`), not `not_found`. +2. If the page is JS-rendered or returns nothing useful, `browser_navigate` + `browser_snapshot` + (and `browser_type`/`browser_click` to run the site's search box). +3. If blocked by stealth/Cloudflare, use the `scrapling` skill via `terminal`. **If the broker record + has `search.antibot` set (e.g. `datadome`), results are behind a device-check CAPTCHA**: a + cloud/stealth browser (Browserbase) or `scrapling` may get through; if none is available, do **not** + burn attempts - `pdd.py record <subject> <broker> blocked` and move on (a re-scan with a stealth + backend can pick it up later). +3b. **Operator-browser path (the reliable unblock for anti-bot sites).** Cloudflare/DataDome key on + datacenter IPs + headless fingerprints, so `web_extract`, the proxyless agent browser, and even a + cloud browser often fail - but the **operator's own everyday browser (residential IP, real + fingerprint) sails straight through**. For any `blocked` site, hand the operator a paste-ready + search URL (built from `search.url_patterns`), give them the identity anchors to judge by (current + + prior addresses, age, a distinguishing detail) and the namesake/relative watch-list, and ask for + the verdict or a screenshot (the agent can read screenshots). This is a **first-class scan path, not + a fallback** - treat the operator's live check as authoritative and record the real verdict + (`found` / `not_found` / `indirect_exposure`), citing `scanned_via: operator_browser`. 
Same for + opt-out forms the agent's browser can't reach: guide the operator field-by-field (least-disclosure), + pausing before submit. (This is exactly why the same trick clears email-verification links the agent + can't open - see the Verification loop.) +4. Capture evidence: save listing URLs and a `browser` screenshot into the subject's `evidence/` dir, + then `pdd.py record <subject> <broker> found --found true --evidence '{"listing_urls":[...]}'`. + +If a listing genuinely does not exist: `pdd.py record <subject> <broker> not_found` and move on. + +### A 404 (or empty body) is INCONCLUSIVE, not "not_found" + +A constructed search URL that 404s almost always means the **URL pattern is wrong**, not that the +person is absent. Never record `not_found` off a 404. Instead: + 1. Re-check the broker's `search.url_patterns` / `url_format_quirks` and rebuild the URL. + 2. Fall back to the **on-site search box**: `browser_navigate` to the search page, `browser_type` + the raw query, `browser_click` Search, then read the **canonical result URL** the site lands on. + 3. Only after the site's own search returns an empty result set do you record `not_found`. + 4. If a pattern was wrong, fix it in `references/brokers/<id>.json` (`url_patterns` + + `url_format_quirks`) so the next run is correct - see the rule below. + +### Log URL/format quirks for every site you scrape + +Whenever you discover how a broker's URLs are actually shaped (path layout, hyphen-vs-slash joins, +whether ZIP is required, abbreviation handling, query-param search, anti-bot gating), record it in +that broker's `references/brokers/<id>.json` under `search.url_patterns` (the templates) and +`search.url_format_quirks` (the gotchas, including which forms 404). Bump `last_verified`. This makes +the deterministic URL path reliable across runs and subjects instead of rediscovered each time. 
If the +opt-out form's real requirements differ from the record (extra required fields, a CAPTCHA, an account), +fix `optout.requires` / `optout.inputs` / `optout.tier` too - those drive tier selection and +least-disclosure. Log opt-out mechanics gotchas (a broker that needs a profile URL but doesn't expose +one for the subject, an email-only fallback, an authorized-agent toggle) in `optout.quirks` - the +planner surfaces these as `optout_quirks` per broker. Example: Radaris sometimes shows the subject only +as a static address-table row with no "View Profile" link, so `/control-privacy` (which needs a profile +URL) can't be used - fall back to `optout.email` rather than submitting a namesake's URL. + +### Distinguish the subject from namesakes and relatives + +People-search sites are dense with namesakes and family clusters. Before recording `found`, confirm the +record is the **subject themselves** (corroborate via DOB, a known current/prior address, or the +identifier you searched). Two non-removable patterns to record as evidence but NOT as the subject's own +listing: + - **Namesake:** same name, different person (different DOB/location with no overlap). Not the subject. + - **Relative record:** the listing is about a *different* person (a relative) and merely *names* the + subject in a "Family" field, or carries the subject's email/phone as a secondary datum. This is a + third party's record - the consent gate correctly blocks acting on it. See "Indirect exposure" in + the web_form section for what the subject *can* still request. + +Two more false-positive traps that a naive scan records as `found` when it should not: + - **Property record != PII (address-anchored sites).** Reverse-address / property sites (rehold, + clustrmaps-style) can match on a public **property record** (build year, beds/baths, last sale + price, incidents) without exposing the subject's personal info - the resident/owner NAME is behind + a "View full report" paywall/signup. 
Distinguish "this address exists in a public property DB" + (non-removable, `not_found`) from "the subject's personal profile is displayed" (removable, + `found`). Record `found` ONLY if a resident name matching the subject is publicly shown; an + address-only match is `not_found` - there is nothing to opt out of, and public property records are + not removable anyway. See `rehold.json` `search.match_signal_notes`. + - **SEO-templated title/H1 fakes a "found".** Many people-search sites auto-insert the query into the + page `<title>`, H1, and intro copy ("FREE public records found for {Name} in {City}", "Over 100+ + FREE public records found for {Name}"). That echo is **templating, not a result** - the actual + result cards are often unrelated namesakes in other states. A `match_signal` on title/intro text + yields false positives. Require a real result **card** corroborated by the subject's address or + DOB, and ignore the templated title/intro/H1 entirely. See `truepeoplesearch.json` / + `fastpeoplesearch.json` `search.match_signal_notes`. + +Both are why the **parent re-verifies every `found` before acting** rule is load-bearing (`pdd.py show +<subject> <broker>` reads back a subagent's recorded evidence so the parent can re-verify without +re-deriving the listing URL). If a `found` turns out to be a false positive, correct it with a fresh +`record ... not_found` carrying an evidence note explaining the retraction. + +## web_form + +1. `browser_navigate` to `optout.url`; `browser_snapshot` to read the form. +2. Fill only the planned `disclosure_fields` with `browser_type`/`browser_click`; for `profile_url`, + paste the confirmed listing URL from evidence. +3. Submit; `browser_snapshot` to confirm the success state; screenshot to `evidence/`. +4. `pdd.py record <subject> <broker> submitted --disclosed <field> --disclosed <field> --channel web_form`. +5. If the broker requires email verification, follow **Verification loop** below. 
+ +### Indirect exposure (named as a relative / your email on someone else's record) + +If a broker lists a *relative* and names you in their "Family" field, or +shows **your** email/phone on **their** record, that IS personal information about you - even though the +record's primary subject is a third party. Resolve it in two distinct lanes: + +- **The self-service opt-out form does NOT cover this.** That form removes a record whose *primary + subject* is you. It has no notion of "scrub my identifiers from this other person's record," and + submitting it with the relative's address to force a match would be (a) disclosing data the listing + doesn't tie to you and (b) acting on a third party's record. Don't. The consent gate exists to stop + exactly that. +- **What you CAN do - a targeted "delete my personal information" request (CCPA 1798.105 / GDPR Art.17).** + These rights attach to *your* personal information *wherever the business holds it*, including as a + data point on another person's profile. So the subject may email the broker's privacy address and + request suppression of **their own specific identifiers** (this email address, this phone number, my + name in family/relative associations), citing the relative listings as the locations. This is a + narrower request than a full opt-out and does not require the relative's consent - you are only asking + them to delete data about *you*. Use `render-email` with the `ccpa`/`gdpr` template, list only the + subject's own identifiers + the URLs where they appear, and record it as a normal `submitted` → + `awaiting_processing` email case. Verify by re-scanning those identifier vectors (email/phone) after + the statutory window - `confirmed_removed` only when the subject's identifier no longer appears.
+- **Caveat:** the broker may decline to alter a third party's record beyond removing your specific + identifiers, and "your name in a family graph" can be derived from public records they'll re-list. + Note residual exposure in the report rather than marking a clean removal. (Operational guidance, not + legal advice.) + +## email + +`pdd.py send-email <subject> <broker> --listing <url> [--kind ccpa|gdpr|ccpa_indirect]` always does +the deterministic parts (recipient locked to an address the broker record declares, refusing anything +else; `--listing` mandatory; records `submitted`, logs disclosure, stamps `next_recheck_at`). How it +actually sends depends on `email_mode`: + +1. **browser mode (no password, autonomous):** the command returns a recipient-locked `compose` + payload (`to`/`subject`/`body`). Compose a NEW message in the operator's **logged-in webmail** via + `browser_*` (paste `compose.body` exactly, disclosing nothing beyond it) and send. No credentials + stored. Requires the inbox signed in in the browser Hermes uses. +2. **programmatic mode (SMTP creds):** the command SMTP-sends it directly, no human. +3. **draft_only fallback:** `pdd.py render-email <subject> <broker> --listing <url>`; a digest entry + tells the operator to send it, and the agent records `submitted --channel email` afterward. + +Then follow the **Verification loop** if the broker emails a confirmation link. + +## Verification loop (email_verification brokers) + +- **browser mode (autonomous, no password):** open the broker's confirmation email in the operator's + webmail (`browser_*`), then `pdd.py verify-link <subject> <broker> --text '<email body>'` returns + the anti-phishing-scored link. `browser_navigate` it **in the same browser** (several brokers, e.g. + PeopleConnect, bind the session to the browser that opens the link), finish the flow, record + `awaiting_processing`. 
+- **programmatic mode (IMAP):** `pdd.py poll-verification <subject>` polls IMAP for every in-flight + case, extracts the link (anti-phishing scored: only opt-out-looking links on the broker's own + domains), and auto-advances `submitted → verification_pending`. Then `browser_navigate` the link in + the agent's own browser, finish the flow, record `awaiting_processing`. +- **draft_only:** the digest tells the operator to click the link in the subject's inbox; the agent + records `awaiting_processing` on their word. +- Either way, the due queue (`pdd.py due`) brings the case back after the broker's processing window + for the verifying re-scan; only that re-scan justifies `confirmed_removed`. + +## phone_callback (e.g. Whitepages) + +Submit the web form, then the site places an automated call with a numeric code. If the operator is +available to read the code, capture it and complete the form (T2). Otherwise queue a human task. + +## phone (voice menu) / fax / mail / gov_id -> human task (T3) + +Do **not** attempt to automate. Create a `todo` task and `pdd.py record <subject> <broker> +human_task_queued` with exact instructions and an explicit **withhold** list (never SSN; never a +driver's-license number unless the subject chooses to and crosses out the ID number). Capture the +confirmation reference back into the ledger when the operator completes it. + +## captcha + +**Default: soft/managed CAPTCHAs clear automatically.** The recommended baseline backend is the +Browserbase cloud browser (`setup --auto` selects it when `BROWSERBASE_API_KEY` is set). Being a +real browser on a residential IP, it passes managed challenges - Cloudflare Turnstile, hCaptcha / +reCAPTCHA checkbox - as normal operation, so those brokers stay T1 and you just proceed. This is +**not** CAPTCHA solving: no solver service, no fingerprint spoofing. 
+ +Only a **hard** challenge the browser genuinely can't pass (interactive image grids, behavioral +scoring that flags the session) becomes a fallback: `record ... blocked` and requeue it for the +stealth/operator-browser pass (`methods.md` → scan ladder 3b - the operator's own residential +browser is the reliable unblock). Without a cloud browser configured, soft-CAPTCHA brokers drop to +T2 and become human tasks. **Never use a third-party CAPTCHA-defeating service.** + +## Browser backends: scan vs execute + +Two different jobs need two different browsers. Getting this wrong is the single biggest cause of a +run stalling in Phase 2. + +- **Phase 1 (scan, read-only):** a cloud stealth browser (Browserbase) or the `scrapling` skill is + ideal. On a residential IP with a real fingerprint it passes managed challenges (Cloudflare + Turnstile, hCaptcha checkbox) and reads anti-bot people-search pages that `web_extract` and the + proxyless agent browser cannot. This is what the skill's `browser_backend` setting governs + (`auto` picks Browserbase when `BROWSERBASE_API_KEY` is present - now also read from + `$HERMES_HOME/.env`, not just the shell env, so `doctor`/`setup --auto` detect the key Hermes + already loads for its own tools). +- **Phase 2 (execute: opt-out forms, webmail sends, session-bound multi-step gates):** the work must + run in the **operator's own everyday browser** - real fingerprint, residential IP, AND the + operator's logged-in sessions. A headless cloud browser is the WRONG default here for two reasons: + (1) it is not signed into the operator's webmail, so browser-mode email sends and confirmation-link + opens have no inbox to act in; and (2) it is itself Cloudflare/DataDome-gated on exactly the + multi-step flows that matter (e.g. PeopleConnect guided-mode, whose verify link is session- and + device-bound to the browser that opens it - a cloud browser both fails the challenge and breaks the + binding). 
+- **How to drive the operator's browser (CDP).** Point Hermes's browser tools at the operator's real + Chrome over the DevTools protocol: launch + `chrome --remote-debugging-port=9222 --user-data-dir="$HOME/.hermes/chrome-debug"` and connect the + browser backend to `127.0.0.1:9222`. Use a **dedicated debug profile** (`chrome-debug`), NOT the + operator's Default Chrome profile, and have the operator sign into their webmail (and any needed + broker accounts) in that profile once. That single browser then carries residential IP + real + fingerprint + logged-in sessions, which is precisely what Phase-2 flows need. (This is a Hermes-side + browser setup, not a `pdd` config value; `browser_backend` above only selects the Phase-1 scan + browser.) **The skill launches this for you: `pdd.py cdp`** finds a Chrome/Chromium/Brave/Edge + binary, starts it detached on the dedicated profile, waits for the debug port, and prints the CDP + endpoint (`webSocketDebuggerUrl`). `pdd.py cdp --check` reports whether a debug browser is already + live (and never launches a second one); `pdd.py cdp --print` just emits the exact command for the + operator to run themselves. Point the browser tools at the `endpoint` it returns. +- **Always-available fallback:** if no CDP browser is wired up, use the operator-in-the-loop path + (scan ladder 3b) - hand over paste-ready URLs and field-by-field least-disclosure guidance, pausing + before submit. It never fails; it just needs a human present. + +Backend precedence, most to least autonomous: **operator Chrome over CDP** (Phase 2, hands-off once +the profile is signed in) > **Browserbase cloud stealth** (Phase 1 scanning, plus managed-captcha +forms that need no login) > **proxyless agent browser** (only already-unblocked sites) > +**operator-in-the-loop** (paste-ready URLs; the last-resort unblock that always works). 
+ +## Ownership clusters - DO PARENTS FIRST (playbooks live in the broker records) + +Many brokers are resold shells of a few parents, so **one parent removal clears a whole cluster of +children** (see `owns` in each record). In Phase 2 you MUST work the cluster **parents first**, then +the standalone listings - doing a child before its parent wastes a submission the parent would have +covered. `pdd.py plan <subject> --batch` **orders the `found` group parents-first** and emits a +`parent_playbook` whose `steps` come verbatim from each record's **`optout.playbook`** - the single +source of truth, field-verified, updated as live runs discover mechanics. What follows is the +operating doctrine; the exact steps are in `references/brokers/<id>.json`. + +**Deletion USUALLY beats suppression, email lanes beat forms -- but check the record.** Each parent +record carries a structured `optout.deletion` lane (`via: in_flow | email | email_followup`, a +privacy address, and `prefer`). The autopilot routes accordingly, and when `deletion.prefer` is +false it emits `prefer_suppression` instead of `prefer_deletion`: + +- **`in_flow`** (PeopleConnect, `prefer: false`): the deletion control lives inside the web flow, but + for this cluster it is the WRONG lever for search-visibility (see the exception below). Complete the + **suppression** flow and maintain it; do not press Delete unless the goal is a data-purge. +- **`via: email`** (Whitepages): the fully-autonomous lane - `send-email` the request (residency-picked + kind: CCPA for US-CA, GDPR for EU/UK, generic otherwise), then `poll-verification` for their reply + and answer identity questions with least-disclosure. This is also the **rescue lane**: any broker + whose form demands a phone-callback/gov-ID/account but that declares a deletion email gets routed + here instead of the human digest. 
+- **`email_followup`** (BeenVerified, Spokeo): the opt-out form is the fast primary (it clears the + listing), and the playbook then sends a right-to-delete email for full erasure beyond suppression. + +Verified parent facts (live-checked 2026-07-02; details + steps in the records): + +- **Intelius/PeopleConnect** (~15+ sites in one flow) -- **EXCEPTION to deletion-beats-suppression.** + Portal entry asks only email + consent → verify link is **session-bound to the browser that opens + it** → guided-mode. Complete the **SUPPRESSION** flow and keep the account on file: suppression is + the do-not-display list that removes you. Per their privacy-center, **'DELETE MY USER DATA' deletes + your suppressions and does NOT stop the sites from showing you** (public records re-list), so use it + only for a deliberate data-purge. `privacy@peopleconnect.us` is the rights-request address for that + path; published metrics: 33.5k deletion requests, median response < 1 day. +- **Whitepages**: `privacyrequest@whitepages.com` (or the Zendesk form) handles removal + CCPA + deletion **without the phone-callback tool** - that phone call is only required by the automated + tool. One removal also drops "all known connected listings". ≤15 days; check 411.com + Premium. +- **BeenVerified**: opt-out tool (footer "Do Not Sell" link → `/svc/optout/search/optouts`) + email + verification; one opt-out per email address. Then `privacy@beenverified.com` deletion follow-up - + controller is The Lifetime Value Co., so name their sister properties (NeighborWho, Ownerly, + NumberGuru, Bumper) in the same request, and verify each separately. +- **Spokeo**: form takes ONE listing URL at a time and **each listing must be opted out + individually** - collect every listing URL from all search vectors first, then submit one opt-out + per URL. 24-48h processing. `privacy@spokeo.com` for full deletion beyond free-search suppression. 
+ +After each parent removal is confirmed, **re-scan its children** before submitting anything for them - +usually they drop out and need no separate opt-out. + +### Any other parent +A parent without a hand-verified `optout.playbook` gets synthesised steps from its structured record +(URL/email, `requires` flags, deletion lane, notes/quirks). Follow those, and **write what you learn +back into `references/brokers/<id>.json`** (`optout.playbook`, `optout.deletion`, `quirks`, +`last_verified`) so the next run is exact - that file, not this one, is where per-broker knowledge +accrues. + diff --git a/optional-skills/security/unbroker/references/state-machine.md b/optional-skills/security/unbroker/references/state-machine.md new file mode 100644 index 000000000..feae5a3ef --- /dev/null +++ b/optional-skills/security/unbroker/references/state-machine.md @@ -0,0 +1,43 @@ +# Case state machine + +One case = one (subject x broker). `pdd.py record` validates every transition against this table and +appends it to `audit.jsonl`. Authoritative definition lives in `scripts/ledger.py`. + +## States + +| State | Meaning | +|---|---| +| `new` | Case created, nothing done | +| `searching` | Scan in progress | +| `not_found` | Subject not listed (will be re-checked next cycle) | +| `found` | Listing confirmed; action needed | +| `indirect_exposure` | Subject's PII (email/phone/name) appears on a **third party's** record (e.g. named in a relative's "Family" field). 
Not removable via self-service opt-out; needs a targeted CCPA/GDPR delete-my-PII request | +| `action_selected` | Tier/method chosen | +| `submitted` | Opt-out submitted | +| `verification_pending` | Awaiting email/callback verification | +| `awaiting_processing` | Submitted, no verification needed; broker processing | +| `confirmed_removed` | Verified gone | +| `reappeared` | Was removed, now listed again | +| `human_task_queued` | Needs an operator step (captcha/ID/phone/fax/mail) | +| `blocked` | Broker dead / mechanics broken -> flag for DB re-verification | + +## Allowed transitions + +``` +new -> searching | found | not_found | indirect_exposure | blocked +searching -> not_found | found | indirect_exposure | blocked +not_found -> searching | found | indirect_exposure | blocked +found -> action_selected | submitted | human_task_queued | indirect_exposure | blocked +indirect_exposure -> submitted | human_task_queued | not_found | found | blocked +action_selected -> submitted | human_task_queued | blocked +submitted -> verification_pending | awaiting_processing | human_task_queued | blocked +verification_pending -> confirmed_removed | human_task_queued | blocked +awaiting_processing -> confirmed_removed | human_task_queued | blocked +confirmed_removed -> reappeared | confirmed_removed (recheck refreshes the date) +reappeared -> found | indirect_exposure +human_task_queued -> found | indirect_exposure | action_selected | submitted | verification_pending + | awaiting_processing | confirmed_removed | blocked +blocked -> searching | found | not_found | indirect_exposure | action_selected +``` + +A transition to the same state is always allowed (idempotent field updates). 
diff --git a/optional-skills/security/unbroker/scripts/autopilot.py b/optional-skills/security/unbroker/scripts/autopilot.py new file mode 100644 index 000000000..746106185 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/autopilot.py @@ -0,0 +1,417 @@ +"""Autonomous action queue: what should the agent do RIGHT NOW for this subject? + +`next_actions` turns (dossier, broker DB, config, ledger) into an ordered queue of +concrete agent actions plus a human digest. The agent's whole run becomes a loop: + + while True: + q = pdd.py next <subject> + if not q["actions"]: break + execute each action, record outcomes + present q["human_digest"] once; schedule cron at q["next_wake_at"] + +Policy (cfg["autonomy"]): + full - intake consent is standing authorization; T0-T2 agent actions are + executed without pausing. Humans appear only in the digest. + assisted - same queue, but every submission action carries confirm_first=True. + +The queue is deterministic and side-effect free: it never mutates the ledger, it +only reads. Executing + recording stays with the agent (and the record command). +""" +from __future__ import annotations + +import datetime as _dt +import os +from pathlib import Path + +import brokers as brokers_mod +import emailer +import ledger as ledger_mod +import paths +import registry +import tiers + +CACHE_STALE_DAYS = 7 # refresh the live broker list after this +FANOUT_THRESHOLD = 8 # above this many unscanned brokers, use delegate_task fan-out + +# States with nothing left to do (absent a due recheck). 
_TERMINAL = {"not_found", "confirmed_removed"}
# States in which an opt-out has been sent and is still being verified/processed.
_IN_FLIGHT = {"submitted", "verification_pending", "awaiting_processing"}


def cache_age_days(now: float | None = None) -> float | None:
    """Return the age of the live BADBOOL cache in days, or None if never pulled.

    Args:
        now: Reference time as a POSIX timestamp; defaults to the current time.

    Returns:
        Days since the cache file was last modified (never negative), or None
        when the cache file does not exist.
    """
    p: Path = paths.brokers_cache_path()
    try:
        # Stat once instead of exists() + stat(): the file may vanish between
        # the two calls, which would otherwise raise instead of returning None.
        mtime = p.stat().st_mtime
    except (FileNotFoundError, NotADirectoryError):
        return None
    now = now if now is not None else _dt.datetime.now().timestamp()
    return max(0.0, (now - mtime) / 86400.0)


def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def _min_future_recheck(ledger: dict, at: str) -> str | None:
    """Return the earliest ``next_recheck_at`` strictly after ``at``, or None.

    Timestamps are compared as strings, so they must share the sortable format
    produced by ``_now_iso``.
    """
    future = [
        case["next_recheck_at"]
        for case in ledger.values()
        if case.get("next_recheck_at") and case["next_recheck_at"] > at
    ]
    return min(future) if future else None


def _digest(broker_row: dict, reason: str, steps: list[str], prep: list[str] | None = None) -> dict:
    """Build one human-digest entry for a broker.

    Args:
        broker_row: Mapping providing ``broker_id`` and ``broker_name``.
        reason: Why this item needs a human.
        steps: What the human actually does.
        prep: Commands the agent runs BEFORE handing the item to the human.
    """
    return {
        "broker_id": broker_row.get("broker_id"),
        "broker_name": broker_row.get("broker_name"),
        "reason": reason,
        "agent_prep": prep or [],  # commands the agent runs BEFORE handing this to the human
        "steps": steps,  # what the human actually does
        "withhold": ["SSN", "full driver's-license / passport numbers"],
    }


def request_kind(dossier: dict, allowed: list[str] | None = None) -> str:
    """Pick the honest legal basis for a deletion request from the subject's residency.

    ccpa only for California residents, gdpr only for EU/UK residents, generic otherwise.
    `allowed` (from the broker's deletion.kinds) can restrict DOWN to generic but never
    upgrades to a law the subject can't truthfully claim.

    Args:
        dossier: Subject dossier; ``residency_jurisdiction`` is read (default ``"US"``).
        allowed: Optional list of request kinds the broker record accepts.

    Returns:
        ``"ccpa"``, ``"gdpr"`` or ``"generic"``.
    """
    # Normalise case AND surrounding whitespace so " us-ca " is still recognised.
    res = (dossier.get("residency_jurisdiction") or "US").strip().upper()
    if res.startswith("US-CA"):
        kind = "ccpa"
    elif res.startswith(("EU", "UK", "GB")):
        kind = "gdpr"
    else:
        kind = "generic"
    # Only ever downgrade: if the broker does not take this kind but takes generic, use generic.
    if allowed and kind not in allowed and "generic" in allowed:
        kind = "generic"
    return kind


_HUMAN_GATES = ("gov_id", "fax", "mail", "phone_voice", "phone_callback", "account")


def _email_lane(row: dict) -> tuple[str | None, str]:
    """(address, why) for the autonomous email lane of this broker, if one exists.

    Lane rules:
      1. the broker's primary opt-out method IS email;
      2. the record marks its deletion lane email-preferred (deletion.via == "email");
      3. RESCUE: the primary flow is human-gated (gov ID / fax / phone / account) but a
         right-to-delete email exists - the email lane restores full autonomy (this is the
         verified Whitepages pattern: privacyrequest@ accepts requests precisely so people
         don't have to do the phone-callback tool).

    Returns ``(None, "")`` when no rule yields an address.
    """
    deletion = row.get("deletion") or {}
    req = row.get("optout_requires") or {}
    deletion_email = deletion.get("email")

    if row.get("method") == "email":
        addr = row.get("optout_email") or deletion_email
        return (addr, "primary opt-out method is email") if addr else (None, "")
    if deletion.get("via") == "email" and deletion_email:
        return deletion_email, "record prefers the right-to-delete email lane"
    human_gated = row.get("tier") == "T3" or any(req.get(k) for k in _HUMAN_GATES)
    if human_gated and deletion_email:
        return deletion_email, "rescue: primary flow is human-gated; deletion email restores autonomy"
    return None, ""
def _optout_action(row: dict, playbook: dict[str, dict], subject_id: str, dossier: dict,
                   email_mode: str, smtp_ok: bool, confirm_first: bool) -> tuple[dict | None, dict | None]:
    """Map one actionable `found` row to (agent_action, human_digest_entry).

    Routing order maximizes autonomy: (1) the email lane (primary email method, preferred
    right-to-delete email, or rescue from a human-gated form) beats everything when the
    agent can send mail (SMTP configured, or browser mode); (2) genuinely human-only flows
    go to the digest; (3) web forms are driven with the record's own field-verified
    playbook steps.

    Args:
        row: One `found` row from the batch plan (reads broker_id, tier, method,
            optout_requires, deletion, optout_url, optout_email, optout_playbook, ...).
        playbook: Parent playbooks keyed by broker id; their ``steps`` win over the row's.
        subject_id: Subject identifier interpolated into the emitted commands.
        dossier: Subject dossier, used to pick the request kind.
        email_mode: Configured email mode (``programmatic``/``alias``/``browser``/other).
        smtp_ok: Whether SMTP sending is available.
        confirm_first: Copied onto agent actions so assisted mode can pause first.

    Returns:
        Exactly one of the two tuple members is non-None: an agent action dict, or a
        human digest entry.
    """
    import shlex  # local import: the file's top-level import block is outside this block

    bid = row["broker_id"]
    req = row.get("optout_requires") or {}
    tier = row.get("tier")
    deletion = row.get("deletion") or {}
    # These values end up in shell command strings the agent executes. Broker records can
    # be auto-parsed from upstream markdown, so quote them; safe values pass through
    # unchanged, so ordinary ids/addresses render exactly as before.
    q_subject = shlex.quote(str(subject_id))
    q_bid = shlex.quote(str(bid))

    # 1) The autonomous EMAIL LANE (right-to-delete by email + confirm the reply).
    #    Autonomous when SMTP is configured (programmatic/alias) OR in browser mode (agent sends via
    #    the operator's logged-in webmail; no password needed).
    email_addr, lane_why = _email_lane(row)
    can_email = (email_mode in ("programmatic", "alias") and smtp_ok) or email_mode == "browser"
    if email_addr and can_email:
        kind = request_kind(dossier, deletion.get("kinds"))
        via = "browser" if email_mode == "browser" else "smtp"
        then = ("send-email records it + returns a recipient-locked payload; compose and send it in "
                "the operator's webmail via browser_*, then `verify-link` on the reply and open the link"
                if via == "browser" else
                "state auto-records as submitted; poll-verification picks up their verification reply, "
                "open its link, then record")
        return {
            "type": "optout_email_send",
            "broker_id": bid, "broker_name": row.get("broker_name"), "tier": tier,
            "confirm_first": confirm_first, "send_via": via,
            "to": email_addr, "kind": kind, "why": lane_why,
            "command": f"python3 scripts/pdd.py send-email {q_subject} {q_bid} --kind {kind} "
                       f"--to {shlex.quote(str(email_addr))} --listing <confirmed-url>",
            "then": then,
        }, None
    if row.get("method") == "email":
        # Email is the primary method but the agent cannot send: a human sends the draft.
        return None, _digest(row, "email opt-out (draft mode: a human must hit send)",
                             ["Send the rendered draft from your own mail client",
                              f"Then: python3 scripts/pdd.py record {q_subject} {q_bid} submitted "
                              f"--disclosed contact_email --channel email"],
                             prep=[f"python3 scripts/pdd.py render-email {q_subject} {q_bid} "
                                   f"--listing <confirmed-url>"])

    # 2) Genuinely human-only work goes to the digest (no email lane could rescue it).
    if tier == "T3":
        return None, _digest(row, "human-only opt-out (gov ID / fax / mail / voice phone)",
                             [f"Follow the broker's process at {row.get('optout_url') or row.get('optout_email')}",
                              "Provide only the fields the listing already shows; cross out ID numbers on any document"])
    if req.get("phone_callback"):
        return None, _digest(row, "phone-callback verification (operator must be on the phone)",
                             [f"Open {row.get('optout_url')} and submit with only the planned fields",
                              "Answer the automated call and enter the 4-digit code to finish"],
                             prep=[f"python3 scripts/pdd.py plan {q_subject} --batch # confirm fields first"])
    if req.get("account"):
        return None, _digest(row, "requires creating/holding an account with the broker",
                             [f"Create/log in at {row.get('optout_url')} and submit the opt-out",
                              "Use the subject's contact email; no extra PII beyond the planned fields"])

    # 3) web_form: drive the browser with the record's own playbook steps.
    #    Precedence: parent playbook steps > the row's own playbook > synthesized steps.
    steps = (playbook.get(bid) or {}).get("steps") or list(row.get("optout_playbook") or []) \
        or tiers.synthesize_steps(row)
    action = {
        "type": "optout_web_form",
        "broker_id": bid, "broker_name": row.get("broker_name"), "tier": tier,
        "confirm_first": confirm_first,
        "optout_url": row.get("optout_url"),
        "clears_children": row.get("clears_children") or [],
        "steps": steps,
        "after": f"python3 scripts/pdd.py record {q_subject} {q_bid} submitted "
                 f"--disclosed <field>... --channel web_form",
    }
    if deletion:
        if deletion.get("prefer", True):
            action["prefer_deletion"] = ("this record has a right-to-delete lane -- complete the "
                                         "DELETION flow, not just suppression"
                                         + (f" ({deletion.get('notes')})" if deletion.get("notes") else ""))
        else:
            # Some brokers invert the usual rule: deleting the account removes suppressions and
            # does not stop public-records re-listing (e.g. PeopleConnect). Suppress and maintain.
            action["prefer_suppression"] = (deletion.get("notes")
                                            or "suppression (maintained) is what removes you here; "
                                               "deleting undoes it and does not stop re-listing")
    if req.get("captcha"):
        action["note"] = ("CAPTCHA-gated: attempt with the configured browser backend once; if it "
                          "does not clear, record blocked (do NOT retry-loop or bypass)")
    return action, None
def next_actions(dossier: dict, brokers_list: list[dict], cfg: dict,
                 ledger: dict | None = None, env: dict | None = None) -> dict:
    """Build the ordered agent action queue and the human digest for one subject.

    Args:
        dossier: Subject dossier; reads ``subject_id``, ``residency_jurisdiction`` and
            ``preferences.drop_filed_at``.
        brokers_list: Broker records (indexed here by their ``id``).
        cfg: Config; reads ``autonomy`` (default ``"full"``), ``email_mode``
            (default ``"draft_only"``) and ``browser_backend``.
        ledger: Per-broker case mapping (broker id -> case dict); treated as empty if falsy.
        env: Environment mapping; defaults to ``os.environ``.

    Returns:
        A dict with ``subject``, ``autonomy``, ``phase``, ``counts``, ``actions``,
        ``human_digest``, ``coverage``, ``done_for_now``, ``fully_done``,
        ``next_wake_at`` and ``note``. ``actions`` are appended in the numbered order
        of the sections below.
    """
    env = os.environ if env is None else env
    ledger = ledger or {}
    subject_id = dossier.get("subject_id", "")
    autonomy = cfg.get("autonomy", "full")
    # Assisted mode stamps confirm_first=True on submission actions.
    confirm_first = autonomy == "assisted"
    email_mode = cfg.get("email_mode", "draft_only")
    # Mapping indexed below with "imap" and "smtp".
    mail = emailer.available(env)
    at = _now_iso()

    batch = tiers.batch_plan(dossier, brokers_list, cfg, ledger,
                             browser_clears_captcha=cfg.get("browser_backend") == "browserbase"
                             or bool(env.get("BROWSERBASE_API_KEY")))
    groups = batch["groups"]
    playbook = {p["broker_id"]: p for p in batch.get("parent_playbook") or []}
    by_id = {b.get("id"): b for b in brokers_list}

    actions: list[dict] = []
    digest: list[dict] = []

    # 0) keep the broker DB fresh (autonomously)
    age = cache_age_days()
    if age is None or age > CACHE_STALE_DAYS:
        actions.append({
            "type": "refresh_brokers",
            "why": "live broker cache missing" if age is None else f"cache is {age:.0f} days old",
            "command": "python3 scripts/pdd.py refresh-brokers",
        })

    # 0b) DROP one-shot: for a CA resident, ONE request deletes from every registered
    #     broker (the whole CA Data Broker Registry) -- the highest-leverage removal there is.
    registry_recs = brokers_mod.load_registry_cache()
    residency = (dossier.get("residency_jurisdiction") or "US").upper()
    drop_filed = bool((dossier.get("preferences") or {}).get("drop_filed_at"))
    if registry_recs and residency.startswith("US-CA") and not drop_filed:
        actions.append({
            "type": "drop_submit",
            "one_shot": True,
            "registry_count": len(registry_recs),
            "url": registry.DROP_URL,
            "command": f"python3 scripts/pdd.py drop {subject_id}",
            "why": f"CA resident: one DROP request deletes from all {len(registry_recs)} registered "
                   "data brokers at once (superset of what commercial services cover).",
            "after": f"python3 scripts/pdd.py drop {subject_id} --filed",
        })

    # 1) Phase 1 crawl: everything unscanned (read-only, parallel-safe)
    unscanned = groups.get("unscanned") or []
    if unscanned:
        ids = [r["broker_id"] for r in unscanned]
        # Many brokers -> parallel subagent fan-out; few -> scan inline.
        if len(ids) > FANOUT_THRESHOLD:
            actions.append({
                "type": "fanout_scan",
                "broker_ids": ids,
                "command": f"python3 scripts/pdd.py fanout {subject_id}",
                "how": "spawn ONE delegate_task subagent per batch IN PARALLEL with each batch's brief; "
                       "parent re-verifies key `found` claims before trusting them",
            })
        else:
            actions.append({
                "type": "scan_inline",
                "broker_ids": ids,
                "command": f"python3 scripts/pdd.py plan {subject_id}",
                "how": "run every search_vector per broker via the methods.md ladder "
                       "(web_extract -> site: probe -> browser), record a verdict per broker",
            })

    # 2) in-flight email verifications: poll the inbox (or hand to the human in draft mode)
    #    Outer loop over states so all `submitted` cases are queued before `verification_pending`.
    for st in ("submitted", "verification_pending"):
        for bid, case in sorted(ledger.items()):
            if case.get("state") != st:
                continue
            broker = by_id.get(bid) or {}
            # Only brokers whose record declares an email-verification step are polled.
            if not ((broker.get("optout") or {}).get("requires") or {}).get("email_verification"):
                continue
            if mail["imap"]:
                actions.append({
                    "type": "poll_verification", "via": "imap",
                    "broker_id": bid,
                    "command": f"python3 scripts/pdd.py poll-verification {subject_id} --broker {bid}",
                    "then": "browser_navigate the returned link IN THE SAME AGENT BROWSER (sessions are "
                            "browser-bound), complete the flow, then record: awaiting_processing",
                })
            elif email_mode == "browser":
                actions.append({
                    "type": "poll_verification", "via": "browser", "broker_id": bid,
                    "how": "open the broker's confirmation email in the operator's logged-in webmail "
                           f"(browser_*), then `python3 scripts/pdd.py verify-link {subject_id} {bid} "
                           "--text '<email body>'` to score the link, browser_navigate it in the SAME "
                           "browser, then record awaiting_processing",
                })
            else:
                digest.append(_digest(
                    {"broker_id": bid, "broker_name": (broker.get("name") or bid)},
                    "verification email must be opened by a human (draft mode, no inbox access)",
                    ["Open the broker's verification email in the subject's inbox and click the link",
                     f"Then: python3 scripts/pdd.py record {subject_id} {bid} awaiting_processing"]))

    # 3) due rechecks: processing windows elapsed / reappearance sweeps
    for case in ledger_mod.due(subject_id, at=at, ledger=ledger):
        bid = case.get("broker_id")
        st = case.get("state")
        if st in ("awaiting_processing", "confirmed_removed"):
            actions.append({
                "type": "verify_removal",
                "broker_id": bid,
                "why": "processing window elapsed" if st == "awaiting_processing" else "periodic reappearance re-scan",
                "how": "re-run this broker's search_vectors; if gone record confirmed_removed; "
                       "if still listed record reappeared and requeue the opt-out",
            })
        elif st in ("submitted", "verification_pending") and not mail["imap"]:
            pass  # intentionally a no-op: these cases were already handled in section 2 above

    # 4) Phase 2 opt-outs: parents first (batch_plan already ordered them)
    for row in groups.get("found") or []:
        action, task = _optout_action(row, playbook, subject_id, dossier,
                                      email_mode, mail["smtp"], confirm_first)
        if action:
            actions.append(action)
        if task:
            digest.append(task)

    # 5) indirect exposure: targeted delete-my-PII requests
    for row in groups.get("indirect_exposure") or []:
        bid = row["broker_id"]
        has_email = bool(row.get("optout_email") or (row.get("deletion") or {}).get("email"))
        if not has_email and row.get("optout_url"):
            # No email lane (e.g. ThatsThem is web-form-only): drive the opt-out FORM, submitting
            # ONLY the subject's own identifiers to scrub from the third party's record.
            actions.append({
                "type": "indirect_web_form",
                "broker_id": bid, "confirm_first": confirm_first,
                "optout_url": row.get("optout_url"),
                "steps": [f"browser_navigate {row.get('optout_url')}",
                          "submit ONLY the subject's own identifiers (the fields the form requires) to "
                          "remove them from the third party's record; disclose nothing extra",
                          "confirm the success state, screenshot into evidence/"],
                "after": f"python3 scripts/pdd.py record {subject_id} {bid} submitted --channel web_form",
            })
        elif (email_mode in ("programmatic", "alias") and mail["smtp"]) or email_mode == "browser":
            # NOTE(review): this branch is also reached when the row has neither an email nor an
            # optout_url - confirm send-email handles a broker with no declared address.
            actions.append({
                "type": "indirect_email_send",
                "broker_id": bid, "confirm_first": confirm_first,
                "send_via": "browser" if email_mode == "browser" else "smtp",
                "command": f"python3 scripts/pdd.py send-email {subject_id} {bid} --kind ccpa_indirect "
                           f"--listing <third-party-listing-url>",
            })
        else:
            digest.append(_digest(row, "indirect-exposure request (draft mode: a human must hit send)",
                                  ["Send the rendered ccpa_indirect draft",
                                   f"Then: python3 scripts/pdd.py record {subject_id} {bid} submitted "
                                   f"--disclosed contact_email --channel email"],
                                  prep=[f"python3 scripts/pdd.py render-email {subject_id} {bid} "
                                        f"--kind ccpa_indirect --listing <url>"]))

    # 6) blocked sites: stealth pass if we have one, else the operator-browser path
    blocked = groups.get("blocked") or []
    if blocked:
        ids = [r["broker_id"] for r in blocked]
        # Only the environment key is consulted here (cfg["browser_backend"] is not).
        if bool(env.get("BROWSERBASE_API_KEY")):
            actions.append({
                "type": "stealth_rescan",
                "broker_ids": ids,
                "how": "retry these with the cloud/stealth browser backend, then record real verdicts",
            })
        else:
            for r in blocked:
                digest.append(_digest(r, "site blocks automated access (anti-bot); a human browser gets through",
                                      ["Open the paste-ready search URL from `plan` in your everyday browser",
                                       "Report the verdict (or a screenshot) back to the agent",
                                       f"Agent records: python3 scripts/pdd.py record {subject_id} "
                                       f"{r['broker_id']} <found|not_found|indirect_exposure>"]))

    # 7) anything already parked as a human task
    for bid, case in sorted(ledger.items()):
        if case.get("state") == "human_task_queued":
            broker = by_id.get(bid) or {}
            digest.append(_digest({"broker_id": bid, "broker_name": broker.get("name") or bid},
                                  case.get("human_task_reason") or "queued manual step",
                                  ["See `pdd.py tasks` for the exact steps recorded with this case"]))

    # registry coverage summary (breadth beyond the scannable people-search sites)
    coverage = None
    if registry_recs:
        coverage = {
            "people_search_sites": len(brokers_list),
            "registered_data_brokers": len(registry_recs),
            "worked_via": "CA DROP one-shot" if residency.startswith("US-CA") else "targeted CCPA/GDPR email",
        }
        if not residency.startswith("US-CA"):
            coverage["note"] = ("DROP is CA-only; for this subject the registry is covered by targeted "
                                "CCPA/GDPR deletion emails (`registry --search` then `send-email`), "
                                "not a single portal request.")
        elif drop_filed:
            coverage["note"] = "DROP already filed; registry deletions are in the brokers' hands."

    # Earliest future recheck in the ledger (None when nothing is scheduled).
    next_wake = _min_future_recheck(ledger, at)
    return {
        "subject": subject_id,
        "autonomy": autonomy,
        "phase": batch.get("phase"),
        "counts": batch.get("counts"),
        "actions": actions,
        "human_digest": digest,
        "coverage": coverage,
        "done_for_now": not actions,
        # Fully done only when there is no agent work, no human work and no scheduled recheck.
        "fully_done": not actions and not digest and not next_wake,
        "next_wake_at": next_wake,
        "note": ("assisted mode: pause for operator confirmation on every action with confirm_first=true"
                 if confirm_first else
                 "full autonomy: recorded intake consent authorizes these submissions; do not pause. "
                 "Present human_digest ONCE at the end of the run, not per item."),
    }
SYMBOLS = {
    "crucial": "\U0001F490",  # 💐
    "high": "\u2620",  # ☠
    "gov_id": "\U0001F3AB",  # 🎫
    "phone": "\U0001F4DE",  # 📞
    "payment": "\U0001F4B0",  # 💰
}

# Markdown inline link: [text](target)
_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
_OPTOUT_HINT = re.compile(
    r"opt[\- ]?out|optout|removal|remove|suppress|control-privacy|delete", re.I
)
_FIND_HINT = re.compile(r"find|your information|search|look ?up|look for", re.I)


def slug(name: str) -> str:
    """Return a lowercase alphanumeric id for a displayed broker name."""
    # Drop a trailing .com/.org/.info on the displayed name so "FastPeopleSearch.com"
    # matches the curated id "fastpeoplesearch"; keep .net/.id so distinct sites differ.
    n = re.sub(r"\.(com|org|info)\b", "", name.strip(), flags=re.I)
    return re.sub(r"[^a-z0-9]+", "", n.lower())


def _heading_flags(heading: str) -> tuple[str, dict]:
    """Split a heading into (clean name, {flag: bool}) using the BADBOOL legend symbols."""
    flags = {key: (sym in heading) for key, sym in SYMBOLS.items()}
    name = heading
    for sym in SYMBOLS.values():
        name = name.replace(sym, "")
    # U+FE0F is the emoji variation selector that can trail a symbol.
    name = name.replace("\ufe0f", "").strip()
    return name, flags


def _priority(flags: dict) -> str:
    """Map legend flags to a priority label; ``crucial`` outranks ``high``."""
    if flags["crucial"]:
        return "crucial"
    if flags["high"]:
        return "high"
    return "standard"


def _pick(links: list[tuple[str, str]], hint: re.Pattern) -> str | None:
    """Return the first link URL matching ``hint``, preferring URL matches over text matches."""
    for _text, url in links:
        if hint.search(url):
            return url
    for text, url in links:
        if hint.search(text):
            return url
    return None


def _clean(text: str) -> str:
    """Collapse whitespace runs to single spaces and cap the result at 600 characters."""
    return re.sub(r"\s+", " ", text).strip()[:600]


def _mailto_address(url: str) -> str:
    """Return the address part of a ``mailto:`` URL, without any ``?subject=...`` query."""
    return url[len("mailto:"):].split("?", 1)[0].strip()


def _build(name: str, flags: dict, body: str) -> dict:
    """Build one auto-parsed broker record from a heading's name, flags and body text."""
    links = _LINK_RE.findall(body)
    web = [(t, u) for t, u in links if u.lower().startswith("http")]
    # Strip the mailto query (subject/body header fields) and skip empty addresses so the
    # stored recipient is a bare address that can be used as-is.
    mailtos = [addr for addr in (_mailto_address(u) for _t, u in links
                                 if u.lower().startswith("mailto:")) if addr]
    optout_url = _pick(web, _OPTOUT_HINT)
    # Fall back to the first web link when nothing looks like a search page.
    search_url = _pick(web, _FIND_HINT) or (web[0][1] if web else None)

    if flags["phone"]:
        method = "phone"
    elif optout_url:
        method = "web_form"
    elif mailtos:
        method = "email"
    else:
        method = "manual"

    return {
        "id": slug(name),
        "name": name,
        "category": "people_search",
        "priority": _priority(flags),
        "jurisdictions": ["US"],
        "search": {"method": "url_pattern", "url": search_url, "fetch": "browser",
                   "match_signal": "result", "by": ["name", "phone", "address"]},
        "optout": {
            "method": method,
            "url": optout_url,
            "email": mailtos[0] if mailtos else None,
            "requires": {
                "gov_id": flags["gov_id"],
                "phone_voice": flags["phone"],
                "payment": flags["payment"],
                "email_verification": False,
                "captcha": False,
                "account": False,
                "phone_callback": False,
            },
            "inputs": ["full_name", "contact_email"],
            "notes": _clean(body),
            "links": [{"text": t, "url": u} for t, u in links],
            "est_processing_days": 14,  # unknown for auto records; drives next_recheck_at
        },
        "source": "BADBOOL-auto",
        "confidence": "auto",
        "last_verified": None,
    }


def parse(markdown: str) -> list[dict]:
    """Parse the 'People Search Sites' section of BADBOOL into broker records.

    Each ``### `` heading inside a ``## `` section whose title starts with
    "people search" (case-insensitive) becomes one record; the lines under the
    heading are its body. Headings with an empty name after symbol removal are dropped.
    """
    records: list[dict] = []
    in_people = False
    heading: str | None = None
    body: list[str] = []

    def flush() -> None:
        # Emit the record collected so far (if any) and reset the accumulator.
        nonlocal heading, body
        if heading is not None:
            name, flags = _heading_flags(heading)
            if name:
                records.append(_build(name, flags, "\n".join(body).strip()))
        heading, body = None, []

    for line in markdown.splitlines():
        if line.startswith("## "):
            flush()
            in_people = line[3:].strip().lower().startswith("people search")
            continue
        if not in_people:
            continue
        if line.startswith("### "):
            flush()
            heading = line[4:].strip()
        elif heading is not None:
            body.append(line)
    flush()
    return records
DEFAULT_URL, markdown: str | None = None) -> dict: + """Fetch (or accept) BADBOOL markdown, parse it, and write the snapshot cache.""" + md = markdown if markdown is not None else fetch(url) + records = parse(md) + storage.write_json(cache_path, records) + out = {"parsed": len(records), "cache_path": str(cache_path), "source_url": url} + if len(records) < MIN_EXPECTED: + out["warning"] = (f"only {len(records)} parsed (expected >{MIN_EXPECTED}); BADBOOL's " + "'People Search Sites' section may have moved/reorganized - check the parser") + return out diff --git a/optional-skills/security/unbroker/scripts/brokers.py b/optional-skills/security/unbroker/scripts/brokers.py new file mode 100644 index 000000000..4cb63a5c6 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/brokers.py @@ -0,0 +1,77 @@ +"""Load and query the broker database (references/brokers/*.json). + +Each broker is one JSON file for clean diffs/PRs. Files beginning with `_` are +ignored (reserved for notes/scratch). +""" +from __future__ import annotations + +import json +from pathlib import Path + +import paths +import storage + +PRIORITY_ORDER = {"crucial": 0, "high": 1, "standard": 2, "long_tail": 3} + + +def _load_curated(directory: Path | None = None) -> list[dict]: + directory = directory or paths.brokers_dir() + out: list[dict] = [] + if not directory.exists(): + return out + for fp in sorted(directory.glob("*.json")): + if fp.name.startswith("_"): + continue + out.append(json.loads(fp.read_text(encoding="utf-8"))) + return out + + +def load_live_cache() -> list[dict]: + """Records pulled from BADBOOL via `refresh-brokers` (empty until refreshed).""" + return storage.read_json(paths.brokers_cache_path(), []) or [] + + +def load_registry_cache() -> list[dict]: + """CA Data Broker Registry records (separate coverage lane; empty until refreshed). + + Kept OUT of load_all() by default: these are not people-search sites to scan, they + are worked via the CA DROP one-shot + CCPA email. 
Consumers of the scan/plan/fanout + pipeline must not receive them; use this directly for coverage counts and the DROP/ + email lanes. + """ + return storage.read_json(paths.registry_cache_path(), []) or [] + + +def load_all(directory: Path | None = None, include_live: bool = True) -> list[dict]: + """Curated records, with live BADBOOL records merged underneath (curated wins).""" + merged: dict[str, dict] = {b["id"]: b for b in _load_curated(directory)} + if include_live: + for b in load_live_cache(): + bid = b.get("id") + if bid and bid not in merged: + merged[bid] = b + out = list(merged.values()) + out.sort(key=lambda b: (PRIORITY_ORDER.get(b.get("priority", "standard"), 9), b.get("id", ""))) + return out + + +def get(broker_id: str, directory: Path | None = None) -> dict | None: + for b in load_all(directory): + if b.get("id") == broker_id: + return b + return None + + +def by_priority(*levels: str, directory: Path | None = None) -> list[dict]: + wanted = set(levels) if levels else None + return [b for b in load_all(directory) if wanted is None or b.get("priority") in wanted] + + +def clusters(directory: Path | None = None) -> dict[str, list[str]]: + """Map a parent broker id -> child site ids it can clear (force-multipliers).""" + out: dict[str, list[str]] = {} + for b in load_all(directory): + owns = b.get("owns") or [] + if owns: + out[b["id"]] = list(owns) + return out diff --git a/optional-skills/security/unbroker/scripts/cdp.py b/optional-skills/security/unbroker/scripts/cdp.py new file mode 100644 index 000000000..0d4beeaac --- /dev/null +++ b/optional-skills/security/unbroker/scripts/cdp.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +"""Launch (or detect) the operator's local Chrome/Chromium over the DevTools Protocol (CDP). + +Phase-2 work -- sending opt-out/CCPA email through the operator's logged-in webmail, and driving +session-bound multi-step opt-out gates (e.g. 
PeopleConnect guided-mode) -- must run in the +operator's OWN browser: real fingerprint, residential IP, and the operator's signed-in sessions. +A headless cloud browser (Browserbase) is the wrong tool there (it has no webmail session and is +itself anti-bot-gated on those exact flows). This module launches the operator's real Chrome with +remote debugging on a DEDICATED profile so Hermes's browser tools can attach at 127.0.0.1:<port>. + +Stdlib only; cross-platform (macOS / Linux / Windows). Nothing here touches a password or PII. +""" +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +import urllib.error +import urllib.request +from pathlib import Path + +import paths + +DEFAULT_PORT = 9222 + +# Chromium-family binaries we know how to drive, in preference order. Names first (works on any OS +# where one is on PATH), then per-OS absolute-path fallbacks below. +_PATH_NAMES = ( + "google-chrome", "google-chrome-stable", "chromium", "chromium-browser", + "brave-browser", "microsoft-edge", "microsoft-edge-stable", "chrome", +) + + +def default_profile() -> Path: + """Dedicated debug profile dir, NOT the operator's Default Chrome profile. + + Chrome refuses remote-debugging on a profile that is already open in another Chrome instance, + so we isolate the debug session in its own user-data-dir under HERMES_HOME. 
+ """ + return paths.hermes_home() / "chrome-debug" + + +def _mac_candidates() -> list[str]: + return [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser", + "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge", + "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary", + ] + + +def _windows_candidates() -> list[str]: + bases = [ + os.environ.get("ProgramFiles", r"C:\Program Files"), + os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), + os.environ.get("LOCALAPPDATA", ""), + ] + rels = [ + r"Google\Chrome\Application\chrome.exe", + r"Chromium\Application\chrome.exe", + r"BraveSoftware\Brave-Browser\Application\brave.exe", + r"Microsoft\Edge\Application\msedge.exe", + ] + out: list[str] = [] + for base in bases: + if not base: + continue + for rel in rels: + out.append(str(Path(base) / rel)) + return out + + +def find_browser(override: str | None = None) -> str | None: + """Return the first usable Chromium-family browser path/command, or None. + + `override` (an explicit path, or a command on PATH) wins when it resolves. 
+ """ + if override: + if Path(override).exists(): + return override + return shutil.which(override) # may be None -> caller reports "not found" + for name in _PATH_NAMES: + found = shutil.which(name) + if found: + return found + if sys.platform == "darwin": + candidates = _mac_candidates() + elif sys.platform == "win32": + candidates = _windows_candidates() + else: + candidates = [] + for cand in candidates: + if Path(cand).exists(): + return cand + return None + + +def launch_command(browser: str, port: int = DEFAULT_PORT, profile: Path | None = None) -> list[str]: + """The exact argv used to start the debug browser (also handy for `--print`).""" + profile = profile or default_profile() + return [ + browser, + f"--remote-debugging-port={int(port)}", + f"--user-data-dir={profile}", + "--no-first-run", + "--no-default-browser-check", + ] + + +def _http_get(url: str, timeout: float) -> bytes: + req = urllib.request.Request(url, headers={"User-Agent": "unbroker-cdp/1.0"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 (localhost only) + return resp.read() + + +def endpoint_status(port: int = DEFAULT_PORT, host: str = "127.0.0.1", + timeout: float = 1.0) -> dict | None: + """Return the CDP `/json/version` dict if a debuggable browser is live at host:port, else None. + + (Chrome restricts this endpoint to localhost/IP Host headers, so we always hit 127.0.0.1.) + """ + url = f"http://{host}:{int(port)}/json/version" + try: + raw = _http_get(url, timeout) + except (urllib.error.URLError, TimeoutError, ConnectionError, OSError, ValueError): + return None + try: + data = json.loads(raw.decode("utf-8", errors="replace")) + except (ValueError, AttributeError): + return None + return data if isinstance(data, dict) else None + + +def launch(browser: str, port: int = DEFAULT_PORT, profile: Path | None = None) -> int: + """Start the browser detached with remote debugging; return the child PID. 
+ + Detach so the browser outlives this short-lived CLI call. POSIX uses start_new_session (which + avoids referencing os.setsid, so there is no Windows import-time footgun); Windows uses + DETACHED_PROCESS + a new process group. + """ + profile = profile or default_profile() + profile.mkdir(parents=True, exist_ok=True) + cmd = launch_command(browser, port, profile) + kwargs: dict = { + "stdin": subprocess.DEVNULL, + "stdout": subprocess.DEVNULL, + "stderr": subprocess.DEVNULL, + } + if sys.platform == "win32": + kwargs["creationflags"] = ( + subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP # windows-footgun: ok + ) + else: + kwargs["start_new_session"] = True + proc = subprocess.Popen(cmd, **kwargs) + return proc.pid diff --git a/optional-skills/security/unbroker/scripts/config.py b/optional-skills/security/unbroker/scripts/config.py new file mode 100644 index 000000000..e1eb1c3c7 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/config.py @@ -0,0 +1,144 @@ +"""Install-wide configuration with easiest-first defaults. + +Everything works zero-config. `setup --auto` (the autonomous path) detects what +this environment can do and picks the MOST AUTONOMOUS valid configuration without +asking anyone; plain `setup` keeps the easiest-first defaults and only upgrades a +setting when a flag opts in. + +`autonomy` is policy, orthogonal to capability: + full - intake consent is standing authorization; the agent submits T0-T2 + opt-outs without pausing per submission (default). + assisted - the agent pauses for operator confirmation before each submission. 
+""" +from __future__ import annotations + +import os +from pathlib import Path +from shutil import which + +import emailer +import paths +import storage + +DEFAULT_CONFIG = { + "autonomy": "full", # hands-off after intake+consent + "email_mode": "draft_only", # zero credentials + "browser_backend": "auto", # auto = Browserbase when BROWSERBASE_API_KEY is set + # (recommended default; clears soft CAPTCHAs), else plain browser + "tracker_backend": "local-json", # no external dependency + "encryption": "none", # files still written 0600 + "default_rescan_interval_days": 120, + "email_min_interval_seconds": 20, # pace SMTP sends so a run can't torch the account +} + +VALID = { + "autonomy": {"full", "assisted"}, + # email_mode: + # draft_only - render drafts; the operator sends + clicks verify links (zero setup) + # browser - the agent sends + opens verify links through the operator's logged-in + # webmail via browser_* tools (NO password stored; needs a browser the + # operator's inbox is signed into) + # programmatic - CLI sends via SMTP + reads verify links via IMAP (needs EMAIL_* creds) + # alias - AgentMail agent-owned inboxes / per-broker aliases + "email_mode": {"draft_only", "browser", "programmatic", "alias"}, + "browser_backend": {"auto", "browserbase", "agent-browser", "camofox"}, + "tracker_backend": {"local-json", "google-sheets"}, + "encryption": {"none", "age"}, +} + + +def load_config() -> dict: + cfg = dict(DEFAULT_CONFIG) + cfg.update(storage.read_json(paths.config_path(), {}) or {}) + return cfg + + +def save_config(cfg: dict) -> Path: + merged = dict(DEFAULT_CONFIG) + merged.update(cfg) + for key, allowed in VALID.items(): + if merged.get(key) not in allowed: + raise ValueError(f"invalid {key!r}: {merged.get(key)!r} (allowed: {sorted(allowed)})") + return storage.write_json(paths.config_path(), merged) + + +def dotenv_env() -> dict: + """Shell env overlaid on `$HERMES_HOME/.env`, so capability detection sees the creds Hermes + loads for its own 
tools (BROWSERBASE_API_KEY, EMAIL_*, AGENTMAIL_API_KEY, ...) even though the + terminal-tool shell doesn't export them. Shell env wins; the .env only fills gaps.""" + merged: dict = {} + p = paths.hermes_home() / ".env" + if p.exists(): + try: + for line in p.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, v = line.split("=", 1) + merged[k.strip()] = v.strip().strip('"').strip("'") + except OSError: + pass + merged.update(os.environ) + return merged + + +def detect_capabilities(env: dict | None = None) -> dict: + """Report which opt-in upgrades are available without extra setup.""" + env = os.environ if env is None else env + home = paths.hermes_home() + google = ( + (home / "google_token.json").exists() + or (home / "skills" / "productivity" / "google-workspace").exists() + or (home / "skills" / "google-workspace").exists() + ) + mail = emailer.available(env) + return { + "browserbase": bool(env.get("BROWSERBASE_API_KEY")), + "agentmail": bool(env.get("AGENTMAIL_API_KEY")), + "email_imap_smtp": bool(env.get("EMAIL_ADDRESS") and env.get("EMAIL_PASSWORD")), + "smtp_send": mail["smtp"], # CLI can SEND opt-out emails itself + "imap_read": mail["imap"], # CLI can POLL verification links itself + "google_workspace": google, + "age": which("age") is not None, + } + + +def auto_configure(env: dict | None = None) -> dict: + """Pick the most autonomous configuration this environment supports (no questions). + + - email: programmatic when SMTP creds exist (CLI sends + IMAP-verifies itself); + alias mode when only AgentMail exists; draft_only as the capability floor. + - browser: browserbase when the key exists (clears soft CAPTCHAs -> more T1). + - encryption: age when the binary is installed (free privacy, zero human cost). + - tracker: stays local-json (google-sheets needs a sheet id -> a human choice). 
+ """ + caps = detect_capabilities(env) + cfg = load_config() + cfg["autonomy"] = "full" + if caps["smtp_send"]: + cfg["email_mode"] = "programmatic" + elif caps["agentmail"]: + cfg["email_mode"] = "alias" + else: + cfg["email_mode"] = "draft_only" + cfg["browser_backend"] = "browserbase" if caps["browserbase"] else "auto" + if caps["age"]: + cfg["encryption"] = "age" + return cfg + + +def browser_clears_captcha(cfg: dict, env: dict | None = None) -> bool: + """True if the chosen browser backend can clear soft CAPTCHAs (shifts T2 -> T1). + + Browserbase is the recommended default: a real residential-IP cloud browser passes + soft/managed challenges (Turnstile, hCaptcha/reCAPTCHA checkbox) as normal operation. + This is NOT solving/spoofing - hard interactive challenges still escalate to a human. + `auto` inherits this whenever BROWSERBASE_API_KEY is present. + """ + backend = cfg.get("browser_backend", "auto") + if backend == "browserbase": + return True + if backend == "auto": + env = os.environ if env is None else env + return bool(env.get("BROWSERBASE_API_KEY")) + return False diff --git a/optional-skills/security/unbroker/scripts/crypto.py b/optional-skills/security/unbroker/scripts/crypto.py new file mode 100644 index 000000000..98594f5e8 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/crypto.py @@ -0,0 +1,88 @@ +"""At-rest encryption for sensitive files via the `age` binary (optional). + +Engaged ONLY when config `encryption: age` AND an age identity key exists AND the +`age`/`age-keygen` binaries are available. When engaged, JSON docs under +`subjects/` (dossier, ledger) are written as `<file>.age` ciphertext; the audit +log (field NAMES + states only, no raw PII values), `config.json`, and the broker +cache stay plaintext so the engine can read them. + +Threat model (be honest): this protects against casual disk inspection, accidental +`git add`/commits, screen-shares, and backup/cloud-sync leakage. 
The identity key +defaults to living beside the data at `$PDD_DATA_DIR/age-identity.txt` (0600); set +`PDD_AGE_IDENTITY` to a separate volume/token for true key separation. It does NOT +protect against an attacker who can already read your whole HERMES_HOME (they get +key + data together). +""" +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from shutil import which + +import paths + + +def age_available() -> bool: + return which("age") is not None and which("age-keygen") is not None + + +def encryption_setting() -> str: + """Read `encryption` straight from config.json (no config/storage import => no cycle).""" + cfg = paths.config_path() + if not cfg.exists(): + return "none" + try: + return (json.loads(cfg.read_text(encoding="utf-8")) or {}).get("encryption", "none") + except (ValueError, OSError): + return "none" + + +def identity_path() -> Path: + return paths.age_identity_path() + + +def ensure_identity() -> Path: + """Generate an age identity (X25519 keypair) if missing; return its path.""" + if not age_available(): + raise RuntimeError("`age`/`age-keygen` not found; cannot enable encryption") + p = identity_path() + if not p.exists(): + p.parent.mkdir(parents=True, exist_ok=True) + try: + p.parent.chmod(0o700) + except OSError: + pass + subprocess.run(["age-keygen", "-o", str(p)], check=True, capture_output=True) + try: + p.chmod(0o600) + except OSError: + pass + return p + + +def recipient() -> str: + """The age public key (recipient) for the identity, parsed from its header.""" + p = ensure_identity() + for line in p.read_text(encoding="utf-8").splitlines(): + s = line.strip() + if s.lower().startswith("# public key:"): + return s.split(":", 1)[1].strip() + if s.startswith("age1"): + return s + raise RuntimeError(f"no public key found in {p}") + + +def is_engaged() -> bool: + """True only when encryption is actually active (configured + available + key present).""" + return encryption_setting() == "age" and 
age_available() and identity_path().exists() + + +def encrypt(data: bytes) -> bytes: + out = subprocess.run(["age", "-r", recipient()], input=data, capture_output=True, check=True) + return out.stdout + + +def decrypt(data: bytes) -> bytes: + out = subprocess.run(["age", "-d", "-i", str(identity_path())], input=data, capture_output=True, check=True) + return out.stdout diff --git a/optional-skills/security/unbroker/scripts/dossier.py b/optional-skills/security/unbroker/scripts/dossier.py new file mode 100644 index 000000000..50b4c8c6b --- /dev/null +++ b/optional-skills/security/unbroker/scripts/dossier.py @@ -0,0 +1,135 @@ +"""Subject dossier management + consent gate + least-disclosure field selection.""" +from __future__ import annotations + +import datetime as _dt +import hashlib +import os +from pathlib import Path + +import paths +import storage + +# Identifiers we never volunteer in an opt-out (would expand exposure, not reduce it). +NEVER_VOLUNTEER = {"ssn", "social_security_number", "passport", "drivers_license"} + +VALID_CONSENT_METHODS = {"self", "written_authorization", "poa"} + + +def now() -> str: + return _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def new_subject_id(full_name: str = "") -> str: + # Opaque id: derives NOTHING from the name, so PII never leaks into directory names, + # case ids, drafts, or the audit log. full_name kept only for call compatibility. 
+ return "sub_" + hashlib.sha1(os.urandom(8)).hexdigest()[:10] + + +def create(identity: dict, consent: dict, residency: str = "US", prefs: dict | None = None) -> dict: + dossier = { + "subject_id": new_subject_id(identity.get("full_name", "subject")), + "consent": consent, + "identity": identity, + "residency_jurisdiction": residency, + "preferences": prefs or {"email_mode": "draft_only", "rescan_interval_days": 120}, + "created_at": now(), + } + save(dossier) + return dossier + + +def load(subject_id: str) -> dict | None: + return storage.read_json(paths.dossier_path(subject_id), None) + + +def save(dossier: dict) -> Path: + return storage.write_json(paths.dossier_path(dossier["subject_id"]), dossier) + + +def is_authorized(dossier: dict) -> bool: + c = dossier.get("consent") or {} + return bool(c.get("authorized")) and c.get("method") in VALID_CONSENT_METHODS + + +def require_authorized(dossier: dict) -> None: + if not is_authorized(dossier): + raise PermissionError( + f"subject {dossier.get('subject_id')!r} has no recorded authorization; refusing to act" + ) + + +def all_names(dossier: dict) -> list[str]: + """Primary name + aliases (maiden/married/nicknames), deduped, in priority order.""" + ident = dossier.get("identity", {}) + out: list[str] = [] + seen: set[str] = set() + for n in [ident.get("full_name"), *(ident.get("also_known_as") or [])]: + if n and n.lower() not in seen: + seen.add(n.lower()) + out.append(n) + return out + + +def all_addresses(dossier: dict) -> list[dict]: + """Current + prior addresses, each tagged with `kind` (current|prior).""" + ident = dossier.get("identity", {}) + out: list[dict] = [] + cur = ident.get("current_address") + if cur: + out.append({**cur, "kind": cur.get("kind", "current")}) + for a in ident.get("prior_addresses") or []: + out.append({**a, "kind": a.get("kind", "prior")}) + return out + + +def all_locations(dossier: dict) -> list[dict]: + """Distinct city/state pairs across all addresses (the vectors for name 
searches).""" + out: list[dict] = [] + seen: set[tuple] = set() + for a in all_addresses(dossier): + city = a.get("city") + key = ((city or "").lower(), (a.get("state") or "").lower()) + if city and key not in seen: + seen.add(key) + out.append({"city": city, "state": a.get("state")}) + return out + + +def contact_email(dossier: dict) -> str | None: + """The single email used for opt-out correspondence (designated, else the first).""" + ident = dossier.get("identity", {}) + prefs = dossier.get("preferences", {}) + emails = ident.get("emails") or [] + return prefs.get("contact_email_for_optouts") or (emails[0] if emails else None) + + +def select_disclosure(dossier: dict, inputs: list[str], override_email: str | None = None) -> dict: + """Return ONLY the dossier fields a broker's opt-out actually requires. + + Enforces least-disclosure: skips anything in NEVER_VOLUNTEER, and skips + `profile_url` (that is captured per-listing at submit time, not from the dossier). + A single contact email is used for correspondence even when the subject has several + (see all_names / all_addresses / search vectors for using every alternate to *find* listings). 
+ """ + ident = dossier.get("identity", {}) + addr = ident.get("current_address") or {} + phones = ident.get("phones") or [] + available = { + "full_name": ident.get("full_name"), + "first_name": (ident.get("full_name") or "").split(" ")[0] or None, + "contact_email": override_email or contact_email(dossier), + "current_address": addr or None, + "street": addr.get("line1"), + "city": addr.get("city"), + "state": addr.get("state"), + "postal": addr.get("postal"), + "date_of_birth": ident.get("date_of_birth"), + "phone": phones[0] if phones else None, + } + out: dict = {} + for key in inputs: + if key in NEVER_VOLUNTEER or key == "profile_url": + continue + if available.get(key) is not None: + out[key] = available[key] + return out diff --git a/optional-skills/security/unbroker/scripts/email_modes.py b/optional-skills/security/unbroker/scripts/email_modes.py new file mode 100644 index 000000000..d5b40eb84 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/email_modes.py @@ -0,0 +1,76 @@ +"""Email modes A/B/C helpers + anti-phishing verification-link extraction. + +Mode A (default): render a ready-to-send draft to disk; the operator sends it. +Mode B/C: the agent SENDS via a Hermes email mechanism (IMAP/SMTP gateway, +`himalaya`, AgentMail, or Gmail via `google-workspace`) and READS the reply to +resolve the verification link with `extract_verification_link`. Those transports +are driven by the agent through native tools; this module stays network-free so +the hermetic tests pass. 
+""" +from __future__ import annotations + +import re +from pathlib import Path + +import legal +import paths + +_LINK_RE = re.compile(r"https?://[^\s\"'<>)\]]+", re.IGNORECASE) +_VERIFY_HINTS = ("opt", "remov", "verif", "confirm", "unsubscrib", "suppress", "delete", "privacy") + + +def render_draft(broker: dict, fields: dict, out_dir: Path | None = None) -> Path: + """Mode A: write a ready-to-send opt-out email for the operator to send.""" + body = legal.render_optout_email(broker, fields) + out_dir = out_dir or (paths.data_dir() / "drafts") + out_dir.mkdir(parents=True, exist_ok=True) + fp = out_dir / f"{broker.get('id', 'broker')}.txt" + fp.write_text(body, encoding="utf-8") + return fp + + +def render_request_draft(broker: dict, fields: dict, kind: str = "generic", + out_dir: Path | None = None) -> Path: + """Mode A: write a ready-to-send request of a specific KIND. + + kind: generic | ccpa | ccpa_agent | ccpa_indirect | gdpr. Used for indirect-exposure + (ccpa_indirect) and explicit legal requests, where the generic opt-out wording is wrong. + The filename is suffixed with the kind so an indirect request does not overwrite an opt-out draft. + """ + body = legal.render_request(kind, broker, fields) + out_dir = out_dir or (paths.data_dir() / "drafts") + out_dir.mkdir(parents=True, exist_ok=True) + suffix = "" if kind == "generic" else f"-{kind}" + fp = out_dir / f"{broker.get('id', 'broker')}{suffix}.txt" + fp.write_text(body, encoding="utf-8") + return fp + + +def extract_verification_link(email_body: str, broker: dict | None = None) -> str | None: + """Return the most likely opt-out/verification link from an email body. + + Anti-phishing: a link is only returned if its URL matches an opt-out hint + and/or the broker's own domain; arbitrary links score 0 and are ignored. 
+ """ + candidates = _LINK_RE.findall(email_body or "") + if not candidates: + return None + + domain = "" + if broker: + url = (broker.get("optout") or {}).get("url") or (broker.get("search") or {}).get("url") or "" + m = re.search(r"https?://([^/]+)", url) + if m: + domain = m.group(1).replace("www.", "") + + best_score, best_link = 0, None + for link in candidates: + low = link.lower() + score = 0 + if any(h in low for h in _VERIFY_HINTS): + score += 2 + if domain and domain in low: + score += 3 + if score > best_score: + best_score, best_link = score, link + return best_link diff --git a/optional-skills/security/unbroker/scripts/emailer.py b/optional-skills/security/unbroker/scripts/emailer.py new file mode 100644 index 000000000..927bc1535 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/emailer.py @@ -0,0 +1,342 @@ +"""Programmatic email (Mode B) via stdlib smtplib/imaplib - no human in the loop. + +This is what turns email opt-outs autonomous: `send()` delivers the rendered +request straight to the broker's known opt-out address, and `find_verification_link()` +polls the inbox for the broker's confirmation email and extracts the link (scored +by email_modes.extract_verification_link, so arbitrary/phishing links are ignored). +The agent still OPENS the link with its own browser - several brokers bind the +verification session to the browser that opens it (see the intelius record). + +Configuration comes from the same env vars the Hermes email gateway uses: + EMAIL_ADDRESS / EMAIL_PASSWORD (required for Mode B) + EMAIL_SMTP_HOST / EMAIL_SMTP_PORT (optional; inferred for common providers) + EMAIL_IMAP_HOST / EMAIL_IMAP_PORT (optional; inferred for common providers) + +Anti-misuse: `send()` refuses a recipient that is not the broker record's own +opt-out/privacy address - this module cannot be repurposed to email arbitrary people. +All network calls live behind small functions that the hermetic tests monkeypatch. 
+""" +from __future__ import annotations + +import email as _email +import email.utils +import imaplib +import json +import os +import re +import smtplib +import time +from email.message import EmailMessage +from pathlib import Path + +import email_modes +import paths + +# provider domain -> (smtp_host, smtp_port, imap_host, imap_port) +PROVIDERS = { + "gmail.com": ("smtp.gmail.com", 587, "imap.gmail.com", 993), + "googlemail.com": ("smtp.gmail.com", 587, "imap.gmail.com", 993), + "outlook.com": ("smtp-mail.outlook.com", 587, "outlook.office365.com", 993), + "hotmail.com": ("smtp-mail.outlook.com", 587, "outlook.office365.com", 993), + "live.com": ("smtp-mail.outlook.com", 587, "outlook.office365.com", 993), + "yahoo.com": ("smtp.mail.yahoo.com", 587, "imap.mail.yahoo.com", 993), + "icloud.com": ("smtp.mail.me.com", 587, "imap.mail.me.com", 993), + "me.com": ("smtp.mail.me.com", 587, "imap.mail.me.com", 993), + "fastmail.com": ("smtp.fastmail.com", 587, "imap.fastmail.com", 993), +} + + +def _domain(address: str) -> str: + return address.rsplit("@", 1)[-1].lower() if "@" in address else "" + + +def smtp_settings(env: dict | None = None) -> dict | None: + """SMTP connection settings, or None when sending is not configured.""" + env = os.environ if env is None else env + address, password = env.get("EMAIL_ADDRESS"), env.get("EMAIL_PASSWORD") + if not (address and password): + return None + inferred = PROVIDERS.get(_domain(address)) + host = env.get("EMAIL_SMTP_HOST") or (inferred[0] if inferred else None) + if not host: + return None # unknown provider and no explicit host + port = int(env.get("EMAIL_SMTP_PORT") or (inferred[1] if inferred else 587)) + return {"host": host, "port": port, "address": address, "password": password} + + +def imap_settings(env: dict | None = None) -> dict | None: + """IMAP connection settings, or None when inbox reading is not configured.""" + env = os.environ if env is None else env + address, password = env.get("EMAIL_ADDRESS"), 
env.get("EMAIL_PASSWORD") + if not (address and password): + return None + inferred = PROVIDERS.get(_domain(address)) + host = env.get("EMAIL_IMAP_HOST") or (inferred[2] if inferred else None) + if not host: + return None + port = int(env.get("EMAIL_IMAP_PORT") or (inferred[3] if inferred else 993)) + return {"host": host, "port": port, "address": address, "password": password} + + +def available(env: dict | None = None) -> dict: + return {"smtp": smtp_settings(env) is not None, "imap": imap_settings(env) is not None} + + +# --- sending ------------------------------------------------------------------ + +def broker_addresses(broker: dict) -> list[str]: + """Every address the broker record itself declares (the ONLY valid recipients). + + Includes the primary opt-out email, the right-to-delete lane's email + (optout.deletion.email), and any mailto: links parsed from BADBOOL. + """ + opt = broker.get("optout") or {} + out = [a for a in [opt.get("email"), (opt.get("deletion") or {}).get("email")] if a] + for link in opt.get("links") or []: + url = (link.get("url") or "") + if url.lower().startswith("mailto:"): + out.append(url[7:].split("?")[0]) + seen: set[str] = set() + deduped = [] + for a in out: + if a.lower() not in seen: + seen.add(a.lower()) + deduped.append(a) + return deduped + + +def _split_subject_body(text: str) -> tuple[str, str]: + """Templates start with a 'Subject: ...' line; split it out for the MIME header.""" + lines = text.splitlines() + if lines and lines[0].lower().startswith("subject:"): + return lines[0].split(":", 1)[1].strip(), "\n".join(lines[1:]).lstrip("\n") + return "Data removal request", text + + +def browser_send_payload(broker: dict, body_text: str, to: str | None = None) -> dict: + """Build a recipient-locked {to, subject, body} for the agent to send via browser webmail. 
+ + No network and no credentials: the deterministic part (recipient-lock to the broker's own + declared address, subject/body split) happens here; the agent then composes and sends it in + the operator's logged-in webmail with browser_* tools. Same recipient guard as `send()`, so + the browser lane cannot be pointed at an arbitrary person either. + """ + allowed = broker_addresses(broker) + if not allowed: + raise RuntimeError(f"broker {broker.get('id')!r} declares no opt-out email address") + recipient = to or allowed[0] + if recipient.lower() not in {a.lower() for a in allowed}: + raise PermissionError( + f"refusing to target {recipient!r}: not an address the broker record declares " + f"(allowed: {allowed})" + ) + subject, body = _split_subject_body(body_text) + return {"to": recipient, "subject": subject, "body": body} + + +def _rate_limit_path() -> Path: + return paths.data_dir() / "email-rate.json" + + +def _respect_rate_limit(min_interval: float, sleep, now, state_path=None) -> None: + """Pace sends across CLI invocations so a run can't torch the sending account. + + Persists the last-send wall-clock time; if the next send is too soon, sleep the + remainder. Cross-process because each `send-email` is a separate invocation. + """ + if min_interval <= 0: + return + p = state_path or _rate_limit_path() + last = 0.0 + try: + last = float(json.loads(p.read_text(encoding="utf-8")).get("last", 0.0)) + except (OSError, ValueError, TypeError): + last = 0.0 + wait = min_interval - (now() - last) + if wait > 0: + sleep(min(wait, min_interval)) + try: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(json.dumps({"last": now()}), encoding="utf-8") + except OSError: + pass + + +# SMTP errors that are permanent (don't retry) vs transient (retry with backoff). 
+_SMTP_PERMANENT = (smtplib.SMTPAuthenticationError, smtplib.SMTPRecipientsRefused, + smtplib.SMTPSenderRefused, smtplib.SMTPDataError) + + +def send(broker: dict, body_text: str, to: str | None = None, + env: dict | None = None, _smtp_factory=None, + min_interval: float = 0.0, max_retries: int = 3, + _sleep=time.sleep, _now=time.time, _rate_state=None) -> dict: + """Send an opt-out/legal request to the broker's own opt-out address. + + Recipient is locked to an address the broker record declares (PermissionError + otherwise). `min_interval` paces sends across invocations (deliverability / + account-safety); transient SMTP/socket failures retry with exponential backoff, + permanent ones (auth, recipient refused) raise immediately. NOTE: a successful + SMTP handoff is NOT proof of delivery - real bounces arrive later as inbound mail; + in programmatic mode `poll-verification`/inbox review surfaces them, and the + due-queue re-scan is the true confirmation. Returns send metadata. + """ + settings = smtp_settings(env) + if not settings: + raise RuntimeError( + "programmatic email not configured (need EMAIL_ADDRESS + EMAIL_PASSWORD, and " + "EMAIL_SMTP_HOST for non-mainstream providers); fall back to `render-email` drafts" + ) + allowed = broker_addresses(broker) + if not allowed: + raise RuntimeError(f"broker {broker.get('id')!r} declares no opt-out email address") + recipient = to or allowed[0] + if recipient.lower() not in {a.lower() for a in allowed}: + raise PermissionError( + f"refusing to send to {recipient!r}: not an address the broker record declares " + f"(allowed: {allowed})" + ) + + subject, body = _split_subject_body(body_text) + msg = EmailMessage() + msg["From"] = settings["address"] + msg["To"] = recipient + msg["Subject"] = subject + msg["Date"] = email.utils.formatdate(localtime=True) + msg["Message-ID"] = email.utils.make_msgid() + msg.set_content(body) + + _respect_rate_limit(min_interval, _sleep, _now, _rate_state) + + factory = _smtp_factory or 
smtplib.SMTP + attempts = 0 + while True: + attempts += 1 + try: + with factory(settings["host"], settings["port"], timeout=30) as smtp: + smtp.ehlo() + try: + smtp.starttls() + smtp.ehlo() + except smtplib.SMTPNotSupportedError: + pass # already-TLS ports / test doubles + smtp.login(settings["address"], settings["password"]) + smtp.send_message(msg) + break + except _SMTP_PERMANENT: + raise # auth / recipient refused: retrying won't help + except (smtplib.SMTPException, OSError) as exc: + if attempts > max_retries: + raise RuntimeError(f"SMTP send failed after {attempts} attempts: {exc}") from exc + _sleep(min(2 ** (attempts - 1), 30)) # 1s, 2s, 4s... capped + return {"to": recipient, "subject": subject, "message_id": msg["Message-ID"], + "from": settings["address"], "attempts": attempts, + "delivery_note": "SMTP accepted; not proof of delivery - a bounce would arrive as " + "inbound mail. The due-queue re-scan is the real confirmation."} + + +# --- inbox polling ------------------------------------------------------------ + +def _decode_part(part) -> str: + try: + payload = part.get_payload(decode=True) + if payload is None: + return "" + charset = part.get_content_charset() or "utf-8" + return payload.decode(charset, errors="replace") + except Exception: # noqa: BLE001 - malformed MIME must not kill the poll + return "" + + +def message_text(msg) -> str: + """All text/plain + text/html content of a parsed email message.""" + chunks: list[str] = [] + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() in ("text/plain", "text/html"): + chunks.append(_decode_part(part)) + else: + chunks.append(_decode_part(msg)) + return "\n".join(c for c in chunks if c) + + +def _broker_domains(broker: dict) -> list[str]: + """Domains this broker legitimately mails from (site domains + optout email domain).""" + domains: list[str] = [] + for section in ("optout", "search"): + url = ((broker.get(section) or {}).get("url")) or "" + m = 
re.search(r"https?://([^/]+)", url) + if m: + domains.append(m.group(1).lower().removeprefix("www.")) + opt_email = (broker.get("optout") or {}).get("email") + if opt_email and "@" in opt_email: + domains.append(_domain(opt_email)) + # strip subdomains to the registrable-ish tail (mailer.intelius.com -> intelius.com) + tails = {".".join(d.split(".")[-2:]) for d in domains if d} + return sorted(tails) + + +def fetch_recent(env: dict | None = None, since_days: int = 3, limit: int = 30, + _imap_factory=None) -> list[dict]: + """Fetch recent inbox messages: [{from, subject, date, text}], newest first.""" + settings = imap_settings(env) + if not settings: + raise RuntimeError("IMAP not configured (need EMAIL_ADDRESS + EMAIL_PASSWORD, and " + "EMAIL_IMAP_HOST for non-mainstream providers)") + import datetime as _dt + since = (_dt.date.today() - _dt.timedelta(days=max(0, since_days))).strftime("%d-%b-%Y") + + factory = _imap_factory or imaplib.IMAP4_SSL + conn = factory(settings["host"], settings["port"]) + try: + conn.login(settings["address"], settings["password"]) + conn.select("INBOX", readonly=True) + _typ, data = conn.search(None, "SINCE", since) + ids = (data[0].split() if data and data[0] else [])[-limit:] + out: list[dict] = [] + for mid in reversed(ids): # newest first + _typ, msg_data = conn.fetch(mid, "(RFC822)") + raw = next((p[1] for p in msg_data or [] if isinstance(p, tuple)), None) + if not raw: + continue + msg = _email.message_from_bytes(raw) + out.append({ + "from": msg.get("From", ""), + "subject": msg.get("Subject", ""), + "date": msg.get("Date", ""), + "text": message_text(msg), + }) + return out + finally: + try: + conn.logout() + except Exception: # noqa: BLE001 + pass + + +def link_from_messages(messages: list[dict], broker: dict) -> dict | None: + """Pure: find the broker's verification link in already-fetched messages. 
+ + A message is only considered if its From domain OR any contained link matches + the broker's own domains; the link itself must pass the anti-phishing scorer. + """ + domains = _broker_domains(broker) + for m in messages: + sender = (m.get("from") or "").lower() + text = m.get("text") or "" + sender_match = any(d in sender for d in domains) + body_match = any(d in text.lower() for d in domains) + if not (sender_match or body_match): + continue + link = email_modes.extract_verification_link(text, broker) + if link: + return {"link": link, "from": m.get("from"), "subject": m.get("subject"), + "date": m.get("date")} + return None + + +def find_verification_link(broker: dict, env: dict | None = None, since_days: int = 3, + _imap_factory=None) -> dict | None: + """Poll the inbox and return the broker's verification link (or None yet).""" + messages = fetch_recent(env, since_days=since_days, _imap_factory=_imap_factory) + return link_from_messages(messages, broker) diff --git a/optional-skills/security/unbroker/scripts/ledger.py b/optional-skills/security/unbroker/scripts/ledger.py new file mode 100644 index 000000000..2483ee6a8 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/ledger.py @@ -0,0 +1,170 @@ +"""Case ledger: opt-out state machine + append-only audit log. + +A "case" is one (subject x broker) record. State changes are validated against +TRANSITIONS and mirrored into audit.jsonl so every action is auditable. 
+""" +from __future__ import annotations + +import datetime as _dt +from pathlib import Path + +import paths +import storage + +STATES = [ + "new", "searching", "not_found", "found", "indirect_exposure", "action_selected", "submitted", + "verification_pending", "awaiting_processing", "confirmed_removed", "reappeared", + "human_task_queued", "blocked", +] + +TRANSITIONS: dict[str, set[str]] = { + "new": {"searching", "found", "not_found", "indirect_exposure", "blocked"}, + "searching": {"not_found", "found", "indirect_exposure", "blocked"}, + "not_found": {"searching", "found", "indirect_exposure", "blocked"}, + # found -> not_found: a parent re-verification (or re-scan) found the "found" was a false + # positive (namesake, or an address-only property-record match) -- retract it with evidence. + "found": {"action_selected", "submitted", "human_task_queued", "indirect_exposure", "blocked", + "not_found"}, + # indirect_exposure: subject's PII (email/phone/name) sits on a THIRD PARTY's record. The + # self-service opt-out form does not apply; the lever is a targeted CCPA/GDPR delete-my-PII + # request (-> submitted) or a human task. Re-scan can clear it (-> not_found) or upgrade it to a + # direct listing (-> found). + "indirect_exposure": {"submitted", "human_task_queued", "not_found", "found", "blocked"}, + "action_selected": {"submitted", "human_task_queued", "blocked"}, + "submitted": {"verification_pending", "awaiting_processing", "human_task_queued", "blocked"}, + # verification_pending -> awaiting_processing: the verify link was opened/acknowledged and the + # broker is now processing the removal (their stated window). confirmed_removed still requires a + # verifying re-scan, never the submission flow's own say-so. 
+ "verification_pending": {"awaiting_processing", "confirmed_removed", "human_task_queued", "blocked"}, + "awaiting_processing": {"confirmed_removed", "human_task_queued", "blocked"}, + "confirmed_removed": {"reappeared", "confirmed_removed"}, + "reappeared": {"found", "indirect_exposure"}, + "human_task_queued": { + "found", "indirect_exposure", "action_selected", "submitted", "verification_pending", + "awaiting_processing", "confirmed_removed", "blocked", + }, + # blocked: automated tools (web_extract/proxyless browser) couldn't read the site. A later pass + # -- a stealth/cloud browser OR guiding the operator's own (residential) browser -- can resolve it + # to any real scan verdict, so blocked reaches not_found / indirect_exposure too, not just found. + # blocked -> human_task_queued: some blocked sites need an operator step to proceed at all + # (face-recognition sites needing a selfie/gov-ID, etc.), so route them to the digest. + "blocked": {"searching", "found", "not_found", "indirect_exposure", "action_selected", + "human_task_queued"}, +} + + +def now() -> str: + return _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def load(subject_id: str) -> dict: + return storage.read_json(paths.ledger_path(subject_id), {}) or {} + + +def save(subject_id: str, ledger: dict) -> Path: + return storage.write_json(paths.ledger_path(subject_id), ledger) + + +def new_case(subject_id: str, broker_id: str) -> dict: + return { + "case_id": f"case_{subject_id}_{broker_id}", + "subject_id": subject_id, + "broker_id": broker_id, + "state": "new", + "found": None, + "evidence": {}, + "disclosure_log": [], + "history": [], + } + + +def get_case(subject_id: str, broker_id: str) -> dict: + return load(subject_id).get(broker_id) or new_case(subject_id, broker_id) + + +def can_transition(old: str, new: str) -> bool: + return new == old or new in TRANSITIONS.get(old, set()) + + +def transition(subject_id: str, broker_id: str, new_state: str, **fields) -> dict: + 
if new_state not in STATES: + raise ValueError(f"unknown state {new_state!r}") + # Lock the whole load-modify-save so a concurrent cron re-scan / other tenant + # can't read a stale ledger and clobber this transition. + with storage.locked(paths.ledger_path(subject_id)): + ledger = load(subject_id) + case = ledger.get(broker_id) or new_case(subject_id, broker_id) + old = case.get("state", "new") + if not can_transition(old, new_state): + raise ValueError(f"illegal transition {old!r} -> {new_state!r} for broker {broker_id!r}") + case["state"] = new_state + for key, value in fields.items(): + case[key] = value + stamp = now() + case.setdefault("history", []).append({"at": stamp, "from": old, "to": new_state}) + ledger[broker_id] = case + save(subject_id, ledger) + storage.append_jsonl( + paths.audit_path(subject_id), + {"at": stamp, "broker_id": broker_id, "event": "transition", "from": old, "to": new_state}, + ) + return case + + +DEFAULT_PROCESSING_DAYS = 14 # when a broker record doesn't state est_processing_days +VERIFICATION_POLL_DAYS = 1 # how soon to re-poll for an unarrived verification email + + +def _plus_days(days: int, start: str | None = None) -> str: + base = _dt.datetime.strptime(start, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=_dt.timezone.utc) \ + if start else _dt.datetime.now(_dt.timezone.utc) + return (base + _dt.timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def followup_fields(new_state: str, broker: dict | None = None, + dossier: dict | None = None) -> dict: + """Auto-scheduling stamps for a transition, so nobody has to remember follow-ups. + + submitted / awaiting_processing -> recheck after the broker's stated processing window; + verification_pending -> re-poll the inbox quickly; + confirmed_removed -> periodic reappearance re-scan per subject preference. 
+ """ + if new_state in ("submitted", "awaiting_processing"): + days = ((broker or {}).get("optout") or {}).get("est_processing_days") or DEFAULT_PROCESSING_DAYS + return {"next_recheck_at": _plus_days(int(days))} + if new_state == "verification_pending": + return {"next_recheck_at": _plus_days(VERIFICATION_POLL_DAYS)} + if new_state == "confirmed_removed": + interval = ((dossier or {}).get("preferences") or {}).get("rescan_interval_days") or 120 + return {"removal_confirmed_at": now(), "next_recheck_at": _plus_days(int(interval))} + return {} + + +def due(subject_id: str, at: str | None = None, ledger: dict | None = None) -> list[dict]: + """Cases whose next_recheck_at has arrived - the autonomous follow-up queue.""" + stamp = at or now() + out = [] + for case in (ledger if ledger is not None else load(subject_id)).values(): + when = case.get("next_recheck_at") + if when and when <= stamp: + out.append(case) + out.sort(key=lambda c: c.get("next_recheck_at") or "") + return out + + +def log_disclosure(subject_id: str, broker_id: str, fields: list[str], channel: str) -> dict: + """Record exactly which PII field *names* were disclosed to a broker.""" + with storage.locked(paths.ledger_path(subject_id)): + ledger = load(subject_id) + case = ledger.get(broker_id) or new_case(subject_id, broker_id) + stamp = now() + record = {"at": stamp, "fields": sorted(fields), "channel": channel} + case.setdefault("disclosure_log", []).append(record) + ledger[broker_id] = case + save(subject_id, ledger) + storage.append_jsonl( + paths.audit_path(subject_id), + {"at": stamp, "broker_id": broker_id, "event": "disclosure", + "fields": record["fields"], "channel": channel}, + ) + return record diff --git a/optional-skills/security/unbroker/scripts/legal.py b/optional-skills/security/unbroker/scripts/legal.py new file mode 100644 index 000000000..325687b27 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/legal.py @@ -0,0 +1,63 @@ +"""Render opt-out / legal request text from 
templates/ with safe substitution. + +Templates use {field} placeholders. Missing fields are left literal (never crash, +never inject blanks that look like real data). Field values come from the +least-disclosure selection in dossier.select_disclosure. +""" +from __future__ import annotations + +from pathlib import Path + +import paths + + +class _SafeDict(dict): + def __missing__(self, key): # leave unknown placeholders untouched + return "{" + key + "}" + + +def template_path(name: str) -> Path: + return paths.templates_dir() / name + + +def render(template_name: str, fields: dict) -> str: + text = template_path(template_name).read_text(encoding="utf-8") + return text.format_map(_SafeDict(fields)) + + +def _join_listings(value) -> str: + if isinstance(value, (list, tuple)): + return "\n".join(str(v) for v in value) + return str(value or "") + + +def _join_identifiers(value) -> str: + """Render the subject's OWN identifiers as a bullet list for an indirect-exposure request.""" + if isinstance(value, (list, tuple)): + return "\n".join(f" - {v}" for v in value if v) + return f" - {value}" if value else "" + + +def render_optout_email(broker: dict, fields: dict) -> str: + ctx = dict(fields) + ctx.setdefault("broker_name", broker.get("name", "the data broker")) + ctx["listing_urls"] = _join_listings(fields.get("listing_urls")) + ctx.setdefault("full_name", fields.get("full_name", "[your name]")) + ctx.setdefault("contact_email", fields.get("contact_email", "[your email]")) + return render("emails/generic-optout.txt", ctx) + + +def render_request(kind: str, broker: dict, fields: dict) -> str: + """kind: generic | ccpa | ccpa_agent | ccpa_indirect | gdpr""" + template = { + "generic": "emails/generic-optout.txt", + "ccpa": "emails/ccpa-deletion.txt", + "ccpa_agent": "emails/ccpa-authorized-agent.txt", + "ccpa_indirect": "emails/ccpa-indirect-deletion.txt", + "gdpr": "emails/gdpr-erasure.txt", + }.get(kind, "emails/generic-optout.txt") + ctx = dict(fields) + 
ctx.setdefault("broker_name", broker.get("name", "the data broker")) + ctx["listing_urls"] = _join_listings(fields.get("listing_urls")) + ctx["my_identifiers"] = _join_identifiers(fields.get("my_identifiers")) + return render(template, ctx) diff --git a/optional-skills/security/unbroker/scripts/paths.py b/optional-skills/security/unbroker/scripts/paths.py new file mode 100644 index 000000000..887748f41 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/paths.py @@ -0,0 +1,79 @@ +"""Filesystem paths for the unbroker skill (stdlib only). + +All per-subject data lives under PDD_DATA_DIR (default: $HERMES_HOME/unbroker), +which is the same trust boundary Hermes uses for .env and OAuth tokens. +""" +from __future__ import annotations + +import os +from pathlib import Path + + +def hermes_home() -> Path: + return Path(os.environ.get("HERMES_HOME") or (Path.home() / ".hermes")) + + +def data_dir() -> Path: + override = os.environ.get("PDD_DATA_DIR") + return Path(override) if override else hermes_home() / "unbroker" + + +def config_path() -> Path: + return data_dir() / "config.json" + + +def subjects_dir() -> Path: + return data_dir() / "subjects" + + +def subject_dir(subject_id: str) -> Path: + return subjects_dir() / subject_id + + +def dossier_path(subject_id: str) -> Path: + return subject_dir(subject_id) / "dossier.json" + + +def ledger_path(subject_id: str) -> Path: + return subject_dir(subject_id) / "ledger.json" + + +def audit_path(subject_id: str) -> Path: + return subject_dir(subject_id) / "audit.jsonl" + + +def evidence_dir(subject_id: str) -> Path: + return subject_dir(subject_id) / "evidence" + + +def skill_root() -> Path: + """The skill directory (parent of scripts/).""" + return Path(__file__).resolve().parent.parent + + +def brokers_dir() -> Path: + return skill_root() / "references" / "brokers" + + +def brokers_cache_path() -> Path: + """Live broker snapshot pulled from BADBOOL (merged under the curated DB).""" + return data_dir() / 
"brokers-cache" / "badbool.json" + + +def registry_cache_path() -> Path: + """CA Data Broker Registry snapshot (separate coverage lane; DROP/email, not scanned).""" + return data_dir() / "brokers-cache" / "ca-registry.json" + + +def age_identity_path() -> Path: + """age identity (private key) used for at-rest encryption when enabled. + + Defaults beside the data; point PDD_AGE_IDENTITY at a separate volume/token + for real key separation from the encrypted data. + """ + override = os.environ.get("PDD_AGE_IDENTITY") + return Path(override) if override else data_dir() / "age-identity.txt" + + +def templates_dir() -> Path: + return skill_root() / "templates" diff --git a/optional-skills/security/unbroker/scripts/pdd.py b/optional-skills/security/unbroker/scripts/pdd.py new file mode 100644 index 000000000..ab9f77b78 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/pdd.py @@ -0,0 +1,914 @@ +#!/usr/bin/env python3 +"""unbroker - deterministic CLI helper. + +The Hermes agent orchestrates scanning and opt-out submission with native tools +(`web_extract`, `browser_navigate`, email mechanisms). THIS CLI owns the +deterministic state: config, dossiers + consent, the broker DB, tier planning, +the ledger + audit log, draft/template rendering, and reports. + +Run it through the `terminal` tool (it can read PII files under HERMES_HOME); +do NOT run it through `execute_code` (that sandbox scrubs env and redacts output). + +Examples: + python pdd.py setup + python pdd.py intake --full-name "Jane Q. Public" --email jane@example.com \ + --city Oakland --state CA --residency US-CA --consent --consent-method self + python pdd.py plan sub_xxxx --priority crucial + python pdd.py record sub_xxxx spokeo found --found true \ + --evidence '{"listing_urls":["https://www.spokeo.com/..."]}' + python pdd.py render-email sub_xxxx spokeo --listing https://www.spokeo.com/... 
+ python pdd.py status sub_xxxx +""" +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import autopilot # noqa: E402 +import badbool # noqa: E402 +import cdp # noqa: E402 +import brokers as brokers_mod # noqa: E402 +import config as config_mod # noqa: E402 +import crypto # noqa: E402 +import dossier as dossier_mod # noqa: E402 +import email_modes # noqa: E402 +import emailer # noqa: E402 +import ledger as ledger_mod # noqa: E402 +import legal # noqa: E402 +import paths as paths_mod # noqa: E402 +import registry # noqa: E402 +import report as report_mod # noqa: E402 +import tiers # noqa: E402 + + +def _out(obj) -> None: + print(json.dumps(obj, indent=2, ensure_ascii=False)) + + +def _require_subject(subject_id: str) -> dict: + d = dossier_mod.load(subject_id) + if not d: + sys.exit(f"error: unknown subject {subject_id!r} (run `intake` first)") + return d + + +def cmd_setup(args) -> None: + if getattr(args, "auto", False): + # Autonomous path: detect capabilities and pick the most autonomous valid config without + # asking anyone. Read creds from $HERMES_HOME/.env too (the terminal shell doesn't export + # them). Explicit flags still win below. + cfg = config_mod.auto_configure(env=config_mod.dotenv_env()) + else: + cfg = config_mod.load_config() + for key in ("autonomy", "email_mode", "browser_backend", "tracker_backend", "encryption"): + val = getattr(args, key) + if val: + cfg[key] = val + if cfg.get("encryption") == "age": + if not crypto.age_available(): + sys.exit("error: encryption=age requested but `age`/`age-keygen` not found. " + "Install age (e.g. 
`brew install age`) or use `--encryption none`.") + crypto.ensure_identity() # generate the key now so encryption is actually engaged + path = config_mod.save_config(cfg) + migrated = _migrate_subjects() # rewrite existing dossiers/ledgers into the new at-rest format + out = { + "config_path": str(path), + "config": cfg, + "encryption_engaged": crypto.is_engaged(), + "detected_upgrades": config_mod.detect_capabilities(), + "migrated_subjects": migrated, + "note": "Defaults are easiest-first (draft email, auto browser, local tracker, no encryption). " + "Pass flags to opt into upgrades, then run `doctor` for a readiness summary.", + } + if cfg.get("encryption") == "age": + out["age_identity"] = str(crypto.identity_path()) + _out(out) + + +def _migrate_subjects() -> int: + """Re-save each subject's dossier + ledger so they match the current at-rest format.""" + sd = paths_mod.subjects_dir() + if not sd.exists(): + return 0 + n = 0 + for child in sorted(sd.iterdir()): + if not child.is_dir(): + continue + sid = child.name + d = dossier_mod.load(sid) + if d is not None: + dossier_mod.save(d) + n += 1 + led = ledger_mod.load(sid) + if led: + ledger_mod.save(sid, led) + return n + + +def _check_writable(path) -> bool: + try: + path.mkdir(parents=True, exist_ok=True) + probe = path / ".write_test" + probe.write_text("x", encoding="utf-8") + probe.unlink() + return True + except OSError: + return False + + +def cmd_doctor(args) -> None: + import platform + + cfg = config_mod.load_config() + caps = config_mod.detect_capabilities(config_mod.dotenv_env()) # see creds in $HERMES_HOME/.env too + data = paths_mod.data_dir() + writable = _check_writable(data) + curated = len(brokers_mod._load_curated()) + live = len(brokers_mod.load_live_cache()) + total = len(brokers_mod.load_all()) + + L = ["unbroker - readiness check", "=" * 42, + f"Python : {platform.python_version()}", + f"Data dir : {data} ({'writable' if writable else 'NOT writable'})", + f"Config : 
autonomy={cfg.get('autonomy', 'full')} email={cfg['email_mode']} " + f"browser={cfg['browser_backend']} " + f"tracker={cfg['tracker_backend']} encryption={cfg['encryption']}", + f"Brokers : {total} available ({curated} curated + {live} live" + + ("" if live else ", run `refresh-brokers` to expand to ~50") + ")", + "", "Opt-in upgrades:"] + rows = [ + ("Cloud browser (Browserbase) *RECOMMENDED*", caps["browserbase"], + "default backend: clears soft CAPTCHAs (Turnstile/hCaptcha) -> more T1", "set BROWSERBASE_API_KEY"), + ("Email auto (AgentMail)", caps["agentmail"], + "send + auto-verify, per-broker aliases (Mode B/C)", "install agentmail skill / set AGENTMAIL_API_KEY"), + ("Email send (CLI SMTP)", caps["smtp_send"], + "`send-email` delivers opt-outs itself (Mode B)", "set EMAIL_ADDRESS / EMAIL_PASSWORD (+ EMAIL_SMTP_HOST)"), + ("Verify-link poll (CLI IMAP)", caps["imap_read"], + "`poll-verification` reads confirmation links itself", "set EMAIL_ADDRESS / EMAIL_PASSWORD (+ EMAIL_IMAP_HOST)"), + ("Google Sheets tracker", caps["google_workspace"], + "shared status dashboard", "set up the google-workspace skill"), + ] + for name, ok, enables, how in rows: + L.append(f" [{'ON ' if ok else 'off'}] {name:<28} {enables}") + if not ok: + L.append(f" enable: {how}") + + # At-rest encryption: report TRUE engagement (configured + key present), not just binary presence. + engaged = crypto.is_engaged() + L.append(f" [{'ON ' if engaged else 'off'}] {'At-rest encryption (age)':<28} " + "encrypts dossiers + ledgers on disk") + if engaged: + L.append(f" key: {crypto.identity_path()} (0600) - guards casual/backup/commit " + "exposure, NOT a full-HERMES_HOME read") + elif cfg["encryption"] == "age": + L.append(" WARNING: encryption=age is SET but NOT engaged (age binary or key missing);" + " dossiers would be PLAINTEXT") + elif caps["age"]: + L.append(" off - dossiers are plaintext (0600). enable: `setup --encryption age`") + else: + L.append(" off - dossiers are plaintext (0600). 
install `age` first to enable") + + L += ["", "Verdict:", " Ready now in DRAFT mode (no setup needed): scan brokers, draft opt-out", + " emails for you to send, and track everything in the ledger."] + if caps["browserbase"]: + L.append(" Cloud browser ON (recommended default): soft/managed CAPTCHAs " + "(Turnstile/hCaptcha) clear automatically -> those brokers stay T1.") + else: + L.append(" No cloud browser: set BROWSERBASE_API_KEY (the recommended default) so soft " + "CAPTCHAs clear automatically; without it those brokers drop to T2 (human tasks).") + if cfg["email_mode"] == "draft_only": + L.append(" Email is draft-only: you send drafts + click verify links. For hands-off email " + "WITHOUT storing a password, run `setup --email-mode browser` (agent sends + opens " + "verify links via your logged-in webmail); or set EMAIL_* for SMTP/IMAP.") + elif cfg["email_mode"] == "browser": + L.append(" Email mode: browser (no password) - the agent sends opt-outs and opens verify " + "links via the operator's logged-in webmail. This needs Hermes pointed at the " + "operator's OWN Chrome over CDP (launch with --remote-debugging-port=9222 " + "--user-data-dir=~/.hermes/chrome-debug, signed into the webmail once); else it falls " + "back to drafts. Run `pdd.py cdp` to launch it (or `pdd.py cdp --print` for the command). " + "See methods.md 'Browser backends'.") + cloud_scan = cfg.get("browser_backend") == "browserbase" or ( + cfg.get("browser_backend") == "auto" and caps.get("browserbase")) + if cloud_scan: + L.append(" NOTE: your scan backend is a cloud browser (Browserbase). It is great for " + "Phase-1 scanning but CANNOT be the browser that sends webmail (no inbox session) " + "and is itself Cloudflare/DataDome-gated on session-bound gates (e.g. PeopleConnect). " + "For Phase-2 email/verify, launch the operator's Chrome over CDP: `pdd.py cdp`.") + if not crypto.is_engaged(): + L.append(" Storage: dossiers are PLAINTEXT JSON (0600 under HERMES_HOME). 
" + "Run `setup --encryption age` for at-rest encryption.") + if not live: + L.append(" Next: run `refresh-brokers` to load the full broker list.") + + # Freshness: warn when cached lists / curated mechanics are going stale (silent broker rot). + import time as _time + STALE_CACHE_DAYS, STALE_VERIFY_DAYS = 30, 180 + + def _age_days(p) -> float | None: + try: + return (_time.time() - p.stat().st_mtime) / 86400.0 + except OSError: + return None + + fresh = [] + for label, p in [("BADBOOL", paths_mod.brokers_cache_path()), + ("CA registry", paths_mod.registry_cache_path())]: + age = _age_days(p) + if age is None: + fresh.append(f"{label}: not pulled") + elif age > STALE_CACHE_DAYS: + fresh.append(f"{label}: {age:.0f}d old (stale, re-pull)") + stale_curated = documented = 0 + for b in brokers_mod._load_curated(): + conf = b.get("confidence") + lv = b.get("last_verified") + if conf == "documented" or not lv: + documented += 1 + continue + try: + if (_time.time() - _time.mktime(_time.strptime(lv, "%Y-%m-%d"))) / 86400.0 > STALE_VERIFY_DAYS: + stale_curated += 1 + except (ValueError, TypeError): + pass + if fresh: + L.append(" Freshness: " + "; ".join(fresh) + " (run `refresh-brokers`).") + if stale_curated or documented: + L.append(f" Freshness: {stale_curated} curated broker(s) last-verified >{STALE_VERIFY_DAYS}d ago; " + f"{documented} documented broker(s) awaiting first-use verification.") + print("\n".join(L)) + + +def cmd_cdp(args) -> None: + """Launch (or detect) the operator's Chrome over CDP for Phase-2 browser + webmail work. + + A cloud browser cannot send the operator's webmail or clear session-bound gates; this points + Hermes at the operator's real Chrome on a dedicated debug profile (see methods.md). 
+ """ + import shlex + import time + + port = args.port + profile = Path(args.profile).expanduser() if args.profile else cdp.default_profile() + + live = cdp.endpoint_status(port) + if live: + _out({"running": True, "endpoint": f"127.0.0.1:{port}", + "browser": live.get("Browser"), + "webSocketDebuggerUrl": live.get("webSocketDebuggerUrl"), + "note": "a debuggable browser is already listening; point Hermes's browser tools at " + f"127.0.0.1:{port} and make sure the operator's webmail is signed in in THAT browser."}) + return + + if getattr(args, "check", False): + _out({"running": False, "endpoint": f"127.0.0.1:{port}", + "note": f"no debuggable browser here yet; run `pdd.py cdp --port {port}` (no --check) to launch one."}) + return + + browser = cdp.find_browser(args.browser) + if not browser: + _out({"running": False, "error": "no Chrome/Chromium-family browser found", + "fix": "install Google Chrome, or pass --browser /path/to/chrome (or a command on PATH)"}) + return + + cmd = cdp.launch_command(browser, port, profile) + if getattr(args, "print_only", False): + _out({"running": False, "browser": browser, "profile": str(profile), "command": cmd, + "shell": " ".join(shlex.quote(c) for c in cmd), + "note": "run this yourself to launch the debug browser, then sign into your webmail once."}) + return + + pid = cdp.launch(browser, port, profile) + live = None + for _ in range(20): # give Chrome a few seconds to open the debug port + live = cdp.endpoint_status(port) + if live: + break + time.sleep(0.5) + _out({"running": bool(live), "launched_pid": pid, "browser": browser, + "profile": str(profile), "endpoint": f"127.0.0.1:{port}", + "webSocketDebuggerUrl": (live or {}).get("webSocketDebuggerUrl"), + "next": ([f"point Hermes's browser tools at 127.0.0.1:{port} (CDP)", + "in the launched browser, sign into the operator's webmail ONCE (dedicated debug profile)", + "then run email/verify flows in browser mode -- they use this logged-in session"] + if live else + 
["browser launched but the debug port has not answered yet; give it a few seconds, then " + f"re-run `pdd.py cdp --check --port {port}`"])}) + + +def cmd_intake(args) -> None: + if args.json: + data = json.loads(Path(args.json).read_text(encoding="utf-8")) + identity = data["identity"] + consent = data.get("consent", {}) + residency = data.get("residency_jurisdiction", "US") + prefs = data.get("preferences") + else: + if not args.full_name: + sys.exit("error: --full-name (or --json) is required") + identity = {"full_name": args.full_name, "emails": args.email or [], "phones": args.phone or []} + if args.alias: + identity["also_known_as"] = args.alias + if args.dob: + identity["date_of_birth"] = args.dob + addr = {k: v for k, v in {"line1": args.street, "city": args.city, + "state": args.state, "postal": args.postal}.items() if v} + if addr: + identity["current_address"] = addr + priors = [] + for loc in args.prior_location or []: + parts = [p.strip() for p in loc.split(",") if p.strip()] + if not parts: + continue + entry = {"city": parts[0]} + if len(parts) > 1: + entry["state"] = parts[1] + if len(parts) > 2: + entry["postal"] = parts[2] + priors.append(entry) + if priors: + identity["prior_addresses"] = priors + cfg = config_mod.load_config() + consent = {"authorized": bool(args.consent), "method": args.consent_method, "recorded_at": dossier_mod.now()} + residency = args.residency or "US" + prefs = { + "email_mode": args.email_mode or cfg["email_mode"], + "rescan_interval_days": cfg["default_rescan_interval_days"], + } + if args.contact_email: + prefs["contact_email_for_optouts"] = args.contact_email + d = dossier_mod.create(identity, consent, residency, prefs) + _out({"subject_id": d["subject_id"], "authorized": dossier_mod.is_authorized(d), + "residency": residency, "email_mode": (prefs or {}).get("email_mode"), + "names": dossier_mod.all_names(d), + "emails": len(d["identity"].get("emails") or []), + "phones": len(d["identity"].get("phones") or []), + 
"addresses": len(dossier_mod.all_addresses(d))}) + + +def cmd_brokers(args) -> None: + bl = brokers_mod.by_priority(*(args.priority or [])) if args.priority else brokers_mod.load_all() + _out([ + {"id": b.get("id"), "name": b.get("name"), "priority": b.get("priority"), + "method": (b.get("optout") or {}).get("method"), "owns": b.get("owns") or [], + "source": b.get("source"), "confidence": b.get("confidence", "curated")} + for b in bl + ]) + + +def cmd_refresh_brokers(args) -> None: + res = badbool.refresh(paths_mod.brokers_cache_path()) + curated_ids = {b["id"] for b in brokers_mod._load_curated()} + new = [b["id"] for b in brokers_mod.load_live_cache() if b["id"] not in curated_ids] + out = {**res, "curated": len(curated_ids), "new_from_live": len(new), + "people_search_total": len(brokers_mod.load_all()), + "note": "Live records have confidence=auto; verify their opt-out URL before acting."} + if not getattr(args, "no_registry", False): + try: + reg = registry.refresh_all(paths_mod.registry_cache_path()) + out["registry"] = {"total": reg["total"], "sources": reg["sources"], + "portals": reg["portals"], + "note": "Coverage lane worked via the CA DROP one-shot + CCPA email, " + "not the people-search scan. VT/OR/TX are search portals (no " + "bulk export); CA is the superset. 
def cmd_registry(args) -> None:
    """Summarize the cached broker registry and, with `--search`, list matching brokers.

    The search term is lower-cased and matched as a substring against each record's
    name (lower-cased), id (as stored) and opt-out email (lower-cased). At most
    `args.limit` matches are printed; `match_count` is the untruncated total.
    """
    records = brokers_mod.load_registry_cache()
    if not records:
        _out({"registered_brokers": 0,
              "note": "registry empty - run `refresh-brokers` (pulls the CA Data Broker Registry)"})
        return

    def _optout(rec) -> dict:
        # A record may carry no opt-out section at all (or a null one).
        return rec.get("optout") or {}

    fcra_count = sum(1 for rec in records if _optout(rec).get("fcra"))
    summary = {
        "registered_brokers": len(records),
        "fcra_regulated": fcra_count,
        "source": "CA Data Broker Registry (CPPA, 2025)",
        "drop_url": registry.DROP_URL,
        "other_state_portals": registry.portals(),
    }

    if args.search:
        needle = args.search.lower()
        hits = []
        for rec in records:
            if (needle in (rec.get("name") or "").lower()
                    or needle in (rec.get("id") or "")
                    or needle in (_optout(rec).get("email") or "").lower()):
                hits.append(rec)
        shown = []
        for rec in hits[:args.limit]:
            shown.append({
                "id": rec["id"],
                "name": rec["name"],
                "email": _optout(rec).get("email"),
                "url": _optout(rec).get("url"),
                "fcra": _optout(rec).get("fcra"),
            })
        summary["matches"] = shown
        summary["match_count"] = len(hits)

    _out(summary)
def cmd_plan(args) -> None:
    """Print the per-broker plan for an authorized subject.

    With `--batch` the ledger is loaded and passed to `tiers.batch_plan`;
    otherwise `tiers.plan` is used.
    """
    dossier = _require_subject(args.subject)
    dossier_mod.require_authorized(dossier)
    cfg = config_mod.load_config()

    if args.priority:
        broker_list = brokers_mod.by_priority(*(args.priority or []))
    else:
        broker_list = brokers_mod.load_all()
    clears_captcha = config_mod.browser_clears_captcha(cfg)

    if getattr(args, "batch", False):
        ledger = ledger_mod.load(args.subject)
        result = tiers.batch_plan(dossier, broker_list, cfg, ledger, clears_captcha)
    else:
        result = tiers.plan(dossier, broker_list, cfg, clears_captcha)
    _out(result)
" + f"For EACH broker: read references/brokers/<id>.json; run EVERY search vector from " + f"`pdd.py plan {args.subject}` (filtered to your brokers); build URLs from search.url_patterns " + f"and heed url_format_quirks; a 404 is INCONCLUSIVE (rebuild/try the on-site search box), not " + f"not_found. ECONOMY: at most ~3 web calls per broker; the moment a page shows antibot " + f"(Cloudflare 'just a moment'/DataDome) or hangs, record `blocked` and move on -- do NOT " + f"retry-loop. Confirm the SUBJECT vs namesakes/relatives by ADDRESS/DOB before recording " + f"`found` (ignore SEO-templated page titles/intro that just echo the query -- require a real " + f"result card; a public property/address record with no displayed personal NAME is " + f"not_found, not found). Record each outcome via `pdd.py record {args.subject} <broker> " + f"<found|not_found|indirect_exposure|blocked> --found <bool> --evidence '{{\"listing_urls\":[...]}}'`. " + f"Mode: {mode}. Broker JSON files are READ-ONLY for you -- do NOT edit them; if you discover " + f"a URL/quirk, put it in your report for the parent to fold in. Return a concise structured " + f"per-broker report." + ) + batches.append({"batch": i, "brokers": ids, "brief": brief}) + _out({ + "subject": args.subject, + "broker_count": grouping["broker_count"], + "batch_size": grouping["batch_size"], + "should_fanout": grouping["should_fanout"], + "batch_count": len(batches), + "batches": batches, + "instruction": ( + "If should_fanout is true you MUST spawn ONE delegate_task subagent per batch IN PARALLEL, " + "passing each batch's `brief`; do not scan all brokers yourself sequentially. Wait for every " + "report, consolidate, then proceed to opt-outs. If false, just scan the brokers inline." 
def _email_request(d: dict, b: dict, kind: str, listings, identifiers) -> tuple[dict, list[str]]:
    """Least-disclosure (fields, disclosed_names) for an opt-out/legal email of KIND.

    A removal letter must self-identify. Name + a contact email are already known to the
    broker (the name is displayed on the very listing being removed), so not extra exposure.

    Args:
        d: the subject dossier.
        b: the broker record; its `optout.inputs` drive `dossier_mod.select_disclosure`.
        kind: request type; "ccpa_indirect" switches to the identifier-only field set.
        listings: confirmed listing URLs (stored as `listing_urls` when non-empty).
        identifiers: explicit own-identifiers for "ccpa_indirect"; when empty, defaults
            are derived from the contact email and the subject's name.

    Returns:
        `(fields, disclosed)`: the template fields and the list of field names to log
        as disclosed.
    """
    fields = dossier_mod.select_disclosure(d, (b.get("optout") or {}).get("inputs", []))
    # `or {}` also covers an "identity" key that is present but null.
    ident = d.get("identity") or {}
    full_name = ident.get("full_name")
    if full_name:
        fields.setdefault("full_name", full_name)
    fields.setdefault("contact_email", dossier_mod.contact_email(d) or "")
    if listings:
        fields["listing_urls"] = listings
    if kind == "ccpa_indirect":
        # Indirect exposure: name ONLY the subject's own identifiers to scrub from a third party's
        # record. Default to the contact email + the subject's name-as-relative if none specified.
        # The indirect template renders ONLY these placeholders; do not over-report disclosure with
        # unrelated dossier fields (phone/street/postal) that select_disclosure happened to populate.
        ids = list(identifiers or [])
        if not ids:
            contact = dossier_mod.contact_email(d)
            if contact:
                ids.append(contact)
            # Only offer the name-as-relative identifier when a name exists; formatting a
            # missing name would put the literal text `the name "None"` into the request.
            if full_name:
                ids.append(f'the name "{full_name}" where it appears as a relative/associated person')
        fields = {
            "full_name": fields.get("full_name"),
            "contact_email": fields.get("contact_email"),
            "listing_urls": fields.get("listing_urls"),
            "my_identifiers": ids,
        }
        return fields, ["contact_email", "full_name", "my_identifiers"]
    return fields, sorted(fields.keys())
+ """ + d = _require_subject(args.subject) + dossier_mod.require_authorized(d) + b = brokers_mod.get(args.broker) + if not b: + sys.exit(f"error: unknown broker {args.broker!r}") + cfg = config_mod.load_config() + mode = cfg.get("email_mode") + if mode not in ("programmatic", "alias", "browser"): + sys.exit("error: email_mode is draft_only; run `setup --email-mode browser` (no password; " + "sends via your logged-in webmail) or `--email-mode programmatic`, or use " + "`render-email` and send it yourself") + if not args.listing: + sys.exit("error: --listing <confirmed-url> is required (verify-before-disclose: never " + "email a broker about an unconfirmed listing)") + # Idempotency: don't re-send if this case is already submitted/beyond (prevents duplicate + # requests when an action is retried). --force overrides. + _POST_SUBMIT = {"submitted", "verification_pending", "awaiting_processing", "confirmed_removed"} + current = ledger_mod.get_case(args.subject, args.broker).get("state") + if current in _POST_SUBMIT and not getattr(args, "force", False): + _out({"skipped": True, "broker": args.broker, "state": current, + "note": "already submitted; not re-sending (idempotent). Use --force to re-send."}) + return + kind = getattr(args, "kind", "generic") or "generic" + fields, disclosed = _email_request(d, b, kind, args.listing, getattr(args, "identifier", None)) + body = legal.render_optout_email(b, fields) if kind == "generic" else legal.render_request(kind, b, fields) + + if mode == "browser": + # No network / no credentials: hand the agent a recipient-locked payload to send in the + # operator's webmail via browser_* tools. State still records deterministically here. 
def cmd_verify_link(args) -> None:
    """Pull a broker's verification link out of email text read in webmail (browser mode).

    IMAP-free counterpart to `poll-verification`: the email body arrives via `--text` or
    `--file` (the file wins when both are given) and is handed to
    `email_modes.extract_verification_link` together with the broker record.
    Exits with an error for an unknown broker or when no text was supplied.
    """
    _require_subject(args.subject)
    broker = brokers_mod.get(args.broker)
    if not broker:
        sys.exit(f"error: unknown broker {args.broker!r}")

    if args.file:
        # Undecodable bytes are replaced rather than aborting the read.
        body = Path(args.file).read_text(encoding="utf-8", errors="replace")
    else:
        body = args.text
    if not body:
        sys.exit("error: provide --text '<email body>' (or --file) from the broker's confirmation email")

    link = email_modes.extract_verification_link(body, broker)
    if link:
        next_step = ("browser_navigate the link IN THE SAME browser (sessions are browser-bound), "
                     f"complete the flow, then `record {args.subject} {args.broker} awaiting_processing`")
    else:
        next_step = "no broker/opt-out-scoped link found in that text; confirm you opened the right email"
    _out({"broker": args.broker, "verification_link": link, "next": next_step})
def cmd_next(args) -> None:
    """Print the action queue `autopilot.next_actions` computes for an authorized subject."""
    dossier = _require_subject(args.subject)
    dossier_mod.require_authorized(dossier)
    cfg = config_mod.load_config()
    if args.priority:
        broker_list = brokers_mod.by_priority(*(args.priority or []))
    else:
        broker_list = brokers_mod.load_all()
    ledger = ledger_mod.load(args.subject)
    _out(autopilot.next_actions(dossier, broker_list, cfg, ledger))


def cmd_tasks(args) -> None:
    """Print the subject's human-task digest as Markdown."""
    _require_subject(args.subject)
    digest = report_mod.human_tasks_markdown(args.subject)
    print(digest)
def build_parser() -> argparse.ArgumentParser:
    """Build the `pdd` argument parser: one sub-command per `cmd_*` handler.

    Every sub-parser stores its handler in `func`, which `main` dispatches on.
    Choice lists shared by several sub-commands (broker priority tiers, email
    request kinds) are defined once here so the sub-commands cannot drift apart,
    and numeric defaults are shown through argparse's `%(default)s` so the help
    text always matches the real default.
    """
    priorities = ["crucial", "high", "standard", "long_tail"]
    request_kinds = ["generic", "ccpa", "ccpa_agent", "ccpa_indirect", "gdpr"]

    def _truthy(value: str) -> bool:
        # `--found` accepts 1/true/yes/y (any case); anything else is False.
        return value.strip().lower() in ("1", "true", "yes", "y")

    p = argparse.ArgumentParser(prog="pdd", description="unbroker helper CLI")
    sub = p.add_subparsers(dest="cmd", required=True)

    s = sub.add_parser("setup", help="write install config (easiest-first defaults; --auto = most autonomous)")
    s.add_argument("--auto", action="store_true",
                   help="detect capabilities and pick the most autonomous valid config (no questions)")
    s.add_argument("--autonomy", dest="autonomy", choices=sorted(config_mod.VALID["autonomy"]))
    s.add_argument("--email-mode", dest="email_mode", choices=sorted(config_mod.VALID["email_mode"]))
    s.add_argument("--browser-backend", dest="browser_backend", choices=sorted(config_mod.VALID["browser_backend"]))
    s.add_argument("--tracker-backend", dest="tracker_backend", choices=sorted(config_mod.VALID["tracker_backend"]))
    s.add_argument("--encryption", dest="encryption", choices=sorted(config_mod.VALID["encryption"]))
    s.set_defaults(func=cmd_setup)

    s = sub.add_parser("doctor", help="readiness check: config, brokers, available upgrades")
    s.set_defaults(func=cmd_doctor)

    s = sub.add_parser("cdp",
                       help="launch/detect the operator's Chrome over CDP (Phase-2 browser + webmail)")
    # %(default)s keeps the help in step with cdp.DEFAULT_PORT instead of a hard-coded number.
    s.add_argument("--port", type=int, default=cdp.DEFAULT_PORT,
                   help="remote debugging port (default %(default)s)")
    s.add_argument("--profile",
                   help="user-data-dir (default: $HERMES_HOME/chrome-debug, a dedicated debug profile)")
    s.add_argument("--browser", help="path to (or PATH name of) a Chrome/Chromium/Brave/Edge binary")
    s.add_argument("--check", action="store_true",
                   help="only report whether a debug browser is live; do not launch")
    s.add_argument("--print", dest="print_only", action="store_true",
                   help="print the launch command instead of launching it (run it yourself)")
    s.set_defaults(func=cmd_cdp)

    s = sub.add_parser("intake", help="create a subject dossier (records consent)")
    s.add_argument("--json", help="path to a dossier JSON file (overrides flags)")
    s.add_argument("--full-name")
    s.add_argument("--alias", action="append", metavar="NAME",
                   help="other name the subject is listed under (maiden/married/nickname); repeatable")
    s.add_argument("--email", action="append", metavar="EMAIL", help="repeatable")
    s.add_argument("--phone", action="append", metavar="PHONE", help="repeatable")
    s.add_argument("--street", help="current street line1 (enables reverse-address search)")
    s.add_argument("--city")
    s.add_argument("--state")
    s.add_argument("--postal")
    s.add_argument("--prior-location", dest="prior_location", action="append", metavar="City,ST",
                   help="a past city/state (or City,ST,ZIP); repeatable")
    s.add_argument("--dob", help="date of birth YYYY-MM-DD (only used if a broker requires it)")
    s.add_argument("--contact-email", dest="contact_email",
                   help="which email to use for opt-out correspondence (default: first)")
    s.add_argument("--residency", help="e.g. US, US-CA")
    s.add_argument("--consent", action="store_true", help="subject authorizes removal on their behalf")
    s.add_argument("--consent-method", default="self", choices=["self", "written_authorization", "poa"])
    s.add_argument("--email-mode", dest="email_mode", choices=sorted(config_mod.VALID["email_mode"]))
    s.set_defaults(func=cmd_intake)

    s = sub.add_parser("brokers", help="list the broker database (curated + live)")
    s.add_argument("--priority", action="append", choices=priorities)
    s.set_defaults(func=cmd_brokers)

    s = sub.add_parser("refresh-brokers",
                       help="pull the latest BADBOOL people-search list + the CA data broker registry")
    s.add_argument("--no-registry", dest="no_registry", action="store_true",
                   help="skip the CA registry pull (BADBOOL people-search only)")
    s.set_defaults(func=cmd_refresh_brokers)

    s = sub.add_parser("registry",
                       help="CA Data Broker Registry coverage (hundreds of brokers; DROP/email lane)")
    s.add_argument("--search", help="find registered brokers by name / id / email substring")
    s.add_argument("--limit", type=int, default=25, help="max matches to print (default %(default)s)")
    s.set_defaults(func=cmd_registry)

    s = sub.add_parser("drop",
                       help="CA DROP one-shot: delete from ALL registered brokers in one request")
    s.add_argument("subject")
    s.add_argument("--filed", action="store_true", help="mark DROP as filed (stops `next` surfacing it)")
    s.set_defaults(func=cmd_drop)

    s = sub.add_parser("plan", help="compute per-broker tier + next action for a subject")
    s.add_argument("subject")
    s.add_argument("--priority", action="append", choices=priorities)
    s.add_argument("--batch", action="store_true",
                   help="phase-oriented batch view: overlays ledger state, groups by next action "
                        "(unscanned/found/indirect/blocked/in_progress/done), collapses ownership clusters")
    s.set_defaults(func=cmd_plan)

    s = sub.add_parser("fanout", help="batch brokers into parallel delegate_task subagents (large runs)")
    s.add_argument("subject")
    s.add_argument("--priority", action="append", choices=priorities)
    s.add_argument("--size", type=int, default=5,
                   help="brokers per subagent batch (default %(default)s; 8+ times out)")
    s.add_argument("--optout", action="store_true",
                   help="brief authorizes opt-out submission (default: read-only scan)")
    s.set_defaults(func=cmd_fanout)

    s = sub.add_parser("record", help="record a ledger state transition after an agent action")
    s.add_argument("subject")
    s.add_argument("broker")
    s.add_argument("state", choices=ledger_mod.STATES)
    s.add_argument("--found", type=_truthy)
    s.add_argument("--evidence", help="JSON object stored as case.evidence")
    s.add_argument("--disclosed", action="append", metavar="FIELD", help="field name disclosed")
    s.add_argument("--channel", help="disclosure channel, e.g. web_form / email")
    s.add_argument("--reason", help="for human_task_queued: why a human is needed (shown in `tasks`)")
    s.set_defaults(func=cmd_record)

    s = sub.add_parser("next", help="autonomous action queue: exactly what to do right now")
    s.add_argument("subject")
    s.add_argument("--priority", action="append", choices=priorities)
    s.set_defaults(func=cmd_next)

    s = sub.add_parser("send-email", help="Mode B: render AND send the opt-out/legal request (records it)")
    s.add_argument("subject")
    s.add_argument("broker")
    # Not argparse-required: cmd_send_email enforces it after the email-mode check,
    # with its own verify-before-disclose error message.
    s.add_argument("--listing", action="append", metavar="URL", required=False,
                   help="confirmed listing URL (required: verify-before-disclose)")
    s.add_argument("--kind", choices=request_kinds, default="generic")
    s.add_argument("--identifier", action="append", metavar="ID",
                   help="(ccpa_indirect only) a specific own-identifier to remove; repeatable")
    s.add_argument("--to", help="override recipient (must be an address the broker record declares)")
    s.add_argument("--force", action="store_true",
                   help="re-send even if already submitted (default: idempotent skip)")
    s.set_defaults(func=cmd_send_email)

    s = sub.add_parser("poll-verification",
                       help="Mode B (IMAP): poll the inbox for brokers' verification links (anti-phishing scored)")
    s.add_argument("subject")
    s.add_argument("--broker", help="only this broker (default: every in-flight verification case)")
    s.add_argument("--since-days", dest="since_days", type=int, default=3)
    s.set_defaults(func=cmd_poll_verification)

    s = sub.add_parser("verify-link",
                       help="browser mode: extract a broker's verification link from pasted webmail text")
    s.add_argument("subject")
    s.add_argument("broker")
    s.add_argument("--text", help="the confirmation email body (read from the operator's webmail)")
    s.add_argument("--file", help="path to a file with the email body (alternative to --text)")
    s.set_defaults(func=cmd_verify_link)

    s = sub.add_parser("tasks", help="ONE consolidated human-task digest (present at end of run)")
    s.add_argument("subject")
    s.set_defaults(func=cmd_tasks)

    s = sub.add_parser("show", help="read a case's state + evidence (for parent re-verification)")
    s.add_argument("subject")
    s.add_argument("broker")
    s.set_defaults(func=cmd_show)

    s = sub.add_parser("due", help="cases whose recheck window has arrived (cron re-scan queue)")
    s.add_argument("subject")
    s.set_defaults(func=cmd_due)

    s = sub.add_parser("render-email", help="render a Mode-A opt-out / legal-request draft (least-disclosure)")
    s.add_argument("subject")
    s.add_argument("broker")
    s.add_argument("--listing", action="append", metavar="URL", help="confirmed listing URL")
    s.add_argument("--kind", choices=request_kinds, default="generic",
                   help="request type. 'ccpa_indirect' = delete MY identifiers from a third party's "
                        "record (indirect exposure); default 'generic' opt-out.")
    s.add_argument("--identifier", action="append", metavar="ID",
                   help="(ccpa_indirect only) a specific own-identifier to request removal of "
                        "(e.g. an email or phone). Repeatable. Defaults to the contact email + "
                        "name-as-relative if omitted.")
    s.set_defaults(func=cmd_render_email)

    s = sub.add_parser("status", help="print a Markdown status report")
    s.add_argument("subject")
    s.set_defaults(func=cmd_status)

    s = sub.add_parser("report", help="status report (default) or --sheets rows")
    s.add_argument("subject")
    s.add_argument("--sheets", action="store_true", help="emit Google Sheets rows as JSON")
    s.set_defaults(func=cmd_report)
    return p
# CA CPPA registry CSVs are published per year (registry2024.csv, registry2025.csv, ...).
# 2025 is the latest COMPLETE dataset; the current year's file is empty until the Jan
# registration window closes. DEFAULT_URL is the known-good fallback; `ca_candidate_urls`
# probes newer years first so coverage auto-advances when the next year is published.
_CA_CSV = "https://cppa.ca.gov/data_broker_registry/registry{year}.csv"
_CA_FLOOR_YEAR = 2025
DEFAULT_URL = _CA_CSV.format(year=_CA_FLOOR_YEAR)
DROP_URL = "https://privacy.ca.gov/drop"
USER_AGENT = "Mozilla/5.0 (compatible; unbroker/1.0; data opt-out)"


def ca_candidate_urls(today: datetime.date | None = None) -> list[str]:
    """Return the CA registry CSV URLs to try, newest year first.

    The list starts at the year of `today` (defaulting to the current date) and
    counts down to the floor year; a date before the floor year yields just the
    floor-year URL.
    """
    if not today:
        today = datetime.date.today()
    newest = today.year if today.year > _CA_FLOOR_YEAR else _CA_FLOOR_YEAR
    urls = []
    for candidate_year in range(newest, _CA_FLOOR_YEAR - 1, -1):
        urls.append(_CA_CSV.format(year=candidate_year))
    return urls
+SOURCES = { + "ca": {"jurisdiction": "US-CA", "format": "csv", "url": DEFAULT_URL, "has_drop": True, + "name": "California Data Broker Registry (CPPA)"}, + "vt": {"jurisdiction": "US-VT", "format": "portal", "has_drop": False, + "url": "https://bizfilings.vermont.gov/online/DatabrokerInquire/", + "name": "Vermont Data Broker Registry (Secretary of State)"}, + "or": {"jurisdiction": "US-OR", "format": "portal", "has_drop": False, + "url": "https://dfr.oregon.gov/business/licensing/data-broker-registry/Pages/index.aspx", + "name": "Oregon Data Broker Registry (DCBS)"}, + "tx": {"jurisdiction": "US-TX", "format": "portal", "has_drop": False, + "url": "https://texas-sos.appianportalsgov.com/data-broker-registry", + "name": "Texas Data Broker Registry (Secretary of State)"}, +} + + +def portals() -> list[dict]: + """Registry sources that are searchable portals (no bulk export) -- surfaced to the operator.""" + return [{"key": k, "jurisdiction": s["jurisdiction"], "name": s["name"], "url": s["url"]} + for k, s in SOURCES.items() if s["format"] == "portal"] + +# Field label -> substring to locate its column on the header row (robust to +# year-to-year column shifts; the registry re-orders/adds columns between years). 
+_LABELS = { + "name": "data broker name:", + "dba": "doing business as", + "website": "data broker primary website:", + "email": "primary contact email", + "rights_url": "exercise their ca consumer privacy act rights", + "fcra": "regulated by the federal fair credit reporting act (fcra):", +} + + +def _norm(s: str) -> str: + """Registry CSVs use NBSPs and a BOM; normalize for matching + clean values.""" + return re.sub(r"\s+", " ", (s or "").replace("\ufeff", "").replace("\xa0", " ")).strip() + + +def slug(name: str, website: str = "") -> str: + base = re.sub(r"\.(com|org|net|io|ai|inc|co|us|info|llc)\b", "", (name or "").strip(), flags=re.I) + s = re.sub(r"[^a-z0-9]+", "", base.lower()) + if s: + return s + dom = re.sub(r"^https?://(www\.)?", "", (website or "").lower()) + return re.sub(r"[^a-z0-9]+", "", dom.split("/")[0]) or "broker" + + +def _domain(website: str) -> str: + dom = re.sub(r"^https?://(www\.)?", "", (website or "").strip().lower()) + return dom.split("/")[0] + + +def _find_colmap(rows: list[list[str]]) -> tuple[int, dict[str, int]]: + """Locate the label row (col0 == 'Data broker name:') and map fields to columns.""" + for i, row in enumerate(rows[:5]): + if row and _norm(row[0]).lower().startswith("data broker name:"): + colmap: dict[str, int] = {} + for field, needle in _LABELS.items(): + for j, cell in enumerate(row): + c = _norm(cell).lower() + if needle in c and not c.startswith("if the data broker"): + colmap[field] = j + break + return i, colmap + raise ValueError("CA registry: could not locate the header row") + + +def _get(row: list[str], idx: int | None) -> str: + return _norm(row[idx]) if idx is not None and idx < len(row) else "" + + +def _build(row: list[str], cm: dict[str, int], jurisdiction: str = "US-CA", + has_drop: bool = True) -> dict | None: + name = _get(row, cm.get("name")) + website = _get(row, cm.get("website")) + if not (name or website): + return None + email = _get(row, cm.get("email")) + rights = _get(row, 
cm.get("rights_url")) + dba = _get(row, cm.get("dba")) + fcra = _get(row, cm.get("fcra")).lower().startswith("y") + state = jurisdiction.split("-")[-1] + + method = "email" if email else ("web_form" if rights else "drop") + if has_drop: + notes = ("Registered CA data broker. One CA DROP request (privacy.ca.gov/drop) deletes from " + "this and every registered broker at once; or send a CCPA deletion request to the " + "contact email.") + else: + notes = (f"Registered {state} data broker (no one-shot delete portal in {state}). Send a " + "CCPA/state-law deletion request to the contact email.") + if fcra: + notes += (" FCRA-regulated: some data is credit-reporting data with separate rules -- deletion " + "may be limited; a consumer report dispute/security-freeze may apply instead.") + return { + "id": slug(name, website), + "name": name or _domain(website), + "dba": dba or None, + "category": "data_broker", + "priority": "long_tail", + "jurisdictions": [jurisdiction], + "search": {"method": "none", "url": website, "fetch": "none", "by": ["registry"]}, + "optout": { + "method": method, + "url": rights or website or None, + "email": email or None, + "requires": {"profile_url": False, "email_verification": False, "captcha": False, + "gov_id": False, "account": False, "phone_callback": False, "payment": False}, + "inputs": ["full_name", "contact_email"], + "deletion": { + "via": "drop" if has_drop else "email", + "email": email or None, + "url": rights or None, + "kinds": ["ccpa", "generic"], + "notes": ("Covered by the CA DROP one-shot (privacy.ca.gov/drop); CCPA email fallback." 
+ if has_drop else "CCPA/state-law deletion email (no one-shot portal)."), + }, + "fcra": fcra, + "est_processing_days": 45, + "notes": notes, + }, + "source": f"{state}-registry", + "confidence": "registry", + "last_verified": None, + } + + +def parse(csv_text: str, jurisdiction: str = "US-CA", has_drop: bool = True) -> list[dict]: + """Parse a data-broker-registry CSV into broker records (deduped by id). + + Column detection is by header label, not fixed position, so any state that publishes a + registry CSV with name/website/email/rights columns parses without new code. + """ + rows = list(csv.reader(io.StringIO(csv_text))) + if not rows: + return [] + header_i, cm = _find_colmap(rows) + out: list[dict] = [] + seen: dict[str, int] = {} + for row in rows[header_i + 1:]: + if not any(c.strip() for c in row): + continue + rec = _build(row, cm, jurisdiction, has_drop) + if not rec: + continue + bid = rec["id"] + if bid in seen: # disambiguate id collisions by domain, then a counter + dom = re.sub(r"[^a-z0-9]+", "", _domain(rec["search"]["url"])) + cand = f"{bid}-{dom}" if dom and dom != bid else bid + while cand in seen: + seen[bid] += 1 + cand = f"{bid}-{seen[bid]}" + rec["id"] = cand + seen.setdefault(rec["id"], 0) + seen.setdefault(bid, 0) + out.append(rec) + return out + + +MIN_EXPECTED_CA = 100 # CA registry has ~500+; far fewer => wrong/empty file, warn + + +def fetch(url: str = DEFAULT_URL, timeout: int = 60) -> str: + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 + return resp.read().decode("utf-8", errors="replace") + + +def _fetch_ca_latest() -> tuple[str, list[dict]]: + """Try newest CA registry year first; return (url, records) for the first non-empty.""" + last: tuple[str, list[dict]] = (DEFAULT_URL, []) + for url in ca_candidate_urls(): + try: + recs = parse(fetch(url), jurisdiction="US-CA", has_drop=True) + except Exception: # noqa: BLE001 - a missing 
def refresh(cache_path: Path, url: str = DEFAULT_URL, csv_text: str | None = None) -> dict:
    """CA single-source refresh: fetch (or accept) the CA CSV and write the cache.

    Args:
        cache_path: Where the parsed records are stored.
        url: Registry CSV to download when ``csv_text`` is not given.
        csv_text: CSV content to parse instead of touching the network.

    Returns:
        Summary dict with ``parsed``, ``fcra_regulated``, ``cache_path``,
        ``source_url`` and ``cache_written``.
    """
    text = csv_text if csv_text is not None else fetch(url)
    records = parse(text)
    # An empty parse (e.g. a not-yet-populated year file) must not replace a
    # previously cached registry with nothing.
    cache_written = bool(records)
    if cache_written:
        storage.write_json(cache_path, records)
    fcra = sum(1 for r in records if (r.get("optout") or {}).get("fcra"))
    return {"parsed": len(records), "fcra_regulated": fcra,
            "cache_path": str(cache_path), "source_url": url,
            "cache_written": cache_written}


def refresh_all(cache_path: Path, fetched: dict[str, str] | None = None) -> dict:
    """Multi-source refresh: pull every CSV source, dedupe across states by domain, cache.

    `fetched` optionally supplies {source_key: csv_text} to bypass the network (tests). CSV
    sources are ingested as broker records; portal sources contribute their URL for the operator
    (no bulk export exists) but no records. CA is processed first so it wins domain collisions.

    The cache is only rewritten when at least one record was ingested, so a failed or
    empty refresh leaves the existing cache untouched; ``cache_written`` in the result
    reports which happened.
    """
    all_recs: list[dict] = []
    seen_domains: set[str] = set()
    per_source: dict[str, dict] = {}
    for key, src in SOURCES.items():
        if src["format"] != "csv":
            per_source[key] = {"jurisdiction": src["jurisdiction"], "format": "portal",
                               "url": src["url"], "records": 0,
                               "note": "searchable portal (no bulk export); operator/agent searches by name"}
            continue
        used_url = src["url"]
        try:
            if fetched is not None:
                text = fetched.get(key)
                if text is None:
                    raise RuntimeError("no CSV text supplied")
                recs = parse(text, jurisdiction=src["jurisdiction"], has_drop=src["has_drop"])
            elif key == "ca":
                used_url, recs = _fetch_ca_latest()  # newest-year-first with fallback
            else:
                recs = parse(fetch(src["url"]), jurisdiction=src["jurisdiction"], has_drop=src["has_drop"])
        except Exception as exc:  # noqa: BLE001 - one source failing must not sink the rest
            # Some exceptions stringify to "" (e.g. a bare TimeoutError()); fall back to
            # the class name so the report never shows a blank error.
            per_source[key] = {"jurisdiction": src["jurisdiction"], "format": "csv",
                               "error": str(exc) or type(exc).__name__}
            continue
        added = 0
        for r in recs:
            dom = _domain(r["search"]["url"])
            if dom and dom in seen_domains:
                continue  # an earlier source already contributed this domain
            if dom:
                seen_domains.add(dom)
            all_recs.append(r)
            added += 1
        entry = {"jurisdiction": src["jurisdiction"], "format": "csv", "url": used_url,
                 "parsed": len(recs), "added_after_dedupe": added,
                 "fcra": sum(1 for r in recs if (r.get("optout") or {}).get("fcra"))}
        if key == "ca" and len(recs) < MIN_EXPECTED_CA:
            entry["warning"] = (f"only {len(recs)} parsed (expected >{MIN_EXPECTED_CA}); the CA "
                                "registry file may be empty/moved - verify the source URL")
        per_source[key] = entry
    cache_written = bool(all_recs)
    if cache_written:
        storage.write_json(cache_path, all_recs)
    return {"total": len(all_recs), "sources": per_source, "portals": portals(),
            "cache_path": str(cache_path), "cache_written": cache_written}
STATE_LABELS = {
    "new": "Not started",
    "searching": "Searching",
    "not_found": "Not found",
    "found": "Found (action needed)",
    "indirect_exposure": "Indirect exposure (PII on a relative's record)",
    "action_selected": "Action selected",
    "submitted": "Submitted",
    "verification_pending": "Awaiting verification",
    "awaiting_processing": "Processing",
    "confirmed_removed": "Removed",
    "reappeared": "Reappeared",
    "human_task_queued": "Human task",
    "blocked": "Blocked",
}


def _whole_percent(rate: float) -> int:
    """Whole-number percentage of ``rate``, truncated, immune to float representation error.

    ``int(0.29 * 100)`` is 28 because ``0.29 * 100`` evaluates to 28.999...; scaling
    to tenths of a percent, rounding, then integer-dividing gives the intended 29.
    """
    return int(round(rate * 1000)) // 10


def status_counts(subject_id: str) -> dict:
    """Count the subject's ledger cases per state (a missing state counts as ``"new"``)."""
    counts: dict[str, int] = {}
    for case in ledger_mod.load(subject_id).values():
        state = case.get("state", "new")
        counts[state] = counts.get(state, 0) + 1
    return counts


def metrics(subject_id: str) -> dict:
    """Outcome metrics: what's actually confirmed vs merely claimed, and what's overdue.

    removal_rate is confirmed_removed over cases we actually acted on (found/submitted/... ),
    NOT over the whole broker DB, so it reflects real progress on real exposure. `in_flight`
    is 'claimed' (submitted/verifying/processing) but not yet re-scan-confirmed. `overdue`
    counts cases whose recheck window has already passed (the cron backlog).
    """
    c = status_counts(subject_id)
    removed = c.get("confirmed_removed", 0)
    in_flight = c.get("submitted", 0) + c.get("verification_pending", 0) + c.get("awaiting_processing", 0)
    open_found = (c.get("found", 0) + c.get("reappeared", 0) + c.get("action_selected", 0)
                  + c.get("indirect_exposure", 0))
    acted = removed + in_flight + open_found + c.get("human_task_queued", 0) + c.get("blocked", 0)
    return {
        "confirmed_removed": removed,
        "in_flight_claimed": in_flight,  # submitted but NOT yet verified gone
        "open_needs_action": open_found,
        "blocked": c.get("blocked", 0),
        "human_tasks": c.get("human_task_queued", 0),
        "acted_total": acted,
        "removal_rate": round(removed / acted, 3) if acted else 0.0,
        "overdue_rechecks": len(ledger_mod.due(subject_id)),
    }


def render_markdown(subject_id: str) -> str:
    """Render the subject's status report as Markdown.

    The report holds a headline with the removal rate, the outcome metrics, a
    per-state count table, and - when present - the outstanding human tasks and
    the indirect-exposure cases.
    """
    ledger = ledger_mod.load(subject_id)
    counts = status_counts(subject_id)
    total = sum(counts.values())
    removed = counts.get("confirmed_removed", 0)

    m = metrics(subject_id)
    lines = [
        f"# unbroker - status for `{subject_id}`",
        "",
        f"**{removed} / {total} confirmed removed** · removal rate (of acted-on cases): "
        f"{_whole_percent(m['removal_rate'])}%",
        "",
        f"- Confirmed removed: {m['confirmed_removed']}",
        f"- In flight (submitted, not yet re-scan-confirmed): {m['in_flight_claimed']}",
        f"- Open / needs action: {m['open_needs_action']}",
        f"- Blocked (anti-bot): {m['blocked']} · Human tasks: {m['human_tasks']}",
        f"- Overdue rechecks (cron backlog): {m['overdue_rechecks']}",
        "",
        "| State | Count |",
        "|---|---|",
    ]
    # Rows follow the order of ledger_mod.STATES; states with a zero count are omitted.
    for state in ledger_mod.STATES:
        if counts.get(state):
            lines.append(f"| {STATE_LABELS.get(state, state)} | {counts[state]} |")

    tasks = [c for c in ledger.values() if c.get("state") == "human_task_queued"]
    if tasks:
        lines += ["", "## Outstanding human tasks"]
        for c in tasks:
            reason = c.get("human_task_reason", "manual step required")
            lines.append(f"- **{c.get('broker_id')}** - {reason}")

    indirect = [c for c in ledger.values() if c.get("state") == "indirect_exposure"]
    if indirect:
        lines += ["", "## Indirect exposure (your PII on third-party records)",
                  "Not removable via the broker's self-service opt-out (the record is about someone "
                  "else). Lever: a targeted CCPA/GDPR delete-my-PII request naming only your own "
                  "identifiers."]
        for c in indirect:
            ev = c.get("evidence") or {}
            note = ev.get("summary") or "subject's identifiers appear on another person's listing"
            lines.append(f"- **{c.get('broker_id')}** - {note}")
    return "\n".join(lines) + "\n"


def human_tasks_markdown(subject_id: str) -> str:
    """ONE consolidated digest of everything that genuinely needs a human.

    The autonomous run accumulates human-only work silently (never interrupting);
    this digest is presented once, at the end, so the operator clears it in a
    single sitting. Includes queued tasks and blocked-site operator-browser checks.
    """
    ledger = ledger_mod.load(subject_id)
    tasks = [(bid, c) for bid, c in sorted(ledger.items()) if c.get("state") == "human_task_queued"]
    blocked = [(bid, c) for bid, c in sorted(ledger.items()) if c.get("state") == "blocked"]

    lines = [f"# Human tasks for `{subject_id}`", ""]
    if not tasks and not blocked:
        lines.append("Nothing needs a human right now.")
        return "\n".join(lines) + "\n"

    lines.append(f"{len(tasks)} manual step(s) + {len(blocked)} blocked site(s). "
                 "Everything else ran (or will run) autonomously.")
    if tasks:
        lines += ["", "## Manual steps"]
        for bid, c in tasks:
            b = brokers_mod.get(bid) or {}
            opt = b.get("optout") or {}
            lines.append(f"### {b.get('name', bid)}")
            lines.append(f"- Why: {c.get('human_task_reason', 'manual step required')}")
            where = opt.get("url") or opt.get("email") or "(see broker record)"
            lines.append(f"- Where: {where}")
            quirks = opt.get("quirks") or []
            if isinstance(quirks, str):
                # A quirk stored as a bare string is one note, not a sequence of characters.
                quirks = [quirks]
            for q in quirks[:2]:
                lines.append(f"- Note: {q}")
            lines.append("- Withhold: SSN and full ID numbers - always.")
            lines.append(f"- When done, tell the agent so it records the outcome for `{bid}`.")
    if blocked:
        lines += ["", "## Blocked sites (open in YOUR browser - it gets through where bots don't)"]
        for bid, c in blocked:
            b = brokers_mod.get(bid) or {}
            url = ((b.get("search") or {}).get("url")) or "(see broker record)"
            lines.append(f"- **{b.get('name', bid)}** - open {url}, search the subject, and report "
                         "the verdict (or a screenshot) back to the agent.")
    return "\n".join(lines) + "\n"


def sheets_rows(subject_id: str) -> list[list[str]]:
    """Header + one row per case for the optional Google Sheets tracker.

    The agent appends these via the `google-workspace` skill, e.g.:
        google_api.py sheets append <SHEET_ID> "Sheet1!A:F" --values <json-rows>
    """
    rows = [["broker_id", "state", "found", "tier", "removed_at", "next_recheck"]]
    for bid, c in sorted(ledger_mod.load(subject_id).items()):
        rows.append([
            bid,
            c.get("state") or "",
            str(c.get("found", "")),
            # `or ""` keeps an explicit None out of the row, like the date columns below.
            (c.get("automation") or {}).get("tier_used") or "",
            c.get("removal_confirmed_at") or "",
            c.get("next_recheck_at") or "",
        ])
    return rows
+""" +from __future__ import annotations + +import urllib.error +import urllib.request + +USER_AGENT = "Mozilla/5.0 (compatible; unbroker/1.0; data opt-out)" + + +def fetch(url: str, timeout: int = 20) -> tuple[int, str]: + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 (https only by convention) + charset = resp.headers.get_content_charset() or "utf-8" + return getattr(resp, "status", 200), resp.read().decode(charset, errors="replace") + except urllib.error.HTTPError as exc: + return exc.code, "" + except (urllib.error.URLError, TimeoutError, ValueError): + return 0, "" + + +def looks_listed(html: str, match_signal: str | None) -> bool: + """Naive confirmation heuristic for static pages: does the match signal appear?""" + if not html or not match_signal: + return False + return match_signal.lower() in html.lower() diff --git a/optional-skills/security/unbroker/scripts/storage.py b/optional-skills/security/unbroker/scripts/storage.py new file mode 100644 index 000000000..29a66bb80 --- /dev/null +++ b/optional-skills/security/unbroker/scripts/storage.py @@ -0,0 +1,138 @@ +"""Storage helpers (stdlib only): atomic JSON, append-only JSONL, strict perms. + +Default backend is local-json. The optional google-sheets tracker is handled in +report.py by emitting rows for the `google-workspace` skill; this module stays +dependency-free so the hermetic tests never touch the network. +""" +from __future__ import annotations + +import contextlib +import json +import os +import time +from pathlib import Path +from typing import Any + +import crypto +import paths + + +@contextlib.contextmanager +def locked(target: Path, timeout: float = 10.0, stale: float = 30.0): + """Portable advisory lock via an O_EXCL lockfile next to `target`. 
@contextlib.contextmanager
def locked(target: Path, timeout: float = 10.0, stale: float = 30.0):
    """Portable advisory lock via an O_EXCL lockfile next to `target`.

    Serializes read-modify-write on shared JSON (the ledger) across concurrent
    processes - a cron re-scan overlapping a manual run, or multiple tenants -
    so one writer can't clobber another's update. A lock older than `stale`
    seconds is treated as abandoned (crashed writer) and broken, so a dead
    process can never deadlock the queue. Works on macOS/Linux/Windows (O_EXCL).

    Raises:
        TimeoutError: if the lock cannot be acquired within `timeout` seconds.
    """
    ensure_dir(target.parent)
    lock = target.with_name(target.name + ".lock")
    deadline = time.monotonic() + timeout
    while True:
        try:
            # O_EXCL makes creation fail if the lockfile already exists.
            fd = os.open(str(lock), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
            try:
                os.write(fd, str(os.getpid()).encode())
            finally:
                os.close(fd)
            break
        except FileExistsError:
            try:
                if time.time() - lock.stat().st_mtime > stale:
                    lock.unlink(missing_ok=True)
                    continue  # retry immediately after breaking the stale lock
            except OSError:
                pass  # the lock vanished or is unreadable; fall through and retry
            if time.monotonic() >= deadline:
                raise TimeoutError(f"could not acquire lock {lock} within {timeout}s")
            time.sleep(0.05)
    try:
        yield
    finally:
        with contextlib.suppress(OSError):
            lock.unlink(missing_ok=True)


def _secure(path: Path, mode: int) -> None:
    """Best-effort ``chmod``; failures are ignored."""
    try:
        os.chmod(path, mode)
    except OSError:
        pass  # non-POSIX / unsupported FS; HERMES_HOME directory perms still apply


def ensure_dir(path: Path) -> Path:
    """Create ``path`` (and parents) if needed, restrict it to mode 0o700, and return it."""
    path.mkdir(parents=True, exist_ok=True)
    _secure(path, 0o700)
    return path


def _is_sensitive(path: Path) -> bool:
    """Per-subject docs (dossier, ledger) are sensitive; config/cache are not."""
    try:
        Path(path).resolve().relative_to(paths.subjects_dir().resolve())
        return True
    except (ValueError, OSError):
        return False


def _age_path(path: Path) -> Path:
    """Sibling path with ``.age`` appended to the file name (the encrypted copy)."""
    return path.with_name(path.name + ".age")


def _atomic_write(path: Path, data: bytes) -> Path:
    """Write ``data`` to ``path`` through a sibling ``.tmp`` file and ``os.replace``.

    The temp file is created with mode 0o600 from the start, so the payload never
    sits on disk under umask-default permissions, and it is removed again if the
    write or the replace fails.
    """
    tmp = path.with_name(path.name + ".tmp")
    flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, "O_BINARY", 0)
    fd = os.open(str(tmp), flags, 0o600)
    try:
        with os.fdopen(fd, "wb") as fh:
            # The creation mode is ignored for a leftover tmp file; tighten it
            # before any data is written.
            _secure(tmp, 0o600)
            fh.write(data)
        os.replace(tmp, path)
    except BaseException:
        with contextlib.suppress(OSError):
            tmp.unlink()
        raise
    _secure(path, 0o600)
    return path


def write_json(path: Path, obj: Any) -> Path:
    """Serialize ``obj`` as indented JSON and write it atomically.

    Sensitive paths are written encrypted to ``<path>.age`` when the encryption
    setting is ``"age"``; whichever of the plaintext/ciphertext pair is not being
    written is removed so only one copy remains.

    Returns:
        The path actually written (``path`` or its ``.age`` sibling).

    Raises:
        RuntimeError: if encryption is configured as ``age`` but unavailable.
    """
    ensure_dir(path.parent)
    data = (json.dumps(obj, indent=2, ensure_ascii=False) + "\n").encode("utf-8")
    if _is_sensitive(path) and crypto.encryption_setting() == "age":
        if not crypto.age_available():
            raise RuntimeError(
                "encryption=age is configured but `age` is not available; "
                "refusing to write PII as plaintext. Install age or run `setup --encryption none`."
            )
        target = _atomic_write(_age_path(path), crypto.encrypt(data))
        if path.exists():
            path.unlink()  # migrate plaintext -> ciphertext
        return target
    target = _atomic_write(path, data)
    ap = _age_path(path)
    if ap.exists():
        ap.unlink()  # encryption turned off -> drop stale ciphertext
    return target


def read_json(path: Path, default: Any = None) -> Any:
    """Load JSON from ``path``, preferring the encrypted ``.age`` sibling when it exists.

    Returns ``default`` when neither file exists.
    """
    ap = _age_path(path)
    if ap.exists():
        return json.loads(crypto.decrypt(ap.read_bytes()).decode("utf-8"))
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default


def append_jsonl(path: Path, record: dict) -> Path:
    """Append ``record`` as one JSON line to ``path``, creating the file with mode 0o600."""
    ensure_dir(path.parent)
    # Serialize first so an unserializable record does not leave an empty file behind.
    line = json.dumps(record, ensure_ascii=False) + "\n"
    fd = os.open(str(path), os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
    with os.fdopen(fd, "a", encoding="utf-8") as fh:
        _secure(path, 0o600)  # also tightens a pre-existing file before new data lands
        fh.write(line)
    return path


def read_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file into a list of records, skipping blank lines (``[]`` if missing)."""
    if not path.exists():
        return []
    out: list[dict] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if line:
            out.append(json.loads(line))
    return out
HARD_HUMAN = ("gov_id", "fax", "mail", "phone_voice")


def _as_dict(value) -> dict:
    """Return ``value`` if it is a dict, else an empty dict (malformed-record guard)."""
    return value if isinstance(value, dict) else {}


def _as_list(value) -> list:
    """Return a list/tuple as a list, wrap a non-empty string, else give ``[]``."""
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str) and value:
        return [value]
    return []


def select_tier(broker: dict, email_mode: str = "draft_only",
                browser_clears_captcha: bool = False) -> str:
    """Pick the automation tier (``"T0"``..``"T3"``) for one broker record.

    Args:
        broker: Broker record; only ``optout.requires`` is consulted.
        email_mode: ``"programmatic"`` or ``"alias"`` allow automated email
            verification; any other value leaves it to a human.
        browser_clears_captcha: Whether the browser backend can clear captchas.

    Returns:
        ``"T3"`` if any HARD_HUMAN requirement is set; ``"T2"`` for an account,
        an unclearable captcha, a phone callback, or email verification without an
        automated mailbox; ``"T1"`` for automated email verification or a cleared
        captcha; otherwise ``"T0"``.
    """
    # defensive: a malformed record (optout or requires as a list/string) must not crash planning
    req = _as_dict(_as_dict(broker.get("optout")).get("requires"))

    if any(req.get(k) for k in HARD_HUMAN):
        return "T3"
    if req.get("account"):
        return "T2"

    captcha = bool(req.get("captcha"))
    if (captcha and not browser_clears_captcha) or req.get("phone_callback"):
        return "T2"

    if req.get("email_verification"):
        return "T1" if email_mode in ("programmatic", "alias") else "T2"

    if captcha and browser_clears_captcha:
        return "T1"
    return "T0"


def plan(subject_dossier: dict, brokers_list: list[dict], cfg: dict,
         browser_clears_captcha: bool = False) -> list[dict]:
    """Build one action dict per broker describing how its opt-out would be worked.

    Args:
        subject_dossier: The subject's dossier; ``preferences.email_mode`` overrides
            ``cfg["email_mode"]`` (default ``"draft_only"``).
        brokers_list: Broker records to plan for.
        cfg: Skill configuration.
        browser_clears_captcha: Forwarded to ``select_tier``.

    Returns:
        A list of action dicts in the order of ``brokers_list``.
    """
    email_mode = ((subject_dossier.get("preferences") or {}).get("email_mode")
                  or cfg.get("email_mode", "draft_only"))
    actions: list[dict] = []
    for b in brokers_list:
        # Defensive shape coercion: a subagent may have written a malformed record (optout or
        # requires as a list, quirks as a string). Normalize here so nothing downstream crashes
        # on a bad broker file.
        opt = _as_dict(b.get("optout"))
        search = _as_dict(b.get("search"))
        req = _as_dict(opt.get("requires"))
        quirks = _as_list(opt.get("quirks"))
        tier = select_tier(b, email_mode, browser_clears_captcha)
        disclosure = dossier_mod.select_disclosure(subject_dossier, _as_list(opt.get("inputs")))
        svectors = vectors_mod.search_vectors(subject_dossier, b)
        # Pre-warn (don't discover mid-flow): a broker whose identity gate hard-requires DOB will
        # force a human touchpoint if DOB was not collected at intake (§4.1). Surface it now.
        prewarn: list[str] = []
        if req.get("dob") and not (subject_dossier.get("identity") or {}).get("date_of_birth"):
            prewarn.append("date_of_birth: this broker's identity gate requires DOB to match records; "
                           "collect it up front (intake --dob) or expect a mid-flow human pause")
        actions.append({
            "broker_id": b.get("id"),
            "broker_name": b.get("name"),
            "priority": b.get("priority"),
            "method": opt.get("method"),
            "tier": tier,
            "human_required": tier == "T3",
            "search_url": search.get("url"),
            "fetch": search.get("fetch", "web_extract"),
            "antibot": search.get("antibot"),
            "search_by": vectors_mod.supported_by(b),
            "search_vectors": svectors,
            "optout_url": opt.get("url"),
            "optout_email": opt.get("email"),
            "disclosure_fields": sorted(disclosure.keys()),
            "needs_operator_input": prewarn,
            "owns": _as_list(b.get("owns")),
            "notes": opt.get("notes", ""),
            "optout_quirks": quirks,
            "optout_requires": req,
            # The DELETION lane (right-to-delete), distinct from listing suppression. Structured so
            # the autopilot can route to it: {via: email|in_flow|web_form, email?, url?, kinds?, notes?}
            "deletion": _as_dict(opt.get("deletion")),
            # Exact ordered opt-out steps maintained IN the broker record (field-verified knowledge
            # lives with the data, not in code).
            "optout_playbook": _as_list(opt.get("playbook")),
        })
    return actions
+ "optout_playbook": opt.get("playbook") or [], + }) + return actions + + +def fanout(brokers_list: list[dict], batch_size: int = 5) -> dict: + """Group brokers into batches for parallel `delegate_task` scan subagents. + + Scanning many brokers serially is slow and burns context; above `batch_size` + the agent is expected to spawn one subagent per batch (see SKILL.md). + """ + ids = [b.get("id") for b in brokers_list if b.get("id")] + batches = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + return { + "broker_count": len(ids), + "batch_size": batch_size, + "should_fanout": len(ids) > batch_size, + "batches": batches, + } + + +# States that mean "the crawl reached a verdict for this broker". +_SCANNED_STATES = {"found", "not_found", "indirect_exposure", "blocked", "submitted", + "verification_pending", "awaiting_processing", "confirmed_removed", "reappeared", + "action_selected", "human_task_queued"} +# States that still need a deletion action taken. +_ACTIONABLE_STATES = {"found", "indirect_exposure", "reappeared", "action_selected"} + + +def batch_plan(subject_dossier: dict, brokers_list: list[dict], cfg: dict, + ledger: dict | None = None, browser_clears_captcha: bool = False) -> dict: + """Reduce the per-broker plan into a phase-oriented batch view. + + Overlays the current ledger state on each broker, groups by what the operator + should DO next, and collapses ownership clusters so a parent removal that clears + children is ONE action, not N. Read-only: computes, never mutates the ledger. 
+ """ + ledger = ledger or {} + actions = plan(subject_dossier, brokers_list, cfg, browser_clears_captcha) + + # child id -> parent id (only for parents present in this plan set) + child_to_parent: dict[str, str] = {} + for a in actions: + for child in a.get("owns") or []: + child_to_parent[child] = a["broker_id"] + + def state_of(bid: str) -> str: + return (ledger.get(bid) or {}).get("state", "new") + + groups: dict[str, list[dict]] = { + "unscanned": [], # no verdict yet -> Phase 1 crawl + "found": [], # direct removable listing -> Phase 2 opt-out (incl. reappeared/action_selected) + "indirect_exposure": [],# PII on a third party's record -> CCPA/GDPR delete email + "blocked": [], # anti-bot / needs stealth browser -> requeue + "in_progress": [], # submitted / verification_pending / awaiting_processing + "human": [], # human_task_queued -> the end-of-run digest, NOT re-scanning + "done": [], # confirmed_removed + "not_found": [], + } + covered_by_parent: dict[str, list[str]] = {} + + for a in actions: + bid = a["broker_id"] + st = state_of(bid) + # cluster collapse: if a parent in this set is already actioned, the child is covered + parent = child_to_parent.get(bid) + if parent and state_of(parent) in ("found", "reappeared", "action_selected", "submitted", + "verification_pending", "awaiting_processing", + "confirmed_removed", "human_task_queued"): + covered_by_parent.setdefault(parent, []).append(bid) + continue + + row = {"broker_id": bid, "broker_name": a["broker_name"], "priority": a["priority"], + "tier": a["tier"], "method": a["method"], "state": st, + "optout_url": a["optout_url"], "optout_email": a.get("optout_email"), + "clears_children": a.get("owns") or [], + "optout_requires": a.get("optout_requires") or {}, + "optout_quirks": a.get("optout_quirks") or [], + "deletion": a.get("deletion") or {}, + "optout_playbook": a.get("optout_playbook") or [], + "notes": a.get("notes", "")} + if st in ("submitted", "verification_pending", "awaiting_processing"): + 
def synthesize_steps(r: dict) -> list[str]:
    """Generic ordered opt-out steps derived from an optout record's structured fields.

    Used for any broker without a hand-verified `optout.playbook`. Bespoke, field-verified
    step lists live IN the broker JSON (`optout.playbook`) - single source of truth that
    accrues knowledge as live runs discover mechanics (see methods.md logging rule).

    The deletion lane is surfaced for an email address or, when only a URL is
    recorded, for that URL.
    """
    where = r.get("optout_url") or r.get("optout_email") or "(see broker record)"
    children = r.get("clears_children")
    tail = f" -- clears {', '.join(str(c) for c in children)}." if children else "."
    steps = [f"Opt out at {where}" + tail]

    req = r.get("optout_requires")
    if not isinstance(req, dict):
        req = {}
    if req.get("profile_url"):
        steps.append("Needs the confirmed profile_url (paste the listing URL you recorded).")
    if req.get("email_verification"):
        steps.append("Email verification: the same browser/inbox must open the confirmation link.")
    if req.get("phone_callback"):
        steps.append("Phone-callback code required; queue a human task if no operator is available.")
    if req.get("gov_id"):
        steps.append("Government ID demanded (T3): human task; never send SSN or a full ID number.")

    d = r.get("deletion")
    if not isinstance(d, dict):
        d = {}
    note = f" ({d['notes']})" if d.get("notes") else ""
    if d.get("email"):
        steps.append(f"DELETION lane: a right-to-delete request can be emailed to {d['email']}"
                     + note + " -- prefer deletion over suppression.")
    elif d.get("url"):
        # A deletion lane with only a URL used to be dropped from the guidance.
        steps.append(f"DELETION lane: a right-to-delete request can be filed at {d['url']}"
                     + note + " -- prefer deletion over suppression.")

    if r.get("notes"):
        steps.append(str(r["notes"]))
    steps.extend(str(q) for q in (r.get("optout_quirks") or [])[:3])
    return steps


def _parent_playbook(found_rows: list[dict]) -> list[dict]:
    """Tailored, ordered opt-out instructions for each cluster PARENT in the found group.

    Steps come from the broker record's own `optout.playbook` (field-verified, maintained with
    the data) with a synthesised fallback so the guidance is never empty. Standalone listings
    are intentionally omitted -- the playbook exists to make the parents-first order concrete.
    """
    playbook: list[dict] = []
    parents = [x for x in found_rows if x.get("clears_children")]
    for i, r in enumerate(parents, start=1):
        steps = list(r.get("optout_playbook") or []) or synthesize_steps(r)
        playbook.append({
            "order": i,
            "broker_id": r["broker_id"],
            "broker_name": r["broker_name"],
            "tier": r["tier"],
            "clears_children": r["clears_children"],
            "optout_url": r.get("optout_url"),
            "optout_email": r.get("optout_email"),
            "deletion": r.get("deletion") or {},
            "steps": steps,
        })
    return playbook


def _batch_next(groups: dict, covered: dict) -> list[str]:
    """Turn the grouped batch view into short, ordered next-step tips.

    Args:
        groups: Group name -> rows; a missing group is treated as empty.
        covered: Parent id -> child ids covered by that parent's removal.
    """
    unscanned = groups.get("unscanned") or []
    found = groups.get("found") or []
    indirect = groups.get("indirect_exposure") or []
    blocked = groups.get("blocked") or []
    in_progress = groups.get("in_progress") or []
    human = groups.get("human") or []

    tips: list[str] = []
    if unscanned:
        tips.append(f"PHASE 1 (crawl): {len(unscanned)} broker(s) unscanned -- run `fanout` and "
                    "scan read-only before any deletion.")
    if found:
        parents = [r for r in found if r.get("clears_children")]
        if parents:
            order = " -> ".join(r["broker_id"] for r in parents)
            tips.append(f"PHASE 2 (opt-out): {len(found)} direct listing(s). DO CLUSTER PARENTS "
                        f"FIRST, in this order: {order} (see `parent_playbook` for tailored per-parent "
                        "steps), then the standalone listings.")
        else:
            tips.append(f"PHASE 2 (opt-out): {len(found)} direct listing(s) to remove.")
    if indirect:
        tips.append(f"{len(indirect)} indirect-exposure case(s): send a targeted "
                    "CCPA/GDPR delete-my-PII email (render-email --kind ccpa_indirect), do NOT use the opt-out form.")
    if blocked:
        tips.append(f"{len(blocked)} blocked (anti-bot): requeue for a stealth/cloud browser "
                    "pass; don't burn subagent time fighting CAPTCHAs.")
    if covered:
        n = sum(len(v) for v in covered.values())
        tips.append(f"Cluster dedup: {n} child site(s) covered by parent removals -- skip separate opt-outs.")
    if in_progress:
        tips.append(f"{len(in_progress)} in progress: resolve verification links, then confirm removal.")
    if human:
        tips.append(f"{len(human)} parked human task(s): present via `tasks` at end of run "
                    "(do not re-scan or re-queue them).")
    return tips
"""Enumerate the search queries to run per broker, across ALL of a subject's identifiers.

People-search sites index a person under every name, phone, email, and address they
have. A subject with two names (maiden/married) and three past cities can have many
distinct listings on one broker, each found via a different search. `search_vectors`
expands the dossier into the concrete searches to run, filtered by what each broker
supports (`broker.search.by`, default ["name"]).
"""
from __future__ import annotations

import dossier as dossier_mod

# What a broker can be searched by; default if a record doesn't declare it.
DEFAULT_BY = ["name"]


def supported_by(broker: dict) -> list[str]:
    """Return the search kinds a broker declares under `search.by`, or a copy of DEFAULT_BY.

    A missing, null, or empty `search` / `search.by` falls back to the default.
    """
    return list((broker.get("search") or {}).get("by") or DEFAULT_BY)


def search_vectors(subject_dossier: dict, broker: dict) -> list[dict]:
    """List of {by, query} searches to run for this subject on this broker.

    Args:
        subject_dossier: The subject's dossier. Phones and emails are read from its
            `identity` mapping; names, locations and addresses come from the `dossier`
            module helpers.
        broker: Broker record; only the kinds returned by `supported_by` are expanded.

    Returns:
        Vectors in a fixed order: name (one per name x location, or one per name when no
        location is known), then phone, email, and address (only addresses with `line1`).
    """
    by = set(supported_by(broker))
    # `dict.get`'s default only covers a MISSING key; an explicit `"identity": null`
    # would otherwise leave `ident` as None and break the phone/email branches.
    ident = subject_dossier.get("identity") or {}
    vectors: list[dict] = []

    if "name" in by:
        names = dossier_mod.all_names(subject_dossier)
        locations = dossier_mod.all_locations(subject_dossier)
        if locations:
            for name in names:
                for loc in locations:
                    vectors.append({"by": "name",
                                    "query": {"full_name": name, "city": loc.get("city"), "state": loc.get("state")}})
        else:
            for name in names:
                vectors.append({"by": "name", "query": {"full_name": name}})

    if "phone" in by:
        for phone in ident.get("phones") or []:
            vectors.append({"by": "phone", "query": {"phone": phone}})

    if "email" in by:
        for email in ident.get("emails") or []:
            vectors.append({"by": "email", "query": {"email": email}})

    if "address" in by:
        for a in dossier_mod.all_addresses(subject_dossier):
            if a.get("line1"):
                vectors.append({"by": "address",
                                "query": {k: a.get(k) for k in ("line1", "city", "state", "postal")}})

    return vectors
+opt-out, deletion, and do-not-sell/share requests under applicable privacy laws (e.g. CCPA/CPRA, +GDPR) on my behalf. + +This authorization is limited to data removal. It does not authorize any other use of my information. + +- Full name: {full_name} +- Date: {date} +- Signature: ______________________________ + +Store the signed copy at the path recorded in the dossier `consent.authorization_artifact`. Required +only when `consent.method` is `written_authorization` or `poa` (not for `self`). diff --git a/optional-skills/security/unbroker/templates/emails/ccpa-authorized-agent.txt b/optional-skills/security/unbroker/templates/emails/ccpa-authorized-agent.txt new file mode 100644 index 000000000..e5e4a09a1 --- /dev/null +++ b/optional-skills/security/unbroker/templates/emails/ccpa-authorized-agent.txt @@ -0,0 +1,24 @@ +Subject: CCPA/CPRA request submitted by authorized agent (delete and opt out) + +To the {broker_name} privacy team, + +I am an authorized agent acting on behalf of the consumer named below, under Cal. Civ. Code +1798.135 and the CCPA/CPRA. Written authorization is on file and available on request. + +On the consumer's behalf I request that you: + + 1. DELETE all personal information you hold about them, and + 2. OPT them OUT of the sale and sharing of their personal information. + +Their information appears at: +{listing_urls} + +Consumer: + Name: {full_name} + Email for confirmation: {contact_email} + +Please confirm completion in writing to the email above. Do not request more sensitive information +than necessary to verify and process this request. 
+ +Sincerely, +Authorized agent for {full_name} diff --git a/optional-skills/security/unbroker/templates/emails/ccpa-deletion.txt b/optional-skills/security/unbroker/templates/emails/ccpa-deletion.txt new file mode 100644 index 000000000..db2d201c7 --- /dev/null +++ b/optional-skills/security/unbroker/templates/emails/ccpa-deletion.txt @@ -0,0 +1,22 @@ +Subject: CCPA/CPRA request to delete and opt out (do not sell or share) + +To the {broker_name} privacy team, + +Under the California Consumer Privacy Act as amended by the CPRA (Cal. Civ. Code 1798.105 and +1798.120), I request that you: + + 1. DELETE all personal information you hold about me, and + 2. OPT me OUT of the sale and sharing of my personal information. + +My information appears at: +{listing_urls} + +Identifying details for this request: + Name: {full_name} + Email: {contact_email} + +Please do not request more sensitive information than necessary to process this request. Confirm +completion in writing to the email above within the statutory timeframe. + +Sincerely, +{full_name} diff --git a/optional-skills/security/unbroker/templates/emails/ccpa-indirect-deletion.txt b/optional-skills/security/unbroker/templates/emails/ccpa-indirect-deletion.txt new file mode 100644 index 000000000..e561e8d6e --- /dev/null +++ b/optional-skills/security/unbroker/templates/emails/ccpa-indirect-deletion.txt @@ -0,0 +1,30 @@ +Subject: Request to delete my personal information from third-party listings (CCPA/CPRA where applicable) + +To the {broker_name} privacy team, + +I am not the primary subject of the listings below, but each one currently exposes MY personal +information as a secondary data point (my email address and/or my name shown as a relative or +associated person). I am writing only about my own personal information, not about the individuals +who are the primary subjects of these records, and I am not requesting any change to their data. 
+ +Please delete and suppress the following personal information about me wherever it appears in your +database and on any sites you operate or supply: +{my_identifiers} + +These items currently appear at: +{listing_urls} + +To the extent the California Consumer Privacy Act as amended by the CPRA (Cal. Civ. Code 1798.105) +applies to my personal information, please treat this as a request to delete it and to opt me out of +its sale or sharing (1798.120). Where CCPA does not apply by residency, I ask that you honor this as a +standard removal of my personal information consistent with the policy you apply to such requests. + +Please do not request more information than is necessary to locate and remove these items, and please +do not add any new identifiers to my data in the course of processing this request. Confirm completion +in writing to the email below. + +Name: {full_name} +Contact email: {contact_email} + +Sincerely, +{full_name} diff --git a/optional-skills/security/unbroker/templates/emails/gdpr-erasure.txt b/optional-skills/security/unbroker/templates/emails/gdpr-erasure.txt new file mode 100644 index 000000000..b1b7936af --- /dev/null +++ b/optional-skills/security/unbroker/templates/emails/gdpr-erasure.txt @@ -0,0 +1,19 @@ +Subject: Request for erasure under GDPR Article 17 + +To the {broker_name} data protection officer, + +Under Article 17 of the EU General Data Protection Regulation (and/or the UK GDPR), I request the +erasure of all personal data you hold about me, and under Article 21 I object to its processing. + +My personal data appears at: +{listing_urls} + +Identifying details: + Name: {full_name} + Email: {contact_email} + +Please confirm erasure in writing to the email above within one month, as required by Article 12(3). +Do not request more personal data than necessary to action this request. 
+ +Sincerely, +{full_name} diff --git a/optional-skills/security/unbroker/templates/emails/generic-optout.txt b/optional-skills/security/unbroker/templates/emails/generic-optout.txt new file mode 100644 index 000000000..3fef1bf98 --- /dev/null +++ b/optional-skills/security/unbroker/templates/emails/generic-optout.txt @@ -0,0 +1,15 @@ +Subject: Opt-out and data removal request + +To the {broker_name} privacy team, + +I am writing to request the removal of my personal information from {broker_name} and any sites you +operate or supply. My information currently appears at: +{listing_urls} + +Please suppress and delete the record(s) associated with my name, {full_name}, and do not sell or +share my personal information. + +Please confirm completion to this email address: {contact_email} + +Thank you, +{full_name} diff --git a/plugins/image_gen/openai/__init__.py b/plugins/image_gen/openai/__init__.py index e214271bc..cfa9e42c9 100644 --- a/plugins/image_gen/openai/__init__.py +++ b/plugins/image_gen/openai/__init__.py @@ -146,7 +146,10 @@ def _load_image_bytes(ref: str) -> Tuple[bytes, str]: if "image/" in header: ext = header.split("image/", 1)[1].split(";", 1)[0] or "png" return base64.b64decode(b64), f"image.{ext}" - # Local file path. + # Local file path — enforce the shared credential-read guard before reading. + from agent.file_safety import raise_if_read_blocked + + raise_if_read_blocked(ref) with open(ref, "rb") as fh: data = fh.read() name = os.path.basename(ref) or "image.png" diff --git a/plugins/image_gen/openrouter/__init__.py b/plugins/image_gen/openrouter/__init__.py index a3cc348cc..a5c6c164d 100644 --- a/plugins/image_gen/openrouter/__init__.py +++ b/plugins/image_gen/openrouter/__init__.py @@ -97,6 +97,10 @@ def _to_image_url_part(ref: str) -> Optional[str]: if ref.startswith(("http://", "https://", "data:")): return ref path = Path(ref) + # Enforce the shared credential-read guard before inlining local bytes. 
+ from agent.file_safety import raise_if_read_blocked + + raise_if_read_blocked(ref) try: raw = path.read_bytes() except OSError as exc: diff --git a/plugins/image_gen/xai/__init__.py b/plugins/image_gen/xai/__init__.py index 31a0b719b..5ce9f26cb 100644 --- a/plugins/image_gen/xai/__init__.py +++ b/plugins/image_gen/xai/__init__.py @@ -137,6 +137,11 @@ def _xai_image_field(source: str) -> Dict[str, str]: import base64 import os as _os + # Enforce the shared credential-read guard before reading local bytes + # (same boundary the OpenAI / OpenRouter / Codex image providers apply). + from agent.file_safety import raise_if_read_blocked + + raise_if_read_blocked(source) with open(_os.path.expanduser(source), "rb") as fh: # windows-footgun: ok raw = fh.read() ext = (_os.path.splitext(source)[1].lstrip(".") or "png").lower() diff --git a/plugins/platforms/matrix/adapter.py b/plugins/platforms/matrix/adapter.py index ebe9ebbbf..dac4dbd16 100644 --- a/plugins/platforms/matrix/adapter.py +++ b/plugins/platforms/matrix/adapter.py @@ -2351,7 +2351,18 @@ class MatrixAdapter(BasePlatformAdapter): if inspect.isawaitable(tasks): tasks = await tasks if tasks: - await asyncio.gather(*tasks) + # return_exceptions=True so one failing event handler doesn't abort + # the whole gather and silently drop the SIBLING events in the same + # sync response (a bare gather re-raises the first exception, leaving + # the rest of the batch unprocessed). Mirrors the invite/redaction + # gathers above. Surface each failure instead of swallowing it. + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.warning( + "Matrix: event handler failed during sync dispatch: %s", + result, + ) def _is_self_sender(self, sender: str) -> bool: """Return True if the sender refers to the bot's own account. 
def _raise_if_blocked_local_input(ref: str) -> None:
    """Reject a local media path that Hermes' read deny-list blocks.

    Delegates to the shared ``agent.file_safety.raise_if_read_blocked`` so xAI video
    inputs go through the same credential-store guard as the image providers. If the
    guard cannot be imported, the check is skipped (fail-open) and the reason is logged
    at debug level; whatever the guard itself raises propagates to the caller.
    """
    try:
        from agent.file_safety import raise_if_read_blocked as _guard
    except Exception as err:  # noqa: BLE001 - guard must never break loading
        logger.debug("xAI media input read guard unavailable: %s", err)
    else:
        _guard(ref)
""" - from tools.delegate_tool import delegate_task as _delegate_task + from tools.delegate_tool import ( + _strip_model_hidden_task_fields, + delegate_task as _delegate_task, + ) # Delegations from the top-level MODEL always run in the background — # the model does not get to choose. delegate_task returns immediately # with a handle (one per task) and each subagent's result re-enters the @@ -5639,10 +5642,8 @@ class AIAgent: return _delegate_task( goal=function_args.get("goal"), context=function_args.get("context"), - tasks=function_args.get("tasks"), + tasks=_strip_model_hidden_task_fields(function_args.get("tasks")), max_iterations=function_args.get("max_iterations"), - acp_command=function_args.get("acp_command"), - acp_args=function_args.get("acp_args"), role=function_args.get("role"), background=(not _is_subagent), parent_agent=self, diff --git a/scripts/install.ps1 b/scripts/install.ps1 index 15f98f4b4..19df6d313 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -1602,7 +1602,12 @@ function Install-Venv { Write-Info "Creating virtual environment with Python $PythonVersion..." Push-Location $InstallDir - + + # Tasks we disabled below and must re-enable no matter how this stage + # exits. Populated only with tasks that were ENABLED before we touched + # them, so a task the user deliberately disabled is never re-armed. + $gatewayTasksDisabled = @() + try { if (Test-Path "venv") { Write-Info "Virtual environment already exists, recreating..." # On Windows, native Python extensions (e.g. _bcrypt.pyd, tornado's @@ -1614,6 +1619,31 @@ function Install-Venv { if ($env:OS -eq "Windows_NT") { $myPid = $PID Write-Info "Stopping any running hermes processes before recreating venv..." + # Disarm the respawner FIRST: the gateway autostart Scheduled Task + # relaunches a killed gateway within seconds, and losing that race + # re-locks the venv's .pyd files between our kill sweep and + # Remove-Item (the July 2026 _brotlicffi.pyd incident). 
schtasks + # /End stops a running task instance; /Change /DISABLE stops it + # from re-firing mid-install. (The Startup-folder .vbs fallback is + # NOT touched: it only fires at logon, so it cannot respawn a + # gateway mid-install.) Re-enabled in the finally below — including + # on failure — but only for tasks that were enabled to begin with. + # Best-effort: a missing task just errors quietly. + try { + schtasks /Query /FO CSV 2>$null | ConvertFrom-Csv | Where-Object { $_.TaskName -like '*Hermes_Gateway*' } | ForEach-Object { + $tn = $_.TaskName + if ($_.Status -eq 'Disabled') { + Write-Info " gateway autostart task $tn is already disabled; leaving it that way" + return + } + schtasks /End /TN $tn 2>$null | Out-Null + schtasks /Change /TN $tn /DISABLE 2>$null | Out-Null + $gatewayTasksDisabled += $tn + Write-Info " disabled gateway autostart task $tn for the duration of the install" + } + } catch { + Write-Warn "Could not enumerate gateway scheduled tasks: $($_.Exception.Message)" + } # The launcher CLI (hermes.exe) plus its child tree. & taskkill /F /T /IM hermes.exe /FI "PID ne $myPid" 2>$null | Out-Null # taskkill /IM hermes.exe is NOT enough: the gateway/agent that a @@ -1632,27 +1662,68 @@ function Install-Venv { # ExecutablePath for a process it cannot inspect (a different session) # instead of throwing, so an unreadable process is skipped rather than # aborting the whole sweep. + # + # The sweep is a bounded LOOP, not single-shot: supervised processes + # (the Desktop app's backend, a watchdog-managed gateway) respawn in + # the window between one kill pass and the delete. Each pass re- + # enumerates; three consecutive clean passes (or the attempt cap) + # ends the loop. 
$venvPrefix = [System.IO.Path]::GetFullPath((Join-Path $InstallDir "venv")).TrimEnd('\') + '\' - try { - Get-CimInstance Win32_Process -ErrorAction Stop | - Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } | - ForEach-Object { - Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv" - Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue - } - } catch { - Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)" + $cleanPasses = 0 + for ($sweep = 0; $sweep -lt 10 -and $cleanPasses -lt 3; $sweep++) { + $found = 0 + try { + Get-CimInstance Win32_Process -ErrorAction Stop | + Where-Object { $_.ProcessId -ne $myPid -and $_.ExecutablePath -and $_.ExecutablePath.StartsWith($venvPrefix, [System.StringComparison]::OrdinalIgnoreCase) } | + ForEach-Object { + $found++ + Write-Info " stopping PID $($_.ProcessId) ($($_.Name)) running from venv" + Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue + } + } catch { + Write-Warn "Could not enumerate venv processes: $($_.Exception.Message)" + break + } + if ($found -eq 0) { $cleanPasses++ } else { $cleanPasses = 0 } + Start-Sleep -Milliseconds 400 } - Start-Sleep -Milliseconds 800 } - Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue - # A killed process can take a moment to release its file handles, so a - # first Remove-Item may still hit a locked .pyd. Retry once after a short - # pause before giving up and letting the stage fail loudly. - if (Test-Path "venv") { - Start-Sleep -Seconds 2 - Remove-Item -Recurse -Force "venv" + # Rename-then-delete: on Windows a directory RENAME succeeds even while + # files inside it are mapped as DLLs (only in-place delete/replace of + # the mapped file is denied, and only same-volume renames are atomic + # moves). 
Moving the old venv aside means `uv venv` can create a fresh + # one immediately even if some straggler still holds a .pyd from the + # old tree; the renamed dir is deleted best-effort (now, and by the + # cleanup pass below on the NEXT install if a handle outlives this one). + $staleName = "venv.stale.{0}" -f (Get-Date -Format "yyyyMMddHHmmss") + $renamed = $false + try { + Rename-Item -Path "venv" -NewName $staleName -ErrorAction Stop + $renamed = $true + } catch { + Write-Warn "Could not rename venv aside ($($_.Exception.Message)); falling back to in-place delete" } + if ($renamed) { + Remove-Item -Recurse -Force $staleName -ErrorAction SilentlyContinue + if (Test-Path $staleName) { + Write-Warn "Old venv parked at $staleName (a process still holds files in it); it will be cleaned up on the next install" + } + } else { + Remove-Item -Recurse -Force "venv" -ErrorAction SilentlyContinue + # A killed process can take a moment to release its file handles, so a + # first Remove-Item may still hit a locked .pyd. Retry once after a short + # pause before giving up and letting the stage fail loudly. + if (Test-Path "venv") { + Start-Sleep -Seconds 2 + Remove-Item -Recurse -Force "venv" + } + } + } + + # Clean up parked venvs from previous installs whose handles have since + # been released. Best-effort — a still-held tree just stays for next time. + Get-ChildItem -Directory -Filter "venv.stale.*" -ErrorAction SilentlyContinue | ForEach-Object { + Remove-Item -Recurse -Force $_.FullName -ErrorAction SilentlyContinue } # uv creates the venv and pins the Python version in one step. uv emits @@ -1666,7 +1737,6 @@ function Install-Venv { # ok=true) when the venv was never created. 
$venvExitCode = $LASTEXITCODE if ($venvExitCode -ne 0) { - Pop-Location throw "Failed to create virtual environment (uv venv exited with $venvExitCode)" } @@ -1681,9 +1751,23 @@ function Install-Venv { if (Test-Path $venvPythonExe) { $env:UV_PYTHON = $venvPythonExe } + } finally { + Pop-Location + # Re-arm the gateway autostart tasks disabled during the venv teardown + # — in a finally so a failed teardown/creation can never strand the + # user's gateway autostart in the disabled state. Same function scope, + # so the list survives even under the stage-per-process bootstrap. + # Deliberately NOT started here — dependencies aren't installed yet; + # the task fires normally on next logon and `hermes update` / the + # gateway resume path handles the immediate restart. + if ($gatewayTasksDisabled -and $gatewayTasksDisabled.Count -gt 0) { + foreach ($tn in $gatewayTasksDisabled) { + schtasks /Change /TN $tn /ENABLE 2>$null | Out-Null + } + Write-Info "Re-enabled gateway autostart task(s): $($gatewayTasksDisabled -join ', ')" + } + } - Pop-Location - Write-Success "Virtual environment ready (Python $PythonVersion)" } diff --git a/scripts/release.py b/scripts/release.py index 1b3d3a672..11f374e9f 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -46,6 +46,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" # Auto-extracted from noreply emails + manual overrides AUTHOR_MAP = { "infinitycrew39@gmail.com": "infinitycrew39", # PR #56431 salvage (honor live vLLM context limits on local endpoints) + "jonathan.kovacs999@gmail.com": "CocaKova", # PR #57692 salvage (cron: run jobs under the profile secret scope so get_secret does not fail-close with UnscopedSecretError under profile isolation) "hermes.wanderer@yahoo.com": "trismegistus-wanderer", # PR #31856 salvage (gateway: defer idle-TTL agent-cache eviction until the session store says the session actually expired, so the expiry watcher can still fire MemoryProvider.on_session_end with the live 
class TestResolveVisionCustomProvider:
    """Vision Step 1 must receive the live base_url/api_key of a custom-endpoint main.

    Regression: a ``custom:<name>`` main provider resolves to the bare
    runtime provider id ``"custom"``. ``resolve_provider_client("custom")``
    has no built-in endpoint, so without forwarding the live base_url/api_key
    it returns ``(None, None)`` and vision falls through to OpenRouter / Nous,
    which an offline / aggregator-less user has never configured — breaking
    vision entirely with ``No LLM provider configured for task=vision
    provider=auto``. The fix recovers the live endpoint that
    ``set_runtime_main()`` recorded for the turn.
    """

    @staticmethod
    def _record_runtime_main(monkeypatch, base_url, api_key, api_mode):
        """Pin the module-level runtime-main endpoint values for one test."""
        import agent.auxiliary_client as aux

        for attr, value in (
            ("_RUNTIME_MAIN_BASE_URL", base_url),
            ("_RUNTIME_MAIN_API_KEY", api_key),
            ("_RUNTIME_MAIN_API_MODE", api_mode),
        ):
            monkeypatch.setattr(aux, attr, value)

    @staticmethod
    def _resolve_vision(main_provider):
        """Run vision resolution for ``main_provider`` with the client factory mocked.

        Returns ``(result_tuple, stub_client, mock_resolve)`` so each test can make its
        own assertions on the outcome and on how the factory was called.
        """
        with patch(
            "agent.auxiliary_client._read_main_provider", return_value=main_provider,
        ), patch(
            "agent.auxiliary_client._read_main_model", return_value="claude-opus-4-8",
        ), patch(
            "agent.auxiliary_client._resolve_task_provider_model",
            return_value=("auto", None, None, None, None),
        ), patch(
            "agent.auxiliary_client.resolve_provider_client"
        ) as mock_resolve:
            stub_client = MagicMock()
            mock_resolve.return_value = (stub_client, "claude-opus-4-8")

            from agent.auxiliary_client import resolve_vision_provider_client

            outcome = resolve_vision_provider_client()
        return outcome, stub_client, mock_resolve

    def test_custom_main_forwards_runtime_endpoint(self, monkeypatch):
        """custom main with recorded runtime endpoint → Step 1 builds a client."""
        self._record_runtime_main(
            monkeypatch, "https://my.endpoint.example/v1", "sk-runtime-key", "anthropic_messages"
        )

        (provider, client, model), stub_client, mock_resolve = self._resolve_vision("custom")

        assert provider == "custom"
        assert client is stub_client
        assert model == "claude-opus-4-8"
        # The endpoint credentials recorded for the turn MUST be forwarded,
        # otherwise resolve_provider_client("custom") returns (None, None).
        forwarded = mock_resolve.call_args.kwargs
        assert forwarded.get("explicit_base_url") == "https://my.endpoint.example/v1"
        assert forwarded.get("explicit_api_key") == "sk-runtime-key"
        assert forwarded.get("is_vision") is True

    def test_custom_prefixed_main_forwards_runtime_endpoint(self, monkeypatch):
        """A ``custom:<name>`` provider id also forwards the runtime endpoint."""
        self._record_runtime_main(monkeypatch, "https://named.example/v1", "sk-named", "")

        (provider, client, model), stub_client, mock_resolve = self._resolve_vision(
            "custom:copilot-gateway"
        )

        assert provider == "custom:copilot-gateway"
        assert client is stub_client
        forwarded = mock_resolve.call_args.kwargs
        assert forwarded.get("explicit_base_url") == "https://named.example/v1"
        assert forwarded.get("explicit_api_key") == "sk-named"
        assert forwarded.get("is_vision") is True

    def test_custom_main_no_runtime_falls_back_to_configured_endpoint(self, monkeypatch):
        """No recorded runtime endpoint → resolve the configured custom endpoint."""
        self._record_runtime_main(monkeypatch, "", "", "")

        with patch(
            "agent.auxiliary_client._resolve_custom_runtime",
            return_value=("https://configured.example/v1", "sk-configured", "chat_completions"),
        ):
            (provider, client, model), stub_client, mock_resolve = self._resolve_vision("custom")

        assert client is stub_client
        forwarded = mock_resolve.call_args.kwargs
        assert forwarded.get("explicit_base_url") == "https://configured.example/v1"
        assert forwarded.get("explicit_api_key") == "sk-configured"
+ """ cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}} with patch("agent.image_routing._lookup_supports_vision", return_value=True): - assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text" + assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native" + + def test_auto_uses_aux_vision_fallback_for_text_only_main_model(self): + """#29135: aux vision still acts as fallback for non-vision main models.""" + cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}} + with patch("agent.image_routing._lookup_supports_vision", return_value=False): + assert decide_image_input_mode("deepseek", "deepseek-v4-pro", cfg) == "text" def test_none_config_is_auto(self): with patch("agent.image_routing._lookup_supports_vision", return_value=True): @@ -224,6 +234,37 @@ class TestSupportsVisionOverride: cfg = {"model": "some-string", "providers": ["not-a-dict"]} assert _supports_vision_override(cfg, "custom", "my-llava") is None + def test_custom_colon_name_stripped_suffix_lookup(self): + # model.provider: custom:my-proxy → should resolve stripped key "my-proxy" + cfg = { + "model": {"provider": "custom:my-proxy"}, + "providers": { + "my-proxy": {"models": {"gpt-5.5": {"supports_vision": True}}}, + }, + } + assert _supports_vision_override(cfg, "custom", "gpt-5.5") is True + + def test_custom_colon_name_stripped_suffix_false(self): + # Explicitly disabled vision on the stripped key. + cfg = { + "model": {"provider": "custom:my-proxy"}, + "providers": { + "my-proxy": {"models": {"gpt-5.5": {"supports_vision": False}}}, + }, + } + assert _supports_vision_override(cfg, "custom", "gpt-5.5") is False + + def test_custom_colon_name_no_stripped_key_falls_through(self): + # custom:my-proxy but providers only has "custom" — stripped key + # doesn't match, but "custom" does via runtime provider. 
+ cfg = { + "model": {"provider": "custom:my-proxy"}, + "providers": { + "custom": {"models": {"gpt-5.5": {"supports_vision": True}}}, + }, + } + assert _supports_vision_override(cfg, "custom", "gpt-5.5") is True + # ─── _lookup_supports_vision (override-aware) ──────────────────────────────── @@ -294,15 +335,25 @@ class TestAutoModeRespectsOverride: with patch("agent.models_dev.get_model_capabilities", return_value=None): assert decide_image_input_mode("custom", "unknown", {}) == "text" - def test_explicit_aux_vision_override_still_wins(self): - # If the user has configured a dedicated vision aux backend, respect - # it even when supports_vision: true is also set. + def test_explicit_aux_vision_no_longer_overrides_native_capable_main(self): + # #29135: aux.vision is a fallback for text-only main models; it + # must NOT preempt native routing when the main model can take + # images directly (supports_vision: true). cfg = { "model": {"supports_vision": True}, "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}}, } with patch("agent.models_dev.get_model_capabilities", return_value=None): - assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text" + assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native" + + def test_explicit_aux_vision_used_when_main_model_supports_vision_false(self): + # #29135 counterpart: text-only main model + aux fallback → text. 
+ cfg = { + "model": {"supports_vision": False}, + "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}}, + } + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "deepseek-v4", cfg) == "text" # ─── build_native_content_parts ────────────────────────────────────────────── diff --git a/tests/cron/test_run_one_job.py b/tests/cron/test_run_one_job.py index 7da6b1c14..c8528109d 100644 --- a/tests/cron/test_run_one_job.py +++ b/tests/cron/test_run_one_job.py @@ -117,3 +117,46 @@ def test_run_one_job_exception_marks_failure(monkeypatch): assert ok is False assert marks == [("j6", False)] + + +def test_run_one_job_installs_secret_scope_under_multiplex(monkeypatch, tmp_path): + """Regression: under profile isolation (multiplex active), run_one_job must + execute run_job inside a profile secret scope so credential reads + (resolve_runtime_provider -> get_secret) don't fail-close with + UnscopedSecretError, and must tear the scope down afterward. + + Behavior contract: a scope is present during run_job and absent after, + regardless of the concrete secret values. + """ + from agent import secret_scope as ss + + # Point cron's home resolution at a profile whose .env carries a secret. + (tmp_path / ".env").write_text("OPENROUTER_BASE_URL=https://openrouter.ai/api/v1\n") + monkeypatch.setattr(s, "_get_hermes_home", lambda: tmp_path) + + scope_during_run = {} + + def fake_run_job(job): + # This is where resolve_runtime_provider() would read a secret. Prove a + # scope is installed and the profile's secret resolves without raising. 
+ scope_during_run["scope"] = ss.current_secret_scope() + scope_during_run["base_url"] = ss.get_secret("OPENROUTER_BASE_URL") + return (True, "out", "final", None) + + monkeypatch.setattr(s, "run_job", fake_run_job) + monkeypatch.setattr(s, "save_job_output", lambda jid, out: f"/tmp/{jid}.txt") + monkeypatch.setattr(s, "_deliver_result", lambda *a, **k: None) + monkeypatch.setattr(s, "mark_job_run", lambda *a, **k: None) + + ss.set_multiplex_active(True) + try: + ok = s.run_one_job({"id": "j7", "name": "t"}) + finally: + ss.set_multiplex_active(False) + + assert ok is True + # Scope was installed during run_job and the profile secret resolved. + assert scope_during_run["scope"] is not None + assert scope_during_run["base_url"] == "https://openrouter.ai/api/v1" + # And it was torn down after run_one_job returned (no leak). + assert ss.current_secret_scope() is None diff --git a/tests/gateway/test_api_server_media_data_urls.py b/tests/gateway/test_api_server_media_data_urls.py index 960f4b194..bf0036b32 100644 --- a/tests/gateway/test_api_server_media_data_urls.py +++ b/tests/gateway/test_api_server_media_data_urls.py @@ -72,6 +72,40 @@ class TestResolveMediaToDataUrls(unittest.TestCase): out = _resolve_media_to_data_urls(f"MEDIA:{p1}\nand MEDIA:{p2}") self.assertEqual(out.count("data:image/png;base64,"), 2) + def test_relative_traversal_path_not_inlined(self): + """A relative/traversal path must never be inlined — the anchored + MEDIA_TAG_CLEANUP_RE matcher requires an absolute-path prefix + (~/, /, or a Windows drive letter), so a bare relative token after + MEDIA: is left as literal text rather than resolved against cwd.""" + text = "MEDIA:../../../../etc/passwd.png" + self.assertEqual(_resolve_media_to_data_urls(text), text) + + def test_credential_path_not_inlined_even_with_image_extension(self): + """An absolute path under the credential/system-path denylist + (validate_media_delivery_path) must not be inlined even though it + has an allowed image extension and 
the tag matcher's shape.""" + text = "MEDIA:~/.ssh/id_rsa.png" + self.assertEqual(_resolve_media_to_data_urls(text), text) + + def test_symlink_escaping_to_denylisted_target_not_inlined(self): + """A symlink whose resolved target lands under a denylisted system + prefix (/etc) must not be inlined — validate_media_delivery_path + resolves symlinks before the containment/denylist check runs, so + the traversal can't be laundered through an innocuous-looking + image-suffixed symlink name.""" + import os + import tempfile + from pathlib import Path + + d = Path(tempfile.mkdtemp(prefix="hermes_media_test_symlink")) + link = d / "shot.png" + try: + os.symlink("/etc/hosts", link) + except OSError: + self.skipTest("symlink creation not supported in this environment") + text = f"MEDIA:{link}" + self.assertEqual(_resolve_media_to_data_urls(text), text) + if __name__ == "__main__": unittest.main() diff --git a/tests/gateway/test_image_input_routing_runtime.py b/tests/gateway/test_image_input_routing_runtime.py new file mode 100644 index 000000000..5bf34d390 --- /dev/null +++ b/tests/gateway/test_image_input_routing_runtime.py @@ -0,0 +1,140 @@ +import pytest + +from gateway.config import GatewayConfig, Platform, PlatformConfig +from gateway.platforms.base import MessageEvent, MessageType +from gateway.run import GatewayRunner +from gateway.session import SessionSource + + +def _make_runner() -> GatewayRunner: + runner = object.__new__(GatewayRunner) + runner.config = GatewayConfig( + platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="fake")} + ) + runner.adapters = {} + runner._pending_native_image_paths_by_session = {} + runner._session_model_overrides = {} + runner._session_reasoning_overrides = {} + return runner + + +def _source() -> SessionSource: + return SessionSource( + platform=Platform.TELEGRAM, + chat_id="273403055", + chat_type="dm", + user_id="42", + user_name="Maxim", + ) + + +def _image_event(text: str = "look") -> MessageEvent: + return 
MessageEvent( + text=text, + message_type=MessageType.PHOTO, + source=_source(), + media_urls=["/tmp/cashback.png"], + media_types=["image/png"], + ) + + +def _auto_config() -> dict: + return { + "agent": {"image_input_mode": "auto"}, + "auxiliary": {"vision": {"provider": "auto", "model": "", "base_url": ""}}, + "model": {"provider": "xiaomi", "default": "mimo-v2.5-pro"}, + } + + +@pytest.mark.asyncio +async def test_prepare_image_routing_uses_session_vision_model_override(monkeypatch): + """Telegram /model overrides must affect native-vs-text image routing. + + Regression: _prepare_inbound_message_text used config.yaml's default model + before the per-session model override was installed on auxiliary_client's + runtime globals. A Telegram session switched to a vision model still had + screenshots pre-analyzed as text when config.default was text-only. + """ + runner = _make_runner() + source = _source() + event = _image_event() + cfg = _auto_config() + + monkeypatch.setattr("gateway.run._load_gateway_config", lambda: cfg) + monkeypatch.setattr("hermes_cli.config.load_config", lambda: cfg) + monkeypatch.setattr("agent.auxiliary_client._read_main_provider", lambda: "xiaomi") + monkeypatch.setattr("agent.auxiliary_client._read_main_model", lambda: "mimo-v2.5-pro") + monkeypatch.setattr( + runner, + "_resolve_session_agent_runtime", + lambda **_: ("gpt-5.5", {"provider": "openai-codex"}), + ) + + def fake_supports(provider, model, config): + return provider == "openai-codex" and model == "gpt-5.5" + + monkeypatch.setattr("agent.image_routing._lookup_supports_vision", fake_supports) + + async def fail_enrich(*_args, **_kwargs): + pytest.fail("vision-capable session override should use native image routing") + + monkeypatch.setattr(runner, "_enrich_message_with_vision", fail_enrich) + + result = await runner._prepare_inbound_message_text( + event=event, + source=source, + history=[], + ) + + session_key = runner._session_key_for_source(source) + assert result == "look" 
+ assert runner._pending_native_image_paths_by_session[session_key] == [ + "/tmp/cashback.png" + ] + + +@pytest.mark.asyncio +async def test_prepare_image_routing_falls_back_to_text_for_text_only_session_override(monkeypatch): + """A text-only session override should get vision_analyze text fallback. + + Regression mirror case: if config.default is a vision model but the current + Telegram session is switched to a text-only provider (for example Mimo), + auto routing must not attach pixels natively to the text-only model. + """ + runner = _make_runner() + source = _source() + event = _image_event() + cfg = _auto_config() + cfg["model"] = {"provider": "openai-codex", "default": "gpt-5.5"} + + monkeypatch.setattr("gateway.run._load_gateway_config", lambda: cfg) + monkeypatch.setattr("hermes_cli.config.load_config", lambda: cfg) + monkeypatch.setattr("agent.auxiliary_client._read_main_provider", lambda: "openai-codex") + monkeypatch.setattr("agent.auxiliary_client._read_main_model", lambda: "gpt-5.5") + monkeypatch.setattr( + runner, + "_resolve_session_agent_runtime", + lambda **_: ("mimo-v2.5-pro", {"provider": "xiaomi"}), + ) + + def fake_supports(provider, model, config): + return provider == "openai-codex" and model == "gpt-5.5" + + monkeypatch.setattr("agent.image_routing._lookup_supports_vision", fake_supports) + + async def fake_enrich(user_text, image_paths): + assert user_text == "look" + assert image_paths == ["/tmp/cashback.png"] + return "[vision summary]\n\nlook" + + monkeypatch.setattr(runner, "_enrich_message_with_vision", fake_enrich) + + result = await runner._prepare_inbound_message_text( + event=event, + source=source, + history=[], + ) + + session_key = runner._session_key_for_source(source) + assert result == "[vision summary]\n\nlook" + assert runner._pending_native_image_paths_by_session.get(session_key) is None diff --git a/tests/gateway/test_matrix.py b/tests/gateway/test_matrix.py index 748422045..d239728b7 100644 --- 
a/tests/gateway/test_matrix.py +++ b/tests/gateway/test_matrix.py @@ -5276,3 +5276,36 @@ class TestDeviceIdRecoveryOnReconnect: assert None not in _verify_call.args[0]["@bot:example.org"] await adapter.disconnect() + + +class TestMatrixDispatchSyncIsolation: + """A failing mautrix event handler must not abort the whole sync batch. + + ``_dispatch_sync`` gathers the per-event handler tasks. Without + ``return_exceptions=True`` the first exception aborts the gather and the + sibling events in the same sync response are silently dropped. + """ + + @pytest.mark.asyncio + async def test_dispatch_sync_isolates_failing_handler(self, caplog): + import logging + + adapter = _make_adapter() + ran = {"ok": False} + + async def _boom(): + raise RuntimeError("handler boom") + + async def _ok(): + ran["ok"] = True + + client = MagicMock() + client.handle_sync = MagicMock(return_value=[_boom(), _ok()]) + adapter._client = client + + with caplog.at_level(logging.WARNING): + # Must not raise despite the failing handler. 
+ await adapter._dispatch_sync({"next_batch": "s1"}) + + assert ran["ok"] is True # the sibling handler still ran + assert "event handler failed" in caplog.text # failure surfaced, not swallowed diff --git a/tests/gateway/test_native_image_buffer_isolation.py b/tests/gateway/test_native_image_buffer_isolation.py index f8fb2e65a..dbaa4350a 100644 --- a/tests/gateway/test_native_image_buffer_isolation.py +++ b/tests/gateway/test_native_image_buffer_isolation.py @@ -14,7 +14,7 @@ def _make_runner() -> GatewayRunner: runner.adapters = {} runner._model = "openai/gpt-4.1-mini" runner._base_url = None - runner._decide_image_input_mode = lambda: "native" + runner._decide_image_input_mode = lambda **_: "native" return runner @@ -77,3 +77,20 @@ async def test_native_image_buffer_not_cleared_by_other_sessions_without_images( assert runner._consume_pending_native_image_paths(build_session_key(source_a)) == ["/tmp/a.png"] assert runner._consume_pending_native_image_paths(build_session_key(source_b)) == [] + + +@pytest.mark.asyncio +async def test_native_image_buffer_uses_resolved_session_key_when_provided(): + runner = _make_runner() + source = _source("chat-a") + runner._session_key_for_source = lambda _source: "source-derived-key" + + await runner._prepare_inbound_message_text( + event=_image_event(source, "/tmp/a.png"), + source=source, + history=[], + session_key="canonical-session-key", + ) + + assert runner._consume_pending_native_image_paths("source-derived-key") == [] + assert runner._consume_pending_native_image_paths("canonical-session-key") == ["/tmp/a.png"] diff --git a/tests/gateway/test_queued_native_image_session_key.py b/tests/gateway/test_queued_native_image_session_key.py new file mode 100644 index 000000000..e24897561 --- /dev/null +++ b/tests/gateway/test_queued_native_image_session_key.py @@ -0,0 +1,151 @@ +import base64 +import importlib +import sys +import types +from types import SimpleNamespace + +import pytest + +from gateway.config import Platform, 
PlatformConfig +from gateway.platforms.base import BasePlatformAdapter, MessageEvent, MessageType, SendResult +from gateway.session import SessionSource + + +_ONE_BY_ONE_PNG = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO6L2ioAAAAASUVORK5CYII=" +) + + +class CaptureAdapter(BasePlatformAdapter): + def __init__(self): + super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM) + self.sent = [] + self.typing = [] + + async def connect(self) -> bool: + return True + + async def disconnect(self) -> None: + return None + + async def send(self, chat_id, content, reply_to=None, metadata=None) -> SendResult: + self.sent.append( + { + "chat_id": chat_id, + "content": content, + "reply_to": reply_to, + "metadata": metadata, + } + ) + return SendResult(success=True, message_id="sent-1") + + async def send_typing(self, chat_id, metadata=None) -> None: + self.typing.append({"chat_id": chat_id, "metadata": metadata}) + + async def stop_typing(self, chat_id) -> None: + return None + + async def get_chat_info(self, chat_id: str): + return {"id": chat_id} + + +class CaptureQueuedNativeImageAgent: + calls = [] + + def __init__(self, **kwargs): + self.tools = [] + self.tool_progress_callback = kwargs.get("tool_progress_callback") + + def run_conversation(self, message, conversation_history=None, task_id=None): + type(self).calls.append(message) + return { + "final_response": f"done-{len(type(self).calls)}", + "messages": [], + "api_calls": 1, + } + + +def _make_runner(adapter): + gateway_run = importlib.import_module("gateway.run") + runner = object.__new__(gateway_run.GatewayRunner) + runner.adapters = {adapter.platform: adapter} + runner._voice_mode = {} + runner._prefill_messages = [] + runner._ephemeral_system_prompt = "" + runner._reasoning_config = None + runner._provider_routing = {} + runner._fallback_model = None + runner._session_db = None + runner._running_agents = {} + runner._session_run_generation = {} + 
runner.hooks = SimpleNamespace(loaded_hooks=False) + runner.config = SimpleNamespace( + thread_sessions_per_user=False, + group_sessions_per_user=False, + stt_enabled=False, + ) + runner._model = "openai/gpt-4.1-mini" + runner._base_url = None + runner._decide_image_input_mode = lambda **_kw: "native" + return runner + + +@pytest.mark.asyncio +async def test_queued_followup_uses_pending_event_session_key_for_native_images(monkeypatch, tmp_path): + CaptureQueuedNativeImageAgent.calls = [] + + fake_dotenv = types.ModuleType("dotenv") + fake_dotenv.load_dotenv = lambda *args, **kwargs: None + monkeypatch.setitem(sys.modules, "dotenv", fake_dotenv) + + fake_run_agent = types.ModuleType("run_agent") + fake_run_agent.AIAgent = CaptureQueuedNativeImageAgent + monkeypatch.setitem(sys.modules, "run_agent", fake_run_agent) + + gateway_run = importlib.import_module("gateway.run") + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"}) + + adapter = CaptureAdapter() + runner = _make_runner(adapter) + + image_path = tmp_path / "queued-image.png" + image_path.write_bytes(_ONE_BY_ONE_PNG) + + source = SessionSource( + platform=Platform.TELEGRAM, + chat_id="-1001", + chat_type="group", + ) + pending_source = SessionSource( + platform=Platform.TELEGRAM, + chat_id="-1001", + chat_type="group", + thread_id="17585", + ) + + adapter._pending_messages["agent:main:telegram:group:-1001"] = MessageEvent( + text="describe this", + message_type=MessageType.PHOTO, + source=pending_source, + media_urls=[str(image_path)], + media_types=["image/png"], + message_id="queued-1", + ) + + result = await runner._run_agent( + message="hello", + context_prompt="", + history=[], + source=source, + session_id="sess-native-image-followup", + session_key="agent:main:telegram:group:-1001", + ) + + assert result["final_response"] == "done-2" + assert len(CaptureQueuedNativeImageAgent.calls) == 2 + 
queued_message = CaptureQueuedNativeImageAgent.calls[1] + assert isinstance(queued_message, list) + assert queued_message[0]["type"] == "text" + assert queued_message[0]["text"].startswith("describe this") + assert any(part.get("type") == "image_url" for part in queued_message) diff --git a/tests/hermes_cli/test_console_engine.py b/tests/hermes_cli/test_console_engine.py new file mode 100644 index 000000000..ac94facbd --- /dev/null +++ b/tests/hermes_cli/test_console_engine.py @@ -0,0 +1,681 @@ +from __future__ import annotations + +import io +import sys +from pathlib import Path + +import pytest + +from hermes_cli.console_engine import HermesConsoleEngine, run_console_repl + + +EXPECTED_CONSOLE_COMMANDS = { + ("status",), + ("doctor",), + ("logs",), + ("version",), + ("dump",), + ("debug", "share"), + ("debug", "delete"), + ("prompt-size",), + ("insights",), + ("security", "audit"), + ("portal", "info"), + ("portal", "tools"), + ("backup",), + ("import",), + ("send",), + ("config", "show"), + ("config", "path"), + ("config", "env-path"), + ("config", "check"), + ("config", "migrate"), + ("config", "set"), + ("sessions", "list"), + ("sessions", "stats"), + ("sessions", "export"), + ("sessions", "rename"), + ("sessions", "optimize"), + ("sessions", "repair"), + ("cron", "list"), + ("cron", "status"), + ("cron", "create"), + ("cron", "edit"), + ("cron", "pause"), + ("cron", "resume"), + ("cron", "run"), + ("cron", "remove"), + ("cron", "tick"), + ("profile",), + ("profile", "list"), + ("profile", "show"), + ("profile", "info"), + ("profile", "create"), + ("profile", "use"), + ("profile", "describe"), + ("profile", "rename"), + ("profile", "delete"), + ("profile", "export"), + ("profile", "import"), + ("profile", "install"), + ("profile", "update"), + ("tools", "list"), + ("tools", "enable"), + ("tools", "disable"), + ("tools", "post-setup"), + ("plugins", "list"), + ("plugins", "enable"), + ("plugins", "disable"), + ("plugins", "install"), + ("plugins", "update"), + 
("plugins", "remove"), + ("skills", "browse"), + ("skills", "search"), + ("skills", "inspect"), + ("skills", "list"), + ("skills", "check"), + ("skills", "list-modified"), + ("skills", "diff"), + ("skills", "install"), + ("skills", "update"), + ("skills", "audit"), + ("skills", "uninstall"), + ("skills", "reset"), + ("skills", "opt-in"), + ("skills", "opt-out"), + ("skills", "repair-official"), + ("skills", "snapshot", "export"), + ("skills", "snapshot", "import"), + ("skills", "tap", "list"), + ("skills", "tap", "add"), + ("skills", "tap", "remove"), + ("mcp", "list"), + ("mcp", "catalog"), + ("mcp", "test"), + ("mcp", "add"), + ("mcp", "remove"), + ("mcp", "install"), + ("mcp", "login"), + ("mcp", "reauth"), + ("mcp", "configure"), + ("mcp", "picker"), + ("memory", "status"), + ("memory", "off"), + ("memory", "reset"), + ("auth", "list"), + ("auth", "status"), + ("auth", "reset"), + ("auth", "add"), + ("auth", "remove"), + ("auth", "logout"), + ("auth", "spotify", "status"), + ("auth", "spotify", "login"), + ("auth", "spotify", "logout"), + ("pairing", "list"), + ("pairing", "approve"), + ("pairing", "revoke"), + ("pairing", "clear-pending"), + ("webhook", "list"), + ("webhook", "subscribe"), + ("webhook", "remove"), + ("webhook", "test"), + ("hooks", "list"), + ("hooks", "test"), + ("hooks", "doctor"), + ("hooks", "revoke"), + ("slack", "manifest"), + ("project", "list"), + ("project", "show"), + ("project", "create"), + ("project", "add-folder"), + ("project", "remove-folder"), + ("project", "rename"), + ("project", "set-primary"), + ("project", "use"), + ("project", "archive"), + ("project", "restore"), + ("project", "bind-board"), + ("kanban", "init"), + ("kanban", "boards", "list"), + ("kanban", "boards", "create"), + ("kanban", "boards", "rm"), + ("kanban", "boards", "switch"), + ("kanban", "boards", "current"), + ("kanban", "boards", "rename"), + ("kanban", "boards", "set-workdir"), + ("kanban", "create"), + ("kanban", "list"), + ("kanban", "show"), + 
("kanban", "assign"), + ("kanban", "reclaim"), + ("kanban", "reassign"), + ("kanban", "diagnose"), + ("kanban", "link"), + ("kanban", "unlink"), + ("kanban", "claim"), + ("kanban", "comment"), + ("kanban", "complete"), + ("kanban", "edit"), + ("kanban", "block"), + ("kanban", "schedule"), + ("kanban", "unblock"), + ("kanban", "promote"), + ("kanban", "archive"), + ("kanban", "stats"), + ("kanban", "runs"), + ("kanban", "heartbeat"), + ("kanban", "assignments"), + ("kanban", "context"), + ("bundles", "list"), + ("bundles", "show"), + ("bundles", "create"), + ("bundles", "delete"), + ("bundles", "reload"), + ("checkpoints", "status"), + ("checkpoints", "list"), + ("checkpoints", "prune"), + ("checkpoints", "clear"), + ("checkpoints", "clear-legacy"), + ("curator", "status"), + ("curator", "run"), + ("curator", "pause"), + ("curator", "resume"), + ("curator", "pin"), + ("curator", "unpin"), + ("curator", "restore"), + ("curator", "list-archived"), + ("curator", "archive"), + ("curator", "prune"), + ("curator", "backup"), + ("curator", "rollback"), + ("pets", "list"), + ("pets", "install"), + ("pets", "select"), + ("pets", "show"), + ("pets", "off"), + ("pets", "scale"), + ("pets", "remove"), + ("pets", "doctor"), +} + + +MUTATING_CONFIRMATION_SMOKE_COMMANDS = [ + "config set console.test true", + "config migrate", + "sessions rename abc123 new title", + "sessions optimize", + "cron create 'every 1h' 'say hello'", + "cron remove abc123", + "profile create tester --no-alias --no-skills", + "profile delete tester", + "tools disable web", + "plugins install owner/repo --no-enable", + "skills install openai/skills/example", + "mcp add demo --url https://example.com/sse", + "mcp configure github", + "mcp picker", + "backup --quick -o /tmp/hermes-console-test.zip", + "import /tmp/hermes-console-test.zip", + "send --to telegram hello", + "memory reset --target memory", + "auth remove openrouter 1", + "pairing approve abc123", + "webhook subscribe test --prompt hello", + 
"hooks test pre_tool_call", + "project create demo", + "kanban create 'demo task'", + "bundles create demo --skill skill-a", + "checkpoints prune", + "curator pause", + "pets install cat", +] + + +def test_console_parses_bare_and_hermes_prefixed_commands(_isolate_hermes_home): + engine = HermesConsoleEngine() + + bare = engine.execute("config path") + prefixed = engine.execute("hermes config path") + + assert bare.status == "ok" + assert prefixed.status == "ok" + assert bare.output == prefixed.output + assert bare.output.endswith("config.yaml") + + +def test_console_status_hides_cli_next_step_footer( + monkeypatch: pytest.MonkeyPatch, + _isolate_hermes_home, +): + import hermes_cli.status as status_mod + + def fake_show_status(_args): + print("◆ Sessions") + print("Active: 3 session(s)") + print() + rule = "\u2500" * 60 + print(f"\x1b[2m{rule}\x1b[0m") + print("\x1b[2m Run 'hermes doctor' for detailed diagnostics\x1b[0m") + print("\x1b[2m Run 'hermes setup' to configure\x1b[0m") + print() + + monkeypatch.setattr(status_mod, "show_status", fake_show_status) + + result = HermesConsoleEngine().execute("status") + + assert result.status == "ok" + assert "Sessions" in result.output + assert "Active: 3 session(s)" in result.output + assert "hermes doctor" not in result.output + assert "hermes setup" not in result.output + assert "\u2500" not in result.output + + +def test_console_status_hides_osc_linked_cli_next_step_footer( + monkeypatch: pytest.MonkeyPatch, + _isolate_hermes_home, +): + import hermes_cli.status as status_mod + + def osc_link(text: str) -> str: + return f"\x1b]8;;https://example.test\x1b\\{text}\x1b]8;;\x1b\\" + + def fake_show_status(_args): + print("◆ Sessions") + print("Active: 3 session(s)") + print() + print(osc_link("\u2500" * 60)) + print(osc_link(" Run 'hermes doctor' for detailed diagnostics")) + print(osc_link(" Run 'hermes setup' to configure")) + print() + + monkeypatch.setattr(status_mod, "show_status", fake_show_status) + + result = 
HermesConsoleEngine().execute("status") + + assert result.status == "ok" + assert "Sessions" in result.output + assert "Active: 3 session(s)" in result.output + assert "hermes doctor" not in result.output + assert "hermes setup" not in result.output + assert "https://example.test" not in result.output + assert "\u2500" not in result.output + + +def test_console_help_uses_cli_subcommand_summaries(): + help_text = HermesConsoleEngine().help_text() + + assert "skills list" in help_text + assert "List installed skills" in help_text + assert "Show all tools and their enabled/disabled status" in help_text + assert "Remove an MCP server" in help_text + assert "Check pet setup + terminal graphics support" in help_text + assert "Run `hermes skills list`" not in help_text + assert "Run `hermes tools list`" not in help_text + + +def test_console_help_table_keeps_long_summaries_compact(): + help_text = HermesConsoleEngine().help_text() + + slack_line = next( + line for line in help_text.splitlines() if line.strip().startswith("slack manifest") + ) + + assert len(slack_line) <= 112 + assert slack_line.endswith("...") + + +def test_console_help_for_command_uses_cli_summary(): + help_text = HermesConsoleEngine().help_text("skills list") + + assert help_text == "skills list\nList installed skills" + + +def test_console_registry_covers_non_admin_cli_surface(): + registered = set(HermesConsoleEngine().commands) + + missing = EXPECTED_CONSOLE_COMMANDS - registered + + assert missing == set() + + +EXPECTED_HOSTED_CONSOLE_COMMANDS = { + ("status",), + ("doctor",), + ("logs",), + ("version",), + ("prompt-size",), + ("insights",), + ("security", "audit"), + ("portal", "info"), + ("portal", "tools"), + ("send",), + ("config", "show"), + ("config", "path"), + ("config", "env-path"), + ("config", "check"), + ("config", "migrate"), + ("config", "set"), + ("sessions", "list"), + ("sessions", "stats"), + ("sessions", "export"), + ("sessions", "rename"), + ("sessions", "optimize"), + 
("sessions", "repair"), + ("cron", "list"), + ("cron", "status"), + ("cron", "create"), + ("cron", "edit"), + ("cron", "pause"), + ("cron", "resume"), + ("cron", "run"), + ("cron", "remove"), + ("cron", "tick"), + ("profile",), + ("profile", "list"), + ("profile", "show"), + ("profile", "info"), + ("tools", "list"), + ("tools", "enable"), + ("tools", "disable"), + ("tools", "post-setup"), + ("skills", "browse"), + ("skills", "search"), + ("skills", "inspect"), + ("skills", "list"), + ("skills", "check"), + ("skills", "list-modified"), + ("skills", "diff"), + ("skills", "install"), + ("skills", "update"), + ("skills", "audit"), + ("skills", "uninstall"), + ("skills", "reset"), + ("skills", "opt-in"), + ("skills", "opt-out"), + ("skills", "repair-official"), + ("skills", "snapshot", "export"), + ("skills", "tap", "list"), + ("mcp", "list"), + ("mcp", "catalog"), + ("mcp", "test"), + ("mcp", "add"), + ("mcp", "remove"), + ("mcp", "install"), + ("mcp", "login"), + ("mcp", "reauth"), + ("mcp", "configure"), + ("mcp", "picker"), + ("memory", "status"), + ("auth", "list"), + ("auth", "status"), + ("auth", "reset"), + ("auth", "spotify", "status"), + ("pairing", "list"), + ("pairing", "approve"), + ("pairing", "revoke"), + ("pairing", "clear-pending"), + ("webhook", "list"), + ("webhook", "subscribe"), + ("webhook", "remove"), + ("webhook", "test"), +} + + +def test_hosted_console_registry_exposes_only_hosted_safe_surface(): + engine = HermesConsoleEngine(context="hosted") + hosted = { + path for path, command in engine.commands.items() if "hosted" in command.contexts + } + + assert hosted == EXPECTED_HOSTED_CONSOLE_COMMANDS + + +@pytest.mark.parametrize( + "line", + [ + "portal login", + "auth add nous --type oauth", + "auth logout nous", + "profile create tester", + "profile use default", + "plugins list", + "plugins install owner/repo", + "kanban list", + "hooks list", + "checkpoints clear", + "curator pause", + "pets install cat", + "backup --quick", + "import 
/tmp/hermes-console-test.zip", + "mcp serve", + "model", + "setup", + "dashboard", + "gateway restart", + "update", + "uninstall", + ], +) +def test_hosted_console_rejects_local_only_or_dangerous_commands(line): + result = HermesConsoleEngine(context="hosted").execute(line) + + assert result.status == "error" + assert result.output + + +@pytest.mark.parametrize( + "line", + [ + "mcp add demo --url https://example.com/sse", + "mcp install n8n", + "mcp configure github", + "mcp picker", + "config set display.interface cli", + "cron create 'every 1h' 'say hello'", + ], +) +def test_hosted_console_allows_guarded_useful_commands_before_confirmation(line): + result = HermesConsoleEngine(context="hosted").execute(line) + + assert result.status == "confirm_required" + + +@pytest.mark.parametrize( + "line", + [ + "mcp add local --command npx --args foo", + "mcp add local --preset unsafe", + "mcp add local --url file:///tmp/server", + "config set model.provider openrouter", + "config set portal.url https://evil.example", + "cron create 'every 1h' 'say hello' --script scripts/ping.py", + "cron create 'every 1h' 'say hello' --no-agent", + "cron edit abc123 --workdir /tmp/project", + ], +) +def test_hosted_console_blocks_known_footgun_arguments_before_confirmation(line): + result = HermesConsoleEngine(context="hosted").execute(line) + + assert result.status == "error" + assert result.output + + +@pytest.mark.parametrize( + "line", + [ + "sessions delete abc123", + "sessions prune --older-than 1", + "chat", + "--cli", + "--tui", + "oneshot hello", + "model", + "setup", + "postinstall", + "fallback add", + "moa configure", + "claw migrate", + "gateway restart", + "gateway start", + "gateway stop", + "dashboard", + "serve", + "proxy start", + "mcp serve", + "skills config", + "skills publish ./skill", + "completion bash", + "acp", + "update", + "uninstall", + "gui", + "desktop", + "login", + "logout", + "--tui", + "logs | cat", + "config show > out.txt", + ], +) +def 
test_console_rejects_destructive_and_shell_like_commands(line): + result = HermesConsoleEngine().execute(line) + + assert result.status == "error" + assert result.output + + +@pytest.mark.parametrize("line", MUTATING_CONFIRMATION_SMOKE_COMMANDS) +def test_mutating_console_commands_require_confirmation(line): + result = HermesConsoleEngine().execute(line) + + assert result.status == "confirm_required" + assert result.confirmation_message + + +def test_help_lists_supported_commands_and_not_full_cli(): + result = HermesConsoleEngine().execute("help") + + assert result.status == "ok" + assert "sessions list" in result.output + assert "config set" in result.output + assert "dashboard" not in result.output + assert "gateway restart" not in result.output + + +def test_config_set_requires_confirmation_then_writes(_isolate_hermes_home): + engine = HermesConsoleEngine() + + pending = engine.execute("config set console.test true") + assert pending.status == "confirm_required" + + from hermes_cli.config import read_raw_config + + assert read_raw_config() == {} + + result = engine.execute("config set console.test true", confirmed=True) + + assert result.status == "ok" + assert "console.test" in result.output + assert read_raw_config()["console"]["test"] is True + + +def test_sessions_list_and_stats_use_isolated_session_store(_isolate_hermes_home): + from hermes_state import SessionDB + + db = SessionDB() + try: + db.create_session("chat-session", source="cli", model="test/model") + db.create_session("tool-session", source="tool", model="test/model") + finally: + db.close() + + engine = HermesConsoleEngine() + listed = engine.execute("sessions list --limit 10") + stats = engine.execute("sessions stats") + + assert listed.status == "ok" + assert "chat-session" in listed.output + assert "tool-session" not in listed.output + assert "Total sessions: 2" in stats.output + assert "Listable sessions: 1" in stats.output + + +def 
test_cron_pause_resume_and_run_require_confirmation(_isolate_hermes_home): + from cron.jobs import create_job, get_job + + job = create_job(prompt="say hello", schedule="every 1h", name="alpha") + engine = HermesConsoleEngine() + + pending = engine.execute(f"cron pause {job['id']}") + assert pending.status == "confirm_required" + stored = get_job(job["id"]) + assert stored is not None + assert stored["state"] == "scheduled" + + paused = engine.execute(f"cron pause {job['id']}", confirmed=True) + assert paused.status == "ok" + stored = get_job(job["id"]) + assert stored is not None + assert stored["state"] == "paused" + + resumed = engine.execute("cron resume alpha", confirmed=True) + assert resumed.status == "ok" + stored = get_job(job["id"]) + assert stored is not None + assert stored["state"] == "scheduled" + + triggered = engine.execute("cron run alpha", confirmed=True) + assert triggered.status == "ok" + assert "Triggered job" in triggered.output + + +def test_repl_runs_non_interactive_lines_without_prompts(_isolate_hermes_home): + stdin = io.StringIO("help\nexit\n") + stdout = io.StringIO() + stderr = io.StringIO() + + code = run_console_repl( + stdin=stdin, + stdout=stdout, + stderr=stderr, + interactive=False, + ) + + assert code == 0 + assert "Hermes Console" in stdout.getvalue() + assert "hermes>" not in stdout.getvalue() + assert stderr.getvalue() == "" + + +def test_repl_refuses_non_interactive_confirmation(_isolate_hermes_home): + stdin = io.StringIO("config set console.test true\n") + stdout = io.StringIO() + stderr = io.StringIO() + + code = run_console_repl( + stdin=stdin, + stdout=stdout, + stderr=stderr, + interactive=False, + ) + + assert code == 1 + assert "Confirmation required" in stderr.getvalue() + + +def test_main_console_subcommand_smoke(_isolate_hermes_home): + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "hermes_cli.main", "console"], + cwd=Path(__file__).resolve().parents[2], + input="help\nexit\n", + 
text=True, + capture_output=True, + timeout=20, + check=False, + ) + + assert result.returncode == 0 + assert "Hermes Console" in result.stdout diff --git a/tests/hermes_cli/test_dashboard_auth_ws_auth.py b/tests/hermes_cli/test_dashboard_auth_ws_auth.py index c10d8839f..2d28bcf1d 100644 --- a/tests/hermes_cli/test_dashboard_auth_ws_auth.py +++ b/tests/hermes_cli/test_dashboard_auth_ws_auth.py @@ -1,9 +1,9 @@ """Tests for the WS-upgrade auth helper (Phase 5 task 5.2). -The dashboard's four WS endpoints (``/api/pty``, ``/api/ws``, ``/api/pub``, -``/api/events``) share an auth gate: ``_ws_auth_ok``. In loopback mode it -accepts ``?token=<_SESSION_TOKEN>``; in gated mode it accepts a single-use -``?ticket=`` minted by ``POST /api/auth/ws-ticket``. +The dashboard's WS endpoints (``/api/pty``, ``/api/console``, ``/api/ws``, +``/api/pub``, ``/api/events``) share an auth gate: ``_ws_auth_ok``. In +loopback mode it accepts ``?token=<_SESSION_TOKEN>``; in gated mode it accepts +a single-use ``?ticket=`` minted by ``POST /api/auth/ws-ticket``. These tests exercise the helper at the unit level (no actual WS upgrade) plus the ticket-mint endpoint under realistic gated-mode setup. We don't @@ -315,9 +315,10 @@ class TestWsRequestIsAllowedGated: (intended only for unauthenticated loopback dev) must not also reject those upgrades: the OAuth gate + single-use ticket is the auth. - Regression coverage: every WS endpoint (``/api/pty``, ``/api/ws``, - ``/api/pub``, ``/api/events``) calls ``_ws_request_is_allowed`` after - ``_ws_auth_ok``. If the peer-IP check rejects gated mode, the chat + Regression coverage: every WS endpoint (``/api/pty``, ``/api/console``, + ``/api/ws``, ``/api/pub``, ``/api/events``) calls + ``_ws_request_is_allowed`` after ``_ws_auth_ok``. If the peer-IP check + rejects gated mode, the chat tab + sidebar tool feed silently fail to connect even after a successful OAuth login. 
""" diff --git a/tests/hermes_cli/test_model_provider_persistence.py b/tests/hermes_cli/test_model_provider_persistence.py index 76d5ee741..dd007d442 100644 --- a/tests/hermes_cli/test_model_provider_persistence.py +++ b/tests/hermes_cli/test_model_provider_persistence.py @@ -207,6 +207,51 @@ class TestProviderPersistsAfterModelSave: assert model.get("base_url") == "https://packy.example.com/v1" assert model.get("api_mode") == "codex_responses" + def test_named_custom_provider_with_builtin_slug_persists_custom_prefix( + self, config_home, monkeypatch + ): + """providers.<builtin-slug> must persist as a named custom provider.""" + import yaml + + from hermes_cli.main import _model_flow_named_custom + + config_path = config_home / "config.yaml" + config_path.write_text( + "providers:\n" + " minimax-cn:\n" + " name: MiniMax CN Proxy\n" + " api: https://mimimax.cn/v1\n" + " key_env: MINIMAX_CN_PROXY_KEY\n" + " transport: chat_completions\n" + " model: MiniMax-M3\n" + " default_model: MiniMax-M3\n" + ) + monkeypatch.setenv("MINIMAX_CN_PROXY_KEY", "proxy-secret") + + provider_info = { + "name": "MiniMax CN Proxy", + "base_url": "https://mimimax.cn/v1", + "api_key": "", + "key_env": "MINIMAX_CN_PROXY_KEY", + "model": "MiniMax-M3", + "api_mode": "chat_completions", + "provider_key": "minimax-cn", + } + + with patch("hermes_cli.auth._save_model_choice"), \ + patch("hermes_cli.auth.deactivate_provider"), \ + patch("hermes_cli.models.fetch_api_models", return_value=["MiniMax-M3"]), \ + patch("hermes_cli.curses_ui.curses_radiolist", side_effect=OSError("no tty in test")), \ + patch("builtins.input", return_value="1"): + _model_flow_named_custom({}, provider_info) + + config = yaml.safe_load(config_path.read_text()) or {} + model = config.get("model") + assert isinstance(model, dict) + assert model.get("provider") == "custom:minimax-cn" + assert "base_url" not in model + assert "api_key" not in model + def test_copilot_acp_provider_saved_when_selected(self, config_home): 
"""_model_flow_copilot_acp should persist provider/base_url/model together.""" from hermes_cli.main import _model_flow_copilot_acp @@ -555,4 +600,3 @@ class TestZaiEndpointPicker: _select_zai_endpoint(custom_url) assert captured["default"] == expected_default - diff --git a/tests/hermes_cli/test_runtime_provider_resolution.py b/tests/hermes_cli/test_runtime_provider_resolution.py index 54f58b858..df47efcf8 100644 --- a/tests/hermes_cli/test_runtime_provider_resolution.py +++ b/tests/hermes_cli/test_runtime_provider_resolution.py @@ -1272,6 +1272,33 @@ def test_resolve_requested_provider_precedence(monkeypatch): assert rp.resolve_requested_provider() == "auto" +def test_resolve_runtime_provider_named_custom_with_builtin_slug(monkeypatch): + monkeypatch.setenv("MINIMAX_CN_PROXY_KEY", "proxy-secret") + monkeypatch.setattr( + rp, + "load_config", + lambda: { + "model": {"provider": "custom:minimax-cn"}, + "providers": { + "minimax-cn": { + "name": "MiniMax CN Proxy", + "api": "https://mimimax.cn/v1", + "key_env": "MINIMAX_CN_PROXY_KEY", + "transport": "chat_completions", + "default_model": "MiniMax-M3", + } + }, + }, + ) + + resolved = rp.resolve_runtime_provider() + + assert resolved["provider"] == "custom" + assert resolved["base_url"] == "https://mimimax.cn/v1" + assert resolved["api_key"] == "proxy-secret" + assert resolved["api_mode"] == "chat_completions" + + # ── api_mode config override tests ────────────────────────────────────── diff --git a/tests/hermes_cli/test_update_venv_health.py b/tests/hermes_cli/test_update_venv_health.py new file mode 100644 index 000000000..16a9a1428 --- /dev/null +++ b/tests/hermes_cli/test_update_venv_health.py @@ -0,0 +1,346 @@ +"""Tests for the Windows half-updated-venv hardening (July 2026 incident). + +Covers three additions to ``hermes update``: + +1. ``_venv_core_imports_healthy`` — the venv health probe that lets an + "Already up to date" checkout still repair a broken dependency install. +2. 
``_detect_venv_python_processes`` — the venv-interpreter process guard + that refuses to mutate the venv while a desktop backend / stray python + holds .pyd files mapped. +3. The commit_count == 0 repair branch wiring in ``_cmd_update_impl``. + +All Windows-specific paths are exercised via ``_is_windows`` patching so +they run on any host (same approach as test_update_concurrent_quarantine). +""" + +from __future__ import annotations + +import subprocess +import sys +import types +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from hermes_cli import main as cli_main + + +# --------------------------------------------------------------------------- +# _venv_core_imports_healthy +# --------------------------------------------------------------------------- + + +def test_venv_health_reports_healthy_when_no_venv(tmp_path): + """No venv python in a DEV checkout → nothing to probe → healthy.""" + with patch.object(cli_main, "PROJECT_ROOT", tmp_path): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is True + assert detail == "" + + +def test_venv_health_missing_venv_unhealthy_on_managed_install(tmp_path): + """On a managed install (bootstrap marker) the venv IS the install — + its absence must be reported unhealthy so the repair lane runs instead + of 'Already up to date!'.""" + (tmp_path / ".hermes-bootstrap-complete").write_text("done") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is False + assert "venv python missing" in detail + + +def test_venv_health_missing_venv_unhealthy_with_interrupted_marker(tmp_path): + """An interrupted-update breadcrumb also flips missing-venv to unhealthy.""" + (tmp_path / ".update-incomplete").write_text("started=1\npid=1\n") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is False + assert 
"venv python missing" in detail + + +def _fake_venv_python(tmp_path, *, windows: bool = False): + bin_dir = tmp_path / "venv" / ("Scripts" if windows else "bin") + bin_dir.mkdir(parents=True) + py = bin_dir / ("python.exe" if windows else "python") + py.write_bytes(b"") + return py + + +def test_venv_health_reports_missing_imports(tmp_path): + """Probe output lines are surfaced as the unhealthy detail.""" + _fake_venv_python(tmp_path) + + fake = SimpleNamespace( + returncode=0, + stdout="fastapi: No module named 'annotated_doc'\n", + stderr="", + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + + assert healthy is False + assert "annotated_doc" in detail + + +def test_venv_health_healthy_when_probe_clean(tmp_path): + _fake_venv_python(tmp_path) + fake = SimpleNamespace(returncode=0, stdout="", stderr="") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is True + + +def test_venv_health_broken_interpreter_is_unhealthy(tmp_path): + """Nonzero exit with no module list = interpreter itself is broken.""" + _fake_venv_python(tmp_path) + fake = SimpleNamespace(returncode=1, stdout="", stderr="Fatal Python error: init failed\n") + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, "run", return_value=fake + ): + healthy, detail = cli_main._venv_core_imports_healthy() + assert healthy is False + assert "Fatal Python error" in detail + + +def test_venv_health_probe_failure_reports_healthy(tmp_path): + """A probe that can't run must NOT force needless reinstalls.""" + _fake_venv_python(tmp_path) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.object( + cli_main.subprocess, + "run", + side_effect=subprocess.TimeoutExpired(cmd="python", 
timeout=60), + ): + healthy, _detail = cli_main._venv_core_imports_healthy() + assert healthy is True + + +# --------------------------------------------------------------------------- +# _detect_venv_python_processes +# --------------------------------------------------------------------------- + + +def _proc(pid: int, exe: str, name: str, cmdline: list[str] | None = None, cwd: str = ""): + proc = MagicMock() + proc.info = { + "pid": pid, + "exe": exe, + "name": name, + "cmdline": cmdline or [], + "cwd": cwd, + } + return proc + + +def test_detect_venv_python_off_windows_is_empty(): + with patch.object(cli_main, "_is_windows", return_value=False): + assert cli_main._detect_venv_python_processes() == [] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_finds_backend(_winp, tmp_path): + venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe") + other_py = "C:\\Python311\\python.exe" + + me = MagicMock() + me.parents.return_value = [] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + _proc(101, venv_py, "python.exe", ["python.exe", "-m", "hermes_cli.main", "serve"]), + _proc(102, other_py, "python.exe", ["python.exe", "somescript.py"]), + ] + ), + Process=lambda *a, **k: me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + matches = cli_main._detect_venv_python_processes() + + assert [m[0] for m in matches] == [101] + assert "serve" in matches[0][2] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_excludes_self_and_ancestors(_winp, tmp_path): + import os as _os + + venv_py = str(tmp_path / "venv" / "Scripts" / "python.exe") + parent = MagicMock() + parent.pid = 555 + me = MagicMock() + me.parents.return_value = [parent] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + _proc(_os.getpid(), venv_py, "python.exe"), + _proc(555, venv_py, "hermes.exe"), + ] 
+ ), + Process=lambda *a, **k: me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + assert cli_main._detect_venv_python_processes() == [] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_no_psutil_is_empty(_winp, tmp_path): + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": None} + ): + assert cli_main._detect_venv_python_processes() == [] + + +def test_format_venv_holders_message_flags_desktop_backend(tmp_path): + matches = [ + (101, "python.exe", "python.exe -m hermes_cli.main serve --host 127.0.0.1"), + (102, "pythonw.exe", "pythonw.exe -m hermes_cli.main gateway run"), + ] + msg = cli_main._format_venv_python_holders_message(matches) + assert "101" in msg + assert "desktop app" in msg.lower() + assert "gateway" in msg + assert "hermes update" in msg + assert "--force-venv" in msg + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_python_catches_outside_venv_trampoline(_winp, tmp_path): + """uv/base-interpreter trampoline: exe OUTSIDE the venv, but the cmdline + clearly runs Hermes from this install → must still be flagged as a holder + (it imports from the venv and holds its .pyd files).""" + base_py = "C:\\Python311\\python.exe" + venv_path = str(tmp_path / "venv" / "Scripts" / "python.exe") + + me = MagicMock() + me.parents.return_value = [] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + # cmdline references the venv path directly + _proc(201, base_py, "python.exe", [base_py, venv_path, "-m", "x"]), + # `-m hermes_cli.main serve` with the install root as cwd + _proc( + 202, + base_py, + "python.exe", + [base_py, "-m", "hermes_cli.main", "serve"], + cwd=str(tmp_path), + ), + # unrelated base-interpreter python → NOT a holder + _proc(203, base_py, "python.exe", [base_py, "somescript.py"], cwd="C:\\other"), + ] + ), + Process=lambda *a, **k: 
me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + matches = cli_main._detect_venv_python_processes() + + assert sorted(m[0] for m in matches) == [201, 202] + + +@patch.object(cli_main, "_is_windows", return_value=True) +def test_detect_venv_hermes_cli_cmdline_outside_install_not_matched(_winp, tmp_path): + """A hermes_cli.main process belonging to a DIFFERENT install (neither + install root in cmdline nor cwd under it) must not be flagged.""" + base_py = "C:\\Python311\\python.exe" + me = MagicMock() + me.parents.return_value = [] + fake_psutil = types.SimpleNamespace( + process_iter=lambda attrs: iter( + [ + _proc( + 301, + base_py, + "python.exe", + [base_py, "-m", "hermes_cli.main", "serve"], + cwd="C:\\other-install", + ), + ] + ), + Process=lambda *a, **k: me, + ) + with patch.object(cli_main, "PROJECT_ROOT", tmp_path), patch.dict( + sys.modules, {"psutil": fake_psutil} + ): + assert cli_main._detect_venv_python_processes() == [] + + +# --------------------------------------------------------------------------- +# --force vs --force-venv gating of the venv-holder guard +# --------------------------------------------------------------------------- + + +def _update_args(**overrides): + defaults = dict( + gateway=False, + check=False, + no_backup=True, + backup=False, + yes=True, + branch=None, + force=False, + force_venv=False, + ) + defaults.update(overrides) + return SimpleNamespace(**defaults) + + +def _run_update_until_guard(args): + """Drive _cmd_update_impl just far enough to hit the venv-holder guard. + + Everything before the guard is stubbed; the guard firing is observed via + SystemExit(2). 
The first statement AFTER the guard is + ``git_dir = PROJECT_ROOT / ".git"`` — a PROJECT_ROOT sentinel whose + ``__truediv__`` raises marks 'guard passed'.""" + + class _PastGuard(Exception): + pass + + class _RootSentinel: + def __truediv__(self, _other): + raise _PastGuard + + with patch.object(cli_main, "_is_windows", return_value=True), patch.object( + cli_main, "_venv_scripts_dir", return_value=None + ), patch.object(cli_main, "_run_pre_update_backup"), patch.object( + cli_main, "_pause_windows_gateways_for_update", return_value=None + ), patch.object( + cli_main, "_resume_windows_gateways_after_update" + ), patch.object( + cli_main, + "_detect_venv_python_processes", + return_value=[(101, "python.exe", "python.exe -m hermes_cli.main serve")], + ), patch.object( + cli_main, "PROJECT_ROOT", _RootSentinel() + ): + try: + cli_main._cmd_update_impl(args, gateway_mode=False) + except _PastGuard: + return "past_guard" + except SystemExit as exc: + return f"exit_{exc.code}" + return "returned" + + +@pytest.mark.parametrize( + "force,force_venv,expected", + [ + (False, False, "exit_2"), # guard fires + (True, False, "exit_2"), # plain --force does NOT bypass the venv guard + (False, True, "past_guard"), # --force-venv is the explicit escape hatch + (True, True, "past_guard"), + ], +) +def test_venv_holder_guard_force_semantics(force, force_venv, expected, capsys): + result = _run_update_until_guard(_update_args(force=force, force_venv=force_venv)) + assert result == expected, capsys.readouterr().out diff --git a/tests/hermes_cli/test_web_server_console_ws.py b/tests/hermes_cli/test_web_server_console_ws.py new file mode 100644 index 000000000..538251ec7 --- /dev/null +++ b/tests/hermes_cli/test_web_server_console_ws.py @@ -0,0 +1,134 @@ +"""Dashboard Hermes Console websocket tests.""" + +from __future__ import annotations + +import time +from urllib.parse import urlencode + +import pytest +from starlette.testclient import TestClient +from starlette.websockets import 
WebSocketDisconnect + +from hermes_cli import web_server + + +@pytest.fixture +def console_client(monkeypatch, _isolate_hermes_home): + previous_auth_required = getattr(web_server.app.state, "auth_required", None) + previous_bound_host = getattr(web_server.app.state, "bound_host", None) + web_server.app.state.auth_required = False + web_server.app.state.bound_host = None + monkeypatch.setattr(web_server, "_DASHBOARD_EMBEDDED_CHAT_ENABLED", True) + + client = TestClient(web_server.app) + try: + yield client + finally: + close = getattr(client, "close", None) + if close is not None: + close() + if previous_auth_required is None: + if hasattr(web_server.app.state, "auth_required"): + delattr(web_server.app.state, "auth_required") + else: + web_server.app.state.auth_required = previous_auth_required + if previous_bound_host is None: + if hasattr(web_server.app.state, "bound_host"): + delattr(web_server.app.state, "bound_host") + else: + web_server.app.state.bound_host = previous_bound_host + + +def _url(token: str | None = None, **params: str) -> str: + query = {"token": web_server._SESSION_TOKEN, **params} + if token is not None: + query["token"] = token + return f"/api/console?{urlencode(query)}" + + +def _recv_until(conn, frame_type: str, *, status: str | None = None) -> dict: + deadline = time.monotonic() + 5.0 + while time.monotonic() < deadline: + frame = conn.receive_json() + if frame.get("type") != frame_type: + continue + if status is not None and frame.get("status") != status: + continue + return frame + raise AssertionError(f"Timed out waiting for {frame_type} frame") + + +def test_console_ws_rejects_missing_or_bad_token(console_client): + with pytest.raises(WebSocketDisconnect) as exc: + with console_client.websocket_connect("/api/console"): + pass + assert exc.value.code == 4401 + + with pytest.raises(WebSocketDisconnect) as exc: + with console_client.websocket_connect(_url(token="wrong")): + pass + assert exc.value.code == 4401 + + +def 
test_console_ws_runs_read_only_command(console_client): + with console_client.websocket_connect(_url()) as conn: + ready = conn.receive_json() + assert ready["type"] == "ready" + assert ready["context"] == "local" + assert ready["prompt"] == "hermes> " + + conn.send_json({"type": "input", "line": "help"}) + + output = _recv_until(conn, "output") + assert "Hermes Console" in output["data"] + complete = _recv_until(conn, "complete", status="ok") + assert complete["prompt"] == "hermes> " + + +def test_console_ws_confirmed_command_executes_after_confirmation(console_client): + from hermes_cli.config import load_config + + with console_client.websocket_connect(_url()) as conn: + assert conn.receive_json()["type"] == "ready" + conn.send_json({"type": "input", "line": "config set display.interface cli"}) + + confirmation = _recv_until(conn, "confirm_required") + assert confirmation["command"] == "config set display.interface cli" + assert confirmation["message"] + + conn.send_json({"type": "confirm", "command": confirmation["command"]}) + _recv_until(conn, "complete", status="ok") + + assert load_config()["display"]["interface"] == "cli" + + +def test_console_ws_uses_hosted_context_for_opt_data_policy(console_client, monkeypatch): + monkeypatch.setattr(web_server, "_default_hermes_root_is_opt_data", lambda: True) + + with console_client.websocket_connect(_url()) as conn: + ready = conn.receive_json() + assert ready["type"] == "ready" + assert ready["context"] == "hosted" + + conn.send_json({"type": "input", "line": "profile create nope"}) + + error = _recv_until(conn, "error") + assert "hosted Hermes Console" in error["message"] + + +def test_console_ws_cancel_returns_to_prompt(console_client, monkeypatch): + from hermes_cli.console_engine import ConsoleResult, HermesConsoleEngine + + def slow_execute(self, line: str, *, confirmed: bool = False): + time.sleep(0.5) + return ConsoleResult("ok", output="late", command=line) + + monkeypatch.setattr(HermesConsoleEngine, 
"execute", slow_execute) + + with console_client.websocket_connect(_url()) as conn: + assert conn.receive_json()["type"] == "ready" + conn.send_json({"type": "input", "line": "status"}) + conn.send_json({"type": "cancel"}) + + complete = _recv_until(conn, "complete", status="cancelled") + assert complete["prompt"] == "hermes> " diff --git a/tests/hermes_cli/test_web_server_files.py b/tests/hermes_cli/test_web_server_files.py index b295f0ab9..63a8b39ff 100644 --- a/tests/hermes_cli/test_web_server_files.py +++ b/tests/hermes_cli/test_web_server_files.py @@ -488,3 +488,75 @@ def test_stream_upload_cleans_temp_on_cancellation(forced_files_client): # ... and no .upload temp file was left behind. leftovers = [p.name for p in target.parent.iterdir() if ".upload" in p.name] assert leftovers == [], f"temp upload files leaked on cancellation: {leftovers}" + + +def test_sensitive_env_files_hidden_from_listing(forced_files_client): + """Regression test for #57505: .env files must not appear in directory listings.""" + client, root = forced_files_client + + # Create a regular file and .env variants including shorthand suffixes. 
+ root.mkdir(parents=True, exist_ok=True) + regular = root / "config.txt" + regular.write_text("safe content") + env_file = root / ".env" + env_file.write_text("SECRET_KEY=abc123") + env_local = root / ".env.local" + env_local.write_text("LOCAL_SECRET=def456") + env_prod = root / ".env.prod" + env_prod.write_text("PROD_SECRET=ghi789") + + listing = client.get("/api/files", params={"path": str(root)}) + assert listing.status_code == 200 + names = [e["name"] for e in listing.json()["entries"]] + assert "config.txt" in names + assert ".env" not in names + assert ".env.local" not in names + assert ".env.prod" not in names + + +def test_sensitive_env_files_blocked_read(forced_files_client): + """Regression test for #57505: .env files must not be readable.""" + client, root = forced_files_client + + root.mkdir(parents=True, exist_ok=True) + env_file = root / ".env" + env_file.write_text("SECRET_KEY=abc123") + + resp = client.get("/api/files/read", params={"path": str(env_file)}) + assert resp.status_code == 403 + + +def test_sensitive_env_files_blocked_download(forced_files_client): + """Regression test for #57505: .env files must not be downloadable.""" + client, root = forced_files_client + + root.mkdir(parents=True, exist_ok=True) + env_file = root / ".env" + env_file.write_text("SECRET_KEY=abc123") + + resp = client.get("/api/files/download", params={"path": str(env_file)}) + assert resp.status_code == 403 + + +def test_sensitive_env_suffix_variants_blocked(forced_files_client): + """Regression: .env.<suffix> shorthand variants (e.g. 
.env.prod) must also be blocked.""" + client, root = forced_files_client + + root.mkdir(parents=True, exist_ok=True) + for suffix in ("prod", "dev", "staging.local", "ci"): + p = root / f".env.{suffix}" + p.write_text(f"SECRET_{suffix}=abc123") + assert client.get("/api/files/read", params={"path": str(p)}).status_code == 403 + assert client.get("/api/files/download", params={"path": str(p)}).status_code == 403 + + +def test_sensitive_env_case_insensitive_blocked(forced_files_client): + """Regression: .ENV / .Env.local casings must be blocked too (case-insensitive FS mounts).""" + client, root = forced_files_client + + root.mkdir(parents=True, exist_ok=True) + for name in (".ENV", ".Env.local", ".eNv.PROD"): + p = root / name + p.write_text("SECRET=abc123") + assert client.get("/api/files/read", params={"path": str(p)}).status_code == 403 + assert client.get("/api/files/download", params={"path": str(p)}).status_code == 403 diff --git a/tests/plugins/image_gen/test_openai_provider.py b/tests/plugins/image_gen/test_openai_provider.py index 8a6a49850..2ac61c54e 100644 --- a/tests/plugins/image_gen/test_openai_provider.py +++ b/tests/plugins/image_gen/test_openai_provider.py @@ -124,6 +124,68 @@ class TestModelResolution: # ── Generate ──────────────────────────────────────────────────────────────── +class TestSourceImageLoading: + def test_load_image_bytes_blocks_credential_store(self, tmp_path, monkeypatch): + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + with pytest.raises(ValueError, match="credential store"): + openai_plugin._load_image_bytes(str(auth_json)) + + def test_load_image_bytes_never_opens_blocked_credential(self, tmp_path, monkeypatch): + """The guard must fire BEFORE the file is opened — a credential store + must never be read into memory (#57698). 
Spy builtins.open and assert + it is never called for the blocked path.""" + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + import builtins + + real_open = builtins.open + opened: list = [] + + def _spy_open(file, *a, **k): + opened.append(str(file)) + return real_open(file, *a, **k) + + monkeypatch.setattr(builtins, "open", _spy_open) + with pytest.raises(ValueError, match="credential store"): + openai_plugin._load_image_bytes(str(auth_json)) + assert str(auth_json) not in opened, "blocked credential must never be opened" + + def test_load_image_bytes_allows_legit_local_image(self, tmp_path, monkeypatch): + """Negative control: a legitimate local image path is NOT blocked and + loads normally — proves the guard doesn't over-fire on everything.""" + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + img = tmp_path / "pic.png" + img.write_bytes(b"\x89PNG\r\n\x1a\nfake-image-bytes") + + data, name = openai_plugin._load_image_bytes(str(img)) + assert data == b"\x89PNG\r\n\x1a\nfake-image-bytes" + assert name == "pic.png" + + def test_load_image_bytes_passthrough_data_uri_not_blocked(self, tmp_path, monkeypatch): + """Negative control: data: URIs are decoded, never routed through the + local-path guard (the guard only applies to local file reads).""" + import base64 + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + b64 = base64.b64encode(b"xyz").decode("ascii") + data, name = openai_plugin._load_image_bytes(f"data:image/png;base64,{b64}") + assert data == b"xyz" + assert name.endswith(".png") + + class TestGenerate: def test_empty_prompt_rejected(self, provider): result = provider.generate("", aspect_ratio="square") diff --git 
a/tests/plugins/image_gen/test_openrouter_compat_provider.py b/tests/plugins/image_gen/test_openrouter_compat_provider.py index cef2f4394..95052a9f8 100644 --- a/tests/plugins/image_gen/test_openrouter_compat_provider.py +++ b/tests/plugins/image_gen/test_openrouter_compat_provider.py @@ -169,6 +169,43 @@ class TestHelpers: assert _to_image_url_part("/no/such/file.png") is None + def test_to_image_url_part_blocks_credential_store(self, tmp_path, monkeypatch): + from plugins.image_gen.openrouter import _to_image_url_part + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + with pytest.raises(ValueError, match="credential store"): + _to_image_url_part(str(auth_json)) + + def test_to_image_url_part_never_reads_blocked_credential(self, tmp_path, monkeypatch): + """The guard must fire BEFORE path.read_bytes() — the credential store + must never be inlined into a provider request (#57698).""" + from pathlib import Path as _P + + from plugins.image_gen.openrouter import _to_image_url_part + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + real_read_bytes = _P.read_bytes + read: list = [] + + def _spy_read_bytes(self, *a, **k): + read.append(str(self)) + return real_read_bytes(self, *a, **k) + + monkeypatch.setattr(_P, "read_bytes", _spy_read_bytes) + with pytest.raises(ValueError, match="credential store"): + _to_image_url_part(str(auth_json)) + assert str(auth_json) not in read, "blocked credential must never be read" + def test_extract_images(self): from plugins.image_gen.openrouter import _extract_images diff --git a/tests/plugins/image_gen/test_xai_provider.py b/tests/plugins/image_gen/test_xai_provider.py index 
cf9708dae..a233240cf 100644 --- a/tests/plugins/image_gen/test_xai_provider.py +++ b/tests/plugins/image_gen/test_xai_provider.py @@ -492,3 +492,50 @@ def test_xai_image_field_expands_user_home(tmp_path, monkeypatch): field = _xai_image_field("~/pic.png") assert field["type"] == "image_url" assert field["url"].startswith("data:image/png;base64,") + + +class TestXAIImageFieldReadGuard: + """#57698: local image inputs must not read Hermes credential stores.""" + + def test_xai_image_field_blocks_credential_store(self, tmp_path, monkeypatch): + from plugins.image_gen.xai import _xai_image_field + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + with pytest.raises(ValueError, match="credential store"): + _xai_image_field(str(auth_json)) + + def test_xai_image_field_never_opens_blocked_credential(self, tmp_path, monkeypatch): + """Guard fires before open() — credential store never read into memory.""" + import builtins + + from plugins.image_gen.xai import _xai_image_field + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + real_open = builtins.open + opened: list = [] + + def _spy_open(file, *a, **k): + opened.append(str(file)) + return real_open(file, *a, **k) + + monkeypatch.setattr(builtins, "open", _spy_open) + with pytest.raises(ValueError, match="credential store"): + _xai_image_field(str(auth_json)) + assert str(auth_json) not in opened, "blocked credential must never be opened" + + def test_xai_image_field_passthrough_url_not_blocked(self, monkeypatch): + """Negative control: remote URLs and data: URIs pass through unguarded.""" + from plugins.image_gen.xai import _xai_image_field + + assert 
_xai_image_field("https://example.com/pic.png")["url"] == "https://example.com/pic.png" + assert _xai_image_field("data:image/png;base64,eHl6")["url"].startswith("data:image/png") diff --git a/tests/plugins/video_gen/test_xai_plugin.py b/tests/plugins/video_gen/test_xai_plugin.py index eb495b969..e1a2a5ec9 100644 --- a/tests/plugins/video_gen/test_xai_plugin.py +++ b/tests/plugins/video_gen/test_xai_plugin.py @@ -186,3 +186,41 @@ def test_video_input_from_public_url_rejects_bare_file_id(): ) ) assert result is None + + +def test_xai_video_image_input_blocks_credential_store_symlink(tmp_path, monkeypatch): + from plugins.video_gen.xai import _image_ref_to_xai_input + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + image_link = hermes_home / "leak.png" + try: + image_link.symlink_to(auth_json) + except OSError as exc: + pytest.skip(f"symlink unavailable on this platform: {exc}") + + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + with pytest.raises(ValueError, match="credential store"): + _image_ref_to_xai_input(str(image_link)) + + +def test_xai_video_file_input_blocks_credential_store_symlink(tmp_path, monkeypatch): + from plugins.video_gen.xai import _video_ref_to_xai_url + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + auth_json = hermes_home / "auth.json" + auth_json.write_text('{"api_key":"sk-secret"}', encoding="utf-8") + video_link = hermes_home / "leak.mp4" + try: + video_link.symlink_to(auth_json) + except OSError as exc: + pytest.skip(f"symlink unavailable on this platform: {exc}") + + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + with pytest.raises(ValueError, match="credential store"): + _video_ref_to_xai_url(str(video_link)) diff --git a/tests/skills/test_unbroker_skill.py b/tests/skills/test_unbroker_skill.py new file mode 100644 index 000000000..ced394d93 --- /dev/null +++ 
b/tests/skills/test_unbroker_skill.py @@ -0,0 +1,1447 @@ +"""Hermetic tests for the unbroker skill. + +Stdlib + pytest only; NO live network, NO browser, NO email. Each test runs against +an isolated temp PDD_DATA_DIR. Runnable with pytest or directly: + + python3 -m pytest tests/test_unbroker_skill.py -q + python3 tests/test_unbroker_skill.py # portable fallback runner +""" +from __future__ import annotations + +import contextlib +import os +import shutil +import sys +import tempfile +from pathlib import Path + +# Resolve the skill's scripts dir across layouts: standalone dev repo (tests/) and hermes-agent +# (tests/skills/ -> optional-skills/security/unbroker/scripts). +_HERE = Path(__file__).resolve() +_REL = ("optional-skills", "security", "unbroker", "scripts") +_CANDIDATES = [ + _HERE.parent.parent / "skill" / "scripts", # standalone dev repo + _HERE.parent.parent.joinpath(*_REL), # standalone layout + _HERE.parent.parent.parent.joinpath(*_REL), # hermes-agent (tests/skills/) +] +SCRIPTS = next((c for c in _CANDIDATES if (c / "pdd.py").exists()), _CANDIDATES[0]) +sys.path.insert(0, str(SCRIPTS)) + +import autopilot # noqa: E402 +import contextlib as _ctx # noqa: E402 +import io as _io # noqa: E402 +import json as _json # noqa: E402 +import smtplib as _smtplib # noqa: E402 +import time as _time # noqa: E402 + +import badbool # noqa: E402 +import brokers # noqa: E402 +import cdp # noqa: E402 +import config # noqa: E402 +import crypto # noqa: E402 +import dossier # noqa: E402 +import email_modes # noqa: E402 +import emailer # noqa: E402 +import pdd # noqa: E402 +import legal # noqa: E402 +import ledger # noqa: E402 +import paths # noqa: E402 +import registry # noqa: E402 +import report # noqa: E402 +import storage # noqa: E402 +import tiers # noqa: E402 +import vectors # noqa: E402 + +_AGE = bool(shutil.which("age") and shutil.which("age-keygen")) + + +@contextlib.contextmanager +def temp_env(): + """Isolate every test in a fresh PDD_DATA_DIR.""" + prev = 
os.environ.get("PDD_DATA_DIR") + with tempfile.TemporaryDirectory() as d: + os.environ["PDD_DATA_DIR"] = str(Path(d) / "pdd") + try: + yield Path(os.environ["PDD_DATA_DIR"]) + finally: + if prev is None: + os.environ.pop("PDD_DATA_DIR", None) + else: + os.environ["PDD_DATA_DIR"] = prev + + +def _consenting(full_name="Jane Q. Public"): + return { + "subject_id": "sub_test01", + "consent": {"authorized": True, "method": "self"}, + "identity": { + "full_name": full_name, + "emails": ["jane@example.com"], + "phones": ["+1-415-555-0137"], + "date_of_birth": "1987-04-12", + "current_address": {"city": "Oakland", "state": "CA", "postal": "94601"}, + }, + "preferences": {"email_mode": "draft_only"}, + } + + +# --- config ------------------------------------------------------------------- + +def test_config_defaults_are_easiest(): + with temp_env(): + cfg = config.load_config() + assert cfg["email_mode"] == "draft_only" + assert cfg["browser_backend"] == "auto" + assert cfg["tracker_backend"] == "local-json" + assert cfg["encryption"] == "none" + + +def test_config_roundtrip_and_validation(): + with temp_env(): + config.save_config({"email_mode": "programmatic"}) + assert config.load_config()["email_mode"] == "programmatic" + try: + config.save_config({"email_mode": "bogus"}) + except ValueError: + pass + else: + raise AssertionError("invalid email_mode should raise") + + +def test_browser_clears_captcha_logic(): + assert config.browser_clears_captcha({"browser_backend": "browserbase"}) is True + assert config.browser_clears_captcha({"browser_backend": "agent-browser"}) is False + assert config.browser_clears_captcha({"browser_backend": "auto"}, env={}) is False + assert config.browser_clears_captcha({"browser_backend": "auto"}, env={"BROWSERBASE_API_KEY": "x"}) is True + + +# --- storage ------------------------------------------------------------------ + +def test_storage_json_and_jsonl_roundtrip(): + with temp_env() as data: + p = data / "x.json" + storage.write_json(p, 
{"a": 1}) + assert storage.read_json(p) == {"a": 1} + assert storage.read_json(data / "missing.json", []) == [] + log = data / "audit.jsonl" + storage.append_jsonl(log, {"e": 1}) + storage.append_jsonl(log, {"e": 2}) + assert [r["e"] for r in storage.read_jsonl(log)] == [1, 2] + + +# --- at-rest encryption ------------------------------------------------------- + +def test_encryption_off_writes_plaintext(): + with temp_env(): + d = _consenting() + dossier.save(d) + p = paths.dossier_path(d["subject_id"]) + assert p.exists() and not Path(str(p) + ".age").exists() + + +def test_encryption_age_round_trip(): + if not _AGE: + return # age not installed -> effectively skipped (keeps hermetic CI green) + with temp_env(): + config.save_config({"encryption": "age"}) + crypto.ensure_identity() + assert crypto.is_engaged() + d = _consenting() + dossier.save(d) + plain = paths.dossier_path(d["subject_id"]) + enc = Path(str(plain) + ".age") + assert enc.exists() and not plain.exists() # only ciphertext on disk + assert not enc.read_bytes().lstrip().startswith(b"{") # not plaintext JSON + assert dossier.load(d["subject_id"])["identity"]["full_name"] == "Jane Q. 
Public" + + +def test_encryption_keeps_config_and_audit_plaintext(): + if not _AGE: + return + with temp_env(): + config.save_config({"encryption": "age"}) + crypto.ensure_identity() + # config.json must stay readable plaintext (crypto reads it to decide) + assert config.load_config()["encryption"] == "age" + assert not Path(str(paths.config_path()) + ".age").exists() + # audit log holds field NAMES only, kept plaintext by design + ledger.transition("sub_test01", "spokeo", "found", found=True) + assert paths.audit_path("sub_test01").exists() + + +# --- broker DB ---------------------------------------------------------------- + +def test_seed_broker_db_loads_and_is_well_formed(): + everyone = brokers.load_all() + assert len(everyone) >= 10 + ids = {b["id"] for b in everyone} + assert {"spokeo", "whitepages", "mylife"} <= ids + for b in everyone: + assert b.get("id") and b.get("name") and b.get("priority") in {"crucial", "high", "standard", "long_tail"} + assert (b.get("optout") or {}).get("method") + + +def test_clusters_expose_ownership(): + cl = brokers.clusters() + assert "freepeopledirectory" in cl.get("spokeo", []) + assert "peoplelooker" in cl.get("beenverified", []) + + +# --- tier selection ----------------------------------------------------------- + +def test_every_broker_resolves_to_valid_tier(): + for b in brokers.load_all(): + assert tiers.select_tier(b) in {"T0", "T1", "T2", "T3"} + + +def test_email_verification_tier_shifts_with_mode(): + spokeo = brokers.get("spokeo") + assert tiers.select_tier(spokeo, "draft_only") == "T2" + assert tiers.select_tier(spokeo, "programmatic") == "T1" + assert tiers.select_tier(spokeo, "alias") == "T1" + + +def test_captcha_tier_shifts_with_browser(): + tps = brokers.get("truepeoplesearch") + assert tiers.select_tier(tps, "programmatic", browser_clears_captcha=False) == "T2" + assert tiers.select_tier(tps, "programmatic", browser_clears_captcha=True) == "T1" + + +def test_hard_human_requirements_force_t3(): + assert 
tiers.select_tier(brokers.get("mylife")) == "T3" # gov_id + # thatsthem's opt-out is Cloudflare-Turnstile gated (captcha:true) -> T2 without a + # captcha-clearing browser backend, T1 with one. (Corrected 2026-06-30 after the + # live scan found the real form gated; the record previously mis-declared captcha:false.) + assert tiers.select_tier(brokers.get("thatsthem")) == "T2" + assert tiers.select_tier(brokers.get("thatsthem"), browser_clears_captcha=True) == "T1" + + +def test_plan_excludes_disallowed_fields(): + d = _consenting() + actions = tiers.plan(d, brokers.load_all(), config.DEFAULT_CONFIG) + for a in actions: + assert "ssn" not in a["disclosure_fields"] + assert "profile_url" not in a["disclosure_fields"] + + +def test_disclosure_maps_street_when_broker_requires_it(): + # thatsthem's opt-out form requires a street line; select_disclosure must surface it from + # current_address.line1 (regression: 'street' was in broker inputs but unmapped, silently dropped). + d = _consenting() + d["identity"]["current_address"]["line1"] = "123 Main St" + out = dossier.select_disclosure(d, ["full_name", "street", "city", "state", "postal"]) + assert out["street"] == "123 Main St" + # and when there is no street on file, it is simply omitted (never a blank/placeholder) + d2 = _consenting() + out2 = dossier.select_disclosure(d2, ["full_name", "street", "city"]) + assert "street" not in out2 + + +def _mini_broker(bid, owns=None, requires=None, notes="", quirks=None): + return {"id": bid, "name": bid.title(), "priority": "high", + "search": {"by": ["name"]}, + "optout": {"method": "web_form", "url": f"https://{bid}.example/optout", + "requires": requires or {}, "inputs": ["full_name"], "owns": owns or [], + "notes": notes, "quirks": quirks or []}, + "owns": owns or []} + + +def test_batch_plan_groups_by_ledger_state(): + d = _consenting() + bl = [_mini_broker("aaa"), _mini_broker("bbb"), _mini_broker("ccc"), _mini_broker("ddd")] + ledger = { + "aaa": {"state": "found"}, + 
"bbb": {"state": "not_found"}, + "ccc": {"state": "blocked"}, + # ddd absent -> unscanned/new + } + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, ledger) + assert bp["phase"] == "discover" # ddd is unscanned + assert bp["counts"]["found"] == 1 + assert bp["counts"]["not_found"] == 1 + assert bp["counts"]["blocked"] == 1 + assert bp["counts"]["unscanned"] == 1 + assert any("PHASE 1" in t for t in bp["next_actions"]) + + +def test_batch_plan_collapses_ownership_clusters(): + # a parent that is being acted on (found/submitted/...) covers its children -> child dropped + d = _consenting() + bl = [_mini_broker("parent", owns=["kid"]), _mini_broker("kid")] + ledger = {"parent": {"state": "found"}, "kid": {"state": "found"}} + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, ledger) + assert bp["cluster_savings"] == {"parent": ["kid"]} + # the child must NOT also appear as its own actionable 'found' row + found_ids = [r["broker_id"] for r in bp["groups"]["found"]] + assert "parent" in found_ids and "kid" not in found_ids + + +def test_batch_plan_orders_found_parents_first(): + # found group must be sorted parents-first, most-children-first, standalone last. 
+ d = _consenting() + bl = [_mini_broker("standalone"), + _mini_broker("smallparent", owns=["c1"]), + _mini_broker("bigparent", owns=["c1b", "c2b", "c3b"])] + ledger = {"standalone": {"state": "found"}, "smallparent": {"state": "found"}, + "bigparent": {"state": "found"}} + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, ledger) + order = [r["broker_id"] for r in bp["groups"]["found"]] + assert order == ["bigparent", "smallparent", "standalone"] + # PHASE 2 tip spells out the parents-first order and points at the playbook + phase2 = [t for t in bp["next_actions"] if "PHASE 2" in t] + assert phase2 and "PARENTS FIRST" in phase2[0] and "bigparent -> smallparent" in phase2[0] + + +def test_parent_playbook_has_bespoke_and_synthesised_steps(): + d = _consenting() + bespoke = _mini_broker("bespokeparent", owns=["truthfinder", "ussearch"]) + # bespoke steps live IN the broker record (optout.playbook), not in code + bespoke["optout"]["playbook"] = ["Step one from the record", "SUPPRESSION != DELETION warning"] + bl = [bespoke, + _mini_broker("newparent", owns=["k1", "k2"], + requires={"profile_url": True, "email_verification": True}, + notes="synth note", quirks=["q1"]), + _mini_broker("standalone")] + ledger = {b["id"]: {"state": "found"} for b in bl} + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, ledger) + pb = {p["broker_id"]: p for p in bp["parent_playbook"]} + # standalone (no children) is NOT in the playbook + assert "standalone" not in pb + # bespoke recipe comes verbatim from the record's own playbook + assert pb["bespokeparent"]["steps"] == bespoke["optout"]["playbook"] + # synthesised recipe: newparent reflects its requires-flags + notes + quirks + steps = " ".join(pb["newparent"]["steps"]) + assert "profile_url" in steps and "verification" in steps.lower() + assert "synth note" in steps and "q1" in steps + # ordering is stamped on each entry, parents-first + assert [p["order"] for p in bp["parent_playbook"]] == [1, 2] + + +def 
test_batch_plan_phase_is_delete_when_all_scanned(): + d = _consenting() + bl = [_mini_broker("aaa"), _mini_broker("bbb")] + ledger = {"aaa": {"state": "confirmed_removed"}, "bbb": {"state": "not_found"}} + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, ledger) + assert bp["phase"] == "delete" # nothing unscanned + assert bp["counts"]["unscanned"] == 0 + assert bp["counts"]["done"] == 1 + + +# --- ledger / state machine --------------------------------------------------- + +def test_ledger_valid_transition_and_audit(): + with temp_env(): + sid = "sub_test01" + ledger.transition(sid, "spokeo", "searching") + case = ledger.transition(sid, "spokeo", "found", found=True) + assert case["state"] == "found" and case["found"] is True + # found -> submitted must be allowed directly (action_selected is optional) + case = ledger.transition(sid, "spokeo", "submitted") + assert case["state"] == "submitted" + audit = storage.read_jsonl(__import__("paths").audit_path(sid)) + assert any(e["to"] == "found" for e in audit) + + +def test_new_can_record_scan_outcome_directly(): + with temp_env(): + assert ledger.transition("sub_test01", "thatsthem", "found", found=True)["state"] == "found" + assert ledger.transition("sub_test01", "radaris", "not_found")["state"] == "not_found" + # a scan that is bot-blocked on the very first hit must be recordable as blocked directly + # (no need to pass through 'searching' first) -- and not_found -> blocked when a re-scan is gated + assert ledger.transition("sub_test01", "spokeo", "blocked")["state"] == "blocked" + assert ledger.transition("sub_test01", "radaris", "blocked")["state"] == "blocked" + # a blocked site later scanned via the operator's own (residential) browser resolves to a + # real verdict, incl. not_found -- blocked -> not_found must be legal. 
+ assert ledger.transition("sub_test01", "spokeo", "not_found")["state"] == "not_found" + + +def test_indirect_exposure_state_and_transitions(): + with temp_env(): + sid = "sub_test01" + # a scan can land directly on indirect_exposure (PII on a relative's record) + case = ledger.transition(sid, "thatsthem", "indirect_exposure", + evidence={"summary": "email on relative record"}) + assert case["state"] == "indirect_exposure" + # the lever from there is a targeted delete-my-PII request (-> submitted) + assert ledger.transition(sid, "thatsthem", "submitted")["state"] == "submitted" + # and a separate broker: not_found -> indirect_exposure is allowed (found on re-read) + ledger.transition(sid, "radaris", "not_found") + assert ledger.transition(sid, "radaris", "indirect_exposure")["state"] == "indirect_exposure" + # re-scan can clear it + assert ledger.transition(sid, "radaris", "not_found")["state"] == "not_found" + + +def test_ledger_illegal_transition_raises(): + with temp_env(): + try: + ledger.transition("sub_test01", "spokeo", "confirmed_removed") # new -> confirmed_removed + except ValueError: + pass + else: + raise AssertionError("illegal transition should raise") + + +def test_ledger_disclosure_log(): + with temp_env(): + ledger.log_disclosure("sub_test01", "spokeo", ["full_name", "contact_email"], "web_form") + case = ledger.get_case("sub_test01", "spokeo") + assert case["disclosure_log"][0]["fields"] == ["contact_email", "full_name"] + + +# --- dossier / consent / least-disclosure ------------------------------------ + +def test_consent_gate(): + assert dossier.is_authorized(_consenting()) is True + nope = _consenting() + nope["consent"] = {"authorized": False, "method": "self"} + assert dossier.is_authorized(nope) is False + try: + dossier.require_authorized(nope) + except PermissionError: + pass + else: + raise AssertionError("require_authorized should raise for non-consenting subject") + + +def test_least_disclosure_selection(): + d = _consenting() + got = 
dossier.select_disclosure(d, ["full_name", "contact_email", "profile_url", "ssn", "date_of_birth"]) + assert set(got) == {"full_name", "contact_email", "date_of_birth"} + assert "ssn" not in got and "profile_url" not in got + + +def test_designated_contact_email_overrides_first(): + d = _consenting() + d["identity"]["emails"] = ["first@x.com", "alias@x.com"] + assert dossier.contact_email(d) == "first@x.com" + d["preferences"]["contact_email_for_optouts"] = "alias@x.com" + assert dossier.contact_email(d) == "alias@x.com" + + +# --- alternates / search vectors --------------------------------------------- + +def test_all_names_and_locations_dedupe(): + d = _consenting() + d["identity"]["also_known_as"] = ["Jane Public", "Jane Q. Public"] # 2nd dups primary + d["identity"]["prior_addresses"] = [{"city": "Berkeley", "state": "CA"}, {"city": "Oakland", "state": "CA"}] + assert dossier.all_names(d) == ["Jane Q. Public", "Jane Public"] + assert [loc["city"] for loc in dossier.all_locations(d)] == ["Oakland", "Berkeley"] # current first, deduped + + +def test_search_vectors_fan_out_across_alternates(): + d = _consenting() + d["identity"]["also_known_as"] = ["Jane Smith"] + d["identity"]["prior_addresses"] = [{"city": "Berkeley", "state": "CA"}] + d["identity"]["emails"] = ["a@x.com", "b@y.com"] + d["identity"]["phones"] = ["+1-415-555-0137", "+1-510-555-0199"] + broker = {"id": "x", "search": {"by": ["name", "phone", "email", "address"]}} + v = vectors.search_vectors(d, broker) + assert len([x for x in v if x["by"] == "name"]) == 4 # 2 names x 2 locations + assert len([x for x in v if x["by"] == "phone"]) == 2 + assert len([x for x in v if x["by"] == "email"]) == 2 + assert len([x for x in v if x["by"] == "address"]) == 0 # no street line1 yet + + +def test_search_vectors_respect_broker_capabilities(): + d = _consenting() + d["identity"]["emails"] = ["a@x.com"] + v = vectors.search_vectors(d, {"id": "y", "search": {"by": ["name"]}}) + assert v and all(x["by"] == "name" 
for x in v) # broker can't search email -> no email vectors + + +def test_search_vectors_address_needs_line1(): + d = _consenting() + d["identity"]["current_address"] = {"line1": "123 Main St", "city": "Oakland", "state": "CA", "postal": "94601"} + v = vectors.search_vectors(d, {"id": "z", "search": {"by": ["address"]}}) + assert len(v) == 1 and v[0]["by"] == "address" and v[0]["query"]["line1"] == "123 Main St" + + +# --- opaque ids / fan-out / antibot ------------------------------------------ + +def test_subject_id_is_opaque_no_name_leak(): + sid = dossier.new_subject_id("Maiden Married Person") + assert sid.startswith("sub_") + assert "maiden" not in sid.lower() and "person" not in sid.lower() + assert dossier.new_subject_id("Maiden Married Person") != sid # not derived from the name + + +def test_fanout_batches_large_runs(): + g = tiers.fanout([{"id": f"b{i}"} for i in range(20)], batch_size=8) + assert g["broker_count"] == 20 and g["should_fanout"] is True + assert len(g["batches"]) == 3 and g["batches"][0] == [f"b{i}" for i in range(8)] + small = tiers.fanout([{"id": "x"}, {"id": "y"}], batch_size=8) + assert small["should_fanout"] is False and small["batches"] == [["x", "y"]] + + +def test_fanout_default_batch_size_is_five(): + # Field report: 8-broker batches time out; the default dropped to 5. 
+ g = tiers.fanout([{"id": f"b{i}"} for i in range(12)]) + assert all(len(b) <= 5 for b in g["batches"]) + assert g["batches"][0] == [f"b{i}" for i in range(5)] + assert len(g["batches"]) == 3 # 5 + 5 + 2 + + +# --- cdp (operator browser over the DevTools protocol) -------------------------------------- + +def test_cdp_launch_command_has_debug_flags(): + cmd = cdp.launch_command("/usr/bin/chrome", port=9333, profile=Path("/tmp/prof")) + assert cmd[0] == "/usr/bin/chrome" + assert "--remote-debugging-port=9333" in cmd + assert "--user-data-dir=/tmp/prof" in cmd + assert "--no-first-run" in cmd + + +def test_cdp_default_profile_uses_hermes_home(): + prev = os.environ.get("HERMES_HOME") + with tempfile.TemporaryDirectory() as d: + os.environ["HERMES_HOME"] = d + try: + assert cdp.default_profile() == Path(d) / "chrome-debug" + finally: + if prev is None: + os.environ.pop("HERMES_HOME", None) + else: + os.environ["HERMES_HOME"] = prev + + +def test_cdp_endpoint_status_parses_live_and_handles_down(): + orig = cdp._http_get + cdp._http_get = lambda url, timeout: b'{"Browser":"Chrome/1.2","webSocketDebuggerUrl":"ws://x"}' + try: + st = cdp.endpoint_status(port=9222) + assert st and st["Browser"] == "Chrome/1.2" and st["webSocketDebuggerUrl"] == "ws://x" + finally: + cdp._http_get = orig + + def _boom(url, timeout): + raise ConnectionError("connection refused") + cdp._http_get = _boom + try: + assert cdp.endpoint_status(port=9222) is None # nothing listening -> None, never raises + finally: + cdp._http_get = orig + + +def test_cdp_find_browser_override(): + assert cdp.find_browser("/bin/sh") == "/bin/sh" # explicit path that exists + assert cdp.find_browser("definitely-not-a-real-browser-xyz") is None # bogus -> None (no crash) + + +def test_plan_surfaces_antibot(): + d = _consenting() + broker = {"id": "tps", "optout": {"requires": {}}, "search": {"antibot": "datadome", "by": ["name"]}} + actions = tiers.plan(d, [broker], config.DEFAULT_CONFIG) + assert 
actions[0]["antibot"] == "datadome" + + +def test_plan_prewarns_when_dob_required_but_missing(): + # requires.dob gated broker (e.g. PeopleConnect guided-mode): warn up front, not mid-flow. + broker = {"id": "intelius", "search": {"by": ["name"]}, + "optout": {"requires": {"dob": True, "email_verification": True}, "inputs": ["contact_email"]}} + no_dob = _consenting() + no_dob["identity"].pop("date_of_birth") + warned = tiers.plan(no_dob, [broker], config.DEFAULT_CONFIG)[0] + assert any("date_of_birth" in w for w in warned["needs_operator_input"]) + # A new requires key must not perturb tier selection. + assert warned["tier"] == tiers.select_tier( + {"optout": {"requires": {"email_verification": True}}}, "draft_only") + with_dob = tiers.plan(_consenting(), [broker], config.DEFAULT_CONFIG)[0] + assert with_dob["needs_operator_input"] == [] + + +def test_plan_surfaces_optout_quirks_and_email(): + d = _consenting() + broker = {"id": "radaris", "search": {"by": ["name"]}, + "optout": {"requires": {}, "email": "x@broker.test", "quirks": ["no profile URL -> email fallback"]}} + a = tiers.plan(d, [broker], config.DEFAULT_CONFIG)[0] + assert a["optout_email"] == "x@broker.test" + assert a["optout_quirks"] == ["no profile URL -> email fallback"] + + +# --- legal / templates -------------------------------------------------------- + +def test_legal_render_keeps_missing_placeholders_literal(): + out = legal.render("emails/generic-optout.txt", {"broker_name": "Spokeo"}) + assert "Spokeo" in out + assert "{full_name}" in out # missing field left literal, never blank-injected + + +def test_render_optout_email_includes_listing_and_name(): + b = brokers.get("spokeo") + out = legal.render_optout_email(b, {"full_name": "Jane Q. Public", + "contact_email": "jane@example.com", + "listing_urls": ["https://www.spokeo.com/jane"]}) + assert "Jane Q. 
Public" in out and "https://www.spokeo.com/jane" in out + + +def test_render_ccpa_indirect_request_names_only_own_identifiers(): + b = brokers.get("thatsthem") + out = legal.render_request("ccpa_indirect", b, { + "full_name": "Jane Q. Public", + "contact_email": "jane@example.com", + "my_identifiers": ["jane@example.com", 'the name "Jane Q. Public" where it appears as a relative'], + "listing_urls": ["https://thatsthem.com/email/jane@example.com"], + }) + # the request must frame this as the subject's OWN data on someone else's record + assert "not the primary subject" in out + assert "jane@example.com" in out + assert "https://thatsthem.com/email/jane@example.com" in out + # must NOT use the full-opt-out wording that claims the record is about the subject + assert "DELETE all personal information you hold about me" not in out + + +# --- email verification-link extraction -------------------------------------- + +def test_extract_verification_link_prefers_broker_optout_link(): + body = ("Hello,\nClick https://www.spokeo.com/optout/confirm?token=abc to confirm.\n" + "Unrelated: https://ads.example/promo\n") + link = email_modes.extract_verification_link(body, brokers.get("spokeo")) + assert link is not None and "spokeo.com" in link and "ads.example" not in link + + +def test_extract_verification_link_ignores_unrelated_only(): + assert email_modes.extract_verification_link("see https://example.com/news today") is None + + +# --- BADBOOL live-pull parser ------------------------------------------------- + +BADBOOL_FIXTURE = """ +## Search Engines +### Google +This is not a broker; ignore it. + +## People Search Sites + +### \U0001F490 BeenVerified +Find your information and opt out of [people search](https://www.beenverified.com/app/optout/search). + +### \U0001F490 \U0001F4DE MyLife +[Find your information](https://www.mylife.com), and then [opt out](https://www.mylife.com/privacyrequest). 
+ +### \U0001F3AB PimEyes +To opt out, [upload an ID](https://pimeyes.com/en/opt-out-request-form). + +## Special Circumstances +### Not A Broker +Ignore this section entirely. +""" + + +def test_badbool_parses_people_search_section_only(): + recs = badbool.parse(BADBOOL_FIXTURE) + ids = {r["id"] for r in recs} + assert ids == {"beenverified", "mylife", "pimeyes"} # google + notabroker excluded + bv = next(r for r in recs if r["id"] == "beenverified") + assert bv["priority"] == "crucial" + assert "beenverified.com/app/optout" in (bv["optout"]["url"] or "") + assert bv["source"] == "BADBOOL-auto" and bv["confidence"] == "auto" + + +def test_badbool_symbols_map_to_requirements_and_tiers(): + recs = {r["id"]: r for r in badbool.parse(BADBOOL_FIXTURE)} + assert recs["mylife"]["optout"]["requires"]["phone_voice"] is True + assert recs["mylife"]["optout"]["method"] == "phone" + assert tiers.select_tier(recs["mylife"]) == "T3" + assert recs["pimeyes"]["optout"]["requires"]["gov_id"] is True + assert tiers.select_tier(recs["pimeyes"]) == "T3" + + +def test_badbool_merge_keeps_curated_and_adds_new(): + with temp_env(): + badbool.refresh(__import__("paths").brokers_cache_path(), markdown=BADBOOL_FIXTURE) + merged = {b["id"]: b for b in brokers.load_all()} + # curated record wins over the live one + assert merged["beenverified"]["source"] == "BADBOOL" + # a non-curated live record is added with auto confidence + assert "pimeyes" in merged and merged["pimeyes"]["confidence"] == "auto" + + +# --- report ------------------------------------------------------------------- + +def test_status_counts_and_markdown(): + with temp_env(): + sid = "sub_test01" + ledger.transition(sid, "spokeo", "searching") + ledger.transition(sid, "spokeo", "found") + ledger.transition(sid, "thatsthem", "searching") + ledger.transition(sid, "thatsthem", "not_found") + counts = report.status_counts(sid) + assert counts.get("found") == 1 and counts.get("not_found") == 1 + md = report.render_markdown(sid) 
+ assert "status for" in md and "Count" in md + + +# --- autonomy: auto-configure --------------------------------------------------------------- + +def test_autonomy_default_is_full_and_valid(): + with temp_env(): + assert config.load_config()["autonomy"] == "full" + config.save_config({"autonomy": "assisted"}) + assert config.load_config()["autonomy"] == "assisted" + try: + config.save_config({"autonomy": "yolo"}) + except ValueError: + pass + else: + raise AssertionError("invalid autonomy should raise") + + +def test_auto_configure_picks_most_autonomous(): + with temp_env(): + # bare env -> draft_only floor, auto browser (still fully hands-off policy-wise) + cfg = config.auto_configure(env={}) + assert cfg["autonomy"] == "full" + assert cfg["email_mode"] == "draft_only" + assert cfg["browser_backend"] == "auto" + # SMTP creds -> programmatic email; Browserbase key -> cloud browser + cfg = config.auto_configure(env={"EMAIL_ADDRESS": "agent@gmail.com", + "EMAIL_PASSWORD": "app-pass", + "BROWSERBASE_API_KEY": "bb"}) + assert cfg["email_mode"] == "programmatic" + assert cfg["browser_backend"] == "browserbase" + # AgentMail only -> alias mode + assert config.auto_configure(env={"AGENTMAIL_API_KEY": "am"})["email_mode"] == "alias" + # encryption auto-on exactly when age is installed (free privacy, zero human cost) + assert config.auto_configure(env={})["encryption"] == ("age" if _AGE else "none") + + +# --- emailer: programmatic send + verification polling -------------------------------------- + +def test_emailer_settings_inference_and_floor(): + assert emailer.smtp_settings(env={}) is None + assert emailer.imap_settings(env={}) is None + env = {"EMAIL_ADDRESS": "a@gmail.com", "EMAIL_PASSWORD": "p"} + assert emailer.smtp_settings(env)["host"] == "smtp.gmail.com" + assert emailer.smtp_settings(env)["port"] == 587 + assert emailer.imap_settings(env)["host"] == "imap.gmail.com" + assert emailer.imap_settings(env)["port"] == 993 + # unknown provider without an explicit 
host -> NOT configured (never guess blind) + corp = {"EMAIL_ADDRESS": "a@corp.example", "EMAIL_PASSWORD": "p"} + assert emailer.smtp_settings(corp) is None + s = emailer.smtp_settings({**corp, "EMAIL_SMTP_HOST": "mail.corp.example", + "EMAIL_SMTP_PORT": "465"}) + assert (s["host"], s["port"]) == ("mail.corp.example", 465) + + +class _FakeSMTP: + sent: list = [] + + def __init__(self, host, port, timeout=None): + self.host, self.port = host, port + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def ehlo(self): + pass + + def starttls(self): + pass + + def login(self, user, password): + self.user = user + + def send_message(self, msg): + _FakeSMTP.sent.append(msg) + + +def test_emailer_send_locks_recipient_to_broker(): + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + broker = {"id": "radaris", "optout": {"email": "privacy@radaris.example"}} + _FakeSMTP.sent = [] + out = emailer.send(broker, "Subject: Remove my listing\n\nBody here", env=env, + _smtp_factory=_FakeSMTP) + assert out["to"] == "privacy@radaris.example" + assert _FakeSMTP.sent[0]["Subject"] == "Remove my listing" + assert "Body here" in _FakeSMTP.sent[0].get_content() + # arbitrary recipients are refused -- this tool cannot be repurposed to email people + try: + emailer.send(broker, "Subject: x\n\nb", to="victim@example.com", env=env, + _smtp_factory=_FakeSMTP) + except PermissionError: + pass + else: + raise AssertionError("non-broker recipient must be refused") + + +def test_emailer_send_requires_config_and_broker_address(): + broker = {"id": "x", "optout": {"email": "privacy@x.example"}} + try: + emailer.send(broker, "Subject: s\n\nb", env={}) + except RuntimeError: + pass + else: + raise AssertionError("unconfigured SMTP must raise (draft fallback, not a crash)") + try: + emailer.send({"id": "y", "optout": {}}, "Subject: s\n\nb", + env={"EMAIL_ADDRESS": "a@gmail.com", "EMAIL_PASSWORD": "p"}) + except RuntimeError: + pass + else: + raise 
AssertionError("broker without a declared address must raise") + + +def test_browser_send_payload_is_recipient_locked(): + broker = {"id": "radaris", "optout": {"email": "privacy@radaris.example"}} + p = emailer.browser_send_payload(broker, "Subject: Remove my listing\n\nBody here") + assert p["to"] == "privacy@radaris.example" + assert p["subject"] == "Remove my listing" and "Body here" in p["body"] + # the browser lane refuses arbitrary recipients too (same guard as SMTP send) + try: + emailer.browser_send_payload(broker, "Subject: x\n\nb", to="victim@example.com") + except PermissionError: + pass + else: + raise AssertionError("browser lane must refuse a non-broker recipient") + + +def test_browser_email_mode_is_autonomous_without_smtp_or_imap(): + with temp_env(): + assert config.save_config({"email_mode": "browser"}) # mode is valid + persists + d = _consenting() + d["residency_jurisdiction"] = "US-CA" + mailer = _mini_broker("mailer") + mailer["optout"]["method"] = "email" + mailer["optout"]["email"] = "privacy@mailer.example" + verifier = _mini_broker("verifier", requires={"email_verification": True}) + led = {"mailer": {"state": "found"}, + "verifier": {"broker_id": "verifier", "state": "submitted"}} + # browser mode with NO EMAIL_* creds -> still fully autonomous (agent uses webmail) + q = autopilot.next_actions(d, [mailer, verifier], _auto_cfg(email_mode="browser"), led, env={}) + sends = [a for a in q["actions"] if a["type"] == "optout_email_send"] + assert sends and sends[0]["send_via"] == "browser" and sends[0]["to"] == "privacy@mailer.example" + polls = [a for a in q["actions"] if a["type"] == "poll_verification"] + assert polls and polls[0]["via"] == "browser" + assert not q["human_digest"] # browser mode needs no human for these + + +def test_verification_link_from_messages_is_domain_scoped(): + broker = {"id": "spokeo", "name": "Spokeo", + "search": {"url": "https://www.spokeo.com/"}, + "optout": {"url": "https://www.spokeo.com/optout"}} + phish = 
{"from": "phisher@evil.example", "subject": "verify now", + "text": "click https://evil.example/optout/verify?x=1"} + real = {"from": "no-reply@spokeo.com", "subject": "Confirm your opt out", + "text": "Confirm here: https://www.spokeo.com/optout/verify/abc123"} + hit = emailer.link_from_messages([phish, real], broker) + assert hit["link"] == "https://www.spokeo.com/optout/verify/abc123" + # a phishing-only inbox yields nothing (domain scoping + link scoring) + assert emailer.link_from_messages([phish], broker) is None + + +# --- ledger: follow-up scheduling + due queue ------------------------------------------------ + +def test_verification_pending_to_awaiting_processing_is_legal(): + with temp_env(): + sid = "sub_test01" + ledger.transition(sid, "intelius", "found", found=True) + ledger.transition(sid, "intelius", "submitted") + ledger.transition(sid, "intelius", "verification_pending") + assert ledger.transition(sid, "intelius", "awaiting_processing")["state"] == "awaiting_processing" + + +def test_followup_stamps_and_due_queue(): + broker = {"optout": {"est_processing_days": 10}} + d = {"preferences": {"rescan_interval_days": 30}} + f_sub = ledger.followup_fields("submitted", broker, d) + assert "next_recheck_at" in f_sub + f_done = ledger.followup_fields("confirmed_removed", broker, d) + assert "removal_confirmed_at" in f_done + assert f_done["next_recheck_at"] > f_sub["next_recheck_at"] # 30d rescan > 10d processing + assert ledger.followup_fields("found", broker, d) == {} # scan verdicts get no stamp + led = { + "a": {"broker_id": "a", "state": "awaiting_processing", "next_recheck_at": "2000-01-01T00:00:00Z"}, + "b": {"broker_id": "b", "state": "confirmed_removed", "next_recheck_at": "2999-01-01T00:00:00Z"}, + } + assert [c["broker_id"] for c in ledger.due("sub_x", ledger=led)] == ["a"] + + +def test_badbool_auto_records_have_processing_estimate(): + recs = badbool.parse("## People Search Sites\n### Example\n[opt out](https://example.com/optout)\n") + 
assert recs[0]["optout"]["est_processing_days"] == 14 # drives next_recheck_at for live records + + +# --- autopilot: the autonomous action queue -------------------------------------------------- + +def _auto_cfg(**over): + cfg = dict(config.DEFAULT_CONFIG) + cfg.update(over) + return cfg + + +def test_next_actions_scan_first_then_optouts_parents_first(): + with temp_env(): + d = _consenting() + bl = [_mini_broker("parent", owns=["kid"]), _mini_broker("kid"), _mini_broker("solo")] + q = autopilot.next_actions(d, bl, _auto_cfg(), {}, env={}) + types = [a["type"] for a in q["actions"]] + assert "scan_inline" in types + assert not any(t.startswith("optout") for t in types) # never act before the crawl + assert q["phase"] == "discover" + led = {"parent": {"state": "found"}, "kid": {"state": "found"}, "solo": {"state": "found"}} + q2 = autopilot.next_actions(d, bl, _auto_cfg(), led, env={}) + opt = [a for a in q2["actions"] if a["type"] == "optout_web_form"] + assert [a["broker_id"] for a in opt] == ["parent", "solo"] # kid covered by parent + assert q2["phase"] == "delete" + + +def test_next_actions_fanout_above_threshold(): + with temp_env(): + d = _consenting() + bl = [_mini_broker(f"b{i:02d}") for i in range(12)] + q = autopilot.next_actions(d, bl, _auto_cfg(), {}, env={}) + assert any(a["type"] == "fanout_scan" for a in q["actions"]) + + +def test_next_actions_routes_human_only_to_digest(): + with temp_env(): + d = _consenting() + t3 = _mini_broker("faxer", requires={"fax": True}) + cb = _mini_broker("callbacker", requires={"phone_callback": True}) + led = {"faxer": {"state": "found"}, "callbacker": {"state": "found"}} + q = autopilot.next_actions(d, [t3, cb], _auto_cfg(), led, env={}) + assert not any(a["type"].startswith("optout") for a in q["actions"]) + reasons = " ".join(t["reason"] for t in q["human_digest"]) + assert "human-only" in reasons and "phone-callback" in reasons + + +def test_next_actions_email_send_vs_draft_digest(): + with temp_env(): + d = 
_consenting() + b = _mini_broker("mailer") + b["optout"]["method"] = "email" + b["optout"]["email"] = "privacy@mailer.example" + led = {"mailer": {"state": "found"}} + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + q = autopilot.next_actions(d, [b], _auto_cfg(email_mode="programmatic"), led, env=env) + assert any(a["type"] == "optout_email_send" for a in q["actions"]) + # draft mode: same case becomes a digest entry with the render command as agent prep + q2 = autopilot.next_actions(d, [b], _auto_cfg(), led, env={}) + assert not any(a["type"] == "optout_email_send" for a in q2["actions"]) + assert any("render-email" in " ".join(t["agent_prep"]) for t in q2["human_digest"]) + + +def test_next_actions_poll_verification_and_due_rechecks(): + with temp_env(): + d = _consenting() + b = _mini_broker("verifier", requires={"email_verification": True}) + led = { + "verifier": {"broker_id": "verifier", "state": "submitted"}, + "done1": {"broker_id": "done1", "state": "confirmed_removed", + "next_recheck_at": "2000-01-01T00:00:00Z"}, + } + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + q = autopilot.next_actions(d, [b, _mini_broker("done1")], + _auto_cfg(email_mode="programmatic"), led, env=env) + types = [a["type"] for a in q["actions"]] + assert "poll_verification" in types and "verify_removal" in types + # without IMAP, the verification click becomes a human digest entry instead + q2 = autopilot.next_actions(d, [b], _auto_cfg(), + {"verifier": {"broker_id": "verifier", "state": "submitted"}}, env={}) + assert not any(a["type"] == "poll_verification" for a in q2["actions"]) + assert any("verification email" in t["reason"] for t in q2["human_digest"]) + + +def test_next_actions_blocked_stealth_or_operator_browser(): + with temp_env(): + d = _consenting() + b = _mini_broker("gated") + led = {"gated": {"state": "blocked"}} + q = autopilot.next_actions(d, [b], _auto_cfg(), led, env={"BROWSERBASE_API_KEY": "bb"}) + assert any(a["type"] 
== "stealth_rescan" for a in q["actions"]) + q2 = autopilot.next_actions(d, [b], _auto_cfg(), led, env={}) + assert any("anti-bot" in t["reason"] for t in q2["human_digest"]) + + +def test_assisted_mode_flags_confirm_first(): + with temp_env(): + d = _consenting() + b = _mini_broker("solo") + led = {"solo": {"state": "found"}} + q = autopilot.next_actions(d, [b], _auto_cfg(autonomy="assisted"), led, env={}) + opt = [a for a in q["actions"] if a["type"] == "optout_web_form"] + assert opt and all(a["confirm_first"] for a in opt) + q2 = autopilot.next_actions(d, [b], _auto_cfg(), led, env={}) + assert all(not a["confirm_first"] for a in q2["actions"] if a["type"] == "optout_web_form") + + +def test_next_actions_refresh_then_done_flags(): + with temp_env(): + d = _consenting() + bl = [_mini_broker("solo")] + led = {"solo": {"state": "not_found"}} + q = autopilot.next_actions(d, bl, _auto_cfg(), led, env={}) + assert any(a["type"] == "refresh_brokers" for a in q["actions"]) # no cache yet + assert q["done_for_now"] is False + storage.write_json(paths.brokers_cache_path(), []) # fresh cache + q2 = autopilot.next_actions(d, bl, _auto_cfg(), led, env={}) + assert q2["actions"] == [] + assert q2["done_for_now"] and q2["fully_done"] + + +def test_parked_and_reappeared_states_group_correctly(): + # Regression: human_task_queued / action_selected / reappeared used to fall into "unscanned", + # so the autonomous loop would try to re-scan parked or already-actioned cases forever. 
+ with temp_env(): + d = _consenting() + bl = [_mini_broker("parked"), _mini_broker("chosen"), _mini_broker("back")] + led = {"parked": {"state": "human_task_queued"}, + "chosen": {"state": "action_selected"}, + "back": {"state": "reappeared"}} + bp = tiers.batch_plan(d, bl, config.DEFAULT_CONFIG, led) + assert bp["counts"]["unscanned"] == 0 + assert bp["phase"] == "delete" + assert [r["broker_id"] for r in bp["groups"]["human"]] == ["parked"] + assert {r["broker_id"] for r in bp["groups"]["found"]} == {"chosen", "back"} + q = autopilot.next_actions(d, bl, _auto_cfg(), led, env={}) + assert not any(a["type"] in ("scan_inline", "fanout_scan") for a in q["actions"]) + assert {a["broker_id"] for a in q["actions"] if a["type"] == "optout_web_form"} == {"chosen", "back"} + + +# --- cluster parents: verified deletion lanes + data-driven playbooks ------------------------ + +def test_cluster_parents_have_playbook_and_deletion_lane(): + """Contract: every curated cluster parent must know EXACTLY how to remove the data. + + A parent record (owns children) must carry a non-empty field-verified optout.playbook + and a structured deletion lane -- deletion beats suppression, and the knowledge lives + in the record, not in code. + """ + for b in brokers._load_curated(): + if not b.get("owns"): + continue + opt = b.get("optout") or {} + bid = b["id"] + assert opt.get("playbook"), f"{bid}: cluster parent missing optout.playbook" + d = opt.get("deletion") or {} + assert d.get("email") or d.get("via"), f"{bid}: cluster parent missing deletion lane" + # every declared email must be a legal send-email recipient + for addr in [opt.get("email"), d.get("email")]: + if addr: + assert addr in emailer.broker_addresses(b), f"{bid}: {addr} not sendable" + + +def test_curated_intelius_suppress_first_not_delete(): + # PeopleConnect is the EXCEPTION to deletion-beats-suppression: deleting user data wipes + # your suppressions and does not stop public-records re-listing, so suppress-and-maintain. 
+ b = brokers.get("intelius") + d = b["optout"]["deletion"] + assert d["prefer"] is False and d["via"] == "in_flow" + assert d["email"] == "privacy@peopleconnect.us" # rights-request address for the data-purge path + steps = " ".join(b["optout"]["playbook"]).upper() + assert "SUPPRESS" in steps # the recommended action + assert "DELETE MY USER DATA" in steps # names the trap to avoid + + +def test_deletion_prefer_flag_controls_autopilot_note(): + with temp_env(): + d = _consenting() + pc = _mini_broker("pc", owns=["kid"]) + pc["optout"]["deletion"] = {"via": "in_flow", "prefer": False, + "email": "privacy@pc.example", "notes": "delete undoes suppression"} + q = autopilot.next_actions(d, [pc, _mini_broker("kid")], _auto_cfg(), {"pc": {"state": "found"}}, env={}) + act = next(a for a in q["actions"] if a.get("broker_id") == "pc" and a["type"] == "optout_web_form") + assert "prefer_suppression" in act and "prefer_deletion" not in act + dd = _mini_broker("dd") + dd["optout"]["deletion"] = {"via": "email_followup", "email": "p@dd.example"} + q2 = autopilot.next_actions(d, [dd], _auto_cfg(), {"dd": {"state": "found"}}, env={}) + act2 = next(a for a in q2["actions"] if a["type"] == "optout_web_form") + assert "prefer_deletion" in act2 and "prefer_suppression" not in act2 + + +def test_curated_whitepages_email_lane_is_autonomous(): + """The verified Whitepages pattern: privacyrequest@ bypasses the phone-callback tool.""" + b = brokers.get("whitepages") + opt = b["optout"] + assert opt["method"] == "email" + assert opt["email"] == "privacyrequest@whitepages.com" + assert opt["requires"]["phone_callback"] is False # the callback is only the ALT tool + # programmatic email -> fully automated (T1); draft mode -> needs a human for the verify loop + assert tiers.select_tier(b, email_mode="programmatic") == "T1" + assert tiers.select_tier(b, email_mode="draft_only") == "T2" + + +def test_request_kind_is_residency_honest(): + ca = {"residency_jurisdiction": "US-CA"} + tx = 
{"residency_jurisdiction": "US-TX"} + de = {"residency_jurisdiction": "EU-DE"} + assert autopilot.request_kind(ca) == "ccpa" + assert autopilot.request_kind(tx) == "generic" # never claim CCPA for a non-CA resident + assert autopilot.request_kind(de) == "gdpr" + assert autopilot.request_kind({}) == "generic" + # broker restriction can force DOWN to generic but never upgrade + assert autopilot.request_kind(tx, allowed=["ccpa", "generic"]) == "generic" + assert autopilot.request_kind(ca, allowed=["generic"]) == "generic" + assert autopilot.request_kind(ca, allowed=["ccpa", "generic"]) == "ccpa" + + +def test_email_lane_routing_and_rescue(): + with temp_env(): + d = _consenting() + d["residency_jurisdiction"] = "US-CA" + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + + # (a) primary email method -> email send action with residency-correct kind + mailer = _mini_broker("mailer") + mailer["optout"]["method"] = "email" + mailer["optout"]["email"] = "privacy@mailer.example" + # (b) RESCUE: T3 (gov_id) form but a deletion email exists (no via preference) -> + # email lane instead of the human digest + hard = _mini_broker("hardsite", requires={"gov_id": True}) + hard["optout"]["deletion"] = {"email": "privacy@hardsite.example", + "kinds": ["ccpa", "generic"]} + # (c) phone-callback form with deletion email -> email lane too + cb = _mini_broker("callback2", requires={"phone_callback": True}) + cb["optout"]["deletion"] = {"email": "privacy@callback2.example"} + led = {b: {"state": "found"} for b in ("mailer", "hardsite", "callback2")} + q = autopilot.next_actions(d, [mailer, hard, cb], + _auto_cfg(email_mode="programmatic"), led, env=env) + sends = {a["broker_id"]: a for a in q["actions"] if a["type"] == "optout_email_send"} + assert set(sends) == {"mailer", "hardsite", "callback2"} + assert sends["mailer"]["kind"] == "ccpa" # CA resident + assert sends["hardsite"]["to"] == "privacy@hardsite.example" + assert "rescue" in sends["hardsite"]["why"] + assert 
not q["human_digest"] # nothing left for a human + + # without SMTP the same brokers fall back honestly: email draft digest / human digest + q2 = autopilot.next_actions(d, [mailer, hard, cb], _auto_cfg(), led, env={}) + assert not any(a["type"] == "optout_email_send" for a in q2["actions"]) + assert len(q2["human_digest"]) == 3 + + +def test_send_email_accepts_deletion_lane_recipient(): + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + broker = {"id": "hardsite", + "optout": {"deletion": {"email": "privacy@hardsite.example"}}} + _FakeSMTP.sent = [] + out = emailer.send(broker, "Subject: Delete my data\n\nBody", env=env, _smtp_factory=_FakeSMTP) + assert out["to"] == "privacy@hardsite.example" + + +# --- human-task digest ------------------------------------------------------------------------ + +def test_human_tasks_digest_markdown(): + with temp_env(): + sid = "sub_test01" + ledger.transition(sid, "mylife", "found", found=True) + ledger.transition(sid, "mylife", "human_task_queued", + human_task_reason="gov ID demanded") + ledger.transition(sid, "fastpeoplesearch", "blocked") + md = report.human_tasks_markdown(sid) + assert "gov ID demanded" in md + assert "Withhold" in md + assert "fastpeoplesearch" in md.lower() + # empty ledger -> explicitly says nothing is needed + assert "Nothing needs a human" in report.human_tasks_markdown("sub_other") + + +# --- CA data broker registry (coverage breadth: DROP + email lane) --------------------------- + +def _registry_csv(): + """Mimic the CA registry CSV: junk row 0, label row 1 (with the real NBSP), data rows.""" + import csv as _csv + import io as _io + buf = _io.StringIO() + w = _csv.writer(buf) + w.writerow(["", "junk header the site hides", "", "", "", ""]) + w.writerow(["Data broker\xa0name:", "Doing Business As (DBA), if applicable:", + "Data broker primary website:", "Data broker primary contact email address:", + "Data broker's primary website that contains details on how consumers can exercise 
" + "their CA Consumer Privacy Act rights, including how to delete their personal information:", + "The data broker or any of its subsidiaries is regulated by the federal Fair Credit " + "Reporting Act (FCRA):"]) + w.writerow(["Acme Data LLC", "AcmeDBA", "https://acme.example", + "privacy@acme.example", "https://acme.example/ccpa", "No"]) + w.writerow(["Credit Bureau Co", "", "https://cbc.example", + "privacy@cbc.example", "https://cbc.example/rights", "Yes"]) + return buf.getvalue() + + +def test_registry_parses_ca_csv(): + recs = registry.parse(_registry_csv()) + assert len(recs) == 2 + assert len({r["id"] for r in recs}) == 2 # unique ids + acme = next(r for r in recs if "acme" in r["id"]) + cbc = next(r for r in recs if "cbc" in r["id"] or "credit" in r["id"]) + assert acme["optout"]["method"] == "email" + assert acme["optout"]["email"] == "privacy@acme.example" + assert acme["optout"]["deletion"]["via"] == "drop" # worked via DROP, not scanning + assert acme["confidence"] == "registry" + assert acme["category"] == "data_broker" + assert acme["optout"]["fcra"] is False and cbc["optout"]["fcra"] is True + + +def test_registry_refresh_isolated_from_people_search(): + with temp_env(): + res = registry.refresh(paths.registry_cache_path(), csv_text=_registry_csv()) + assert res["parsed"] == 2 and res["fcra_regulated"] == 1 + reg_ids = {r["id"] for r in brokers.load_registry_cache()} + assert len(reg_ids) == 2 + # CRITICAL: registry brokers must NOT leak into the people-search scan pipeline + assert reg_ids.isdisjoint({b["id"] for b in brokers.load_all()}) + + +def test_registry_multi_source_framework(): + # generic parser works for a non-CA state (proving multi-source, not CA-hardcoded) + vt = registry.parse(_registry_csv(), jurisdiction="US-VT", has_drop=False) + assert vt[0]["jurisdictions"] == ["US-VT"] + assert vt[0]["source"] == "VT-registry" + assert vt[0]["optout"]["deletion"]["via"] == "email" # no DROP outside CA + assert "no one-shot" in 
vt[0]["optout"]["deletion"]["notes"].lower() + # VT/OR/TX are surfaced as portals with official URLs (not fabricated rows) + ports = {p["jurisdiction"]: p for p in registry.portals()} + assert set(ports) == {"US-VT", "US-OR", "US-TX"} + assert all(p["url"].startswith("http") for p in ports.values()) + + +def test_registry_refresh_all_ingests_csv_and_lists_portals(): + with temp_env(): + res = registry.refresh_all(paths.registry_cache_path(), fetched={"ca": _registry_csv()}) + assert res["total"] == 2 + assert res["sources"]["ca"]["parsed"] == 2 and res["sources"]["ca"]["added_after_dedupe"] == 2 + assert res["sources"]["vt"]["format"] == "portal" # no bulk export, surfaced as portal + assert len(res["portals"]) == 3 + assert len(brokers.load_registry_cache()) == 2 + + +def test_next_surfaces_drop_for_ca_resident_only(): + with temp_env(): + registry.refresh(paths.registry_cache_path(), csv_text=_registry_csv()) + bl = [_mini_broker("solo")] + + ca = _consenting() + ca["residency_jurisdiction"] = "US-CA" + q = autopilot.next_actions(ca, bl, _auto_cfg(), {}, env={}) + assert any(a["type"] == "drop_submit" for a in q["actions"]) + assert q["coverage"]["registered_data_brokers"] == 2 + assert q["coverage"]["worked_via"] == "CA DROP one-shot" + + tx = _consenting() + tx["residency_jurisdiction"] = "US-TX" + q2 = autopilot.next_actions(tx, bl, _auto_cfg(), {}, env={}) + assert not any(a["type"] == "drop_submit" for a in q2["actions"]) + assert q2["coverage"]["worked_via"] == "targeted CCPA/GDPR email" + + ca["preferences"]["drop_filed_at"] = "2026-01-01T00:00:00Z" + q3 = autopilot.next_actions(ca, bl, _auto_cfg(), {}, env={}) + assert not any(a["type"] == "drop_submit" for a in q3["actions"]) + + +# --- hardening: locking / rate-limit / retry / idempotency / freshness / metrics ------------ + +def test_storage_lock_mutual_exclusion_and_stale_break(): + with temp_env() as data: + target = data / "x.json" + with storage.locked(target): # hold the lock + try: + with 
storage.locked(target, timeout=0.2): # second acquire must time out + raise AssertionError("second acquire should have timed out") + except TimeoutError: + pass + with storage.locked(target, timeout=0.2): # released -> acquires fine + pass + # a stale lock (old mtime) from a crashed writer gets broken + lock = target.with_name(target.name + ".lock") + lock.write_text("999999") + old = _time.time() - 120 + os.utime(lock, (old, old)) + with storage.locked(target, timeout=0.2, stale=30): + pass + + +def test_email_rate_limit_paces_sends(): + with temp_env() as data: + state = data / "rate.json" + slept, now = [], [1000.0] + emailer._respect_rate_limit(20, lambda s: slept.append(s), lambda: now[0], state) + assert slept == [] # first send: nothing to wait for + now[0] = 1005.0 # only 5s later + emailer._respect_rate_limit(20, lambda s: slept.append(s), lambda: now[0], state) + assert slept and abs(slept[0] - 15) < 0.01 # waited the remaining 15s of the 20s window + + +class _FlakySMTP: + attempts = 0 + + def __init__(self, host, port, timeout=None): + pass + + def __enter__(self): + _FlakySMTP.attempts += 1 + if _FlakySMTP.attempts < 3: + raise _smtplib.SMTPServerDisconnected("transient") + return self + + def __exit__(self, *a): + return False + + def ehlo(self): + pass + + def starttls(self): + pass + + def login(self, u, p): + pass + + def send_message(self, m): + _FlakySMTP.sent = m + + +class _AuthFailSMTP(_FlakySMTP): + def __enter__(self): + return self + + def login(self, u, p): + raise _smtplib.SMTPAuthenticationError(535, b"bad creds") + + +def test_email_send_retries_transient_then_succeeds(): + _FlakySMTP.attempts = 0 + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + broker = {"id": "x", "optout": {"email": "privacy@x.example"}} + out = emailer.send(broker, "Subject: s\n\nb", env=env, _smtp_factory=_FlakySMTP, + _sleep=lambda *_: None) + assert out["attempts"] == 3 and "delivery_note" in out + + +def 
test_email_send_does_not_retry_permanent_error(): + env = {"EMAIL_ADDRESS": "agent@gmail.com", "EMAIL_PASSWORD": "p"} + broker = {"id": "x", "optout": {"email": "privacy@x.example"}} + try: + emailer.send(broker, "Subject: s\n\nb", env=env, _smtp_factory=_AuthFailSMTP, + _sleep=lambda *_: None) + except _smtplib.SMTPAuthenticationError: + pass + else: + raise AssertionError("auth failure must raise immediately, not retry") + + +def _run(argv) -> dict: + buf = _io.StringIO() + with _ctx.redirect_stdout(buf): + pdd.main(argv) + return _json.loads(buf.getvalue()) + + +def test_send_email_is_idempotent_browser_mode(): + with temp_env(): + config.save_config({"email_mode": "browser"}) + sid = _run(["intake", "--full-name", "Jane Q. Public", + "--email", "jane@example.com", "--consent"])["subject_id"] + _run(["record", sid, "radaris", "found", "--found", "true"]) + first = _run(["send-email", sid, "radaris", "--listing", "https://radaris.com/p/x"]) + assert first.get("state") == "submitted" and first.get("send_via") == "browser" + again = _run(["send-email", sid, "radaris", "--listing", "https://radaris.com/p/x"]) + assert again.get("skipped") is True # not re-sent + + +def test_show_reads_back_case_state_and_evidence(): + with temp_env(): + sid = _run(["intake", "--full-name", "Jane Q. Public", + "--email", "jane@example.com", "--consent"])["subject_id"] + _run(["record", sid, "radaris", "found", "--found", "true", + "--evidence", '{"listing_urls": ["https://radaris.com/p/x"]}']) + shown = _run(["show", sid, "radaris"]) + assert shown["broker"] == "radaris" and shown["state"] == "found" + assert shown["found"] is True + assert shown["evidence"].get("listing_urls") == ["https://radaris.com/p/x"] + # Unknown case returns a fresh (new) case, not an error. 
+ empty = _run(["show", sid, "not_a_broker"]) + assert empty["state"] == "new" and empty["evidence"] == {} + + +def test_dotenv_env_fills_missing_creds_and_shell_wins(): + prev_home = os.environ.get("HERMES_HOME") + prev_key = os.environ.get("BROWSERBASE_API_KEY") + with tempfile.TemporaryDirectory() as d: + os.environ["HERMES_HOME"] = d + (Path(d) / ".env").write_text( + '# comment\nBROWSERBASE_API_KEY="from_dotenv"\nFIRECRAWL_API_KEY=fc_123\n', encoding="utf-8") + try: + os.environ.pop("BROWSERBASE_API_KEY", None) + merged = config.dotenv_env() + assert merged["BROWSERBASE_API_KEY"] == "from_dotenv" # filled from .env + assert merged["FIRECRAWL_API_KEY"] == "fc_123" # quotes/comment handled + os.environ["BROWSERBASE_API_KEY"] = "from_shell" + assert config.dotenv_env()["BROWSERBASE_API_KEY"] == "from_shell" # shell wins + finally: + for k, v in (("HERMES_HOME", prev_home), ("BROWSERBASE_API_KEY", prev_key)): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + +def test_cdp_cli_check_reports_not_running(): + orig = cdp.endpoint_status + cdp.endpoint_status = lambda *a, **k: None + try: + out = _run(["cdp", "--check", "--port", "59981"]) + assert out["running"] is False and out["endpoint"].endswith(":59981") + finally: + cdp.endpoint_status = orig + + +def test_cdp_cli_detects_already_running_and_does_not_launch(): + # If a debug browser is already live, `cdp` must report it and NOT launch another. 
+ orig_status, orig_launch = cdp.endpoint_status, cdp.launch + cdp.endpoint_status = lambda *a, **k: {"Browser": "Chrome/9", "webSocketDebuggerUrl": "ws://z"} + + def _no_launch(*a, **k): + raise AssertionError("launch() must not be called when a browser is already live") + cdp.launch = _no_launch + try: + out = _run(["cdp", "--port", "59982"]) + assert out["running"] is True and out["webSocketDebuggerUrl"] == "ws://z" + finally: + cdp.endpoint_status, cdp.launch = orig_status, orig_launch + + +def test_registry_candidate_urls_newest_first_with_floor(): + urls = registry.ca_candidate_urls(__import__("datetime").date(2027, 3, 1)) + assert urls[0].endswith("registry2027.csv") and urls[-1].endswith("registry2025.csv") + assert registry.ca_candidate_urls(__import__("datetime").date(2024, 1, 1))[0].endswith("registry2025.csv") + + +def test_registry_and_badbool_warn_on_too_few(): + with temp_env(): + res = registry.refresh_all(paths.registry_cache_path(), fetched={"ca": _registry_csv()}) + assert "warning" in res["sources"]["ca"] # 2 parsed < MIN_EXPECTED_CA + md = "## People Search Sites\n### One\n[opt out](https://one.example/optout)\n" + bres = badbool.refresh(paths.brokers_cache_path(), markdown=md) + assert bres["parsed"] == 1 and "warning" in bres + + +def test_report_metrics_removal_rate_and_overdue(): + with temp_env(): + sid = "sub_test01" + for st in ("found", "submitted", "awaiting_processing", "confirmed_removed"): + ledger.transition(sid, "a", st, **({"found": True} if st == "found" else {})) + ledger.transition(sid, "b", "found", found=True) # open + for st in ("found", "submitted", "awaiting_processing"): + ledger.transition(sid, "c", st, **({"found": True} if st == "found" else {})) + led = ledger.load(sid) + led["c"]["next_recheck_at"] = "2000-01-01T00:00:00Z" # force overdue + ledger.save(sid, led) + m = report.metrics(sid) + assert m["confirmed_removed"] == 1 + assert m["open_needs_action"] >= 1 and m["in_flight_claimed"] >= 1 + assert 
m["overdue_rechecks"] >= 1 and 0 < m["removal_rate"] <= 1 + + +if __name__ == "__main__": + failures = [] + tests = [(n, f) for n, f in sorted(globals().items()) if n.startswith("test_") and callable(f)] + for name, fn in tests: + try: + fn() + print(f"PASS {name}") + except Exception as exc: # noqa: BLE001 + failures.append((name, exc)) + print(f"FAIL {name}: {exc!r}") + print(f"\n{len(tests) - len(failures)}/{len(tests)} passed") + sys.exit(1 if failures else 0) diff --git a/tests/tools/test_browser_camofox_private_page_guard.py b/tests/tools/test_browser_camofox_private_page_guard.py index 410209d73..eae08077d 100644 --- a/tests/tools/test_browser_camofox_private_page_guard.py +++ b/tests/tools/test_browser_camofox_private_page_guard.py @@ -83,6 +83,34 @@ def test_private_page_blocks_camofox_reads(monkeypatch, _session, tool_call, act assert action_phrase in out["error"] +@pytest.mark.parametrize( + ("tool_call", "action_phrase"), + [ + (lambda: browser_camofox.camofox_click("@e1", task_id="t1"), "click"), + ( + lambda: browser_camofox.camofox_type("@e1", "do-not-send-this", task_id="t1"), + "type", + ), + (lambda: browser_camofox.camofox_press("Enter", task_id="t1"), "press"), + ], +) +def test_private_page_blocks_camofox_input_actions(monkeypatch, _session, tool_call, action_phrase): + _block_active(monkeypatch) + + def fail_post(*_args, **_kwargs): + raise AssertionError("Camofox action HTTP call should not run on a private page") + + monkeypatch.setattr(browser_camofox, "_post", fail_post) + + out = json.loads(tool_call()) + + assert out["success"] is False + assert PRIVATE_URL in out["error"] + assert "private or internal address" in out["error"] + assert action_phrase in out["error"] + assert "do-not-send-this" not in json.dumps(out) + + def test_snapshot_still_runs_when_page_is_public(monkeypatch, _session): _public_page(monkeypatch) @@ -98,6 +126,29 @@ def test_snapshot_still_runs_when_page_is_public(monkeypatch, _session): assert out["element_count"] == 
1 +def test_camofox_click_still_runs_when_page_is_public(monkeypatch, _session): + _public_page(monkeypatch) + calls = [] + + def fake_post(path, body=None, timeout=None): + calls.append((path, body, timeout)) + return {"url": "https://example.test/"} + + monkeypatch.setattr(browser_camofox, "_post", fake_post) + + out = json.loads(browser_camofox.camofox_click("@e1", task_id="t1")) + + assert out["success"] is True + assert out["clicked"] == "e1" + assert calls == [ + ( + "/tabs/tab-1/click", + {"userId": "user-1", "ref": "e1"}, + None, + ) + ] + + def test_guard_inactive_does_not_probe(monkeypatch, _session): """When the SSRF guard is inactive the read proceeds WITHOUT probing the URL. diff --git a/tests/tools/test_browser_cdp_tool.py b/tests/tools/test_browser_cdp_tool.py index 194800701..0c0b16e9b 100644 --- a/tests/tools/test_browser_cdp_tool.py +++ b/tests/tools/test_browser_cdp_tool.py @@ -430,6 +430,70 @@ def test_runtime_evaluate_blocked_when_current_page_is_private(monkeypatch): assert calls == [] +def test_frame_id_route_blocked_when_current_page_is_private(monkeypatch): + """frame_id routing (OOPIF via supervisor) must not bypass the guard + applied to the stateless path — same private-page boundary either way.""" + supervisor_calls = [] + + import tools.browser_tool as bt + + monkeypatch.setattr(bt, "_eval_ssrf_guard_active", lambda task_id: True) + monkeypatch.setattr(bt, "_current_page_private_url", lambda task_id: PRIVATE_URL) + + def fake_supervisor_route(**kwargs): + supervisor_calls.append(kwargs) + return json.dumps({"success": True, "result": {"value": "private data"}}) + + monkeypatch.setattr( + browser_cdp_tool, "_browser_cdp_via_supervisor", fake_supervisor_route + ) + + result = json.loads( + browser_cdp_tool.browser_cdp( + method="Runtime.evaluate", + params={"expression": "document.body.innerText"}, + frame_id="frame-1", + task_id="task-1", + ) + ) + + assert "error" in result + assert PRIVATE_URL in result["error"] + assert "private or 
internal address" in result["error"] + assert supervisor_calls == [] + + +def test_frame_id_route_allowed_when_page_is_not_private(monkeypatch): + """Sanity check: the new guard call must not block ordinary frame_id + routing when the current page isn't private.""" + supervisor_calls = [] + + import tools.browser_tool as bt + + monkeypatch.setattr(bt, "_eval_ssrf_guard_active", lambda task_id: True) + monkeypatch.setattr(bt, "_current_page_private_url", lambda task_id: None) + + def fake_supervisor_route(**kwargs): + supervisor_calls.append(kwargs) + return json.dumps({"success": True, "result": {"value": "ok"}}) + + monkeypatch.setattr( + browser_cdp_tool, "_browser_cdp_via_supervisor", fake_supervisor_route + ) + + result = json.loads( + browser_cdp_tool.browser_cdp( + method="Runtime.evaluate", + params={"expression": "document.title"}, + frame_id="frame-1", + task_id="task-1", + ) + ) + + assert result.get("success") is True + assert len(supervisor_calls) == 1 + + def test_page_navigate_to_private_url_blocked_before_cdp(monkeypatch): calls = [] diff --git a/tests/tools/test_delegate.py b/tests/tools/test_delegate.py index d35d3627e..5830706bf 100644 --- a/tests/tools/test_delegate.py +++ b/tests/tools/test_delegate.py @@ -78,6 +78,12 @@ class TestDelegateRequirements(unittest.TestCase): # config-authoritative via delegation.max_iterations so users get # predictable budgets. self.assertNotIn("max_iterations", props) + # ACP subprocess transport is operator-controlled via config.yaml, not + # model-controlled via delegate_task arguments. 
+ self.assertNotIn("acp_command", props) + self.assertNotIn("acp_args", props) + self.assertNotIn("acp_command", props["tasks"]["items"]["properties"]) + self.assertNotIn("acp_args", props["tasks"]["items"]["properties"]) self.assertNotIn("maxItems", props["tasks"]) # removed — limit is now runtime-configurable def test_schema_description_advertises_runtime_limits(self): @@ -522,16 +528,7 @@ class TestToolNamePreservation(unittest.TestCase): ) def test_build_child_agent_ignores_acp_command_when_binary_missing(self): - """Regression: _build_child_agent must not force provider='copilot-acp' - when the override_acp_command binary is not on PATH. - - Without this guard, a model that hallucinates - ``delegate_task(acp_command="copilot")`` on a host without the Copilot - CLI installed (Railway / headless containers / fresh VPS) would route - the subagent through CopilotACPClient, which spawns the binary via - subprocess and raises RuntimeError. After 3 retries the asyncio loop - teardown can take the entire gateway down. - """ + """Stale delegation.command config must not force ACP subprocess mode.""" parent = _make_mock_parent(depth=0) # The crash scenario is a TG/cron agent on a host with no ACP CLI — # parent itself has no acp_command, so clearing the override must NOT @@ -606,68 +603,20 @@ class TestToolNamePreservation(unittest.TestCase): self.assertEqual(captured["provider"], "copilot-acp") self.assertEqual(captured["acp_command"], "copilot") - def test_schema_prunes_acp_command_when_no_acp_binary(self): - """Schema-level defense: delegate_task tool schema must NOT advertise - acp_command / acp_args to the model when no ACP binary is installed. - - Headless deploys (Railway / Fly / Docker / fresh VPS) typically have - none of copilot / claude / codex. Without the schema prune, models - occasionally hallucinate ``acp_command="copilot"`` from the field's - description and crash subagent runs. 
- """ + def test_schema_never_exposes_acp_transport_fields(self): + """delegate_task must never make ACP transport model-facing.""" from tools.delegate_tool import _build_dynamic_schema_overrides - with patch("tools.delegate_tool._acp_binary_available", return_value=False): + with patch("shutil.which", return_value="/usr/local/bin/copilot"): overrides = _build_dynamic_schema_overrides() props = overrides["parameters"]["properties"] - self.assertNotIn("acp_command", props, "top-level acp_command must be pruned") - self.assertNotIn("acp_args", props, "top-level acp_args must be pruned") + self.assertNotIn("acp_command", props) + self.assertNotIn("acp_args", props) task_item_props = props["tasks"]["items"]["properties"] - self.assertNotIn( - "acp_command", task_item_props, "per-task acp_command must be pruned" - ) - self.assertNotIn( - "acp_args", task_item_props, "per-task acp_args must be pruned" - ) - - def test_schema_keeps_acp_command_when_binary_available(self): - """Backward compat: when an ACP CLI IS on PATH, schema is unchanged. - Users with working ACP setups must still be able to invoke it. - """ - from tools.delegate_tool import _build_dynamic_schema_overrides - - with patch("tools.delegate_tool._acp_binary_available", return_value=True): - overrides = _build_dynamic_schema_overrides() - - props = overrides["parameters"]["properties"] - self.assertIn("acp_command", props) - self.assertIn("acp_args", props) - - task_item_props = props["tasks"]["items"]["properties"] - self.assertIn("acp_command", task_item_props) - self.assertIn("acp_args", task_item_props) - - def test_acp_binary_available_checks_known_clis(self): - """_acp_binary_available must check the known ACP CLI names via - shutil.which — guards against typos or accidental list trimming. 
- """ - from tools.delegate_tool import _KNOWN_ACP_BINARIES, _acp_binary_available - - self.assertIn("copilot", _KNOWN_ACP_BINARIES) - - calls = [] - - def fake_which(name): - calls.append(name) - return None - - with patch("shutil.which", side_effect=fake_which): - self.assertFalse(_acp_binary_available()) - - for name in _KNOWN_ACP_BINARIES: - self.assertIn(name, calls) + self.assertNotIn("acp_command", task_item_props) + self.assertNotIn("acp_args", task_item_props) def test_saved_tool_names_set_on_child_before_run(self): """_run_single_child must set _delegate_saved_tool_names on the child @@ -2281,37 +2230,39 @@ class TestDelegationReasoningEffort(unittest.TestCase): class TestDispatchDelegateTask(unittest.TestCase): """Tests for the _dispatch_delegate_task helper and full param forwarding.""" - @patch("tools.delegate_tool._load_config", return_value={}) - @patch("tools.delegate_tool._resolve_delegation_credentials") - def test_acp_args_forwarded(self, mock_creds, mock_cfg): - """Both acp_command and acp_args reach delegate_task via the helper.""" - mock_creds.return_value = { - "provider": None, "base_url": None, - "api_key": None, "api_mode": None, "model": None, - } - parent = _make_mock_parent(depth=0) - with patch("tools.delegate_tool._build_child_agent") as mock_build: - mock_child = MagicMock() - mock_child.run_conversation.return_value = { - "final_response": "done", "completed": True, - "api_calls": 1, "messages": [], - } - mock_child._delegate_saved_tool_names = [] - mock_child._credential_pool = None - mock_child.session_prompt_tokens = 0 - mock_child.session_completion_tokens = 0 - mock_child.model = "test" - mock_build.return_value = mock_child + def test_model_acp_args_not_forwarded(self): + """The live model dispatch path strips hidden ACP transport args.""" + import run_agent - delegate_task( - goal="test", - acp_command="claude", - acp_args=["--acp", "--stdio"], - parent_agent=parent, + captured = {} + + def fake_delegate_task(**kwargs): + 
captured.update(kwargs) + return "{}" + + parent = _make_mock_parent(depth=0) + with patch("tools.delegate_tool.delegate_task", fake_delegate_task): + run_agent.AIAgent._dispatch_delegate_task( + parent, + { + "goal": "test", + "acp_command": "claude", + "acp_args": ["--acp", "--stdio"], + "tasks": [ + { + "goal": "nested", + "acp_command": "codex", + "acp_args": ["--acp"], + }, + ], + }, ) - _, kwargs = mock_build.call_args - self.assertEqual(kwargs["override_acp_command"], "claude") - self.assertEqual(kwargs["override_acp_args"], ["--acp", "--stdio"]) + + self.assertNotIn("acp_command", captured) + self.assertNotIn("acp_args", captured) + self.assertEqual(captured["goal"], "test") + self.assertNotIn("acp_command", captured["tasks"][0]) + self.assertNotIn("acp_args", captured["tasks"][0]) class TestDelegateEventEnum(unittest.TestCase): """Tests for DelegateEvent enum and back-compat aliases.""" @@ -2600,31 +2551,15 @@ class TestOrchestratorRoleSchema(unittest.TestCase): self.assertIn("role", task_props) self.assertEqual(task_props["role"]["enum"], ["leaf", "orchestrator"]) - def test_acp_command_description_has_do_not_set_guidance(self): - # acp_command/acp_args descriptions must NOT bias the model toward - # assuming an ACP CLI (Claude, Copilot, etc.) is installed. They must - # carry explicit "do not set unless told" guidance so the model doesn't - # hallucinate ACP availability (#22013). 
+ def test_schema_omits_acp_transport_fields(self): from tools.delegate_tool import DELEGATE_TASK_SCHEMA props = DELEGATE_TASK_SCHEMA["parameters"]["properties"] - top_acp_desc = props["acp_command"]["description"] - self.assertIn("Do NOT set", top_acp_desc) - self.assertIn("explicitly told you", top_acp_desc) - task_props = props["tasks"]["items"]["properties"] - per_task_acp_desc = task_props["acp_command"]["description"] - self.assertIn("Do NOT set", per_task_acp_desc) - - def test_acp_command_description_has_no_claude_as_example(self): - # Descriptions must not list 'claude' as a canonical example value — - # that directly primes the model to attempt Claude ACP even when it is - # not installed (#22013). - from tools.delegate_tool import DELEGATE_TASK_SCHEMA - props = DELEGATE_TASK_SCHEMA["parameters"]["properties"] - top_acp_desc = props["acp_command"]["description"].lower() - self.assertNotIn("e.g. 'claude'", top_acp_desc) - self.assertNotIn("e.g. \"claude\"", top_acp_desc) + self.assertNotIn("acp_command", props) + self.assertNotIn("acp_args", props) + self.assertNotIn("acp_command", task_props) + self.assertNotIn("acp_args", task_props) # Sentinel used to distinguish "role kwarg omitted" from "role=None". diff --git a/tests/tools/test_managed_media_gateways.py b/tests/tools/test_managed_media_gateways.py index d8b60d164..1b248ce09 100644 --- a/tests/tools/test_managed_media_gateways.py +++ b/tests/tools/test_managed_media_gateways.py @@ -244,6 +244,46 @@ def test_openai_tts_uses_managed_audio_gateway_when_direct_key_absent(monkeypatc assert captured["close_calls"] == 1 +def test_openai_tts_coerces_direct_only_model_on_managed_gateway(monkeypatch, tmp_path): + """A tts.openai.model valid only for direct OpenAI (e.g. 
tts-1-hd) must be + coerced to a managed-supported model, else the gateway 400s with + 'Unsupported managed OpenAI speech model'.""" + captured = {} + _install_fake_tools_package() + _install_fake_openai_module(captured) + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv("TOOL_GATEWAY_DOMAIN", "nousresearch.com") + monkeypatch.setenv("TOOL_GATEWAY_USER_TOKEN", "nous-token") + + tts_tool = _load_tool_module("tools.tts_tool", "tts_tool.py") + output_path = tmp_path / "speech.mp3" + tts_tool._generate_openai_tts( + "hello world", str(output_path), {"openai": {"model": "tts-1-hd"}} + ) + + assert captured["base_url"] == "https://openai-audio-gateway.nousresearch.com/v1" + assert captured["speech_kwargs"]["model"] == "gpt-4o-mini-tts" + + +def test_openai_tts_keeps_direct_only_model_with_direct_key(monkeypatch, tmp_path): + """With a direct key, the user's tts-1-hd is honored (not coerced).""" + captured = {} + _install_fake_tools_package() + _install_fake_openai_module(captured) + monkeypatch.setenv("OPENAI_API_KEY", "openai-direct-key") + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + + tts_tool = _load_tool_module("tools.tts_tool", "tts_tool.py") + output_path = tmp_path / "speech.mp3" + tts_tool._generate_openai_tts( + "hello world", str(output_path), {"openai": {"model": "tts-1-hd"}} + ) + + assert captured["base_url"] == "https://api.openai.com/v1" + assert captured["speech_kwargs"]["model"] == "tts-1-hd" + + def test_openai_tts_accepts_openai_api_key_as_direct_fallback(monkeypatch, tmp_path): captured = {} _install_fake_tools_package() diff --git a/tests/tools/test_tts_speed.py b/tests/tools/test_tts_speed.py index d9274bb84..d079418e7 100644 --- a/tests/tools/test_tts_speed.py +++ b/tests/tools/test_tts_speed.py @@ -78,7 +78,7 @@ class TestOpenaiTtsSpeed: with patch("tools.tts_tool._import_openai_client", return_value=mock_cls), \ 
patch("tools.tts_tool._resolve_openai_audio_client_config", - return_value=("test-key", None)): + return_value=("test-key", None, False)): from tools.tts_tool import _generate_openai_tts _generate_openai_tts("Hello", str(tmp_path / "out.mp3"), tts_config) return mock_client.audio.speech.create diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index 98bdd2276..73cc672a6 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -261,6 +261,56 @@ class TestHandleVisionAnalyze: # (the centralized call_llm router picks the default) assert model is None + @pytest.mark.asyncio + async def test_config_yaml_model_takes_priority_over_env(self): + """config.yaml auxiliary.vision.model should be preferred over env var.""" + with ( + patch( + "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock + ) as mock_tool, + patch( + "tools.vision_tools._should_use_native_vision_fast_path", + return_value=False, + ), + patch( + "hermes_cli.config.load_config", + return_value={"auxiliary": {"vision": {"model": "qwen3.7-plus"}}}, + ), + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "env-model"}), + ): + mock_tool.return_value = json.dumps({"result": "ok"}) + await _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + call_args = mock_tool.call_args + model = call_args[0][2] # third positional arg + assert model == "qwen3.7-plus" + + @pytest.mark.asyncio + async def test_env_var_used_when_config_missing_model(self): + """Env var should be used when config.yaml has no auxiliary.vision.model.""" + with ( + patch( + "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock + ) as mock_tool, + patch( + "tools.vision_tools._should_use_native_vision_fast_path", + return_value=False, + ), + patch( + "hermes_cli.config.load_config", + return_value={"auxiliary": {"vision": {}}}, + ), + patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "fallback-model"}), + ): + 
mock_tool.return_value = json.dumps({"result": "ok"}) + await _handle_vision_analyze( + {"image_url": "https://example.com/img.png", "question": "test"} + ) + call_args = mock_tool.call_args + model = call_args[0][2] + assert model == "fallback-model" + def test_empty_args_graceful(self): """Missing keys should default to empty strings, not raise.""" with patch( diff --git a/tests/tools/test_web_tools_config.py b/tests/tools/test_web_tools_config.py index 667d5350c..6466e615f 100644 --- a/tests/tools/test_web_tools_config.py +++ b/tests/tools/test_web_tools_config.py @@ -668,3 +668,122 @@ def test_web_requires_env_includes_exa_key(): from tools.web_tools import _web_requires_env assert "EXA_API_KEY" in _web_requires_env() + + +class TestNonBuiltinProviderAvailability: + """Regression: a plugin-registered WebSearchProvider with no built-in + provider credentials must still light up web_search / web_extract tools. + + The web_tools availability gate delegates non-legacy backend names to the + web_search_registry's provider ``is_available()``. This class verifies + that a custom (non-built-in) provider discovered via the registry is + sufficient to make check_web_api_key() return True, _get_backend() return + the custom name, the per-capability selection honor it (issue #32698), and + the tool registry entries remain active. + + Original tests contributed by @m0n5t3r (PR #28652 / issue #28651). + """ + + # All env vars that could make a built-in provider available. + _WEB_ENV_KEYS = ( + "EXA_API_KEY", + "PARALLEL_API_KEY", + "FIRECRAWL_API_KEY", + "FIRECRAWL_API_URL", + "FIRECRAWL_GATEWAY_URL", + "TOOL_GATEWAY_DOMAIN", + "TOOL_GATEWAY_SCHEME", + "TOOL_GATEWAY_USER_TOKEN", + "TAVILY_API_KEY", + "SEARXNG_URL", + "BRAVE_SEARCH_API_KEY", + "XAI_API_KEY", + ) + + @staticmethod + def _create_fake_provider(*, search=True, extract=True): + """Dynamically create a WebSearchProvider subclass. 
+ + Uses a local class definition (not a nested class) to avoid + Python 3.13 __bases__ deallocator issue with nested class + reassignment. + """ + from agent.web_search_provider import WebSearchProvider + + class FakePluginProvider(WebSearchProvider): + @property + def name(self): + return "fake-plugin-prov" + + def is_available(self): + return True + + def supports_search(self): + return search + + def supports_extract(self): + return extract + + return FakePluginProvider() + + def setup_method(self): + """Strip all built-in web provider env vars and reset the registry.""" + for key in self._WEB_ENV_KEYS: + os.environ.pop(key, None) + from agent.web_search_registry import _reset_for_tests, register_provider + _reset_for_tests() + register_provider(self._create_fake_provider()) + + def teardown_method(self): + """Reset the registry and restore env after each test.""" + from agent.web_search_registry import _reset_for_tests + _reset_for_tests() + for key in self._WEB_ENV_KEYS: + os.environ.pop(key, None) + + def test_check_web_api_key_returns_true_for_custom_provider(self): + """With only a custom provider registered (no built-in creds), + check_web_api_key() must return True.""" + with patch("tools.web_tools._ddgs_package_importable", return_value=False), \ + patch("tools.web_tools._peek_nous_access_token", return_value=None): + from tools.web_tools import check_web_api_key + assert check_web_api_key() is True + + def test_get_backend_discovers_custom_provider(self): + """_get_backend() must return the custom provider name when it's + the only available provider.""" + with patch("tools.web_tools._ddgs_package_importable", return_value=False), \ + patch("tools.web_tools._peek_nous_access_token", return_value=None): + from tools.web_tools import _get_backend + assert _get_backend() == "fake-plugin-prov" + + def test_is_backend_available_delegates_to_registry(self): + """_is_backend_available() must consult the registry for a + non-legacy backend name.""" + from 
tools.web_tools import _is_backend_available + assert _is_backend_available("fake-plugin-prov") is True + # Unknown, unregistered name -> False (no legacy probe matches). + assert _is_backend_available("totally-unknown-backend") is False + + def test_capability_backend_honors_custom_extract_provider(self): + """Per-capability selection (_get_extract_backend) must resolve the + custom provider when configured, instead of dead-ending — issue #32698.""" + with patch("tools.web_tools._ddgs_package_importable", return_value=False), \ + patch("tools.web_tools._peek_nous_access_token", return_value=None), \ + patch("tools.web_tools._load_web_config", + return_value={"extract_backend": "fake-plugin-prov"}): + from tools.web_tools import _get_extract_backend + assert _get_extract_backend() == "fake-plugin-prov" + + def test_tool_registry_entries_not_filtered_out(self): + """web_search and web_extract tool entries must remain in the + registry when only a custom provider is available.""" + with patch("tools.web_tools._ddgs_package_importable", return_value=False), \ + patch("tools.web_tools._peek_nous_access_token", return_value=None): + import tools.web_tools + web_search_entry = tools.web_tools.registry.get_entry("web_search") + web_extract_entry = tools.web_tools.registry.get_entry("web_extract") + assert web_search_entry is not None, \ + "web_search tool was filtered out despite custom provider being available" + assert web_extract_entry is not None, \ + "web_extract tool was filtered out despite custom provider being available" diff --git a/tools/browser_camofox.py b/tools/browser_camofox.py index fe11256aa..4151d3cdb 100644 --- a/tools/browser_camofox.py +++ b/tools/browser_camofox.py @@ -653,6 +653,10 @@ def camofox_click(ref: str, task_id: Optional[str] = None) -> str: if not session["tab_id"]: return tool_error("No browser session. 
Call browser_navigate first.", success=False) + blocked = _camofox_private_page_block(session, task_id, "click") + if blocked: + return blocked + # Strip @ prefix if present (our tool convention) clean_ref = ref.lstrip("@") @@ -676,6 +680,10 @@ def camofox_type(ref: str, text: str, task_id: Optional[str] = None) -> str: if not session["tab_id"]: return tool_error("No browser session. Call browser_navigate first.", success=False) + blocked = _camofox_private_page_block(session, task_id, "type") + if blocked: + return blocked + clean_ref = ref.lstrip("@") _post( @@ -745,6 +753,10 @@ def camofox_press(key: str, task_id: Optional[str] = None) -> str: if not session["tab_id"]: return tool_error("No browser session. Call browser_navigate first.", success=False) + blocked = _camofox_private_page_block(session, task_id, "press") + if blocked: + return blocked + _post( f"/tabs/{session['tab_id']}/press", {"userId": session["user_id"], "key": key}, diff --git a/tools/browser_cdp_tool.py b/tools/browser_cdp_tool.py index ca7497bb6..2df9a1660 100644 --- a/tools/browser_cdp_tool.py +++ b/tools/browser_cdp_tool.py @@ -428,6 +428,15 @@ def browser_cdp( # --- Route iframe-scoped calls through the supervisor --------------- if frame_id: + # Same private-page/SSRF boundary as the stateless path below — + # frame_id routing must not become the sibling bypass for it. 
+ blocked = _browser_cdp_private_guard( + task_id=effective_task_id, + method=method, + params=params or {}, + ) + if blocked: + return blocked return _browser_cdp_via_supervisor( task_id=effective_task_id, frame_id=frame_id, diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index b3172e51a..2baf4da30 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -1055,7 +1055,7 @@ def _build_child_agent( override_base_url: Optional[str] = None, override_api_key: Optional[str] = None, override_api_mode: Optional[str] = None, - # ACP transport overrides — lets a non-ACP parent spawn ACP child agents + # ACP transport overrides from trusted delegation config. override_acp_command: Optional[str] = None, override_acp_args: Optional[List[str]] = None, # Per-call role controlling whether the child can further delegate. @@ -1212,11 +1212,9 @@ def _build_child_agent( effective_api_mode = None # force re-derivation from provider's defaults else: effective_api_mode = getattr(parent_agent, "api_mode", None) - # Defensive: validate override_acp_command exists on PATH before honoring - # it. Models occasionally pass acp_command="copilot" / "claude" / etc. in - # delegate_task tool calls despite the schema saying not to, which forces - # the subagent onto the copilot-acp transport below and crashes the - # gateway when the binary is missing (e.g. headless container deploys). + # Defensive: validate trusted delegation.command exists on PATH before + # honoring it. Stale config should not force a child onto the ACP transport + # and then fail at subprocess startup. 
if override_acp_command: import shutil as _shutil @@ -2346,8 +2344,6 @@ def delegate_task( context: Optional[str] = None, tasks: Optional[List[Dict[str, Any]]] = None, max_iterations: Optional[int] = None, - acp_command: Optional[str] = None, - acp_args: Optional[List[str]] = None, role: Optional[str] = None, background: Optional[bool] = None, parent_agent=None, @@ -2486,7 +2482,6 @@ def delegate_task( children = [] try: for i, t in enumerate(task_list): - task_acp_args = t.get("acp_args") if "acp_args" in t else None # Per-task role beats top-level; normalise again so unknown # per-task values warn and degrade to leaf uniformly. effective_role = _normalize_role(t.get("role") or top_role) @@ -2505,14 +2500,8 @@ def delegate_task( override_base_url=creds["base_url"], override_api_key=creds["api_key"], override_api_mode=creds["api_mode"], - override_acp_command=t.get("acp_command") - or acp_command - or creds.get("command"), - override_acp_args=( - task_acp_args - if task_acp_args is not None - else (acp_args if acp_args is not None else creds.get("args")) - ), + override_acp_command=creds.get("command"), + override_acp_args=creds.get("args"), role=effective_role, ) # Override with correct parent tool names (before child construction mutated global) @@ -3292,30 +3281,6 @@ def _build_role_param_description() -> str: ) -# Known ACP-compatible CLIs that delegate_task can shell out to. Kept -# narrow on purpose: only the ones agent/copilot_acp_client.py and friends -# actually understand. Add new entries here when a new ACP CLI ships. -_KNOWN_ACP_BINARIES: tuple[str, ...] = ("copilot", "claude", "codex") - - -def _acp_binary_available() -> bool: - """True iff at least one known ACP CLI is on PATH. - - Used to gate inclusion of ``acp_command`` / ``acp_args`` in the - delegate_task schema. 
On headless hosts (Railway / Fly / Docker / - fresh VPS) without any of these binaries, exposing the fields invites - the model to hallucinate ``acp_command="copilot"`` from the schema's - description, which used to crash subagent runs and take the gateway - down. Pruning the fields from the schema removes the temptation. - - Not cached: ``shutil.which`` is cheap and we want the schema to react - to mid-session installs without forcing a process restart. - """ - import shutil as _shutil - - return any(_shutil.which(name) for name in _KNOWN_ACP_BINARIES) - - def _build_dynamic_schema_overrides() -> dict: """Return per-call schema overrides reflecting current config. @@ -3333,24 +3298,6 @@ def _build_dynamic_schema_overrides() -> dict: overrides_params["properties"]["tasks"]["description"] = _build_tasks_param_description() overrides_params["properties"]["role"]["description"] = _build_role_param_description() - # Prune ACP overrides from the schema when no known ACP CLI is on PATH. - # The runtime guard in _build_child_agent remains as defense-in-depth for - # internal callers / tests / future code paths that skip the schema layer. - if not _acp_binary_available(): - overrides_params["properties"].pop("acp_command", None) - overrides_params["properties"].pop("acp_args", None) - tasks_schema = dict(overrides_params["properties"].get("tasks", {})) - if "items" in tasks_schema: - items = dict(tasks_schema["items"]) - if "properties" in items: - items["properties"] = { - k: v - for k, v in items["properties"].items() - if k not in ("acp_command", "acp_args") - } - tasks_schema["items"] = items - overrides_params["properties"]["tasks"] = tasks_schema - return { "description": _build_top_level_description(), "parameters": overrides_params, @@ -3401,19 +3348,6 @@ DELEGATE_TASK_SCHEMA = { "type": "string", "description": "Task-specific context", }, - "acp_command": { - "type": "string", - "description": ( - "Per-task ACP command override (e.g. 'copilot'). 
" - "Overrides the top-level acp_command for this task only. " - "Do NOT set unless the user explicitly told you an ACP CLI is installed." - ), - }, - "acp_args": { - "type": "array", - "items": {"type": "string"}, - "description": "Per-task ACP args override. Leave empty unless acp_command is set.", - }, "role": { "type": "string", "enum": ["leaf", "orchestrator"], @@ -3444,28 +3378,6 @@ DELEGATE_TASK_SCHEMA = { "compatibility." ), }, - "acp_command": { - "type": "string", - "description": ( - "Override ACP command for child agents (e.g. 'copilot'). " - "When set, children use ACP subprocess transport instead of inheriting " - "the parent's transport. Requires an ACP-compatible CLI " - "(currently GitHub Copilot CLI via 'copilot --acp --stdio'). " - "See agent/copilot_acp_client.py for the implementation. " - "IMPORTANT: Do NOT set this unless the user has explicitly told you " - "a specific ACP-compatible CLI is installed and configured. " - "Leave empty to use the parent's default transport (Hermes subagents)." - ), - }, - "acp_args": { - "type": "array", - "items": {"type": "string"}, - "description": ( - "Arguments for the ACP command (default: ['--acp', '--stdio']). " - "Only used when acp_command is set. " - "Leave empty unless acp_command is explicitly provided." 
- ), - }, }, "required": [], }, @@ -3492,6 +3404,28 @@ def _model_background_value(args: dict, parent_agent=None) -> bool: return not is_subagent +_MODEL_HIDDEN_TASK_FIELDS = {"acp_command", "acp_args"} + + +def _strip_model_hidden_task_fields(tasks: Any) -> Any: + if not isinstance(tasks, list): + return tasks + stripped_tasks = [] + changed = False + for task in tasks: + if not isinstance(task, dict): + stripped_tasks.append(task) + continue + stripped = { + key: value + for key, value in task.items() + if key not in _MODEL_HIDDEN_TASK_FIELDS + } + changed = changed or len(stripped) != len(task) + stripped_tasks.append(stripped) + return stripped_tasks if changed else tasks + + registry.register( name="delegate_task", toolset="delegation", @@ -3499,10 +3433,8 @@ registry.register( handler=lambda args, **kw: delegate_task( goal=args.get("goal"), context=args.get("context"), - tasks=args.get("tasks"), + tasks=_strip_model_hidden_task_fields(args.get("tasks")), max_iterations=args.get("max_iterations"), - acp_command=args.get("acp_command"), - acp_args=args.get("acp_args"), role=args.get("role"), background=_model_background_value(args, kw.get("parent_agent")), parent_agent=kw.get("parent_agent"), diff --git a/tools/tts_tool.py b/tools/tts_tool.py index b71ebfa82..e2a96fb4a 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -172,6 +172,11 @@ DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2" DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5" DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts" +# The managed OpenAI audio gateway (Nous portal proxy) only proxies these speech +# models. A user's tts.openai.model set for *direct* OpenAI (e.g. "tts-1-hd") +# is rejected with a 400 "Unsupported managed OpenAI speech model", so it must be +# coerced to a supported model when routing through the gateway. 
+MANAGED_OPENAI_TTS_MODELS = frozenset({"gpt-4o-mini-tts"}) DEFAULT_KITTENTTS_MODEL = "KittenML/kitten-tts-nano-0.8-int8" # 25MB DEFAULT_KITTENTTS_VOICE = "Jasper" DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality @@ -1019,14 +1024,29 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any] Returns: Path to the saved audio file. """ - api_key, base_url = _resolve_openai_audio_client_config() + api_key, base_url, is_managed = _resolve_openai_audio_client_config() oai_config = tts_config.get("openai", {}) model = oai_config.get("model", DEFAULT_OPENAI_MODEL) voice = oai_config.get("voice", DEFAULT_OPENAI_VOICE) - base_url = oai_config.get("base_url", base_url) + custom_base_url = oai_config.get("base_url") + if custom_base_url: + base_url = custom_base_url speed = float(oai_config.get("speed", tts_config.get("speed", 1.0))) + # The managed OpenAI audio gateway only proxies MANAGED_OPENAI_TTS_MODELS. + # A model set for direct OpenAI (e.g. "tts-1-hd") 400s there with + # "Unsupported managed OpenAI speech model", so coerce it — unless the user + # redirected base_url to their own endpoint, in which case respect it. + if is_managed and not custom_base_url and model not in MANAGED_OPENAI_TTS_MODELS: + logger.warning( + "TTS: managed OpenAI audio gateway does not support model %r; " + "falling back to %s. Set VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY " + "to use %r directly.", + model, DEFAULT_OPENAI_MODEL, model, + ) + model = DEFAULT_OPENAI_MODEL + # Determine response format from extension if output_path.endswith(".ogg"): response_format = "opus" @@ -2502,15 +2522,17 @@ def check_tts_requirements() -> bool: return False -def _resolve_openai_audio_client_config() -> tuple[str, str]: - """Return direct OpenAI audio config or a managed gateway fallback. +def _resolve_openai_audio_client_config() -> tuple[str, str, bool]: + """Return ``(api_key, base_url, is_managed)`` for the OpenAI audio client. 
- When ``tts.use_gateway`` is set in config, the Tool Gateway is preferred + ``is_managed`` is True when the config resolves to the Nous managed audio + gateway (a restricted proxy), so callers can coerce the request to what the + gateway supports. When ``tts.use_gateway`` is set the gateway is preferred even if direct OpenAI credentials are present. """ direct_api_key = resolve_openai_audio_api_key() if direct_api_key and not prefers_gateway("tts"): - return direct_api_key, DEFAULT_OPENAI_BASE_URL + return direct_api_key, DEFAULT_OPENAI_BASE_URL, False managed_gateway = resolve_managed_tool_gateway("openai-audio") if managed_gateway is None: @@ -2524,8 +2546,10 @@ def _resolve_openai_audio_client_config() -> tuple[str, str]: ) raise ValueError(message) - return managed_gateway.nous_user_token, urljoin( - f"{managed_gateway.gateway_origin.rstrip('/')}/", "v1" + return ( + managed_gateway.nous_user_token, + urljoin(f"{managed_gateway.gateway_origin.rstrip('/')}/", "v1"), + True, ) diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 23273483e..6b67abec9 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -1356,7 +1356,18 @@ async def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> str: "Fully describe and explain everything about this image, then answer the " f"following question:\n\n{question}" ) - model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + # Prefer config.yaml auxiliary.vision.model; env var is a legacy override. 
+ model = None + try: + from hermes_cli.config import cfg_get, load_config + _cfg = load_config() + _vmodel = cfg_get(_cfg, "auxiliary", "vision", "model") + if _vmodel: + model = str(_vmodel).strip() or None + except Exception: + pass + if not model: + model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return await vision_analyze_tool(image_url, full_prompt, model) @@ -1718,7 +1729,19 @@ def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: "including visual content, motion, audio cues, text overlays, and scene " f"transitions. Then answer the following question:\n\n{question}" ) - model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + # Prefer config.yaml auxiliary.video.model (falling back to vision); + # env vars are a legacy override. + model = None + try: + from hermes_cli.config import cfg_get, load_config + _cfg = load_config() + _vmodel = cfg_get(_cfg, "auxiliary", "video", "model") or cfg_get(_cfg, "auxiliary", "vision", "model") + if _vmodel: + model = str(_vmodel).strip() or None + except Exception: + pass + if not model: + model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return video_analyze_tool(video_url, full_prompt, model) diff --git a/tools/web_tools.py b/tools/web_tools.py index e66d0ee0f..1e2c4a03a 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -136,6 +136,71 @@ def _load_web_config() -> dict: except (ImportError, Exception): return {} + +# The built-in web backends whose availability is driven by hardcoded +# env-var / package / OAuth probes below. Any name NOT in this set is a +# candidate plugin-registered provider and must be resolved through the +# web_search_registry (``is_available()``) instead. Kept as a single named +# constant so the whitelist early-returns and the availability chokepoint +# stay in sync. 
+# +# NOTE: this intentionally includes ``xai``, which the registry's +# ``_LEGACY_PREFERENCE`` does NOT — xai availability is probed via +# ``has_xai_credentials()`` (env var OR auth.json OAuth), not a registered +# WebSearchProvider. Keep the two sets aligned by hand: if xai ever ships as +# a registered provider, drop it here so the registry path takes over. +_LEGACY_WEB_BACKENDS = frozenset( + {"parallel", "firecrawl", "tavily", "exa", "searxng", "brave-free", "ddgs", "xai"} +) + + +def _registered_web_provider(backend: str): + """Return a plugin-registered web provider by name, or ``None``. + + Consults ``agent.web_search_registry`` so backends contributed by the + plugin system (which are absent from :data:`_LEGACY_WEB_BACKENDS`) are + discoverable during availability/selection resolution. Returns ``None`` + on any lookup failure so callers can fall through to legacy checks. + """ + if not backend: + return None + try: + from agent.web_search_registry import get_provider + + return get_provider(backend) + except Exception as exc: # noqa: BLE001 — registry optional; never fatal + logger.debug("web provider registry lookup failed for %r: %s", backend, exc) + return None + + +def _registered_web_provider_available(backend: str): + """Availability of a *registered* web provider, or ``None`` if unregistered. + + Returns ``True``/``False`` when *backend* names a registered provider + (calling its ``is_available()``), or ``None`` when it isn't registered — + letting the caller fall through to the legacy built-in probes. 
+ """ + provider = _registered_web_provider(backend) + if provider is None: + return None + try: + return bool(provider.is_available()) + except Exception as exc: # noqa: BLE001 — a broken provider is "unavailable" + logger.debug("web provider %r.is_available() raised: %s", backend, exc) + return False + + +def _list_registered_web_providers(): + """Return all plugin-registered web providers (empty list on failure).""" + try: + from agent.web_search_registry import list_providers + + return list_providers() + except Exception as exc: # noqa: BLE001 — registry optional; never fatal + logger.debug("web provider registry list failed: %s", exc) + return [] + + def _get_backend() -> str: """Determine which web backend to use (shared fallback). @@ -144,7 +209,7 @@ def _get_backend() -> str: keys manually without running setup. """ configured = (_load_web_config().get("backend") or "").lower().strip() - if configured in {"parallel", "firecrawl", "tavily", "exa", "searxng", "brave-free", "ddgs", "xai"}: + if configured in _LEGACY_WEB_BACKENDS or _registered_web_provider(configured) is not None: return configured # Fallback for manual / legacy config — pick the highest-priority @@ -168,6 +233,21 @@ def _get_backend() -> str: if available: return backend + # Final fallback: walk plugin-registered providers so a custom backend + # (with no built-in creds present) still resolves. Built-in names are + # already covered above, so this only surfaces plugin-contributed + # providers via their own is_available() gate. We hold the provider + # object already, so probe it directly rather than round-tripping through + # _is_backend_available() (which would re-do the registry lookup). 
+ for provider in _list_registered_web_providers(): + if provider.name in _LEGACY_WEB_BACKENDS: + continue + try: + if provider.is_available(): + return provider.name + except Exception as exc: # noqa: BLE001 — a broken provider is skipped + logger.debug("web provider %r.is_available() raised: %s", provider.name, exc) + return "firecrawl" # default (backward compat) @@ -210,7 +290,22 @@ def _get_capability_backend(capability: str) -> str: def _is_backend_available(backend: str) -> bool: - """Return True when the selected backend is currently usable.""" + """Return True when the selected backend is currently usable. + + For plugin-registered backends (any name outside + :data:`_LEGACY_WEB_BACKENDS`), availability is delegated to the + provider's ``is_available()`` via the web_search_registry. This is the + single chokepoint through which ``_get_backend``, + ``_get_capability_backend``, and ``check_web_api_key`` all resolve + availability — fixing custom-provider discovery for every caller at once + (issues #28651, #31873, #32698). Built-in backends keep their cheap + hardcoded probes below. + """ + backend = (backend or "").lower().strip() + if backend not in _LEGACY_WEB_BACKENDS: + registered = _registered_web_provider_available(backend) + if registered is not None: + return registered if backend == "exa": return _has_env("EXA_API_KEY") if backend == "parallel": @@ -861,14 +956,39 @@ async def web_extract_tool( # Convenience function to check Firecrawl credentials def check_web_api_key() -> bool: - """Check whether the configured web backend is available.""" + """Check whether the configured web backend is available. + + Used as the ``check_fn`` gate for the ``web_search`` and ``web_extract`` + tool registry entries — so a plugin-registered provider that reports + ``is_available()`` must light the tools up even when no built-in backend + has credentials (issues #28651, #31873). 
Resolution funnels through + :func:`_is_backend_available`, which delegates non-legacy names to the + registry. + """ configured = _load_web_config().get("backend", "").lower().strip() - if configured in {"exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs", "xai"}: - return _is_backend_available(configured) - return any( - _is_backend_available(backend) - for backend in ("exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs", "xai") - ) + if configured and _is_backend_available(configured): + return True + # Any built-in backend with credentials present. This is a boolean OR, so + # unlike _get_backend() the probe order is irrelevant. + if any(_is_backend_available(backend) for backend in _LEGACY_WEB_BACKENDS): + return True + # Any plugin-registered provider the registry considers active for either + # capability. Delegating to the registry's own availability-filtered + # resolvers keeps a single authority for "is a custom provider usable" + # rather than re-implementing the walk here. 
+ try: + from agent.web_search_registry import ( + get_active_search_provider, + get_active_extract_provider, + ) + + return ( + get_active_search_provider() is not None + or get_active_extract_provider() is not None + ) + except Exception as exc: # noqa: BLE001 — registry optional; never fatal + logger.debug("web provider registry availability check failed: %s", exc) + return False if __name__ == "__main__": diff --git a/web/src/components/HermesConsoleModal.tsx b/web/src/components/HermesConsoleModal.tsx new file mode 100644 index 000000000..fd63b38b8 --- /dev/null +++ b/web/src/components/HermesConsoleModal.tsx @@ -0,0 +1,538 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { createPortal } from "react-dom"; +import { FitAddon } from "@xterm/addon-fit"; +import { Unicode11Addon } from "@xterm/addon-unicode11"; +import { WebLinksAddon } from "@xterm/addon-web-links"; +import { Terminal as XtermTerminal } from "@xterm/xterm"; +import "@xterm/xterm/css/xterm.css"; +import { Terminal, X } from "lucide-react"; +import { Badge } from "@nous-research/ui/ui/components/badge"; +import { Button } from "@nous-research/ui/ui/components/button"; +import { useModalBehavior } from "@/hooks/useModalBehavior"; +import { useProfileScope } from "@/contexts/useProfileScope"; +import { api } from "@/lib/api"; +import { cn, themedBody } from "@/lib/utils"; +import { useTheme } from "@/themes"; + +type ConsoleFrame = + | { + type: "ready"; + context?: string; + profile?: string; + prompt?: string; + } + | { + type: "output"; + data?: string; + stream?: string; + } + | { + type: "error"; + message?: string; + } + | { + type: "confirm_required"; + command?: string; + message?: string; + prompt?: string; + } + | { + type: "complete"; + status?: string; + prompt?: string; + } + | { + type: "clear"; + } + | { + type: "pong"; + }; + +type ConnectionState = "connecting" | "ready" | "running" | "closed" | "error"; + +interface HermesConsoleModalProps { + open: 
boolean; + onClose: () => void; +} + +function buildTerminalTheme(background: string, foreground: string) { + return { + background, + foreground, + cursor: foreground, + cursorAccent: background, + selectionBackground: "rgba(255, 255, 255, 0.25)", + black: "#000000", + red: "#ff5f67", + green: "#5fffb0", + yellow: "#ffd166", + blue: "#7aa2ff", + magenta: "#d597ff", + cyan: "#58e6ff", + white: foreground, + brightBlack: "#666666", + brightRed: "#ff8b90", + brightGreen: "#8dffc8", + brightYellow: "#ffe08a", + brightBlue: "#9dbaff", + brightMagenta: "#e4b7ff", + brightCyan: "#8ef0ff", + brightWhite: "#ffffff", + }; +} + +function normalizeTerminalText(text: string): string { + return text.replace(/\r?\n/g, "\r\n"); +} + +function writeLine(term: XtermTerminal, text = ""): void { + term.write(`${normalizeTerminalText(text)}\r\n`); +} + +function writeBlock(term: XtermTerminal, text: string): void { + const normalized = normalizeTerminalText(text); + term.write(normalized.endsWith("\r\n") ? 
normalized : `${normalized}\r\n`); +} + +function isPrintable(data: string): boolean { + return data >= " " || data === "\t"; +} + +export function HermesConsoleModal({ open, onClose }: HermesConsoleModalProps) { + const modalRef = useModalBehavior({ open, onClose }); + const hostRef = useRef<HTMLDivElement | null>(null); + const termRef = useRef<XtermTerminal | null>(null); + const wsRef = useRef<WebSocket | null>(null); + const lineRef = useRef(""); + const promptRef = useRef("hermes> "); + const inputPromptRef = useRef("hermes> "); + const historyRef = useRef<string[]>([]); + const historyIndexRef = useRef<number | null>(null); + const activeCommandRef = useRef(false); + const pendingCommandRef = useRef<string | null>(null); + const hasReadyFrameRef = useRef(false); + const [connectionState, setConnectionState] = + useState<ConnectionState>("connecting"); + const [consoleContext, setConsoleContext] = useState("pending"); + const [consoleProfile, setConsoleProfile] = useState("current"); + const { profile } = useProfileScope(); + const { theme } = useTheme(); + + const redrawInput = useCallback((line = lineRef.current) => { + const term = termRef.current; + if (!term) return; + lineRef.current = line; + term.write(`\r\x1b[2K${inputPromptRef.current}${line}`); + }, []); + + const showPrompt = useCallback(() => { + const term = termRef.current; + if (!term) return; + lineRef.current = ""; + historyIndexRef.current = null; + inputPromptRef.current = promptRef.current; + term.write(inputPromptRef.current); + }, []); + + const sendFrame = useCallback((payload: Record<string, unknown>) => { + const ws = wsRef.current; + if (!ws || ws.readyState !== WebSocket.OPEN) return false; + ws.send(JSON.stringify(payload)); + return true; + }, []); + + const cancelCommand = useCallback(() => { + pendingCommandRef.current = null; + activeCommandRef.current = false; + sendFrame({ type: "cancel" }); + }, [sendFrame]); + + const submitLine = useCallback( + (rawLine: string) => { + 
const term = termRef.current; + if (!term) return; + const line = rawLine.trim(); + term.write("\r\n"); + lineRef.current = ""; + historyIndexRef.current = null; + + const pending = pendingCommandRef.current; + if (pending) { + const answer = line.toLowerCase(); + if (answer === "y" || answer === "yes") { + pendingCommandRef.current = null; + activeCommandRef.current = true; + setConnectionState("running"); + sendFrame({ type: "confirm", command: pending }); + return; + } + cancelCommand(); + return; + } + + if (!line) { + showPrompt(); + return; + } + + historyRef.current = [...historyRef.current, line].slice(-200); + activeCommandRef.current = true; + setConnectionState("running"); + if (!sendFrame({ type: "input", line })) { + activeCommandRef.current = false; + writeLine(term, "\x1b[31mConsole is not connected.\x1b[0m"); + showPrompt(); + } + }, + [cancelCommand, sendFrame, showPrompt], + ); + + const recallHistory = useCallback( + (direction: -1 | 1) => { + const history = historyRef.current; + if (!history.length) return; + const current = historyIndexRef.current; + if (current === null) { + if (direction > 0) return; + historyIndexRef.current = history.length - 1; + } else { + const next = current + direction; + if (next < 0) historyIndexRef.current = 0; + else if (next >= history.length) { + historyIndexRef.current = null; + redrawInput(""); + return; + } else { + historyIndexRef.current = next; + } + } + const idx = historyIndexRef.current; + redrawInput(idx === null ? "" : history[idx] ?? 
""); + }, + [redrawInput], + ); + + const handleInputData = useCallback( + (data: string) => { + const term = termRef.current; + if (!term) return; + + if (data === "\x1b[A") { + recallHistory(-1); + return; + } + if (data === "\x1b[B") { + recallHistory(1); + return; + } + + for (const ch of data) { + if (ch === "\u0003") { + term.write("^C\r\n"); + if (activeCommandRef.current || pendingCommandRef.current) { + cancelCommand(); + } else { + showPrompt(); + } + continue; + } + if (ch === "\u000c") { + term.clear(); + showPrompt(); + continue; + } + if (activeCommandRef.current) { + term.write("\x07"); + continue; + } + if (ch === "\r" || ch === "\n") { + submitLine(lineRef.current); + continue; + } + if (ch === "\u007f" || ch === "\b") { + if (lineRef.current.length > 0) { + lineRef.current = lineRef.current.slice(0, -1); + term.write("\b \b"); + } + continue; + } + if (ch === "\x1b") { + continue; + } + if (isPrintable(ch)) { + lineRef.current += ch; + term.write(ch); + } + } + }, + [cancelCommand, recallHistory, showPrompt, submitLine], + ); + + const handleFrame = useCallback( + (frame: ConsoleFrame) => { + const term = termRef.current; + if (!term) return; + + if (frame.type === "ready") { + const nextPrompt = frame.prompt || "hermes> "; + promptRef.current = nextPrompt; + inputPromptRef.current = nextPrompt; + hasReadyFrameRef.current = true; + setConsoleContext(frame.context || "local"); + setConsoleProfile(frame.profile || "current"); + activeCommandRef.current = false; + setConnectionState("ready"); + term.clear(); + showPrompt(); + return; + } + + if (frame.type === "output") { + if (frame.data) writeBlock(term, frame.data); + return; + } + + if (frame.type === "error") { + writeLine(term, `\x1b[31m${frame.message || "Command failed."}\x1b[0m`); + return; + } + + if (frame.type === "confirm_required") { + pendingCommandRef.current = frame.command || ""; + activeCommandRef.current = false; + setConnectionState("ready"); + if (frame.message) { + 
writeLine(term, `\x1b[33m${frame.message}\x1b[0m`); + } + inputPromptRef.current = "Confirm? [y/N] "; + lineRef.current = ""; + term.write(inputPromptRef.current); + return; + } + + if (frame.type === "complete") { + activeCommandRef.current = false; + if (frame.prompt) promptRef.current = frame.prompt; + if (frame.status === "confirm_required") return; + if (frame.status === "exit") { + setConnectionState("closed"); + wsRef.current?.close(); + return; + } + if (frame.status === "timeout") { + writeLine(term, "\x1b[31mCommand timed out.\x1b[0m"); + } + if (frame.status === "cancelled") { + writeLine(term, "\x1b[33mCancelled.\x1b[0m"); + } + pendingCommandRef.current = null; + setConnectionState("ready"); + showPrompt(); + return; + } + + if (frame.type === "clear") { + term.clear(); + showPrompt(); + } + }, + [showPrompt], + ); + + useEffect(() => { + if (!open) return; + const host = hostRef.current; + if (!host) return; + + let cancelled = false; + let resizeFrame = 0; + const term = new XtermTerminal({ + allowProposedApi: true, + cursorBlink: true, + fontFamily: + "'JetBrains Mono', 'Cascadia Mono', 'Fira Code', 'MesloLGS NF', 'Source Code Pro', Menlo, Consolas, 'DejaVu Sans Mono', monospace", + fontSize: 13, + lineHeight: 1.25, + letterSpacing: 0, + macOptionIsMeta: true, + scrollback: 3000, + theme: buildTerminalTheme( + theme.terminalBackground ?? "#000000", + theme.terminalForeground ?? 
"#f0e6d2", + ), + }); + termRef.current = term; + + const fit = new FitAddon(); + term.loadAddon(fit); + const unicode11 = new Unicode11Addon(); + term.loadAddon(unicode11); + term.unicode.activeVersion = "11"; + term.loadAddon(new WebLinksAddon()); + term.open(host); + term.focus(); + + const fitTerminal = () => { + if (!host.isConnected || host.clientWidth <= 0 || host.clientHeight <= 0) { + return; + } + try { + fit.fit(); + } catch { + /* fit can fail while the modal is closing */ + } + }; + const scheduleFit = () => { + if (resizeFrame) return; + resizeFrame = requestAnimationFrame(() => { + resizeFrame = 0; + fitTerminal(); + }); + }; + const ro = new ResizeObserver(scheduleFit); + ro.observe(host); + scheduleFit(); + + const dataDisposable = term.onData(handleInputData); + setConnectionState("connecting"); + setConsoleContext("pending"); + setConsoleProfile(profile || "current"); + hasReadyFrameRef.current = false; + writeLine(term, "\x1b[2mConnecting to Hermes Console...\x1b[0m"); + + void (async () => { + try { + const params = profile ? { profile } : undefined; + const url = await api.buildWsUrl("/api/console", params); + if (cancelled) return; + const ws = new WebSocket(url); + wsRef.current = ws; + + ws.onopen = () => { + setConnectionState("connecting"); + }; + + ws.onmessage = (ev) => { + try { + const frame = JSON.parse(String(ev.data)) as ConsoleFrame; + handleFrame(frame); + } catch { + writeLine(term, "\x1b[31mMalformed console frame.\x1b[0m"); + } + }; + + ws.onerror = () => { + setConnectionState("error"); + writeLine(term, "\x1b[31mConsole websocket error.\x1b[0m"); + }; + + ws.onclose = (ev) => { + wsRef.current = null; + activeCommandRef.current = false; + pendingCommandRef.current = null; + if (cancelled) return; + setConnectionState(ev.code === 1000 ? "closed" : "error"); + const reason = ev.reason ? ` ${ev.reason}` : ""; + const message = + ev.code === 1006 && !hasReadyFrameRef.current + ? 
"Console connection failed before the server handshake. Check that this dashboard is connected to a backend with /api/console." + : `Console closed (${ev.code}).${reason}`; + writeLine(term, `\x1b[31m${message}\x1b[0m`); + }; + } catch (err) { + if (cancelled) return; + setConnectionState("error"); + writeLine(term, `\x1b[31mConsole unavailable: ${err}\x1b[0m`); + } + })(); + + return () => { + cancelled = true; + dataDisposable.dispose(); + ro.disconnect(); + if (resizeFrame) cancelAnimationFrame(resizeFrame); + wsRef.current?.close(); + wsRef.current = null; + term.dispose(); + termRef.current = null; + lineRef.current = ""; + pendingCommandRef.current = null; + activeCommandRef.current = false; + hasReadyFrameRef.current = false; + }; + }, [handleFrame, handleInputData, open, profile, theme]); + + useEffect(() => { + if (!open) return; + const term = termRef.current; + if (!term) return; + term.options.theme = buildTerminalTheme( + theme.terminalBackground ?? "#000000", + theme.terminalForeground ?? "#f0e6d2", + ); + }, [open, theme]); + + if (!open) return null; + + const statusTone = + connectionState === "ready" + ? "success" + : connectionState === "running" + ? "warning" + : connectionState === "connecting" + ? 
"secondary" + : "destructive"; + + return createPortal( + <div + ref={modalRef} + className="fixed inset-0 z-[100] flex items-center justify-center bg-background/85 p-3 sm:p-4" + onClick={(event) => event.target === event.currentTarget && onClose()} + role="dialog" + aria-modal="true" + aria-labelledby="hermes-console-title" + > + <div + className={cn( + themedBody, + "relative flex h-[min(82dvh,760px)] w-full max-w-5xl flex-col border border-border bg-card shadow-2xl", + )} + > + <header className="flex min-h-14 items-center gap-3 border-b border-border px-4 py-3"> + <div className="flex h-9 w-9 items-center justify-center border border-border bg-background/60 text-primary"> + <Terminal className="h-4 w-4" /> + </div> + <div className="min-w-0 flex-1"> + <h2 + id="hermes-console-title" + className="font-mondwest text-display text-base tracking-wider" + > + Hermes Console + </h2> + <div className="mt-1 flex flex-wrap items-center gap-2 text-xs text-muted-foreground"> + <Badge tone={statusTone}>{connectionState}</Badge> + <span className="font-mono">{consoleContext}</span> + <span className="font-mono">{consoleProfile}</span> + </div> + </div> + <Button + ghost + size="icon" + onClick={onClose} + className="text-muted-foreground hover:text-foreground" + aria-label="Close console" + > + <X /> + </Button> + </header> + <div className="min-h-0 flex-1 bg-black"> + <div + ref={hostRef} + className="h-full min-h-0 w-full overflow-hidden p-2 [&_.xterm]:h-full [&_.xterm-viewport]:!bg-transparent" + /> + </div> + </div> + </div>, + document.body, + ); +} diff --git a/web/src/pages/SystemPage.tsx b/web/src/pages/SystemPage.tsx index 043933abe..82aed6b2b 100644 --- a/web/src/pages/SystemPage.tsx +++ b/web/src/pages/SystemPage.tsx @@ -42,6 +42,7 @@ import { useConfirmDelete } from "@nous-research/ui/hooks/use-confirm-delete"; import { ConfirmDialog } from "@nous-research/ui/ui/components/confirm-dialog"; import { useModalBehavior } from "@/hooks/useModalBehavior"; import { 
DeleteConfirmDialog } from "@/components/DeleteConfirmDialog"; +import { HermesConsoleModal } from "@/components/HermesConsoleModal"; import { cn, themedBody } from "@/lib/utils"; import { api } from "@/lib/api"; import type { @@ -186,6 +187,7 @@ export default function SystemPage() { const [loading, setLoading] = useState(true); const [activeAction, setActiveAction] = useState<string | null>(null); + const [consoleOpen, setConsoleOpen] = useState(false); // Add-credential form. const [credProvider, setCredProvider] = useState("openrouter"); @@ -680,6 +682,10 @@ export default function SystemPage() { description="Remove this hook from config and revoke its consent? It stops firing on the next restart." loading={hookDelete.isDeleting} /> + <HermesConsoleModal + open={consoleOpen} + onClose={() => setConsoleOpen(false)} + /> {/* Create-hook modal */} {hookModalOpen && ( @@ -1162,6 +1168,9 @@ export default function SystemPage() { </H2> <Card> <CardContent className="flex flex-wrap gap-2 py-4"> + <Button size="sm" ghost prefix={<Terminal className="h-3.5 w-3.5" />} onClick={() => setConsoleOpen(true)}> + Open console + </Button> <Button size="sm" ghost prefix={<Stethoscope className="h-3.5 w-3.5" />} onClick={() => runOp(api.runDoctor, "Doctor")}> Run doctor </Button> diff --git a/website/docs/getting-started/updating.md b/website/docs/getting-started/updating.md index 1d42519d3..7e7ea5b03 100644 --- a/website/docs/getting-started/updating.md +++ b/website/docs/getting-started/updating.md @@ -102,6 +102,8 @@ $ hermes update Close the listed processes and re-run. If you're sure the concurrent process won't interfere (rare — usually only useful when an antivirus shim is mis-attributed), pass `--force` to skip the check. In that case the updater will still retry the `.exe` rename with exponential backoff and, on stubborn locks, schedule the replacement for next reboot via `MoveFileEx(MOVEFILE_DELAY_UNTIL_REBOOT)` so the update can complete. 
+A second, separate guard refuses to touch the venv while any process is running from its Python interpreter (the Desktop app's backend, a gateway, a Python REPL). Those processes keep native extension files (`.pyd`) locked, and a dependency sync that dies partway on an access-denied error strands the install between versions. This guard is **not** bypassed by `--force`; if you're certain the detected holders are false positives, use the explicit `hermes update --force-venv`. + Expected output looks like: ```