From f8e36f0f31ba657df34a12796d335bae55728268 Mon Sep 17 00:00:00 2001 From: SHL0MS Date: Fri, 3 Jul 2026 13:05:28 -0400 Subject: [PATCH] field-report fixes: dob pre-warn, .env creds, show cmd, false-positive guards from a live run (NY subject, 43 brokers): - fanout default 8->5 (8+ batches time out) - setup/doctor read $HERMES_HOME/.env so creds hermes already loads are detected - new `show <subject> <broker>`: reads back case state+evidence for cheap parent re-verify - intelius: requires.dob + 5-step guided-mode gate; planner pre-warns when dob is missing - rehold.json: property-record != PII (an address-only match is not_found, not removable) - tps/fps: match_signal_notes tell the scanner to ignore SEO-templated titles - methods.md: browser backends (scan vs execute + operator chrome over CDP), property/SEO callouts - doctor: warn when browser email-mode pairs with a cloud scan backend (needs operator chrome/CDP) - ledger: found->not_found retract (false-positive), blocked->human_task_queued - autopilot: indirect-exposure web-form fallback; drop a stray f-string tests: standalone 92 pass; ruff clean. 
--- optional-skills/security/unbroker/SKILL.md | 16 +++-- .../references/brokers/fastpeoplesearch.json | 1 + .../unbroker/references/brokers/intelius.json | 13 ++-- .../unbroker/references/brokers/rehold.json | 48 ++++++++++++++ .../references/brokers/truepeoplesearch.json | 1 + .../security/unbroker/references/methods.md | 60 +++++++++++++++++ .../security/unbroker/scripts/autopilot.py | 18 ++++- .../security/unbroker/scripts/config.py | 20 ++++++ .../security/unbroker/scripts/ledger.py | 10 ++- .../security/unbroker/scripts/pdd.py | 65 ++++++++++++++----- .../security/unbroker/scripts/tiers.py | 20 +++++- tests/skills/test_unbroker_skill.py | 60 +++++++++++++++++ 12 files changed, 301 insertions(+), 31 deletions(-) create mode 100644 optional-skills/security/unbroker/references/brokers/rehold.json diff --git a/optional-skills/security/unbroker/SKILL.md b/optional-skills/security/unbroker/SKILL.md index e4cf6be25..7f9401599 100644 --- a/optional-skills/security/unbroker/SKILL.md +++ b/optional-skills/security/unbroker/SKILL.md @@ -63,7 +63,9 @@ verifying re-scan. - `python3` (stdlib only; no extra packages needed for the core engine). - **Optional upgrades** (the skill works zero-config without these; `setup --auto` turns on every - one it detects - each one converts a class of human tasks into agent actions): + one it detects, reading credentials from the shell env **and from `$HERMES_HOME/.env`** so keys + Hermes already loads for its own tools are picked up without re-exporting - each one converts a + class of human tasks into agent actions): - **Cloud browser (recommended default): `BROWSERBASE_API_KEY`.** `setup --auto` selects it whenever the key is present, and it is the intended baseline: a real residential-IP cloud browser **clears soft/managed CAPTCHAs (Cloudflare Turnstile, hCaptcha/reCAPTCHA checkbox) as @@ -74,8 +76,13 @@ verifying re-scan. 
- Email automation, two credential-free-or-not options: - **Browser mode (no password): `setup --email-mode browser`.** The agent sends opt-out/CCPA emails and opens verification links through the operator's **logged-in webmail** using - `browser_*` tools. Nothing is stored. Needs the inbox signed in in the browser Hermes uses - (a cloud browser like Browserbase won't hold the session; use a local/operator browser). + `browser_*` tools. Nothing is stored. This requires Hermes to be pointed at the operator's own + logged-in browser, **NOT** a cloud browser: a headless cloud browser (Browserbase) holds no + webmail session and is itself Cloudflare/DataDome-gated on webmail and on session-bound broker + gates (e.g. PeopleConnect guided-mode). Drive the operator's real Chrome over CDP - launch + `chrome --remote-debugging-port=9222 --user-data-dir="$HOME/.hermes/chrome-debug"` (a dedicated + debug profile signed into the webmail once, not the Default profile) and connect the browser + tools to `127.0.0.1:9222`. See `references/methods.md` -> "Browser backends: scan vs execute". Falls back to drafts for an email if the inbox isn't reachable. - **SMTP/IMAP (stored creds): `EMAIL_ADDRESS` + `EMAIL_PASSWORD`** (+ `EMAIL_SMTP_HOST` / `EMAIL_IMAP_HOST` for non-mainstream providers; gmail/outlook/yahoo/icloud/fastmail inferred). @@ -110,8 +117,9 @@ breaks reading the dossier). 
| `$PDD drop [--filed]` | **The one-shot legal lever**: one CA DROP request deletes from ALL registered brokers; `--filed` records it | | `$PDD plan [--priority crucial]` | Per-broker tier + method + `search_vectors` + the exact fields to disclose | | `$PDD plan --batch` | **Reduce view**: overlays ledger state, groups brokers by next action (unscanned/found/indirect/blocked/in_progress/done), collapses ownership clusters, **orders `found` cluster-parents-first + emits a tailored `parent_playbook`**, prints `next_actions` | -| `$PDD fanout [--priority crucial] [--size 8]` | Batch brokers into parallel `delegate_task` subagents (auto for large runs) | +| `$PDD fanout [--priority crucial] [--size 5]` | Batch brokers into parallel `delegate_task` subagents (auto for large runs; default batch size is 5 because batches of 8+ time out) | | `$PDD record [--found true] [--evidence JSON] [--disclosed F --channel C] [--reason "..."]` | Update the ledger (validated state machine); **auto-stamps `next_recheck_at`** | +| `$PDD show <subject> <broker>` | Read back a case's recorded state + evidence + disclosure log (so the parent re-verifies a subagent's `found` without re-deriving the listing URL) | | `$PDD send-email --listing [--kind ccpa_indirect ...]` | Render + record the request (recipient locked to the broker's own address). 
**browser** mode returns a `compose` payload to send via webmail (no password); **programmatic** mode SMTP-sends | | `$PDD verify-link --text ''` | **browser mode**: extract a broker's verification link from webmail text you read (anti-phishing scored) | | `$PDD poll-verification [--broker ]` | **programmatic mode**: poll IMAP for verification links (anti-phishing scored); auto-advances `submitted → verification_pending` | diff --git a/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json b/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json index 3cd7c90b1..389464f34 100644 --- a/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json +++ b/optional-skills/security/unbroker/references/brokers/fastpeoplesearch.json @@ -10,6 +10,7 @@ "fetch": "browser", "antibot": "datadome", "match_signal": "result", + "match_signal_notes": "SEO TRAP: title/H1/intro echoes the query ('Over 100+ FREE public records found for {Name}') with no real match behind it, and /name/{first}-{last} list pages are fuzzy-SURNAME namesakes (different states, no address overlap). Record `found` ONLY on a result CARD corroborated by the subject's address or DOB. 
Ignore templated title/intro/H1 text.", "by": ["name", "phone", "address"], "url_patterns": { "name": "https://www.fastpeoplesearch.com/name/{first}-{last}" diff --git a/optional-skills/security/unbroker/references/brokers/intelius.json b/optional-skills/security/unbroker/references/brokers/intelius.json index 01e8b85a2..78482811f 100644 --- a/optional-skills/security/unbroker/references/brokers/intelius.json +++ b/optional-skills/security/unbroker/references/brokers/intelius.json @@ -53,10 +53,13 @@ "gov_id": false, "account": false, "phone_callback": false, - "payment": false + "payment": false, + "dob": true }, "inputs": [ - "contact_email" + "contact_email", + "full_name", + "date_of_birth" ], "deletion": { "via": "in_flow", @@ -69,7 +72,8 @@ "playbook": [ "PeopleConnect portal (suppression.peopleconnect.us/login, privacy-center entry at /privacy-center) -- ONE flow here covers Truthfinder, Instant Checkmate, US Search, ZabaSearch, Classmates and ~15 more. DO THIS PARENT FIRST.", "Step 1 asks ONLY for an email + consent checkbox (no name/DOB, no CAPTCHA) -> sends a verification email. Least-disclosure entry: just the contact email.", - "poll-verification will pick up the verify link. The link authenticates a SESSION bound to the browser that OPENS it: the SAME agent browser must open the link and drive /guided-mode (a different browser is bounced to /login).", + "poll-verification will pick up the verify link. The link is a JWT (aud PeopleConnect-email-login then -registration), carries a deviceId, has a ~15-min TTL, and is Cloudflare-gated; it authenticates a SESSION bound to the browser that OPENS it. The SAME agent browser that submitted step 1 must open the link and drive guided-mode straight through. Do NOT hard-navigate to /guided-mode after auth -- that drops the in-memory session and bounces to /login. 
If the session is lost, re-request a fresh verify email and follow it through without navigating away.", + "guided-mode is a 5-STEP IDENTITY GATE, not a one-click suppress: (1) enter contact email + consent -> verify email; (2) open the verify link in the SAME browser (session/device-bound); (3) enter identity details -- this HARD-REQUIRES date of birth (immutable once saved, no skip) plus legal name; (4) Matching Records -- select the record that describes you, corroborating by address/email/phone, NOT name+DOB alone (namesakes exist); the matched record often aggregates MORE identifiers than the public listing showed (extra emails/addresses) -- expected, not alarming; (5) complete the SUPPRESSION action. So this opt-out discloses DOB + legal name + alias beyond the contact email -- collect DOB at intake (requires.dob=true) or expect a mid-flow pause.", "SUPPRESS, do NOT delete (this cluster is the exception to 'deletion beats suppression'). In guided-mode, complete the SUPPRESSION flow -- it puts you on the do-not-display list, which is what actually removes you from Intelius/TruthFinder/etc. Their privacy-center states: deleting your user data 'must delete any and all suppressions associated with your user', and 'Deleting your user information will NOT prevent other users from searching for your information through the people search websites. To suppress your information ... you must maintain your user information on file with the Suppression Center.'", "Therefore do NOT press 'Right to Delete / DELETE MY USER DATA' if the goal is search-visibility removal: it wipes your suppression and the public-records listing re-appears. Use the delete button ONLY if the operator's explicit goal is purging held account data (accept re-listing + re-suppression).", "Keep the account/suppression on file; do not delete it later. 
If the portal breaks: sister addresses privacy@intelius.com / privacy@truthfinder.com / privacy@instantcheckmate.com / support@ussearch.com / privacy@classmates.com; phone 1-888-245-1655.", @@ -78,7 +82,8 @@ "notes": "PeopleConnect portal covers the cluster via SUPPRESSION (maintained), not deletion (see the deletion lane note: delete removes suppressions and does not stop public-records re-listing). Authorized-agent requests: signed written authorization (full name, address, phone, the email the consumer uses) or POA; for Right-to-Delete they verify agent authority with the consumer by email. Verified from the live privacy policy + suppression privacy-center 2026-07-02.", "quirks": [ "Step 1 (suppression.peopleconnect.us/login) asks ONLY for an email + a consent checkbox, then 'Continue' -> a verification email with a link. No CAPTCHA, no name/DOB at step 1. Least-disclosure entry: just the contact email. Verified live 2026-06-30.", - "The verification link authenticates a SESSION and lands on /guided-mode. That session is bound to the browser that OPENED it; a different browser hitting /guided-mode is redirected back to /login. So for hands-off automation the SAME agent browser must open the verify link (Mode B: read inbox -> agent browser navigates the link -> drive guided-mode).", + "The verification link authenticates a SESSION and lands on /guided-mode. That session is bound to the browser that OPENED it; a different browser hitting /guided-mode is redirected back to /login. So for hands-off automation the SAME agent browser must open the verify link (Mode B: read inbox -> agent browser navigates the link -> drive guided-mode). Link is a JWT (aud PeopleConnect-email-login -> -registration) carrying a deviceId, ~15-min TTL, Cloudflare-gated. 
Do NOT hard-navigate to /guided-mode after auth (drops the in-memory session -> /login); if lost, re-request a fresh verify email and follow it straight through.", + "DOB GATE: guided-mode hard-requires date of birth (immutable once saved, no skip) to match records, so requires.dob=true. DOB is not collected at intake by default (sensitive, unneeded for scanning). If absent, the planner pre-warns (needs_operator_input) that this broker needs a human touchpoint; collect it with `intake --dob` up front to run hands-off. The matching step discloses DOB + legal name + alias beyond the contact email -- corroborate the record by address/email/phone, never name+DOB alone.", "INVERTED delete/suppress: SUPPRESSION is the do-not-display list and is what removes you from the people-search sites; it requires keeping your identifiers on file. 'DELETE MY USER DATA' deletes those suppressions and does NOT stop the sites showing you (public records re-list). Verbatim from the privacy-center: deleting user data 'must delete any and all suppressions associated with your user'; and 'Deleting your user information will NOT prevent other users from searching for your information ... To suppress your information ... you must maintain your user information on file with the Suppression Center.' So prefer suppression; use delete only for a deliberate data-purge. Verified live 2026-07-02.", "Their published request metrics (2025): 33,513 deletion requests, median response < 1 day -- deletion is fast, but per above it is the wrong lever for search-visibility on this cluster." 
], diff --git a/optional-skills/security/unbroker/references/brokers/rehold.json b/optional-skills/security/unbroker/references/brokers/rehold.json new file mode 100644 index 000000000..c8e633957 --- /dev/null +++ b/optional-skills/security/unbroker/references/brokers/rehold.json @@ -0,0 +1,48 @@ +{ + "id": "rehold", + "name": "Rehold", + "category": "property_records", + "priority": "long_tail", + "jurisdictions": [ + "US" + ], + "search": { + "method": "url_pattern", + "url": "https://rehold.com/", + "fetch": "browser", + "match_signal": "result", + "match_signal_notes": "PROPERTY-RECORD, NOT PII. An address match here shows only PUBLIC PROPERTY RECORDS (build year, beds/baths, last sale price, incident history). Resident/owner NAMES sit behind 'View full report', which leads to a paywall/signup, so no personal PII is publicly exposed. Public property records are NOT removable. Record `found` ONLY if a resident NAME matching the subject is publicly displayed on the free page; an address-only match is `not_found` (nothing to opt out of).", + "access": "paywall", + "by": [ + "address" + ] + }, + "optout": { + "tier": "T2", + "method": "web_form", + "url": "https://rehold.com/optout", + "requires": { + "profile_url": true, + "email_verification": false, + "captcha": false, + "gov_id": false, + "account": false, + "phone_callback": false, + "payment": false + }, + "inputs": [ + "profile_url" + ], + "notes": "Address-anchored property/reverse-address site. Only pursue an opt-out if the scan found a publicly displayed resident NAME for the subject (see match_signal_notes); a bare public property record is not personal PII and is not removable. 
If the subject's personal profile IS shown, submit the profile URL to the opt-out endpoint and confirm the live flow in a residential browser before the first submission, then set last_verified.", + "quirks": [ + "Distinguish 'address exists in a public property DB' (non-removable) from 'the subject's personal profile is displayed' (removable). Only the latter is an actionable exposure.", + "'View full report' is a paywall/signup, not proof of a public listing.", + "Opt-out endpoint UNVERIFIED: confirm the live flow before the first submission." + ], + "est_processing_days": 3, + "reappearance_risk": "low" + }, + "last_verified": null, + "source": "curated", + "confidence": "documented" +} diff --git a/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json b/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json index 0523fdab1..1ce9bc7b1 100644 --- a/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json +++ b/optional-skills/security/unbroker/references/brokers/truepeoplesearch.json @@ -10,6 +10,7 @@ "fetch": "browser", "antibot": "datadome", "match_signal": "result", + "match_signal_notes": "SEO TRAP: the page title/H1/intro auto-inserts the query ('FREE public records found for {Name} in {City}') even with ZERO real matches. That templated echo is NOT a result. Record `found` ONLY on an actual result CARD corroborated by the subject's address or DOB; unrelated same-name cards in other states are namesakes. 
Ignore the title/intro/H1 text entirely.", "by": ["name", "phone", "address", "email"], "url_patterns": { "name": "https://www.truepeoplesearch.com/results?name={First%20Last}&citystatezip={City,%20ST}" diff --git a/optional-skills/security/unbroker/references/methods.md b/optional-skills/security/unbroker/references/methods.md index ced0c8a80..14b4aafef 100644 --- a/optional-skills/security/unbroker/references/methods.md +++ b/optional-skills/security/unbroker/references/methods.md @@ -88,6 +88,28 @@ listing: third party's record - the consent gate correctly blocks acting on it. See "Indirect exposure" in the web_form section for what the subject *can* still request. +Two more false-positive traps that a naive scan records as `found` when it should not: + - **Property record != PII (address-anchored sites).** Reverse-address / property sites (rehold, + clustrmaps-style) can match on a public **property record** (build year, beds/baths, last sale + price, incidents) without exposing the subject's personal info - the resident/owner NAME is behind + a "View full report" paywall/signup. Distinguish "this address exists in a public property DB" + (non-removable, `not_found`) from "the subject's personal profile is displayed" (removable, + `found`). Record `found` ONLY if a resident name matching the subject is publicly shown; an + address-only match is `not_found` - there is nothing to opt out of, and public property records are + not removable anyway. See `rehold.json` `search.match_signal_notes`. + - **SEO-templated title/H1 fakes a "found".** Many people-search sites auto-insert the query into the + page `<title>`, H1, and intro copy ("FREE public records found for {Name} in {City}", "Over 100+ + FREE public records found for {Name}"). That echo is **templating, not a result** - the actual + result cards are often unrelated namesakes in other states. A `match_signal` on title/intro text + yields false positives. 
Require a real result **card** corroborated by the subject's address or + DOB, and ignore the templated title/intro/H1 entirely. See `truepeoplesearch.json` / + `fastpeoplesearch.json` `search.match_signal_notes`. + +Both are why the **parent re-verifies every `found` before acting** rule is load-bearing (`pdd.py show +<subject> <broker>` reads back a subagent's recorded evidence so the parent can re-verify without +re-deriving the listing URL). If a `found` turns out to be a false positive, correct it with a fresh +`record ... not_found` carrying an evidence note explaining the retraction. + ## web_form 1. `browser_navigate` to `optout.url`; `browser_snapshot` to read the form. @@ -182,6 +204,44 @@ stealth/operator-browser pass (`methods.md` → scan ladder 3b - the operator's browser is the reliable unblock). Without a cloud browser configured, soft-CAPTCHA brokers drop to T2 and become human tasks. **Never use a third-party CAPTCHA-defeating service.** +## Browser backends: scan vs execute + +Two different jobs need two different browsers. Getting this wrong is the single biggest cause of a +run stalling in Phase 2. + +- **Phase 1 (scan, read-only):** a cloud stealth browser (Browserbase) or the `scrapling` skill is + ideal. On a residential IP with a real fingerprint it passes managed challenges (Cloudflare + Turnstile, hCaptcha checkbox) and reads anti-bot people-search pages that `web_extract` and the + proxyless agent browser cannot. This is what the skill's `browser_backend` setting governs + (`auto` picks Browserbase when `BROWSERBASE_API_KEY` is present - now also read from + `$HERMES_HOME/.env`, not just the shell env, so `doctor`/`setup --auto` detect the key Hermes + already loads for its own tools). +- **Phase 2 (execute: opt-out forms, webmail sends, session-bound multi-step gates):** the work must + run in the **operator's own everyday browser** - real fingerprint, residential IP, AND the + operator's logged-in sessions. 
A headless cloud browser is the WRONG default here for two reasons: + (1) it is not signed into the operator's webmail, so browser-mode email sends and confirmation-link + opens have no inbox to act in; and (2) it is itself Cloudflare/DataDome-gated on exactly the + multi-step flows that matter (e.g. PeopleConnect guided-mode, whose verify link is session- and + device-bound to the browser that opens it - a cloud browser both fails the challenge and breaks the + binding). +- **How to drive the operator's browser (CDP).** Point Hermes's browser tools at the operator's real + Chrome over the DevTools protocol: launch + `chrome --remote-debugging-port=9222 --user-data-dir="$HOME/.hermes/chrome-debug"` and connect the + browser backend to `127.0.0.1:9222`. Use a **dedicated debug profile** (`chrome-debug`), NOT the + operator's Default Chrome profile, and have the operator sign into their webmail (and any needed + broker accounts) in that profile once. That single browser then carries residential IP + real + fingerprint + logged-in sessions, which is precisely what Phase-2 flows need. (This is a Hermes-side + browser setup, not a `pdd` config value; `browser_backend` above only selects the Phase-1 scan + browser.) +- **Always-available fallback:** if no CDP browser is wired up, use the operator-in-the-loop path + (scan ladder 3b) - hand over paste-ready URLs and field-by-field least-disclosure guidance, pausing + before submit. It never fails; it just needs a human present. + +Backend precedence, most to least autonomous: **operator Chrome over CDP** (Phase 2, hands-off once +the profile is signed in) > **Browserbase cloud stealth** (Phase 1 scanning, plus managed-captcha +forms that need no login) > **proxyless agent browser** (only already-unblocked sites) > +**operator-in-the-loop** (paste-ready URLs; the last-resort unblock that always works). 
+ ## Ownership clusters - DO PARENTS FIRST (playbooks live in the broker records) Many brokers are resold shells of a few parents, so **one parent removal clears a whole cluster of diff --git a/optional-skills/security/unbroker/scripts/autopilot.py b/optional-skills/security/unbroker/scripts/autopilot.py index 76d6f9caa..746106185 100644 --- a/optional-skills/security/unbroker/scripts/autopilot.py +++ b/optional-skills/security/unbroker/scripts/autopilot.py @@ -283,7 +283,7 @@ def next_actions(dossier: dict, brokers_list: list[dict], cfg: dict, "broker_id": bid, "command": f"python3 scripts/pdd.py poll-verification {subject_id} --broker {bid}", "then": "browser_navigate the returned link IN THE SAME AGENT BROWSER (sessions are " - f"browser-bound), complete the flow, then record: awaiting_processing", + "browser-bound), complete the flow, then record: awaiting_processing", }) elif email_mode == "browser": actions.append({ @@ -327,7 +327,21 @@ def next_actions(dossier: dict, brokers_list: list[dict], cfg: dict, # 5) indirect exposure: targeted delete-my-PII requests for row in groups.get("indirect_exposure") or []: bid = row["broker_id"] - if (email_mode in ("programmatic", "alias") and mail["smtp"]) or email_mode == "browser": + has_email = bool(row.get("optout_email") or (row.get("deletion") or {}).get("email")) + if not has_email and row.get("optout_url"): + # No email lane (e.g. ThatsThem is web-form-only): drive the opt-out FORM, submitting + # ONLY the subject's own identifiers to scrub from the third party's record. 
+ actions.append({ + "type": "indirect_web_form", + "broker_id": bid, "confirm_first": confirm_first, + "optout_url": row.get("optout_url"), + "steps": [f"browser_navigate {row.get('optout_url')}", + "submit ONLY the subject's own identifiers (the fields the form requires) to " + "remove them from the third party's record; disclose nothing extra", + "confirm the success state, screenshot into evidence/"], + "after": f"python3 scripts/pdd.py record {subject_id} {bid} submitted --channel web_form", + }) + elif (email_mode in ("programmatic", "alias") and mail["smtp"]) or email_mode == "browser": actions.append({ "type": "indirect_email_send", "broker_id": bid, "confirm_first": confirm_first, diff --git a/optional-skills/security/unbroker/scripts/config.py b/optional-skills/security/unbroker/scripts/config.py index 7f53554fe..e1eb1c3c7 100644 --- a/optional-skills/security/unbroker/scripts/config.py +++ b/optional-skills/security/unbroker/scripts/config.py @@ -62,6 +62,26 @@ def save_config(cfg: dict) -> Path: return storage.write_json(paths.config_path(), merged) +def dotenv_env() -> dict: + """Shell env overlaid on `$HERMES_HOME/.env`, so capability detection sees the creds Hermes + loads for its own tools (BROWSERBASE_API_KEY, EMAIL_*, AGENTMAIL_API_KEY, ...) even though the + terminal-tool shell doesn't export them. 
Shell env wins; the .env only fills gaps.""" + merged: dict = {} + p = paths.hermes_home() / ".env" + if p.exists(): + try: + for line in p.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, v = line.split("=", 1) + merged[k.strip()] = v.strip().strip('"').strip("'") + except OSError: + pass + merged.update(os.environ) + return merged + + def detect_capabilities(env: dict | None = None) -> dict: """Report which opt-in upgrades are available without extra setup.""" env = os.environ if env is None else env diff --git a/optional-skills/security/unbroker/scripts/ledger.py b/optional-skills/security/unbroker/scripts/ledger.py index e5ec331a1..2483ee6a8 100644 --- a/optional-skills/security/unbroker/scripts/ledger.py +++ b/optional-skills/security/unbroker/scripts/ledger.py @@ -21,7 +21,10 @@ TRANSITIONS: dict[str, set[str]] = { "new": {"searching", "found", "not_found", "indirect_exposure", "blocked"}, "searching": {"not_found", "found", "indirect_exposure", "blocked"}, "not_found": {"searching", "found", "indirect_exposure", "blocked"}, - "found": {"action_selected", "submitted", "human_task_queued", "indirect_exposure", "blocked"}, + # found -> not_found: a parent re-verification (or re-scan) found the "found" was a false + # positive (namesake, or an address-only property-record match) -- retract it with evidence. + "found": {"action_selected", "submitted", "human_task_queued", "indirect_exposure", "blocked", + "not_found"}, # indirect_exposure: subject's PII (email/phone/name) sits on a THIRD PARTY's record. The # self-service opt-out form does not apply; the lever is a targeted CCPA/GDPR delete-my-PII # request (-> submitted) or a human task. Re-scan can clear it (-> not_found) or upgrade it to a @@ -43,7 +46,10 @@ TRANSITIONS: dict[str, set[str]] = { # blocked: automated tools (web_extract/proxyless browser) couldn't read the site. 
A later pass # -- a stealth/cloud browser OR guiding the operator's own (residential) browser -- can resolve it # to any real scan verdict, so blocked reaches not_found / indirect_exposure too, not just found. - "blocked": {"searching", "found", "not_found", "indirect_exposure", "action_selected"}, + # blocked -> human_task_queued: some blocked sites need an operator step to proceed at all + # (face-recognition sites needing a selfie/gov-ID, etc.), so route them to the digest. + "blocked": {"searching", "found", "not_found", "indirect_exposure", "action_selected", + "human_task_queued"}, } diff --git a/optional-skills/security/unbroker/scripts/pdd.py b/optional-skills/security/unbroker/scripts/pdd.py index 09039ab47..6d28a3a45 100644 --- a/optional-skills/security/unbroker/scripts/pdd.py +++ b/optional-skills/security/unbroker/scripts/pdd.py @@ -58,9 +58,10 @@ def _require_subject(subject_id: str) -> dict: def cmd_setup(args) -> None: if getattr(args, "auto", False): - # Autonomous path: detect capabilities and pick the most autonomous valid - # config without asking anyone. Explicit flags still win below. - cfg = config_mod.auto_configure() + # Autonomous path: detect capabilities and pick the most autonomous valid config without + # asking anyone. Read creds from $HERMES_HOME/.env too (the terminal shell doesn't export + # them). Explicit flags still win below. 
+ cfg = config_mod.auto_configure(env=config_mod.dotenv_env()) else: cfg = config_mod.load_config() for key in ("autonomy", "email_mode", "browser_backend", "tracker_backend", "encryption"): @@ -123,7 +124,7 @@ def cmd_doctor(args) -> None: import platform cfg = config_mod.load_config() - caps = config_mod.detect_capabilities() + caps = config_mod.detect_capabilities(config_mod.dotenv_env()) # see creds in $HERMES_HOME/.env too data = paths_mod.data_dir() writable = _check_writable(data) curated = len(brokers_mod._load_curated()) @@ -185,8 +186,17 @@ def cmd_doctor(args) -> None: "verify links via your logged-in webmail); or set EMAIL_* for SMTP/IMAP.") elif cfg["email_mode"] == "browser": L.append(" Email mode: browser (no password) - the agent sends opt-outs and opens verify " - "links via the operator's logged-in webmail. Ensure that inbox is signed in in the " - "browser Hermes uses (a cloud browser won't hold the session); else it falls back to drafts.") + "links via the operator's logged-in webmail. This needs Hermes pointed at the " + "operator's OWN Chrome over CDP (launch with --remote-debugging-port=9222 " + "--user-data-dir=~/.hermes/chrome-debug, signed into the webmail once); else it falls " + "back to drafts. See methods.md 'Browser backends'.") + cloud_scan = cfg.get("browser_backend") == "browserbase" or ( + cfg.get("browser_backend") == "auto" and caps.get("browserbase")) + if cloud_scan: + L.append(" NOTE: your scan backend is a cloud browser (Browserbase). It is great for " + "Phase-1 scanning but CANNOT be the browser that sends webmail (no inbox session) " + "and is itself Cloudflare/DataDome-gated on session-bound gates (e.g. PeopleConnect). " + "For Phase-2 email/verify, drive the operator's Chrome over CDP as above.") if not crypto.is_engaged(): L.append(" Storage: dossiers are PLAINTEXT JSON (0600 under HERMES_HOME). 
" "Run `setup --encryption age` for at-rest encryption.") @@ -390,18 +400,23 @@ def cmd_fanout(args) -> None: batches = [] for i, ids in enumerate(grouping["batches"], 1): brief = ( - f"You are scan worker {i} of {len(grouping['batches'])} for the `unbroker` " - f"skill. First load the `unbroker` skill and read its references/methods.md. " - f"Subject id: {args.subject}. Handle ONLY these brokers: {', '.join(ids)}. " + f"You are scan worker {i} of {len(grouping['batches'])} for the `unbroker` skill. First " + f"load the `unbroker` skill and read its references/methods.md. Use the `web` toolset " + f"(web_search `site:` + web_extract), NOT `browser` (browser navigation is heavy and times " + f"out). Subject id: {args.subject}. Handle ONLY these brokers: {', '.join(ids)}. " f"For EACH broker: read references/brokers/<id>.json; run EVERY search vector from " f"`pdd.py plan {args.subject}` (filtered to your brokers); build URLs from search.url_patterns " f"and heed url_format_quirks; a 404 is INCONCLUSIVE (rebuild/try the on-site search box), not " - f"not_found; confirm the SUBJECT vs namesakes/relatives before recording; if search.antibot is " - f"set and no stealth/cloud browser is available, record `blocked`. Record each outcome via " - f"`pdd.py record {args.subject} <broker> <found|not_found|indirect_exposure|blocked> " - f"--found <bool> --evidence '{{\"listing_urls\":[...]}}'`. Mode: {mode}. " - f"Log any newly-discovered URL/format quirks into the broker JSON. " - f"Return a concise structured per-broker report." + f"not_found. ECONOMY: at most ~3 web calls per broker; the moment a page shows antibot " + f"(Cloudflare 'just a moment'/DataDome) or hangs, record `blocked` and move on -- do NOT " + f"retry-loop. 
Confirm the SUBJECT vs namesakes/relatives by ADDRESS/DOB before recording " + f"`found` (ignore SEO-templated page titles/intro that just echo the query -- require a real " + f"result card; a public property/address record with no displayed personal NAME is " + f"not_found, not found). Record each outcome via `pdd.py record {args.subject} <broker> " + f"<found|not_found|indirect_exposure|blocked> --found <bool> --evidence '{{\"listing_urls\":[...]}}'`. " + f"Mode: {mode}. Broker JSON files are READ-ONLY for you -- do NOT edit them; if you discover " + f"a URL/quirk, put it in your report for the parent to fold in. Return a concise structured " + f"per-broker report." ) batches.append({"batch": i, "brokers": ids, "brief": brief}) _out({ @@ -631,6 +646,19 @@ def cmd_due(args) -> None: "note": "run `next` for the concrete follow-up action per case"}) +def cmd_show(args) -> None: + """Read a case's recorded state + evidence (so the parent can re-verify a subagent's `found` + without re-deriving listing URLs).""" + _require_subject(args.subject) + case = ledger_mod.get_case(args.subject, args.broker) + _out({"broker": args.broker, "state": case.get("state"), "found": case.get("found"), + "evidence": case.get("evidence") or {}, + "disclosure_log": case.get("disclosure_log") or [], + "next_recheck_at": case.get("next_recheck_at"), + "human_task_reason": case.get("human_task_reason"), + "history": case.get("history") or []}) + + def cmd_status(args) -> None: _require_subject(args.subject) print(report_mod.render_markdown(args.subject)) @@ -716,7 +744,7 @@ def build_parser() -> argparse.ArgumentParser: s = sub.add_parser("fanout", help="batch brokers into parallel delegate_task subagents (large runs)") s.add_argument("subject") s.add_argument("--priority", action="append", choices=["crucial", "high", "standard", "long_tail"]) - s.add_argument("--size", type=int, default=8, help="brokers per subagent batch (default 8)") + s.add_argument("--size", type=int, default=5, 
help="brokers per subagent batch (default 5; 8+ times out)") s.add_argument("--optout", action="store_true", help="brief authorizes opt-out submission (default: read-only scan)") s.set_defaults(func=cmd_fanout) @@ -769,6 +797,11 @@ def build_parser() -> argparse.ArgumentParser: s.add_argument("subject") s.set_defaults(func=cmd_tasks) + s = sub.add_parser("show", help="read a case's state + evidence (for parent re-verification)") + s.add_argument("subject") + s.add_argument("broker") + s.set_defaults(func=cmd_show) + s = sub.add_parser("due", help="cases whose recheck window has arrived (cron re-scan queue)") s.add_argument("subject") s.set_defaults(func=cmd_due) diff --git a/optional-skills/security/unbroker/scripts/tiers.py b/optional-skills/security/unbroker/scripts/tiers.py index 8d145b8ce..d83efcf33 100644 --- a/optional-skills/security/unbroker/scripts/tiers.py +++ b/optional-skills/security/unbroker/scripts/tiers.py @@ -17,6 +17,8 @@ HARD_HUMAN = ("gov_id", "fax", "mail", "phone_voice") def select_tier(broker: dict, email_mode: str = "draft_only", browser_clears_captcha: bool = False) -> str: req = ((broker.get("optout") or {}).get("requires")) or {} + if not isinstance(req, dict): + req = {} # defensive: a malformed record (e.g. requires as a list) must not crash planning if any(req.get(k) for k in HARD_HUMAN): return "T3" @@ -43,9 +45,20 @@ def plan(subject_dossier: dict, brokers_list: list[dict], cfg: dict, for b in brokers_list: opt = b.get("optout") or {} search = b.get("search") or {} + # Defensive shape coercion: a subagent may have written a malformed record (requires as a + # list, quirks as a string). Normalize here so nothing downstream crashes on a bad broker file. 
+ req = opt.get("requires") if isinstance(opt.get("requires"), dict) else {} + q = opt.get("quirks") + quirks = q if isinstance(q, list) else ([q] if isinstance(q, str) and q else []) tier = select_tier(b, email_mode, browser_clears_captcha) disclosure = dossier_mod.select_disclosure(subject_dossier, opt.get("inputs", [])) svectors = vectors_mod.search_vectors(subject_dossier, b) + # Pre-warn (don't discover mid-flow): a broker whose identity gate hard-requires DOB will + # force a human touchpoint if DOB was not collected at intake (§4.1). Surface it now. + prewarn: list[str] = [] + if req.get("dob") and not (subject_dossier.get("identity") or {}).get("date_of_birth"): + prewarn.append("date_of_birth: this broker's identity gate requires DOB to match records; " + "collect it up front (intake --dob) or expect a mid-flow human pause") actions.append({ "broker_id": b.get("id"), "broker_name": b.get("name"), @@ -61,10 +74,11 @@ def plan(subject_dossier: dict, brokers_list: list[dict], cfg: dict, "optout_url": opt.get("url"), "optout_email": opt.get("email"), "disclosure_fields": sorted(disclosure.keys()), + "needs_operator_input": prewarn, "owns": b.get("owns") or [], "notes": opt.get("notes", ""), - "optout_quirks": opt.get("quirks") or [], - "optout_requires": opt.get("requires") or {}, + "optout_quirks": quirks, + "optout_requires": req, # The DELETION lane (right-to-delete), distinct from listing suppression. Structured so # the autopilot can route to it: {via: email|in_flow|web_form, email?, url?, kinds?, notes?} "deletion": opt.get("deletion") or {}, @@ -75,7 +89,7 @@ def plan(subject_dossier: dict, brokers_list: list[dict], cfg: dict, return actions -def fanout(brokers_list: list[dict], batch_size: int = 8) -> dict: +def fanout(brokers_list: list[dict], batch_size: int = 5) -> dict: """Group brokers into batches for parallel `delegate_task` scan subagents. 
Scanning many brokers serially is slow and burns context; above `batch_size` diff --git a/tests/skills/test_unbroker_skill.py b/tests/skills/test_unbroker_skill.py index f4342d7e2..77599c934 100644 --- a/tests/skills/test_unbroker_skill.py +++ b/tests/skills/test_unbroker_skill.py @@ -472,6 +472,14 @@ def test_fanout_batches_large_runs(): assert small["should_fanout"] is False and small["batches"] == [["x", "y"]] +def test_fanout_default_batch_size_is_five(): + # Field report: 8-broker batches time out; the default dropped to 5. + g = tiers.fanout([{"id": f"b{i}"} for i in range(12)]) + assert all(len(b) <= 5 for b in g["batches"]) + assert g["batches"][0] == [f"b{i}" for i in range(5)] + assert len(g["batches"]) == 3 # 5 + 5 + 2 + + def test_plan_surfaces_antibot(): d = _consenting() broker = {"id": "tps", "optout": {"requires": {}}, "search": {"antibot": "datadome", "by": ["name"]}} @@ -479,6 +487,21 @@ def test_plan_surfaces_antibot(): assert actions[0]["antibot"] == "datadome" +def test_plan_prewarns_when_dob_required_but_missing(): + # requires.dob gated broker (e.g. PeopleConnect guided-mode): warn up front, not mid-flow. + broker = {"id": "intelius", "search": {"by": ["name"]}, + "optout": {"requires": {"dob": True, "email_verification": True}, "inputs": ["contact_email"]}} + no_dob = _consenting() + no_dob["identity"].pop("date_of_birth") + warned = tiers.plan(no_dob, [broker], config.DEFAULT_CONFIG)[0] + assert any("date_of_birth" in w for w in warned["needs_operator_input"]) + # A new requires key must not perturb tier selection. 
+ assert warned["tier"] == tiers.select_tier( + {"optout": {"requires": {"email_verification": True}}}, "draft_only") + with_dob = tiers.plan(_consenting(), [broker], config.DEFAULT_CONFIG)[0] + assert with_dob["needs_operator_input"] == [] + + def test_plan_surfaces_optout_quirks_and_email(): d = _consenting() broker = {"id": "radaris", "search": {"by": ["name"]}, @@ -1269,6 +1292,43 @@ def test_send_email_is_idempotent_browser_mode(): assert again.get("skipped") is True # not re-sent +def test_show_reads_back_case_state_and_evidence(): + with temp_env(): + sid = _run(["intake", "--full-name", "Jane Q. Public", + "--email", "jane@example.com", "--consent"])["subject_id"] + _run(["record", sid, "radaris", "found", "--found", "true", + "--evidence", '{"listing_urls": ["https://radaris.com/p/x"]}']) + shown = _run(["show", sid, "radaris"]) + assert shown["broker"] == "radaris" and shown["state"] == "found" + assert shown["found"] is True + assert shown["evidence"].get("listing_urls") == ["https://radaris.com/p/x"] + # Unknown case returns a fresh (new) case, not an error. 
+ empty = _run(["show", sid, "not_a_broker"]) + assert empty["state"] == "new" and empty["evidence"] == {} + + +def test_dotenv_env_fills_missing_creds_and_shell_wins(): + prev_home = os.environ.get("HERMES_HOME") + prev_key = os.environ.get("BROWSERBASE_API_KEY") + with tempfile.TemporaryDirectory() as d: + os.environ["HERMES_HOME"] = d + (Path(d) / ".env").write_text( + '# comment\nBROWSERBASE_API_KEY="from_dotenv"\nFIRECRAWL_API_KEY=fc_123\n', encoding="utf-8") + try: + os.environ.pop("BROWSERBASE_API_KEY", None) + merged = config.dotenv_env() + assert merged["BROWSERBASE_API_KEY"] == "from_dotenv" # filled from .env + assert merged["FIRECRAWL_API_KEY"] == "fc_123" # quotes/comment handled + os.environ["BROWSERBASE_API_KEY"] = "from_shell" + assert config.dotenv_env()["BROWSERBASE_API_KEY"] == "from_shell" # shell wins + finally: + for k, v in (("HERMES_HOME", prev_home), ("BROWSERBASE_API_KEY", prev_key)): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + def test_registry_candidate_urls_newest_first_with_floor(): urls = registry.ca_candidate_urls(__import__("datetime").date(2027, 3, 1)) assert urls[0].endswith("registry2027.csv") and urls[-1].endswith("registry2025.csv")