fix(gateway): block config.yaml media delivery and fix triggering tip

Two related fixes for a bug where /new could cause config.yaml to be sent as a Discord attachment: 1. Add config.yaml to the _media_delivery_denied_paths denylist in gateway/platforms/base.py. This prevents any accidental delivery of config.yaml as a native file attachment, matching the existing protection for .env, auth.json, and credentials/. 2. Reword the tip that triggered this: the tip 'hermes chat --ignore-user-config skips ~/.hermes/config.yaml' contained a bare home-relative path to config.yaml. When randomly selected during /new, the extract_local_files() method in _process_message_background would match ~/.hermes/config.yaml as a local file path, find it exists, and dispatch it as a native Discord document attachment.
fix(cli): stop OSC 11 bg probe from trapping users in a stray editor (#35441)
2026-05-30 19:30:48 +00:00 · 2026-05-30 11:55:12 -05:00 · 2026-05-30 07:58:08 -07:00 · 2026-05-30 07:55:36 -07:00 · 2026-05-30 07:45:26 -07:00 · 2026-05-30 07:41:04 -07:00
995 changed files with 33802 additions and 5489 deletions
--- a/.github/workflows/contributor-check.yml
+++ b/.github/workflows/contributor-check.yml
@ -3,11 +3,9 @@ name: Contributor Attribution Check
 on:
  pull_request:
    branches: [main]
-    paths:
-      # Only run when code files change (not docs-only PRs)
-      - '*.py'
-      - '**/*.py'
-      - '.github/workflows/contributor-check.yml'
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).

 permissions:
  contents: read
@ -20,7 +18,21 @@ jobs:
        with:
          fetch-depth: 0  # Full history needed for git log

+      - name: Check if relevant files changed
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          CHANGED=$(git diff --name-only "$BASE"..."$HEAD" -- '*.py' '**/*.py' '.github/workflows/contributor-check.yml' || true)
+          if [ -n "$CHANGED" ]; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run=false" >> "$GITHUB_OUTPUT"
+            echo "No Python files changed, skipping attribution check."
+          fi
+
      - name: Check for unmapped contributor emails
+        if: steps.filter.outputs.run == 'true'
        run: |
          # Get the merge base between this PR and main
          MERGE_BASE=$(git merge-base origin/main HEAD)
--- a/.github/workflows/deploy-site.yml
+++ b/.github/workflows/deploy-site.yml
@ -22,7 +22,12 @@ concurrency:

 jobs:
  deploy-vercel:
-    if: github.event_name == 'release'
+    # Triggered automatically on release publish (production cuts) and
+    # manually via `gh workflow run deploy-site.yml` when an out-of-band
+    # main commit needs to ship live before the next release tag — e.g.
+    # a skills-index PR that doesn't touch website/** paths and so
+    # doesn't auto-deploy via the deploy-docs path.
+    if: github.event_name == 'release' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    steps:
      - name: Trigger Vercel Deploy
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@ -196,10 +196,26 @@ jobs:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

-      # Build once, load into the local daemon for smoke testing.  Cached
-      # to gha with a per-arch scope; the push step below reuses every
-      # layer from this build.
-      - name: Build image (arm64, smoke test)
+      # Build once, load into the local daemon for smoke testing. PR arm64
+      # builds deliberately avoid the gha cache: cold-cache arm64 builds can
+      # outlive GitHub's short-lived Azure cache SAS token, then fail while
+      # reading or writing cache blobs before the smoke test can run.
+      - name: Build image (arm64, smoke test, uncached PR)
+        if: github.event_name == 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        with:
+          context: .
+          file: Dockerfile
+          load: true
+          platforms: linux/arm64
+          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
+
+      # Main/release builds still use the per-arch gha cache so the digest
+      # push below can reuse layers from this smoke-test build.
+      - name: Build image (arm64, smoke test, cached publish)
+        if: github.event_name != 'pull_request'
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
--- a/.github/workflows/supply-chain-audit.yml
+++ b/.github/workflows/supply-chain-audit.yml
@ -3,15 +3,9 @@ name: Supply Chain Audit
 on:
  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '**/*.py'
-      - '**/*.pth'
-      - '**/setup.py'
-      - '**/setup.cfg'
-      - '**/sitecustomize.py'
-      - '**/usercustomize.py'
-      - '**/__init__.pth'
-      - 'pyproject.toml'
+  # No paths filter — the jobs must always run so required checks
+  # report a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).

 permissions:
  pull-requests: write
@ -27,8 +21,44 @@ permissions:
 # advisory-only workflow instead.

 jobs:
+  # ── Path filter (shared by both scan and dep-bounds) ───────────────
+  changes:
+    runs-on: ubuntu-latest
+    outputs:
+      # True when any file the scanner cares about changed in this PR
+      scan: ${{ steps.filter.outputs.scan }}
+      # True when pyproject.toml changed in this PR
+      deps: ${{ steps.filter.outputs.deps }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          fetch-depth: 0
+      - name: Check for relevant file changes
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          SCAN_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- \
+            '*.py' '**/*.py' '*.pth' '**/*.pth' \
+            'setup.py' 'setup.cfg' \
+            'sitecustomize.py' 'usercustomize.py' '__init__.pth' \
+            'pyproject.toml' || true)
+          if [ -n "$SCAN_FILES" ]; then
+            echo "scan=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "scan=false" >> "$GITHUB_OUTPUT"
+          fi
+          DEPS_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- 'pyproject.toml' || true)
+          if [ -n "$DEPS_FILES" ]; then
+            echo "deps=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "deps=false" >> "$GITHUB_OUTPUT"
+          fi
+
  scan:
    name: Scan PR for critical supply chain risks
+    needs: changes
+    if: needs.changes.outputs.scan == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@ -147,10 +177,24 @@ jobs:
          echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
          exit 1

+  # Gate: reports success when scan was skipped (no relevant files changed).
+  # This ensures the required check always gets a status.
+  scan-gate:
+    name: Scan PR for critical supply chain risks
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped — without it, a failed dependency would leave the required
+    # check unreported (i.e. "pending"), the exact failure mode this fixes.
+    if: always() && needs.changes.outputs.scan != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No supply-chain-relevant files changed, skipping scan."
+
  dep-bounds:
    name: Check PyPI dependency upper bounds
+    needs: changes
+    if: needs.changes.outputs.deps == 'true'
    runs-on: ubuntu-latest
-    if: contains(github.event.pull_request.changed_files_url, 'pyproject.toml') || true
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@ -211,3 +255,16 @@ jobs:
        run: |
          echo "::error::PyPI dependencies without upper bounds detected. Add <next_major ceiling per CONTRIBUTING.md policy."
          exit 1
+
+  # Gate: reports success when dep-bounds was skipped (no pyproject.toml changed).
+  # This ensures the required check always gets a status.
+  dep-bounds-gate:
+    name: Check PyPI dependency upper bounds
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped — without it, a failed dependency would leave the required
+    # check unreported (i.e. "pending"), the exact failure mode this fixes.
+    if: always() && needs.changes.outputs.deps != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No pyproject.toml changes, skipping dependency bounds check."
--- a/.gitignore
+++ b/.gitignore
@ -92,3 +92,7 @@ docs/superpowers/*
 # also created in-repo when an agent operates in this checkout). Plans, audit
 # logs, and per-session caches are never artifacts of the codebase.
 .hermes/
+
+# Tool Search live-test harness output — non-deterministic model transcripts,
+# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
+scripts/out/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -43,7 +43,7 @@ Bundled skills (in `skills/`) ship with every Hermes install. They should be **b
 - Document handling, web research, common dev workflows, system administration
 - Used regularly by a wide range of people

-If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, builtin trust).
+If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, built-in trust).

 If your skill is specialized, community-contributed, or niche, it's better suited for a **Skills Hub** — upload it to a skills registry and share it in the [Nous Research Discord](https://discord.gg/NousResearch). Users can install it with `hermes skills install`.

--- a/Dockerfile
+++ b/Dockerfile
@ -25,7 +25,7 @@ ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
 # hermes process, the dashboard, and per-profile gateways.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-    ca-certificates curl python3 python-is-python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \
+    ca-certificates curl iputils-ping python3 python-is-python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \
    rm -rf /var/lib/apt/lists/*

 # ---------- s6-overlay install ----------
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,9 @@
 graft skills
 graft optional-skills
+# Bundled plugin manifests (plugin.yaml / plugin.yml). Without these the
+# PluginManager scan (hermes_cli/plugins.py) finds zero plugins on installs
+# built from the sdist (e.g. Homebrew, downstream packagers). package-data
+# below covers the wheel; this covers the sdist. See #34034 / #28149.
+recursive-include plugins plugin.yaml plugin.yml
 global-exclude __pycache__
 global-exclude *.py[cod]
--- a/RELEASE_v0.15.1.md
+++ b/RELEASE_v0.15.1.md
@ -0,0 +1,110 @@
+# Hermes Agent v0.15.1 (v2026.5.29)
+
+**Release Date:** May 29, 2026
+**Since v0.15.0:** 28 commits · 21 merged PRs · hotfix release · 9 contributors
+
+> **The Patch Release.** A same-day hotfix for v0.15.0. Headline fix: the dashboard infinite-reload loop that hit anyone running v0.15.0 in loopback mode (Docker, hosted Hermes, fresh installs). A handful of other v0.15.0 follow-ups go along for the ride — kanban worker SIGTERM, `/model` picker unification, `/yolo` session bypass, the full 19,932-entry skills.sh catalog, `.md` media delivery restoration, gateway probe-stepdown safety, web-URL redaction passthrough, kanban worker vision on referenced images, hindsight observation-default. Docker users get an explicit `--insecure` opt-in env var (no more bind-host inference), MCP server bare-command PATH resolution, and arm64 PR-build cache fixes.
+
+---
+
+## ✨ Highlights
+
+- **Dashboard 401 reload loop fixed** — In loopback mode the dashboard's identity probe (`/api/auth/me`) returns 401 by design, but v0.15.0's stale-token reload guard treated every 401 as a rotated session token and full-page-reloaded to pick up a fresh one. Every successful sibling call cleared the one-shot reload guard, so the page reload-looped forever (Firefox: "Navigated to /sessions" storm; Chrome: React re-render storm). Fix adds an `allowUnauthorized` opt-out to `fetchJSON` that skips only the loopback stale-token reload — 401 still throws so `AuthWidget` swallows it, gated-mode `login_url` redirects are unaffected. Closes [#34206](https://github.com/NousResearch/hermes-agent/issues/34206), [#34202](https://github.com/NousResearch/hermes-agent/issues/34202). ([#30698](https://github.com/NousResearch/hermes-agent/pull/30698) — @austinpickett)
+
+- **Docker dashboard `--insecure` is now an explicit env opt-in, never derived from bind host** — Previously the Docker entrypoint inferred `--insecure` when the dashboard bound to a non-loopback host. That conflated "I want LAN access" with "I want to disable the same-origin guard." The fix splits them: bind host is bind host, and disabling the dashboard's loopback auth requires an explicit `HERMES_DASHBOARD_INSECURE=1`. Existing setups that genuinely wanted insecure binding must now set the env var. ([#34188](https://github.com/NousResearch/hermes-agent/pull/34188), [#34204](https://github.com/NousResearch/hermes-agent/pull/34204) — @benbarclay)
+
+- **MCP bare command resolution under Docker** — MCP servers configured with bare commands (`npx`, `npm`, `node`) now resolve against `/usr/local/bin` so they actually launch inside the Docker image where those binaries live. v0.15.0 left these failing silently in containers when the agent's effective PATH didn't include the Node toolchain location. ([#34186](https://github.com/NousResearch/hermes-agent/pull/34186) — @benbarclay)
+
+- **Skills page sidebar / source pills restored** — A stale `useMemo` dependency in the new dashboard skills page collapsed the source pills and category sidebar to "All" only. Fixed; both surfaces now reflect the live catalog state. ([#34194](https://github.com/NousResearch/hermes-agent/pull/34194))
+
+- **Kanban worker can be killed again** — `SIGTERM` on a kanban worker was being absorbed by an intermediate process and the worker stayed running. Closes [#28181](https://github.com/NousResearch/hermes-agent/issues/28181). ([#34045](https://github.com/NousResearch/hermes-agent/pull/34045))
+
+- **Full skills.sh catalog (858 → 19,932 entries)** — The skills hub page was pulling a partial paginated catalog. The fetch now walks the sitemap, so all 19,932 skills.sh entries surface in the picker instead of just the first 858. ([#34025](https://github.com/NousResearch/hermes-agent/pull/34025))
+
+---
+
+## 🐛 Bug Fixes
+
+### Dashboard / Web
+
+- **`/api/auth/me` 401 no longer triggers reload loop** in loopback mode — ([#30698](https://github.com/NousResearch/hermes-agent/pull/30698) — @austinpickett)
+- **Skills page source pills + category sidebar restored** — stale `useMemo` dep ([#34194](https://github.com/NousResearch/hermes-agent/pull/34194))
+
+### Docker
+
+- **`--insecure` is now explicit opt-in via env var**, not derived from bind host ([#34188](https://github.com/NousResearch/hermes-agent/pull/34188) — @benbarclay)
+- **Dashboard test suite repaired** to match the insecure-opt-in fix ([#34204](https://github.com/NousResearch/hermes-agent/pull/34204) — @benbarclay)
+- **arm64 PR builds skip the GHA cache** — cold-cache arm64 builds could outlive the short-lived cache token and fail before the smoke test ran ([#33704](https://github.com/NousResearch/hermes-agent/pull/33704) — @BROCCOLO1D)
+
+### MCP
+
+- **Bare `npx`/`npm`/`node` resolve against `/usr/local/bin`** for Docker compatibility ([#34186](https://github.com/NousResearch/hermes-agent/pull/34186) — @benbarclay)
+
+### Kanban
+
+- **Worker SIGTERM actually terminates the process** ([#34045](https://github.com/NousResearch/hermes-agent/pull/34045))
+- **Workers receive images referenced in task bodies** for vision-capable models ([#34210](https://github.com/NousResearch/hermes-agent/pull/34210))
+
+### Gateway
+
+- **`.md` files deliver again** — media-delivery validation defaults to denylist-only instead of an overly-narrow allowlist ([#34022](https://github.com/NousResearch/hermes-agent/pull/34022))
+- **Probe stepdown safety** — on a context-overflow without an explicit provider context limit, the agent no longer steps down to a smaller model based on an unknown ceiling (salvage of [#33673](https://github.com/NousResearch/hermes-agent/pull/33673)) ([#33826](https://github.com/NousResearch/hermes-agent/pull/33826))
+
+### CLI
+
+- **`/yolo` mid-session enables the per-session bypass** instead of just toggling the env var (which the running agent had already snapshotted) ([#33931](https://github.com/NousResearch/hermes-agent/pull/33931) — @kshitijk4poor)
+- **`/model` and `hermes model` show the same list**, plus disk cache for picker startup ([#33867](https://github.com/NousResearch/hermes-agent/pull/33867))
+
+### Skills
+
+- **Full skills.sh catalog via sitemap** — 858 → 19,932 entries ([#34025](https://github.com/NousResearch/hermes-agent/pull/34025))
+
+### Redaction
+
+- **Web URLs pass through unchanged** — the redactor was eating query parameters that looked credential-shaped ([#34029](https://github.com/NousResearch/hermes-agent/pull/34029))
+
+---
+
+## ✨ Small Features
+
+- **Hindsight default narrowed to observation-only** for `recall_types` — tool path is also narrowed ([#34079](https://github.com/NousResearch/hermes-agent/pull/34079) — @nicoloboschi, follow-up [#34091](https://github.com/NousResearch/hermes-agent/pull/34091) — @kshitijk4poor)
+- **Memory providers receive completed-turn message context** — salvage of [#28065](https://github.com/NousResearch/hermes-agent/pull/28065) ([#34097](https://github.com/NousResearch/hermes-agent/pull/34097) — @kshitijk4poor, credit to @devwdave)
+
+---
+
+## 📚 Documentation
+
+- **`--no-supervise` / `HERMES_GATEWAY_NO_SUPERVISE` documented** in the reference docs (follow-up to [#33583](https://github.com/NousResearch/hermes-agent/pull/33583)) ([#33751](https://github.com/NousResearch/hermes-agent/pull/33751) — @r266-tech)
+
+---
+
+## 🛠️ Infrastructure
+
+- **Vercel deploy workflow accepts `workflow_dispatch`** so docs deploys can be manually triggered ([#34081](https://github.com/NousResearch/hermes-agent/pull/34081))
+- **`@nous-research/ui` bumped to 0.18.2** (Nix `npmDepsHash` also updated to match) ([#34193](https://github.com/NousResearch/hermes-agent/pull/34193) follow-ups — @austinpickett)
+
+---
+
+## 👥 Contributors
+
+### Core
+- @teknium1
+
+### Community
+- @austinpickett — dashboard 401 reload-loop fix (the headline), `@nous-research/ui` bump, Nix `npmDepsHash` updates
+- @benbarclay — Docker `--insecure` opt-in, MCP bare-command resolution, dashboard test repair
+- @kshitijk4poor — `/yolo` session bypass, completed-turn memory context salvage, hindsight follow-up docs
+- @nicoloboschi — hindsight `recall_types` observation default
+- @BROCCOLO1D — arm64 PR build cache fix
+- @r266-tech — `--no-supervise` reference docs
+- @yangguangjin — probe stepdown safety (salvage of @yanghd's #33673)
+- @devwdave — completed-turn memory context (credited via salvage)
+- @andrewhosf — co-author
+
+### Issue Reporters (the 401 loop)
+- @routesmith ([#34206](https://github.com/NousResearch/hermes-agent/issues/34206))
+- @beeaton ([#34202](https://github.com/NousResearch/hermes-agent/issues/34202))
+
+---
+
+**Full Changelog**: [v2026.5.28...v2026.5.29](https://github.com/NousResearch/hermes-agent/compare/v2026.5.28...v2026.5.29)
--- a/acp_adapter/tools.py
+++ b/acp_adapter/tools.py
@ -907,72 +907,6 @@ def _build_polished_completion_content(
    return [_text(text)]


-def _build_patch_mode_content(patch_text: str) -> List[Any]:
-    """Parse V4A patch mode input into ACP diff blocks when possible."""
-    if not patch_text:
-        return [acp.tool_content(acp.text_block(""))]
-
-    try:
-        from tools.patch_parser import OperationType, parse_v4a_patch
-
-        operations, error = parse_v4a_patch(patch_text)
-        if error or not operations:
-            return [acp.tool_content(acp.text_block(patch_text))]
-
-        content: List[Any] = []
-        for op in operations:
-            if op.operation == OperationType.UPDATE:
-                old_chunks: list[str] = []
-                new_chunks: list[str] = []
-                for hunk in op.hunks:
-                    old_lines = [line.content for line in hunk.lines if line.prefix in {" ", "-"}]
-                    new_lines = [line.content for line in hunk.lines if line.prefix in {" ", "+"}]
-                    if old_lines or new_lines:
-                        old_chunks.append("\n".join(old_lines))
-                        new_chunks.append("\n".join(new_lines))
-
-                old_text = "\n...\n".join(chunk for chunk in old_chunks if chunk)
-                new_text = "\n...\n".join(chunk for chunk in new_chunks if chunk)
-                if old_text or new_text:
-                    content.append(
-                        acp.tool_diff_content(
-                            path=op.file_path,
-                            old_text=old_text or None,
-                            new_text=new_text or "",
-                        )
-                    )
-                continue
-
-            if op.operation == OperationType.ADD:
-                added_lines = [line.content for hunk in op.hunks for line in hunk.lines if line.prefix == "+"]
-                content.append(
-                    acp.tool_diff_content(
-                        path=op.file_path,
-                        new_text="\n".join(added_lines),
-                    )
-                )
-                continue
-
-            if op.operation == OperationType.DELETE:
-                content.append(
-                    acp.tool_diff_content(
-                        path=op.file_path,
-                        old_text=f"Delete file: {op.file_path}",
-                        new_text="",
-                    )
-                )
-                continue
-
-            if op.operation == OperationType.MOVE:
-                content.append(
-                    acp.tool_content(acp.text_block(f"Move file: {op.file_path} -> {op.new_path}"))
-                )
-
-        return content or [acp.tool_content(acp.text_block(patch_text))]
-    except Exception:
-        return [acp.tool_content(acp.text_block(patch_text))]
-
-
 def _strip_diff_prefix(path: str) -> str:
    raw = str(path or "").strip()
    if raw.startswith(("a/", "b/")):
--- a/acp_registry/agent.json
+++ b/acp_registry/agent.json
@ -1,7 +1,7 @@
 {
  "id": "hermes-agent",
  "name": "Hermes Agent",
-  "version": "0.15.0",
+  "version": "0.15.1",
  "description": "Self-improving open-source AI agent by Nous Research with ACP editor integration, persistent memory, skills, and rich tool support.",
  "repository": "https://github.com/NousResearch/hermes-agent",
  "website": "https://hermes-agent.nousresearch.com/docs/user-guide/features/acp",
@ -9,7 +9,7 @@
  "license": "MIT",
  "distribution": {
    "uvx": {
-      "package": "hermes-agent[acp]==0.15.0",
+      "package": "hermes-agent[acp]==0.15.1",
      "args": ["hermes-acp"]
    }
  }
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@ -27,7 +27,6 @@ import threading
 import time
 import uuid
 from datetime import datetime
-from pathlib import Path
 from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse, parse_qs, urlunparse

@ -37,7 +36,6 @@ from agent.memory_manager import StreamingContextScrubber
 from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
    fetch_model_metadata,
-    get_model_context_length,
    is_local_endpoint,
    query_ollama_num_ctx,
 )
@ -52,7 +50,6 @@ from agent.tool_guardrails import (
 from hermes_cli.config import cfg_get
 from hermes_cli.timeouts import get_provider_request_timeout
 from hermes_constants import get_hermes_home
-from model_tools import check_toolset_requirements, get_tool_definitions
 from utils import base_url_host_matches

 # Use the same logger name as run_agent so tests patching ``run_agent.logger``
@ -1201,6 +1198,18 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

+    # Universal task-completion guidance toggle.  Default True.  Surfaced
+    # as a separate flag from tool_use_enforcement because the guidance
+    # applies to ALL models, not just the model families enforcement
+    # targets.
+    agent._task_completion_guidance = bool(_agent_section.get("task_completion_guidance", True))
+
+    # Local Python toolchain probe toggle.  Default True.  When False,
+    # the probe is skipped entirely (no subprocess calls, no system-prompt
+    # line).  Useful for users on exotic setups where the probe heuristics
+    # are noisy.
+    agent._environment_probe = bool(_agent_section.get("environment_probe", True))
+
    # App-level API retry count (wraps each model API call).  Default 3,
    # overridable via agent.api_max_retries in config.yaml.  See #11616.
    try:
@ -1462,7 +1471,6 @@ def init_agent(

    # Reject models whose context window is below the minimum required
    # for reliable tool-calling workflows (64K tokens).
-    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    _ctx = getattr(agent.context_compressor, "context_length", 0)
    if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH:
        raise ValueError(
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@ -25,24 +25,17 @@ from __future__ import annotations
 import copy
 import json
 import logging
-import os
 import re
-import threading
 import time
-import uuid
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional

 from hermes_cli.timeouts import get_provider_request_timeout
-from agent.message_sanitization import (
-    _repair_tool_call_arguments,
-    _sanitize_surrogates,
-)
 from agent.tool_dispatch_helpers import _trajectory_normalize_msg, make_tool_result_message
 from agent.trajectory import convert_scratchpad_to_think
 from agent.credential_pool import STATUS_EXHAUSTED
-from agent.error_classifier import classify_api_error, FailoverReason
+from agent.error_classifier import FailoverReason
 from utils import base_url_host_matches, base_url_hostname, env_var_enabled, atomic_json_write

 logger = logging.getLogger(__name__)
@ -1699,6 +1692,8 @@ def invoke_tool(agent, function_name: str, function_args: dict, effective_task_i
            session_id=agent.session_id or "",
            enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
            skip_pre_tool_call_hook=True,
+            enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+            disabled_toolsets=getattr(agent, "disabled_toolsets", None),
        )


--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -894,20 +894,6 @@ def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    return None


-def read_claude_managed_key() -> Optional[str]:
-    """Read Claude's native managed key from ~/.claude.json for diagnostics only."""
-    claude_json = Path.home() / ".claude.json"
-    if claude_json.exists():
-        try:
-            data = json.loads(claude_json.read_text(encoding="utf-8"))
-            primary_key = data.get("primaryApiKey", "")
-            if isinstance(primary_key, str) and primary_key.strip():
-                return primary_key.strip()
-        except (json.JSONDecodeError, OSError, IOError) as e:
-            logger.debug("Failed to read ~/.claude.json: %s", e)
-    return None
-
-
 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
    """Check if Claude Code credentials have a non-expired access token."""
    import time
@ -1256,10 +1242,16 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
    print()

    try:
-        webbrowser.open(auth_url)
-        print("  (Browser opened automatically)")
+        from hermes_cli.auth import _can_open_graphical_browser as _can_open_gui
    except Exception:
-        pass
+        _can_open_gui = lambda: True  # noqa: E731 — degrade to prior behavior
+
+    if _can_open_gui():
+        try:
+            webbrowser.open(auth_url)
+            print("  (Browser opened automatically)")
+        except Exception:
+            pass

    print()
    print("After authorizing, you'll see a code. Paste it below.")
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -700,12 +700,20 @@ class _CodexCompletionsAdapter:
            # xAI's Responses endpoint rejects ``pattern`` and ``format`` JSON Schema
            # keywords (HTTP 400). Strip them here to match the parity guarantee that
            # chat_completion_helpers.py provides for the main-agent xAI path.
+            #
+            # Deep-copy before sanitizing — ``list(tools)`` is only a shallow
+            # copy of the outer list, but the sanitizers mutate the inner
+            # parameter dicts in place.  Without a deep copy the caller's
+            # tool registry permanently loses its slash-containing enum
+            # constraints after the first auxiliary xAI call.  See #27907.
            try:
+                import copy as _copy
                from tools.schema_sanitizer import (
                    strip_pattern_and_format,
                    strip_slash_enum,
                )
-                tools, _ = strip_pattern_and_format(list(tools))
+                tools = _copy.deepcopy(list(tools))
+                tools, _ = strip_pattern_and_format(tools)
                tools, _ = strip_slash_enum(tools)
            except Exception as exc:
                logger.warning(
@ -1235,8 +1243,23 @@ def _read_nous_auth() -> Optional[dict]:


 def _nous_api_key(provider: dict) -> str:
-    """Extract the Nous runtime credential from the compatibility field."""
-    return provider.get("agent_key") or provider.get("access_token", "")
+    """Extract a usable Nous inference JWT from stored auth state."""
+    from hermes_cli.auth import _nous_invoke_jwt_is_usable
+
+    for token_key, expiry_key in (
+        ("agent_key", "agent_key_expires_at"),
+        ("access_token", "expires_at"),
+    ):
+        token = provider.get(token_key)
+        if not isinstance(token, str) or not token.strip():
+            continue
+        if _nous_invoke_jwt_is_usable(
+            token,
+            scope=provider.get("scope"),
+            expires_at=provider.get(expiry_key),
+        ):
+            return token
+    return ""


 def _nous_base_url() -> str:
@ -1248,25 +1271,16 @@ def _resolve_nous_runtime_api(*, force_refresh: bool = False) -> Optional[tuple[
    """Return fresh Nous runtime credentials when available.

    This mirrors the main agent's 401 recovery path and keeps auxiliary
-    clients aligned with the singleton auth store + JWT/mint flow instead of
+    clients aligned with the singleton auth store + JWT refresh flow instead of
    relying only on whatever raw tokens happen to be sitting in auth.json
    or the credential pool.
    """
    try:
-        from hermes_cli.auth import (
-            NOUS_INFERENCE_AUTH_MODE_AUTO,
-            NOUS_INFERENCE_AUTH_MODE_LEGACY,
-            resolve_nous_runtime_credentials,
-        )
+        from hermes_cli.auth import resolve_nous_runtime_credentials

        creds = resolve_nous_runtime_credentials(
-            min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
            timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
-            inference_auth_mode=(
-                NOUS_INFERENCE_AUTH_MODE_LEGACY
-                if force_refresh
-                else NOUS_INFERENCE_AUTH_MODE_AUTO
-            ),
+            force_refresh=force_refresh,
        )
    except Exception as exc:
        logger.debug("Auxiliary Nous runtime credential resolution failed: %s", exc)
@ -1550,13 +1564,9 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
        _mark_provider_unhealthy("nous", ttl=60)
        return None, None
    if runtime is None and nous:
-        # Runtime credential mint failed but stored Nous auth is still present.
-        # Falls back to the raw stored token below; surface a debug line so
-        # operators investigating expired/invalid sessions have a breadcrumb,
-        # without blocking the fallback path the rest of this function relies on.
        logger.debug(
-            "Auxiliary Nous: runtime credential mint failed; falling back to "
-            "stored auth.json token."
+            "Auxiliary Nous: runtime JWT refresh failed; checking stored "
+            "auth.json token."
        )
    global auxiliary_is_nous
    auxiliary_is_nous = True
@ -1594,6 +1604,13 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
        api_key, base_url = runtime
    else:
        api_key = _nous_api_key(nous or {})
+        if not api_key:
+            logger.warning(
+                "Auxiliary Nous client unavailable: no usable inference JWT found "
+                "(run: hermes auth add nous)."
+            )
+            _mark_provider_unhealthy("nous", ttl=60)
+            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
        OpenAI(
@ -1663,26 +1680,48 @@ def _read_main_provider() -> str:
 # per turn — no lock needed. Cleared by ``clear_runtime_main()``.
 _RUNTIME_MAIN_PROVIDER: str = ""
 _RUNTIME_MAIN_MODEL: str = ""
+_RUNTIME_MAIN_BASE_URL: str = ""
+_RUNTIME_MAIN_API_KEY: str = ""
+_RUNTIME_MAIN_API_MODE: str = ""


-def set_runtime_main(provider: str, model: str) -> None:
-    """Record the live runtime provider/model for the current AIAgent.
+def set_runtime_main(
+    provider: str,
+    model: str,
+    *,
+    base_url: str = "",
+    api_key: str = "",
+    api_mode: str = "",
+) -> None:
+    """Record the live runtime provider/model/credentials for the current AIAgent.

    Called by ``run_agent.AIAgent._sync_runtime_main_for_aux_routing`` (or
    equivalent setter) at the top of each turn so that
    ``_read_main_provider`` / ``_read_main_model`` reflect CLI/gateway
    overrides instead of the stale config.yaml default.
+
+    For ``custom:`` providers, ``base_url`` and ``api_key`` must also be
+    recorded so that ``_resolve_auto`` can construct a valid client in
+    Step 1 instead of falling through to the aggregator chain.
+
+    Args:
+        provider: Provider name; stored stripped and lower-cased.
+        model: Model name; stored stripped.
+        base_url: Keyword-only endpoint URL; stored stripped.
+        api_key: Keyword-only credential; stored stripped, or ``""`` when
+            the value is not a string.
+        api_mode: Keyword-only API mode; stored stripped.
+
+    ``None`` and other falsy values are recorded as ``""``.
    """
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
+    global _RUNTIME_MAIN_BASE_URL, _RUNTIME_MAIN_API_KEY, _RUNTIME_MAIN_API_MODE
    _RUNTIME_MAIN_PROVIDER = (provider or "").strip().lower()
    _RUNTIME_MAIN_MODEL = (model or "").strip()
+    _RUNTIME_MAIN_BASE_URL = (base_url or "").strip()
+    # api_key is type-checked rather than ``or``-defaulted, so a non-string
+    # value is recorded as "" instead of raising on ``.strip()``.
+    _RUNTIME_MAIN_API_KEY = api_key.strip() if isinstance(api_key, str) else ""
+    _RUNTIME_MAIN_API_MODE = (api_mode or "").strip()


 def clear_runtime_main() -> None:
    """Clear the runtime override (e.g. on session end)."""
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
+    global _RUNTIME_MAIN_BASE_URL, _RUNTIME_MAIN_API_KEY, _RUNTIME_MAIN_API_MODE
    _RUNTIME_MAIN_PROVIDER = ""
    _RUNTIME_MAIN_MODEL = ""
+    # Reset the endpoint/credential/mode globals as well, so no part of the
+    # recorded override outlives the clear.
+    _RUNTIME_MAIN_BASE_URL = ""
+    _RUNTIME_MAIN_API_KEY = ""
+    _RUNTIME_MAIN_API_MODE = ""


 def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
@ -2357,7 +2396,16 @@ def _is_auth_error(exc: Exception) -> bool:
    if status == 401:
        return True
    err_lower = str(exc).lower()
-    return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
+    if "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower():
+        return True
+    # xAI returns HTTP 403 with "unauthenticated:bad-credentials" when an OAuth2
+    # access token has expired or is invalid — semantically a 401 auth failure,
+    # even though the status code is 403 (PermissionDenied).
+    if status == 403 and "bad-credentials" in err_lower:
+        return True
+    if "unauthenticated" in err_lower and "bad-credentials" in err_lower:
+        return True
+    return False


 def _is_unsupported_parameter_error(exc: Exception, param: str) -> bool:
@ -2510,6 +2558,8 @@ def _recoverable_pool_provider(
        return "copilot"
    if base_url_host_matches(base, "api.kimi.com"):
        return "kimi-coding"
+    if base_url_host_matches(base, "api.x.ai"):
+        return "xai-oauth"
    # For api_key providers not in the hardcoded list (e.g. opencode-go), match
    # the client base URL against all registered api_key providers so that
    # credential-pool rotation works for any provider the user configured.
@ -2706,15 +2756,11 @@ def _refresh_provider_credentials(provider: str) -> bool:
            _evict_cached_clients(normalized)
            return True
        if normalized == "nous":
-            from hermes_cli.auth import (
-                NOUS_INFERENCE_AUTH_MODE_LEGACY,
-                resolve_nous_runtime_credentials,
-            )
+            from hermes_cli.auth import resolve_nous_runtime_credentials

            creds = resolve_nous_runtime_credentials(
-                min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
                timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
-                inference_auth_mode=NOUS_INFERENCE_AUTH_MODE_LEGACY,
+                force_refresh=True,
            )
            if not str(creds.get("api_key", "") or "").strip():
                return False
@ -2731,6 +2777,24 @@ def _refresh_provider_credentials(provider: str) -> bool:
                return False
            _evict_cached_clients(normalized)
            return True
+        if normalized == "xai-oauth":
+            # Preference: pool-level refresh (uses refresh_token from pool entry),
+            # then fall back to singleton auth-store resolver.
+            pool = load_pool(normalized)
+            if pool and pool.has_credentials():
+                # Ensure a current entry is selected before trying to refresh.
+                pool.select()
+                refreshed = pool.try_refresh_current()
+                if refreshed is not None and str(getattr(refreshed, "runtime_api_key", "") or "").strip():
+                    _evict_cached_clients(normalized)
+                    return True
+            from hermes_cli.auth import resolve_xai_oauth_runtime_credentials
+
+            creds = resolve_xai_oauth_runtime_credentials(force_refresh=True)
+            if not str(creds.get("api_key", "") or "").strip():
+                return False
+            _evict_cached_clients(normalized)
+            return True
    except Exception as exc:
        logger.debug("Auxiliary provider credential refresh failed for %s: %s", normalized, exc)
        return False
@ -2938,6 +3002,18 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
    runtime_api_key = runtime.get("api_key", "")
    runtime_api_mode = str(runtime.get("api_mode") or "")

+    # Fall back to process-local globals when main_runtime dict was not
+    # provided or was incomplete.  ``set_runtime_main()`` now records
+    # base_url/api_key/api_mode alongside provider/model, so custom:
+    # providers get the full credential surface in Step 1 of the
+    # auto-detect chain.
+    if not runtime_base_url and _RUNTIME_MAIN_BASE_URL:
+        runtime_base_url = _RUNTIME_MAIN_BASE_URL
+    if not runtime_api_key and _RUNTIME_MAIN_API_KEY:
+        runtime_api_key = _RUNTIME_MAIN_API_KEY
+    if not runtime_api_mode and _RUNTIME_MAIN_API_MODE:
+        runtime_api_mode = _RUNTIME_MAIN_API_MODE
+
    # ── Warn once if OPENAI_BASE_URL is set but config.yaml uses a named
    #    provider (not 'custom').  This catches the common "env poisoning"
    #    scenario where a user switches providers via `hermes model` but the
@ -4683,24 +4759,23 @@ def _build_call_kwargs(
        kwargs["temperature"] = temperature

    if max_tokens is not None:
-        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
-        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
-        # ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with
-        # error code 1210 ("API 调用参数有误") on multimodal requests — skip it.
-        _model_lower = (model or "").lower()
-        _skip_max_tokens = (
-            provider == "zai"
-            and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower)
+        # We do NOT cap output by default. Most chat-completions providers treat
+        # an omitted max_tokens as "use the model's max output", which is what we
+        # want for auxiliary tasks (compression summaries, titles, vision, etc.) —
+        # an explicit cap only risks truncating a summary or 400-ing on providers
+        # that reject the parameter outright (e.g. GitHub Copilot / newer OpenAI
+        # GPT-5 models require max_completion_tokens, not max_tokens; ZAI vision
+        # models reject it entirely with error 1210). Omitting it sidesteps all of
+        # those wire-format quirks at once.
+        #
+        # The one exception is the Anthropic Messages wire (MiniMax and any
+        # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
+        # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
+        # there.
+        _effective_base = base_url or (
+            _current_custom_base_url() if provider == "custom" else ""
        )
-        if _skip_max_tokens:
-            pass  # ZAI vision models do not accept max_tokens
-        elif provider == "custom":
-            custom_base = base_url or _current_custom_base_url()
-            if base_url_hostname(custom_base) == "api.openai.com":
-                kwargs["max_completion_tokens"] = max_tokens
-            else:
-                kwargs["max_tokens"] = max_tokens
-        else:
+        if _is_anthropic_compat_endpoint(provider, _effective_base):
            kwargs["max_tokens"] = max_tokens

    if tools:
--- a/agent/bedrock_adapter.py
+++ b/agent/bedrock_adapter.py
@ -1167,18 +1167,6 @@ def _extract_provider_from_arn(arn: str) -> str:
    """
    match = re.search(r"foundation-model/([^.]+)", arn)
    return match.group(1) if match else ""
-
-
-def get_bedrock_model_ids(region: str) -> List[str]:
-    """Return a flat list of available Bedrock model IDs for the given region.
-
-    Convenience wrapper around ``discover_bedrock_models()`` for use in
-    the model selection UI.
-    """
-    models = discover_bedrock_models(region)
-    return [m["id"] for m in models]
-
-
 # ---------------------------------------------------------------------------
 # Error classification — Bedrock-specific exceptions
 # ---------------------------------------------------------------------------
--- a/agent/browser_registry.py
+++ b/agent/browser_registry.py
@ -186,37 +186,6 @@ def _resolve(configured: Optional[str]) -> Optional[BrowserProvider]:
    return None


-def get_active_browser_provider() -> Optional[BrowserProvider]:
-    """Resolve the currently-active cloud browser provider.
-
-    Reads ``browser.cloud_provider`` from config.yaml; falls back per the
-    module docstring. Returns None for local mode or when no provider is
-    available.
-    """
-    try:
-        from hermes_cli.config import read_raw_config
-
-        cfg = read_raw_config()
-        browser_cfg = cfg.get("browser", {})
-    except Exception as exc:
-        logger.debug("Could not read browser config: %s", exc)
-        browser_cfg = {}
-
-    configured: Optional[str] = None
-    if isinstance(browser_cfg, dict) and "cloud_provider" in browser_cfg:
-        try:
-            from tools.tool_backend_helpers import normalize_browser_cloud_provider
-
-            configured = normalize_browser_cloud_provider(
-                browser_cfg.get("cloud_provider")
-            )
-        except Exception as exc:
-            logger.debug("normalize_browser_cloud_provider failed: %s", exc)
-            configured = None
-
-    return _resolve(configured)
-
-
 def _reset_for_tests() -> None:
    """Clear the registry. **Test-only.**"""
    with _lock:
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -15,49 +15,23 @@ sites unchanged.  Symbols that tests patch on ``run_agent`` (e.g.

 from __future__ import annotations

-import concurrent.futures
-import contextvars
-import copy
 import json
 import logging
 import os
-import random
 import re
-import sys
 import threading
 import time
 import uuid
-from datetime import datetime
-from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import urlparse, parse_qs, urlunparse
+from typing import Any, Dict, Optional

 from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
-from agent.error_classifier import classify_api_error, FailoverReason
+from agent.error_classifier import FailoverReason
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
    _sanitize_surrogates,
-    _sanitize_messages_surrogates,
-    _sanitize_structure_surrogates,
-    _sanitize_messages_non_ascii,
-    _sanitize_tools_non_ascii,
-    _sanitize_structure_non_ascii,
-    _strip_images_from_messages,
-    _strip_non_ascii,
    _repair_tool_call_arguments,
-    _escape_invalid_chars_in_json_strings,
-)
-from agent.tool_dispatch_helpers import (
-    _is_multimodal_tool_result,
-    _multimodal_text_summary,
-)
-from agent.retry_utils import jittered_backoff
-from agent.tool_guardrails import (
-    ToolGuardrailDecision,
-    append_toolguard_guidance,
-    toolguard_synthetic_result,
 )
 from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname
@ -175,13 +149,6 @@ def interruptible_api_call(agent, api_kwargs: dict):
            request_client_holder["owner_tid"] = threading.get_ident()
        return client

-    def _take_request_client():
-        with request_client_lock:
-            client = request_client_holder.get("client")
-            request_client_holder["client"] = None
-            request_client_holder["owner_tid"] = None
-            return client
-
    def _close_request_client_once(reason: str) -> None:
        # #29507: dispatch on the calling thread.
        #
@ -310,8 +277,15 @@ def interruptible_api_call(agent, api_kwargs: dict):
    else:
        _codex_idle_timeout_default = 12.0

+    # No-byte TTFB cutoff. The OpenAI SDK's own streaming read timeout is far
+    # longer (openai 2.x DEFAULT_TIMEOUT.read = 600s), so a tight 12s default
+    # killed subscription-backed Codex requests mid-prefill before the backend
+    # had a chance to emit its first SSE event. Default to 120s — long enough to
+    # clear normal backend admission / prompt prefill, short enough to still
+    # reconnect promptly when the socket is genuinely wedged. Set
+    # HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 to disable this watchdog entirely.
    _ttfb_enabled = _codex_watchdog_enabled
-    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 12.0)
+    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 120.0)
    if _ttfb_timeout <= 0:
        _ttfb_enabled = False
    elif _openai_codex_backend:
@ -333,7 +307,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
                _ttfb_disable_above,
            )
        else:
-            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 20.0)
+            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 120.0)
            if _ttfb_cap > 0 and _ttfb_timeout > _ttfb_cap:
                logger.info(
                    "Capping openai-codex no-byte TTFB timeout from %.0fs to %.0fs "
@ -614,12 +588,23 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        # It also rejects ``enum`` values containing ``/`` (HuggingFace IDs
        # like ``Qwen/Qwen3.5-0.8B`` shipped by MCP servers) — same 400 with
        # the same opaque message; strip those enums too.
+        #
+        # Deep-copy ``tools_for_api`` before sanitizing: the sanitizers
+        # mutate in place (documented contract on ``strip_slash_enum`` /
+        # ``strip_pattern_and_format``), and ``tools_for_api`` is a direct
+        # reference to ``agent.tools``.  Without the copy, the first xAI
+        # request permanently strips constraints from the shared per-agent
+        # tool registry — every subsequent non-xAI call from the same
+        # agent (auxiliary task routed to Anthropic, OpenRouter fallback,
+        # main-model swap) sees the already-stripped schema.  See #27907.
        if is_xai_responses:
            try:
+                import copy as _copy
                from tools.schema_sanitizer import (
                    strip_pattern_and_format,
                    strip_slash_enum,
                )
+                tools_for_api = _copy.deepcopy(tools_for_api)
                tools_for_api, _ = strip_pattern_and_format(tools_for_api)
                tools_for_api, _ = strip_slash_enum(tools_for_api)
            except Exception as exc:
@ -1298,6 +1283,18 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
            agent._copy_reasoning_content_for_api(msg, api_msg)
            for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
                api_msg.pop(internal_field, None)
+            # Strict OpenAI-compatible gateways (Fireworks-backed OpenCode Go,
+            # Mistral, Moonshot/Kimi) reject any message key outside the Chat
+            # Completions schema. The main loop drops these via
+            # ChatCompletionsTransport.convert_messages(), but the summary path
+            # hand-builds messages and calls chat.completions.create() directly,
+            # bypassing the transport — so mirror that sanitization here:
+            # tool_name (SQLite FTS bookkeeping), the codex_* reasoning carriers,
+            # and every Hermes-internal underscore-prefixed scaffolding key.
+            for schema_foreign in ("tool_name", "codex_reasoning_items", "codex_message_items"):
+                api_msg.pop(schema_foreign, None)
+            for internal_key in [k for k in api_msg if isinstance(k, str) and k.startswith("_")]:
+                api_msg.pop(internal_key, None)
            if _needs_sanitize:
                agent._sanitize_tool_calls_for_strict_api(api_msg)
            api_messages.append(api_msg)
@ -1636,13 +1633,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            request_client_holder["owner_tid"] = threading.get_ident()
        return client

-    def _take_request_client():
-        with request_client_lock:
-            client = request_client_holder.get("client")
-            request_client_holder["client"] = None
-            request_client_holder["owner_tid"] = None
-            return client
-
    def _close_request_client_once(reason: str) -> None:
        # See #29507 explanation in the non-streaming variant above. A
        # stranger thread (the interrupt-check / stale-stream detector loop)
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@ -980,6 +980,48 @@ def _extract_responses_reasoning_text(item: Any) -> str:
    return ""


+def _format_responses_error(error_obj: Any, response_status: str) -> str:
+    """Render a Responses API ``response.error`` payload as a readable message.
+
+    Terminal ``response.failed`` events carry their details in a payload
+    shaped like ``{"code": "rate_limit_exceeded", "message": "Slow down"}``.
+    Showing ``message`` alone hides the failure mode, so the ``code`` is
+    prefixed whenever both fields are present.  Adapted from
+    anomalyco/opencode#28757.
+
+    Args:
+        error_obj: Mapping or attribute-style error payload, or ``None``.
+        response_status: Response status used in the default message.
+
+    Returns:
+        In order of preference: ``"<code>: <message>"``, the message alone,
+        the code alone, ``str(error_obj)`` for a truthy payload that yields
+        neither, or a default sentence naming ``response_status``.
+    """
+
+    def _read(field: str) -> Any:
+        # Accept both mapping-style and attribute-style payloads.
+        if isinstance(error_obj, dict):
+            return error_obj.get(field)
+        if error_obj is None:
+            return None
+        return getattr(error_obj, field, None)
+
+    def _clean(raw: Any) -> str:
+        # Falsy values (None, "", 0) collapse to ""; the rest is stringified.
+        return str(raw).strip() if raw else ""
+
+    code_text = _clean(_read("code"))
+    message_text = _clean(_read("message"))
+
+    present = [piece for piece in (code_text, message_text) if piece]
+    if present:
+        return ": ".join(present)
+    if error_obj:
+        # Nothing structured to show: surface the raw payload so it stays
+        # visible in logs/UI rather than being silently swallowed.
+        return str(error_obj)
+    return f"Responses API returned status '{response_status}'"
+
+
 # ---------------------------------------------------------------------------
 # Full response normalization
 # ---------------------------------------------------------------------------
@ -1023,10 +1065,7 @@ def _normalize_codex_response(

    if response_status in {"failed", "cancelled"}:
        error_obj = getattr(response, "error", None)
-        if isinstance(error_obj, dict):
-            error_msg = error_obj.get("message") or str(error_obj)
-        else:
-            error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
+        error_msg = _format_responses_error(error_obj, response_status)
        raise RuntimeError(error_msg)

    content_parts: List[str] = []
--- a/agent/codex_runtime.py
+++ b/agent/codex_runtime.py
@ -16,7 +16,6 @@ compatibility.

 from __future__ import annotations

-import json
 import logging
 import os
 import time
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -40,17 +40,47 @@ SUMMARY_PREFIX = (
    "window — treat it as background reference, NOT as active instructions. "
    "Do NOT answer questions or fulfill requests mentioned in this summary; "
    "they were already addressed. "
-    "Your current task is identified in the '## Active Task' section of the "
-    "summary — resume exactly from there. "
+    "Respond ONLY to the latest user message that appears AFTER this "
+    "summary — that message is the single source of truth for what to do "
+    "right now. "
+    "If the latest user message is consistent with the '## Active Task' "
+    "section, you may use the summary as background. If the latest user "
+    "message contradicts, supersedes, changes topic from, or in any way "
+    "diverges from '## Active Task' / '## In Progress' / '## Pending User "
+    "Asks' / '## Remaining Work', the latest message WINS — discard those "
+    "stale items entirely and do not 'wrap up the old task first'. "
+    "Reverse signals in the latest message (e.g. 'stop', 'undo', 'roll "
+    "back', 'just verify', 'don't do that anymore', 'never mind', a new "
+    "topic) must immediately end any in-flight work described in the "
+    "summary; do not re-surface it in later turns. "
    "IMPORTANT: Your persistent memory (MEMORY.md, USER.md) in the system "
    "prompt is ALWAYS authoritative and active — never ignore or deprioritize "
    "memory content due to this compaction note. "
-    "Respond ONLY to the latest user message "
-    "that appears AFTER this summary. The current session state (files, "
-    "config, etc.) may reflect work described here — avoid repeating it:"
+    "The current session state (files, config, etc.) may reflect work "
+    "described here — avoid repeating it:"
 )
 LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"

+# Handoff prefixes that shipped in earlier releases. A summary persisted under
+# one of these can be inherited into a resumed lineage (#35344); when it is
+# re-normalized on re-compaction we must strip the OLD prefix too, otherwise the
+# stale directive it carried (e.g. "resume exactly from Active Task") survives
+# embedded in the body and keeps hijacking replies. Keep newest-first; entries
+# are matched literally. Add a frozen copy here whenever SUMMARY_PREFIX changes.
+_HISTORICAL_SUMMARY_PREFIXES = (
+    # Pre-#35344: contained the self-contradicting "resume exactly" directive.
+    "[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
+    "into the summary below. This is a handoff from a previous context "
+    "window — treat it as background reference, NOT as active instructions. "
+    "Do NOT answer questions or fulfill requests mentioned in this summary; "
+    "they were already addressed. "
+    "Your current task is identified in the '## Active Task' section of the "
+    "summary — resume exactly from there. "
+    "Respond ONLY to the latest user message "
+    "that appears AFTER this summary. The current session state (files, "
+    "config, etc.) may reflect work described here — avoid repeating it:",
+)
+
 # Minimum tokens for the summary output
 _MIN_SUMMARY_TOKENS = 2000
 # Proportion of compressed content to allocate for summary
@ -75,6 +105,44 @@ _IMAGE_TOKEN_ESTIMATE = 1600
 _IMAGE_CHAR_EQUIVALENT = _IMAGE_TOKEN_ESTIMATE * _CHARS_PER_TOKEN
 _SUMMARY_FAILURE_COOLDOWN_SECONDS = 600

+# Hard ceiling for the deterministic summary-failure handoff.  The fallback is
+# only meant to preserve continuity anchors from the dropped window, not to
+# become another unbounded transcript copy after the LLM summarizer failed.
+_FALLBACK_SUMMARY_MAX_CHARS = 8_000
+_FALLBACK_TURN_MAX_CHARS = 700
+
+
+# Heuristic matcher for path-looking tokens: a run that starts at "/", at "~"
+# (optionally "~/"), or at a Windows drive prefix such as "C:\", and extends
+# until whitespace, a quote or backtick, a closing ")", "]" or "}", or an
+# angle bracket.  The pattern is not anchored to a token start, so it also
+# fires on slashes inside URLs or words such as "and/or".
+_PATH_MENTION_RE = re.compile(r"(?:/|~/?|[A-Za-z]:\\)[^\s`'\")\]}<>]+")
+
+
+def _dedupe_append(items: list[str], value: str, *, limit: int) -> None:
+    """Append the stripped ``value`` to ``items`` in place.
+
+    Nothing is appended when the stripped value is empty, is already in
+    ``items``, or ``items`` already holds ``limit`` entries.
+    """
+    cleaned = value.strip()
+    if not cleaned:
+        return
+    if len(items) >= limit or cleaned in items:
+        return
+    items.append(cleaned)
+
+
+def _extract_tool_call_name_and_args(tool_call: Any) -> tuple[str, str]:
+    """Return a best-effort ``(name, arguments)`` pair for dict/object tool calls.
+
+    The tool call and its ``function`` payload may each be a mapping or an
+    attribute-style object.  The payload's shape is detected on its own
+    instead of being assumed to match the outer container, so a dict tool
+    call wrapping a non-dict ``function`` value does not raise
+    ``AttributeError`` on ``.get``.
+
+    Args:
+        tool_call: Mapping or attribute-style tool call.
+
+    Returns:
+        ``(name, arguments)`` as strings.  A missing or falsy name becomes
+        ``"unknown"``; missing or falsy arguments become ``""``.
+    """
+
+    def _field(container: Any, key: str) -> Any:
+        # Mappings are read by key, everything else by attribute.
+        if isinstance(container, dict):
+            return container.get(key)
+        return getattr(container, key, None)
+
+    fn = _field(tool_call, "function")
+    if fn is None:
+        return "unknown", ""
+    return str(_field(fn, "name") or "unknown"), str(_field(fn, "arguments") or "")
+
+
+def _extract_tool_call_id(tool_call: Any) -> str:
+    """Return the tool call's ``id`` as a string, or ``""`` when missing or falsy.
+
+    Mappings are read by key; any other object is read by attribute.
+    """
+    raw_id = tool_call.get("id") if isinstance(tool_call, dict) else getattr(tool_call, "id", "")
+    return str(raw_id or "")
+
+
+def _collect_path_mentions(text: str, relevant_files: list[str], *, limit: int = 12) -> None:
+    """Record path-looking tokens found in ``text`` into ``relevant_files`` in place.
+
+    Each ``_PATH_MENTION_RE`` match has trailing ``.,:;`` characters trimmed
+    and is then handed to ``_dedupe_append`` with the given ``limit``.
+    """
+    for hit in _PATH_MENTION_RE.finditer(text):
+        candidate = hit.group(0).rstrip(".,:;")
+        _dedupe_append(relevant_files, candidate, limit=limit)
+

 def _content_length_for_budget(raw_content: Any) -> int:
    """Return the effective char-length of a message's content for token budgeting.
@ -480,6 +548,10 @@ class ContextCompressor(ContextEngine):
        self._last_compression_savings_pct = 100.0
        self._ineffective_compression_count = 0
        self._summary_failure_cooldown_until = 0.0  # transient errors must not block a fresh session
+        self.last_real_prompt_tokens = 0
+        self.last_compression_rough_tokens = 0
+        self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False

    def update_model(
        self,
@ -537,8 +609,8 @@ class ContextCompressor(ContextEngine):
        self.quiet_mode = quiet_mode
        # When True, summary-generation failure aborts compression entirely
        # (returns messages unchanged, sets _last_compress_aborted=True).
-        # When False (default = historical behavior), insert a static
-        # "summary unavailable" placeholder and drop the middle window.
+        # When False (default = historical behavior), insert a
+        # deterministic "summary unavailable" handoff and drop the middle window.
        self.abort_on_summary_failure = abort_on_summary_failure

        self.context_length = get_model_context_length(
@ -577,6 +649,10 @@ class ContextCompressor(ContextEngine):

        self.last_prompt_tokens = 0
        self.last_completion_tokens = 0
+        self.last_real_prompt_tokens = 0
+        self.last_compression_rough_tokens = 0
+        self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False

        self.summary_model = summary_model_override or ""

@ -610,6 +686,44 @@ class ContextCompressor(ContextEngine):
        self.last_prompt_tokens = usage.get("prompt_tokens", 0)
        self.last_completion_tokens = usage.get("completion_tokens", 0)
        self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens)
+        if self.last_prompt_tokens > 0:
+            self.last_real_prompt_tokens = self.last_prompt_tokens
+            if self.last_prompt_tokens < self.threshold_tokens:
+                if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0:
+                    self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens
+            else:
+                self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False
+
+    def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
+        """Decide whether an over-threshold rough estimate should be ignored.
+
+        ``estimate_request_tokens_rough(..., tools=...)`` deliberately
+        overestimates schema-heavy requests so compression happens before a
+        provider rejects the payload.  Once the provider has reported real
+        ``prompt_tokens`` below the threshold, that figure is the better
+        signal, so the preflight check is deferred for as long as the rough
+        estimate has only grown modestly past the last known-good baseline.
+
+        Args:
+            rough_tokens: Rough preflight token estimate for the next request.
+
+        Returns:
+            ``True`` when preflight compression should be skipped in favour of
+            the most recent provider-reported usage, ``False`` otherwise.
+
+        Note:
+            A ``True`` result also raises
+            ``last_rough_tokens_when_real_prompt_fit`` to the larger of the
+            baseline and ``rough_tokens``.
+        """
+        limit = self.threshold_tokens
+        # Under the threshold there is nothing to defer.
+        if rough_tokens < limit:
+            return False
+
+        # Real usage must exist and must itself have fit under the threshold.
+        real_tokens = self.last_real_prompt_tokens
+        if not 0 < real_tokens < limit:
+            return False
+
+        # Prefer the rough size recorded when a real prompt fit; otherwise use
+        # the rough size measured right after the last compression.
+        reference = self.last_rough_tokens_when_real_prompt_fit or self.last_compression_rough_tokens
+        if reference <= 0:
+            return False
+
+        # Tolerate growth of 5% of the threshold, but never less than 4096.
+        slack = max(4096, int(limit * 0.05))
+        if max(0, rough_tokens - reference) > slack:
+            return False
+
+        self.last_rough_tokens_when_real_prompt_fit = max(reference, rough_tokens)
+        return True

    def should_compress(self, prompt_tokens: int = None) -> bool:
        """Check if context exceeds the compression threshold.
@ -884,6 +998,195 @@ class ContextCompressor(ContextEngine):

        return "\n\n".join(parts)

+    def _build_static_fallback_summary(
+        self,
+        turns_to_summarize: List[Dict[str, Any]],
+        reason: str | None = None,
+    ) -> str:
+        """Assemble a locally generated handoff summary without calling an LLM.
+
+        Used when summary generation is unavailable.  Rather than a bare
+        "N messages were removed" marker, the compacted turns are mined for
+        continuity anchors: user asks, assistant/tool actions, paths found in
+        tool-call arguments and message text, and tool output that matches an
+        error-like keyword.  The anchors are rendered into the usual summary
+        section layout, redacted, prefixed via ``_with_summary_prefix`` and
+        capped at ``_FALLBACK_SUMMARY_MAX_CHARS``.
+
+        Args:
+            turns_to_summarize: The messages being dropped from the context.
+            reason: Optional failure description appended to the
+                "Critical Context" section.
+
+        Returns:
+            The prefixed (and possibly truncated) fallback summary text.
+        """
+        user_asks: list[str] = []
+        assistant_actions: list[str] = []
+        tool_actions: list[str] = []
+        relevant_files: list[str] = []
+        blockers: list[str] = []
+        last_dropped_turns: list[str] = []
+
+        def _compact_fallback_turn(value: Any) -> str:
+            # Redact, collapse whitespace, then cap the per-turn length.
+            cleaned = redact_sensitive_text(_content_text_for_contains(value))
+            cleaned = re.sub(r"\bgh[pousr]_[A-Za-z0-9_]{8,}\b", "[REDACTED]", cleaned)
+            cleaned = re.sub(r"\s+", " ", cleaned).strip()
+            if len(cleaned) > _FALLBACK_TURN_MAX_CHARS:
+                kept = cleaned[: _FALLBACK_TURN_MAX_CHARS - 15].rstrip()
+                cleaned = kept + " ...[truncated]"
+            # Broader second pass (no minimum length) runs after truncation.
+            return re.sub(r"\bgh[pousr]_[A-Za-z0-9_.-]+", "[REDACTED]", cleaned)
+
+        def _remember_dropped_turn(label: str, text: str, *, limit: int = 8) -> None:
+            # Sliding window: only the newest ``limit`` entries are retained.
+            entry = text.strip()
+            if entry:
+                last_dropped_turns.append(f"{label}: {entry}")
+                if len(last_dropped_turns) > limit:
+                    last_dropped_turns.pop(0)
+
+        def _collect_paths_from_jsonish(obj: Any) -> None:
+            # Walk parsed tool arguments; strings are scanned for path mentions
+            # and well-known path keys are recorded verbatim.
+            path_keys = {"path", "workdir", "file_path", "output_path"}
+            if isinstance(obj, str):
+                _collect_path_mentions(obj, relevant_files)
+            elif isinstance(obj, list):
+                for element in obj:
+                    _collect_paths_from_jsonish(element)
+            elif isinstance(obj, dict):
+                for key, val in obj.items():
+                    if isinstance(val, str) and key in path_keys:
+                        _dedupe_append(relevant_files, val, limit=12)
+                    _collect_paths_from_jsonish(val)
+
+        # Pass 1: index tool calls by id so tool results can be attributed
+        # later, and harvest paths from the (redacted) call arguments.
+        call_id_to_tool: dict[str, tuple[str, str]] = {}
+        for msg in turns_to_summarize:
+            if msg.get("role") != "assistant" or not msg.get("tool_calls"):
+                continue
+            for tc in msg.get("tool_calls") or []:
+                name, raw_args = _extract_tool_call_name_and_args(tc)
+                args = redact_sensitive_text(raw_args)
+                call_id = _extract_tool_call_id(tc)
+                if call_id:
+                    call_id_to_tool[call_id] = (name, args)
+                if not args:
+                    continue
+                try:
+                    parsed = json.loads(args)
+                except Exception:
+                    # Not valid JSON: scan the raw argument string instead.
+                    parsed = args
+                _collect_paths_from_jsonish(parsed)
+
+        # Pass 2: classify each turn into the per-section buckets.
+        for msg in turns_to_summarize:
+            role = msg.get("role", "unknown")
+            text = _compact_fallback_turn(msg.get("content"))
+            _collect_path_mentions(text, relevant_files)
+
+            called_tools: list[str] = []
+            if role == "assistant":
+                called_tools = [
+                    tool_name
+                    for tool_name, _unused_args in map(
+                        _extract_tool_call_name_and_args,
+                        msg.get("tool_calls") or [],
+                    )
+                ]
+
+            dropped_text = text
+            if called_tools:
+                prefix = "tool calls: " + ", ".join(called_tools[:6])
+                dropped_text = f"{prefix}; {text}" if text else prefix
+            _remember_dropped_turn(str(role).upper(), dropped_text)
+
+            # Long turns keep their head and tail for the section bullets.
+            if len(text) > 600:
+                text = text[:420].rstrip() + " ... " + text[-160:].lstrip()
+
+            if role == "user":
+                if text:
+                    user_asks.append(text)
+            elif role == "assistant":
+                if called_tools:
+                    assistant_actions.append(
+                        "Called tool(s): " + ", ".join(called_tools[:6])
+                    )
+                elif text:
+                    assistant_actions.append(text)
+            elif role == "tool":
+                result_id = str(msg.get("tool_call_id") or "")
+                tool_name, tool_args = call_id_to_tool.get(result_id, ("unknown", ""))
+                tool_actions.append(
+                    _summarize_tool_result(tool_name, tool_args, text or "")
+                )
+                looks_like_error = re.search(
+                    r"\b(error|failed|exception|traceback|timeout|timed out|fatal)\b",
+                    text,
+                    re.I,
+                )
+                if looks_like_error:
+                    blockers.append(text[:500])
+
+        def _bullets(items: list[str], limit: int = 8) -> str:
+            # Order-preserving de-dupe of the stripped, non-empty entries.
+            stripped = (item.strip() for item in items)
+            unique = list(dict.fromkeys(entry for entry in stripped if entry))[:limit]
+            if not unique:
+                return "None."
+            return "\n".join(f"- {entry}" for entry in unique)
+
+        completed = [
+            f"{idx}. {item}"
+            for idx, item in enumerate((assistant_actions + tool_actions)[:12], start=1)
+        ]
+
+        if user_asks:
+            active_task = f"User asked: {user_asks[-1]!r}"
+        else:
+            active_task = "Unknown from deterministic fallback."
+
+        if self._previous_summary:
+            previous_summary_note = (
+                "\n\nPrevious compaction summary was present and should still be treated as "
+                "background continuity context, but the latest LLM summary update failed."
+            )
+        else:
+            previous_summary_note = ""
+
+        reason_text = f" Summary failure reason: {reason}." if reason else ""
+        body = f"""## Active Task
+{active_task}
+
+## Goal
+Recovered from a deterministic fallback because the LLM context summarizer was unavailable. Continue from the protected recent messages after this summary and use current file/system state for exact details.{previous_summary_note}
+
+## Constraints & Preferences
+- This fallback was generated locally without an LLM summary call.
+- Secrets and credentials were redacted before preservation.
+- The summary may be incomplete; prefer verifying current files, git state, processes, and test results instead of assuming omitted details.
+
+## Completed Actions
+{chr(10).join(completed) if completed else "None recoverable from compacted turns."}
+
+## Active State
+Unknown from deterministic fallback. Inspect current repository/session state if needed.
+
+## In Progress
+{active_task}
+
+## Blocked
+{_bullets(blockers, limit=5)}
+
+## Key Decisions
+None recoverable from deterministic fallback.
+
+## Resolved Questions
+None recoverable from deterministic fallback.
+
+## Pending User Asks
+{active_task}
+
+## Relevant Files
+{_bullets(relevant_files, limit=12)}
+
+## Remaining Work
+Continue from the most recent unfulfilled user ask and protected tail messages. Verify state with tools before making claims.
+
+## Last Dropped Turns
+{_bullets(last_dropped_turns, limit=8)}
+
+## Critical Context
+Summary generation was unavailable, so this is a best-effort deterministic fallback for {len(turns_to_summarize)} compacted message(s).{reason_text}"""
+        # Redact the rendered body once more before prefixing and capping it.
+        redacted_body = redact_sensitive_text(body.strip())
+        summary = self._with_summary_prefix(redacted_body)
+        if len(summary) <= _FALLBACK_SUMMARY_MAX_CHARS:
+            return summary
+        clipped = summary[: _FALLBACK_SUMMARY_MAX_CHARS - 42].rstrip()
+        return clipped + "\n...[fallback summary truncated]"
+
    def _fallback_to_main_for_compression(self, e: Exception, reason: str) -> None:
        """Switch from a separate ``summary_model`` back to the main model.

@ -911,7 +1214,11 @@ class ContextCompressor(ContextEngine):
        self.summary_model = ""  # empty = use main model
        self._summary_failure_cooldown_until = 0.0  # no cooldown — retry immediately

-    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
+    def _generate_summary(
+        self,
+        turns_to_summarize: List[Dict[str, Any]],
+        focus_topic: Optional[str] = None,
+    ) -> Optional[str]:
        """Generate a structured summary of conversation turns.

        Uses a structured template (Goal, Progress, Decisions, Resolved/Pending
@ -959,11 +1266,27 @@ class ContextCompressor(ContextEngine):

        # Shared structured template (used by both paths).
        _template_sections = f"""## Active Task
-[THE SINGLE MOST IMPORTANT FIELD. Copy the user's most recent request or
-task assignment verbatim — the exact words they used. If multiple tasks
-were requested and only some are done, list only the ones NOT yet completed.
-Continuation should pick up exactly here. Example:
+[THE SINGLE MOST IMPORTANT FIELD. Capture the user's most recent unfulfilled
+input verbatim — the exact words they used. This includes:
+- Explicit task assignments ("refactor the auth module")
+- Questions awaiting an answer ("waarom staat X op Y?", "wat zijn de volgende stappen?")
+- Decisions awaiting input ("optie A of B?")
+- Ongoing discussions where the assistant owes the next substantive reply
+A conversation where the user just asked a question IS an active task — the
+task is "answer that question with full context". Do NOT write "None" merely
+because the user did not issue an imperative command; reserve "None" for the
+rare case where the last exchange was fully resolved and the user said
+something like "thanks, that's all".
+If multiple items are outstanding, list only the ones NOT yet completed.
+Continuation should pick up exactly here. Examples:
 "User asked: 'Now refactor the auth module to use JWT instead of sessions'"
+"User asked: 'Waarom stond provider ineens op openrouter?' — needs investigation + answer"
+"User chose option A; awaiting implementation of step 2"
+If the user's most recent message was a reverse signal (stop, undo, roll
+back, never mind, just verify, change of topic) that supersedes earlier
+work, write the reverse signal verbatim and DO NOT carry forward the
+cancelled task. Example: "User asked: 'Stop the i18n refactor and just
+verify the current diff' — earlier i18n in-flight work is cancelled."
 If no outstanding task exists, write "None."]

 ## Goal
@ -1029,7 +1352,7 @@ PREVIOUS SUMMARY:
 NEW TURNS TO INCORPORATE:
 {content_to_summarize}

-Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete. CRITICAL: Update "## Active Task" to reflect the user's most recent unfulfilled request — this is the most important field for task continuity.
+Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete. CRITICAL: Update "## Active Task" to reflect the user's most recent unfulfilled input — this includes any question, decision request, or discussion turn that the assistant has not yet answered. Only write "None" if the last exchange was fully resolved.

 {_template_sections}"""
        else:
@ -1193,9 +1516,16 @@ The user has requested that this compaction PRIORITISE preserving all informatio

     @staticmethod
     def _strip_summary_prefix(summary: str) -> str:
-        """Return summary body without the current or legacy handoff prefix."""
+        """Return summary body without the current, legacy, or any historical
+        handoff prefix.
+
+        Historical prefixes must be stripped too: a handoff persisted under an
+        older prefix can be inherited into a resumed lineage (#35344), and if we
+        only re-prepend the current prefix without removing the old one, the
+        stale directive it carried stays embedded in the body.
+        """
+        # A falsy ``summary`` is treated as an empty string.
         text = (summary or "").strip()
+        # Prefixes are tried in order; only the first one that matches is
+        # removed, and the remainder is left-stripped before returning.
-        for prefix in (SUMMARY_PREFIX, LEGACY_SUMMARY_PREFIX):
+        for prefix in (SUMMARY_PREFIX, LEGACY_SUMMARY_PREFIX, *_HISTORICAL_SUMMARY_PREFIXES):
             if text.startswith(prefix):
                 return text[len(prefix):].lstrip()
         return text
@ -1209,7 +1539,9 @@ The user has requested that this compaction PRIORITISE preserving all informatio
     @staticmethod
     def _is_context_summary_content(content: Any) -> bool:
+        """Return True when ``content`` starts with a known summary prefix.
+
+        The text form of ``content`` is left-stripped and then compared against
+        the current, legacy, and historical handoff prefixes.
+        """
         text = _content_text_for_contains(content).lstrip()
-        return text.startswith(SUMMARY_PREFIX) or text.startswith(LEGACY_SUMMARY_PREFIX)
+        if text.startswith(SUMMARY_PREFIX) or text.startswith(LEGACY_SUMMARY_PREFIX):
+            return True
+        return any(text.startswith(p) for p in _HISTORICAL_SUMMARY_PREFIXES)

    @classmethod
    def _find_latest_context_summary(
@ -1608,9 +1940,9 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        #   True  → ABORT compression entirely. Return messages unchanged
        #           and set _last_compress_aborted=True so callers can warn
        #           the user and stop the auto-compress retry loop.
-        #   False → Fall through to the legacy fallback path below: insert
-        #           a static "summary unavailable" placeholder and drop the
-        #           middle window.  Records _last_summary_fallback_used /
+        #   False → Fall through to the default fallback path below: insert
+        #           a deterministic "summary unavailable" handoff and drop
+        #           the middle window.  Records _last_summary_fallback_used /
        #           _last_summary_dropped_count for gateway hygiene to
        #           surface a warning.
        # Default is False (historical behavior).
@ -1643,21 +1975,18 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                    )
            compressed.append(msg)

-        # Legacy fallback path: LLM summary failed and abort_on_summary_failure
-        # is False (the default).  Insert a static placeholder so the model
-        # knows context was lost rather than silently dropping everything.
+        # If LLM summary failed, insert a deterministic fallback so the model
+        # gets at least locally recoverable continuity anchors instead of a
+        # content-free "N messages were removed" marker.
        if not summary:
            if not self.quiet_mode:
-                logger.warning("Summary generation failed — inserting static fallback context marker")
+                logger.warning("Summary generation failed — inserting deterministic fallback context summary")
            n_dropped = compress_end - compress_start
            self._last_summary_dropped_count = n_dropped
            self._last_summary_fallback_used = True
-            summary = (
-                f"{SUMMARY_PREFIX}\n"
-                f"Summary generation was unavailable. {n_dropped} message(s) were "
-                f"removed to free context space but could not be summarized. The removed "
-                f"messages contained earlier work in this session. Continue based on the "
-                f"recent messages below and the current state of any files or resources."
+            summary = self._build_static_fallback_summary(
+                turns_to_summarize,
+                reason=self._last_summary_error,
            )

        _merge_summary_into_tail = False
--- a/agent/context_engine.py
+++ b/agent/context_engine.py
@ -115,6 +115,15 @@ class ContextEngine(ABC):
        """
        return False

+    def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
+        """Report whether preflight should defer to recent real usage.
+
+        The base implementation never defers: it ignores ``rough_tokens`` and
+        always returns ``False``.  The built-in compressor relies on this hook
+        to avoid re-compacting from known-noisy rough estimates once a
+        compressed request has already fit; third-party engines can safely
+        leave it as is.
+        """
+        return False
+
    # -- Optional: manual /compress preflight ------------------------------

    def has_content_to_compress(self, messages: List[Dict[str, Any]]) -> bool:
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -34,13 +34,33 @@ import tempfile
 import uuid
 from datetime import datetime
 from pathlib import Path
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional, Tuple

 from agent.model_metadata import estimate_request_tokens_rough

 logger = logging.getLogger(__name__)


+def _compression_lock_holder(agent: Any) -> str:
+    """Return a fresh holder id for one compression-lock acquisition.
+
+    The id has four ``:``-separated fields: ``pid=``, ``tid=``, ``agent=``
+    and ``nonce=``.  The process and thread ids make the holder readable in
+    diagnostics and log lines, ``id(agent)`` (in hex) tells co-resident agent
+    instances apart, and the 8-hex-character random nonce keeps repeated
+    acquisitions by the same agent on the same thread distinct.
+    """
+    import threading
+
+    fields = (
+        f"pid={os.getpid()}",
+        f"tid={threading.get_ident()}",
+        f"agent={id(agent):x}",
+        f"nonce={uuid.uuid4().hex[:8]}",
+    )
+    return ":".join(fields)
+
+
 def check_compression_model_feasibility(agent: Any) -> None:
    """Warn at session start if the auxiliary compression model's context
    window is smaller than the main model's compression threshold.
@ -305,6 +325,103 @@ def compress_context(
        "🗜️ Compacting context — summarizing earlier conversation so I can continue..."
    )

+    # ── Compression lock ────────────────────────────────────────────────
+    # Atomic, state.db-backed lock per session_id.  Without this, two
+    # AIAgent instances that share the same session_id (most commonly the
+    # parent-turn agent and its background-review fork — see
+    # ``agent/background_review.py``: ``review_agent.session_id =
+    # agent.session_id``) can each call compress() on overlapping
+    # snapshots of the same conversation.  Both succeed, both rotate
+    # ``agent.session_id`` to a fresh id, both create child sessions in
+    # state.db parented to the same old id.  The gateway's SessionEntry
+    # only catches one rotation, so the other child becomes an orphan
+    # that silently accumulates writes — Damien's repro shape.
+    #
+    # Acquire keyed on the OLD session_id (the rotation target's parent),
+    # because that's the id that competing paths see and read from
+    # SessionEntry at the start of their own compression attempt.
+    #
+    # If we can't acquire the lock, another path is mid-compression on
+    # this session.  Aborting is correct: the messages are unchanged, the
+    # other path's rotation will produce the canonical new session_id,
+    # and our caller's auto-compress loop sees ``len(returned) == len(input)``
+    # and stops retrying for this cycle. The session is NOT corrupted —
+    # we just sit out this round and let the winner finish.
+    _lock_db = getattr(agent, "_session_db", None)
+    _lock_sid = agent.session_id or ""
+    _lock_holder: Optional[str] = None
+    # Probe whether the lock subsystem is actually available on this
+    # SessionDB instance.  A process running mismatched module versions
+    # (e.g. ``conversation_compression.py`` reloaded after a pull but the
+    # long-lived ``hermes_state.SessionDB`` class still bound to the
+    # pre-#34351 version in memory) has the call site but not the method.
+    # In that case ``try_acquire_compression_lock`` raises AttributeError —
+    # NOT a ``sqlite3.Error`` — so the method's own fail-open guard never
+    # runs and the exception propagates to the outer agent loop, which
+    # prints the error and retries.  Because compression never succeeds,
+    # the token count never drops and the loop re-triggers compaction
+    # forever (the "API call #47/#48/#49 ... has no attribute
+    # try_acquire_compression_lock" spin).  Fail OPEN here: if the lock
+    # subsystem is missing or broken in any unexpected way, skip locking
+    # and proceed with compression.  Skipping the lock risks a rare
+    # concurrent-compression session fork; an infinite no-progress loop
+    # that never compresses at all is strictly worse.
+    if _lock_db is not None and _lock_sid:
+        _lock_holder = _compression_lock_holder(agent)
+        try:
+            _lock_acquired = _lock_db.try_acquire_compression_lock(
+                _lock_sid, _lock_holder
+            )
+        except Exception as _lock_err:
+            # Broken/absent lock subsystem (version skew, etc.).  Log once
+            # per session and proceed WITHOUT the lock rather than letting
+            # the exception spin the outer loop.
+            _lock_holder = None  # we don't own anything to release
+            if getattr(agent, "_last_compression_lock_error_sid", None) != _lock_sid:
+                agent._last_compression_lock_error_sid = _lock_sid
+                logger.warning(
+                    "compression lock subsystem unavailable for session=%s "
+                    "(%s: %s) — proceeding without lock. This usually means a "
+                    "stale in-memory module after an update; restart the "
+                    "process (or `hermes update`) to resync.",
+                    _lock_sid, type(_lock_err).__name__, _lock_err,
+                )
+            _lock_acquired = True  # treat as acquired-but-unlocked; proceed
+        if not _lock_acquired:
+            try:
+                existing = _lock_db.get_compression_lock_holder(_lock_sid)
+            except Exception:
+                existing = None
+            logger.warning(
+                "compression skipped: another path is compressing session=%s "
+                "(holder=%s) — returning messages unchanged to avoid session fork",
+                _lock_sid, existing,
+            )
+            _lock_holder = None  # don't release a lock we don't own
+            # Surface to the user once — quiet for downstream auto-compress loops
+            if getattr(agent, "_last_compression_lock_warning_sid", None) != _lock_sid:
+                agent._last_compression_lock_warning_sid = _lock_sid
+                try:
+                    agent._emit_warning(
+                        "⚠ Skipping concurrent compression — another path "
+                        "is already compressing this session. Will retry "
+                        "after it finishes."
+                    )
+                except Exception:
+                    pass
+            _existing_sp = getattr(agent, "_cached_system_prompt", None)
+            if not _existing_sp:
+                _existing_sp = agent._build_system_prompt(system_message)
+            return messages, _existing_sp
+
+    def _release_lock() -> None:
+        """Best-effort release of the lock keyed on the OLD (pre-rotation) session id.
+
+        Does nothing when there is no session DB, no session id, or no holder
+        id.  A failing release call is logged at debug level and swallowed.
+        """
+        if _lock_db is None or not _lock_sid or not _lock_holder:
+            return
+        try:
+            _lock_db.release_compression_lock(_lock_sid, _lock_holder)
+        except Exception as release_error:
+            logger.debug("compression lock release failed: %s", release_error)
+
    # Notify external memory provider before compression discards context
    if agent._memory_manager:
        try:
@ -318,6 +435,11 @@ def compress_context(
        # Plugin context engine with strict signature that doesn't accept
        # focus_topic / force — fall back to calling without them.
        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens)
+    except BaseException:
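+        # ANY exception during compress() must release the lock so the
+        # session isn't permanently blocked from future compression.
+        # NOTE(review): this clause only guards the ``try`` body.  If the
+        # fallback compress() call above sits inside a sibling ``except``
+        # handler, an exception raised there bypasses this clause and the
+        # lock is not released here — confirm, and wrap that call if so.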
+        _release_lock()
+        raise

    # If compression aborted (aux LLM failed to produce a usable summary)
    # the compressor returns the input messages unchanged.  Surface the
@ -336,6 +458,7 @@ def compress_context(
        _existing_sp = getattr(agent, "_cached_system_prompt", None)
        if not _existing_sp:
            _existing_sp = agent._build_system_prompt(system_message)
+        _release_lock()  # compression aborted — no rotation will happen
        return messages, _existing_sp

    summary_error = getattr(agent.context_compressor, "_last_summary_error", None)
@ -452,19 +575,18 @@ def compress_context(
            force=True,
        )

-    # Update token estimate after compaction so pressure calculations
-    # use the post-compression count, not the stale pre-compression one.
-    # Use estimate_request_tokens_rough() so tool schemas are included —
-    # with 50+ tools enabled, schemas alone can add 20-30K tokens, and
-    # omitting them delays the next compression cycle far past the
-    # configured threshold (issue #14695).
+    # Keep the post-compression rough estimate for diagnostics, but do not
+    # treat it as provider-reported prompt usage. Schema-heavy rough estimates
+    # can remain above threshold even after the next real API request fits.
    _compressed_est = estimate_request_tokens_rough(
        compressed,
        system_prompt=new_system_prompt or "",
        tools=agent.tools or None,
    )
-    agent.context_compressor.last_prompt_tokens = _compressed_est
+    agent.context_compressor.last_compression_rough_tokens = _compressed_est
+    agent.context_compressor.last_prompt_tokens = -1
    agent.context_compressor.last_completion_tokens = 0
+    agent.context_compressor.awaiting_real_usage_after_compression = True

    # Clear the file-read dedup cache.  After compression the original
    # read content is summarised away — if the model re-reads the same
@ -476,10 +598,16 @@ def compress_context(
        pass

    logger.info(
-        "context compression done: session=%s messages=%d->%d tokens=~%s",
+        "context compression done: session=%s messages=%d->%d rough_tokens=~%s awaiting_real_usage=true",
        agent.session_id or "none", _pre_msg_count, len(compressed),
        f"{_compressed_est:,}",
    )
+    # Release the lock on the OLD session_id only AFTER rotation completed
+    # and all post-rotation bookkeeping (memory manager, context engine,
+    # file dedup) ran. A concurrent path that wakes up the moment we
+    # release will see the NEW session_id in state.db / SessionEntry and
+    # acquire on that — no race against our just-finished work.
+    _release_lock()
    return compressed, new_system_prompt


--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -27,8 +27,6 @@ import time
 import uuid
 from typing import Any, Dict, List, Optional

-from agent.anthropic_adapter import _is_oauth_token
-from agent.auxiliary_client import set_runtime_main
 from agent.codex_responses_adapter import _summarize_user_message_for_log
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
@ -53,20 +51,13 @@ from agent.model_metadata import (
    parse_available_output_tokens_from_error,
    save_context_length,
 )
-from agent.nous_rate_guard import (
-    clear_nous_rate_limit,
-    is_genuine_nous_rate_limit,
-    nous_rate_limit_remaining,
-    record_nous_rate_limit,
-)
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
 from agent.retry_utils import jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
-from hermes_constants import display_hermes_home as _dhh_fn, PARTIAL_STREAM_STUB_ID
+from hermes_constants import PARTIAL_STREAM_STUB_ID
 from hermes_logging import set_session_context
-from tools.schema_sanitizer import strip_pattern_and_format
 from tools.skill_provenance import set_current_write_origin
 from utils import base_url_host_matches, env_var_enabled

@ -212,15 +203,13 @@ def _print_billing_or_entitlement_guidance(
 def _try_refresh_nous_paid_entitlement_credentials(agent) -> bool:
     """Refresh Nous runtime credentials after a fresh paid-entitlement check."""
     try:
-        from hermes_cli.auth import NOUS_INFERENCE_AUTH_MODE_LEGACY
         from hermes_cli.nous_account import get_nous_portal_account_info

         account_info = get_nous_portal_account_info(force_fresh=True)
+        # Only an explicit ``True`` counts as paid access; ``None`` or any
+        # other value skips the refresh.
         if account_info.paid_service_access is not True:
             return False
         return agent._try_refresh_nous_client_credentials(
-            force=False,
-            inference_auth_mode=NOUS_INFERENCE_AUTH_MODE_LEGACY,
+            force=True,
         )
     except Exception:
+        # Best-effort: any failure is reported as "not refreshed".
         return False
@ -403,13 +392,15 @@ def run_conversation(
        set_runtime_main(
            getattr(agent, "provider", "") or "",
            getattr(agent, "model", "") or "",
+            base_url=getattr(agent, "base_url", "") or "",
+            api_key=getattr(agent, "api_key", "") or "",
+            api_mode=getattr(agent, "api_mode", "") or "",
        )
    except Exception:
        pass

    # Tag all log records on this thread with the session ID so
    # ``hermes logs --session <id>`` can filter a single conversation.
-    from hermes_logging import set_session_context
    set_session_context(agent.session_id)

    # Bind the skill write-origin ContextVar for this thread so tool
@ -418,7 +409,6 @@ def run_conversation(
    # a foreground user-directed turn. Set at the top of each call;
    # the review fork runs on its own thread with a fresh context,
    # so the foreground value here does not leak into it.
-    from tools.skill_provenance import set_current_write_origin
    set_current_write_origin(getattr(agent, "_memory_write_origin", "assistant_tool"))

    # If the previous turn activated fallback, restore the primary
@ -613,18 +603,50 @@ def run_conversation(
            system_prompt=active_system_prompt or "",
            tools=agent.tools or None,
        )
+        _compressor = agent.context_compressor
+        _defer_preflight = getattr(
+            _compressor,
+            "should_defer_preflight_to_real_usage",
+            lambda _tokens: False,
+        )
+        _preflight_deferred = _defer_preflight(_preflight_tokens)

-        if agent.context_compressor.should_compress(_preflight_tokens):
+        if not _preflight_deferred:
+            # Keep the CLI/ACP context display in sync with what preflight
+            # actually measured.  The status bar reads
+            # ``compressor.last_prompt_tokens``, which otherwise only updates
+            # from a *successful* API response.  When the conversation has grown
+            # since the last successful call — or when compression then fails
+            # (e.g. the auxiliary summary model times out) and no fresh usage
+            # arrives — the bar stays stuck at the old, smaller value while
+            # preflight reports a much larger number, looking out of sync.
+            # Seed it with the fresh estimate (only ever revising upward; a real
+            # ``update_from_response`` will correct it after the next API call).
+            # Skipped when deferring — a deferred estimate is known to over-count
+            # vs the last real provider prompt, so trusting it for the display
+            # would re-introduce the very desync we're avoiding.
+            if _preflight_tokens > (_compressor.last_prompt_tokens or 0):
+                _compressor.last_prompt_tokens = _preflight_tokens
+
+        if _preflight_deferred:
+            logger.info(
+                "Skipping preflight compression: rough estimate ~%s >= %s, "
+                "but last real provider prompt was %s after compression",
+                f"{_preflight_tokens:,}",
+                f"{_compressor.threshold_tokens:,}",
+                f"{_compressor.last_real_prompt_tokens:,}",
+            )
+        elif _compressor.should_compress(_preflight_tokens):
            logger.info(
                "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
                f"{_preflight_tokens:,}",
-                f"{agent.context_compressor.threshold_tokens:,}",
+                f"{_compressor.threshold_tokens:,}",
                agent.model,
-                f"{agent.context_compressor.context_length:,}",
+                f"{_compressor.context_length:,}",
            )
            agent._emit_status(
                f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
-                f">= {agent.context_compressor.threshold_tokens:,} threshold. "
+                f">= {_compressor.threshold_tokens:,} threshold. "
                "This may take a moment."
            )
            # May need multiple passes for very large sessions with small
@ -659,8 +681,8 @@ def run_conversation(
                    system_prompt=active_system_prompt or "",
                    tools=agent.tools or None,
                )
-                if _preflight_tokens < agent.context_compressor.threshold_tokens:
-                    break  # Under threshold
+                if not _compressor.should_compress(_preflight_tokens):
+                    break  # Under threshold or anti-thrash guard stopped it

    # Plugin hook: pre_llm_call
    # Fired once per turn before the tool-calling loop.  Plugins can
@ -1470,7 +1492,8 @@ def run_conversation(
                    
                    if retry_count >= max_retries:
                        # Try fallback before giving up
-                        agent._buffer_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
+                        if agent._has_pending_fallback():
+                            agent._buffer_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
                        if agent._try_activate_fallback():
                            retry_count = 0
                            compression_attempts = 0
@ -3072,12 +3095,17 @@ def run_conversation(
                ) and not is_context_length_error

                if is_client_error:
-                    # Try fallback before aborting — a different provider
-                    # may not have the same issue (rate limit, auth, etc.)
-                    if classified.reason == FailoverReason.content_policy_blocked:
-                        agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
-                    else:
-                        agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
+                    # Try fallback before aborting — a different provider may
+                    # not have the same issue (rate limit, auth, etc.). Only
+                    # announce the attempt when a fallback chain actually
+                    # exists; otherwise "trying fallback..." is a lie and the
+                    # session looks like it's recovering when it's about to
+                    # abort silently (#35314, #17446).
+                    if agent._has_pending_fallback():
+                        if classified.reason == FailoverReason.content_policy_blocked:
+                            agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
+                        else:
+                            agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -3220,7 +3248,8 @@ def run_conversation(
                        retry_count = 0
                        continue
                    # Try fallback before giving up entirely
-                    agent._buffer_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
+                    if agent._has_pending_fallback():
+                        agent._buffer_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -3875,6 +3904,11 @@ def run_conversation(
                    # inflate completion_tokens with reasoning,
                    # causing premature compression.  (#12026)
                    _real_tokens = _compressor.last_prompt_tokens
+                elif _compressor.last_prompt_tokens == -1:
+                    # Compression just ran and no API-reported prompt count
+                    # has arrived yet. Avoid treating a schema-heavy rough
+                    # post-compression estimate as real context pressure.
+                    _real_tokens = 0
                else:
                    # Include tool schemas — with 50+ tools enabled
                    # these add 20-30K tokens the messages-only
@ -4314,36 +4348,54 @@ def run_conversation(
            )
        final_response = agent._handle_max_iterations(messages, api_call_count)

-        # If running as a kanban worker, block the task so the dispatcher
-        # knows the worker could not complete (rather than treating it as a
+        # If running as a kanban worker, signal the dispatcher that the
+        # worker could not complete (rather than treating it as a
        # protocol violation).  The agent loop strips tools before calling
        # _handle_max_iterations, so the model cannot call kanban_block
        # itself — we must do it on its behalf.
+        #
+        # We route through ``_record_task_failure(outcome="timed_out")``
+        # rather than ``kanban_block`` so this counts toward the
+        # ``consecutive_failures`` counter and the dispatcher's
+        # ``failure_limit`` circuit breaker (#29747 gap 2).  Without this,
+        # a task whose worker keeps exhausting its budget would block
+        # silently each run, get auto-promoted by the operator (or never
+        # surface), and re-block in an endless loop with no signal.
        _kanban_task = os.environ.get("HERMES_KANBAN_TASK")
        if _kanban_task:
            try:
-                _ra().handle_function_call(
-                    "kanban_block",
-                    {
-                        "task_id": _kanban_task,
-                        "reason": (
+                from hermes_cli import kanban_db as _kb
+                _conn = _kb.connect()
+                try:
+                    _kb._record_task_failure(
+                        _conn,
+                        _kanban_task,
+                        error=(
                            f"Iteration budget exhausted "
                            f"({api_call_count}/{agent.max_iterations}) — "
                            "task could not complete within the allowed "
                            "iterations"
                        ),
-                    },
-                    task_id=effective_task_id,
-                )
-                logger.info(
-                    "kanban_block called for task %s after iteration "
-                    "exhaustion (%d/%d)",
-                    _kanban_task, api_call_count, agent.max_iterations,
-                )
+                        outcome="timed_out",
+                        release_claim=True,
+                        end_run=True,
+                        event_payload_extra={
+                            "budget_used": api_call_count,
+                            "budget_max": agent.max_iterations,
+                        },
+                    )
+                    logger.info(
+                        "recorded budget-exhausted failure for task %s (%d/%d)",
+                        _kanban_task, api_call_count, agent.max_iterations,
+                    )
+                finally:
+                    try:
+                        _conn.close()
+                    except Exception:
+                        pass
            except Exception:
                logger.warning(
-                    "Failed to call kanban_block after iteration "
-                    "exhaustion for task %s",
+                    "Failed to record budget-exhausted failure for task %s",
                    _kanban_task,
                    exc_info=True,
                )
@ -4438,6 +4490,55 @@ def run_conversation(
        except Exception as _ver_err:
            logger.debug("file-mutation verifier footer failed: %s", _ver_err)

+    # Turn-completion explainer.
+    # When a turn ends abnormally after substantive work — empty content
+    # after retries, a partial/truncated stream, a still-pending tool
+    # result, or an iteration/budget limit — the user otherwise gets a
+    # blank or fragmentary response box with no consolidated reason why
+    # the agent stopped (#34452).  Surface a single user-visible
+    # explanation derived from ``_turn_exit_reason``, mirroring the
+    # file-mutation verifier footer pattern above.
+    #
+    # Gate carefully so healthy turns stay quiet:
+    #   - ``text_response(...)`` exits never produce an explanation
+    #     (handled inside the formatter), so a terse ``Done.`` is silent.
+    #   - We only ACT when there is no genuinely usable reply this turn:
+    #     an empty response, the "(empty)" terminal sentinel, or a
+    #     suspiciously short partial fragment with no terminating
+    #     punctuation (e.g. "The").  A real short answer keeps its text.
+    if not interrupted:
+        try:
+            if agent._turn_completion_explainer_enabled():
+                _stripped = (final_response or "").strip()
+                _is_empty_terminal = _stripped == "" or _stripped == "(empty)"
+                # A short fragment that is not a normal text_response exit
+                # and lacks sentence-ending punctuation is treated as a
+                # truncated partial (the "The" case from #34452).
+                _is_partial_fragment = (
+                    not _is_empty_terminal
+                    and not str(_turn_exit_reason).startswith("text_response")
+                    and len(_stripped) <= 24
+                    and _stripped[-1:] not in {".", "!", "?", "。", "！", "？", "`", ")"}
+                )
+                if _is_empty_terminal or _is_partial_fragment:
+                    _explanation = agent._format_turn_completion_explanation(
+                        _turn_exit_reason
+                    )
+                    if _explanation:
+                        if _is_empty_terminal:
+                            # Replace the bare "(empty)"/blank sentinel with
+                            # the actionable explanation.
+                            final_response = _explanation
+                        else:
+                            # Keep the partial fragment, append the reason so
+                            # the user sees both what arrived and why it
+                            # stopped.
+                            final_response = (
+                                _stripped + "\n\n" + _explanation
+                            )
+        except Exception as _exp_err:
+            logger.debug("turn-completion explainer failed: %s", _exp_err)
+
    _response_transformed = False

    # Plugin hook: transform_llm_output
@ -4561,6 +4662,7 @@ def run_conversation(
        original_user_message=original_user_message,
        final_response=final_response,
        interrupted=interrupted,
+        messages=messages,
    )

    # Background memory/skill review — runs AFTER the response is delivered
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@ -14,7 +14,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
-from hermes_cli.config import get_env_value, load_env
+from hermes_cli.config import load_env
 from agent.credential_persistence import (
    is_borrowed_credential_source,
    sanitize_borrowed_credential_payload,
@ -22,7 +22,6 @@ from agent.credential_persistence import (
 import hermes_cli.auth as auth_mod
 from hermes_cli.auth import (
    CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
-    DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
    PROVIDER_REGISTRY,
    _auth_store_lock,
    _codex_access_token_is_expiring,
@ -55,6 +54,38 @@ def _load_config_safe() -> Optional[dict]:

 STATUS_OK = "ok"
 STATUS_EXHAUSTED = "exhausted"
+# Terminal failure — the credential will never recover on its own.  Used for
+# upstream-permanent OAuth states like ``token_invalidated`` / ``token_revoked``
+# where retrying after a TTL cooldown is guaranteed to fail.  ``DEAD`` entries
+# are excluded from rotation unconditionally and only clear when an explicit
+# write-side sync (e.g. ``_save_codex_tokens`` after a fresh device-code
+# login) rewrites the tokens.
+STATUS_DEAD = "dead"
+
+# OAuth error reasons that indicate the credential is permanently invalid
+# server-side and cannot be recovered by retry/refresh.  Sourced from
+# OpenAI Codex Responses API, Anthropic, xAI, and Google OAuth spec.
+_TERMINAL_AUTH_REASONS = frozenset({
+    "token_invalidated",   # OpenAI Codex: "Your authentication token has been invalidated."
+    "token_revoked",        # OAuth 2.0 RFC 7009: token explicitly revoked
+    "invalid_token",        # RFC 6750: malformed/expired/revoked. NOTE(review): "expired" is refreshable; confirm it belongs here
+    "invalid_grant",        # RFC 6749: refresh_token rejected during refresh
+    "unauthorized_client",  # RFC 6749: client no longer authorized
+    "refresh_token_reused", # Single-use refresh token consumed by another process
+})
+
+# How long a DEAD manual credential is preserved before being pruned.
+# Manual entries (``manual:*``) are independent credentials with no singleton
+# to re-seed from, so pruning them after a quiet window cleans up dead state
+# without losing recoverability — the user always has the option to re-add
+# via ``hermes auth add``.
+#
+# Singleton-seeded entries (``device_code``, ``loopback_pkce``, ``claude_code``)
+# are NOT pruned because ``_seed_from_singletons`` would just re-create them
+# on the next ``load_pool()`` with the same stale singleton tokens, defeating
+# the cleanup.  They remain in the pool marked DEAD until an explicit re-auth
+# write-side sync (``_save_codex_tokens`` etc.) clears the status.
+DEAD_MANUAL_PRUNE_TTL_SECONDS = 24 * 60 * 60  # 24 hours

 AUTH_TYPE_OAUTH = "oauth"
 AUTH_TYPE_API_KEY = "api_key"
@ -171,8 +202,22 @@ class PooledCredential:
    def runtime_api_key(self) -> str:
        if self.provider == "nous":
            # Nous stores the runtime inference credential in agent_key for
-            # compatibility. It may be a NAS invoke JWT or legacy opaque key.
-            return str(self.agent_key or self.access_token or "")
+            # compatibility. It must be a NAS invoke JWT.
+            for token, expires_at in (
+                (self.agent_key, self.agent_key_expires_at),
+                (self.access_token, self.expires_at),
+            ):
+                if (
+                    isinstance(token, str)
+                    and token.strip()
+                    and auth_mod._nous_invoke_jwt_is_usable(
+                        token,
+                        scope=getattr(self, "scope", None),
+                        expires_at=expires_at,
+                    )
+                ):
+                    return token.strip()
+            return ""
        return str(self.access_token or "")

    @property
@ -438,6 +483,29 @@ class CredentialPool:
            [entry.to_dict() for entry in self._entries],
        )

+    def _is_terminal_auth_failure(
+        self,
+        status_code: Optional[int],
+        normalized_error: Dict[str, Any],
+    ) -> bool:
+        """Detect upstream-permanent OAuth failures that won't recover on TTL.
+
+        Only fires for 401 responses whose error ``reason`` matches a known
+        terminal OAuth state (token_invalidated, token_revoked, invalid_grant,
+        etc.).  Distinguishes permanent failures from transient ones like
+        token_expired (refreshable) or generic 401 without a specific reason
+        (could be a server-side glitch worth retrying).
+
+        Returns False for non-401 status codes — 429 rate limits and 402
+        billing failures are transient by nature and should keep TTL semantics.
+        """
+        if status_code != 401:
+            return False
+        reason = normalized_error.get("reason")
+        if not isinstance(reason, str):
+            return False
+        return reason.strip().lower() in _TERMINAL_AUTH_REASONS
+
    def _mark_exhausted(
        self,
        entry: PooledCredential,
@ -445,9 +513,20 @@ class CredentialPool:
        error_context: Optional[Dict[str, Any]] = None,
    ) -> PooledCredential:
        normalized_error = _normalize_error_context(error_context)
+        # Permanent OAuth failures (token_invalidated, token_revoked, etc.)
+        # transition to STATUS_DEAD instead of STATUS_EXHAUSTED.  Without this,
+        # a revoked credential gets a 1-hour TTL cooldown and then re-enters
+        # rotation, failing immediately every hour until the user manually
+        # removes it (issue #32849).  DEAD entries are excluded from rotation
+        # unconditionally and only clear via an explicit re-auth write-side
+        # sync (``_save_codex_tokens`` after a fresh device-code login).
+        if self._is_terminal_auth_failure(status_code, normalized_error):
+            terminal_status = STATUS_DEAD
+        else:
+            terminal_status = STATUS_EXHAUSTED
        updated = replace(
            entry,
-            last_status=STATUS_EXHAUSTED,
+            last_status=terminal_status,
            last_status_at=time.time(),
            last_error_code=status_code,
            last_error_reason=normalized_error.get("reason"),
@ -852,12 +931,7 @@ class CredentialPool:
                if synced is not entry:
                    entry = synced
                auth_mod.resolve_nous_runtime_credentials(
-                    min_key_ttl_seconds=DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
-                    inference_auth_mode=(
-                        auth_mod.NOUS_INFERENCE_AUTH_MODE_LEGACY
-                        if force
-                        else auth_mod.NOUS_INFERENCE_AUTH_MODE_AUTO
-                    ),
+                    force_refresh=force,
                )
                updated = self._sync_nous_entry_from_auth_store(entry)
            else:
@ -1139,7 +1213,7 @@ class CredentialPool:
                auth_mod.XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
            )
        if self.provider == "nous":
-            # Nous refresh/mint can require network access and should happen when
+            # Nous refresh can require network access and should happen when
            # runtime credentials are actually resolved, not merely when the pool
            # is enumerated for listing, migration, or selection.
            return False
@ -1158,13 +1232,14 @@ class CredentialPool:
        """
        now = time.time()
        cleared_any = False
+        entries_to_prune: List[str] = []
        available: List[PooledCredential] = []
        for entry in self._entries:
            # For anthropic claude_code entries, sync from the credentials file
            # before any status/refresh checks. This picks up tokens refreshed
            # by other processes (Claude Code CLI, other Hermes profiles).
            if (self.provider == "anthropic" and entry.source == "claude_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_anthropic_entry_from_credentials_file(entry)
                if synced is not entry:
                    entry = synced
@ -1175,7 +1250,7 @@ class CredentialPool:
            # exhausted status stale.
            if (self.provider == "nous"
                    and entry.source == "device_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_nous_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
@ -1187,7 +1262,7 @@ class CredentialPool:
            # future for ChatGPT weekly windows).
            if (self.provider == "openai-codex"
                    and entry.source == "device_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_codex_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
@ -1198,11 +1273,41 @@ class CredentialPool:
            # xAI Grok OAuth login) has since rotated in auth.json.
            if (self.provider == "xai-oauth"
                    and entry.source == "loopback_pkce"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_xai_oauth_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
                    cleared_any = True
+            if entry.last_status == STATUS_DEAD:
+                # Manual DEAD credentials get pruned after a 24h quiet window
+                # so the pool doesn't accumulate dead entries forever.  The
+                # user can always re-add via ``hermes auth add``.  Singleton-
+                # seeded DEAD entries are kept so the audit trail (label,
+                # last_error_reason, timestamps) stays visible — pruning them
+                # would just be undone by ``_seed_from_singletons`` on the
+                # next load anyway.
+                if _is_manual_source(entry.source):
+                    dead_at = entry.last_status_at or 0
+                    if dead_at and now - dead_at > DEAD_MANUAL_PRUNE_TTL_SECONDS:
+                        _label = entry.label or entry.id[:8]
+                        logger.warning(
+                            "credential pool: pruning DEAD manual entry %s "
+                            "(reason=%s, age=%.1fh) — re-add via `hermes auth add %s`",
+                            _label,
+                            entry.last_error_reason or "unknown",
+                            (now - dead_at) / 3600.0,
+                            self.provider,
+                        )
+                        # Mark for removal after the loop completes; we can't
+                        # mutate self._entries while iterating.
+                        entries_to_prune.append(entry.id)
+                        cleared_any = True
+                # Permanently failed credentials never re-enter rotation via
+                # TTL.  They only clear when a write-side re-auth sync rewrites
+                # the tokens (e.g. ``_save_codex_tokens`` after a fresh
+                # device-code login).  The auth.json-sync paths above handle
+                # the re-auth case for OAuth singletons.
+                continue
            if entry.last_status == STATUS_EXHAUSTED:
                exhausted_until = _exhausted_until(entry)
                if exhausted_until is not None and now < exhausted_until:
@ -1226,6 +1331,9 @@ class CredentialPool:
                    continue
                entry = refreshed
            available.append(entry)
+        if entries_to_prune:
+            pruned_ids = set(entries_to_prune)
+            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
            self._persist()
        return available
@ -1293,11 +1401,22 @@ class CredentialPool:
            if entry is None:
                return None
            _label = entry.label or entry.id[:8]
-            logger.info(
-                "credential pool: marking %s exhausted (status=%s), rotating",
-                _label, status_code,
-            )
            self._mark_exhausted(entry, status_code, error_context)
+            # Re-read the updated entry to log the correct terminal state.
+            updated_entry = next(
+                (e for e in self._entries if e.id == entry.id), entry,
+            )
+            if updated_entry.last_status == STATUS_DEAD:
+                logger.warning(
+                    "credential pool: marking %s DEAD (status=%s, reason=%s) — "
+                    "permanently failed, will NOT re-enter rotation until re-auth",
+                    _label, status_code, updated_entry.last_error_reason or "unknown",
+                )
+            else:
+                logger.info(
+                    "credential pool: marking %s exhausted (status=%s), rotating",
+                    _label, status_code,
+                )
            self._current_id = None
            next_entry = self._select_unlocked()
            if next_entry:
@ -1637,9 +1756,9 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                    "inference_base_url": state.get("inference_base_url"),
                    "agent_key": state.get("agent_key"),
                    "agent_key_expires_at": state.get("agent_key_expires_at"),
-                    # Carry the mint/refresh timestamps into the pool so
+                    # Carry the refresh timestamps into the pool so
                    # freshness-sensitive consumers (self-heal hooks, pool
-                    # pruning by age) can distinguish just-minted credentials
+                    # pruning by age) can distinguish just-refreshed credentials
                    # from stale ones.  Without these, fresh device_code
                    # entries get obtained_at=None and look older than they
                    # are (#15099).
--- a/agent/curator_backup.py
+++ b/agent/curator_backup.py
@ -39,12 +39,9 @@ from __future__ import annotations

 import json
 import logging
-import os
 import re
 import shutil
 import tarfile
-import tempfile
-import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
--- a/agent/file_safety.py
+++ b/agent/file_safety.py
@ -249,6 +249,10 @@ def get_read_block_error(path: str) -> Optional[str]:
        ".env",
        "webhook_subscriptions.json",
        os.path.join("auth", "google_oauth.json"),
+        # Bitwarden Secrets Manager disk cache: stores plaintext secret values
+        # to avoid re-fetching across back-to-back CLI invocations. The file
+        # was introduced by #31968 but not added to this guard.
+        os.path.join("cache", "bws_cache.json"),
    )
    for hd in hermes_dirs:
        for name in credential_file_names:
--- a/agent/google_code_assist.py
+++ b/agent/google_code_assist.py
@ -31,7 +31,6 @@ import json
 import logging
 import time
 import urllib.error
-import urllib.parse
 import urllib.request
 import uuid
 from dataclasses import dataclass, field
--- a/agent/google_oauth.py
+++ b/agent/google_oauth.py
@ -899,7 +899,15 @@ def start_oauth_flow(
        try:
            import webbrowser

-            webbrowser.open(auth_url, new=1, autoraise=True)
+            try:
+                from hermes_cli.auth import (
+                    _can_open_graphical_browser as _can_open_gui,
+                )
+            except Exception:
+                _can_open_gui = lambda: True  # noqa: E731
+
+            if _can_open_gui():
+                webbrowser.open(auth_url, new=1, autoraise=True)
        except Exception as exc:
            logger.debug("webbrowser.open failed: %s", exc)

--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@ -37,6 +37,8 @@ from __future__ import annotations
 import base64
 import logging
 import mimetypes
+import os
+import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

@ -46,6 +48,102 @@ logger = logging.getLogger(__name__)
 _VALID_MODES = frozenset({"auto", "native", "text"})


+# Image extensions used by extract_image_refs(). Kept tight on purpose — we
+# only auto-attach things the model can actually see. Documents/archives are
+# excluded because the gateway's broader extract_local_files() also routes
+# them differently (send_document), and we don't want to attach a PDF as a
+# vision part.
+_IMAGE_EXTS = (
+    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".heic",
+)
+_IMAGE_EXT_PATTERN = "|".join(e.lstrip(".") for e in _IMAGE_EXTS)
+
+# Absolute / home-relative local image path. Matches the same shape gateway's
+# extract_local_files() uses: anchors to ``~/`` or ``/``, ignores matches inside
+# URLs (the ``(?<![/:\w.])`` lookbehind), and case-insensitive on the extension.
+_LOCAL_IMAGE_PATH_RE = re.compile(
+    r"(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:" + _IMAGE_EXT_PATTERN + r")\b",
+    re.IGNORECASE,
+)
+
+# http(s) URL ending in an image extension (optionally followed by a
+# query string). Case-insensitive on the extension. Strict ``http(s)://``
+# scheme so we don't accidentally grab ``file://`` URLs or other shapes.
+# The ``(?!\.?\w)`` lookahead stops the lazy match from truncating a longer
+# name at an embedded extension (``a.pngfoo``, ``a.png.html``) while still
+# allowing trailing prose punctuation (``a.png.`` / ``a.png)``).
+_IMAGE_URL_RE = re.compile(
+    r"https?://[^\s<>\"']+?\.(?:" + _IMAGE_EXT_PATTERN + r")(?!\.?\w)(?:\?[^\s<>\"']*)?",
+    re.IGNORECASE,
+)
+
+
+def extract_image_refs(text: str) -> Tuple[List[str], List[str]]:
+    """Scan free-form text for image references the model should see.
+
+    Returns ``(local_paths, urls)``:
+
+      * ``local_paths`` — absolute (``/``) or home-relative (``~/``) paths
+        whose suffix is an image extension AND whose expanded form exists
+        on disk as a file. Order-preserving, deduplicated.
+      * ``urls`` — ``http(s)://…`` URLs whose path ends in an image
+        extension (a ``?query`` is allowed after the extension).
+        Order-preserving, deduplicated.
+
+    Matches inside fenced code blocks (``` ``` ```) and inline backticks
+    (`` `…` ``) are skipped so that snippets pasted into a task body for
+    reference aren't mistaken for live attachments. This mirrors the
+    behaviour of ``gateway.platforms.base.BaseAdapter.extract_local_files``.
+
+    Local paths are validated against the filesystem; URLs are not
+    (the provider fetches them at request time).
+    """
+    if not isinstance(text, str) or not text:
+        return [], []
+
+    # Build spans covered by fenced code blocks and inline code so we can
+    # ignore references the author embedded purely as example text.
+    code_spans: list[tuple[int, int]] = []
+    for m in re.finditer(r"```[^\n]*\n.*?```", text, re.DOTALL):
+        code_spans.append((m.start(), m.end()))
+    for m in re.finditer(r"`[^`\n]+`", text):
+        code_spans.append((m.start(), m.end()))
+
+    def _in_code(pos: int) -> bool:
+        return any(s <= pos < e for s, e in code_spans)
+
+    local_paths: list[str] = []
+    seen_paths: set[str] = set()
+    for match in _LOCAL_IMAGE_PATH_RE.finditer(text):
+        if _in_code(match.start()):
+            continue
+        raw = match.group(0)
+        expanded = os.path.expanduser(raw)
+        try:
+            if not os.path.isfile(expanded):
+                continue
+        except OSError:
+            # ENAMETOOLONG / EINVAL on pathological inputs — skip rather than crash.
+            continue
+        if expanded in seen_paths:
+            continue
+        seen_paths.add(expanded)
+        local_paths.append(expanded)
+
+    urls: list[str] = []
+    seen_urls: set[str] = set()
+    for match in _IMAGE_URL_RE.finditer(text):
+        if _in_code(match.start()):
+            continue
+        url = match.group(0)
+        # Strip trailing punctuation that's almost certainly prose, not part
+        # of the URL (e.g. "see https://x.com/a.png." or "/a.png)").
+        url = url.rstrip(".,;:!?)]>")
+        if url in seen_urls:
+            continue
+        seen_urls.add(url)
+        urls.append(url)
+
+    return local_paths, urls
+
+
 # Strict YAML/JSON boolean coercion for capability overrides.
 #
 # ``bool("false")`` is True in Python because non-empty strings are truthy, so
@ -320,20 +418,29 @@ def _file_to_data_url(path: Path) -> Optional[str]:
 def build_native_content_parts(
    user_text: str,
    image_paths: List[str],
+    image_urls: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Build an OpenAI-style ``content`` list for a user turn.

    Shape:
      [{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"},
       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
+       {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
       ...]

-    The local path of each successfully attached image is appended to the
-    text part as ``[Image attached at: <path>]``. The model still sees the
-    pixels via the ``image_url`` part (full native vision); the path note
-    just gives it a string handle so MCP/skill tools that take an image
-    path or URL argument can be invoked on the same image without an
-    extra round-trip. This parallels the text-mode hint produced by
+    Local paths are read from disk and embedded as base64 ``data:`` URLs.
+    Remote URLs (``http(s)://``) are passed through verbatim — the provider
+    fetches them server-side. The model still sees the pixels either way.
+
+    For each successfully attached image, a hint is appended to the text
+    part:
+
+      * local path → ``[Image attached at: <path>]``
+      * URL        → ``[Image attached: <url>]``
+
+    The hint gives the model a string handle so MCP/skill tools that take
+    an image path or URL argument can be invoked on the same image without
+    an extra round-trip. This parallels the text-mode hint produced by
    ``Runner._enrich_message_with_vision`` (``vision_analyze using image_url:
    <path>``) so behaviour is consistent across both image input modes.

@ -342,12 +449,14 @@ def build_native_content_parts(
    ceiling), the agent's retry loop transparently shrinks and retries
    once — see ``run_agent._try_shrink_image_parts_in_messages``.

-    Returns (content_parts, skipped_paths). Skipped paths are files that
-    couldn't be read from disk and are NOT advertised in the path hints.
+    Returns (content_parts, skipped). Skipped entries are local paths
+    that couldn't be read from disk; URLs are never skipped (they're
+    not validated here).
    """
    skipped: List[str] = []
    image_parts: List[Dict[str, Any]] = []
    attached_paths: List[str] = []
+    attached_urls: List[str] = []

    for raw_path in image_paths:
        p = Path(raw_path)
@ -364,16 +473,26 @@ def build_native_content_parts(
        })
        attached_paths.append(str(raw_path))

+    for url in image_urls or []:
+        url = (url or "").strip()
+        if not url:
+            continue
+        image_parts.append({
+            "type": "image_url",
+            "image_url": {"url": url},
+        })
+        attached_urls.append(url)
+
    text = (user_text or "").strip()

    # If at least one image attached, build a single text part that combines
-    # the user's caption (or a neutral default) with one path hint per image.
-    if attached_paths:
+    # the user's caption (or a neutral default) with one hint per image.
+    if attached_paths or attached_urls:
        base_text = text or "What do you see in this image?"
-        path_hints = "\n".join(
-            f"[Image attached at: {p}]" for p in attached_paths
-        )
-        combined_text = f"{base_text}\n\n{path_hints}"
+        hint_lines: List[str] = []
+        hint_lines.extend(f"[Image attached at: {p}]" for p in attached_paths)
+        hint_lines.extend(f"[Image attached: {u}]" for u in attached_urls)
+        combined_text = f"{base_text}\n\n" + "\n".join(hint_lines)
        parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}]
        parts.extend(image_parts)
        return parts, skipped
@ -388,4 +507,5 @@ def build_native_content_parts(
 __all__ = [
    "decide_image_input_mode",
    "build_native_content_parts",
+    "extract_image_refs",
 ]
--- a/agent/lsp/cli.py
+++ b/agent/lsp/cli.py
@ -16,7 +16,6 @@ from __future__ import annotations

 import argparse
 import sys
-from typing import Optional


 def register_subparser(subparsers: argparse._SubParsersAction) -> None:
@ -248,19 +247,13 @@ def _cmd_restart() -> int:


 def _cmd_which(server_id: str) -> int:
-    from agent.lsp.install import INSTALL_RECIPES, hermes_lsp_bin_dir
-    import os
-    import shutil as _shutil
+    from agent.lsp.install import INSTALL_RECIPES, _existing_binary

    recipe = INSTALL_RECIPES.get(server_id)
    bin_name = (recipe or {}).get("bin", server_id)
-    staged = hermes_lsp_bin_dir() / bin_name
-    if staged.exists():
-        sys.stdout.write(str(staged) + "\n")
-        return 0
-    on_path = _shutil.which(bin_name)
-    if on_path:
-        sys.stdout.write(on_path + "\n")
+    resolved = _existing_binary(bin_name)
+    if resolved:
+        sys.stdout.write(resolved + "\n")
        return 0
    sys.stderr.write(f"{server_id}: not installed\n")
    return 1
@ -294,11 +287,9 @@ def _backend_warnings() -> list:
    suggestion across common platforms.
    """
    import shutil as _shutil
-    from agent.lsp.install import hermes_lsp_bin_dir
+    from agent.lsp.install import _existing_binary
    notes: list = []
-    bash_installed = _shutil.which("bash-language-server") is not None or (
-        (hermes_lsp_bin_dir() / "bash-language-server").exists()
-    )
+    bash_installed = _existing_binary("bash-language-server") is not None
    if bash_installed and _shutil.which("shellcheck") is None:
        notes.append(
            "bash-language-server is installed but shellcheck is missing — "
--- a/agent/lsp/client.py
+++ b/agent/lsp/client.py
@ -44,6 +44,7 @@ from __future__ import annotations
 import asyncio
 import logging
 import os
+import sys
 from pathlib import Path
 from typing import Any, Awaitable, Callable, Dict, List, Optional, Set
 from urllib.parse import quote, unquote
@ -244,15 +245,27 @@ class LSPClient:
            await self._cleanup_process()
            raise

+    @staticmethod
+    def _win_wrap_cmd(cmd: List[str]) -> List[str]:
+        """On Windows, wrap .cmd/.bat shims so CreateProcess can run them."""
+        exe = cmd[0]
+        if exe.lower().endswith((".cmd", ".bat")):
+            return ["cmd.exe", "/c", *cmd]
+        return cmd
+
    async def _spawn(self) -> None:
        env = dict(os.environ)
        if self._env:
            env.update(self._env)

+        cmd = self._command
+        if sys.platform == "win32":
+            cmd = self._win_wrap_cmd(cmd)
+
        try:
            self._proc = await asyncio.create_subprocess_exec(
-                self._command[0],
-                *self._command[1:],
+                cmd[0],
+                *cmd[1:],
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
@ -261,7 +274,7 @@ class LSPClient:
            )
        except FileNotFoundError as e:
            raise LSPProtocolError(
-                f"LSP server binary not found: {self._command[0]} ({e})"
+                f"LSP server binary not found: {cmd[0]} ({e})"
            ) from e

        # Drain stderr at debug level — if we don't, the pipe buffer
--- a/agent/lsp/install.py
+++ b/agent/lsp/install.py
@ -108,6 +108,11 @@ INSTALL_RECIPES: Dict[str, Dict[str, Any]] = {
 _install_locks: Dict[str, threading.Lock] = {}
 _install_results: Dict[str, Optional[str]] = {}
 _install_lock_meta = threading.Lock()
+_WINDOWS_WRAPPER_SUFFIXES = (".cmd", ".exe", ".bat")
+
+
+def _is_windows() -> bool:
+    return os.name == "nt"


 def hermes_lsp_bin_dir() -> Path:
@ -120,14 +125,33 @@ def hermes_lsp_bin_dir() -> Path:
    return p


+def _native_binary_candidates(base: Path) -> list[Path]:
+    """Return platform-native executable candidates for a staged binary.
+
+    ``base`` itself is always the first candidate. On Windows, ``base`` with
+    each of ``_WINDOWS_WRAPPER_SUFFIXES`` appended is added after it, in
+    that order.
+    """
+    candidates = [base]
+    if _is_windows():
+        # Lowercased path strings already emitted, so the same candidate is
+        # never listed twice regardless of case.
+        existing = {str(base).lower()}
+        for suffix in _WINDOWS_WRAPPER_SUFFIXES:
+            # Append rather than ``with_suffix`` so a dotted name such as
+            # ``foo.js`` becomes ``foo.js.cmd`` instead of ``foo.cmd``.
+            candidate = Path(str(base) + suffix)
+            key = str(candidate).lower()
+            if key not in existing:
+                candidates.append(candidate)
+                existing.add(key)
+    return candidates
+
+
 def _existing_binary(name: str) -> Optional[str]:
    """Probe the staging dir + PATH for a binary named ``name``."""
-    staged = hermes_lsp_bin_dir() / name
-    if staged.exists() and os.access(staged, os.X_OK):
-        return str(staged)
+    for staged in _native_binary_candidates(hermes_lsp_bin_dir() / name):
+        if staged.exists() and os.access(staged, os.X_OK):
+            return str(staged)
    on_path = shutil.which(name)
    if on_path:
        return on_path
+    if _is_windows():
+        for suffix in _WINDOWS_WRAPPER_SUFFIXES:
+            on_path = shutil.which(f"{name}{suffix}")
+            if on_path:
+                return on_path
    return None


@ -250,12 +274,7 @@ def _install_npm(

    # Find the bin
    nm_bin = staging / "node_modules" / ".bin" / bin_name
-    if os.name == "nt":
-        # On Windows npm sometimes drops `.cmd` shims
-        candidates = [nm_bin, nm_bin.with_suffix(".cmd")]
-    else:
-        candidates = [nm_bin]
-    for c in candidates:
+    for c in _native_binary_candidates(nm_bin):
        if c.exists():
            # Symlink into our `lsp/bin/` for stable PATH access.
            link = hermes_lsp_bin_dir() / c.name
@ -301,7 +320,7 @@ def _install_go(pkg: str, bin_name: str) -> Optional[str]:
        logger.warning("[install] go install errored for %s: %s", pkg, e)
        return None
    bin_path = staging / bin_name
-    if os.name == "nt":
+    if _is_windows():
        bin_path = bin_path.with_suffix(".exe")
    if bin_path.exists():
        return str(bin_path)
@ -337,19 +356,24 @@ def _install_pip(pkg: str, bin_name: str) -> Optional[str]:
    except (subprocess.TimeoutExpired, OSError) as e:
        logger.warning("[install] pip install errored for %s: %s", pkg, e)
        return None
-    # Look for the script
-    bin_path = pip_target / "bin" / bin_name
-    if bin_path.exists():
-        link = hermes_lsp_bin_dir() / bin_name
-        if not link.exists():
-            try:
-                link.symlink_to(bin_path)
-            except (OSError, NotImplementedError):
-                try:
-                    shutil.copy2(bin_path, link)
-                except OSError:
-                    return str(bin_path)
-        return str(link if link.exists() else bin_path)
+    # Look for the console script.  POSIX wheels generally write to bin/,
+    # while native Windows installs use Scripts/.
+    script_dirs = [pip_target / "bin"]
+    if _is_windows():
+        script_dirs.append(pip_target / "Scripts")
+    for script_dir in script_dirs:
+        for bin_path in _native_binary_candidates(script_dir / bin_name):
+            if bin_path.exists():
+                link = hermes_lsp_bin_dir() / bin_path.name
+                if not link.exists():
+                    try:
+                        link.symlink_to(bin_path)
+                    except (OSError, NotImplementedError):
+                        try:
+                            shutil.copy2(bin_path, link)
+                        except OSError:
+                            return str(bin_path)
+                return str(link if link.exists() else bin_path)
    return None


--- a/agent/lsp/manager.py
+++ b/agent/lsp/manager.py
@ -39,25 +39,20 @@ import logging
 import os
 import threading
 import time
-from concurrent.futures import Future as ConcurrentFuture
 from typing import Any, Callable, Dict, List, Optional, Tuple

 from agent.lsp import eventlog
 from agent.lsp.client import (
    DIAGNOSTICS_DOCUMENT_WAIT,
    LSPClient,
-    file_uri,
 )
 from agent.lsp.servers import (
    ServerContext,
-    ServerDef,
-    SpawnSpec,
    find_server_for_file,
    language_id_for,
 )
 from agent.lsp.workspace import (
    clear_cache,
-    is_inside_workspace,
    resolve_workspace_for_file,
 )

--- a/agent/lsp/servers.py
+++ b/agent/lsp/servers.py
@ -25,7 +25,7 @@ import shutil
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple

-from agent.lsp.workspace import nearest_root, normalize_path
+from agent.lsp.workspace import nearest_root

 logger = logging.getLogger("agent.lsp.servers")

--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@ -368,11 +368,42 @@ class MemoryManager:

    # -- Sync ----------------------------------------------------------------

-    def sync_all(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
+    @staticmethod
+    def _provider_sync_accepts_messages(provider: MemoryProvider) -> bool:
+        """Return whether ``provider.sync_turn`` accepts a ``messages`` keyword.
+
+        True when the signature declares a ``messages`` parameter or a
+        ``**kwargs`` catch-all. Also True when the signature cannot be
+        inspected (``TypeError`` / ``ValueError``), so such providers are
+        still offered ``messages``.
+        """
+        try:
+            signature = inspect.signature(provider.sync_turn)
+        except (TypeError, ValueError):
+            # Signature unavailable: optimistically report support.
+            return True
+        params = list(signature.parameters.values())
+        # A ``**kwargs`` parameter absorbs ``messages`` even if not named.
+        if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
+            return True
+        return "messages" in signature.parameters
+
+    def sync_all(
+        self,
+        user_content: str,
+        assistant_content: str,
+        *,
+        session_id: str = "",
+        messages: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
        """Sync a completed turn to all providers."""
        for provider in self._providers:
            try:
-                provider.sync_turn(user_content, assistant_content, session_id=session_id)
+                if messages is not None and self._provider_sync_accepts_messages(provider):
+                    provider.sync_turn(
+                        user_content,
+                        assistant_content,
+                        session_id=session_id,
+                        messages=messages,
+                    )
+                else:
+                    provider.sync_turn(
+                        user_content,
+                        assistant_content,
+                        session_id=session_id,
+                    )
            except Exception as e:
                logger.warning(
                    "Memory provider '%s' sync_turn failed: %s",
--- a/agent/memory_provider.py
+++ b/agent/memory_provider.py
@ -112,11 +112,22 @@ class MemoryProvider(ABC):
        that do background prefetching should override this.
        """

-    def sync_turn(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
+    def sync_turn(
+        self,
+        user_content: str,
+        assistant_content: str,
+        *,
+        session_id: str = "",
+        messages: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
        """Persist a completed turn to the backend.

        Called after each turn. Should be non-blocking — queue for
        background processing if the backend has latency.
+
+        ``messages`` is the OpenAI-style conversation message list as of the
+        completed turn, including any assistant tool calls and tool results.
+        Providers that do not need raw turn context can ignore it.
        """

    @abstractmethod
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@ -7,7 +7,6 @@ assemble pieces, then combines them with memory and ephemeral prompts.
 import json
 import logging
 import os
-import re
 import threading
 from collections import OrderedDict
 from pathlib import Path
@ -236,6 +235,11 @@ KANBAN_GUIDANCE = (
    "- Do not shell out to `hermes kanban <verb>` for board operations. Use "
    "the `kanban_*` tools — they work across all terminal backends.\n"
    "- Do not complete a task you didn't actually finish. Block it.\n"
+    "- Do not call `clarify` to ask questions. You are running headless — "
+    "there is no live user to answer. The call will time out and the task "
+    "will sit silently in `running` with no signal to the operator. Instead: "
+    "`kanban_comment` the context, then `kanban_block(reason=...)` so the "
+    "task surfaces on the board as needing input.\n"
    "- Do not assign follow-up work to yourself. Assign it to the right "
    "specialist profile.\n"
    "- Do not call `delegate_task` as a board substitute. `delegate_task` is "
@ -262,6 +266,37 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = (
 # Add new patterns here when a model family needs explicit steering.
 TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm", "qwen", "deepseek")

+# Universal "finish the job" guidance — applied to ALL models, not gated
+# by model family.  Addresses two cross-model failure modes:
+#   1. Stopping after a stub: writing a tiny file or running one command
+#      and then ending the turn with a description of the plan instead
+#      of the finished artifact.  (Observed on Opus during a real
+#      Sarasota real-estate build task: 3 API calls, 85-byte file,
+#      one terminal command, finish_reason=stop.)
+#   2. Fabricating output when a real path is blocked.  When `pip` or a
+#      tool fails, some models will synthesize plausible-looking results
+#      (fake addresses, fake JSON, fake numbers) instead of reporting
+#      the blocker.  (Observed on DeepSeek v4-flash on the same task:
+#      pushed through PEP-668 wall, then returned fabricated listings.)
+#
+# Short on purpose.  This block is shipped to every user, every session,
+# in the cached system prompt — token cost is paid once at install and
+# then amortised across all sessions via prefix caching.  Keep it tight.
+TASK_COMPLETION_GUIDANCE = (
+    "# Finishing the job\n"
+    "When the user asks you to build, run, or verify something, the deliverable is "
+    "a working artifact backed by real tool output — not a description of one. "
+    "Do not stop after writing a stub, a plan, or a single command. Keep working "
+    "until you have actually exercised the code or produced the requested result, "
+    "then report what real execution returned.\n"
+    "If a tool, install, or network call fails and blocks the real path, say so "
+    "directly and try an alternative (different package manager, different "
+    "approach, ask the user). NEVER substitute plausible-looking fabricated "
+    "output (made-up data, invented file contents, synthesised API responses) "
+    "for results you couldn't actually produce. Reporting a blocker honestly "
+    "is always better than inventing a result."
+)
+
 # OpenAI GPT/Codex-specific execution guidance.  Addresses known failure modes
 # where GPT models abandon work on partial results, skip prerequisite lookups,
 # hallucinate instead of using tools, and declare "done" without verification.
@ -813,6 +848,27 @@ def build_environment_hints() -> str:

    if is_wsl():
        hints.append(WSL_ENVIRONMENT_HINT)
+
+    # Embedder-supplied environment description. Lets a host that wraps Hermes
+    # (e.g. a sandbox runner / managed platform) explain the environment the
+    # agent is running in — proxy, credential handling, mount layout — without
+    # forking the identity slot (SOUL.md). Read once at prompt-build time, so
+    # it's part of the stable, cache-safe system prompt. The env var is the
+    # build-time/embedder mechanism (set in a container ENV); config.yaml
+    # ``agent.environment_hint`` is the user-facing surface. Env var wins.
+    extra = (os.getenv("HERMES_ENVIRONMENT_HINT") or "").strip()
+    if not extra:
+        try:
+            from hermes_cli.config import load_config
+
+            # ``or ""`` so a null/empty YAML value never becomes the text "None".
+            agent_cfg = load_config().get("agent", {}) or {}
+            extra = str(agent_cfg.get("environment_hint") or "").strip()
+        except Exception as e:
+            logger.debug("Could not read agent.environment_hint from config: %s", e)
+    if extra:
+        hints.append(extra)
+
    return "\n\n".join(hints)


--- a/agent/redact.py
+++ b/agent/redact.py
@ -331,7 +331,7 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
-    Disabled by default — enable via security.redact_secrets: true in config.yaml.
+    Enabled by default. Disable via security.redact_secrets: false in config.yaml.
    Set force=True for safety boundaries that must never return raw secrets
    regardless of the user's global logging redaction preference.

--- a/agent/secret_sources/bitwarden.py
+++ b/agent/secret_sources/bitwarden.py
@ -37,7 +37,6 @@ import platform
 import shutil
 import stat
 import subprocess
-import sys
 import tempfile
 import time
 import urllib.error
--- a/agent/system_prompt.py
+++ b/agent/system_prompt.py
@ -37,6 +37,7 @@ from agent.prompt_builder import (
    PLATFORM_HINTS,
    SESSION_SEARCH_GUIDANCE,
    SKILLS_GUIDANCE,
+    TASK_COMPLETION_GUIDANCE,
    TOOL_USE_ENFORCEMENT_GUIDANCE,
    TOOL_USE_ENFORCEMENT_MODELS,
 )
@ -100,6 +101,15 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
    # Pointer to the hermes-agent skill + docs for user questions about Hermes itself.
    stable_parts.append(HERMES_AGENT_HELP_GUIDANCE)

+    # Universal task-completion / no-fabrication guidance.  Applied to ALL
+    # models regardless of tool_use_enforcement gating — the failure modes
+    # this targets (stopping after a stub; fabricating output when a real
+    # path is blocked) are not model-family specific.  Gated only by
+    # config.yaml ``agent.task_completion_guidance`` (default True) so
+    # users who want a leaner prompt can turn it off.
+    if getattr(agent, "_task_completion_guidance", True) and agent.valid_tool_names:
+        stable_parts.append(TASK_COMPLETION_GUIDANCE)
+
    # Tool-aware behavioral guidance: only inject when the tools are loaded
    tool_guidance = []
    if "memory" in agent.valid_tool_names:
@ -205,6 +215,23 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
    if _env_hints:
        stable_parts.append(_env_hints)

+    # Local Python toolchain probe — names python/pip/uv/PEP-668 state when
+    # something is non-default so the model can pick the right install
+    # strategy without discovering by failure.  Emits a single line; emits
+    # NOTHING when the environment is clean (no token cost).  Skipped
+    # entirely for remote terminal backends (the host's Python state is
+    # irrelevant when tools run inside docker/modal/ssh).  Gated by
+    # config.yaml ``agent.environment_probe`` (default True).
+    if getattr(agent, "_environment_probe", True):
+        try:
+            from tools.env_probe import get_environment_probe_line
+            _probe_line = get_environment_probe_line()
+            if _probe_line:
+                stable_parts.append(_probe_line)
+        except Exception:
+            # Probe failure must never block prompt build.
+            pass
+
    # Active-profile hint — names the Hermes profile the agent is running
    # under so it doesn't conflate ~/.hermes/skills/ (default profile) with
    # ~/.hermes/profiles/<active>/skills/ (this profile's). Deterministic
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@ -13,14 +13,13 @@ extracted functions reach back through the ``run_agent`` module via
 from __future__ import annotations

 import concurrent.futures
-import contextvars
 import json
 import logging
 import os
 import random
 import threading
 import time
-from typing import Any, Optional
+from typing import Optional

 from agent.display import (
    KawaiiSpinner,
@ -38,12 +37,9 @@ from agent.tool_dispatch_helpers import (
    make_tool_result_message,
 )
 from tools.terminal_tool import (
-    _get_approval_callback,
-    _get_sudo_password_callback,
-    set_approval_callback as _set_approval_callback,
-    set_sudo_password_callback as _set_sudo_password_callback,
    get_active_env,
 )
+from tools.thread_context import propagate_context_to_thread
 from tools.tool_result_storage import (
    maybe_persist_tool_result,
    enforce_turn_budget,
@ -62,6 +58,55 @@ def _ra():
    return run_agent


+def _tool_search_scoped_names(agent) -> frozenset:
+    """Return the deferrable tool names the session may invoke via tool_call.
+
+    The Tool Search unwrap dispatches the underlying tool directly, bypassing
+    the bridge branch (and its scope check) in
+    ``model_tools.handle_function_call``. To keep a restricted-toolset session
+    (subagent, kanban worker, curated gateway session) from reaching tools it
+    was never granted, the unwrap validates the underlying name against this
+    set: the deferrable subset of the session's own enabled/disabled toolset
+    scope.
+
+    Result is cached on the agent and refreshed when the tool registry's
+    generation changes (e.g. an MCP server reconnects), so the common case is
+    a dict lookup, not a full tool-defs rebuild on every tool call.
+    """
+    try:
+        import model_tools
+        from tools import tool_search as _ts
+        from tools.registry import registry as _registry
+    except Exception:
+        return frozenset()
+
+    enabled = getattr(agent, "enabled_toolsets", None)
+    disabled = getattr(agent, "disabled_toolsets", None)
+    cache_key = (
+        getattr(_registry, "_generation", 0),
+        frozenset(enabled) if enabled is not None else None,
+        frozenset(disabled) if disabled is not None else None,
+    )
+    cached = getattr(agent, "_tool_search_scope_cache", None)
+    if cached is not None and cached[0] == cache_key:
+        return cached[1]
+    try:
+        scoped_defs = model_tools.get_tool_definitions(
+            enabled_toolsets=enabled,
+            disabled_toolsets=disabled,
+            quiet_mode=True,
+            skip_tool_search_assembly=True,
+        ) or []
+        names = _ts.scoped_deferrable_names(scoped_defs)
+    except Exception:
+        names = frozenset()
+    try:
+        agent._tool_search_scope_cache = (cache_key, names)
+    except Exception:
+        pass
+    return names
+
+
 def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
    """Execute multiple tool calls concurrently using a thread pool.

@ -100,45 +145,89 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        if not isinstance(function_args, dict):
            function_args = {}

-        # Checkpoint for file-mutating tools
-        if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
-            try:
-                file_path = function_args.get("path", "")
-                if file_path:
-                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
-                    agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
-            except Exception:
-                pass
-
-        # Checkpoint before destructive terminal commands
-        if function_name == "terminal" and agent._checkpoint_mgr.enabled:
-            try:
-                cmd = function_args.get("command", "")
-                if _is_destructive_command(cmd):
-                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
-                    agent._checkpoint_mgr.ensure_checkpoint(
-                        cwd, f"before terminal: {cmd[:60]}"
-                    )
-            except Exception:
-                pass
+        # ── Tool Search unwrap ────────────────────────────────────────
+        # When the model invokes the tool_call bridge, peel it open so
+        # every downstream check (checkpointing, guardrails, plugin
+        # pre-tool-call hooks, the display/activity feed, the post-call
+        # callback) sees the underlying tool — not the bridge. This is
+        # the OpenClaw lesson: hooks must observe the real tool name.
+        #
+        # The original tool_call entry on ``tool_call.function`` is left
+        # untouched so the conversation transcript and the matching
+        # tool_call_id are preserved exactly as the model emitted them.
+        #
+        # Scope gate: the unwrap dispatches the underlying tool directly
+        # (bypassing the bridge branch in handle_function_call and its
+        # scope check), so we enforce session toolset scope HERE. A tool
+        # the session was not granted is rejected before any checkpoint,
+        # hook, or dispatch fires.
+        _ts_scope_block = None
+        try:
+            from tools import tool_search as _ts
+            if function_name == _ts.TOOL_CALL_NAME:
+                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
+                if not _err and _underlying:
+                    if _underlying in _tool_search_scoped_names(agent):
+                        function_name = _underlying
+                        function_args = _underlying_args
+                    else:
+                        _ts_scope_block = json.dumps({
+                            "error": (
+                                f"'{_underlying}' is not available in this session. "
+                                "Use tool_search to find tools you can call."
+                            ),
+                        }, ensure_ascii=False)
+        except Exception:
+            pass

+        # ── Block evaluation (BEFORE checkpoint preflight) ───────────
+        # We must know whether the tool will execute before touching
+        # checkpoint state (dedup slot, real snapshots).
        block_result = None
        blocked_by_guardrail = False
-        try:
-            from hermes_cli.plugins import get_pre_tool_call_block_message
-            block_message = get_pre_tool_call_block_message(
-                function_name, function_args, task_id=effective_task_id or "",
-            )
-        except Exception:
-            block_message = None
-
-        if block_message is not None:
-            block_result = json.dumps({"error": block_message}, ensure_ascii=False)
+        if _ts_scope_block is not None:
+            # Out-of-scope tool_call: reject before hooks/guardrails/dispatch.
+            block_result = _ts_scope_block
        else:
-            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
-            if not guardrail_decision.allows_execution:
-                block_result = agent._guardrail_block_result(guardrail_decision)
-                blocked_by_guardrail = True
+            try:
+                from hermes_cli.plugins import get_pre_tool_call_block_message
+                block_message = get_pre_tool_call_block_message(
+                    function_name, function_args, task_id=effective_task_id or "",
+                )
+            except Exception:
+                block_message = None
+
+            if block_message is not None:
+                block_result = json.dumps({"error": block_message}, ensure_ascii=False)
+            else:
+                guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
+                if not guardrail_decision.allows_execution:
+                    block_result = agent._guardrail_block_result(guardrail_decision)
+                    blocked_by_guardrail = True
+
+        # ── Checkpoint preflight (only for tools that will execute) ──
+        if block_result is None:
+            # Checkpoint for file-mutating tools
+            if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
+                try:
+                    file_path = function_args.get("path", "")
+                    if file_path:
+                        work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
+                        agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
+                except Exception:
+                    pass
+
+            # Checkpoint before destructive terminal commands
+            if function_name == "terminal" and agent._checkpoint_mgr.enabled:
+                try:
+                    cmd = function_args.get("command", "")
+                    if _is_destructive_command(cmd):
+                        cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
+                        agent._checkpoint_mgr.ensure_checkpoint(
+                            cwd, f"before terminal: {cmd[:60]}"
+                        )
+                except Exception:
+                    pass

        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))

@ -186,14 +275,6 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
    agent._current_tool = tool_names_str
    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")

-    # Capture CLI callbacks from the agent thread so worker threads can
-    # register them locally.  Without this, _get_approval_callback() in
-    # terminal_tool returns None in ThreadPoolExecutor workers, causing
-    # the dangerous-command prompt to fall back to input() — which
-    # deadlocks against prompt_toolkit's raw terminal mode (#13617).
-    _parent_approval_cb = _get_approval_callback()
-    _parent_sudo_cb = _get_sudo_password_callback()
-
    def _run_tool(index, tool_call, function_name, function_args):
        """Worker function executed in a thread."""
        # Register this worker tid so the agent can fan out an interrupt
@ -220,54 +301,43 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            set_activity_callback(agent._touch_activity)
        except Exception:
            pass
-        # Propagate approval/sudo callbacks to this worker thread.
-        # Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr).
-        if _parent_approval_cb is not None:
-            try:
-                _set_approval_callback(_parent_approval_cb)
-            except Exception:
-                pass
-        if _parent_sudo_cb is not None:
-            try:
-                _set_sudo_password_callback(_parent_sudo_cb)
-            except Exception:
-                pass
+        # Approval/sudo callbacks (thread-local) and the agent turn's
+        # ContextVars are propagated by propagate_context_to_thread() at the
+        # submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
        start = time.time()
        try:
-            result = agent._invoke_tool(
-                function_name,
-                function_args,
-                effective_task_id,
-                tool_call.id,
-                messages=messages,
-                pre_tool_block_checked=True,
-            )
-        except Exception as tool_error:
-            result = f"Error executing tool '{function_name}': {tool_error}"
-            logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
-        duration = time.time() - start
-        is_error, _ = _detect_tool_failure(function_name, result)
-        if is_error:
-            logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
-        else:
-            logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
-        results[index] = (function_name, function_args, result, duration, is_error, False)
-        # Tear down worker-tid tracking.  Clear any interrupt bit we may
-        # have set so the next task scheduled onto this recycled tid
-        # starts with a clean slate.
-        with agent._tool_worker_threads_lock:
-            agent._tool_worker_threads.discard(_worker_tid)
-        try:
-            _ra()._set_interrupt(False, _worker_tid)
-        except Exception:
-            pass
-        # Clear thread-local callbacks so a recycled worker thread
-        # doesn't hold stale references to a disposed CLI instance.
-        try:
-            _set_approval_callback(None)
-            _set_sudo_password_callback(None)
-        except Exception:
-            pass
+            try:
+                result = agent._invoke_tool(
+                    function_name,
+                    function_args,
+                    effective_task_id,
+                    tool_call.id,
+                    messages=messages,
+                    pre_tool_block_checked=True,
+                )
+            except Exception as tool_error:
+                result = f"Error executing tool '{function_name}': {tool_error}"
+                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
+            duration = time.time() - start
+            is_error, _ = _detect_tool_failure(function_name, result)
+            if is_error:
+                logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
+            else:
+                logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
+            results[index] = (function_name, function_args, result, duration, is_error, False)
+        finally:
+            # Tear down worker-tid tracking.  Clear any interrupt bit we may
+            # have set so the next task scheduled onto this recycled tid
+            # starts with a clean slate.  This MUST be in a finally block
+            # because BaseException subclasses (CancelledError, KeyboardInterrupt)
+            # bypass ``except Exception`` and would otherwise leak the tid
+            # into _interrupted_threads, poisoning the recycled thread.
+            with agent._tool_worker_threads_lock:
+                agent._tool_worker_threads.discard(_worker_tid)
+            try:
+                _ra()._set_interrupt(False, _worker_tid)
+            except Exception:
+                pass

    # Start spinner for CLI mode (skip when TUI handles tool progress)
    spinner = None
@ -287,9 +357,12 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for i, tc, name, args in runnable_calls:
-                    # Propagate ContextVars (e.g. _approval_session_key); mirrors asyncio.to_thread.
-                    ctx = contextvars.copy_context()
-                    f = executor.submit(ctx.run, _run_tool, i, tc, name, args)
+                    # Propagate the agent turn's ContextVars (e.g.
+                    # _approval_session_key) AND thread-local approval/sudo
+                    # callbacks into the worker thread; clears callbacks on exit.
+                    f = executor.submit(
+                        propagate_context_to_thread(_run_tool), i, tc, name, args
+                    )
                    futures.append(f)

                # Wait for all to complete with periodic heartbeats so the
@ -497,16 +570,39 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
        if not isinstance(function_args, dict):
            function_args = {}

-        # Check plugin hooks for a block directive before executing.
-        _block_msg: Optional[str] = None
+        # Tool Search unwrap — see execute_tool_calls_concurrent for full
+        # rationale, including the scope gate (the unwrap dispatches the
+        # underlying tool directly, so session toolset scope is enforced here).
+        _ts_scope_block: Optional[str] = None
        try:
-            from hermes_cli.plugins import get_pre_tool_call_block_message
-            _block_msg = get_pre_tool_call_block_message(
-                function_name, function_args, task_id=effective_task_id or "",
-            )
+            from tools import tool_search as _ts
+            if function_name == _ts.TOOL_CALL_NAME:
+                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
+                if not _err and _underlying:
+                    if _underlying in _tool_search_scoped_names(agent):
+                        function_name = _underlying
+                        function_args = _underlying_args
+                    else:
+                        _ts_scope_block = (
+                            f"'{_underlying}' is not available in this session. "
+                            "Use tool_search to find tools you can call."
+                        )
        except Exception:
            pass

+        # Check plugin hooks for a block directive before executing.
+        _block_msg: Optional[str] = None
+        if _ts_scope_block is not None:
+            _block_msg = _ts_scope_block
+        else:
+            try:
+                from hermes_cli.plugins import get_pre_tool_call_block_message
+                _block_msg = get_pre_tool_call_block_message(
+                    function_name, function_args, task_id=effective_task_id or "",
+                )
+            except Exception:
+                pass
+
        _guardrail_block_decision: ToolGuardrailDecision | None = None
        if _block_msg is None:
            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
@ -667,10 +763,14 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
        elif function_name == "delegate_task":
            tasks_arg = function_args.get("tasks")
            if tasks_arg and isinstance(tasks_arg, list):
-                spinner_label = f"🔀 delegating {len(tasks_arg)} tasks"
+                spinner_label = f"🔀 delegating {len(tasks_arg)} tasks · (/agents to monitor)"
            else:
                goal_preview = (function_args.get("goal") or "")[:30]
-                spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating"
+                spinner_label = (
+                    f"🔀 {goal_preview} · (/agents to monitor)"
+                    if goal_preview
+                    else "🔀 delegating · (/agents to monitor)"
+                )
            spinner = None
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
@ -752,6 +852,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    session_id=agent.session_id or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
+                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
                _spinner_result = function_result
            except Exception as tool_error:
@ -772,6 +874,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    session_id=agent.session_id or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
+                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
            except Exception as tool_error:
                function_result = f"Error executing tool '{function_name}': {tool_error}"
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@ -10,7 +10,7 @@ reasoning configuration, temperature handling, and extra_body assembly.
 """

 import copy
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict

 from agent.lmstudio_reasoning import resolve_lmstudio_effort
 from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
@ -476,13 +476,17 @@ class ChatCompletionsTransport(ProviderTransport):
        ephemeral = params.get("ephemeral_max_output_tokens")
        user_max = params.get("max_tokens")
        anthropic_max = params.get("anthropic_max_output")
+        # Per-model default cap — profiles override get_max_tokens() when
+        # they front several backends with different completion-token limits
+        # (e.g. opencode-go: mimo-v2.5-pro = 131072).
+        profile_max = profile.get_max_tokens(model)

        if ephemeral is not None and max_tokens_fn:
            api_kwargs.update(max_tokens_fn(ephemeral))
        elif user_max is not None and max_tokens_fn:
            api_kwargs.update(max_tokens_fn(user_max))
-        elif profile.default_max_tokens and max_tokens_fn:
-            api_kwargs.update(max_tokens_fn(profile.default_max_tokens))
+        elif profile_max and max_tokens_fn:
+            api_kwargs.update(max_tokens_fn(profile_max))
        elif anthropic_max is not None:
            api_kwargs["max_tokens"] = anthropic_max

--- a/agent/transports/codex_app_server.py
+++ b/agent/transports/codex_app_server.py
@ -23,7 +23,7 @@ import subprocess
 import threading
 import time
 from dataclasses import dataclass, field
-from typing import Any, Callable, Optional
+from typing import Any, Optional

 # Default minimum codex version we test against. The PR sets this from the
 # `codex --version` parsed at install time; bumping is a one-line change here.
--- a/agent/transports/codex_app_server_session.py
+++ b/agent/transports/codex_app_server_session.py
@ -31,6 +31,7 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional

+from agent.codex_responses_adapter import _format_responses_error
 from agent.redact import redact_sensitive_text
 from agent.transports.codex_app_server import (
    CodexAppServerClient,
@ -581,7 +582,7 @@ class CodexAppServerSession:
                        (note.get("params") or {}).get("turn") or {}
                    ).get("error")
                    if err_obj:
-                        err_msg = err_obj.get("message") or str(err_obj)
+                        err_msg = _format_responses_error(err_obj, str(turn_status))
                        # If the turn failed for an auth/refresh reason,
                        # rewrite the error into a re-auth hint AND mark
                        # the session for retirement.
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@ -163,7 +163,7 @@ model:
 # -----------------------------------------------------------------------------
 # Working directory behavior:
 #   - CLI (`hermes` command): Uses "." (current directory where you run hermes)
-#   - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
+#   - Gateway/messaging/cron: Uses terminal.cwd here; legacy .env cwd values are deprecated
 terminal:
  backend: "local"
  cwd: "."  # For local backend: "." = current directory. Ignored for remote backends unless a backend documents otherwise.
--- a/cli.py
+++ b/cli.py
@ -74,10 +74,15 @@ except (ImportError, AttributeError):
    _STEADY_CURSOR = None

 try:
-    from hermes_cli.pt_input_extras import install_shift_enter_alias, install_ctrl_enter_alias
+    from hermes_cli.pt_input_extras import (
+        install_ctrl_enter_alias,
+        install_ignored_terminal_sequences,
+        install_shift_enter_alias,
+    )
    install_shift_enter_alias()
    install_ctrl_enter_alias()
-    del install_shift_enter_alias, install_ctrl_enter_alias
+    install_ignored_terminal_sequences()
+    del install_shift_enter_alias, install_ctrl_enter_alias, install_ignored_terminal_sequences
 except Exception:
    pass
 import threading
@ -382,6 +387,10 @@ def load_cli_config() -> Dict[str, Any]:
            "inactivity_timeout": 120,  # Auto-cleanup inactive browser sessions after 2 min
            "record_sessions": False,  # Auto-record browser sessions as WebM videos
            "engine": "auto",  # Browser engine: auto (Chrome), lightpanda, chrome
+            "camofox": {
+                "rewrite_loopback_urls": False,
+                "loopback_host_alias": "host.docker.internal",
+            },
        },
        "compression": {
            "enabled": True,      # Auto-compress when approaching context limit
@ -576,6 +585,8 @@ def load_cli_config() -> Dict[str, Any]:
        "docker_env": "TERMINAL_DOCKER_ENV",
        "docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
        "docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
+        "docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
+        "docker_orphan_reaper": "TERMINAL_DOCKER_ORPHAN_REAPER",
        "sandbox_dir": "TERMINAL_SANDBOX_DIR",
        # Persistent shell (non-local backends)
        "persistent_shell": "TERMINAL_PERSISTENT_SHELL",
@ -776,8 +787,10 @@ def AIAgent(*args, **kwargs):


 def get_tool_definitions(*args, **kwargs):
+    from hermes_cli.mcp_startup import wait_for_mcp_discovery
    from model_tools import get_tool_definitions as _get_tool_definitions

+    wait_for_mcp_discovery()
    return _get_tool_definitions(*args, **kwargs)


@ -885,9 +898,12 @@ def _prepare_deferred_agent_startup() -> None:
            exc_info=True,
        )
    try:
-        from tools.mcp_tool import discover_mcp_tools
+        from hermes_cli.mcp_startup import start_background_mcp_discovery

-        discover_mcp_tools()
+        start_background_mcp_discovery(
+            logger=logger,
+            thread_name="termux-cli-mcp-discovery",
+        )
    except Exception:
        logger.debug(
            "MCP tool discovery failed at deferred CLI startup",
@ -1526,9 +1542,17 @@ def _query_osc11_background() -> str | None:
    Most modern terminals reply with \x1b]11;rgb:RRRR/GGGG/BBBB\x1b\\
    within a few ms.  We wait up to 100ms total before giving up.
    Returns "#RRGGBB" or None on timeout / non-tty.
+
+    Skipped over SSH: the round-trip routinely exceeds our 100ms budget, so a
+    late reply lands after prompt_toolkit has grabbed the tty — its payload
+    leaks in as typed text and the BEL terminator reads as Ctrl+G (open
+    editor), trapping the user in a stray editor. Remote sessions fall back to
+    COLORFGBG / env hints / the dark default instead.
    """
    if not sys.stdin.isatty() or not sys.stdout.isatty():
        return None
+    if any(os.environ.get(v) for v in ("SSH_CONNECTION", "SSH_CLIENT", "SSH_TTY")):
+        return None
    try:
        import termios
        import tty
@ -1576,8 +1600,11 @@ def _query_osc11_background() -> str | None:
        r, g, b = norm(m.group(1)), norm(m.group(2)), norm(m.group(3))
        return f"#{r:02X}{g:02X}{b:02X}"
    finally:
+        # TCSAFLUSH discards any unread input as it restores the original
+        # attributes — scrubs a slow/partial OSC 11 reply out of the tty
+        # buffer before prompt_toolkit can read it as keystrokes.
        try:
-            termios.tcsetattr(fd, termios.TCSANOW, old)
+            termios.tcsetattr(fd, termios.TCSAFLUSH, old)
        except Exception:
            pass

@ -2475,8 +2502,9 @@ _TERMINAL_INPUT_MODE_RESET_SEQ = (
 def _preserve_ctrl_enter_newline() -> bool:
    """Detect environments where Ctrl+Enter must produce a newline, not submit.

-    Native Windows, WSL, SSH sessions, and Windows Terminal all send Ctrl+Enter
-    as bare LF (c-j). On those terminals c-j must NOT be bound to submit;
+    Windows Terminal, WSL, SSH sessions, Ghostty, and some modern terminals
+    deliver Ctrl+Enter/Ctrl+J as bare LF (c-j). On those terminals c-j must
+    NOT be bound to submit;
    binding it to submit makes Ctrl+Enter (intended as 'newline like Alt+Enter')
    submit instead. Local POSIX TTYs that deliver Enter as LF (docker exec,
    some thin PTYs without SSH) still need c-j bound to submit, so we keep
@ -2490,6 +2518,12 @@ def _preserve_ctrl_enter_newline() -> bool:
        return True
    if os.environ.get("WT_SESSION"):
        return True
+    if os.environ.get("GHOSTTY_RESOURCES_DIR") or os.environ.get("GHOSTTY_BIN_DIR"):
+        return True
+    if os.environ.get("TERM", "").lower() == "xterm-ghostty":
+        return True
+    if os.environ.get("TERM_PROGRAM", "").lower() == "ghostty":
+        return True
    if "microsoft" in os.environ.get("WSL_DISTRO_NAME", "").lower():
        return True
    # WSL detection — env vars can be scrubbed under sudo, also peek /proc.
@ -2510,7 +2544,7 @@ def _bind_prompt_submit_keys(kb, handler) -> None:
    some thin PTYs (docker exec, certain SSH flavors) deliver Enter as LF
    instead of CR — without this, Enter appears dead on those terminals.

-    Exception: on Windows, WSL, SSH sessions, and Windows Terminal,
+    Exception: on Windows, WSL, SSH sessions, Windows Terminal, and Ghostty,
    c-j is the wire encoding of Ctrl+Enter (a distinct keystroke from
    plain Enter / c-m). We leave c-j unbound there so the c-j newline
    handler registered separately can fire — giving the user an
@ -3230,6 +3264,12 @@ class HermesCLI:
        self._slash_confirm_state = None
        self._slash_confirm_deadline = 0
        self._model_picker_state = None
+        # Armed when a bare `/resume` prints the recent-sessions list so the
+        # very next bare numeric input (e.g. `3`) resolves to that session.
+        # Holds the exact list used for index resolution; one-shot (cleared on
+        # the next submitted input, whether it's the selection or anything
+        # else). See #34584.
+        self._pending_resume_sessions = None
        self._secret_state = None
        self._secret_deadline = 0
        self._spinner_text: str = ""  # thinking spinner text for TUI
@ -4847,6 +4887,10 @@ class HermesCLI:
        if not self._ensure_runtime_credentials():
            return False

+        from hermes_cli.mcp_startup import wait_for_mcp_discovery
+
+        wait_for_mcp_discovery()
+
        # Initialize SQLite session store for CLI sessions (if not already done in __init__)
        if self._session_db is None:
            try:
@ -6675,10 +6719,21 @@ class HermesCLI:
        if not target:
            _cprint("  Usage: /resume <number|session_id_or_title>")
            if self._show_recent_sessions(reason="resume"):
+                # Arm a one-shot pending-resume selection so the user can type
+                # just the number (`3`) on the next line instead of having to
+                # retype `/resume 3`. The list here must match the one shown by
+                # _show_recent_sessions and used for index resolution below —
+                # all three go through _list_recent_sessions(limit=10). See
+                # #34584.
+                self._pending_resume_sessions = self._list_recent_sessions(limit=10)
                return
            _cprint("  Tip:   Use /history or `hermes sessions list` to find sessions.")
            return

+        # Any explicit /resume <target> supersedes a previously-armed bare
+        # numbered prompt.
+        self._pending_resume_sessions = None
+
        if not self._session_db:
            from hermes_state import format_session_db_unavailable
            _cprint(f"  {format_session_db_unavailable()}")
@ -6792,6 +6847,44 @@ class HermesCLI:
        else:
            _cprint(f"  ↻ Resumed session {target_id}{title_part} — no messages, starting fresh.")

+    def _consume_pending_resume_selection(self, text: str) -> bool:
+        """Resolve a bare numeric reply that follows a bare ``/resume`` prompt.
+
+        After ``/resume`` (no args) prints the recent-sessions list it arms
+        ``self._pending_resume_sessions``. The next submitted input is given
+        one chance to be a bare session number (``3``); if so we resume that
+        session here. Anything else (another command, free text, blank) simply
+        disarms the prompt and is handled normally by the caller.
+
+        Returns True if the input was consumed as a resume selection (caller
+        must not treat it as chat); False otherwise. The pending state is
+        always one-shot: it is cleared on the first submitted input regardless
+        of outcome. See #34584.
+        """
+        pending = self._pending_resume_sessions
+        if not pending:
+            return False
+        # One-shot: disarm now so a non-matching input can't leave the prompt
+        # armed and hijack a later number the user meant as chat.
+        self._pending_resume_sessions = None
+
+        if not isinstance(text, str):
+            return False
+        stripped = text.strip()
+        # Only a pure decimal number selects. isdecimal(), not isdigit(): "²"
+        # passes isdigit() but makes int() raise. All else falls through.
+        if not stripped.isdecimal():
+            return False
+
+        index = int(stripped)
+        if index < 1 or index > len(pending):
+            _cprint(f"  Resume index {index} is out of range.")
+            _cprint("  Use /resume with no arguments to see available sessions.")
+            return True
+
+        self._handle_resume_command(f"/resume {index}")
+        return True
+
    def _handle_sessions_command(self, cmd_original: str) -> None:
        """Handle /sessions [list|<id_or_title>] — browse or resume previous sessions.

@ -8315,7 +8408,14 @@ class HermesCLI:
        _base_word = cmd_lower.split()[0].lstrip("/")
        _cmd_def = _resolve_cmd(_base_word)
        canonical = _cmd_def.name if _cmd_def else _base_word
-        
+
+        # A bare `/resume` prompt is one-shot: any command other than the
+        # resume/sessions handlers (which manage the pending state themselves)
+        # disarms it so a later number isn't swallowed as a stale selection.
+        # See #34584.
+        if canonical not in {"resume", "sessions"}:
+            self._pending_resume_sessions = None
+
        if canonical in {"quit", "exit"}:
            # Parse --delete flag: /exit --delete also removes the current
            # session's transcripts + SQLite history. Ported from
@ -9867,10 +9967,20 @@ class HermesCLI:
    def _manual_compress(self, cmd_original: str = ""):
        """Manually trigger context compression on the current conversation.

-        Accepts an optional focus topic: ``/compress <focus>`` guides the
-        summariser to preserve information related to *focus* while being
-        more aggressive about discarding everything else.  Inspired by
-        Claude Code's ``/compact <focus>`` feature.
+        Two modes:
+
+        * ``/compress [<focus>]`` — compress the *whole* history. An
+          optional focus topic guides the summariser to preserve
+          information related to *focus* while being more aggressive
+          about discarding everything else.  Inspired by Claude Code's
+          ``/compact <focus>`` feature.
+        * ``/compress here [N]`` — boundary-aware compression. Summarize
+          everything *except* the most recent ``N`` exchanges (default
+          2), which are preserved verbatim. Inspired by Claude Code's
+          Rewind "Summarize up to here" action (v2.1.139, May 2026,
+          https://code.claude.com/docs/en/whats-new/2026-w20). Lets the
+          user pick the compression boundary instead of leaving it to
+          the automatic token-budget heuristic.
        """
        if not self.conversation_history or len(self.conversation_history) < 4:
            print("(._.) Not enough conversation to compress (need at least 4 messages).")
@ -9884,12 +9994,21 @@ class HermesCLI:
            print("(._.) Compression is disabled in config.")
            return

-        # Extract optional focus topic from the command (e.g. "/compress database schema")
-        focus_topic = ""
+        from hermes_cli.partial_compress import (
+            parse_partial_compress_args,
+            rejoin_compressed_head_and_tail,
+            split_history_for_partial_compress,
+        )
+
+        # Args after the command word (e.g. "/compress here 3" -> "here 3").
+        raw_args = ""
        if cmd_original:
-            parts = cmd_original.strip().split(None, 1)
-            if len(parts) > 1:
-                focus_topic = parts[1].strip()
+            _parts = cmd_original.strip().split(None, 1)
+            if len(_parts) > 1:
+                raw_args = _parts[1].strip()
+
+        partial, keep_last, focus_topic = parse_partial_compress_args(raw_args)
+        focus_topic = focus_topic or ""

        original_count = len(self.conversation_history)
        with self._busy_command("Compressing context..."):
@ -9897,6 +10016,22 @@ class HermesCLI:
                from agent.model_metadata import estimate_request_tokens_rough
                from agent.manual_compression_feedback import summarize_manual_compression
                original_history = list(self.conversation_history)
+
+                # Boundary-aware split: only the head is summarized; the
+                # most recent `keep_last` exchanges ride along verbatim.
+                tail: list = []
+                head = original_history
+                if partial:
+                    head, tail = split_history_for_partial_compress(
+                        original_history, keep_last
+                    )
+                    if not tail:
+                        # Split degenerated (everything would be kept, or
+                        # no head left to compress). Fall back to full
+                        # compression so the user still gets an action.
+                        partial = False
+                        head = original_history
+
                # Include system prompt + tool schemas in the estimate —
                # a transcript-only number understates real request pressure
                # and can even appear to grow after compression because a
@ -9908,7 +10043,11 @@ class HermesCLI:
                    system_prompt=_sys_prompt,
                    tools=_tools,
                )
-                if focus_topic:
+                if partial:
+                    print(f"🗜️  Summarizing up to here: compressing {len(head)} of "
+                          f"{original_count} messages (~{approx_tokens:,} tokens), "
+                          f"keeping last {keep_last} exchange(s) verbatim...")
+                elif focus_topic:
                    print(f"🗜️  Compressing {original_count} messages (~{approx_tokens:,} tokens), "
                          f"focus: \"{focus_topic}\"...")
                else:
@ -9921,12 +10060,21 @@ class HermesCLI:
                # which already contain the agent identity — resulting in the
                # identity block appearing twice (issue #15281).
                compressed, _ = self.agent._compress_context(
-                    original_history,
+                    head,
                    None,
                    approx_tokens=approx_tokens,
                    focus_topic=focus_topic or None,
                    force=True,
                )
+                # Re-append the verbatim tail after the compressed head.
+                # The split guarantees `tail` begins on a user turn, so the
+                # compressed-head -> tail boundary is normally valid
+                # (the head's compressed output ends on assistant/tool).
+                # rejoin_compressed_head_and_tail() additionally guards the
+                # seam against any illegal user->user / assistant->assistant
+                # adjacency, defending provider role-alternation rules.
+                if partial and tail:
+                    compressed = rejoin_compressed_head_and_tail(compressed, tail)
                self.conversation_history = compressed
                # _compress_context ends the old session and creates a new child
                # session on the agent (run_agent.py::_compress_context). Sync the
@ -12704,7 +12852,21 @@ class HermesCLI:
        
        # Key bindings for the input area
        kb = KeyBindings()
-        
+
+        from prompt_toolkit.keys import Keys as _IgnoreKeys
+
+        @kb.add(_IgnoreKeys.Ignore, eager=True)
+        def handle_ignored_terminal_sequence(event):
+            """Consume parser-level ignored terminal sequences before self-insert.
+
+            install_ignored_terminal_sequences() in hermes_cli.pt_input_extras
+            registers focus reports (CSI I / CSI O) as Keys.Ignore at the
+            VT100 parser level. Without this no-op binding the default
+            self-insert path would still fire and the bytes would land in
+            the buffer.
+            """
+            return None
+
        def handle_enter(event):
            """Handle Enter key - submit input.
            
@ -12803,6 +12965,13 @@ class HermesCLI:
                        if event.app.is_running:
                            event.app.exit()
                    event.app.current_buffer.reset(append_to_history=True)
+                    # Force a repaint: process_command() prints through
+                    # patch_stdout (scrolls output above the prompt) and never
+                    # invalidates the app, so the just-cleared input area can
+                    # keep showing the submitted text until some unrelated
+                    # redraw fires. Every other early-return branch in this
+                    # handler invalidates after reset — match them.
+                    event.app.invalidate()
                    return

                # Handle /steer while the agent is running immediately on the
@ -12814,6 +12983,13 @@ class HermesCLI:
                if self._should_handle_steer_command_inline(text, has_images=has_images):
                    self.process_command(text)
                    event.app.current_buffer.reset(append_to_history=True)
+                    # Force a repaint after clearing the buffer.  /steer is
+                    # dispatched mid-run while the agent streams output through
+                    # patch_stdout; process_command() never invalidates the
+                    # app, so without this the submitted "/steer <text>" can
+                    # linger in the input area (looking unsent) and invite an
+                    # accidental re-submit. See issue #34569.
+                    event.app.invalidate()
                    return

                # Snapshot and clear attached images
@ -13947,7 +14123,12 @@ class HermesCLI:
            reserved_below = 6

            available = max(0, term_rows - reserved_below)
-            mandatory_full = chrome_full + len(choice_wrapped) + len(other_wrapped)
+            # The compact decision must reserve room for at least one question
+            # row on top of the choices, otherwise full chrome (3 blank
+            # separators) gets kept when there is no room for it and the panel
+            # overflows the viewport — HSplit then clips the panel's tail,
+            # silently dropping the choices (the reported bug).
+            mandatory_full = chrome_full + 1 + len(choice_wrapped) + len(other_wrapped)

            use_compact_chrome = mandatory_full > available
            chrome_rows = chrome_tight if use_compact_chrome else chrome_full
@ -13955,9 +14136,24 @@ class HermesCLI:
            max_question_rows = max(1, available - chrome_rows - len(choice_wrapped) - len(other_wrapped))
            max_question_rows = min(max_question_rows, 12)  # soft cap on huge terminals

+            # When the choices alone (plus compact chrome) already exceed the
+            # viewport, drop the question entirely — the choices are the only
+            # thing the user must see to make a selection. Without this the
+            # question would still claim its 1-row floor above and push the
+            # tail of the choices off-screen (HSplit clips the overflow).
+            choices_overflow = chrome_rows + len(choice_wrapped) + len(other_wrapped) >= available
+            if choices_overflow:
+                max_question_rows = 0
+
            question_wrapped = _wrap_panel_text(question, inner_text_width)
-            if len(question_wrapped) > max_question_rows:
-                keep = max(1, max_question_rows - 1)
+            if max_question_rows <= 0:
+                question_wrapped = []
+            elif len(question_wrapped) > max_question_rows:
+                # The truncation marker is itself a row, so it must count
+                # against the budget. With a 1-row budget there is no room for
+                # both a question line and the marker — show the marker alone
+                # so the rendered question never exceeds max_question_rows.
+                keep = max(0, max_question_rows - 1)
                question_wrapped = question_wrapped[:keep] + ["… (question truncated)"]

            lines = []
@ -14491,6 +14687,17 @@ class HermesCLI:
                                + (f"\n{_remainder}" if _remainder else "")
                            )

+                    # A bare number right after a bare `/resume` prompt selects
+                    # that session (see #34584). Checked before chat routing so
+                    # the digit isn't sent to the agent as a message.
+                    if (
+                        not _file_drop
+                        and self._pending_resume_sessions
+                        and isinstance(user_input, str)
+                        and self._consume_pending_resume_selection(user_input)
+                    ):
+                        continue
+
                    if not _file_drop and isinstance(user_input, str) and _looks_like_slash_command(user_input):
                        _cprint(f"\n⚙️  {user_input}")
                        try:
@ -15125,13 +15332,50 @@ def main(
    # Handle single query mode
    if query or image:
        query, single_query_images = _collect_query_images(query, image)
+        # Kanban workers spawn with ``hermes chat -q "work kanban task <id>"``;
+        # the actual task description lives in the task body. Mirror the
+        # gateway/CLI behaviour for inbound images by scanning the body for
+        # local image paths and http(s) image URLs and attaching them to the
+        # worker's first turn. Without this, users who paste a screenshot
+        # path or URL into a kanban task body never get it routed to the
+        # model's vision input.
+        single_query_image_urls: list[str] = []
+        _kanban_task_id = os.environ.get("HERMES_KANBAN_TASK", "").strip()
+        if _kanban_task_id:
+            try:
+                from hermes_cli import kanban_db as _kb
+                from agent.image_routing import extract_image_refs as _extract_refs
+
+                _conn = _kb.connect()
+                try:
+                    _task = _kb.get_task(_conn, _kanban_task_id)
+                finally:
+                    try:
+                        _conn.close()
+                    except Exception:
+                        pass
+                _body = getattr(_task, "body", "") if _task is not None else ""
+                if _body:
+                    _kb_paths, _kb_urls = _extract_refs(_body)
+                    if _kb_paths:
+                        # Dedupe against any --image the user already passed.
+                        _seen = {str(p) for p in single_query_images}
+                        for _p in _kb_paths:
+                            if _p not in _seen:
+                                _seen.add(_p)
+                                single_query_images.append(Path(_p))
+                    if _kb_urls:
+                        single_query_image_urls.extend(_kb_urls)
+            except Exception as _exc:
+                # Best-effort enrichment; never block worker startup on it.
+                logger.debug("kanban image-ref extraction failed: %s", _exc)
        if quiet:
            # Quiet mode: suppress banner, spinner, tool previews.
            # Only print the final response and parseable session info.
            cli.tool_progress_mode = "off"
            if cli._ensure_runtime_credentials():
                effective_query: Any = query
-                if single_query_images:
+                if single_query_images or single_query_image_urls:
                    # Honour the same image-routing decision used by the
                    # interactive path. With a vision-capable model (incl.
                    # custom-provider models declared via
@ -15160,19 +15404,26 @@ def main(
                            _parts, _skipped = _build_parts(
                                query if isinstance(query, str) else "",
                                [str(p) for p in single_query_images],
+                                image_urls=list(single_query_image_urls) or None,
                            )
                            if any(p.get("type") == "image_url" for p in _parts):
                                effective_query = _parts
                            else:
                                # All images unreadable — text fallback.
+                                # ``_preprocess_images_with_vision`` only knows
+                                # about local files; URLs would be lost there,
+                                # so keep the original query text intact when
+                                # only URLs were supplied.
+                                if single_query_images:
+                                    effective_query = cli._preprocess_images_with_vision(
+                                        query, single_query_images, announce=False,
+                                    )
+                        except Exception:
+                            if single_query_images:
                                effective_query = cli._preprocess_images_with_vision(
                                    query, single_query_images, announce=False,
                                )
-                        except Exception:
-                            effective_query = cli._preprocess_images_with_vision(
-                                query, single_query_images, announce=False,
-                            )
-                    else:
+                    elif single_query_images:
                        effective_query = cli._preprocess_images_with_vision(
                            query,
                            single_query_images,
--- a/docker/s6-rc.d/dashboard/run
+++ b/docker/s6-rc.d/dashboard/run
@ -30,13 +30,21 @@ cd /opt/data
 dash_host="${HERMES_DASHBOARD_HOST:-0.0.0.0}"
 dash_port="${HERMES_DASHBOARD_PORT:-9119}"

-# Binding to anything other than localhost requires --insecure — the
-# dashboard refuses otherwise because it exposes API keys. Inside a
-# container this is the expected deployment.
+# `--insecure` is opt-in via HERMES_DASHBOARD_INSECURE. The dashboard's
+# OAuth auth gate engages automatically on non-loopback binds when a
+# DashboardAuthProvider is registered (e.g. the bundled dashboard_auth/nous
+# provider, which auto-registers when HERMES_DASHBOARD_OAUTH_CLIENT_ID is
+# set). If no provider is registered, start_server fails closed with a
+# specific operator-facing error.
+#
+# This used to derive --insecure from the bind host ("anything non-loopback
+# implies insecure"), but that predates the OAuth gate and silently
+# disabled it on every container-deployed dashboard. The gate is now the
+# authority; operators on trusted LANs / behind a reverse proxy without
+# the OAuth contract opt in explicitly.
 insecure=""
-case "$dash_host" in
-    127.0.0.1|localhost) ;;
-    *) insecure="--insecure" ;;
+case "${HERMES_DASHBOARD_INSECURE:-}" in
+    1|true|TRUE|True|yes|YES|Yes) insecure="--insecure" ;;
 esac

 # shellcheck disable=SC2086  # word-splitting of $insecure is intentional
--- a/docker/stage2-hook.sh
+++ b/docker/stage2-hook.sh
@ -33,6 +33,15 @@ INSTALL_DIR="/opt/hermes"
 mkdir -p "$HERMES_HOME"

 # --- UID/GID remap ---
+# Accept PUID/PGID as aliases for HERMES_UID/HERMES_GID.  NAS users (UGOS,
+# Synology, unRAID) expect the LinuxServer.io PUID/PGID convention and
+# bind-mount /opt/data from a host directory owned by their own UID; without
+# this alias those vars are silently ignored and the s6-setuidgid drop to
+# UID 10000 leaves the runtime unable to read the volume.  HERMES_UID/
+# HERMES_GID still win when both are set.  See #15290, salvages #25872.
+HERMES_UID="${HERMES_UID:-${PUID:-}}"
+HERMES_GID="${HERMES_GID:-${PGID:-}}"
+
 if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
    echo "[stage2] Changing hermes UID to $HERMES_UID"
    usermod -u "$HERMES_UID" hermes
@ -44,6 +53,62 @@ if [ -n "${HERMES_GID:-}" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
    groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
 fi

+# --- Docker socket group membership (docker-in-docker / DooD) ---
+# When the user bind-mounts the host Docker daemon socket
+# (`-v /var/run/docker.sock:/var/run/docker.sock`) to use the `docker`
+# terminal backend from inside the container, the socket is owned by the
+# host's `docker` group (or root). The supervised hermes user (UID 10000)
+# is not a member of any group that matches the socket's GID, so every
+# `docker` invocation EACCES'es and `check_terminal_requirements()` fails.
+# See #16703.
+#
+# Granting the supp group via `docker run --group-add <gid>` alone is
+# NOT sufficient with our s6-setuidgid privilege drop: s6-setuidgid (and
+# gosu, the older shim) calls initgroups() for the target user, which
+# rebuilds the supplementary group list from /etc/group. Without an
+# /etc/group entry whose GID matches the socket, the kernel-granted
+# supp group is silently wiped between PID 1 and the dropped process.
+# Confirmed empirically: `--group-add 998` alone leaves the dropped
+# hermes process with `Groups: 10000` (998 gone); after this hook adds
+# the entry, the dropped process has `Groups: 998 10000` as expected.
+#
+# Fix: detect the socket's GID at boot and ensure /etc/group has a
+# matching entry that includes hermes. Idempotent across container
+# restarts. Skipped silently when no socket is bind-mounted.
+#
+# Handles the awkward corner cases:
+#   - socket owned by GID 0 (root) — some Podman setups; usermod -aG root
+#   - socket GID already used by a known container group (e.g. tty=5):
+#     reuse that group's name rather than creating a duplicate
+#   - hermes is already a member of the right group (idempotent restart)
+#   - groupadd/usermod failures under rootless containers — non-fatal
+for sock in /var/run/docker.sock /run/docker.sock; do
+    [ -S "$sock" ] || continue
+    sock_gid=$(stat -c '%g' "$sock" 2>/dev/null) || continue
+    [ -n "$sock_gid" ] || continue
+    # Already a member? Nothing to do.
+    if id -G hermes 2>/dev/null | tr ' ' '\n' | grep -qx "$sock_gid"; then
+        echo "[stage2] hermes already in group $sock_gid for $sock"
+        break
+    fi
+    # Resolve or create a group name for this GID.
+    sock_group=$(getent group "$sock_gid" 2>/dev/null | cut -d: -f1)
+    if [ -z "$sock_group" ]; then
+        sock_group="hostdocker"
+        if ! groupadd -g "$sock_gid" "$sock_group" 2>/dev/null; then
+            echo "[stage2] Warning: groupadd -g $sock_gid $sock_group failed; skipping docker socket group setup"
+            break
+        fi
+        echo "[stage2] Created group $sock_group (GID $sock_gid) for Docker socket"
+    fi
+    if usermod -aG "$sock_group" hermes 2>/dev/null; then
+        echo "[stage2] Added hermes to group $sock_group (GID $sock_gid) for $sock"
+    else
+        echo "[stage2] Warning: usermod -aG $sock_group hermes failed; docker backend may fail with EACCES"
+    fi
+    break
+done
+
 # --- Fix ownership of data volume ---
 # When HERMES_UID is remapped or the top-level $HERMES_HOME isn't owned by
 # the runtime hermes UID, restore ownership to hermes — but ONLY for the
--- a/docs/security/network-egress-isolation.md
+++ b/docs/security/network-egress-isolation.md
@ -0,0 +1,195 @@
+# Network Egress Isolation for Docker Deployments
+
+When running Hermes inside Docker, the default `network_mode: host` gives the
+agent process unrestricted outbound network access. This guide shows how to
+segment traffic so the agent core can only reach the services it needs, while
+blocking arbitrary outbound connections.
+
+This is primarily a defense against prompt injection attacks that attempt to
+exfiltrate data via `curl`, `wget`, or raw HTTP from tool-generated shell
+commands.
+
+## Threat Model
+
+The Hermes [SECURITY.md](../../SECURITY.md) §2 defines the trust model. The
+terminal backend is the primary execution boundary. However, when running with
+`network_mode: host`, any command the agent executes can reach any endpoint on
+the network, including external ones.
+
+Network egress isolation adds a second layer: even if a malicious command
+executes inside the container, it cannot reach endpoints outside the
+explicitly allowlisted set.
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────┐
+│  Docker Network: internal (no internet)     │
+│                                             │
+│   ┌──────────────┐   ┌──────────────────┐   │
+│   │ hermes-agent │   │ hermes-dashboard │   │
+│   └──────┬───────┘   └────────┬─────────┘   │
+│          │                    │              │
+│          ▼                    │              │
+│   ┌──────────────┐            │              │
+│   │ hermes-gtw   │◄───────────┘              │
+│   └──────┬───────┘                           │
+│          │                                   │
+└──────────┼───────────────────────────────────┘
+           │
+┌──────────┼───────────────────────────────────┐
+│  Docker Network: egress (internet-capable)   │
+│          │                                   │
+│          ▼                                   │
+│   ┌─────────────────┐                        │
+│   │ egress-proxy     │──► allowlisted hosts  │
+│   │ (squid / envoy)  │                       │
+│   └─────────────────┘                        │
+└──────────────────────────────────────────────┘
+```
+
+Two Docker networks:
+
+- **`internal`** — no default route, no internet access. The agent, dashboard,
+  and gateway run here.
+- **`egress`** — has internet access. Only services that need to reach external
+  APIs are attached to this network.
+
+The gateway service is dual-homed (attached to both networks) so it can
+receive inbound messages from Telegram/Slack/etc. and forward them to the
+agent on the internal network.
+
+## Compose Configuration
+
+Override the default `docker-compose.yml` with a
+`docker-compose.override.yml`:
+
+```yaml
+# docker-compose.override.yml
+# Network egress isolation for production deployments.
+#
+# Usage:
+#   HERMES_UID=$(id -u) HERMES_GID=$(id -g) docker compose up -d
+#
+# This overrides network_mode: host with isolated Docker networks.
+
+networks:
+  internal:
+    driver: bridge
+    internal: true          # no default route, no internet
+  egress:
+    driver: bridge
+
+services:
+  gateway:
+    network_mode: ""        # clear the host-mode default
+    networks:
+      - internal
+      - egress              # needs outbound for Telegram, LLM APIs
+    ports:
+      - "127.0.0.1:9119:9119"   # dashboard proxy, localhost only
+
+  dashboard:
+    network_mode: ""
+    networks:
+      - internal            # internal only, no egress needed
+```
+
+### With an Egress Proxy (Recommended)
+
+For tighter control, route all outbound traffic through an HTTP proxy with
+an explicit allowlist:
+
+```yaml
+# docker-compose.override.yml (with egress proxy)
+
+networks:
+  internal:
+    driver: bridge
+    internal: true
+  egress:
+    driver: bridge
+
+services:
+  gateway:
+    network_mode: ""
+    networks:
+      - internal
+      - egress
+    environment:
+      - HTTP_PROXY=http://egress-proxy:3128
+      - HTTPS_PROXY=http://egress-proxy:3128
+      - NO_PROXY=hermes,hermes-dashboard,localhost
+
+  dashboard:
+    network_mode: ""
+    networks:
+      - internal
+
+  egress-proxy:
+    image: ubuntu/squid:6.10-24.04_edge
+    networks:
+      - egress
+    volumes:
+      - ./config/squid-allowlist.conf:/etc/squid/conf.d/allowlist.conf:ro
+    restart: unless-stopped
+```
+
+Example `config/squid-allowlist.conf`:
+
+```
+# Only allow HTTPS CONNECT to these hosts
+acl allowed_hosts dstdomain api.openai.com
+acl allowed_hosts dstdomain api.anthropic.com
+acl allowed_hosts dstdomain openrouter.ai
+acl allowed_hosts dstdomain generativelanguage.googleapis.com
+acl allowed_hosts dstdomain api.telegram.org
+acl allowed_hosts dstdomain api.github.com
+acl allowed_hosts dstdomain discord.com
+
+http_access allow CONNECT allowed_hosts
+http_access deny all
+```
+
+Adjust the allowlist to match your LLM provider and messaging platform.
+
+## Validating the Setup
+
+After bringing up the stack, verify isolation:
+
+```bash
+# From an internal-only container (dashboard): this should FAIL (no egress)
+docker compose exec dashboard \
+  curl -sf --max-time 5 https://example.com && echo "FAIL: egress not blocked" || echo "OK: egress blocked"
+
+# From the gateway container: this should SUCCEED (internal network)
+docker compose exec gateway \
+  curl -sf --max-time 5 http://hermes-dashboard:9119/health && echo "OK: internal reachable" || echo "FAIL"
+
+# If using egress proxy: this should SUCCEED (allowlisted)
+docker compose exec gateway \
+  curl -sf --max-time 5 --proxy http://egress-proxy:3128 https://api.openai.com/v1/models && echo "OK" || echo "FAIL"
+```
+
+## Limitations
+
+- **DNS resolution:** The `internal` network can still resolve external DNS
+  names unless you also run a local DNS resolver that blocks external queries.
+  DNS lookups are a low-bandwidth channel, but they can still be abused to
+  tunnel data out, so restrict DNS too if exfiltration is in your threat model.
+
+- **Not a substitute for sandbox backends:** This guide isolates the agent
+  *container's* network. If you use the default local terminal backend, tool
+  commands execute inside the same container. For stronger isolation, combine
+  network segmentation with a sandboxed terminal backend (Docker, Modal,
+  Daytona).
+
+- **Platform adapters need egress:** The gateway service needs outbound access
+  to reach messaging platform APIs. If you add new platform adapters, add their
+  API endpoints to the proxy allowlist.
+
+## Related
+
+- [SECURITY.md](../../SECURITY.md) — Hermes trust model and vulnerability reporting
+- [Terminal backends](../../README.md) — sandboxed execution targets
+- [docker-compose.yml](../../docker-compose.yml) — default compose configuration
--- a/gateway/config.py
+++ b/gateway/config.py
@ -474,6 +474,13 @@ class GatewayConfig:
    
    # Delivery settings
    always_log_local: bool = True  # Always save cron outputs to local files
+    # Drop outbound "silence narration" messages (e.g. *(silent)*, 🔇, a bare
+    # ".") pre-send. These are model hallucinations emitted when a persona has
+    # nothing actionable to say; in bot-to-bot channels they mirror back and
+    # forth, burning tokens and crashing models. Substrate-level guard that
+    # survives SOUL.md/prompt drift across providers. Opt out with False for
+    # raw passthrough.
+    filter_silence_narration: bool = True

    # STT settings
    stt_enabled: bool = True  # Whether to auto-transcribe inbound voice messages
@ -582,6 +589,7 @@ class GatewayConfig:
            "quick_commands": self.quick_commands,
            "sessions_dir": str(self.sessions_dir),
            "always_log_local": self.always_log_local,
+            "filter_silence_narration": self.filter_silence_narration,
            "stt_enabled": self.stt_enabled,
            "group_sessions_per_user": self.group_sessions_per_user,
            "thread_sessions_per_user": self.thread_sessions_per_user,
@ -650,6 +658,9 @@ class GatewayConfig:
            quick_commands=quick_commands,
            sessions_dir=sessions_dir,
            always_log_local=_coerce_bool(data.get("always_log_local"), True),
+            filter_silence_narration=_coerce_bool(
+                data.get("filter_silence_narration"), True
+            ),
            stt_enabled=_coerce_bool(stt_enabled, True),
            group_sessions_per_user=_coerce_bool(group_sessions_per_user, True),
            thread_sessions_per_user=_coerce_bool(thread_sessions_per_user, False),
@ -757,21 +768,32 @@ def load_gateway_config() -> GatewayConfig:
            if "always_log_local" in yaml_cfg:
                gw_data["always_log_local"] = yaml_cfg["always_log_local"]

+            if "filter_silence_narration" in yaml_cfg:
+                gw_data["filter_silence_narration"] = yaml_cfg[
+                    "filter_silence_narration"
+                ]
+
            if "unauthorized_dm_behavior" in yaml_cfg:
                gw_data["unauthorized_dm_behavior"] = _normalize_unauthorized_dm_behavior(
                    yaml_cfg.get("unauthorized_dm_behavior"),
                    "pair",
                )

-            # Merge platforms section from config.yaml into gw_data so that
-            # nested keys like platforms.webhook.extra.routes are loaded.
-            yaml_platforms = yaml_cfg.get("platforms")
+            # Merge platform config into gw_data so runtime-only settings under
+            # ``gateway.platforms`` are loaded the same way as top-level
+            # ``platforms``. Merge nested first so top-level config keeps
+            # precedence, matching the existing gateway.streaming fallback.
+            gateway_cfg = yaml_cfg.get("gateway")
+            gateway_platforms = gateway_cfg.get("platforms") if isinstance(gateway_cfg, dict) else None
            platforms_data = gw_data.setdefault("platforms", {})
            if not isinstance(platforms_data, dict):
                platforms_data = {}
                gw_data["platforms"] = platforms_data
-            if isinstance(yaml_platforms, dict):
-                for plat_name, plat_block in yaml_platforms.items():
+
+            def _merge_platform_map(source_platforms: Any) -> None:
+                if not isinstance(source_platforms, dict):
+                    return
+                for plat_name, plat_block in source_platforms.items():
                    if not isinstance(plat_block, dict):
                        continue
                    existing = platforms_data.get(plat_name, {})
@ -785,6 +807,10 @@ def load_gateway_config() -> GatewayConfig:
                    if merged_extra:
                        merged["extra"] = merged_extra
                    platforms_data[plat_name] = merged
+
+            _merge_platform_map(gateway_platforms)
+            _merge_platform_map(yaml_cfg.get("platforms"))
+            if platforms_data:
                gw_data["platforms"] = platforms_data
            # Iterate built-in platforms plus any registered plugin platforms
            # so plugin authors get the same shared-key bridging (#24836).
@ -890,6 +916,18 @@ def load_gateway_config() -> GatewayConfig:
                    if entry.apply_yaml_config_fn is None:
                        continue
                    platform_cfg = yaml_cfg.get(entry.name)
+                    # Fall back to the platform's block under ``platforms`` /
+                    # ``gateway.platforms`` so adapter hooks still run when the
+                    # user configured the platform only under those nested paths
+                    # (e.g. ``platforms.discord.extra.allow_from``) and not via a
+                    # top-level ``discord:`` block.
+                    if not isinstance(platform_cfg, dict):
+                        for _src in (gateway_platforms, yaml_cfg.get("platforms")):
+                            if isinstance(_src, dict):
+                                _candidate = _src.get(entry.name)
+                                if isinstance(_candidate, dict):
+                                    platform_cfg = _candidate
+                                    break
                    if not isinstance(platform_cfg, dict):
                        continue
                    try:
--- a/gateway/delivery.py
+++ b/gateway/delivery.py
@ -9,6 +9,8 @@ Routes messages to the appropriate destination based on:
 """

 import logging
+import os
+import re
 from pathlib import Path
 from datetime import datetime
 from dataclasses import dataclass
@ -21,6 +23,32 @@ logger = logging.getLogger(__name__)
 MAX_PLATFORM_OUTPUT = 4000
 TRUNCATED_VISIBLE = 3800

+# Matches strings that are *only* a "silence" narration with optional markdown
+# wrappers. Covers: *(silent)*, _silent_, `silent`, ~silent~, (silent), silent,
+# 🔇, a bare ".", "…", and the whitespace/marker-padded variants seen in the
+# wild. Anchored to start/end so substantive messages that merely *contain* the
+# word "silent" are never matched.
+_SILENCE_NARRATION = re.compile(
+    r'^[\s*_~`]*\(?\s*(silent|silence|no\s+response|no\s+reply)\s*\.?\)?[\s*_~`]*$'
+    r'|^[\s*_~`]*[\U0001F507\.\u2026]+[\s*_~`]*$',
+    re.IGNORECASE,
+)
+
+
+def _is_silence_narration(content: Optional[str]) -> bool:
+    """Return True when ``content`` is *only* a silence-narration token.
+
+    Length-guarded (real messages are longer) and anchored to the whole string
+    so legitimate prose like "The deployment ran silently" or "Silence is
+    golden — here is the plan..." is never flagged.
+
+    Args:
+        content: Outbound message text; ``None`` and ``""`` are accepted.
+
+    Returns:
+        ``False`` for ``None``/empty input, for whitespace-only input, and for
+        anything longer than 64 characters after stripping; otherwise whether
+        the stripped text matches ``_SILENCE_NARRATION``.
+    """
+    if not content:
+        return False
+    # Surrounding whitespace is dropped before both the length check and the
+    # regex match.
+    stripped = content.strip()
+    if not stripped or len(stripped) > 64:  # length guard
+        return False
+    return bool(_SILENCE_NARRATION.match(stripped))
+
 from .config import Platform, GatewayConfig
 from .session import SessionSource

@ -261,6 +289,18 @@ class DeliveryRouter:
        path.write_text(content)
        return path

+    def _filter_silence_narration_enabled(self) -> bool:
+        """Whether the outbound silence-narration filter is active.
+
+        ``HERMES_FILTER_SILENCE_NARRATION`` env var overrides config when set;
+        otherwise the ``gateway.filter_silence_narration`` config flag wins
+        (default True).
+
+        When the env var is set, only ``1``, ``true``, ``yes`` and ``on``
+        (case-insensitive, surrounding whitespace ignored) enable the filter.
+        Any other value — including an empty string — disables it, regardless
+        of the config flag.
+
+        Returns:
+            ``True`` if silence-narration messages should be dropped.
+        """
+        env = os.getenv("HERMES_FILTER_SILENCE_NARRATION")
+        # ``is not None`` (not truthiness): a set-but-empty variable still
+        # counts as an override and evaluates to False below.
+        if env is not None:
+            return env.strip().lower() in ("1", "true", "yes", "on")
+        # Config attribute is optional; a missing attribute means enabled.
+        return bool(getattr(self.config, "filter_silence_narration", True))
+
    async def _deliver_to_platform(
        self,
        target: DeliveryTarget,
@ -286,6 +326,27 @@ class DeliveryRouter:
                + f"\n\n... [truncated, full output saved to {saved_path}]"
            )
        
+        # Substrate-level anti-loop guard: drop hallucinated "silence narration"
+        # (*(silent)*, 🔇, a bare ".", etc.) before it ever reaches the adapter.
+        # In bot-to-bot channels these tokens mirror back and forth until a
+        # model crashes with "no content after all retries". Behavioral prompt
+        # rules drift across providers; this single chokepoint covers every
+        # platform adapter regardless of which persona's prompt failed.
+        # Local/file delivery (_deliver_local) is a separate path and is never
+        # filtered — saved silence has no loop risk.
+        if self._filter_silence_narration_enabled() and _is_silence_narration(content):
+            logger.warning(
+                "Dropped silence-narration outbound to %s (chat=%s): %r",
+                target.platform.value,
+                target.chat_id,
+                content[:40],
+            )
+            return {
+                "success": True,
+                "filtered": "silence_narration",
+                "delivered": False,
+            }
+
        send_metadata = dict(metadata or {})
        is_named_telegram_private_topic = False
        named_telegram_private_topic_name: Optional[str] = None
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@ -1605,6 +1605,7 @@ class APIServerAdapter(BasePlatformAdapter):
                )
                final_response = result.get("final_response", "") if isinstance(result, dict) else ""
                effective_session_id = result.get("session_id", session_id) if isinstance(result, dict) else session_id
+                turn_messages = self._turn_transcript_messages(history, user_message, result) if isinstance(result, dict) else []
                await queue.put(_event_payload("assistant.completed", {
                    "session_id": effective_session_id,
                    "message_id": message_id,
@ -1617,6 +1618,7 @@ class APIServerAdapter(BasePlatformAdapter):
                    "session_id": effective_session_id,
                    "message_id": message_id,
                    "completed": True,
+                    "messages": turn_messages,
                    "usage": usage,
                }))
            except Exception as exc:
@ -3329,6 +3331,44 @@ class APIServerAdapter(BasePlatformAdapter):
            return len(prior)
        return 0

+    @classmethod
+    def _turn_transcript_messages(
+        cls,
+        conversation_history: List[Dict[str, Any]],
+        user_message: Any,
+        result: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        """Return this turn's assistant/tool messages in client-safe shape.
+
+        The streaming SSE contract delivers all assistant text as
+        ``assistant.delta`` events under one ``message_id`` interleaved with
+        ``tool.*`` events, and a single ``assistant.completed`` carrying only
+        the final reply.  A client that accumulates deltas into one buffer
+        cannot reconstruct *intermediate* assistant text segments that preceded
+        tool calls — so when the page is re-opened mid/post-stream those
+        segments appear lost, even though state.db persisted them correctly.
+
+        Emitting the authoritative per-turn transcript on ``run.completed`` lets
+        any SSE consumer reconcile its live view against ground truth without a
+        separate ``GET /messages`` round-trip.  Purely additive: clients that
+        ignore the field are unaffected.  Refs #34703.
+
+        Args:
+            conversation_history: Messages that preceded this turn; forwarded
+                to ``_response_messages_turn_start_index``.
+            user_message: The user input for this turn; forwarded to
+                ``_response_messages_turn_start_index``.
+            result: Agent result; its ``"messages"`` entry is the list that is
+                sliced and filtered.
+
+        Returns:
+            One ``_message_response`` entry per ``assistant``/``tool`` message
+            from the turn's start index onward, in original order. Empty when
+            ``result`` is not a dict or ``result["messages"]`` is missing, not
+            a list, or empty.
+        """
+        agent_messages = result.get("messages") if isinstance(result, dict) else None
+        if not isinstance(agent_messages, list) or not agent_messages:
+            return []
+        # Index of the first message belonging to this turn.
+        start = cls._response_messages_turn_start_index(
+            conversation_history, user_message, result
+        )
+        turn = agent_messages[start:]
+        out: List[Dict[str, Any]] = []
+        for msg in turn:
+            # Non-dict entries and any role other than assistant/tool are
+            # skipped.
+            if not isinstance(msg, dict):
+                continue
+            if msg.get("role") not in {"assistant", "tool"}:
+                continue
+            out.append(cls._message_response(msg))
+        return out
+
    @staticmethod
    def _extract_output_items(result: Dict[str, Any], start_index: int = 0) -> List[Dict[str, Any]]:
        """
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -472,6 +472,7 @@ def is_host_excluded_by_no_proxy(hostname: str, no_proxy_value: str | None = Non
    return False


+import dataclasses
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@ -847,6 +848,13 @@ MEDIA_DELIVERY_SAFE_ROOTS = (
    _HERMES_HOME / "video_cache",
    _HERMES_HOME / "document_cache",
    _HERMES_HOME / "browser_screenshots",
+    # Canonical cache layout — listed alongside the legacy *_cache dirs so
+    # generated artifacts deliver on installs that have both (#31733).
+    _HERMES_HOME / "cache" / "images",
+    _HERMES_HOME / "cache" / "audio",
+    _HERMES_HOME / "cache" / "videos",
+    _HERMES_HOME / "cache" / "documents",
+    _HERMES_HOME / "cache" / "screenshots",
 )

 # Default recency window for trusting freshly-produced files (seconds).
@ -946,11 +954,13 @@ def _media_delivery_denied_paths() -> List[Path]:
    home = Path(os.path.expanduser("~"))
    for sub in _MEDIA_DELIVERY_DENIED_HOME_SUBPATHS:
        denied.append(home / sub)
-    # The Hermes home itself contains credentials (auth.json, .env) — only the
-    # cache subdirectories under it are explicitly allowlisted above.
+    # The Hermes home itself contains credentials (auth.json, .env) and
+    # configuration (config.yaml) — only the cache subdirectories under it
+    # are explicitly allowlisted above.
    denied.append(_HERMES_HOME / ".env")
    denied.append(_HERMES_HOME / "auth.json")
    denied.append(_HERMES_HOME / "credentials")
+    denied.append(_HERMES_HOME / "config.yaml")
    return denied


@ -1021,7 +1031,11 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
    if not candidate:
        return None

-    expanded = Path(os.path.expanduser(candidate))
+    try:
+        expanded = Path(os.path.expanduser(candidate))
+    except (OSError, RuntimeError, ValueError):
+        # expanduser raises ValueError("embedded null byte") for a ~\x00 path.
+        return None
    if not expanded.is_absolute():
        return None

@ -1065,6 +1079,17 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
    return None


+# Neutralise control chars and the Unicode line separators (NEL, LS, PS) that
+# str.splitlines() / log aggregators treat as breaks, so a model-emitted path
+# can't forge a second log line. Truncated to keep records bounded.
+_LOG_UNSAFE_CHARS = re.compile(r"[\x00-\x1f\x7f\x85\u2028\u2029]")
+
+
+def _log_safe_path(path: str) -> str:
+    """Return a single-line, length-bounded path for log output.
+
+    Every character matched by ``_LOG_UNSAFE_CHARS`` is replaced with ``?``
+    and the result is cut to at most 200 characters. ``path`` is passed
+    through ``str()`` first, so non-string input is tolerated.
+    """
+    return _LOG_UNSAFE_CHARS.sub("?", str(path))[:200]
+
+
 SUPPORTED_DOCUMENT_TYPES = {
    ".pdf": "application/pdf",
    ".md": "text/markdown",
@ -1108,6 +1133,77 @@ SUPPORTED_IMAGE_DOCUMENT_TYPES = {
 }


+# ---------------------------------------------------------------------------
+# Media-delivery extension allowlist — SINGLE SOURCE OF TRUTH
+#
+# Both extractors that turn response text into native attachments derive their
+# extension set from this tuple:
+#   * ``extract_media()``       — explicit ``MEDIA:<path>`` tags
+#   * ``extract_local_files()`` — bare absolute/home paths the agent mentions
+#
+# Historically these two carried independently-maintained extension lists.
+# ``extract_media`` had a narrow list (no .md/.json/.yaml/.xml/.html/...) while
+# ``extract_local_files`` had a broad one. Combined with the unconditional
+# ``MEDIA:\\s*\\S+`` cleanup at the dispatch sites, that mismatch created a
+# silent black hole: a ``MEDIA:/report.md`` tag failed the narrow extract_media
+# match, got stripped from the body by the loose cleanup regex, and was then
+# invisible to extract_local_files — the file was never delivered (issue
+# #34517). Keeping one list eliminates the drift; building the cleanup regexes
+# from the same set means a tag is only stripped when its extension is one we
+# can actually deliver, so an unknown-extension path survives in the body
+# instead of vanishing.
+#
+# Covers images (inline), video (inline where supported), audio (voice/audio),
+# documents/spreadsheets/presentations (send_document), archives, and rendered
+# web output. The dispatch partition (image vs video vs document) lives in
+# ``gateway/run.py``.
+# ---------------------------------------------------------------------------
+
+MEDIA_DELIVERY_EXTS: Tuple[str, ...] = (
+    # Images (embed inline)
+    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg",
+    # Video (embed inline where supported)
+    ".mp4", ".mov", ".avi", ".mkv", ".webm",
+    # Audio (delivered as voice/audio where supported)
+    ".mp3", ".wav", ".ogg", ".opus", ".m4a", ".flac",
+    # Documents (uploaded as file attachments)
+    ".pdf", ".docx", ".doc", ".odt", ".rtf", ".txt", ".md", ".epub",
+    # Spreadsheets / data
+    ".xlsx", ".xls", ".ods", ".csv", ".tsv", ".json", ".xml", ".yaml", ".yml",
+    # Presentations
+    ".pptx", ".ppt", ".odp", ".key",
+    # Archives
+    ".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar", ".apk", ".ipa",
+    # Web / rendered output
+    ".html", ".htm",
+)
+
+# Regex alternation fragment of bare extensions (no leading dot), one literal
+# branch per MEDIA_DELIVERY_EXTS entry (``jpg`` and ``jpeg`` stay separate).
+# Sorted longest-first so a shorter extension is never tried before a longer
+# one it prefixes (e.g. ``docx`` before ``doc``, ``html`` before ``htm``).
+_MEDIA_EXT_ALTERNATION = "|".join(
+    sorted((e.lstrip(".") for e in MEDIA_DELIVERY_EXTS), key=len, reverse=True)
+)
+
+# Anchored ``MEDIA:<path>`` cleanup pattern. Unlike the old loose
+# ``MEDIA:\\s*\\S+``, this only strips a tag whose path ends in a known
+# deliverable extension (optionally quoted/backticked). A ``MEDIA:`` tag with
+# an unknown extension is left in the text so it can still be picked up by the
+# bare-path detector (extract_local_files) downstream rather than silently
+# deleted. Shared by the non-streaming dispatch path and the streaming
+# consumer so both behave identically.
+# Path anchors: ``~/`` (Unix home-relative), ``/`` (Unix absolute),
+# ``X:\\`` or ``X:/`` (Windows drive-letter absolute — #34632).
+MEDIA_TAG_CLEANUP_RE = re.compile(
+    r'''[`"']?MEDIA:\s*'''
+    r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
+    r'''(?:~/|/|[A-Za-z]:[/\\])\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
+    r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
+    re.IGNORECASE,
+)
+
+
 def get_document_cache_dir() -> Path:
    """Return the document cache directory, creating it if it doesn't exist."""
    DOCUMENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
@ -1561,6 +1657,10 @@ class BasePlatformAdapter(ABC):
        self.config = config
        self.platform = platform
        self._message_handler: Optional[MessageHandler] = None
+        # Optional hook (e.g. Telegram DM topic recovery) that rewrites
+        # ``event.source.thread_id`` before session keying. Returns the
+        # corrected thread_id or None to leave the source untouched.
+        self._topic_recovery_fn: Optional[Callable[[Any], Optional[str]]] = None
        self._running = False
        self._fatal_error_code: Optional[str] = None
        self._fatal_error_message: Optional[str] = None
@ -1628,6 +1728,29 @@ class BasePlatformAdapter(ABC):
        """
        return len

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Whether this adapter gates inbound access before dispatch.
+
+        Some adapters (WeCom, Weixin, Yuanbao, QQBot) implement a documented
+        config-driven access surface — ``dm_policy`` / ``group_policy`` /
+        ``allow_from`` / ``group_allow_from`` in ``PlatformConfig.extra`` — and
+        enforce it at intake: a message is dropped inside the adapter and never
+        reaches the gateway unless it already passed that policy.
+
+        The gateway's env-based allowlist check runs *after* the adapter, so for
+        these platforms a message arriving at ``_is_user_authorized`` has, by
+        definition, already been authorized by the adapter. Without this flag the
+        gateway would then deny it again (no env allowlist → default deny),
+        silently breaking ``dm_policy: open`` and config-only allowlists.
+
+        Adapters that own their access policy override this to return ``True``.
+        The gateway treats that as "already authorized at intake" and skips the
+        env-allowlist default-deny. Adapters that delegate access control to the
+        gateway leave it ``False`` (the default).
+
+        Returns:
+            ``False`` in this base implementation.
+        """
+        return False
+
    def supports_draft_streaming(
        self,
        chat_type: Optional[str] = None,
@ -1816,6 +1939,40 @@ class BasePlatformAdapter(ABC):
        """
        self._message_handler = handler

+    def set_topic_recovery_fn(
+        self,
+        fn: Optional[Callable[[Any], Optional[str]]],
+    ) -> None:
+        """Install a thread_id-recovery hook (Telegram DM topic mode).
+
+        The hook is called with ``event.source`` before session keying;
+        a non-None return value replaces ``source.thread_id``. Pass
+        ``None`` to clear the hook.
+
+        Args:
+            fn: Callable taking the event source and returning the corrected
+                thread_id, or ``None`` to leave the source untouched.
+        """
+        # Plain assignment, so the attribute is also created on instances that
+        # were built via ``object.__new__`` (tests) and never ran
+        # ``BasePlatformAdapter.__init__``. The reader side uses
+        # ``getattr(self, "_topic_recovery_fn", None)`` to tolerate it being
+        # unset.
+        self._topic_recovery_fn = fn  # type: ignore[attr-defined]
+
+    def _apply_topic_recovery(self, event: MessageEvent) -> None:
+        """Swap ``event.source`` for a copy with the recovered ``thread_id``.
+
+        No-op when no hook is installed, when ``event`` has no ``source``,
+        when the hook raises or returns ``None``, or when the returned value
+        (compared as a string) equals the current ``thread_id``. The original
+        source object is not mutated: ``dataclasses.replace`` builds a new one
+        that is assigned back to ``event.source``.
+
+        Args:
+            event: Inbound message event whose ``source`` may be replaced.
+        """
+        # getattr: the attribute may be absent if ``__init__`` never ran.
+        recover = getattr(self, "_topic_recovery_fn", None)
+        if recover is None:
+            return
+        source = getattr(event, "source", None)
+        if source is None:
+            return
+        try:
+            recovered = recover(source)
+        except Exception:
+            # Best effort: a failing hook must not block message handling.
+            logger.debug("topic recovery hook failed", exc_info=True)
+            return
+        # A missing/empty current thread_id is compared as "".
+        if recovered is None or str(recovered) == str(source.thread_id or ""):
+            return
+        try:
+            event.source = dataclasses.replace(source, thread_id=str(recovered))
+        except Exception:
+            # e.g. ``source`` is not a dataclass instance; keep the original.
+            logger.debug("topic recovery rewrite failed", exc_info=True)
+
    def set_busy_session_handler(self, handler: Optional[Callable[[MessageEvent, str], Awaitable[bool]]]) -> None:
        """Set an optional handler for messages arriving during active sessions."""
        self._busy_session_handler = handler
@ -2399,11 +2556,12 @@ class BasePlatformAdapter(ABC):
        """Drop unsafe MEDIA paths and normalize accepted paths."""
        safe_media: List[Tuple[str, bool]] = []
        for media_path, is_voice in media_files or []:
-            safe_path = validate_media_delivery_path(str(media_path))
+            raw = str(media_path)
+            safe_path = validate_media_delivery_path(raw)
            if safe_path:
                safe_media.append((safe_path, bool(is_voice)))
            else:
-                logger.warning("Skipping unsafe MEDIA directive path outside allowed roots")
+                logger.warning("Skipping unsafe MEDIA directive path: %s", _log_safe_path(raw))
        return safe_media

    @staticmethod
@ -2411,11 +2569,12 @@ class BasePlatformAdapter(ABC):
        """Drop unsafe bare local file paths and normalize accepted paths."""
        safe_paths: List[str] = []
        for file_path in file_paths or []:
-            safe_path = validate_media_delivery_path(str(file_path))
+            raw = str(file_path)
+            safe_path = validate_media_delivery_path(raw)
            if safe_path:
                safe_paths.append(safe_path)
            else:
-                logger.warning("Skipping unsafe local file path outside allowed roots")
+                logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
        return safe_paths

    @staticmethod
@ -2456,17 +2615,22 @@ class BasePlatformAdapter(ABC):
        cleaned = cleaned.replace("[[as_document]]", "")
        
        # Extract MEDIA:<path> tags, allowing optional whitespace after the colon
-        # and quoted/backticked paths for LLM-formatted outputs.
-        media_pattern = re.compile(
-            r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?'''
-        )
+        # and quoted/backticked paths for LLM-formatted outputs. The extension
+        # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
+        # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
+        media_pattern = MEDIA_TAG_CLEANUP_RE
        for match in media_pattern.finditer(content):
            path = match.group("path").strip()
            if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
                path = path[1:-1].strip()
            path = path.lstrip("`\"'").rstrip("`\"',.;:)}]")
            if path:
-                media.append((os.path.expanduser(path), has_voice_tag))
+                try:
+                    media.append((os.path.expanduser(path), has_voice_tag))
+                except (OSError, RuntimeError, ValueError):
+                    # Skip a crafted ~\x00 path rather than aborting extraction
+                    # and dropping every other attachment in the response.
+                    continue

        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
        if media:
@ -2500,31 +2664,15 @@ class BasePlatformAdapter(ABC):
            Tuple of (list of expanded file paths, cleaned text with the
            raw path strings removed).
        """
-        _LOCAL_MEDIA_EXTS = (
-            # Images (embed inline)
-            '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.svg',
-            # Video (embed inline where supported)
-            '.mp4', '.mov', '.avi', '.mkv', '.webm',
-            # Audio (delivered as voice/audio where supported)
-            '.mp3', '.wav', '.ogg', '.m4a', '.flac',
-            # Documents (uploaded as file attachments)
-            '.pdf', '.docx', '.doc', '.odt', '.rtf', '.txt', '.md',
-            # Spreadsheets / data
-            '.xlsx', '.xls', '.ods', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
-            # Presentations
-            '.pptx', '.ppt', '.odp', '.key',
-            # Archives
-            '.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar',
-            # Web / rendered output
-            '.html', '.htm',
-        )
+        _LOCAL_MEDIA_EXTS = MEDIA_DELIVERY_EXTS
        ext_part = '|'.join(e.lstrip('.') for e in _LOCAL_MEDIA_EXTS)

        # (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
        #             and relative paths (./foo.png)
-        # (?:~/|/)    anchors to absolute or home-relative paths
+        # (?:~/|/)    anchors to absolute or home-relative Unix paths
+        # (?:[A-Za-z]:[/\\]) anchors to Windows drive-letter paths (#34632)
        path_re = re.compile(
-            r'(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:' + ext_part + r')\b',
+            r'(?<![/:\w.])(?:~/|/|[A-Za-z]:[/\\])(?:[\w.\-]+[/\\])*[\w.\-]+\.(?:' + ext_part + r')\b',
            re.IGNORECASE,
        )

@ -3332,7 +3480,12 @@ class BasePlatformAdapter(ABC):
            return

        coerce_plaintext_gateway_command(event)
-        
+
+        # Rewrite ``event.source.thread_id`` via the installed recovery hook
+        # (Telegram DM topic mode) so the session key, guard checks, and
+        # downstream delivery all agree on the same lane.
+        self._apply_topic_recovery(event)
+
        session_key = build_session_key(
            event.source,
            group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True),
@ -3633,7 +3786,12 @@ class BasePlatformAdapter(ABC):
                # Strip any remaining internal directives from message body (fixes #1561)
                text_content = text_content.replace("[[audio_as_voice]]", "").strip()
                text_content = text_content.replace("[[as_document]]", "").strip()
-                text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
+                # Strip only MEDIA: tags whose path has a deliverable extension
+                # (shared MEDIA_TAG_CLEANUP_RE). A MEDIA: tag with an unknown
+                # extension is intentionally left in the body so extract_local_files
+                # below can still pick up the bare path — otherwise the file would
+                # be silently dropped (issue #34517).
+                text_content = MEDIA_TAG_CLEANUP_RE.sub("", text_content).strip()
                if images:
                    logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))

--- a/gateway/platforms/feishu.py
+++ b/gateway/platforms/feishu.py
@ -48,6 +48,7 @@ user is seen through different apps in the future.
 from __future__ import annotations

 import asyncio
+import collections
 import hashlib
 import hmac
 import itertools
@ -1408,6 +1409,8 @@ class FeishuAdapter(BasePlatformAdapter):
    """Feishu/Lark bot adapter."""

    MAX_MESSAGE_LENGTH = 8000
+    # Max distinct chat IDs retained in _chat_locks before LRU eviction kicks in.
+    CHAT_LOCK_MAX_SIZE: int = 1000
    # Threshold for detecting Feishu client-side message splits.
    # When a chunk is near the ~4096-char practical limit, a continuation
    # is almost certain.
@ -1445,7 +1448,7 @@ class FeishuAdapter(BasePlatformAdapter):
        self._pending_inbound_lock = threading.Lock()
        self._pending_drain_scheduled = False
        self._pending_inbound_max_depth = 1000  # cap queue; drop oldest beyond
-        self._chat_locks: Dict[str, asyncio.Lock] = {}  # chat_id → lock (per-chat serial processing)
+        self._chat_locks: "collections.OrderedDict[str, asyncio.Lock]" = collections.OrderedDict()  # chat_id → lock (per-chat serial processing, LRU-bounded)
        self._sent_message_ids_to_chat: Dict[str, str] = {}  # message_id → chat_id (for reaction routing)
        self._sent_message_id_order: List[str] = []  # LRU order for _sent_message_ids_to_chat
        self._chat_info_cache: Dict[str, Dict[str, Any]] = {}
@ -2835,11 +2838,28 @@ class FeishuAdapter(BasePlatformAdapter):
    # =========================================================================

    def _get_chat_lock(self, chat_id: str) -> asyncio.Lock:
-        """Return (creating if needed) the per-chat asyncio.Lock for serial message processing."""
+        """Return (creating if needed) the per-chat asyncio.Lock for serial message processing.
+
+        Bounded with LRU eviction so a long-running gateway that sees many
+        distinct chats does not grow ``_chat_locks`` without limit. The oldest
+        entry whose lock is not currently held is evicted first; only if every
+        entry is held do we drop the least-recently-used one regardless.
+        """
        lock = self._chat_locks.get(chat_id)
-        if lock is None:
-            lock = asyncio.Lock()
-            self._chat_locks[chat_id] = lock
+        if lock is not None:
+            self._chat_locks.move_to_end(chat_id)
+            return lock
+        if len(self._chat_locks) >= self.CHAT_LOCK_MAX_SIZE:
+            evicted = False
+            for key in list(self._chat_locks):
+                if not self._chat_locks[key].locked():
+                    self._chat_locks.pop(key)
+                    evicted = True
+                    break
+            # NOTE(review): this fallback removes a lock that is still held. A
+            # later message for that chat would get a fresh lock and could run
+            # concurrently with the in-flight one — confirm this is acceptable.
+            if not evicted:
+                self._chat_locks.pop(next(iter(self._chat_locks)))
+        lock = asyncio.Lock()
+        self._chat_locks[chat_id] = lock
        return lock

    async def _handle_message_with_guards(self, event: MessageEvent) -> None:
--- a/gateway/platforms/matrix.py
+++ b/gateway/platforms/matrix.py
@ -2236,7 +2236,8 @@ class MatrixAdapter(BasePlatformAdapter):
            if prompt and not prompt.resolved:
                if room_id != prompt.chat_id:
                    return
-                if self._allowed_user_ids and sender not in self._allowed_user_ids:
+                _allow_all = os.getenv("GATEWAY_ALLOW_ALL_USERS", "").lower() in {"true", "1", "yes"}
+                if not _allow_all and not (self._allowed_user_ids and sender in self._allowed_user_ids):
                    logger.info(
                        "Matrix: ignoring approval reaction from unauthorized user %s on %s",
                        sender, reacts_to,
--- a/gateway/platforms/qqbot/adapter.py
+++ b/gateway/platforms/qqbot/adapter.py
@ -126,7 +126,6 @@ from gateway.platforms.qqbot.chunked_upload import (
 )
 from gateway.platforms.qqbot.keyboards import (
    ApprovalRequest,
-    ApprovalSender,
    InlineKeyboard,
    InteractionEvent,
    build_approval_keyboard,
@ -270,6 +269,11 @@ class QQAdapter(BasePlatformAdapter):
    def name(self) -> str:
        return "QQBot"

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """QQBot gates DM/group access at intake via dm_policy/group_policy.
+
+        Overrides the ``BasePlatformAdapter`` default of ``False``.
+        """
+        return True
+
    # ------------------------------------------------------------------
    # Connection lifecycle
    # ------------------------------------------------------------------
--- a/gateway/platforms/qqbot/chunked_upload.py
+++ b/gateway/platforms/qqbot/chunked_upload.py
@ -37,7 +37,7 @@ import asyncio
 import functools
 import hashlib
 import logging
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Awaitable, Callable, Dict, List, Optional

--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@ -1690,7 +1690,6 @@ class TelegramAdapter(BasePlatformAdapter):
                    BotCommandScopeAllPrivateChats,
                    BotCommandScopeAllGroupChats,
                    BotCommandScopeDefault,
-                    BotCommandScopeChat,
                )
                from hermes_cli.commands import telegram_menu_commands
                # Telegram allows up to 100 commands but has an undocumented
@ -2805,21 +2804,8 @@ class TelegramAdapter(BasePlatformAdapter):
                return slug

        try:
-            # Build provider buttons — 2 per row
-            buttons: list = []
-            for p in providers:
-                count = p.get("total_models", len(p.get("models", [])))
-                label = f"{p['name']} ({count})"
-                if p.get("is_current"):
-                    label = f"✓ {label}"
-                # Compact callback data: mp:<slug>  (max 64 bytes)
-                buttons.append(
-                    InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}")
-                )
-
-            rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)]
-            rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")])
-            keyboard = InlineKeyboardMarkup(rows)
+            # Build provider buttons — folds provider groups (display only).
+            keyboard = self._build_provider_keyboard(providers)

            provider_label = get_label(current_provider)
            text = self.format_message(
@ -2866,6 +2852,56 @@ class TelegramAdapter(BasePlatformAdapter):

    _MODEL_PAGE_SIZE = 8

+    def _build_provider_keyboard(self, providers: list):
+        """Build the top-level provider keyboard, folding provider groups.
+
+        Provider families (Kimi/Moonshot, MiniMax, xAI Grok, ...) collapse to
+        a single ``mpg:<gid>`` button; tapping it drills into a member
+        sub-keyboard. Single providers (and groups with only one authenticated
+        member) render as direct ``mp:<slug>`` buttons. Grouping mirrors the
+        CLI ``hermes model`` picker via the shared ``group_providers`` fold,
+        so all surfaces stay consistent.
+
+        Args:
+            providers: Provider dicts. This method reads ``slug``, ``name``,
+                ``is_current`` and ``total_models`` (falling back to the
+                length of ``models``) from each entry.
+
+        Returns:
+            An ``InlineKeyboardMarkup`` with two buttons per row followed by
+            a single-button Cancel row (``mx``).
+        """
+        # Best-effort import: if the grouping helper cannot be imported the
+        # keyboard degrades to a flat, ungrouped provider list below.
+        try:
+            from hermes_cli.models import group_providers
+        except Exception:
+            group_providers = None
+
+        # Slug -> provider dict, used to resolve the slugs that the fold
+        # returns back to the full provider entries.
+        by_slug = {p.get("slug"): p for p in providers}
+
+        def _provider_button(p):
+            # Direct provider button: "<name> (<model count>)", check-marked
+            # when this provider is the current one.
+            count = p.get("total_models", len(p.get("models", [])))
+            label = f"{p['name']} ({count})"
+            if p.get("is_current"):
+                label = f"✓ {label}"
+            return InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}")
+
+        buttons: list = []
+        if group_providers is not None:
+            for row in group_providers([p.get("slug") for p in providers]):
+                if row["kind"] == "group":
+                    # Only members present in ``providers`` are counted.
+                    members = [by_slug[m] for m in row["members"] if m in by_slug]
+                    # Group count is the sum of the members' model counts.
+                    count = sum(
+                        m.get("total_models", len(m.get("models", []))) for m in members
+                    )
+                    label = f"{row['label']} ▸ ({count})"
+                    # Check-mark the group when any member is current.
+                    if any(m.get("is_current") for m in members):
+                        label = f"✓ {label}"
+                    buttons.append(
+                        InlineKeyboardButton(label, callback_data=f"mpg:{row['group_id']}")
+                    )
+                else:
+                    # Non-group row: rows whose slug is unknown are dropped.
+                    p = by_slug.get(row["slug"])
+                    if p is not None:
+                        buttons.append(_provider_button(p))
+        else:
+            for p in providers:
+                buttons.append(_provider_button(p))
+
+        # Lay the buttons out two per row, then append the Cancel row.
+        rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)]
+        rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")])
+        return InlineKeyboardMarkup(rows)
+
    def _build_model_keyboard(self, models: list, page: int) -> tuple:
        """Build paginated model buttons. Returns (keyboard, page_info_text)."""
        page_size = self._MODEL_PAGE_SIZE
@ -3044,10 +3080,23 @@ class TelegramAdapter(BasePlatformAdapter):
            # Clean up state
            self._model_picker_state.pop(chat_id, None)

-        elif data == "mb":
-            # --- Back to provider list ---
+        elif data.startswith("mpg:"):
+            # --- Provider group selected: show member providers ---
+            group_id = data[4:]
+            try:
+                from hermes_cli.models import PROVIDER_GROUPS
+                _label, member_slugs = PROVIDER_GROUPS.get(group_id, ("", []))
+            except Exception:
+                _label, member_slugs = "", []
+
+            by_slug = {p["slug"]: p for p in state["providers"]}
+            members = [by_slug[m] for m in member_slugs if m in by_slug]
+            if not members:
+                await query.answer(text="Group not found.")
+                return
+
            buttons = []
-            for p in state["providers"]:
+            for p in members:
                count = p.get("total_models", len(p.get("models", [])))
                label = f"{p['name']} ({count})"
                if p.get("is_current"):
@ -3055,11 +3104,30 @@ class TelegramAdapter(BasePlatformAdapter):
                buttons.append(
                    InlineKeyboardButton(label, callback_data=f"mp:{p['slug']}")
                )
-
            rows = [buttons[i : i + 2] for i in range(0, len(buttons), 2)]
-            rows.append([InlineKeyboardButton("✗ Cancel", callback_data="mx")])
+            rows.append([
+                InlineKeyboardButton("◀ Back", callback_data="mb"),
+                InlineKeyboardButton("✗ Cancel", callback_data="mx"),
+            ])
            keyboard = InlineKeyboardMarkup(rows)

+            await query.edit_message_text(
+                text=self.format_message(
+                    (
+                        f"⚙ *Model Configuration*\n\n"
+                        f"Provider family: *{_label or group_id}*\n\n"
+                        f"Select a provider:"
+                    )
+                ),
+                parse_mode=ParseMode.MARKDOWN_V2,
+                reply_markup=keyboard,
+            )
+            await query.answer()
+
+        elif data == "mb":
+            # --- Back to provider list (folds groups) ---
+            keyboard = self._build_provider_keyboard(state["providers"])
+
            try:
                provider_label = get_label(state["current_provider"])
            except Exception:
@ -3108,7 +3176,7 @@ class TelegramAdapter(BasePlatformAdapter):
        query_user_name = getattr(query.from_user, "first_name", None)

        # --- Model picker callbacks ---
-        if data.startswith(("mp:", "mm:", "mb", "mx", "mg:")):
+        if data.startswith(("mp:", "mpg:", "mm:", "mb", "mx", "mg:")):
            chat_id = str(query.message.chat_id) if query.message else None
            if chat_id:
                await self._handle_model_picker_callback(query, data, chat_id)
@ -5027,8 +5095,14 @@ class TelegramAdapter(BasePlatformAdapter):
    # ------------------------------------------------------------------

    def _text_batch_key(self, event: MessageEvent) -> str:
-        """Session-scoped key for text message batching."""
+        """Session-scoped key for text message batching.
+
+        Applies the installed topic-recovery hook first so DM-topic batches
+        coalesce on (and dispatch to) the recovered lane rather than the
+        raw inbound ``message_thread_id`` Telegram may have attached.
+        """
        from gateway.session import build_session_key
+        self._apply_topic_recovery(event)
        return build_session_key(
            event.source,
            group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True),
--- a/gateway/platforms/wecom.py
+++ b/gateway/platforms/wecom.py
@ -847,6 +847,11 @@ class WeComAdapter(BasePlatformAdapter):
    # Policy helpers
    # ------------------------------------------------------------------

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """WeCom gates DM/group access at intake via dm_policy/group_policy."""
+        # Constant for this adapter: always reports True.
+        # NOTE(review): presumably consulted by the gateway so it does not
+        # apply a second allowlist on top of the adapter's own policy
+        # checks — confirm against the base-class contract.
+        return True
+
    def _is_dm_allowed(self, sender_id: str) -> bool:
        if self._dm_policy == "disabled":
            return False
--- a/gateway/platforms/weixin.py
+++ b/gateway/platforms/weixin.py
@ -658,52 +658,6 @@ def _split_table_row(line: str) -> List[str]:
    return [cell.strip() for cell in row.split("|")]


-def _rewrite_headers_for_weixin(line: str) -> str:
-    match = _HEADER_RE.match(line)
-    if not match:
-        return line.rstrip()
-    level = len(match.group(1))
-    title = match.group(2).strip()
-    if level == 1:
-        return f"【{title}】"
-    return f"**{title}**"
-
-
-def _rewrite_table_block_for_weixin(lines: List[str]) -> str:
-    if len(lines) < 2:
-        return "\n".join(lines)
-    headers = _split_table_row(lines[0])
-    body_rows = [_split_table_row(line) for line in lines[2:] if line.strip()]
-    if not headers or not body_rows:
-        return "\n".join(lines)
-
-    formatted_rows: List[str] = []
-    for row in body_rows:
-        pairs = []
-        for idx, header in enumerate(headers):
-            if idx >= len(row):
-                break
-            label = header or f"Column {idx + 1}"
-            value = row[idx].strip()
-            if value:
-                pairs.append((label, value))
-        if not pairs:
-            continue
-        if len(pairs) == 1:
-            label, value = pairs[0]
-            formatted_rows.append(f"- {label}: {value}")
-            continue
-        if len(pairs) == 2:
-            label, value = pairs[0]
-            other_label, other_value = pairs[1]
-            formatted_rows.append(f"- {label}: {value}")
-            formatted_rows.append(f"  {other_label}: {other_value}")
-            continue
-        summary = " | ".join(f"{label}: {value}" for label, value in pairs)
-        formatted_rows.append(f"- {summary}")
-    return "\n".join(formatted_rows) if formatted_rows else "\n".join(lines)
-
-
 def _normalize_markdown_blocks(content: str) -> str:
    lines = content.splitlines()
    result: List[str] = []
@ -1226,12 +1180,48 @@ class WeixinAdapter(BasePlatformAdapter):
            default=False,
        )

+        # Text debounce batching (mirrors Telegram adapter pattern).
+        # iLink delivers messages individually, so rapid multi-message
+        # bursts (forwarded batches, paste-splits) each trigger a
+        # separate agent invocation.  Default 3s delay / 5s split delay
+        # are tuned for iLink's typical delivery cadence.  Tunable via
+        # config.yaml under
+        # ``gateway.platforms.weixin.extra.text_batch_delay_seconds`` /
+        # ``text_batch_split_delay_seconds``.
+        self._text_batch_delay_seconds = self._coerce_float_extra(
+            "text_batch_delay_seconds", 3.0
+        )
+        self._text_batch_split_delay_seconds = self._coerce_float_extra(
+            "text_batch_split_delay_seconds", 5.0
+        )
+        self._pending_text_batches: Dict[str, MessageEvent] = {}
+        self._pending_text_batch_tasks: Dict[str, asyncio.Task] = {}
+
        if self._account_id and not self._token:
            persisted = load_weixin_account(hermes_home, self._account_id)
            if persisted:
                self._token = str(persisted.get("token") or "").strip()
                self._base_url = str(persisted.get("base_url") or self._base_url).strip().rstrip("/")

+    def _coerce_float_extra(self, key: str, default: float) -> float:
+        """Look up ``key`` in ``config.extra`` and return it as a usable delay.
+
+        The value ends up in ``asyncio.sleep()``, so anything that is not a
+        finite, non-negative number is rejected in favour of ``default``.
+
+        Args:
+            key: Name of the entry to read from ``config.extra``.
+            default: Value returned (as ``float``) when the entry is missing,
+                ``None``, unparseable, NaN/Inf, or negative.
+
+        Returns:
+            The parsed float, or ``float(default)`` on any rejection.
+        """
+        import math
+
+        extra = getattr(self.config, "extra", None)
+        raw = self.config.extra.get(key) if extra else None
+
+        candidate = None
+        if raw is not None:
+            try:
+                candidate = float(raw)
+            except (TypeError, ValueError):
+                candidate = None
+
+        # Missing, unparseable, non-finite and negative all share one fallback.
+        if candidate is None or not math.isfinite(candidate) or candidate < 0:
+            return float(default)
+        return candidate
+
    @staticmethod
    def _coerce_list(value: Any) -> List[str]:
        if value is None:
@ -1293,6 +1283,11 @@ class WeixinAdapter(BasePlatformAdapter):
    async def disconnect(self) -> None:
        _LIVE_ADAPTERS.pop(self._token, None)
        self._running = False
+        for task in self._pending_text_batch_tasks.values():
+            if not task.done():
+                task.cancel()
+        self._pending_text_batches.clear()
+        self._pending_text_batch_tasks.clear()
        if self._poll_task and not self._poll_task.done():
            self._poll_task.cancel()
            try:
@ -1441,7 +1436,10 @@ class WeixinAdapter(BasePlatformAdapter):
            timestamp=datetime.now(),
        )
        logger.info("[%s] inbound from=%s type=%s media=%d", self.name, _safe_id(sender_id), source.chat_type, len(media_paths))
-        await self.handle_message(event)
+        if event.message_type == MessageType.TEXT:
+            self._enqueue_text_event(event)
+        else:
+            await self.handle_message(event)

    def _is_dm_allowed(self, sender_id: str) -> bool:
        if self._dm_policy == "disabled":
@ -1450,6 +1448,76 @@ class WeixinAdapter(BasePlatformAdapter):
            return sender_id in self._allow_from
        return True

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Weixin gates DM/group access at intake via dm_policy/group_policy."""
+        # Constant for this adapter: always reports True.
+        # NOTE(review): presumably consulted by the gateway so it does not
+        # apply a second allowlist on top of the adapter's own policy
+        # checks — confirm against the base-class contract.
+        return True
+
+    # ------------------------------------------------------------------
+    # Text debounce batching
+    # ------------------------------------------------------------------
+
+    _SPLIT_THRESHOLD = 1800  # iLink chunks at ~2048 chars
+
+    def _text_batch_key(self, event: MessageEvent) -> str:
+        """Return the session key under which ``event``'s text is batched.
+
+        The key comes from ``build_session_key`` using the event source and
+        the ``group_sessions_per_user`` (default ``True``) and
+        ``thread_sessions_per_user`` (default ``False``) entries of
+        ``config.extra``.
+        """
+        from gateway.session import build_session_key
+
+        source = event.source
+        extra = self.config.extra
+        per_user_groups = extra.get("group_sessions_per_user", True)
+        per_user_threads = extra.get("thread_sessions_per_user", False)
+        return build_session_key(
+            source,
+            group_sessions_per_user=per_user_groups,
+            thread_sessions_per_user=per_user_threads,
+        )
+
+    def _enqueue_text_event(self, event: MessageEvent) -> None:
+        """Buffer a text event and reset the flush timer.
+
+        When users forward multiple messages or send rapid-fire texts
+        via WeChat, each arrives as a separate iLink message. This
+        concatenates them and waits for a short quiet period before
+        dispatching the combined message.
+
+        Args:
+            event: The inbound text event. The first event for a batch key
+                becomes the batch itself and is mutated in place as later
+                events for the same key arrive.
+        """
+        key = self._text_batch_key(event)
+        existing = self._pending_text_batches.get(key)
+        chunk_len = len(event.text or "")
+        if existing is None:
+            # First chunk for this key: the event itself becomes the buffer.
+            # ``_last_chunk_len`` is read by ``_flush_text_batch`` to choose
+            # between the normal and the split delay.
+            event._last_chunk_len = chunk_len  # type: ignore[attr-defined]
+            self._pending_text_batches[key] = event
+        else:
+            # Later chunk: join text with a newline and merge any media.
+            if event.text:
+                existing.text = f"{existing.text}\n{event.text}" if existing.text else event.text
+            existing._last_chunk_len = chunk_len  # type: ignore[attr-defined]
+            if event.media_urls:
+                existing.media_urls.extend(event.media_urls)
+                existing.media_types.extend(event.media_types)
+
+        # Restart the quiet-period timer: cancel the pending flush (if still
+        # running) and schedule a fresh one for this key.
+        prior_task = self._pending_text_batch_tasks.get(key)
+        if prior_task and not prior_task.done():
+            prior_task.cancel()
+        self._pending_text_batch_tasks[key] = asyncio.create_task(
+            self._flush_text_batch(key)
+        )
+
+    async def _flush_text_batch(self, key: str) -> None:
+        """Wait for quiet period then dispatch aggregated text.
+
+        Args:
+            key: Batch key (as produced by ``_text_batch_key``) whose
+                buffered event should be dispatched after the delay.
+        """
+        current_task = asyncio.current_task()
+        try:
+            # Pick the delay from the length of the most recent chunk: a
+            # chunk at or above ``_SPLIT_THRESHOLD`` gets the longer split
+            # delay, anything shorter gets the normal delay.
+            pending = self._pending_text_batches.get(key)
+            last_len = getattr(pending, "_last_chunk_len", 0) if pending else 0
+            if last_len >= self._SPLIT_THRESHOLD:
+                delay = self._text_batch_split_delay_seconds
+            else:
+                delay = self._text_batch_delay_seconds
+            await asyncio.sleep(delay)
+            # If another task is registered for this key by now, this flush
+            # has been superseded; leave the batch for the newer task.
+            if self._pending_text_batch_tasks.get(key) is not current_task:
+                return
+            event = self._pending_text_batches.pop(key, None)
+            if not event:
+                return
+            await self.handle_message(event)
+        finally:
+            # Only remove the task-table entry if it still refers to this
+            # task, so a newer task's registration is never dropped.
+            if self._pending_text_batch_tasks.get(key) is current_task:
+                self._pending_text_batch_tasks.pop(key, None)
+
    async def _collect_media(self, item: Dict[str, Any], media_paths: List[str], media_types: List[str]) -> None:
        item_type = item.get("type")
        if item_type == ITEM_IMAGE:
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@ -278,6 +278,43 @@ class WhatsAppAdapter(BasePlatformAdapter):
        # notification before the normal "✓ whatsapp disconnected" fires.
        self._shutting_down: bool = False

+        # Text debounce batching (mirrors Telegram adapter pattern).
+        # WhatsApp often delivers multiple messages in rapid succession
+        # (e.g. forwarded batches, paste-splits) — without debounce each
+        # message triggers a separate agent invocation, wasting tokens and
+        # flooding the user with reply fragments.  Default 5s delay /
+        # 10s split delay are conservative for WhatsApp's delivery cadence.
+        # Tunable via config.yaml under
+        # ``gateway.platforms.whatsapp.extra.text_batch_delay_seconds`` /
+        # ``text_batch_split_delay_seconds``.
+        self._text_batch_delay_seconds = self._coerce_float_extra(
+            "text_batch_delay_seconds", 5.0
+        )
+        self._text_batch_split_delay_seconds = self._coerce_float_extra(
+            "text_batch_split_delay_seconds", 10.0
+        )
+        self._pending_text_batches: Dict[str, MessageEvent] = {}
+        self._pending_text_batch_tasks: Dict[str, asyncio.Task] = {}
+
+    def _coerce_float_extra(self, key: str, default: float) -> float:
+        """Look up ``key`` in ``config.extra`` and return it as a usable delay.
+
+        The value ends up in ``asyncio.sleep()``, so anything that is not a
+        finite, non-negative number is rejected in favour of ``default``.
+
+        Args:
+            key: Name of the entry to read from ``config.extra``.
+            default: Value returned (as ``float``) when the entry is missing,
+                ``None``, unparseable, NaN/Inf, or negative.
+
+        Returns:
+            The parsed float, or ``float(default)`` on any rejection.
+        """
+        import math
+
+        extra = getattr(self.config, "extra", None)
+        raw = self.config.extra.get(key) if extra else None
+
+        candidate = None
+        if raw is not None:
+            try:
+                candidate = float(raw)
+            except (TypeError, ValueError):
+                candidate = None
+
+        # Missing, unparseable, non-finite and negative all share one fallback.
+        if candidate is None or not math.isfinite(candidate) or candidate < 0:
+            return float(default)
+        return candidate
+
    def _effective_reply_prefix(self) -> str:
        """Return the prefix the Node bridge will add in self-chat mode."""
        whatsapp_mode = os.getenv("WHATSAPP_MODE", "self-chat")
@ -1139,7 +1176,10 @@ class WhatsAppAdapter(BasePlatformAdapter):
                        for msg_data in messages:
                            event = await self._build_message_event(msg_data)
                            if event:
-                                await self.handle_message(event)
+                                if event.message_type == MessageType.TEXT:
+                                    self._enqueue_text_event(event)
+                                else:
+                                    await self.handle_message(event)
            except asyncio.CancelledError:
                break
            except Exception as e:
@ -1151,7 +1191,67 @@ class WhatsAppAdapter(BasePlatformAdapter):
                await asyncio.sleep(5)
            
            await asyncio.sleep(1)  # Poll interval
-    
+
+    # ── Text debounce batching ──────────────────────────────────────
+
+    _SPLIT_THRESHOLD = 6000  # WhatsApp supports ~65K chars; generous threshold
+
+    def _text_batch_key(self, event: MessageEvent) -> str:
+        """Return the session key under which ``event``'s text is batched.
+
+        The key comes from ``build_session_key`` using the event source and
+        the ``group_sessions_per_user`` (default ``True``) and
+        ``thread_sessions_per_user`` (default ``False``) entries of
+        ``config.extra``.
+        """
+        from gateway.session import build_session_key
+
+        source = event.source
+        extra = self.config.extra
+        per_user_groups = extra.get("group_sessions_per_user", True)
+        per_user_threads = extra.get("thread_sessions_per_user", False)
+        return build_session_key(
+            source,
+            group_sessions_per_user=per_user_groups,
+            thread_sessions_per_user=per_user_threads,
+        )
+
+    def _enqueue_text_event(self, event: MessageEvent) -> None:
+        """Buffer a text event and reset the flush timer.
+
+        When WhatsApp delivers rapid-fire messages (e.g. forwarded
+        batches), this concatenates them and waits for a short quiet
+        period before dispatching the combined message.
+
+        Args:
+            event: The inbound text event. The first event for a batch key
+                becomes the batch itself and is mutated in place as later
+                events for the same key arrive.
+        """
+        key = self._text_batch_key(event)
+        existing = self._pending_text_batches.get(key)
+        chunk_len = len(event.text or "")
+        if existing is None:
+            # First chunk for this key: the event itself becomes the buffer.
+            # ``_last_chunk_len`` is read by ``_flush_text_batch`` to choose
+            # between the normal and the split delay.
+            event._last_chunk_len = chunk_len  # type: ignore[attr-defined]
+            self._pending_text_batches[key] = event
+        else:
+            # Later chunk: join text with a newline and merge any media.
+            if event.text:
+                existing.text = f"{existing.text}\n{event.text}" if existing.text else event.text
+            existing._last_chunk_len = chunk_len  # type: ignore[attr-defined]
+            if event.media_urls:
+                existing.media_urls.extend(event.media_urls)
+                existing.media_types.extend(event.media_types)
+
+        # Restart the quiet-period timer: cancel the pending flush (if still
+        # running) and schedule a fresh one for this key.
+        prior_task = self._pending_text_batch_tasks.get(key)
+        if prior_task and not prior_task.done():
+            prior_task.cancel()
+        self._pending_text_batch_tasks[key] = asyncio.create_task(
+            self._flush_text_batch(key)
+        )
+
+    async def _flush_text_batch(self, key: str) -> None:
+        """Wait for quiet period then dispatch aggregated text.
+
+        After the delay the buffered event for ``key`` is popped and handed
+        to ``handle_message``. If a *different* flush task has been
+        registered for ``key`` in the meantime, this call returns without
+        touching the buffer so the newer timer dispatches the batch.
+
+        Args:
+            key: Batch key (as produced by ``_text_batch_key``) whose
+                buffered event should be dispatched after the delay.
+        """
+        current_task = asyncio.current_task()
+        try:
+            # Pick the delay from the length of the most recent chunk: a
+            # chunk at or above ``_SPLIT_THRESHOLD`` gets the longer split
+            # delay, anything shorter gets the normal delay.
+            pending = self._pending_text_batches.get(key)
+            last_len = getattr(pending, "_last_chunk_len", 0) if pending else 0
+            if last_len >= self._SPLIT_THRESHOLD:
+                delay = self._text_batch_split_delay_seconds
+            else:
+                delay = self._text_batch_delay_seconds
+            await asyncio.sleep(delay)
+            # Ownership guard: a newer flush registered for this key owns the
+            # batch now. A missing registration is tolerated so the coroutine
+            # can still be awaited directly without going through a task.
+            owner = self._pending_text_batch_tasks.get(key)
+            if owner is not None and owner is not current_task:
+                return
+            event = self._pending_text_batches.pop(key, None)
+            if not event:
+                return
+            await self.handle_message(event)
+        finally:
+            # Only remove the task-table entry if it still refers to this
+            # task, so a newer task's registration is never dropped.
+            if self._pending_text_batch_tasks.get(key) is current_task:
+                self._pending_text_batch_tasks.pop(key, None)
+
    async def _build_message_event(self, data: Dict[str, Any]) -> Optional[MessageEvent]:
        """Build a MessageEvent from bridge message data, downloading images to cache."""
        try:
--- a/gateway/platforms/yuanbao.py
+++ b/gateway/platforms/yuanbao.py
@ -2230,6 +2230,45 @@ class MediaResolveMiddleware(InboundMiddleware):

    name = "media-resolve"

+    # --- Resource download cache (keyed by resourceId) ---
+    # Avoids redundant downloads of the same resource within the TTL window.
+    # The same resourceId can be referenced multiple times in a session (own
+    # attachment, then quoted again, then observed in a group backfill); each
+    # reference otherwise triggers a fresh token exchange + download.
+    _resource_cache: ClassVar[Dict[str, Tuple[str, str, float]]] = {}  # rid -> (local_path, mime, ts)
+    _RESOURCE_CACHE_TTL_S: ClassVar[int] = 24 * 60 * 60  # 24 hours
+    _RESOURCE_CACHE_MAX_SIZE: ClassVar[int] = 256
+
+    @classmethod
+    def _get_cached_resource(cls, resource_id: str) -> Optional[Tuple[str, str]]:
+        """Look up a previously downloaded resource by its resourceId.
+
+        Returns ``(local_path, mime)`` when an entry exists, is no older than
+        ``_RESOURCE_CACHE_TTL_S`` seconds and its file is still on disk;
+        otherwise returns ``None``. Expired entries and entries whose file
+        has disappeared are removed from the cache as a side effect.
+        """
+        cached = cls._resource_cache.get(resource_id) if resource_id else None
+        if cached is None:
+            return None
+
+        path, mime_type, stored_at = cached
+        expired = time.time() - stored_at > cls._RESOURCE_CACHE_TTL_S
+        # The on-disk check only runs for entries that are still fresh; the
+        # cache directory may have been swept since the entry was stored.
+        if expired or not os.path.isfile(path):
+            cls._resource_cache.pop(resource_id, None)
+            return None
+        return path, mime_type
+
+    @classmethod
+    def _put_cached_resource(cls, resource_id: str, local_path: str, mime: str) -> None:
+        """Record a finished download under ``resource_id``.
+
+        An empty ``resource_id`` is ignored. When the cache already holds
+        ``_RESOURCE_CACHE_MAX_SIZE`` or more entries, the oldest quarter of
+        that capacity (by stored timestamp) is evicted before inserting.
+        """
+        if not resource_id:
+            return
+
+        cache = cls._resource_cache
+        capacity = cls._RESOURCE_CACHE_MAX_SIZE
+        if len(cache) >= capacity:
+            # Oldest first, ordered by the timestamp slot of each entry.
+            oldest_first = sorted(cache.items(), key=lambda item: item[1][2])
+            for stale_key, _entry in oldest_first[: capacity // 4]:
+                cache.pop(stale_key, None)
+        cache[resource_id] = (local_path, mime, time.time())
+
    @staticmethod
    def _guess_image_ext_from_url(url: str) -> str:
        """Guess image extension from URL path."""
@ -2327,8 +2366,23 @@ class MediaResolveMiddleware(InboundMiddleware):
    async def _download_and_cache(
        cls, adapter, *, fetch_url: str, kind: str,
        file_name: Optional[str] = None, log_tag: str = "",
+        resource_id: str = "",
    ) -> Optional[Tuple[str, str]]:
-        """Download a Yuanbao resource and cache locally. Returns ``(local_path, mime)`` or ``None``."""
+        """Download a Yuanbao resource and cache locally. Returns ``(local_path, mime)`` or ``None``.
+
+        When *resource_id* is provided, an in-memory cache keyed by resourceId
+        is consulted first to skip redundant downloads of the same resource
+        within the TTL window.
+        """
+        if resource_id:
+            hit = cls._get_cached_resource(resource_id)
+            if hit is not None:
+                logger.debug(
+                    "[%s] resource cache hit: rid=%s path=%s",
+                    adapter.name, resource_id, hit[0],
+                )
+                return hit
+
        try:
            file_bytes, content_type = await media_download_url(
                fetch_url, max_size_mb=adapter.MEDIA_MAX_SIZE_MB,
@ -2353,6 +2407,7 @@ class MediaResolveMiddleware(InboundMiddleware):
            mime = guess_mime_type(f"image{ext}")
            if not mime.startswith("image/"):
                mime = content_type if content_type.startswith("image/") else "image/jpeg"
+            cls._put_cached_resource(resource_id, local_path, mime)
            return local_path, mime

        # kind == "file"
@ -2368,6 +2423,7 @@ class MediaResolveMiddleware(InboundMiddleware):
            )
            return None
        mime = guess_mime_type(file_name) or content_type or "application/octet-stream"
+        cls._put_cached_resource(resource_id, local_path, mime)
        return local_path, mime

    @classmethod
@ -2393,6 +2449,9 @@ class MediaResolveMiddleware(InboundMiddleware):
            if kind not in _RESOLVABLE_MEDIA_KINDS or not url:
                continue

+            # Extract resourceId from the placeholder URL for cache dedup.
+            rid = ExtractContentMiddleware._parse_resource_id(url)
+
            try:
                fetch_url = await cls._resolve_download_url(adapter, url)
            except Exception as exc:
@ -2408,6 +2467,7 @@ class MediaResolveMiddleware(InboundMiddleware):
                kind=kind,
                file_name=str(ref.get("name") or "").strip() or None,
                log_tag=f"placeholder_url={url[:80]}",
+                resource_id=rid,
            )
            if cached is None:
                continue
@ -2480,6 +2540,7 @@ class MediaResolveMiddleware(InboundMiddleware):
                kind=kind,
                file_name=filename or None,
                log_tag=f"rid={rid}",
+                resource_id=rid,
            )
            if cached is None:
                continue
@ -2563,6 +2624,7 @@ class DispatchMiddleware(InboundMiddleware):
                        kind=kind,
                        file_name=filename or None,
                        log_tag=f"quote rid={rid}",
+                        resource_id=rid,
                    )
                    if cached is None:
                        continue
@ -4629,6 +4691,11 @@ class YuanbaoAdapter(BasePlatformAdapter):
    # Abstract method implementations
    # ------------------------------------------------------------------

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Yuanbao gates DM/group access at intake via dm_policy/group_policy."""
+        # Constant for this adapter: always reports True.
+        # NOTE(review): presumably consulted by the gateway so it does not
+        # apply a second allowlist on top of the adapter's own policy
+        # checks — confirm against the base-class contract.
+        return True
+
    async def connect(self) -> bool:
        """Connect to Yuanbao WS gateway and authenticate.

--- a/gateway/run.py
+++ b/gateway/run.py
@ -751,7 +751,7 @@ _hermes_home = get_hermes_home()

 # Load environment variables from ~/.hermes/.env first.
 # User-managed env files should override stale shell exports on restart.
-from dotenv import load_dotenv  # backward-compat for tests that monkeypatch this symbol
+from dotenv import load_dotenv  # noqa: F401  # backward-compat for tests that monkeypatch this symbol
 from hermes_cli.env_loader import load_hermes_dotenv
 _env_path = _hermes_home / '.env'
 load_hermes_dotenv(hermes_home=_hermes_home, project_env=Path(__file__).resolve().parents[1] / '.env')
@ -831,6 +831,8 @@ if _config_path.exists():
                "docker_env": "TERMINAL_DOCKER_ENV",
                "docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
                "docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
+                "docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
+                "docker_orphan_reaper": "TERMINAL_DOCKER_ORPHAN_REAPER",
                "sandbox_dir": "TERMINAL_SANDBOX_DIR",
                "persistent_shell": "TERMINAL_PERSISTENT_SHELL",
            }
@ -1728,6 +1730,14 @@ class GatewayRunner:
        self._running_agents: Dict[str, Any] = {}
        self._running_agents_ts: Dict[str, float] = {}  # start timestamp per session
        self._pending_messages: Dict[str, str] = {}  # Queued messages during interrupt
+        # Last successfully-resolved (non-empty) model, keyed by session. Used
+        # as a fallback when a fresh config read transiently returns an empty
+        # model (e.g. an mtime-keyed config-cache miss during a post-interrupt
+        # recovery turn). Without this, the agent is built with model="" and
+        # every API call fails HTTP 400 "No models provided" — the session goes
+        # silent until the user manually re-sends. See #35314. ``"*"`` holds a
+        # process-wide last-known-good for sessions seen for the first time.
+        self._last_resolved_model: Dict[str, str] = {}
        # Overflow buffer for explicit /queue commands.  The adapter-level
        # _pending_messages dict is a single slot per session (designed for
        # "next-turn" follow-ups where repeated sends collapse into one
@ -1805,7 +1815,34 @@ class GatewayRunner:
            ensure_installed(log_failures=False)
        except Exception:
            pass  # Non-fatal — fail-open at scan time if unavailable
-        
+
+        # Startup heads-up (#30882): a gateway in manual approval mode with no
+        # automated risk assessor (tirith disabled AND no auxiliary.approval
+        # model) can only gate dangerous commands / execute_code scripts via
+        # live in-chat approval. With approval routing fixed, those actions now
+        # fail closed (block) rather than silently auto-running — surface that
+        # so operators knowingly enable tirith or configure auxiliary.approval
+        # for unattended gateways.
+        try:
+            from hermes_cli.config import load_config as _load_full_config
+            _appr_cfg = _load_full_config()
+            _appr_mode = str(
+                cfg_get(_appr_cfg, "approvals", "mode", default="manual") or "manual"
+            ).strip().lower()
+            _tirith_on = bool(cfg_get(_appr_cfg, "security", "tirith_enabled", default=True))
+            _aux_approval = cfg_get(_appr_cfg, "auxiliary", "approval", default=None)
+            if _appr_mode == "manual" and not _tirith_on and not _aux_approval:
+                logger.warning(
+                    "Gateway approvals.mode=manual with no automated risk "
+                    "assessor (security.tirith_enabled is false and "
+                    "auxiliary.approval is unset): dangerous commands and "
+                    "execute_code scripts will BLOCK until a human approves "
+                    "them in chat. Enable security.tirith_enabled or configure "
+                    "auxiliary.approval for unattended operation."
+                )
+        except Exception:
+            logger.debug("approvals.mode startup check skipped", exc_info=True)
+
        # Initialize session database for session_search tool support
        self._session_db = None
        try:
@ -2301,6 +2338,32 @@ class GatewayRunner:
            session_id=session_entry.session_id,
        )

+    def _sync_telegram_topic_binding(
+        self,
+        source: SessionSource,
+        session_entry,
+        *,
+        reason: str,  # caller-supplied label; only used in the debug log below
+    ) -> None:
+        """Update the topic binding to point at ``session_entry.session_id``.
+
+        Telegram topic lanes persist a (chat_id, thread_id) -> session_id row
+        so reopening a topic in a fresh process resumes the right Hermes
+        session. When compression rotates ``session_entry.session_id`` mid-turn,
+        the binding goes stale and the next inbound message in that topic
+        reloads the oversized parent transcript instead of the compressed
+        child, retriggering preflight compression — sometimes in a loop
+        (#20470, #29712, #33414).
+        """
+        if not self._is_telegram_topic_lane(source):
+            return  # not a Telegram topic lane: there is no binding to refresh
+        try:
+            self._record_telegram_topic_binding(source, session_entry)
+        except Exception:  # best-effort: a failed refresh is logged at debug level and swallowed
+            logger.debug(
+                "telegram topic binding refresh failed (%s)", reason, exc_info=True,
+            )
+
    def _recover_telegram_topic_thread_id(
        self,
        source: SessionSource,
@ -2433,6 +2496,32 @@ class GatewayRunner:
            except Exception:
                pass

+        # Final safety net (#35314): if resolution still produced an empty
+        # model — e.g. a transient config-cache miss during a post-interrupt
+        # recovery turn returned an empty user_config — reuse the last model we
+        # successfully resolved for this session (or, failing that, the most
+        # recent one resolved process-wide). Building an agent with model=""
+        # makes every API call fail HTTP 400 "No models provided" and the
+        # session goes silent until the user manually re-sends. ``getattr``
+        # guards against bare test runners built via ``object.__new__``.
+        _last_good = getattr(self, "_last_resolved_model", None)
+        if _last_good is not None:
+            if not model:
+                _recovered = _last_good.get(resolved_session_key or "") or _last_good.get("*")
+                if _recovered:
+                    logger.warning(
+                        "Empty model resolved for session=%s — recovering "
+                        "last-known-good model %s (config read likely returned "
+                        "empty; see #35314)",
+                        resolved_session_key or "", _recovered,
+                    )
+                    model = _recovered
+            elif model:
+                # Cache the good resolution for future recovery turns.
+                if resolved_session_key:
+                    _last_good[resolved_session_key] = model
+                _last_good["*"] = model
+
        return model, runtime_kwargs

    def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
@ -2729,10 +2818,12 @@ class GatewayRunner:
        """Mark a queued platform as paused — keep it in ``_failed_platforms``
        but stop the reconnect watcher from hammering it.

-        Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
-        retryable failures, and by ``/platform pause <name>`` for manual
-        intervention.  Paused platforms are surfaced in ``/platform list``
-        and resumed with ``/platform resume <name>``.
+        Used by ``/platform pause <name>`` for manual operator intervention.
+        Paused platforms are surfaced in ``/platform list`` and resumed with
+        ``/platform resume <name>``.  Note: the reconnect watcher does NOT
+        auto-pause — retryable (network/DNS) failures keep retrying at the
+        backoff cap indefinitely so a transient outage self-heals without
+        manual intervention.
        """
        info = getattr(self, "_failed_platforms", {}).get(platform)
        if info is None:
@ -4157,6 +4248,7 @@ class GatewayRunner:
            adapter.set_fatal_error_handler(self._handle_adapter_fatal_error)
            adapter.set_session_store(self.session_store)
            adapter.set_busy_session_handler(self._handle_active_session_busy_message)
+            adapter.set_topic_recovery_fn(self._recover_telegram_topic_thread_id)
            adapter._busy_text_mode = self._busy_text_mode
            
            # Try to connect
@ -5418,6 +5510,49 @@ class GatewayRunner:
            )
            stale_timeout_seconds = 0

+        # Read kanban.default_assignee — fallback profile for tasks
+        # created without an explicit assignee (e.g. via the dashboard).
+        # When set, the dispatcher applies it to unassigned ready tasks
+        # instead of skipping them indefinitely (#27145). Empty string
+        # (the schema default) means "no fallback, keep skipping" —
+        # backward-compatible with existing installs.
+        default_assignee = (kanban_cfg.get("default_assignee") or "").strip() or None
+        if default_assignee:
+            logger.info(
+                "kanban dispatcher: default_assignee=%r (unassigned ready tasks "
+                "will route to this profile)",
+                default_assignee,
+            )
+
+        # Read kanban.max_in_progress_per_profile — per-profile concurrency
+        # cap (#21582). When set, no single profile gets more than N
+        # workers running at once, even if the global max_in_progress
+        # would allow it. Prevents one profile's local model / API quota
+        # / browser pool from being overwhelmed by a fan-out.
+        raw_per_profile = kanban_cfg.get("max_in_progress_per_profile", None)
+        max_in_progress_per_profile = None
+        if raw_per_profile is not None:
+            try:
+                max_in_progress_per_profile = int(raw_per_profile)
+            except (TypeError, ValueError):
+                logger.warning(
+                    "kanban dispatcher: invalid kanban.max_in_progress_per_profile=%r; ignoring",
+                    raw_per_profile,
+                )
+                max_in_progress_per_profile = None
+            else:
+                if max_in_progress_per_profile < 1:
+                    logger.warning(
+                        "kanban dispatcher: kanban.max_in_progress_per_profile=%r is below 1; ignoring",
+                        raw_per_profile,
+                    )
+                    max_in_progress_per_profile = None
+                else:
+                    logger.info(
+                        "kanban dispatcher: max_in_progress_per_profile=%d",
+                        max_in_progress_per_profile,
+                    )
+
        # Initial delay so the gateway finishes wiring adapters before the
        # dispatcher spawns workers (those workers may hit gateway notify
        # subscriptions etc.). Matches the notifier watcher's delay.
@ -5509,6 +5644,8 @@ class GatewayRunner:
                    max_in_progress=max_in_progress,
                    failure_limit=failure_limit,
                    stale_timeout_seconds=stale_timeout_seconds,
+                    default_assignee=default_assignee,
+                    max_in_progress_per_profile=max_in_progress_per_profile,
                )
            except sqlite3.DatabaseError as exc:
                if _is_corrupt_board_db_error(exc):
@ -5764,15 +5901,17 @@ class GatewayRunner:
        """Background task that periodically retries connecting failed platforms.

        Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
-        Retryable failures keep retrying at the backoff cap indefinitely
-        — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
-        without ever succeeding, it is *paused*: kept in the retry queue
-        but no longer hammered.  The user surfaces it with ``/platform list``
-        and resumes it with ``/platform resume <name>``.  Non-retryable
-        failures (bad auth, etc.) still drop out of the queue immediately.
+        Retryable failures (network/DNS blips) keep retrying at the backoff
+        cap indefinitely — they self-heal once connectivity returns, so a
+        transient outage never requires manual intervention. Non-retryable
+        failures (bad auth, etc.) drop out of the queue immediately. The
+        circuit breaker (``_pause_failed_platform`` / ``/platform pause``)
+        remains available for manual operator control via ``/platform list``
+        and ``/platform resume <name>``, but is no longer triggered
+        automatically — auto-pausing a recovered platform was the cause of
+        bots silently staying dead after a transient DNS failure.
        """
        _BACKOFF_CAP = 300  # 5 minutes max between retries
-        _PAUSE_AFTER_FAILURES = 10  # circuit-breaker threshold

        await asyncio.sleep(10)  # initial delay — let startup finish
        while self._running:
@ -5817,6 +5956,7 @@ class GatewayRunner:
                    adapter.set_fatal_error_handler(self._handle_adapter_fatal_error)
                    adapter.set_session_store(self.session_store)
                    adapter.set_busy_session_handler(self._handle_active_session_busy_message)
+                    adapter.set_topic_recovery_fn(self._recover_telegram_topic_thread_id)
                    adapter._busy_text_mode = self._busy_text_mode

                    success = await self._connect_adapter_with_timeout(adapter, platform)
@ -5866,14 +6006,14 @@ class GatewayRunner:
                            "Reconnect %s failed, next retry in %ds",
                            platform.value, backoff,
                        )
-                        if attempt >= _PAUSE_AFTER_FAILURES:
-                            self._pause_failed_platform(
-                                platform,
-                                reason=(
-                                    adapter.fatal_error_message
-                                    or "failed to reconnect"
-                                ),
-                            )
+                        # Retryable failures (network/DNS blips) keep retrying
+                        # at the backoff cap indefinitely — they self-heal once
+                        # connectivity returns. We do NOT auto-pause them: a
+                        # transient outage must never require manual `/platform
+                        # resume` to recover. Non-retryable failures (bad auth,
+                        # etc.) already drop out of the queue via the
+                        # `not fatal_error_retryable` branch above, so anything
+                        # reaching here is by definition retryable.
                except Exception as e:
                    self._update_platform_runtime_status(
                        platform.value,
@ -5888,8 +6028,9 @@ class GatewayRunner:
                        "Reconnect %s error: %s, next retry in %ds",
                        platform.value, e, backoff,
                    )
-                    if attempt >= _PAUSE_AFTER_FAILURES:
-                        self._pause_failed_platform(platform, reason=str(e))
+                    # A raised exception during reconnect (connect timeout, DNS
+                    # resolution failure, etc.) is inherently transient — keep
+                    # retrying at the backoff cap rather than auto-pausing.

            # Check every 10 seconds for platforms that need reconnection
            for _ in range(10):
@ -6440,6 +6581,31 @@ class GatewayRunner:
            return YuanbaoAdapter(config)

        return None
+
+    def _adapter_enforces_own_access_policy(self, platform: Optional[Platform]) -> bool:
+        """Whether the adapter for *platform* gates access at intake itself.
+
+        Mirrors ``BasePlatformAdapter.enforces_own_access_policy``. Adapters
+        such as WeCom, Weixin, Yuanbao, and QQBot evaluate their documented
+        ``dm_policy`` / ``group_policy`` / ``allow_from`` config before a
+        message is dispatched to the gateway, so a message that reaches
+        ``_is_user_authorized`` has already been authorized by the adapter.
+        Defaults to ``False`` when the adapter is unknown or doesn't expose
+        the flag.
+        """
+        if not platform:  # None (or otherwise falsy) platform: nothing to look up
+            return False
+        # Some test helpers build a bare GatewayRunner via object.__new__ and
+        # never set ``adapters``; treat a missing/empty map as "no adapter"
+        # rather than raising (see pitfalls.md #17).
+        adapters = getattr(self, "adapters", None)
+        if not adapters:
+            return False
+        adapter = adapters.get(platform)
+        if adapter is None:
+            return False
+        return bool(getattr(adapter, "enforces_own_access_policy", False))  # flag absent on the adapter -> False
+
    def _is_user_authorized(self, source: SessionSource) -> bool:
        """
        Check if a user is authorized to use the bot.
@ -6579,6 +6745,15 @@ class GatewayRunner:
        global_allowlist = os.getenv("GATEWAY_ALLOWED_USERS", "").strip()

        if not platform_allowlist and not group_user_allowlist and not group_chat_allowlist and not global_allowlist:
+            # No env allowlists configured. Adapters that own their own
+            # config-driven access policy (dm_policy / group_policy /
+            # allow_from / group_allow_from) already gated this message at
+            # intake — it would not have reached the gateway otherwise — so
+            # honor that decision instead of falling through to the
+            # env-only default-deny below, which would silently break
+            # `dm_policy: open` and config-only allowlists. (#34515)
+            if self._adapter_enforces_own_access_policy(source.platform):
+                return True
            # No allowlists configured -- check global allow-all flag
            return os.getenv("GATEWAY_ALLOW_ALL_USERS", "").lower() in {"true", "1", "yes"}

@ -6686,6 +6861,20 @@ class GatewayRunner:
            if config.unauthorized_dm_behavior != "pair":  # non-default → explicit override
                return config.unauthorized_dm_behavior

+        # Config-driven dm_policy (WeCom / Weixin / Yuanbao / QQBot). An
+        # allowlist or disabled DM policy means the operator restricted access,
+        # so unauthorized DMs should be dropped silently rather than answered
+        # with a pairing code. An explicit pairing policy opts back into codes.
+        if platform and config and hasattr(config, "platforms"):
+            platform_cfg = config.platforms.get(platform)
+            extra = getattr(platform_cfg, "extra", None) if platform_cfg else None
+            if isinstance(extra, dict):
+                dm_policy = str(extra.get("dm_policy") or "").strip().lower()
+                if dm_policy == "pairing":
+                    return "pair"
+                if dm_policy in {"allowlist", "disabled"}:
+                    return "ignore"
+
        # No explicit override.  Fall back to allowlist-aware default:
        # if any allowlist is configured for this platform, silently drop
        # unauthorized messages instead of sending pairing codes.
@ -8230,6 +8419,28 @@ class GatewayRunner:
                binding = None
            if binding:
                bound_session_id = str(binding.get("session_id") or "")
+                # Heal bindings that point at a pre-compression parent: walk
+                # the compression-continuation chain forward to its tip so the
+                # next message resumes the compressed child instead of
+                # reloading the oversized parent transcript (#20470/#29712/
+                # #33414). Returns the input unchanged when the session isn't
+                # a compression parent, so this is cheap and safe.
+                if bound_session_id and self._session_db is not None:
+                    try:
+                        canonical_session_id = self._session_db.get_compression_tip(
+                            bound_session_id,
+                        )
+                    except Exception:
+                        logger.debug(
+                            "compression-tip lookup failed for %s",
+                            bound_session_id, exc_info=True,
+                        )
+                        canonical_session_id = bound_session_id
+                    if (
+                        canonical_session_id
+                        and canonical_session_id != bound_session_id
+                    ):
+                        bound_session_id = canonical_session_id
                if bound_session_id and bound_session_id != session_entry.session_id:
                    # Route the override through SessionStore so the session_key
                    # → session_id mapping is persisted to disk and the previous
@ -8239,6 +8450,15 @@ class GatewayRunner:
                    switched = self.session_store.switch_session(session_key, bound_session_id)
                    if switched is not None:
                        session_entry = switched
+                # If the stored binding pointed at a parent, rewrite it to the
+                # canonical descendant now that we've followed the chain.
+                if (
+                    bound_session_id
+                    and bound_session_id != str(binding.get("session_id") or "")
+                ):
+                    self._sync_telegram_topic_binding(
+                        source, session_entry, reason="compression-tip-walk",
+                    )
            else:
                try:
                    self._record_telegram_topic_binding(source, session_entry)
@ -8615,6 +8835,10 @@ class GatewayRunner:
                                    if _hyg_new_sid != session_entry.session_id:
                                        session_entry.session_id = _hyg_new_sid
                                        self.session_store._save()
+                                        self._sync_telegram_topic_binding(
+                                            source, session_entry,
+                                            reason="hygiene-compression",
+                                        )

                                    self.session_store.rewrite_transcript(
                                        session_entry.session_id, _compressed
@ -8880,6 +9104,9 @@ class GatewayRunner:
            if agent_result.get("session_id") and agent_result["session_id"] != session_entry.session_id:
                session_entry.session_id = agent_result["session_id"]
                self.session_store._save()
+                self._sync_telegram_topic_binding(
+                    source, session_entry, reason="agent-result-compression",
+                )

            # Prepend reasoning/thinking if display is enabled (per-platform)
            try:
@ -10361,6 +10588,22 @@ class GatewayRunner:
                            except Exception as exc:
                                logger.warning("Picker model switch failed for cached agent: %s", exc)

+                        # Persist the new model to the session DB so the
+                        # dashboard shows the updated model (#34850).
+                        _sess_db = getattr(_self, "_session_db", None)
+                        if _sess_db is not None:
+                            try:
+                                _sess_entry = _self.session_store.get_or_create_session(
+                                    event.source
+                                )
+                                _sess_db.update_session_model(
+                                    _sess_entry.session_id, result.new_model
+                                )
+                            except Exception as exc:
+                                logger.debug(
+                                    "Failed to persist model switch to DB: %s", exc
+                                )
+
                        # Store model note + session override
                        if not hasattr(_self, "_pending_model_notes"):
                            _self._pending_model_notes = {}
@ -10498,6 +10741,20 @@ class GatewayRunner:
            except Exception as exc:
                logger.warning("In-place model switch failed for cached agent: %s", exc)

+        # Persist the new model to the session DB so the dashboard
+        # shows the updated model (#34850).
+        _sess_db = getattr(self, "_session_db", None)
+        if _sess_db is not None:
+            try:
+                _sess_entry = self.session_store.get_or_create_session(source)
+                _sess_db.update_session_model(
+                    _sess_entry.session_id, result.new_model
+                )
+            except Exception as exc:
+                logger.debug(
+                    "Failed to persist model switch to DB: %s", exc
+                )
+
        # Store a note to prepend to the next user message so the model
        # knows about the switch (avoids system messages mid-history).
        if not hasattr(self, "_pending_model_notes"):
@ -11555,9 +11812,16 @@ class GatewayRunner:

            from gateway.platforms.base import BasePlatformAdapter, should_send_media_as_audio

-            media_files, _ = adapter.extract_media(response)
+            media_files, cleaned = adapter.extract_media(response)
            media_files = BasePlatformAdapter.filter_media_delivery_paths(media_files)
-            _, cleaned = adapter.extract_images(response)
+            # Chain the cleaned text through each extractor (extract_media →
+            # extract_images → extract_local_files) so MEDIA: tags and image URLs
+            # are removed before the bare-path auto-detect runs. Previously the
+            # cleaned text from extract_media was dropped (``_``) and
+            # extract_local_files scanned text that still contained MEDIA: tags,
+            # producing false-positive bare-path matches with the MEDIA: prefix
+            # glued on. This matches the chain order in gateway/platforms/base.py.
+            _, cleaned = adapter.extract_images(cleaned)
            local_files, _ = adapter.extract_local_files(cleaned)
            local_files = BasePlatformAdapter.filter_local_delivery_paths(local_files)

@ -12254,6 +12518,12 @@ class GatewayRunner:
        Accepts an optional focus topic: ``/compress <focus>`` guides the
        summariser to preserve information related to *focus* while being
        more aggressive about discarding everything else.
+
+        Also accepts the boundary-aware form ``/compress here [N]``:
+        summarize everything except the most recent ``N`` exchanges
+        (default 2), kept verbatim. Inspired by Claude Code's Rewind
+        "Summarize up to here" action (v2.1.139, May 2026,
+        https://code.claude.com/docs/en/whats-new/2026-w20).
        """
        source = event.source
        session_entry = self.session_store.get_or_create_session(source)
@ -12262,8 +12532,15 @@ class GatewayRunner:
        if not history or len(history) < 4:
            return t("gateway.compress.not_enough")

-        # Extract optional focus topic from command args
-        focus_topic = (event.get_command_args() or "").strip() or None
+        # Parse args: either a focus topic (full compress) or the
+        # boundary-aware "here [N]" form (partial compress).
+        from hermes_cli.partial_compress import (
+            parse_partial_compress_args,
+            rejoin_compressed_head_and_tail,
+            split_history_for_partial_compress,
+        )
+        _raw_args = (event.get_command_args() or "").strip()
+        partial, keep_last, focus_topic = parse_partial_compress_args(_raw_args)

        try:
            from run_agent import AIAgent
@ -12284,6 +12561,19 @@ class GatewayRunner:
                if m.get("role") in {"user", "assistant"} and m.get("content")
            ]

+            # Boundary-aware split: only the head is summarized; the most
+            # recent `keep_last` exchanges are preserved verbatim. The
+            # split snaps the tail to a user-turn start so the rejoined
+            # transcript keeps role alternation valid.
+            tail: list = []
+            head = msgs
+            if partial:
+                head, tail = split_history_for_partial_compress(msgs, keep_last)
+                if not tail:
+                    # Degenerate split — fall back to full compression.
+                    partial = False
+                    head = msgs
+
            tmp_agent = AIAgent(
                **runtime_kwargs,
                model=model,
@ -12307,15 +12597,20 @@ class GatewayRunner:
                )

                compressor = tmp_agent.context_compressor
-                if not compressor.has_content_to_compress(msgs):
+                if not compressor.has_content_to_compress(head):
                    return t("gateway.compress.nothing_to_do")

                loop = asyncio.get_running_loop()
                compressed, _ = await loop.run_in_executor(
                    None,
-                    lambda: tmp_agent._compress_context(msgs, "", approx_tokens=approx_tokens, focus_topic=focus_topic, force=True)
+                    lambda: tmp_agent._compress_context(head, "", approx_tokens=approx_tokens, focus_topic=focus_topic, force=True)
                )

+                # Re-append the verbatim tail after the compressed head,
+                # guarding the seam against illegal role adjacency.
+                if partial and tail:
+                    compressed = rejoin_compressed_head_and_tail(compressed, tail)
+
                # _compress_context already calls end_session() on the old session
                # (preserving its full transcript in SQLite) and creates a new
                # session_id for the continuation.  Write the compressed messages
@ -12324,6 +12619,9 @@ class GatewayRunner:
                if new_session_id != session_entry.session_id:
                    session_entry.session_id = new_session_id
                    self.session_store._save()
+                    self._sync_telegram_topic_binding(
+                        source, session_entry, reason="compress-command",
+                    )

                self.session_store.rewrite_transcript(new_session_id, compressed)
                # Reset stored token count — transcript changed, old value is stale
@ -15102,8 +15400,52 @@ class GatewayRunner:
        ("compression", "target_ratio"),
        ("compression", "protect_last_n"),
        ("agent", "disabled_toolsets"),
+        ("memory", "provider"),
    )

+    _HONCHO_CACHE_BUSTING_KEYS = (
+        "honcho.peer_name",
+        "honcho.ai_peer",
+        "honcho.pin_peer_name",
+        "honcho.runtime_peer_prefix",
+        "honcho.user_peer_aliases",
+    )
+    _HONCHO_CACHE_BUSTING_MEMO: dict[tuple[str, int | None], dict[str, Any]] = {}
+
+    @classmethod
+    def _empty_honcho_cache_busting_config(cls) -> dict[str, Any]:
+        return {key: None for key in cls._HONCHO_CACHE_BUSTING_KEYS}  # placeholder: every Honcho key present, each mapped to None
+
+    @classmethod
+    def _extract_honcho_cache_busting_config(cls) -> dict[str, Any]:
+        """Extract Honcho identity keys, memoized by honcho.json mtime."""
+        try:
+            from plugins.memory.honcho.client import HonchoClientConfig, resolve_config_path
+
+            path = resolve_config_path()
+            try:
+                mtime_ns = path.stat().st_mtime_ns
+            except OSError:
+                mtime_ns = None  # stat failed; the result is memoized under a None mtime
+            memo_key = (str(path), mtime_ns)  # differs whenever the resolved path or its mtime differs
+            cached = cls._HONCHO_CACHE_BUSTING_MEMO.get(memo_key)
+            if cached is not None:
+                return dict(cached)  # shallow copy of the memoized values
+
+            hcfg = HonchoClientConfig.from_global_config(config_path=path)
+            aliases = hcfg.user_peer_aliases or {}
+            values = {
+                "honcho.peer_name": hcfg.peer_name,
+                "honcho.ai_peer": hcfg.ai_peer,
+                "honcho.pin_peer_name": bool(hcfg.pin_peer_name),
+                "honcho.runtime_peer_prefix": hcfg.runtime_peer_prefix or "",
+                "honcho.user_peer_aliases": sorted(aliases.items()) if isinstance(aliases, dict) else [],
+            }
+            cls._HONCHO_CACHE_BUSTING_MEMO = {memo_key: values}  # replaces the whole memo: only the latest entry is kept
+            return dict(values)
+        except Exception:  # any failure inside the try (import, path resolution, config load) yields the all-None placeholder
+            return cls._empty_honcho_cache_busting_config()
+
    @classmethod
    def _extract_cache_busting_config(cls, user_config: dict | None) -> dict:
        """Pull values that must bust the cached agent.
@ -15134,26 +15476,12 @@ class GatewayRunner:
            out["tools.registry_generation"] = None

        # Honcho identity-mapping keys live in honcho.json, not user_config.
-        # HonchoSessionManager freezes the resolved peer_name / ai_peer /
-        # pin / aliases / prefix at construction; without busting here,
-        # mid-flight honcho.json edits go unread until the next unrelated
-        # cache eviction.
-        try:
-            from plugins.memory.honcho.client import HonchoClientConfig
-
-            hcfg = HonchoClientConfig.from_global_config()
-            out["honcho.peer_name"] = hcfg.peer_name
-            out["honcho.ai_peer"] = hcfg.ai_peer
-            out["honcho.pin_peer_name"] = bool(hcfg.pin_peer_name)
-            out["honcho.runtime_peer_prefix"] = hcfg.runtime_peer_prefix or ""
-            aliases = hcfg.user_peer_aliases or {}
-            out["honcho.user_peer_aliases"] = sorted(aliases.items()) if isinstance(aliases, dict) else []
-        except Exception:
-            out["honcho.peer_name"] = None
-            out["honcho.ai_peer"] = None
-            out["honcho.pin_peer_name"] = None
-            out["honcho.runtime_peer_prefix"] = None
-            out["honcho.user_peer_aliases"] = None
+        # Only read that file when Honcho is the active memory provider.
+        provider = cfg_get(cfg, "memory", "provider")
+        if isinstance(provider, str) and provider.lower() == "honcho":
+            out.update(cls._extract_honcho_cache_busting_config())
+        else:
+            out.update(cls._empty_honcho_cache_busting_config())

        return out

@ -16992,7 +17320,7 @@ class GatewayRunner:
                    _hc = _hm.get("content", "")
                    if "MEDIA:" in _hc:
                        _TOOL_MEDIA_RE = re.compile(
-                            r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
+                            r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
                            r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
                            r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
                            r'txt|csv|apk|ipa))',
@ -17287,18 +17615,38 @@ class GatewayRunner:
            # append any that aren't already present in the final response, so the
            # adapter's extract_media() can find and deliver the files exactly once.
            #
-            # Uses path-based deduplication against _history_media_paths (collected
-            # before run_conversation) instead of index slicing. This is safe even
-            # when context compression shrinks the message list. (Fixes #160)
+            # Scope the scan to THIS turn's tool results only. ``agent_history``
+            # was passed into run_conversation as ``conversation_history``, so the
+            # agent's returned ``messages`` list is ``agent_history`` followed by
+            # the messages produced this turn. Slicing at ``len(agent_history)``
+            # isolates the current turn precisely, so a stale MEDIA: path emitted
+            # by a tool several turns earlier (still present in the full message
+            # list) can never leak onto a later text-only reply. (Fixes #34608)
+            #
+            # Path-based deduplication against _history_media_paths (collected
+            # before run_conversation) is retained as a secondary guard. It is
+            # also the sole guard on the fallback branch taken when mid-run
+            # context compression shrinks the message list below the original
+            # history length, preserving the compression-safe behaviour of #160.
            if "MEDIA:" not in final_response:
                media_tags = []
                has_voice_directive = False
-                for msg in result.get("messages", []):
+                _all_msgs = result.get("messages", [])
+                _history_len = len(agent_history)
+                # Only trust the slice boundary when the message list still
+                # contains the full history prefix. Mid-run compression can
+                # rewrite/shrink the list; in that case fall back to scanning
+                # everything and rely on _history_media_paths for dedup.
+                if _history_len and len(_all_msgs) >= _history_len:
+                    _scan_msgs = _all_msgs[_history_len:]
+                else:
+                    _scan_msgs = _all_msgs
+                for msg in _scan_msgs:
                    if msg.get("role") in {"tool", "function"}:
                        content = msg.get("content", "")
                        if "MEDIA:" in content:
                            _TOOL_MEDIA_RE = re.compile(
-                                r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
+                                r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
                                r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
                                r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
                                r'txt|csv|apk|ipa))',
@ -18251,7 +18599,10 @@ def _run_planned_stop_watcher(
        poll_interval: seconds between marker checks. 0.5s gives a
            responsive shutdown without burning CPU.
    """
-    from gateway.status import _get_planned_stop_marker_path
+    from gateway.status import (
+        _get_planned_stop_marker_path,
+        planned_stop_marker_targets_self,
+    )
    marker_path = _get_planned_stop_marker_path()
    while not stop_event.is_set():
        try:
@ -18260,6 +18611,26 @@ def _run_planned_stop_watcher(
                and not getattr(runner, "_draining", False)
                and getattr(runner, "_running", False)
            ):
+                # A marker existing is NOT sufficient — it may have been
+                # written for a PREVIOUS gateway instance (different PID)
+                # and left behind because that process exited before the
+                # CLI's stop() could clean it up. Firing the handler on a
+                # stale/foreign marker drives the gateway into shutdown,
+                # then consume_planned_stop_marker_for_self() correctly
+                # reports a PID mismatch — but by then we're already
+                # stopping, so it's logged as an unexpected "UNKNOWN" exit
+                # and the watchdog crash-loops the gateway (issue #34597,
+                # a regression from PR #33798 which added this watcher
+                # without the PID check).
+                #
+                # Only fire when the marker actually targets us. The probe
+                # is non-destructive on a match (the handler does the
+                # authoritative consume on the loop thread) and self-heals
+                # by unlinking stale/malformed markers so they cannot wedge
+                # a freshly booted gateway.
+                if not planned_stop_marker_targets_self():
+                    stop_event.wait(poll_interval)
+                    continue
                # Drive the same path as a real signal handler.
                # Pass signal=None — the handler tolerates that and consumes
                # the marker via consume_planned_stop_marker_for_self,
--- a/gateway/runtime_footer.py
+++ b/gateway/runtime_footer.py
@ -26,7 +26,6 @@ piecemeal, the footer is sent as a separate trailing message via
 from __future__ import annotations

 import os
-from pathlib import Path
 from typing import Any, Iterable, Optional

 _DEFAULT_FIELDS: tuple[str, ...] = ("model", "context_pct", "cwd")
--- a/gateway/status.py
+++ b/gateway/status.py
@ -816,12 +816,24 @@ def _consume_pid_marker_for_self(

    our_pid = os.getpid()
    our_start_time = _get_process_start_time(our_pid)
-    matches = (
-        target_pid == our_pid
-        and target_start_time is not None
-        and our_start_time is not None
-        and target_start_time == our_start_time
-    )
+    # Start-time is a PID-reuse guard. It is only meaningful when both
+    # sides actually have it: ``_get_process_start_time`` returns None on
+    # platforms without ``/proc`` (macOS, native Windows — the very
+    # platform the planned-stop watcher exists for). Requiring a non-None
+    # match there would make every consume return False, so a legitimate
+    # ``hermes gateway stop`` on Windows would be misclassified as an
+    # unexpected ``UNKNOWN`` exit (exit 1) and revived by the service
+    # manager. So: when both start_times are known they must match; when
+    # either is unknown, fall back to PID equality alone (bounded by the
+    # marker's short TTL). This mirrors ``planned_stop_marker_targets_self``
+    # so the watcher's non-destructive probe and this authoritative
+    # consume agree on every platform (issue #34597).
+    if target_pid != our_pid:
+        matches = False
+    elif target_start_time is not None and our_start_time is not None:
+        matches = target_start_time == our_start_time
+    else:
+        matches = True

    try:
        path.unlink(missing_ok=True)
@ -914,6 +926,68 @@ def consume_planned_stop_marker_for_self() -> bool:
    )


+def planned_stop_marker_targets_self() -> bool:
+    """Tell whether a fresh planned-stop marker is addressed to this process.
+
+    The watcher thread (``gateway/run.py:_run_planned_stop_watcher``) calls
+    this **non-destructive** probe to decide whether to start shutdown. A
+    marker that matches us is never unlinked here — the shutdown handler
+    performs the authoritative consume through
+    :func:`consume_planned_stop_marker_for_self`.
+
+    Markers that can never apply *are* removed: a malformed record, or one
+    older than ``_PLANNED_STOP_MARKER_TTL_S``, is unlinked so a leftover
+    file cannot wedge a newly started gateway. A marker naming another PID
+    stays on disk (the process it names may still consume it) and simply
+    yields False.
+
+    Returns:
+        True when the marker names our PID and, where both start times are
+        known, our start time as well. False otherwise, including when no
+        record could be read or the record is malformed or stale.
+    """
+    marker_file = _get_planned_stop_marker_path()
+
+    def _discard() -> bool:
+        # Best-effort removal of a marker nobody can use; the answer for
+        # such a marker is always False, so return it from here.
+        try:
+            marker_file.unlink(missing_ok=True)
+        except OSError:
+            pass
+        return False
+
+    payload = _read_json_file(marker_file)
+    if not payload:
+        return False
+
+    try:
+        marker_pid = int(payload["target_pid"])
+        marker_start = payload.get("target_start_time")
+        stamp = payload.get("written_at") or ""
+    except (KeyError, TypeError, ValueError):
+        # Malformed marker can never match anyone — drop it.
+        return _discard()
+
+    if _marker_is_stale(stamp, _PLANNED_STOP_MARKER_TTL_S):
+        # Past its useful life regardless of target — clean it up so it
+        # cannot crash-loop a freshly booted gateway.
+        return _discard()
+
+    self_pid = os.getpid()
+    if marker_pid != self_pid:
+        return False
+
+    # Start time only guards against PID reuse, and only when both sides
+    # know it: ``_get_process_start_time`` returns None on platforms
+    # without ``/proc`` (macOS, native Windows). Demanding a non-None match
+    # there would keep the watcher from ever firing, so an unknown value on
+    # either side falls back to PID equality alone, bounded by the marker's
+    # short TTL.
+    self_start = _get_process_start_time(self_pid)
+    if marker_start is None or self_start is None:
+        return True
+    return marker_start == self_start
+
+
 def clear_planned_stop_marker() -> None:
    """Remove the planned-stop marker unconditionally."""
    try:
--- a/gateway/stream_consumer.py
+++ b/gateway/stream_consumer.py
@ -26,6 +26,7 @@ from typing import Any, Callable, Optional

 from gateway.platforms.base import BasePlatformAdapter as _BasePlatformAdapter
 from gateway.platforms.base import _custom_unit_to_cp
+from gateway.platforms.base import MEDIA_TAG_CLEANUP_RE
 from gateway.config import (
    DEFAULT_STREAMING_EDIT_INTERVAL as _DEFAULT_STREAMING_EDIT_INTERVAL,
    DEFAULT_STREAMING_BUFFER_THRESHOLD as _DEFAULT_STREAMING_BUFFER_THRESHOLD,
@ -645,10 +646,13 @@ class GatewayStreamConsumer:
        except Exception as e:
            logger.error("Stream consumer error: %s", e)

-    # Pattern to strip MEDIA:<path> tags (including optional surrounding quotes).
-    # Matches the simple cleanup regex used by the non-streaming path in
-    # gateway/platforms/base.py for post-processing.
-    _MEDIA_RE = re.compile(r'''[`"']?MEDIA:\s*\S+[`"']?''')
+    # Strip MEDIA:<path> tags before display. Uses the shared anchored
+    # MEDIA_TAG_CLEANUP_RE from gateway/platforms/base.py — only tags whose
+    # path ends in a deliverable extension are removed, so an unknown-extension
+    # path stays visible instead of being silently dropped (issue #34517).
+    # Streaming and non-streaming paths share the same regex, so a tag is
+    # treated identically whichever path delivered the text.
+    _MEDIA_RE = MEDIA_TAG_CLEANUP_RE

    @staticmethod
    def _clean_for_display(text: str) -> str:
--- a/hermes_cli/init.py
+++ b/hermes_cli/init.py
@ -14,8 +14,8 @@ Provides subcommands for:
 import os
 import sys

-__version__ = "0.15.0"
-__release_date__ = "2026.5.28"
+__version__ = "0.15.1"
+__release_date__ = "2026.5.29"


 def _ensure_utf8():
--- a/hermes_cli/_subprocess_compat.py
+++ b/hermes_cli/_subprocess_compat.py
@ -27,11 +27,9 @@ guarantee.

 from __future__ import annotations

-import os
 import shutil
-import subprocess
 import sys
-from typing import Optional, Sequence
+from typing import Sequence

 __all__ = [
    "IS_WINDOWS",
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
--- a/hermes_cli/auth_commands.py
+++ b/hermes_cli/auth_commands.py
@ -272,9 +272,6 @@ def auth_add_command(args) -> None:
                print("Rehydrating Nous session from shared credentials...")
                rehydrated = auth_mod._try_import_shared_nous_state(
                    timeout_seconds=getattr(args, "timeout", None) or 15.0,
-                    min_key_ttl_seconds=max(
-                        60, int(getattr(args, "min_key_ttl_seconds", 5 * 60))
-                    ),
                )
                if rehydrated is not None:
                    custom_label = (getattr(args, "label", None) or "").strip() or None
@ -297,7 +294,6 @@ def auth_add_command(args) -> None:
            timeout_seconds=getattr(args, "timeout", None) or 15.0,
            insecure=bool(getattr(args, "insecure", False)),
            ca_bundle=getattr(args, "ca_bundle", None),
-            min_key_ttl_seconds=max(60, int(getattr(args, "min_key_ttl_seconds", 5 * 60))),
        )
        # Honor `--label <name>` so nous matches other providers' UX.  The
        # helper embeds this into providers.nous so that label_from_token
--- a/hermes_cli/backup.py
+++ b/hermes_cli/backup.py
@ -670,6 +670,105 @@ def restore_quick_snapshot(
    return restored > 0


+# Relative path of the cron job database inside HERMES_HOME. Kept in sync with
+# the entry in ``_QUICK_STATE_FILES`` and with ``cron/jobs.py``'s ``JOBS_FILE``.
+_CRON_JOBS_REL = "cron/jobs.json"
+
+
+def _count_cron_jobs(path: Path) -> Optional[int]:
+    """Return the number of cron jobs stored in ``path``.
+
+    The canonical on-disk shape is ``{"jobs": [...]}`` (see ``cron/jobs.py``).
+    A legacy bare-list shape (``[...]``) is also honoured.
+
+    Returns:
+        The job count for any *valid, readable* JSON document, or ``None`` if
+        the file is missing, unreadable, not valid UTF-8, or not valid JSON.
+        ``None`` means "unknown" — callers must not treat it as "zero jobs",
+        because that could mask a real corruption the user needs to see.
+    """
+    if not path.is_file():
+        return None
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # ValueError covers JSONDecodeError *and* UnicodeDecodeError, so a
+        # byte-corrupted file reports "unknown" instead of raising.
+        return None
+    if isinstance(data, dict):
+        jobs = data.get("jobs", [])
+        return len(jobs) if isinstance(jobs, list) else None
+    return len(data) if isinstance(data, list) else None
+
+
+def restore_cron_jobs_if_emptied(
+    snapshot_id: str,
+    hermes_home: Optional[Path] = None,
+) -> Optional[Dict[str, Any]]:
+    """Put ``cron/jobs.json`` back when an update left it valid but empty.
+
+    Config-version migrations have been observed to leave ``cron/jobs.json``
+    parseable yet holding no jobs after ``hermes update``, silently dropping
+    every scheduled job (issue #34600). Shape validation cannot catch that,
+    because ``{"jobs": []}`` is a perfectly valid document.
+
+    The live job count is compared with the one captured in the pre-update
+    quick snapshot. The snapshot copy is copied over the live file only when
+    the live file holds **zero** jobs and the snapshot holds **at least one**.
+
+    Anything less clear-cut is left alone on purpose:
+
+    * a live file that still has jobs is obviously fine;
+    * a live file that is missing or unparseable (count ``None``) is a
+      different failure the user should see rather than have papered over;
+    * a snapshot with no jobs (or an unreadable one) has nothing worth
+      restoring, which also covers a user who never had any jobs.
+
+    Args:
+        snapshot_id: The pre-update quick-snapshot id (from
+            :func:`create_quick_snapshot`). A falsy id is a no-op.
+        hermes_home: Override for the Hermes home directory (tests).
+
+    Returns:
+        ``None`` when nothing was done, including when the copy itself
+        failed (that failure is logged). After a successful restore, a dict
+        ``{"restored": True, "job_count": N, "snapshot_id": ...}`` so the
+        caller can warn the user.
+    """
+    if not snapshot_id:
+        return None
+
+    root = hermes_home or get_hermes_home()
+    target = root / _CRON_JOBS_REL
+
+    # Continue only for a readable live file with exactly zero jobs; both
+    # ``None`` (unknown) and any positive count stop here.
+    if _count_cron_jobs(target) != 0:
+        return None
+
+    backup = _quick_snapshot_root(root) / snapshot_id / _CRON_JOBS_REL
+    saved_jobs = _count_cron_jobs(backup)
+    if saved_jobs is None or saved_jobs == 0:
+        return None
+
+    try:
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(backup, target)
+    except OSError as exc:  # PermissionError is an OSError subclass
+        logger.error(
+            "Cron jobs were emptied during update but auto-restore failed: %s", exc
+        )
+        return None
+
+    logger.warning(
+        "Restored %d cron job(s) from pre-update snapshot %s "
+        "(cron/jobs.json was emptied during migration)",
+        saved_jobs,
+        snapshot_id,
+    )
+    return {"restored": True, "job_count": saved_jobs, "snapshot_id": snapshot_id}
+
+
 def _prune_quick_snapshots(root: Path, keep: int = _QUICK_DEFAULT_KEEP) -> int:
    """Remove oldest quick snapshots beyond the keep limit. Returns count deleted."""
    if not root.exists():
--- a/hermes_cli/banner.py
+++ b/hermes_cli/banner.py
@ -12,14 +12,16 @@ import threading
 import time
 from pathlib import Path
 from hermes_constants import get_hermes_home
-from typing import Dict, List, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional

-from rich.console import Console
-from rich.panel import Panel
-from rich.table import Table
-
-from prompt_toolkit import print_formatted_text as _pt_print
-from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
+# rich and prompt_toolkit are imported lazily (inside the functions that use
+# them) rather than at module level.  Importing this module is on the TUI
+# gateway's critical startup path purely to reach the lightweight update-check
+# helpers (``prefetch_update_check``); pulling rich.console + prompt_toolkit
+# eagerly added ~50ms of wasted imports before ``gateway.ready`` could fire.
+# Keep the type-only reference available to checkers without the runtime cost.
+if TYPE_CHECKING:
+    from rich.console import Console

 logger = logging.getLogger(__name__)

@ -36,6 +38,8 @@ _RST = "\033[0m"

 def cprint(text: str):
    """Print ANSI-colored text through prompt_toolkit's renderer."""
+    from prompt_toolkit import print_formatted_text as _pt_print
+    from prompt_toolkit.formatted_text import ANSI as _PT_ANSI
    _pt_print(_PT_ANSI(text))


@ -50,17 +54,6 @@ def _skin_color(key: str, fallback: str) -> str:
        return get_active_skin().get_color(key, fallback)
    except Exception:
        return fallback
-
-
-def _skin_branding(key: str, fallback: str) -> str:
-    """Get a branding string from the active skin, or return fallback."""
-    try:
-        from hermes_cli.skin_engine import get_active_skin
-        return get_active_skin().get_branding(key, fallback)
-    except Exception:
-        return fallback
-
-
 # =========================================================================
 # ASCII Art & Branding
 # =========================================================================
@ -232,7 +225,11 @@ def check_for_updates() -> Optional[int]:
    cache_file = hermes_home / ".update_check"
    embedded_rev = os.environ.get("HERMES_REVISION") or None

-    # Read cache — invalidate if the embedded rev has changed since last check
+    # Read cache — invalidate if the embedded rev OR installed version has
+    # changed since the last check. The version guard matters for pip installs:
+    # `check_via_pypi()` compares against VERSION, so a `pip install --upgrade`
+    # changes VERSION but leaves rev unchanged (both None), and without this
+    # the stale "behind" count would survive the upgrade for up to 6h. See #34491.
    now = time.time()
    try:
        if cache_file.exists():
@ -240,6 +237,7 @@ def check_for_updates() -> Optional[int]:
            if (
                now - cached.get("ts", 0) < _UPDATE_CHECK_CACHE_SECONDS
                and cached.get("rev") == embedded_rev
+                and cached.get("ver") == VERSION
            ):
                return cached.get("behind")
    except Exception:
@ -260,7 +258,9 @@ def check_for_updates() -> Optional[int]:
            behind = _check_via_local_git(repo_dir)

    try:
-        cache_file.write_text(json.dumps({"ts": now, "behind": behind, "rev": embedded_rev}))
+        cache_file.write_text(
+            json.dumps({"ts": now, "behind": behind, "rev": embedded_rev, "ver": VERSION})
+        )
    except Exception:
        pass

@ -475,7 +475,7 @@ def _display_toolset_name(toolset_name: str) -> str:
    )


-def build_welcome_banner(console: Console, model: str, cwd: str,
+def build_welcome_banner(console: "Console", model: str, cwd: str,
                         tools: List[dict] = None,
                         enabled_toolsets: List[str] = None,
                         session_id: str = None,
@ -494,6 +494,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
        context_length: Model's context window size in tokens.
    """
    from model_tools import check_tool_availability, TOOLSET_REQUIREMENTS
+    from rich.panel import Panel
+    from rich.table import Table
    if get_toolset_for_tool is None:
        from model_tools import get_toolset_for_tool

@ -702,6 +704,21 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
    except Exception:
        pass  # Never break the banner over an update check

+    # Pip-install warning — `pip install hermes-agent` is not the supported
+    # install path (it exists on PyPI for internal/CI reasons, not end users).
+    # Such installs miss the git checkout + installer-managed deps, so updates,
+    # self-update, and issue triage don't behave correctly. Warn, don't block.
+    try:
+        from hermes_cli.config import detect_install_method
+        if detect_install_method() == "pip":
+            right_lines.append(
+                "[bold yellow]⚠ pip install not officially supported[/]"
+                "[dim yellow] — exists for reasons other than user install; "
+                "expect instability and an inability to support issues[/]"
+            )
+    except Exception:
+        pass  # Never break the banner over the install-method check
+
    right_content = "\n".join(right_lines)
    layout_table.add_row(left_content, right_content)

--- a/hermes_cli/bundles.py
+++ b/hermes_cli/bundles.py
@ -15,7 +15,7 @@ Subcommands:
 from __future__ import annotations

 import sys
-from typing import List, Optional
+from typing import List

 from rich.console import Console
 from rich.table import Table
--- a/hermes_cli/checkpoints.py
+++ b/hermes_cli/checkpoints.py
@ -25,7 +25,7 @@ import argparse
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any


 def _fmt_bytes(n: int) -> str:
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@ -85,8 +85,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
               args_hint="<platform>", cli_only=True),
    CommandDef("branch", "Branch the current session (explore a different path)", "Session",
               aliases=("fork",), args_hint="[name]"),
-    CommandDef("compress", "Manually compress conversation context", "Session",
-               args_hint="[focus topic]"),
+    CommandDef("compress", "Compress conversation context (add 'here [N]' to keep recent N turns)", "Session",
+               args_hint="[here [N] | focus topic]"),
    CommandDef("rollback", "List or restore filesystem checkpoints", "Session",
               args_hint="[number]"),
    CommandDef("snapshot", "Create or restore state snapshots of Hermes config/state", "Session",
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -285,9 +285,22 @@ def detect_install_method(project_root: Optional[Path] = None) -> str:
    Resolution order:
    1. Stamped ``~/.hermes/.install_method`` file (written by installers)
    2. HERMES_MANAGED env / .managed marker (NixOS, Homebrew)
-    3. Container detection (/.dockerenv, /run/.containerenv, cgroup)
-    4. .git directory presence -> 'git'
-    5. Fallback -> 'pip'
+    3. .git directory presence -> 'git'
+    4. Fallback -> 'pip'
+
+    Note: running inside a container is NOT treated as "docker" on its own.
+    The two supported install paths both self-identify via the
+    ``.install_method`` stamp (caught by step 1), so neither relies on
+    container detection here:
+      - the curl installer (scripts/install.sh, the README/website install
+        command) git-clones the repo and stamps ``git``;
+      - the published ``nousresearch/hermes-agent`` image stamps ``docker``
+        at boot via ``docker/stage2-hook.sh``.
+    An unsupported manual install dropped into a container (no stamp) was
+    wrongly classified as the published image by bare container detection,
+    so ``hermes update`` bailed with "doesn't apply inside the Docker
+    container". Without that fallback such installs fall through to the
+    ``.git``/pip checks and behave like any off-path install. See issue #34397.
    """
    stamp = get_hermes_home() / ".install_method"
    try:
@ -299,9 +312,6 @@ def detect_install_method(project_root: Optional[Path] = None) -> str:
    managed = get_managed_system()
    if managed:
        return managed.lower().replace(" ", "-")
-    from hermes_constants import is_container
-    if is_container():
-        return "docker"
    if project_root is None:
        project_root = Path(__file__).parent.parent.resolve()
    if (project_root / ".git").is_dir():
@ -319,6 +329,34 @@ def stamp_install_method(method: str) -> None:
        pass


+def is_uv_tool_install() -> bool:
+    """Return True when the *running* Hermes lives in a ``uv tool`` layout.
+
+    ``uv tool install hermes-agent`` places the install at
+    ``.../uv/tools/hermes-agent/...`` (default ``~/.local/share/uv/tools``)
+    or, when ``UV_TOOL_DIR`` is set, at ``$UV_TOOL_DIR/hermes-agent/...``.
+    Such installs live outside any virtualenv, so ``uv pip install`` fails
+    with ``No virtual environment found`` and the update path must use
+    ``uv tool upgrade`` instead.
+
+    Detection is intentionally restricted to properties of the running
+    interpreter (``sys.prefix`` / ``sys.executable``), plus the
+    ``UV_TOOL_DIR`` environment variable so a relocated tool directory whose
+    path does not contain ``/uv/tools/`` is still recognised. We
+    deliberately do NOT consult ``uv tool list``: it would also return True
+    when ``hermes-agent`` happens to be uv-tool-installed on the machine
+    while the *active* Hermes is a regular pip/venv install, causing
+    ``hermes update`` to upgrade the wrong copy. It would also block on a
+    subprocess call (~seconds) just to compute a recommendation string.
+    """
+    def _normalize(path: str) -> str:
+        # Forward slashes, lower case and a trailing slash, so the package
+        # directory is only ever matched as a whole path component.
+        return os.path.normpath(path).replace(os.sep, "/").lower() + "/"
+
+    custom_root = os.environ.get("UV_TOOL_DIR", "").strip()
+    custom_marker = (
+        _normalize(os.path.join(custom_root, "hermes-agent")) if custom_root else None
+    )
+
+    def _has_uv_tool_marker(path: str) -> bool:
+        norm = _normalize(path)
+        if "/uv/tools/hermes-agent/" in norm:
+            return True
+        # Relocated tool dir: the interpreter must sit under
+        # ``$UV_TOOL_DIR/hermes-agent/``, hence a prefix match.
+        return custom_marker is not None and norm.startswith(custom_marker)
+
+    return _has_uv_tool_marker(sys.prefix) or _has_uv_tool_marker(sys.executable or "")
+
+
 def recommended_update_command_for_method(method: str) -> str:
    """Return the update command or guidance for a given install method."""
    if method == "nixos":
@ -328,9 +366,10 @@ def recommended_update_command_for_method(method: str) -> str:
    if method == "docker":
        return "docker pull nousresearch/hermes-agent:latest"
    if method == "pip":
+        if is_uv_tool_install():
+            return "uv tool upgrade hermes-agent"
        import shutil
-        uv = shutil.which("uv")
-        if uv:
+        if shutil.which("uv"):
            return "uv pip install --upgrade hermes-agent"
        return "pip install --upgrade hermes-agent"
    return "hermes update"
@ -669,6 +708,27 @@ DEFAULT_CONFIG = {
        # (force on/off for all models), or a list of model-name substrings
        # to match (e.g. ["gpt", "codex", "gemini", "qwen"]).
        "tool_use_enforcement": "auto",
+        # Universal "finish the job" guidance — short prompt block applied to
+        # all models that targets two cross-family failure modes: (1) stopping
+        # after a stub instead of finishing the artifact, (2) fabricating
+        # plausible-looking output when a real path is blocked.  Costs ~80
+        # tokens in the cached system prompt.  Set False to disable globally.
+        "task_completion_guidance": True,
+        # Local-environment toolchain probe — surfaces Python/pip/uv/PEP-668
+        # state in the system prompt when something non-default is detected
+        # (e.g. python3 has no pip module, pip→python version mismatch, PEP
+        # 668 enforcement without uv).  Costs zero tokens when the env is
+        # clean (probe emits nothing).  Skipped for remote terminal backends
+        # (docker/modal/ssh — they have their own probe).  Set False to
+        # disable entirely.
+        "environment_probe": True,
+        # Embedder-supplied environment description appended to the system
+        # prompt's environment-hints block. Lets a host that wraps Hermes
+        # (sandbox runner, managed platform) explain the runtime environment
+        # — proxy, credential handling, mount layout — without editing the
+        # identity slot (SOUL.md). Empty by default. The HERMES_ENVIRONMENT_HINT
+        # env var overrides this (build-time/container mechanism).
+        "environment_hint": "",
        # Staged inactivity warning: send a warning to the user at this
        # threshold before escalating to a full timeout.  The warning fires
        # once per run and does not interrupt the agent.  0 = disable warning.
@ -836,6 +896,11 @@ DEFAULT_CONFIG = {
            "session_key": "",
            # Rehydrate tab_id from Camofox before creating a new tab.
            "adopt_existing_tab": False,
+            # Docker Camofox opens page URLs from inside the container. Enable
+            # this to rewrite loopback page URLs (localhost/127.0.0.1/::1) to a
+            # host alias while leaving CAMOFOX_URL itself unchanged.
+            "rewrite_loopback_urls": False,
+            "loopback_host_alias": "host.docker.internal",
        },
    },

@ -1157,6 +1222,11 @@ DEFAULT_CONFIG = {
        # Mirrors `hermes -c` muscle memory.  Default off so existing
        # users aren't surprised.  HERMES_TUI_RESUME=<id> always wins.
        "tui_auto_resume_recent": False,
+        # When true (default), `hermes --tui` drops a one-time hint
+        # ("subagents working · /agents to watch live") the first time a turn
+        # starts delegating, nudging the user toward the live spawn-tree
+        # dashboard. Set false to suppress the hint.
+        "tui_agents_nudge": True,
        "bell_on_complete": False,
        "show_reasoning": False,
        "streaming": False,
@ -1176,6 +1246,13 @@ DEFAULT_CONFIG = {
        # class of over-claim that otherwise forces users to run
        # `git status` to verify edits landed.  Set false to suppress.
        "file_mutation_verifier": True,
+        # Turn-completion explainer.  When true (default), the agent appends a
+        # one-line explanation to its final response whenever a turn ends
+        # abnormally with no usable reply — empty content after retries, a
+        # partial/truncated stream, a still-pending tool result, or an
+        # iteration/budget limit.  Replaces the bare "(empty)" sentinel so the
+        # failure isn't silent from the UI's perspective.  Set false to suppress.
+        "turn_completion_explainer": True,
        "show_cost": False,       # Show $ cost in the status bar (off by default)
        "skin": "default",
        # UI language for static user-facing messages (approval prompts, a
@ -1726,6 +1803,15 @@ DEFAULT_CONFIG = {
        # assignee to any installed profile. When unset, falls back to the
        # default profile. A task never ends up with assignee=None.
        "default_assignee": "",
+        # Per-profile concurrency cap (#21582). When set to a positive int,
+        # no single profile can have more than N workers running at once,
+        # even if the global max_in_progress / max_spawn caps would allow
+        # it. Tasks blocked this way defer to the next dispatcher tick.
+        # Unset (None) means "no per-profile cap" — backward-compatible
+        # with existing installs. Useful for fan-out workflows that would
+        # otherwise saturate one profile's local model / API quota /
+        # browser pool while leaving other profiles idle.
+        "max_in_progress_per_profile": None,
        # When true, the kanban dispatcher auto-runs the decomposer on
        # tasks that land in Triage (every dispatcher tick). When false,
        # decomposition is manual via `hermes kanban decompose <id>` or
@ -1757,6 +1843,38 @@ DEFAULT_CONFIG = {
        "mode": "project",
    },

+    # Tool Search (progressive disclosure for large tool surfaces).
+    # When the model is connected to many MCP servers or non-core plugin
+    # tools, their JSON schemas can consume a substantial fraction of the
+    # context window on every turn. When enabled, those tools are replaced
+    # in the model-facing tools array with three bridge tools —
+    # tool_search / tool_describe / tool_call — and surfaced on demand.
+    #
+    # Core Hermes tools (terminal, read_file, write_file, patch,
+    # search_files, todo, memory, browser_*, etc.) are NEVER deferred.
+    # See tools/tool_search.py for full design notes and the
+    # openclaw-tool-search-report PDF in this PR for the rationale.
+    "tools": {
+        "tool_search": {
+            # "auto" (default) — activate only when deferrable tool schemas
+            #   exceed ``threshold_pct`` of the active model's context length,
+            #   so small toolsets pay no overhead.
+            # "on"  — always activate when there is at least one deferrable
+            #   tool. Use when you have many MCP servers and want maximum
+            #   token reduction unconditionally.
+            # "off" — disable entirely. Tools-array assembly is a pass-through.
+            "enabled": "auto",
+            # Percentage of context length at which "auto" mode kicks in.
+            # 10 matches the Claude Code default. Range 0..100.
+            "threshold_pct": 10,
+            # When the model calls tool_search without a ``limit`` argument,
+            # how many hits to return. Range 1..max_search_limit.
+            "search_default_limit": 5,
+            # Hard upper bound the model can request via ``limit``. Range 1..50.
+            "max_search_limit": 20,
+        },
+    },
+
    # Logging — controls file logging to ~/.hermes/logs/.
    # agent.log captures INFO+ (all agent activity); errors.log captures WARNING+.
    "logging": {
@ -5551,6 +5669,8 @@ def set_config_value(key: str, value: str):
        "terminal.daytona_image": "TERMINAL_DAYTONA_IMAGE",
        "terminal.docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
        "terminal.docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
+        "terminal.docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
+        "terminal.docker_orphan_reaper": "TERMINAL_DOCKER_ORPHAN_REAPER",
        "terminal.docker_env": "TERMINAL_DOCKER_ENV",
        # terminal.cwd intentionally excluded — CLI resolves at runtime,
        # gateway bridges it in gateway/run.py. Persisting to .env causes
--- a/hermes_cli/dashboard_auth/middleware.py
+++ b/hermes_cli/dashboard_auth/middleware.py
@ -26,10 +26,15 @@ from hermes_cli.dashboard_auth import list_providers
 from hermes_cli.dashboard_auth.audit import AuditEvent, audit_log
 from hermes_cli.dashboard_auth.base import ProviderError
 from hermes_cli.dashboard_auth.cookies import read_session_cookies
+from hermes_cli.dashboard_auth.public_paths import PUBLIC_API_PATHS

 _log = logging.getLogger(__name__)

-# Paths that bypass the auth gate. Order matters: prefix match.
+# Prefixes that bypass the auth gate. Match via ``path == prefix`` or
+# ``path.startswith(prefix)`` — so ``/assets/`` (with trailing slash)
+# matches ``/assets/foo.css`` but not ``/assetsleak``. Auth-bootstrap
+# (login page, OAuth round trip, provider listing) and static asset
+# mounts go here.
 _GATE_PUBLIC_PREFIXES: tuple[str, ...] = (
    "/auth/login",
    "/auth/callback",
@ -45,6 +50,20 @@ _GATE_PUBLIC_PREFIXES: tuple[str, ...] = (


 def _path_is_public(path: str) -> bool:
+    """True if ``path`` bypasses the OAuth auth gate.
+
+    Two sources of public-ness:
+
+    * :data:`PUBLIC_API_PATHS` — the shared ``/api/*`` allowlist that
+      the legacy ``_SESSION_TOKEN`` middleware also honours. Matched
+      exactly (no prefix expansion) so adding ``/api/status`` doesn't
+      accidentally expose ``/api/status/secret-extension``.
+    * :data:`_GATE_PUBLIC_PREFIXES` — auth-bootstrap routes and static
+      mounts. Prefix-matched so ``/assets/foo.css`` lights up via
+      ``/assets/``.
+    """
+    if path in PUBLIC_API_PATHS:
+        return True
    return any(
        path == prefix or path.startswith(prefix)
        for prefix in _GATE_PUBLIC_PREFIXES
--- a/hermes_cli/dashboard_auth/public_paths.py
+++ b/hermes_cli/dashboard_auth/public_paths.py
@ -0,0 +1,49 @@
+"""Shared allowlist of ``/api/*`` paths that bypass dashboard auth.
+
+Two middlewares enforce dashboard auth and previously kept independent
+copies of this list:
+
+* ``hermes_cli.web_server.auth_middleware`` — loopback / ``--insecure``
+  mode, gates on the ephemeral ``_SESSION_TOKEN``.
+* ``hermes_cli.dashboard_auth.middleware.gated_auth_middleware`` —
+  non-loopback mode, gates on the OAuth session cookie.
+
+When the lists drifted, ``/api/status`` ended up public under the legacy
+gate but 401'd under the OAuth gate. That broke the portal's wildcard
+liveness probe (``nous-account-service`` ``fly-provider.ts``
+``getInstanceRuntimeStatus``), which fetches ``/api/status`` without a
+cookie as its sole signal of "agent dashboard is alive": every healthy
+wildcard-subdomain agent surfaced as STARTING/down in the portal UI even
+though the dashboard was serving correctly.
+
+Centralising the allowlist here so both middlewares import the same
+frozenset prevents the next drift. Keep this list minimal — only truly
+non-sensitive, read-only endpoints belong here. As a sanity check, every
+entry should be safe to expose to:
+
+  * external uptime probes (Pingdom, Better Stack, NAS),
+  * the dashboard SPA before the user has logged in,
+  * anyone who happens to ``curl`` the hostname.
+
+If a new endpoint doesn't pass all three tests, it should be gated and
+the SPA should bootstrap it after login instead.
+"""
+from __future__ import annotations
+
+PUBLIC_API_PATHS: frozenset[str] = frozenset({
+    # Liveness probe target. Returns version, gateway state, active
+    # session count, and the dashboard auth-gate shape. No bodies, no
+    # session content, no secrets. Documented as the portal's wildcard
+    # liveness probe in
+    # ``docs/agent-dashboard-public-url-contract.md`` (NAS side).
+    "/api/status",
+    # Read-only config-defaults / schema feeds for the SPA's Config page.
+    "/api/config/defaults",
+    "/api/config/schema",
+    # Read-only model metadata (context windows, etc.) — same shape as
+    # provider catalogs already exposed on the public internet.
+    "/api/model/info",
+    # Read-only theme + plugin manifests for the dashboard skin engine.
+    "/api/dashboard/themes",
+    "/api/dashboard/plugins",
+})
--- a/hermes_cli/debug.py
+++ b/hermes_cli/debug.py
@ -17,8 +17,6 @@ import logging
 import re
 import sys
 import time
-import urllib.error
-import urllib.parse
 import urllib.request
 from dataclasses import dataclass
 from pathlib import Path
@ -260,15 +258,6 @@ def _schedule_auto_delete(urls: list[str], delay_seconds: int = _AUTO_DELETE_SEC
    _record_pending(urls, delay_seconds=delay_seconds)


-def _delete_hint(url: str) -> str:
-    """Return a one-liner delete command for the given paste URL."""
-    paste_id = _extract_paste_id(url)
-    if paste_id:
-        return f"hermes debug delete {url}"
-    # dpaste.com — no API delete, expires on its own.
-    return "(auto-expires per dpaste.com policy)"
-
-
 def _upload_paste_rs(content: str) -> str:
    """Upload to paste.rs.  Returns the paste URL.

--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@ -8,7 +8,6 @@ import os
 import sys
 import subprocess
 import shutil
-import importlib.util
 from pathlib import Path

 from hermes_cli.config import get_project_root, get_hermes_home, get_env_path
@ -205,6 +204,60 @@ def _fail_and_issue(text: str, detail: str, fix: str, issues: list[str]) -> None
    issues.append(fix)


+def _read_pyproject_version() -> str | None:
+    """Read the ``[project]`` ``version`` from ``pyproject.toml`` at the project root.
+
+    Only an exact ``version`` key inside the ``[project]`` table is honoured;
+    look-alike keys (``version_file``, ``versions``) and version strings that
+    appear in other tables are ignored.
+
+    Returns:
+        The version string, or None when ``pyproject.toml`` is missing (e.g.
+        an installed wheel, which ships without it), unreadable, not valid
+        UTF-8, or carries no ``[project]`` version.
+    """
+    pyproject = PROJECT_ROOT / "pyproject.toml"
+    try:
+        text = pyproject.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        # UnicodeDecodeError is a ValueError, not an OSError; treat an
+        # undecodable file like an unreadable one rather than crashing.
+        return None
+    in_project = False
+    for raw in text.splitlines():
+        line = raw.strip()
+        if line.startswith("[") and line.endswith("]"):
+            # Table header: track whether we are inside ``[project]``.
+            in_project = line == "[project]"
+            continue
+        if not in_project:
+            continue
+        key, sep, value = line.partition("=")
+        # Exact key match so ``version_file = ...`` is not mistaken for it.
+        if sep and key.strip() == "version":
+            # Drop a trailing comment, then surrounding whitespace and quotes.
+            value = value.split("#", 1)[0].strip().strip("\"'")
+            return value or None
+    return None
+
+
+def _check_version_consistency(issues: list[str]) -> None:
+    """Cross-check the ``pyproject.toml`` version against ``hermes_cli.__version__``.
+
+    Resolving a git conflict (reset/merge) can roll back one of the two files
+    but not the other, so ``hermes --version`` reports a stale number while
+    ``pyproject.toml`` is up to date. Report that drift so the user can
+    re-sync. Does nothing when ``hermes_cli.__version__`` cannot be imported
+    or when no pyproject version is available (e.g. an installed wheel).
+    """
+    try:
+        from hermes_cli import __version__ as package_version
+    except Exception:
+        return
+
+    declared_version = _read_pyproject_version()
+    if declared_version is None:
+        # No pyproject version to compare against — nothing to cross-check.
+        return
+
+    if declared_version != package_version:
+        detail = (
+            f"(pyproject.toml {declared_version} != hermes_cli/__init__.py {package_version})"
+        )
+        fix = (
+            "Re-sync version files (e.g. run 'hermes update', or set "
+            "hermes_cli/__init__.py __version__ to match pyproject.toml)"
+        )
+        _fail_and_issue("Version mismatch between source files", detail, fix, issues)
+        return
+
+    check_ok("Version files consistent", f"({package_version})")
+
+
 def _check_s6_supervision(issues: list[str]) -> None:
    """Inside a container under our s6 /init, surface what s6 sees.

@ -510,6 +563,10 @@ def run_doctor(args):
        check_ok("Virtual environment active")
    else:
        check_warn("Not in virtual environment", "(recommended)")
+
+    # Detect drift between pyproject.toml and hermes_cli/__init__.py versions
+    # (a git conflict resolution can silently revert one but not the other).
+    _check_version_consistency(issues)
    
    _section("Required Packages")
    required_packages = [
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@ -2161,9 +2161,37 @@ def _build_service_path_dirs(project_root: Path | None = None) -> list[str]:
    return candidates


+def _stable_service_working_dir() -> str:
+    """Pick a service WorkingDirectory that cannot vanish underneath systemd.
+
+    The gateway has no need to run from the source checkout: ``ExecStart``
+    invokes an absolute python interpreter with ``-m hermes_cli.main``, so
+    module resolution is independent of cwd. Using ``PROJECT_ROOT``
+    (``Path(__file__).parent.parent``) as ``WorkingDirectory`` is actively
+    harmful: a unit generated from a transient checkout — a ``.worktrees/``
+    dir, or a clone that ``hermes update`` later relocates/removes — ends up
+    pointing at a path that rots. systemd then fails at the CHDIR step
+    (``status=200/CHDIR``, "Changing to the requested working directory
+    failed") *before* Python loads, so the on-boot
+    ``refresh_systemd_unit_if_needed()`` self-heal never runs and
+    ``Restart=always`` crash-loops forever on a dead directory.
+
+    ``HERMES_HOME`` is the stable anchor instead: config/state/logs live
+    there, it never moves, and it exists whenever the gateway is meaningfully
+    installed. ``PROJECT_ROOT`` is only the fallback for the case where
+    HERMES_HOME cannot be resolved or is not a directory.
+    """
+    fallback = str(PROJECT_ROOT)
+    try:
+        hermes_home = get_hermes_home()
+        if not hermes_home:
+            return fallback
+        home_path = Path(hermes_home)
+        return str(home_path.resolve()) if home_path.is_dir() else fallback
+    except Exception:
+        # Best-effort: any failure resolving HERMES_HOME uses the fallback.
+        return fallback
+
+
 def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) -> str:
    python_path = get_python_path()
-    working_dir = str(PROJECT_ROOT)
+    working_dir = _stable_service_working_dir()
    detected_venv = _detect_venv_dir()
    venv_dir = str(detected_venv) if detected_venv else str(PROJECT_ROOT / "venv")

@ -2192,7 +2220,10 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
        # (e.g. /root/) to the target user's home so the service can
        # actually access them.
        python_path = _remap_path_for_user(python_path, home_dir)
-        working_dir = _remap_path_for_user(working_dir, home_dir)
+        # Anchor cwd to the target user's HERMES_HOME (stable, always exists)
+        # rather than a remapped source-checkout path that can rot. See
+        # _stable_service_working_dir() for the full rationale.
+        working_dir = str(hermes_home) if hermes_home else _remap_path_for_user(working_dir, home_dir)
        venv_dir = _remap_path_for_user(venv_dir, home_dir)
        path_entries = [_remap_path_for_user(p, home_dir) for p in path_entries]
        path_entries.extend(_build_user_local_paths(Path(home_dir), path_entries))
@ -2804,7 +2835,10 @@ def _launchd_domain() -> str:

 def generate_launchd_plist() -> str:
    python_path = get_python_path()
-    working_dir = str(PROJECT_ROOT)
+    # Stable cwd anchor — never the volatile source checkout. See
+    # _stable_service_working_dir() for the rationale (same rot risk applies
+    # to launchd's WorkingDirectory as to systemd's).
+    working_dir = _stable_service_working_dir()
    hermes_home = str(get_hermes_home().resolve())
    log_dir = get_hermes_home() / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
@ -3960,18 +3994,6 @@ def _setup_whatsapp():
    cmd_whatsapp(argparse.Namespace())


-def _setup_email():
-    """Configure Email via the standard platform setup."""
-    email_platform = next(p for p in _PLATFORMS if p["key"] == "email")
-    _setup_standard_platform(email_platform)
-
-
-def _setup_sms():
-    """Configure SMS (Twilio) via the standard platform setup."""
-    sms_platform = next(p for p in _PLATFORMS if p["key"] == "sms")
-    _setup_standard_platform(sms_platform)
-
-
 def _setup_dingtalk():
    """Configure DingTalk — QR scan (recommended) or manual credential entry."""
    from hermes_cli.setup import (
@ -4144,12 +4166,6 @@ def _setup_wecom():
    print_success("💬 WeCom configured!")


-def _setup_yuanbao():
-    """Configure Yuanbao via the standard platform setup."""
-    yuanbao_platform = next(p for p in _PLATFORMS if p["key"] == "yuanbao")
-    _setup_standard_platform(yuanbao_platform)
-
-
 def _is_service_installed() -> bool:
    """Check if the gateway is installed as a system service."""
    if supports_systemd_services():
--- a/hermes_cli/kanban.py
+++ b/hermes_cli/kanban.py
@ -548,6 +548,11 @@ def build_parser(parent_subparsers: argparse._SubParsersAction) -> argparse.Argu
                            help="Additional task ids to schedule with the same reason (bulk mode)")

    p_unblock = sub.add_parser("unblock", help="Return one or more blocked/scheduled tasks to ready")
+    p_unblock.add_argument(
+        "--reason",
+        default=None,
+        help="Optional reason/note — recorded as a comment before unblocking. Quote multi-word reasons.",
+    )
    p_unblock.add_argument("task_ids", nargs="+")

    p_promote = sub.add_parser(
@ -1978,14 +1983,20 @@ def _cmd_unblock(args: argparse.Namespace) -> int:
    if not ids:
        print("at least one task_id is required", file=sys.stderr)
        return 1
+    reason = getattr(args, "reason", None)
+    if reason is not None:
+        reason = reason.strip() or None
+    author = _profile_author() if reason else None
    failed: list[str] = []
    with kb.connect_closing() as conn:
        for tid in ids:
+            if reason:
+                kb.add_comment(conn, tid, author, f"UNBLOCK: {reason}")
            if not kb.unblock_task(conn, tid):
                failed.append(tid)
                print(f"cannot unblock {tid} (not blocked/scheduled?)", file=sys.stderr)
            else:
-                print(f"Unblocked {tid}")
+                print(f"Unblocked {tid}" + (f": {reason}" if reason else ""))
    return 0 if not failed else 1


@ -2087,12 +2098,52 @@ def _cmd_tail(args: argparse.Namespace) -> int:


 def _cmd_dispatch(args: argparse.Namespace) -> int:
+    # Honour kanban.default_assignee as the fallback for unassigned ready
+    # tasks (#27145), kanban.max_in_progress as the global concurrency cap
+    # (#33488), kanban.max_in_progress_per_profile as the per-profile
+    # cap (#21582), and kanban.max_spawn as the per-tick spawn limit
+    # (#28805). Same semantics as the gateway dispatch path so behavior
+    # matches whether the user runs the CLI directly or relies on the
+    # gateway-embedded dispatcher.
+    try:
+        from hermes_cli.config import load_config
+        _cfg = load_config()
+        _kanban_cfg = _cfg.get("kanban", {}) if isinstance(_cfg, dict) else {}
+        default_assignee = (_kanban_cfg.get("default_assignee") or "").strip() or None
+
+        def _coerce_positive_int(value):
+            if value is None:
+                return None
+            try:
+                ival = int(value)
+            except (TypeError, ValueError):
+                return None
+            return ival if ival >= 1 else None
+
+        max_in_progress_per_profile = _coerce_positive_int(
+            _kanban_cfg.get("max_in_progress_per_profile")
+        )
+        max_in_progress = _coerce_positive_int(_kanban_cfg.get("max_in_progress"))
+        # CLI --max overrides config kanban.max_spawn when both are present;
+        # CLI is the more explicit signal so it wins.
+        cli_max = getattr(args, "max", None)
+        max_spawn = cli_max if cli_max is not None else _coerce_positive_int(
+            _kanban_cfg.get("max_spawn")
+        )
+    except Exception:
+        default_assignee = None
+        max_in_progress_per_profile = None
+        max_in_progress = None
+        max_spawn = getattr(args, "max", None)
    with kb.connect_closing() as conn:
        res = kb.dispatch_once(
            conn,
            dry_run=args.dry_run,
-            max_spawn=args.max,
+            max_spawn=max_spawn,
+            max_in_progress=max_in_progress,
            failure_limit=getattr(args, "failure_limit", kb.DEFAULT_SPAWN_FAILURE_LIMIT),
+            default_assignee=default_assignee,
+            max_in_progress_per_profile=max_in_progress_per_profile,
        )
    if getattr(args, "json", False):
        print(json.dumps({
@ -2108,6 +2159,11 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
            ],
            "skipped_unassigned": res.skipped_unassigned,
            "skipped_nonspawnable": res.skipped_nonspawnable,
+            "skipped_per_profile_capped": [
+                {"task_id": tid, "assignee": who, "current": current}
+                for (tid, who, current) in res.skipped_per_profile_capped
+            ],
+            "auto_assigned_default": res.auto_assigned_default,
        }, indent=2))
        return 0
    print(f"Reclaimed:    {res.reclaimed}")
@ -2128,8 +2184,18 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
    for tid, who, ws in res.spawned:
        tag = " (dry)" if args.dry_run else ""
        print(f"  - {tid}  ->  {who}  @ {ws or '-'}{tag}")
+    if res.auto_assigned_default:
+        print(
+            f"Auto-assigned to kanban.default_assignee={default_assignee!r}: "
+            f"{', '.join(res.auto_assigned_default)}"
+        )
    if res.skipped_unassigned:
        print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}")
+    if res.skipped_per_profile_capped:
+        for tid, who, current in res.skipped_per_profile_capped:
+            print(
+                f"Deferred ({who} at per-profile cap, {current} running): {tid}"
+            )
    if res.skipped_nonspawnable:
        print(
            f"Skipped (non-spawnable assignee — terminal lane, OK): "
--- a/hermes_cli/kanban_db.py
+++ b/hermes_cli/kanban_db.py
@ -84,7 +84,6 @@ import threading
 import logging
 import time
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Iterable, Optional

@ -111,6 +110,16 @@ _IS_WINDOWS = sys.platform == "win32"
 # long single-call MCP workflows.
 DEFAULT_CLAIM_TTL_SECONDS = 15 * 60

+# If a worker's PID is still alive but its ``last_heartbeat_at`` is
+# older than this when ``release_stale_claims`` runs, treat the worker
+# as wedged and reclaim regardless of PID liveness (#29747 gap 3).
+# This catches the logic-loop case where the process is technically
+# running but not making observable progress.  ``_touch_activity``
+# bridges chunk-level liveness into ``last_heartbeat_at`` via #31752,
+# so any genuinely active worker keeps its heartbeat fresh as a side
+# effect of normal API traffic.
+DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS = 60 * 60
+

 def _resolve_claim_ttl_seconds(ttl_seconds: Optional[int] = None) -> int:
    """Return the effective claim TTL, honoring the kanban env override.
@ -387,6 +396,41 @@ def workspaces_root(board: Optional[str] = None) -> Path:
    return board_dir(slug) / "workspaces"


+def attachments_root(board: Optional[str] = None) -> Path:
+    """Locate the directory that holds task file attachments for a board.
+
+    Follows the same per-board anchoring as :func:`worker_logs_dir` and
+    :func:`workspaces_root`, so attachments never leak between projects.
+    Every task gets its own ``<task_id>/`` subdirectory underneath.
+
+    Resolution order:
+
+    1. ``HERMES_KANBAN_ATTACHMENTS_ROOT``, when set to a non-blank value,
+       pins the path directly (tests and unusual deployments).
+    2. The ``default`` board uses ``<root>/kanban/attachments/``.
+    3. Any other board uses ``<root>/kanban/boards/<slug>/attachments/``.
+
+    When ``board`` does not normalize to a slug, the current board is used.
+
+    Workers (which run with full file-tool access) read attached files by
+    the absolute path surfaced in :func:`build_worker_context`. On the local
+    terminal backend — the default for kanban — that path resolves directly.
+    Remote backends (Docker/Modal) need this directory mounted; see the
+    kanban docs.
+    """
+    pinned = os.environ.get("HERMES_KANBAN_ATTACHMENTS_ROOT", "").strip()
+    if pinned:
+        return Path(pinned).expanduser()
+
+    board_slug = _normalize_board_slug(board)
+    if board_slug is None:
+        board_slug = get_current_board()
+
+    base = (
+        kanban_home() / "kanban"
+        if board_slug == DEFAULT_BOARD
+        else board_dir(board_slug)
+    )
+    return base / "attachments"
+
+
+def task_attachments_dir(task_id: str, board: Optional[str] = None) -> Path:
+    """Directory holding one task's attachments: ``attachments_root(board) / task_id``."""
+    root = attachments_root(board=board)
+    return root / task_id
+
+
 def worker_logs_dir(board: Optional[str] = None) -> Path:
    """Return the directory under which per-task worker logs are written.

@ -822,6 +866,20 @@ class Comment:
    created_at: int


+@dataclass
+class Attachment:
+    """In-memory view of a row from the ``task_attachments`` table.
+
+    Field order mirrors the table's column order.
+    """
+
+    # AUTOINCREMENT primary key of the row.
+    id: int
+    # Id of the task this file is attached to.
+    task_id: str
+    # Name recorded for the file (``add_attachment`` stores it stripped).
+    filename: str
+    # Path of the blob on disk; the schema comment describes it as absolute.
+    stored_path: str
+    # NOTE(review): presumably a MIME type — confirm against the uploader.
+    content_type: Optional[str]
+    # Integer, 0 by default in the schema; presumably a byte count — confirm.
+    size: int
+    # Who uploaded the file, when known.
+    uploaded_by: Optional[str]
+    # Unix timestamp in whole seconds (``int(time.time())`` at insert time).
+    created_at: int
+
+
 @dataclass
 class Event:
    id: int
@ -948,6 +1006,23 @@ CREATE TABLE IF NOT EXISTS task_runs (
    error               TEXT
 );

+-- Files attached to a task (PDFs, images, source documents). The blob
+-- lives on disk under ``attachments_root(board)/<task_id>/<stored_name>``;
+-- this row carries metadata + the absolute ``stored_path`` so the
+-- dashboard can list/download and ``build_worker_context`` can surface
+-- the absolute path to the worker (which has full file-tool access). See
+-- #35338.
+CREATE TABLE IF NOT EXISTS task_attachments (
+    id           INTEGER PRIMARY KEY AUTOINCREMENT,
+    task_id      TEXT NOT NULL,
+    filename     TEXT NOT NULL,
+    stored_path  TEXT NOT NULL,
+    content_type TEXT,
+    size         INTEGER NOT NULL DEFAULT 0,
+    uploaded_by  TEXT,
+    created_at   INTEGER NOT NULL
+);
+
 -- Subscription from a gateway source (platform + chat + thread) to a
 -- task. The gateway's kanban-notifier watcher tails task_events and
 -- pushes ``completed`` / ``blocked`` / ``spawn_auto_blocked`` events to
@ -972,6 +1047,7 @@ CREATE INDEX IF NOT EXISTS idx_comments_task         ON task_comments(task_id, c
 CREATE INDEX IF NOT EXISTS idx_events_task           ON task_events(task_id, created_at);
 CREATE INDEX IF NOT EXISTS idx_runs_task             ON task_runs(task_id, started_at);
 CREATE INDEX IF NOT EXISTS idx_runs_status           ON task_runs(status);
+CREATE INDEX IF NOT EXISTS idx_attachments_task      ON task_attachments(task_id, created_at);
 CREATE INDEX IF NOT EXISTS idx_notify_task           ON kanban_notify_subs(task_id);
 """

@ -1628,6 +1704,140 @@ def _migrate_add_optional_columns(conn: sqlite3.Connection) -> None:
            (new, old),
        )

+    _rebuild_drifted_tables(conn)
+
+
+# Legacy DBs defined these tables with a ``TEXT PRIMARY KEY`` id (or, for
+# ``kanban_notify_subs``, a nullable ``TEXT last_event_id``). The current
+# schema uses ``INTEGER PRIMARY KEY AUTOINCREMENT`` / ``INTEGER NOT NULL
+# DEFAULT 0``. ``CREATE TABLE IF NOT EXISTS`` skips existing tables
+# regardless of schema and ``_add_column_if_missing`` only adds columns, so
+# neither can fix a drifted column type — the table must be rebuilt. See
+# #35096.
+#
+# Each entry pairs the canonical CREATE TABLE with the CREATE INDEX
+# statements that DROP TABLE would otherwise take down with it (including
+# ``idx_events_run``, added by the additive pass above). To guard against
+# this list drifting from SCHEMA_SQL, ``test_rebuilt_schema_matches_fresh``
+# asserts a rebuilt legacy DB is byte-identical to a fresh one.
+_REBUILD_SPECS = {
+    "task_events": (
+        "CREATE TABLE task_events ("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        " task_id TEXT NOT NULL, run_id INTEGER, kind TEXT NOT NULL,"
+        " payload TEXT, created_at INTEGER NOT NULL)",
+        (
+            "CREATE INDEX idx_events_task ON task_events(task_id, created_at)",
+            "CREATE INDEX idx_events_run ON task_events(run_id, id)",
+        ),
+    ),
+    "task_comments": (
+        "CREATE TABLE task_comments ("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        " task_id TEXT NOT NULL, author TEXT NOT NULL, body TEXT NOT NULL,"
+        " created_at INTEGER NOT NULL)",
+        ("CREATE INDEX idx_comments_task ON task_comments(task_id, created_at)",),
+    ),
+    "task_runs": (
+        "CREATE TABLE task_runs ("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        " task_id TEXT NOT NULL, profile TEXT, step_key TEXT,"
+        " status TEXT NOT NULL, claim_lock TEXT, claim_expires INTEGER,"
+        " worker_pid INTEGER, max_runtime_seconds INTEGER,"
+        " last_heartbeat_at INTEGER, started_at INTEGER NOT NULL,"
+        " ended_at INTEGER, outcome TEXT, summary TEXT, metadata TEXT,"
+        " error TEXT)",
+        (
+            "CREATE INDEX idx_runs_task ON task_runs(task_id, started_at)",
+            "CREATE INDEX idx_runs_status ON task_runs(status)",
+        ),
+    ),
+    "kanban_notify_subs": (
+        "CREATE TABLE kanban_notify_subs ("
+        " task_id TEXT NOT NULL, platform TEXT NOT NULL, chat_id TEXT NOT NULL,"
+        " thread_id TEXT NOT NULL DEFAULT '', user_id TEXT,"
+        " notifier_profile TEXT, created_at INTEGER NOT NULL,"
+        " last_event_id INTEGER NOT NULL DEFAULT 0,"
+        " PRIMARY KEY (task_id, platform, chat_id, thread_id))",
+        ("CREATE INDEX idx_notify_task ON kanban_notify_subs(task_id)",),
+    ),
+}
+
+
+def _table_has_drifted(conn: sqlite3.Connection, table: str) -> bool:
+    """Report whether ``table`` still has the legacy (pre-AUTOINCREMENT) shape.
+
+    ``kanban_notify_subs`` counts as drifted when its ``last_event_id`` column
+    exists with a declared type other than ``INTEGER``. Any other table counts
+    as drifted when its ``id`` column is not an ``INTEGER`` primary key. A
+    missing table, or a table without the relevant column, is not drifted.
+    """
+    columns = {
+        row["name"]: row
+        for row in conn.execute(f"PRAGMA table_info({table})")
+    }
+    if not columns:
+        # Table absent — nothing to rebuild.
+        return False
+
+    def _declared_integer(column) -> bool:
+        return (column["type"] or "").upper() == "INTEGER"
+
+    if table == "kanban_notify_subs":
+        cursor_col = columns.get("last_event_id")
+        return cursor_col is not None and not _declared_integer(cursor_col)
+
+    # task_events / task_comments / task_runs: id must be INTEGER and a PK.
+    id_col = columns.get("id")
+    if id_col is None:
+        return False
+    return not _declared_integer(id_col) or not id_col["pk"]
+
+
+def _rebuild_drifted_tables(conn: sqlite3.Connection) -> None:
+    """Rebuild any kanban table whose column types drifted from SCHEMA_SQL.
+
+    Old boards crash the gateway notifier (``int(None)`` on a NULL id in
+    ``unseen_events_for_sub``) and never match the ``id > cursor`` filter, so
+    every kanban notification is silently lost (#35096). Each affected table is
+    rebuilt in this order: RENAME old to ``<table>_legacy`` → CREATE new →
+    INSERT the columns both versions share → DROP the legacy table → recreate
+    the table's indexes (DROP TABLE takes them down). The legacy TEXT ids are
+    not copied (they aren't valid integers); AUTOINCREMENT assigns fresh ones.
+    ``last_event_id`` cursors are cast to INTEGER, falling back to 0 when the
+    cast yields NULL, so a subscription whose cursor lands on 0 replays its
+    task's event history once on the first post-migration tick — the safe
+    failure mode for a feature that was already fully broken.
+
+    The whole pass runs in one transaction so an interruption can't leave a
+    table half-renamed, and under ``connect()``'s init locks so nothing races
+    it. Idempotent: a correctly-typed DB skips every table and returns without
+    opening a transaction.
+
+    Raises:
+        Exception: whatever a statement raised, re-raised after a
+            best-effort ROLLBACK.
+    """
+    drifted = [t for t in _REBUILD_SPECS if _table_has_drifted(conn, t)]
+    if not drifted:
+        return
+
+    conn.execute("BEGIN IMMEDIATE")
+    try:
+        for table in drifted:
+            create_sql, index_sqls = _REBUILD_SPECS[table]
+            # Capture the legacy column list before the table is renamed.
+            old_cols = [c["name"] for c in conn.execute(f"PRAGMA table_info({table})")]
+            _log.info("kanban migration: rebuilding %s to match current schema", table)
+            conn.execute(f"ALTER TABLE {table} RENAME TO {table}_legacy")
+            conn.execute(create_sql)
+            new_cols = {c["name"] for c in conn.execute(f"PRAGMA table_info({table})")}
+            if table == "kanban_notify_subs":
+                # Cast the legacy TEXT cursor to INTEGER; NULL / non-numeric → 0.
+                shared = [c for c in old_cols if c in new_cols and c != "last_event_id"]
+                cols_csv = ", ".join(shared)
+                conn.execute(
+                    f"INSERT INTO {table} ({cols_csv}, last_event_id) "
+                    f"SELECT {cols_csv}, COALESCE(CAST(last_event_id AS INTEGER), 0) "
+                    f"FROM {table}_legacy"
+                )
+            else:
+                # Drop the legacy TEXT id; AUTOINCREMENT reassigns it.
+                shared = [c for c in old_cols if c in new_cols and c != "id"]
+                cols_csv = ", ".join(shared)
+                conn.execute(
+                    f"INSERT INTO {table} ({cols_csv}) "
+                    f"SELECT {cols_csv} FROM {table}_legacy"
+                )
+            conn.execute(f"DROP TABLE {table}_legacy")
+            # Indexes are created only after the legacy table is dropped.
+            for index_sql in index_sqls:
+                conn.execute(index_sql)
+        conn.execute("COMMIT")
+    except Exception:
+        try:
+            conn.execute("ROLLBACK")
+        except sqlite3.OperationalError:
+            # A failed ROLLBACK must not mask the original error, which is
+            # re-raised below.
+            pass
+        raise
+

 def _check_file_length_invariant(conn: sqlite3.Connection) -> None:
    """Read the SQLite header page_count and compare against actual file size.
@ -2243,6 +2453,121 @@ def list_comments(conn: sqlite3.Connection, task_id: str) -> list[Comment]:
    ]


+# ---------------------------------------------------------------------------
+# Attachments
+# ---------------------------------------------------------------------------
+
+def add_attachment(
+    conn: sqlite3.Connection,
+    task_id: str,
+    *,
+    filename: str,
+    stored_path: str,
+    content_type: Optional[str] = None,
+    size: int = 0,
+    uploaded_by: Optional[str] = None,
+) -> int:
+    """Persist the metadata row for a file attached to ``task_id``.
+
+    Writing the blob itself is the caller's job and must already have
+    happened at ``stored_path`` (under :func:`task_attachments_dir`).
+    This function only inserts the ``task_attachments`` row and appends
+    an ``attached`` event, both inside one write transaction.
+
+    Returns the id of the inserted row. Raises ``ValueError`` when
+    ``filename`` or ``stored_path`` is empty / whitespace-only, or when
+    no task with ``task_id`` exists.
+    """
+    clean_name = (filename or "").strip()
+    if not clean_name:
+        raise ValueError("attachment filename is required")
+    if not (stored_path and stored_path.strip()):
+        raise ValueError("attachment stored_path is required")
+    created_at = int(time.time())
+    with write_txn(conn):
+        task_row = conn.execute(
+            "SELECT 1 FROM tasks WHERE id = ?", (task_id,)
+        ).fetchone()
+        if not task_row:
+            raise ValueError(f"unknown task {task_id}")
+        # Coerce only after the task check so an unknown task is
+        # reported ahead of a bad ``size`` value.
+        byte_count = int(size)
+        params = (
+            task_id,
+            clean_name,
+            stored_path,  # stored verbatim; only the filename is trimmed
+            content_type,
+            byte_count,
+            uploaded_by,
+            created_at,
+        )
+        inserted = conn.execute(
+            "INSERT INTO task_attachments "
+            "(task_id, filename, stored_path, content_type, size, uploaded_by, created_at) "
+            "VALUES (?, ?, ?, ?, ?, ?, ?)",
+            params,
+        )
+        event_payload = {
+            "filename": clean_name,
+            "size": byte_count,
+            "by": uploaded_by,
+        }
+        _append_event(conn, task_id, "attached", event_payload)
+        return int(inserted.lastrowid or 0)
+
+
+def list_attachments(conn: sqlite3.Connection, task_id: str) -> list[Attachment]:
+    """Return every attachment recorded for ``task_id``, oldest first.
+
+    Rows are ordered by ``created_at`` and then ``id``. A NULL (or zero)
+    ``size`` column is reported as ``0``.
+    """
+    cursor = conn.execute(
+        "SELECT * FROM task_attachments WHERE task_id = ? ORDER BY created_at ASC, id ASC",
+        (task_id,),
+    )
+    found: list[Attachment] = []
+    for rec in cursor.fetchall():
+        found.append(
+            Attachment(
+                id=rec["id"],
+                task_id=rec["task_id"],
+                filename=rec["filename"],
+                stored_path=rec["stored_path"],
+                content_type=rec["content_type"],
+                size=rec["size"] or 0,
+                uploaded_by=rec["uploaded_by"],
+                created_at=rec["created_at"],
+            )
+        )
+    return found
+
+
+def get_attachment(conn: sqlite3.Connection, attachment_id: int) -> Optional[Attachment]:
+    """Look up one attachment by primary key.
+
+    Returns ``None`` when no ``task_attachments`` row has that id. A
+    NULL (or zero) ``size`` column is reported as ``0``.
+    """
+    rec = conn.execute(
+        "SELECT * FROM task_attachments WHERE id = ?", (attachment_id,)
+    ).fetchone()
+    if rec is not None:
+        return Attachment(
+            id=rec["id"],
+            task_id=rec["task_id"],
+            filename=rec["filename"],
+            stored_path=rec["stored_path"],
+            content_type=rec["content_type"],
+            size=rec["size"] or 0,
+            uploaded_by=rec["uploaded_by"],
+            created_at=rec["created_at"],
+        )
+    return None
+
+
+def delete_attachment(conn: sqlite3.Connection, attachment_id: int) -> Optional[Attachment]:
+    """Remove an attachment's metadata row, then its blob on disk.
+
+    Returns the attachment that was removed, or ``None`` when no row
+    has ``attachment_id``. The row delete and the ``attachment_removed``
+    event share one write transaction. The blob is unlinked only after
+    that transaction block exits, and only best-effort: a missing file
+    or any other ``OSError`` is ignored, because the metadata row is the
+    source of truth for whether an attachment "exists".
+    """
+    with write_txn(conn):
+        removed = get_attachment(conn, attachment_id)
+        if removed is not None:
+            conn.execute(
+                "DELETE FROM task_attachments WHERE id = ?", (attachment_id,)
+            )
+            _append_event(
+                conn,
+                removed.task_id,
+                "attachment_removed",
+                {"filename": removed.filename},
+            )
+    if removed is None:
+        return None
+    try:
+        blob = Path(removed.stored_path)
+        if blob.is_file():
+            blob.unlink()
+    except OSError:
+        # Best-effort cleanup: the row is already gone, so a leftover
+        # blob is not reported as a failure.
+        pass
+    return removed
+
+
 def list_events(conn: sqlite3.Connection, task_id: str) -> list[Event]:
    rows = conn.execute(
        "SELECT * FROM task_events WHERE task_id = ? ORDER BY created_at ASC, id ASC",
@ -2448,7 +2773,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool:
    return bool(row) and row["kind"] == "blocked"


-def recompute_ready(conn: sqlite3.Connection) -> int:
+def recompute_ready(
+    conn: sqlite3.Connection, failure_limit: int = None,
+) -> int:
    """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``.

    Returns the number of tasks promoted.  Safe to call inside or outside
@ -2456,17 +2783,34 @@ def recompute_ready(conn: sqlite3.Connection) -> int:

    ``blocked`` tasks are also considered for promotion (so a task
    blocked purely by a parent dependency unblocks itself when the
-    parent completes), *except* when the most recent block event was a
-    worker-initiated ``kanban_block`` — those stay blocked until an
-    explicit ``kanban_unblock`` (#28712).  Without that guard, a
-    ``review-required`` handoff would auto-respawn, the fresh worker
-    would find nothing to do, exit cleanly, get recorded as a protocol
-    violation, and the cycle would repeat indefinitely.
+    parent completes), *except* in two cases:
+
+    1. The most recent block event was a worker-initiated
+       ``kanban_block`` — those stay blocked until an explicit
+       ``kanban_unblock`` (#28712).
+
+    2. The task's ``consecutive_failures`` has reached the effective
+       failure limit.  This prevents infinite retry loops when a task
+       repeatedly exhausts its iteration budget: without this guard the
+       counter would reset on every recovery cycle and the circuit
+       breaker could never trip (#35072).
+
+    The effective failure limit resolves in the same order as the
+    circuit breaker in ``_record_task_failure`` so the two never
+    disagree about when a task is permanently blocked:
+
+      1. per-task ``max_retries`` if set
+      2. caller-supplied ``failure_limit`` (the dispatcher passes the
+         ``kanban.failure_limit`` config value through ``dispatch_once``)
+      3. ``DEFAULT_FAILURE_LIMIT``
    """
+    if failure_limit is None:
+        failure_limit = DEFAULT_FAILURE_LIMIT
    promoted = 0
    with write_txn(conn):
        todo_rows = conn.execute(
-            "SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')"
+            "SELECT id, status, consecutive_failures, max_retries "
+            "FROM tasks WHERE status IN ('todo', 'blocked')"
        ).fetchall()
        for row in todo_rows:
            task_id = row["id"]
@ -2484,13 +2828,25 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
                (task_id,),
            ).fetchall()
            if all(p["status"] in ("done", "archived") for p in parents):
-                # Blocked tasks also get their failure counters reset —
-                # this is effectively an auto-unblock (circuit-breaker
-                # recovery; worker-initiated blocks are skipped above).
                if cur_status == "blocked":
+                    # Don't auto-recover tasks that have hit the
+                    # circuit-breaker failure limit.  Without this
+                    # guard, a task that repeatedly exhausts its
+                    # iteration budget would cycle forever:
+                    # block → auto-recover → respawn → budget
+                    # exhausted → block → …  The counter must also
+                    # be preserved so the breaker can accumulate
+                    # across recovery cycles.
+                    failures = int(row["consecutive_failures"] or 0)
+                    task_limit = row["max_retries"]
+                    effective_limit = (
+                        int(task_limit) if task_limit is not None
+                        else int(failure_limit)
+                    )
+                    if failures >= effective_limit:
+                        continue
                    conn.execute(
-                        "UPDATE tasks SET status = 'ready', "
-                        "consecutive_failures = 0, last_failure_error = NULL "
+                        "UPDATE tasks SET status = 'ready' "
                        "WHERE id = ? AND status = 'blocked'",
                        (task_id,),
                    )
@ -2741,9 +3097,19 @@ def release_stale_claims(
    then-immediately-reclaim loop seen on slow models that spend longer
    than ``DEFAULT_CLAIM_TTL_SECONDS`` inside a single tool-free LLM
    call (#23025): no tool calls means no ``kanban_heartbeat``, even
-    though the subprocess is healthy. ``enforce_max_runtime`` and
-    ``detect_crashed_workers`` remain the upper bounds for genuinely
-    wedged or dead workers.
+    though the subprocess is healthy.
+
+    Backstop (#29747 gap 3): if the worker's PID is still alive but its
+    ``last_heartbeat_at`` is stale by more than
+    ``DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS`` (1h), the worker has
+    been making no observable progress and we reclaim anyway — even if
+    ``_pid_alive`` is still true. This catches the wedged-in-a-logic-loop
+    case where the process is technically running but accomplishing
+    nothing. ``_touch_activity`` (run_agent.py) bridges chunk-level
+    liveness into ``last_heartbeat_at`` via #31752, so any genuinely
+    active worker keeps its heartbeat fresh as a side effect of normal
+    API traffic. ``enforce_max_runtime`` and ``detect_crashed_workers``
+    remain the upper bounds for genuinely wedged or dead workers.

    Returns the number of stale claims actually reclaimed (live-pid
    extensions don't count). Safe to call often.
@ -2761,7 +3127,21 @@ def release_stale_claims(
    for row in stale:
        lock = row["claim_lock"] or ""
        host_local = lock.startswith(host_prefix)
-        if host_local and row["worker_pid"] and _pid_alive(row["worker_pid"]):
+        hb = row["last_heartbeat_at"]
+        # Heartbeat staleness backstop: if we have a heartbeat at all
+        # and it's older than the max-stale threshold, the worker is
+        # not making observable progress.  Reclaim instead of extending,
+        # even if the PID is still alive (it's likely in a logic loop).
+        heartbeat_stale = (
+            hb is not None
+            and (now - int(hb)) > DEFAULT_CLAIM_HEARTBEAT_MAX_STALE_SECONDS
+        )
+        if (
+            host_local
+            and row["worker_pid"]
+            and _pid_alive(row["worker_pid"])
+            and not heartbeat_stale
+        ):
            new_expires = now + _resolve_claim_ttl_seconds()
            with write_txn(conn):
                cur = conn.execute(
@ -2830,6 +3210,7 @@ def release_stale_claims(
                ),
                "now": now,
                "host_local": host_local,
+                "heartbeat_stale": bool(heartbeat_stale),
            }
            payload.update(termination)
            _append_event(
@ -4289,6 +4670,12 @@ class DispatchResult:
    skipped_unassigned: list[str] = field(default_factory=list)
    """Ready task ids skipped because they have no assignee at all.
    Operator-actionable — usually a misfiled task waiting for routing."""
+    auto_assigned_default: list[str] = field(default_factory=list)
+    """Task ids that were unassigned in the DB and had
+    ``kanban.default_assignee`` applied this tick before spawning (#27145).
+    Surfaces the auto-assignment to telemetry / CLI / dashboard so the
+    operator can see when the dispatcher is acting on the fallback rule
+    rather than on explicit per-task assignments."""
    skipped_nonspawnable: list[str] = field(default_factory=list)
    """Ready task ids skipped because their assignee names a control-plane
    lane (a Claude Code terminal like ``orion-cc``) rather than a Hermes
@ -4296,6 +4683,14 @@ class DispatchResult:
    operator-actionable failure. Tracked separately so health telemetry
    can distinguish "real stuck" (nothing spawned but spawnable work
    available) from "correctly idle" (nothing spawnable in the queue)."""
+    skipped_per_profile_capped: list[tuple[str, str, int]] = field(default_factory=list)
+    """Tasks deferred this tick because their assignee is already at
+    ``kanban.max_in_progress_per_profile`` (#21582). Each entry is
+    ``(task_id, assignee, current_running_count)``. NOT an
+    operator-actionable failure — the task will be picked up on a
+    subsequent tick when the assignee has capacity. Separate bucket so
+    telemetry / dashboards can show "this profile is busy" vs
+    "task is genuinely stuck"."""
    crashed: list[str] = field(default_factory=list)
    """Task ids reclaimed because their worker PID disappeared."""
    auto_blocked: list[str] = field(default_factory=list)
@ -4729,7 +5124,6 @@ def detect_stale_running(
    if stale_timeout_seconds <= 0:
        return []

-    import signal as _signal_mod

    now = int(time.time())
    host_prefix = f"{_claimer_id().split(':', 1)[0]}:"
@ -4818,21 +5212,6 @@ def detect_stale_running(
    return reclaimed


-def set_max_runtime(
-    conn: sqlite3.Connection,
-    task_id: str,
-    seconds: Optional[int],
-) -> bool:
-    """Set or clear the per-task max_runtime_seconds. Returns True on
-    success."""
-    with write_txn(conn):
-        cur = conn.execute(
-            "UPDATE tasks SET max_runtime_seconds = ? WHERE id = ?",
-            (int(seconds) if seconds is not None else None, task_id),
-        )
-    return cur.rowcount == 1
-
-
 def _error_fingerprint(error_text: str) -> str:
    """Normalize an error message for grouping identical failures.

@ -5342,6 +5721,8 @@ def dispatch_once(
    failure_limit: int = DEFAULT_SPAWN_FAILURE_LIMIT,
    stale_timeout_seconds: int = 0,
    board: Optional[str] = None,
+    default_assignee: Optional[str] = None,
+    max_in_progress_per_profile: Optional[int] = None,
 ) -> DispatchResult:
    """Run one dispatcher tick.

@ -5390,7 +5771,7 @@ def dispatch_once(
    if _crash_auto_blocked:
        result.auto_blocked.extend(_crash_auto_blocked)
    result.timed_out = enforce_max_runtime(conn)
-    result.promoted = recompute_ready(conn)
+    result.promoted = recompute_ready(conn, failure_limit=failure_limit)

    # Count tasks already running so max_spawn enforces concurrency rather
    # than a per-tick spawn budget. See the docstring above for the full
@ -5427,12 +5808,89 @@ def dispatch_once(
        if max_spawn is None or max_spawn > remaining:
            max_spawn = remaining
    spawned = 0
+    # Per-profile concurrency cap (#21582): when set, track how many
+    # workers each assignee already has in flight, and refuse to spawn
+    # when this would push that assignee past the cap. Prevents
+    # fan-out workloads from melting a single profile's local model /
+    # API quota / browser pool while leaving other profiles idle.
+    # Tasks blocked this way go to skipped_per_profile_capped (not
+    # skipped_unassigned — the operator-actionable signal is different:
+    # "this profile is busy, try again later" not "this needs routing").
+    _per_profile_cap = max_in_progress_per_profile if (
+        isinstance(max_in_progress_per_profile, int)
+        and max_in_progress_per_profile > 0
+    ) else None
+    _per_profile_running: dict[str, int] = {}
+    if _per_profile_cap is not None:
+        for prow in conn.execute(
+            "SELECT assignee, COUNT(*) AS n FROM tasks "
+            "WHERE status = 'running' AND assignee IS NOT NULL "
+            "GROUP BY assignee"
+        ):
+            _per_profile_running[prow["assignee"]] = int(prow["n"])
+    # Normalize default_assignee once: empty/whitespace string → None so the
+    # rest of the loop can use ``if default_assignee:`` as a single check.
+    # We also resolve profile_exists once here for the same reason.
+    _default_assignee = (default_assignee or "").strip() or None
+    _default_assignee_resolved = False
+    if _default_assignee:
+        try:
+            from hermes_cli.profiles import profile_exists as _pe
+            _default_assignee_resolved = bool(_pe(_default_assignee))
+        except Exception:
+            # Profiles module not importable (test stubs, exotic envs).
+            # Trust the operator's config and try the assignment; the
+            # downstream profile_exists check on the assigned row will
+            # bucket it as nonspawnable if the profile genuinely isn't
+            # there, with the existing diagnostic.
+            _default_assignee_resolved = True
    for row in ready_rows:
        if max_spawn is not None and running_count + spawned >= max_spawn:
            break
-        if not row["assignee"]:
-            result.skipped_unassigned.append(row["id"])
-            continue
+        row_assignee = row["assignee"]
+        if not row_assignee:
+            # Honour kanban.default_assignee: when the dispatcher hits an
+            # unassigned ready task and an operator-configured fallback
+            # exists, persist the assignment and proceed. This removes the
+            # dashboard footgun where a task created without an assignee
+            # parks in 'ready' forever even though the operator's intent
+            # ("default") was perfectly clear (#27145). Mutating the row
+            # (not just the in-memory view) keeps diagnostics and the
+            # board state consistent: the task is now legitimately owned
+            # by ``kanban.default_assignee``, not "unassigned but secretly
+            # routed".
+            if _default_assignee and _default_assignee_resolved:
+                # Dry-run: show what WOULD happen (auto-assign + spawn) without
+                # mutating the DB. Real run: mutate the row + emit the
+                # 'assigned' event so the board state matches what just happened.
+                if not dry_run:
+                    try:
+                        with write_txn(conn):
+                            conn.execute(
+                                "UPDATE tasks SET assignee = ? WHERE id = ? "
+                                "AND (assignee IS NULL OR assignee = '')",
+                                (_default_assignee, row["id"]),
+                            )
+                            _append_event(
+                                conn, row["id"], "assigned",
+                                {
+                                    "assignee": _default_assignee,
+                                    "source": "kanban.default_assignee",
+                                },
+                            )
+                    except Exception:
+                        _log.debug(
+                            "kanban dispatch: failed to apply default_assignee=%r "
+                            "to task %s",
+                            _default_assignee, row["id"], exc_info=True,
+                        )
+                        result.skipped_unassigned.append(row["id"])
+                        continue
+                row_assignee = _default_assignee
+                result.auto_assigned_default.append(row["id"])
+            else:
+                result.skipped_unassigned.append(row["id"])
+                continue
        # Skip ready tasks whose assignee is not a real Hermes profile.
        # `_default_spawn` invokes ``hermes -p <assignee>`` which fails
        # with "Profile 'X' does not exist" when the assignee names a
@ -5447,7 +5905,7 @@ def dispatch_once(
            from hermes_cli.profiles import profile_exists  # local import: avoids cycle
        except Exception:
            profile_exists = None  # type: ignore[assignment]
-        if profile_exists is not None and not profile_exists(row["assignee"]):
+        if profile_exists is not None and not profile_exists(row_assignee):
            # Bucket separately from skipped_unassigned: the operator
            # cannot fix this by assigning a profile (the assignee IS the
            # intended owner — a terminal lane). Health telemetry uses
@ -5456,6 +5914,19 @@ def dispatch_once(
            # of human-pulled work.
            result.skipped_nonspawnable.append(row["id"])
            continue
+        # Per-profile concurrency cap (#21582): even if there's global
+        # headroom, refuse to spawn for an assignee that's already at
+        # its in-flight cap. Prevents one profile's local model / API
+        # quota / browser pool from being overwhelmed by a fan-out
+        # while the global max_in_progress / max_spawn caps still allow
+        # work on OTHER profiles.
+        if _per_profile_cap is not None:
+            current = _per_profile_running.get(row_assignee, 0)
+            if current >= _per_profile_cap:
+                result.skipped_per_profile_capped.append(
+                    (row["id"], row_assignee, current)
+                )
+                continue
        # Respawn guard: refuse to re-spawn when useful work is already
        # in-flight/recent, or when the last failure is a deterministic
        # blocker (quota / auth). The guard defers the spawn this tick so
@ -5478,7 +5949,15 @@ def dispatch_once(
                    )
            continue
        if dry_run:
-            result.spawned.append((row["id"], row["assignee"], ""))
+            result.spawned.append((row["id"], row_assignee, ""))
+            # Increment per-profile counter even in dry_run so the cap
+            # check sees the would-be spawn on subsequent iterations.
+            # Without this, dry_run reports every task as spawnable and
+            # under-reports the capped subset (#21582).
+            if _per_profile_cap is not None and row_assignee:
+                _per_profile_running[row_assignee] = (
+                    _per_profile_running.get(row_assignee, 0) + 1
+                )
            continue
        claimed = claim_task(conn, row["id"], ttl_seconds=ttl_seconds)
        if claimed is None:
@ -5521,6 +6000,13 @@ def dispatch_once(
            # complete_task).
            result.spawned.append((claimed.id, claimed.assignee or "", str(workspace)))
            spawned += 1
+            # Track the new in-flight count for this profile so later
+            # iterations in this same tick respect the per-profile cap
+            # (#21582). Subsequent ticks re-query from the DB.
+            if _per_profile_cap is not None and claimed.assignee:
+                _per_profile_running[claimed.assignee] = (
+                    _per_profile_running.get(claimed.assignee, 0) + 1
+                )
        except Exception as exc:
            auto = _record_spawn_failure(
                conn, claimed.id, str(exc),
@ -6161,6 +6647,25 @@ def build_worker_context(conn: sqlite3.Connection, task_id: str) -> str:
        lines.append(_cap(task.body, _CTX_MAX_BODY_BYTES))
        lines.append("")

+    # Attachments — files uploaded to this task (PDFs, source docs,
+    # images). Surface the absolute on-disk path so the worker, which has
+    # full file-tool access, can read them directly (read_file, terminal
+    # `pdftotext`, etc.). On the local terminal backend the path resolves
+    # as-is; remote backends need the kanban attachments dir mounted.
+    attachments = list_attachments(conn, task_id)
+    if attachments:
+        lines.append("## Attachments")
+        lines.append(
+            "Files attached to this task. Read them with the file/terminal "
+            "tools at the absolute paths below:"
+        )
+        for att in attachments:
+            size_kb = max(1, (att.size + 1023) // 1024) if att.size else 0
+            size_str = f", {size_kb} KB" if size_kb else ""
+            ctype = f", {att.content_type}" if att.content_type else ""
+            lines.append(f"- `{att.filename}`{ctype}{size_str} → `{att.stored_path}`")
+        lines.append("")
+
    # Prior attempts — show closed runs so a retrying worker sees the
    # history. Skip the currently-active run (that's this worker).
    # Cap at _CTX_MAX_PRIOR_ATTEMPTS most-recent closed runs; older
@ -6362,7 +6867,7 @@ def _to_epoch(val) -> Optional[int]:
        pass
    # ISO-8601 fallback (e.g. '2026-05-10T15:00:00Z')
    try:
-        from datetime import datetime, timezone
+        from datetime import datetime
        dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
        return int(dt.timestamp())
    except (ValueError, OSError):
@ -6813,16 +7318,6 @@ def get_run(conn: sqlite3.Connection, run_id: int) -> Optional[Run]:
    return Run.from_row(row) if row else None


-def active_run(conn: sqlite3.Connection, task_id: str) -> Optional[Run]:
-    """Return the currently-open run for ``task_id`` (``ended_at IS NULL``)."""
-    row = conn.execute(
-        "SELECT * FROM task_runs WHERE task_id = ? AND ended_at IS NULL "
-        "ORDER BY started_at DESC LIMIT 1",
-        (task_id,),
-    ).fetchone()
-    return Run.from_row(row) if row else None
-
-
 def latest_run(conn: sqlite3.Connection, task_id: str) -> Optional[Run]:
    """Return the most recent run regardless of outcome (active or closed)."""
    row = conn.execute(
--- a/hermes_cli/kanban_diagnostics.py
+++ b/hermes_cli/kanban_diagnostics.py
@ -191,23 +191,6 @@ def _active_hallucination_events(
        elif k == kind:
            active.append(ev)
    return active
-
-
-def _latest_clean_event_ts(events: Iterable[Any]) -> int:
-    """Timestamp of the most recent clean completion / edit event.
-
-    Kept for general "has this task ever been successfully completed"
-    lookups; hallucination rules use ``_active_hallucination_events``
-    instead because they need strict ordering.
-    """
-    latest = 0
-    for ev in events:
-        if _event_kind(ev) in {"completed", "edited"}:
-            t = _event_ts(ev)
-            latest = max(latest, t)
-    return latest
-
-
 # Standard always-available actions. Every diagnostic can offer these as
 # fallbacks regardless of kind — they're the two baseline recovery
 # primitives the kernel supports.
@ -791,6 +774,83 @@ def _rule_stuck_in_blocked(task, events, runs, now, cfg) -> list[Diagnostic]:
    )]


+def _rule_block_unblock_cycling(task, events, runs, now, cfg) -> list[Diagnostic]:
+    """Task has cycled through blocked → unblocked many times — the
+    ``unblock`` is not fixing the underlying problem and the worker
+    keeps re-blocking for substantially the same reason.
+
+    ``_rule_stuck_in_blocked`` resets its timer on any ``commented`` /
+    ``unblocked`` event, so a task that cycles every few minutes is
+    invisible to it regardless of how many times it cycles (#29747
+    gap 1). This rule complements that one by counting block→unblock
+    cycles in a sliding window.
+
+    Threshold: cfg["block_cycle_threshold"] (default 3) cycles within
+    cfg["block_cycle_window_seconds"] (default 24h).
+
+    A window value that is missing, null, unparseable, NaN or +inf
+    falls back to the 24h default instead of raising, so one bad config
+    entry cannot break diagnostics for every task.
+
+    Returns an empty list when fewer than ``threshold`` cycles were
+    seen, otherwise a single ``block_unblock_cycling`` warning.
+    ``runs`` is accepted for the shared rule signature but not used.
+    """
+    threshold = _positive_int(cfg.get("block_cycle_threshold"), 3)
+    default_window = float(24 * 3600)
+    try:
+        window_seconds = float(
+            cfg.get("block_cycle_window_seconds", default_window)
+        )
+    except (TypeError, ValueError):
+        # Key present but null / not numeric (e.g. ``null`` in YAML).
+        window_seconds = default_window
+    if not (window_seconds < float("inf")):
+        # NaN or +inf would crash the int() conversions used for the
+        # title and ``data`` below; treat them like an unset value.
+        window_seconds = default_window
+    cycle_cutoff = now - window_seconds
+
+    # Walk events chronologically (arrival order — callers pre-sort by
+    # id, which is the canonical chronological order; ``created_at``
+    # alone is insufficient because multiple events can share the same
+    # second).  Count "blocked after unblocked" transitions: every time
+    # a blocked event follows at least one unblocked event since the
+    # last cycle was counted, that's a new cycle.
+    cycles = 0
+    seen_unblock_since_last_cycle = False
+    initial_blocked_ts = 0
+    last_cycle_blocked_ts = 0
+    for ev in events:
+        ts = _event_ts(ev)
+        if ts < cycle_cutoff:
+            # Outside the sliding window — ignored entirely.
+            continue
+        kind = _event_kind(ev)
+        if kind == "blocked":
+            if initial_blocked_ts == 0:
+                # First in-window block; reported as ``first_seen_at``.
+                initial_blocked_ts = ts
+            if seen_unblock_since_last_cycle:
+                cycles += 1
+                last_cycle_blocked_ts = ts
+                seen_unblock_since_last_cycle = False
+        elif kind == "unblocked":
+            seen_unblock_since_last_cycle = True
+
+    if cycles < threshold:
+        return []
+
+    task_id = _task_field(task, "id")
+    actions: list[DiagnosticAction] = []
+    if task_id:
+        actions.append(DiagnosticAction(
+            kind="cli_hint",
+            label=f"Check block reasons: hermes kanban events {task_id}",
+            payload={"command": f"hermes kanban events {task_id}"},
+            suggested=True,
+        ))
+    return [Diagnostic(
+        kind="block_unblock_cycling",
+        severity="warning",
+        title=f"Task block→unblock cycled {cycles}x in {int(window_seconds/3600)}h",
+        detail=(
+            f"This task has been blocked {cycles} times after being "
+            "unblocked, suggesting the unblock is not addressing the "
+            "root cause and the worker keeps hitting the same wall. "
+            "Review the block reasons in the event history; a different "
+            "intervention (reassign, change scope, archive) may be needed."
+        ),
+        actions=actions,
+        first_seen_at=int(initial_blocked_ts) if initial_blocked_ts else int(now),
+        last_seen_at=int(last_cycle_blocked_ts) if last_cycle_blocked_ts else int(now),
+        count=cycles,
+        data={
+            "cycles": cycles,
+            "window_seconds": int(window_seconds),
+        },
+    )]
+
+
 def _rule_stranded_in_ready(task, events, runs, now, cfg) -> list[Diagnostic]:
    """Task has been in ``ready`` status for too long without any worker
    claiming it.
@ -923,6 +983,7 @@ _RULES: list[RuleFn] = [
    _rule_repeated_failures,
    _rule_repeated_crashes,
    _rule_stuck_in_blocked,
+    _rule_block_unblock_cycling,
    _rule_stranded_in_ready,
 ]

@ -936,6 +997,7 @@ DIAGNOSTIC_KINDS = (
    "repeated_failures",
    "repeated_crashes",
    "stuck_in_blocked",
+    "block_unblock_cycling",
    "stranded_in_ready",
 )

@ -1043,16 +1105,3 @@ def compute_task_diagnostics(
        )
    )
    return out
-
-
-def severity_of_highest(diagnostics: Iterable[Diagnostic]) -> Optional[str]:
-    """Highest severity present in the list, or None if empty. Useful
-    for card badges that need a single color."""
-    highest_idx = -1
-    highest = None
-    for d in diagnostics:
-        idx = SEVERITY_ORDER.index(d.severity) if d.severity in SEVERITY_ORDER else -1
-        if idx > highest_idx:
-            highest_idx = idx
-            highest = d.severity
-    return highest
--- a/hermes_cli/kanban_swarm.py
+++ b/hermes_cli/kanban_swarm.py
@ -209,7 +209,7 @@ def create_swarm(
        priority=priority,
        workspace_kind=workspace_kind,
        workspace_path=workspace_path,
-        skills=["avoid-ai-writing"],
+        skills=["humanizer"],
    )

    created = SwarmCreated(root, worker_ids, verifier, synthesizer)
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -65,6 +65,46 @@ import os
 import sys


+def _set_process_title() -> None:
+    """Set the process title to 'hermes' so tools like 'ps', 'top', and
+    'htop' show the app name instead of 'python3.xx'.
+
+    Purely cosmetic — non-fatal on any platform. Every failure mode
+    (missing module, missing shared library, a call that raises) is
+    swallowed so this can never prevent startup.
+
+    Strategy (try in order):
+      1. ``setproctitle`` (opt-in dep — installed via ``hermes tools`` or
+         ``pip install setproctitle``, or bundled in a future release).
+      2. ctypes ``prctl(PR_SET_NAME)`` (Linux only, 15-char limit).
+      3. ctypes ``pthread_setname_np`` (macOS only, kernel thread name —
+         changes lldb/top but not ``ps aux``).
+      4. No-op on Windows (the .exe name is already ``hermes.exe``).
+    """
+    # Strategy 1: setproctitle (best — works on macOS, Linux, BSD)
+    try:
+        import setproctitle  # type: ignore[import-untyped]
+
+        setproctitle.setproctitle("hermes")
+        return
+    except Exception:
+        # ImportError when the package is absent; anything else means
+        # the installed copy is unusable. Either way, fall through to
+        # the ctypes fallback rather than failing startup.
+        pass
+
+    # Strategy 2/3: platform-specific ctypes fallback. The imports live
+    # inside the try because ``ctypes`` itself can be unimportable on
+    # Python builds compiled without ``_ctypes``.
+    try:
+        import ctypes
+        import platform
+
+        system = platform.system()
+        if system == "Linux":
+            libc = ctypes.CDLL("libc.so.6", use_errno=True)
+            libc.prctl(15, b"hermes", 0, 0, 0)  # PR_SET_NAME = 15
+        elif system == "Darwin":
+            libc = ctypes.CDLL("libc.dylib", use_errno=True)
+            libc.pthread_setname_np(b"hermes")
+        # Windows: the .exe name is already ``hermes.exe`` — nothing to do.
+    except Exception:
+        pass
+
+
 # Mouse-tracking residue suppression — runs BEFORE every other import on the
 # TUI hot path so the terminal stops emitting SGR/X10 mouse reports while the
 # Python launcher is still doing imports (≈100–300ms in cooked + echo mode,
@ -2354,7 +2394,12 @@ def select_provider_and_model(args=None):
    if active == "openrouter" and get_env_value("OPENAI_BASE_URL"):
        active = "custom"

-    from hermes_cli.models import CANONICAL_PROVIDERS, _PROVIDER_LABELS
+    from hermes_cli.models import (
+        CANONICAL_PROVIDERS,
+        _PROVIDER_LABELS,
+        group_providers,
+        provider_group_for_slug,
+    )

    provider_labels = dict(_PROVIDER_LABELS)  # derive from canonical list
    if active and active in _custom_provider_map:
@ -2367,8 +2412,43 @@ def select_provider_and_model(args=None):
    print(f"  Active provider:  {active_label}")
    print()

-    # Step 1: Provider selection — flat list from CANONICAL_PROVIDERS
-    all_providers = [(p.slug, p.tui_desc) for p in CANONICAL_PROVIDERS]
+    # Step 1: Provider selection.
+    #
+    # Canonical providers are folded into top-level groups (display only — see
+    # PROVIDER_GROUPS in hermes_cli/models.py). A multi-member group shows one
+    # row ("Kimi / Moonshot ▸"); picking it opens a member sub-picker that
+    # resolves back to a concrete slug, so the dispatch chain below is
+    # unchanged. Custom providers and the trailing actions stay flat.
+    canonical_descs = {p.slug: p.tui_desc for p in CANONICAL_PROVIDERS}
+    grouped_rows = group_providers([p.slug for p in CANONICAL_PROVIDERS])
+
+    # The group/slug that should be pre-selected: the active provider's group
+    # if it's grouped, otherwise the active slug itself.
+    active_group = provider_group_for_slug(active) if active else ""
+
+    # ordered entries: (key, label, members)
+    #   members == [] → leaf row, key is a provider slug / action
+    #   members != [] → group row, key is "group:<gid>"
+    ordered: list[tuple[str, str, list[str]]] = []
+    default_idx = 0
+    for row in grouped_rows:
+        if row["kind"] == "group":
+            gid = row["group_id"]
+            label = f"{row['label']} ▸"
+            key = f"group:{gid}"
+            is_active = bool(active_group) and gid == active_group
+            members = row["members"]
+        else:
+            slug = row["slug"]
+            label = canonical_descs.get(slug, provider_labels.get(slug, slug))
+            key = slug
+            is_active = bool(active) and slug == active
+            members = []
+        if is_active:
+            ordered.append((key, f"{label}  ← currently active", members))
+            default_idx = len(ordered) - 1
+        else:
+            ordered.append((key, label, members))

    for key, provider_info in _custom_provider_map.items():
        name = provider_info["name"]
@ -2376,36 +2456,49 @@ def select_provider_and_model(args=None):
        short_url = base_url.replace("https://", "").replace("http://", "").rstrip("/")
        saved_model = provider_info.get("model", "")
        model_hint = f" — {saved_model}" if saved_model else ""
-        all_providers.append((key, f"{name} ({short_url}){model_hint}"))
-
-    # Build the menu
-    ordered = []
-    default_idx = 0
-    for key, label in all_providers:
+        label = f"{name} ({short_url}){model_hint}"
        if active and key == active:
-            ordered.append((key, f"{label}  ← currently active"))
+            ordered.append((key, f"{label}  ← currently active", []))
            default_idx = len(ordered) - 1
        else:
-            ordered.append((key, label))
+            ordered.append((key, label, []))

-    ordered.append(("custom", "Custom endpoint (enter URL manually)"))
+    ordered.append(("custom", "Custom endpoint (enter URL manually)", []))
    _has_saved_custom_list = isinstance(config.get("custom_providers"), list) and bool(
        config.get("custom_providers")
    )
    if _has_saved_custom_list:
-        ordered.append(("remove-custom", "Remove a saved custom provider"))
-    ordered.append(("aux-config", "Configure auxiliary models..."))
-    ordered.append(("cancel", "Leave unchanged"))
+        ordered.append(("remove-custom", "Remove a saved custom provider", []))
+    ordered.append(("aux-config", "Configure auxiliary models...", []))
+    ordered.append(("cancel", "Leave unchanged", []))

    provider_idx = _prompt_provider_choice(
-        [label for _, label in ordered],
+        [label for _, label, _ in ordered],
        default=default_idx,
    )
    if provider_idx is None or ordered[provider_idx][0] == "cancel":
        print("No change.")
        return

-    selected_provider = ordered[provider_idx][0]
+    selected_key = ordered[provider_idx][0]
+    selected_members = ordered[provider_idx][2]
+
+    # Group row → drill into a member sub-picker. Default to the active member
+    # if the active provider lives in this group.
+    if selected_members:
+        member_default = 0
+        if active in selected_members:
+            member_default = selected_members.index(active)
+        member_labels = [
+            canonical_descs.get(m, provider_labels.get(m, m)) for m in selected_members
+        ]
+        member_idx = _prompt_provider_choice(member_labels, default=member_default)
+        if member_idx is None:
+            print("No change.")
+            return
+        selected_provider = selected_members[member_idx]
+    else:
+        selected_provider = selected_key

    if selected_provider == "aux-config":
        _aux_config_menu()
@ -3004,7 +3097,6 @@ def _model_flow_nous(config, current_model="", args=None):
    """Nous Portal provider: ensure logged in, then pick model."""
    from hermes_cli.auth import (
        get_provider_auth_state,
-        NOUS_INFERENCE_AUTH_MODE_LEGACY,
        _prompt_model_selection,
        _save_model_choice,
        _update_config_for_provider,
@ -3072,7 +3164,7 @@ def _model_flow_nous(config, current_model="", args=None):

    # Verify credentials are still valid (catches expired sessions early)
    try:
-        creds = resolve_nous_runtime_credentials(min_key_ttl_seconds=5 * 60)
+        creds = resolve_nous_runtime_credentials()
    except Exception as exc:
        relogin = isinstance(exc, AuthError) and exc.relogin_required
        msg = format_auth_error(exc) if isinstance(exc, AuthError) else str(exc)
@ -3106,14 +3198,13 @@ def _model_flow_nous(config, current_model="", args=None):
    if not free_tier:
        try:
            refreshed_creds = resolve_nous_runtime_credentials(
-                min_key_ttl_seconds=5 * 60,
-                inference_auth_mode=NOUS_INFERENCE_AUTH_MODE_LEGACY,
+                force_refresh=True,
            )
            if refreshed_creds:
                creds = refreshed_creds
        except Exception:
            # Runtime inference has its own paid-entitlement recovery path; do
-            # not block model selection if this opportunistic remint fails.
+            # not block model selection if this opportunistic refresh fails.
            pass

    # Resolve portal URL early — needed both for upgrade links and for the
@ -5593,7 +5684,6 @@ def _model_flow_bedrock(config, current_model=""):
 def _model_flow_api_key_provider(config, provider_id, current_model=""):
    """Generic flow for API-key providers (z.ai, MiniMax, OpenCode, etc.)."""
    from hermes_cli.auth import (
-        LMSTUDIO_NOAUTH_PLACEHOLDER,
        PROVIDER_REGISTRY,
        _prompt_model_selection,
        _save_model_choice,
@ -6163,13 +6253,6 @@ def cmd_webhook(args):
    webhook_command(args)


-def cmd_portal(args):
-    """Nous Portal status and Tool Gateway routing surface."""
-    from hermes_cli.portal_cli import portal_command
-
-    return portal_command(args)
-
-
 def cmd_slack(args):
    """Slack integration helpers.

@ -7850,39 +7933,6 @@ def _detect_concurrent_hermes_instances(
    except Exception:
        return []

-    # Build a set of PIDs to exclude: the Python process itself plus its
-    # entire parent chain. On Windows the setuptools-generated hermes.exe
-    # launcher is a separate native process that spawns python.exe (the
-    # interpreter that runs our code).  os.getpid() returns the Python PID,
-    # but the launcher (which holds the file lock) is the parent.  Without
-    # walking the parent chain, every ``hermes update`` reports its own
-    # launcher as a concurrent instance — a false positive.
-    if exclude_pid is not None:
-        exclude_pids: set[int] = {exclude_pid}
-    else:
-        exclude_pids = {os.getpid()}
-    # The parent-walk is best-effort: if psutil rejects a PID (NoSuchProcess /
-    # AccessDenied) we stop walking and use whatever we've collected so far.
-    # Broader Exception catch on the outer block guards against partially-
-    # stubbed psutil in unit tests (e.g. a SimpleNamespace lacking Process /
-    # NoSuchProcess) — the surrounding update flow documents this helper as
-    # "never raises".
-    try:
-        current = psutil.Process(next(iter(exclude_pids)))
-        while True:
-            try:
-                parent = current.parent()
-            except Exception:
-                break
-            if parent is None or parent.pid <= 0:
-                break
-            if parent.pid in exclude_pids:
-                break  # loop detected
-            exclude_pids.add(parent.pid)
-            current = parent
-    except Exception:
-        pass
-
    # Resolve every shim path to its canonical form once for cheap comparison.
    shim_paths: set[str] = set()
    for shim in _hermes_exe_shims(scripts_dir):
@ -7893,6 +7943,56 @@ def _detect_concurrent_hermes_instances(
    if not shim_paths:
        return []

+    # Build a set of PIDs to exclude: the Python process itself plus every
+    # ancestor whose executable is one of our shims. On Windows the
+    # setuptools-generated hermes.exe launcher is a separate native process
+    # that spawns python.exe (the interpreter that runs our code).
+    # os.getpid() returns the Python PID, but the launcher (which holds the
+    # file lock) is the parent. Without excluding it, every ``hermes update``
+    # reports its own launcher as a concurrent instance — a false positive
+    # (issues #29341, #34795).
+    #
+    # Two robustness points learned from the field:
+    #   1. Use ``proc.parents()`` — it returns the WHOLE ancestor list in one
+    #      call. The earlier per-hop ``current.parent()`` loop bailed on the
+    #      first psutil error (AccessDenied/NoSuchProcess is common on Windows
+    #      across session/elevation boundaries), leaving the launcher shim in
+    #      the candidate set and re-triggering the false positive.
+    #   2. Only exclude ancestors whose exe is itself a shim. A genuine second
+    #      hermes.exe sitting *under* a non-Hermes parent (e.g. a Hermes
+    #      Desktop backend child) must still be flagged, so we don't blanket-
+    #      exclude unrelated ancestors like the shell or terminal.
+    # Broad ``except Exception`` guards against partially-stubbed psutil in
+    # unit tests; this helper is documented as "never raises".
+    if exclude_pid is not None:
+        exclude_pids: set[int] = {int(exclude_pid)}
+    else:
+        exclude_pids = {os.getpid()}
+    try:
+        seed = next(iter(exclude_pids))
+        try:
+            ancestors = psutil.Process(seed).parents()
+        except Exception:
+            ancestors = []
+        for ancestor in ancestors:
+            try:
+                anc_exe = ancestor.exe()
+            except Exception:
+                continue
+            if not anc_exe:
+                continue
+            try:
+                anc_norm = str(Path(anc_exe).resolve()).lower()
+            except (OSError, ValueError):
+                anc_norm = str(anc_exe).lower()
+            if anc_norm in shim_paths:
+                try:
+                    exclude_pids.add(int(ancestor.pid))
+                except Exception:
+                    continue
+    except Exception:
+        pass
+
    matches: list[tuple[int, str]] = []
    try:
        proc_iter = psutil.process_iter(["pid", "exe", "name"])
@ -7933,6 +8033,13 @@ def _format_concurrent_instances_message(
    lines.append("")
    lines.append("  Close Hermes Desktop, exit any open `hermes` REPLs, and")
    lines.append("  stop the gateway (`hermes gateway stop`) before retrying.")
+    lines.append("")
+    if matches:
+        pid_args = " ".join(f"/PID {pid}" for pid, _ in matches)
+        lines.append("  If you've already closed everything and these PIDs are")
+        lines.append("  stale, terminate them directly, then retry the update:")
+        lines.append(f"      taskkill {pid_args} /F")
+        lines.append("")
    lines.append("  Override with `hermes update --force` if you've already")
    lines.append("  confirmed those processes will not write to the venv.")
    return "\n".join(lines)
@ -8888,18 +8995,51 @@ def cmd_update(args):
 def _cmd_update_pip(args):
    """Update Hermes via pip (for PyPI installs)."""
    from hermes_cli import __version__
+    from hermes_cli.config import is_uv_tool_install

    print(f"→ Current version: {__version__}")
    print("→ Checking PyPI for updates...")

    uv = shutil.which("uv")
-    if uv:
+    in_venv = sys.prefix != sys.base_prefix
+    # pipx-managed installs live under .../pipx/venvs/<name>/...
+    pipx_managed = "pipx" in sys.prefix.split(os.sep)
+    pipx = shutil.which("pipx") if pipx_managed else None
+
+    # Only the ``uv pip install`` path inside a venv needs VIRTUAL_ENV
+    # exported (uv refuses to install without it when the launcher shim
+    # didn't activate the venv). ``uv tool upgrade`` / ``pipx upgrade``
+    # operate on a named environment and ignore VIRTUAL_ENV, so we don't
+    # set it for them.
+    export_virtualenv = False
+
+    if is_uv_tool_install():
+        if not uv:
+            print("✗ Detected a uv-tool install but `uv` is not on PATH; install uv and retry.")
+            sys.exit(1)
+        cmd = [uv, "tool", "upgrade", "hermes-agent"]
+    elif pipx_managed and pipx:
+        # pipx owns its own venv; ``pipx upgrade`` is the only correct path.
+        # Matches scripts/auto-update.sh, which already uses pipx upgrade.
+        cmd = [pipx, "upgrade", "hermes-agent"]
+    elif uv:
        cmd = [uv, "pip", "install", "--upgrade", "hermes-agent"]
+        if in_venv:
+            # Launcher shim runs the venv interpreter but doesn't export
+            # VIRTUAL_ENV; without it uv errors "No virtual environment found".
+            export_virtualenv = True
+        else:
+            # Outside any venv, ``--system`` lets uv target the active
+            # interpreter, matching pip's default behaviour.
+            cmd.insert(3, "--system")
    else:
        cmd = [sys.executable, "-m", "pip", "install", "--upgrade", "hermes-agent"]

    print(f"→ Running: {' '.join(cmd)}")
-    result = subprocess.run(cmd)
+    run_kwargs = {}
+    if export_virtualenv:
+        run_kwargs["env"] = {**os.environ, "VIRTUAL_ENV": sys.prefix}
+    result = subprocess.run(cmd, **run_kwargs)
    if result.returncode != 0:
        print("✗ Update failed")
        sys.exit(1)
@ -9135,12 +9275,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
        # though `git pull` can't touch $HERMES_HOME, this is cheap
        # belt-and-suspenders insurance and gives the user something to
        # restore from via `/snapshot list` / `/snapshot restore <id>`.
+        pre_update_snapshot_id = None
        try:
            from hermes_cli.backup import create_quick_snapshot

-            snap_id = create_quick_snapshot(label="pre-update", keep=1)
-            if snap_id:
-                print(f"  ✓ Pre-update snapshot: {snap_id}")
+            pre_update_snapshot_id = create_quick_snapshot(label="pre-update", keep=1)
+            if pre_update_snapshot_id:
+                print(f"  ✓ Pre-update snapshot: {pre_update_snapshot_id}")
        except Exception as exc:
            # Never let a snapshot failure block an update.
            logger.debug("Pre-update snapshot failed: %s", exc)
@ -9477,6 +9618,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
        else:
            print("  ✓ Configuration is up to date")

+        # Safety net: config-version migrations have been observed to leave
+        # cron/jobs.json valid-but-empty, silently dropping every scheduled
+        # job (issue #34600). If the live file is now empty while the
+        # pre-update snapshot held jobs, restore it and warn loudly.
+        try:
+            from hermes_cli.backup import restore_cron_jobs_if_emptied
+
+            cron_restore = restore_cron_jobs_if_emptied(pre_update_snapshot_id)
+            if cron_restore:
+                print()
+                print(
+                    "  ⚠️  cron/jobs.json was emptied during this update — "
+                    f"restored {cron_restore['job_count']} job(s) from "
+                    f"pre-update snapshot {cron_restore['snapshot_id']}."
+                )
+        except Exception as exc:
+            # Never let the cron safety net break an otherwise-good update.
+            logger.debug("Cron jobs auto-restore check failed: %s", exc)
+
        print()
        print("✓ Update complete!")

@ -10572,11 +10732,10 @@ def cmd_profile(args):
            if collision:
                print(f"Error: {collision}")
                sys.exit(1)
-            wrapper_path = create_wrapper_script(alias_name)
+            wrapper_path = create_wrapper_script(
+                alias_name, target=name if custom_name else None
+            )
            if wrapper_path:
-                # If custom name, write the profile name into the wrapper
-                if custom_name:
-                    wrapper_path.write_text(f'#!/bin/sh\nexec hermes -p {name} "$@"\n')
                print(f"✓ Alias created: {wrapper_path}")
                if not _is_wrapper_dir_in_path():
                    print(f"⚠ {_get_wrapper_dir()} is not in your PATH.")
@ -10959,6 +11118,13 @@ def cmd_completion(args, parser=None):
        print(generate_bash(parser))


+def cmd_prompt_size(args):
+    """Show a byte/char breakdown of the system prompt + tool schemas.
+
+    Thin dispatcher: the real implementation is imported from
+    ``hermes_cli.prompt_size`` at call time (not at module import) and is
+    invoked with ``args`` unchanged. Its return value is discarded, so this
+    handler always returns ``None``.
+
+    Args:
+        args: Argument object forwarded as-is to the implementation
+            (presumably the parsed argparse namespace — confirm against
+            the dispatcher that calls ``args.func``).
+    """
+    from hermes_cli.prompt_size import cmd_prompt_size as _impl
+
+    _impl(args)
+
+
 def cmd_logs(args):
    """View and filter Hermes log files."""
    from hermes_cli.logs import tail_log, list_logs
@ -10978,24 +11144,6 @@ def cmd_logs(args):
        since=getattr(args, "since", None),
        component=getattr(args, "component", None),
    )
-
-
-def _build_provider_choices() -> list[str]:
-    """Build the --provider choices list from CANONICAL_PROVIDERS + 'auto'."""
-    try:
-        from hermes_cli.models import CANONICAL_PROVIDERS as _cp
-        return ["auto"] + [p.slug for p in _cp]
-    except Exception:
-        # Fallback: static list guarantees the CLI always works
-        return [
-            "auto", "openrouter", "nous", "openai-codex", "xai-oauth", "copilot-acp", "copilot",
-            "anthropic", "gemini", "google-gemini-cli", "xai", "bedrock", "azure-foundry",
-            "ollama-cloud", "huggingface", "zai", "kimi-coding", "kimi-coding-cn",
-            "stepfun", "minimax", "minimax-cn", "kilocode", "novita", "xiaomi", "arcee",
-            "nvidia", "deepseek", "alibaba", "qwen-oauth", "opencode-zen", "opencode-go",
-        ]
-
-
 # Top-level subcommands that argparse knows about WITHOUT running plugin
 # discovery.  Used to short-circuit eager plugin imports (which can take
 # 500ms+ pulling in google.cloud.pubsub_v1, aiohttp, grpc, etc.) when the
@ -11013,6 +11161,7 @@ _BUILTIN_SUBCOMMANDS = frozenset(
        "dump", "fallback", "gateway", "hooks", "import", "insights",
        "kanban", "login", "logout", "logs", "lsp", "mcp", "memory", "migrate",
        "model", "pairing", "plugins", "portal", "postinstall", "profile", "proxy",
+        "prompt-size",
        "send", "sessions", "setup",
        "skills", "slack", "status", "tools", "uninstall", "update",
        "version", "webhook", "whatsapp", "chat", "secrets", "security",
@ -11113,6 +11262,26 @@ _AGENT_SUBCOMMANDS = {
 }


+def _is_tui_chat_launch(args) -> bool:
+    """Report whether this invocation is a TUI launch.
+
+    True when ``args.tui`` is present and truthy, or when the ``HERMES_TUI``
+    environment variable is exactly ``"1"``; False otherwise.
+    """
+    if getattr(args, "tui", False):
+        return True
+    return os.environ.get("HERMES_TUI") == "1"
+
+
+def _command_has_dedicated_mcp_startup(args) -> bool:
+    """Report whether the selected command is one of the MCP-owning entrypoints.
+
+    True for ``acp``, for ``gateway`` with sub-command ``run``, and for
+    ``cron`` with sub-command ``run`` or ``tick``; False for everything else.
+    A missing sub-command attribute is treated as ``None``.
+    """
+    command = args.command
+    if command == "acp":
+        return True
+    if command == "gateway":
+        return getattr(args, "gateway_command", None) == "run"
+    if command == "cron":
+        return getattr(args, "cron_command", None) in {"run", "tick"}
+    return False
+
+
+def _should_background_mcp_startup(args) -> bool:
+    """Report whether MCP discovery should be started in the background.
+
+    False for any TUI launch (as decided by ``_is_tui_chat_launch``);
+    otherwise True only when ``args.command`` is ``None``, ``"chat"`` or
+    ``"rl"``.
+    """
+    return not _is_tui_chat_launch(args) and args.command in {None, "chat", "rl"}
+
+
 def _prepare_agent_startup(args) -> None:
    """Discover plugins/MCP/hooks for commands that can run an agent turn."""
    _sub_attr, _sub_set = _AGENT_SUBCOMMANDS.get(args.command, (None, None))
@ -11132,19 +11301,42 @@ def _prepare_agent_startup(args) -> None:
            "plugin discovery failed at CLI startup",
            exc_info=True,
        )
-    try:
-        # MCP tool discovery — no event loop running in CLI/TUI startup,
-        # so inline is safe.  Moved here from model_tools.py module scope
-        # to avoid freezing the gateway's event loop on its first message
-        # via the same lazy import path (#16856).
-        from tools.mcp_tool import discover_mcp_tools
+    _run_inline_mcp_discovery = True
+    if _is_tui_chat_launch(args):
+        # The TUI launcher hands off to a dedicated startup path that already
+        # backgrounds MCP discovery with a bounded join before the first tool
+        # snapshot.
+        _run_inline_mcp_discovery = False
+    elif _command_has_dedicated_mcp_startup(args):
+        # These entrypoints already do their own MCP startup later on the real
+        # runtime path (gateway executor, ACP launcher, cron job runner).
+        _run_inline_mcp_discovery = False
+    elif _should_background_mcp_startup(args):
+        try:
+            from hermes_cli.mcp_startup import start_background_mcp_discovery

-        discover_mcp_tools()
-    except Exception:
-        logger.debug(
-            "MCP tool discovery failed at CLI startup",
-            exc_info=True,
-        )
+            start_background_mcp_discovery(
+                logger=logger,
+                thread_name="cli-mcp-discovery",
+            )
+        except Exception:
+            logger.debug(
+                "Background MCP tool discovery failed at CLI startup",
+                exc_info=True,
+            )
+        _run_inline_mcp_discovery = False
+    if _run_inline_mcp_discovery:
+        try:
+            # MCP tool discovery remains synchronous for entrypoints that do
+            # not own a later bounded/executor startup path.
+            from tools.mcp_tool import discover_mcp_tools
+
+            discover_mcp_tools()
+        except Exception:
+            logger.debug(
+                "MCP tool discovery failed at CLI startup",
+                exc_info=True,
+            )
    try:
        from hermes_cli.config import load_config
        from agent.shell_hooks import register_from_config
@ -11285,6 +11477,10 @@ def _try_termux_fast_tui_launch() -> bool:

 def main():
    """Main entry point for hermes CLI."""
+    # Cosmetic: make the process show up as 'hermes' instead of 'python3.11'
+    # in ps/top/htop.  Non-fatal — just a nicer UX.
+    _set_process_title()
+
    # Force UTF-8 stdio on Windows before anything prints.  No-op elsewhere.
    try:
        from hermes_cli.stdio import configure_windows_stdio
@ -12911,7 +13107,34 @@ Examples:
    )
    plugins_remove.add_argument("name", help="Plugin directory name to remove")

-    plugins_subparsers.add_parser("list", aliases=["ls"], help="List installed plugins")
+    plugins_list = plugins_subparsers.add_parser(
+        "list", aliases=["ls"], help="List installed plugins"
+    )
+    plugins_list.add_argument(
+        "--enabled",
+        action="store_true",
+        help="Show only enabled plugins",
+    )
+    plugins_list.add_argument(
+        "--user",
+        action="store_true",
+        help="Show only user-installed plugins (including git plugins)",
+    )
+    plugins_list.add_argument(
+        "--no-bundled",
+        action="store_true",
+        help="Hide bundled plugins",
+    )
+    plugins_list.add_argument(
+        "--plain",
+        action="store_true",
+        help="Print compact plain-text output instead of a Rich table",
+    )
+    plugins_list.add_argument(
+        "--json",
+        action="store_true",
+        help="Print machine-readable JSON",
+    )

    plugins_enable = plugins_subparsers.add_parser(
        "enable", help="Enable a disabled plugin"
@ -13011,9 +13234,15 @@ Examples:
        ),
    )
    memory_sub = memory_parser.add_subparsers(dest="memory_command")
-    memory_sub.add_parser(
+    _setup_parser = memory_sub.add_parser(
        "setup", help="Interactive provider selection and configuration"
    )
+    _setup_parser.add_argument(
+        "provider",
+        nargs="?",
+        default=None,
+        help="Provider to configure directly (e.g. honcho), skipping the picker",
+    )
    memory_sub.add_parser("status", help="Show current memory provider config")
    memory_sub.add_parser("off", help="Disable external provider (built-in only)")
    _reset_parser = memory_sub.add_parser(
@ -13391,6 +13620,11 @@ Examples:
        "--yes", "-y", action="store_true", help="Skip confirmation"
    )

+    sessions_subparsers.add_parser(
+        "optimize",
+        help="Reclaim disk space: merge FTS5 segments + VACUUM (no data change)",
+    )
+
    sessions_subparsers.add_parser("stats", help="Show session store statistics")

    sessions_rename = sessions_subparsers.add_parser(
@ -13563,6 +13797,34 @@ Examples:
            relaunch(["--resume", selected_id])
            return  # won't reach here after execvp

+        elif action == "optimize":
+            db_path = db.db_path
+            before_mb = (
+                os.path.getsize(db_path) / (1024 * 1024)
+                if db_path.exists()
+                else 0.0
+            )
+            print("Optimizing session store (FTS merge + VACUUM)…")
+            try:
+                # vacuum() merges FTS5 segments (optimize_fts) then VACUUMs,
+                # and returns the number of indexes it merged.
+                n = db.vacuum()
+            except Exception as e:
+                print(f"Error: optimization failed: {e}")
+                db.close()
+                return
+            after_mb = (
+                os.path.getsize(db_path) / (1024 * 1024)
+                if db_path.exists()
+                else 0.0
+            )
+            saved = before_mb - after_mb
+            print(f"Optimized {n} FTS index(es).")
+            print(
+                f"Database size: {before_mb:.1f} MB -> {after_mb:.1f} MB "
+                f"(reclaimed {saved:.1f} MB)"
+            )
+
        elif action == "stats":
            total = db.session_count()
            msgs = db.message_count()
@ -14176,6 +14438,30 @@ Examples:
    )
    logs_parser.set_defaults(func=cmd_logs)

+    # =========================================================================
+    # prompt-size command
+    # =========================================================================
+    prompt_size_parser = subparsers.add_parser(
+        "prompt-size",
+        help="Show a byte breakdown of the system prompt + tool schemas",
+        description=(
+            "Report the fixed prompt budget for a fresh session: system "
+            "prompt total, skills index, memory, user profile, and tool-schema "
+            "JSON. Runs offline (no API call)."
+        ),
+    )
+    prompt_size_parser.add_argument(
+        "--platform",
+        default="cli",
+        help="Platform to simulate (cli, telegram, discord, ...). Default: cli",
+    )
+    prompt_size_parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Emit the breakdown as JSON",
+    )
+    prompt_size_parser.set_defaults(func=cmd_prompt_size)
+
    # =========================================================================
    # Parse and execute
    # =========================================================================
--- a/hermes_cli/mcp_catalog.py
+++ b/hermes_cli/mcp_catalog.py
@ -23,7 +23,6 @@ See references/mcp-catalog.md (this repo's skill) for the manifest schema.

 from __future__ import annotations

-import os
 import re
 import shutil
 import subprocess
@ -41,7 +40,7 @@ from hermes_cli.config import (
    get_env_value,
    save_env_value,
 )
-from hermes_cli.cli_output import prompt as _prompt_input, prompt_yes_no
+from hermes_cli.cli_output import prompt as _prompt_input

 _MANIFEST_VERSION = 1

--- a/hermes_cli/mcp_config.py
+++ b/hermes_cli/mcp_config.py
@ -205,6 +205,22 @@ def _probe_single_server(
    return tools_found


+def _oauth_tokens_present(name: str) -> bool:
+    """Return True if an OAuth token file exists on disk for ``name``.
+
+    Used after ``hermes mcp login`` to distinguish a genuine authentication
+    from a probe that succeeded only because the server allowed
+    initialize/tools-list without auth (so no token was ever acquired).
+
+    Args:
+        name: Server name handed to ``HermesTokenStorage``.
+
+    Returns:
+        Whatever ``HermesTokenStorage(name).has_cached_tokens()`` reports.
+        If the import or the check raises, the error is logged at debug
+        level and ``True`` is returned.
+    """
+    try:
+        # Imported at call time rather than at module import.
+        from tools.mcp_oauth import HermesTokenStorage
+        return HermesTokenStorage(name).has_cached_tokens()
+    except Exception as exc:  # pragma: no cover — defensive
+        logger.debug("Could not check OAuth tokens for '%s': %s", name, exc)
+        # Be permissive on unexpected errors: don't block a real success.
+        return True
+
+
 def _unwrap_exception_group(exc: BaseException) -> Exception:
    """Extract the root-cause exception from anyio TaskGroup wrappers.

@ -631,6 +647,36 @@ def cmd_mcp_login(args):
    # Probe triggers the OAuth flow (browser redirect + callback capture).
    try:
        tools = _probe_single_server(name, server_config)
+        # A clean probe is NOT proof of authentication. Some MCP servers
+        # (notably Google's official Drive server) serve initialize +
+        # tools/list WITHOUT auth, so the probe lists tools even when the
+        # OAuth flow never completed — e.g. dynamic client registration
+        # 400'd because the provider doesn't support RFC 7591. Reporting
+        # "Authenticated — N tools" in that case is a false success: every
+        # real tool call later hangs until timeout because there's no token.
+        # Verify a token actually landed on disk before claiming success.
+        if not _oauth_tokens_present(name):
+            _warning(
+                "Server responded, but no OAuth token was obtained — "
+                "authentication did not complete."
+            )
+            print()
+            _info(
+                "Some providers (e.g. Google Drive, Atlassian) do not support "
+                "automatic client registration. For those you must create an "
+                "OAuth client yourself and add its credentials to config.yaml:"
+            )
+            print()
+            print(color(f"    mcp_servers:", Colors.DIM))
+            print(color(f"      {name}:", Colors.DIM))
+            print(color(f"        url: {url}", Colors.DIM))
+            print(color(f"        auth: oauth", Colors.DIM))
+            print(color(f"        oauth:", Colors.DIM))
+            print(color(f"          client_id: \"<your-oauth-client-id>\"", Colors.DIM))
+            print(color(f"          client_secret: \"<your-oauth-client-secret>\"", Colors.DIM))
+            print()
+            _info("Then re-run `hermes mcp login " + name + "`.")
+            return
        if tools:
            _success(f"Authenticated — {len(tools)} tool(s) available")
        else:
--- a/hermes_cli/mcp_startup.py
+++ b/hermes_cli/mcp_startup.py
@ -0,0 +1,59 @@
+"""Shared CLI/TUI-safe helpers for background MCP discovery."""
+
+from __future__ import annotations
+
+import threading
+from typing import Optional
+
+_mcp_discovery_lock = threading.Lock()
+_mcp_discovery_started = False
+_mcp_discovery_thread: Optional[threading.Thread] = None
+
+
+def _has_configured_mcp_servers() -> bool:
+    """Report whether the raw config declares at least one MCP server.
+
+    The config reader is imported lazily inside the function so that the
+    check itself stays cheap and the MCP stack is not pulled in for users
+    who never configured a server.
+
+    Returns:
+        ``True`` when ``mcp_servers`` is a non-empty mapping, ``False`` when
+        it is missing, empty, or not a mapping. Any exception raised while
+        probing (including a failed import) also yields ``True``.
+    """
+    try:
+        from hermes_cli.config import read_raw_config
+
+        raw_config = read_raw_config()
+        if not raw_config:
+            raw_config = {}
+        servers = raw_config.get("mcp_servers")
+        if not isinstance(servers, dict):
+            return False
+        return len(servers) > 0
+    except Exception:
+        # Err on the side of attempting discovery: it runs on a background
+        # thread, so a false positive here still cannot block startup.
+        return True
+
+
+def start_background_mcp_discovery(*, logger, thread_name: str) -> None:
+    """Launch the process-wide MCP discovery worker, at most once.
+
+    The first call marks discovery as started and, if any MCP servers are
+    configured, spawns a daemon thread that runs ``discover_mcp_tools()``.
+    Every later call returns immediately without probing the config again.
+
+    Args:
+        logger: Logger used to record (at debug level) a discovery failure.
+        thread_name: Name given to the background thread.
+    """
+    global _mcp_discovery_started, _mcp_discovery_thread
+
+    def _run_discovery() -> None:
+        # Import lazily so the MCP stack loads on the worker thread only.
+        try:
+            from tools.mcp_tool import discover_mcp_tools
+
+            discover_mcp_tools()
+        except Exception:
+            logger.debug("Background MCP tool discovery failed", exc_info=True)
+
+    with _mcp_discovery_lock:
+        already_started = _mcp_discovery_started
+        _mcp_discovery_started = True
+        # Short-circuit keeps the config probe from running on repeat calls.
+        if already_started or not _has_configured_mcp_servers():
+            return
+
+        worker = threading.Thread(target=_run_discovery, name=thread_name, daemon=True)
+        _mcp_discovery_thread = worker
+        worker.start()
+
+
+def wait_for_mcp_discovery(timeout: float = 0.75) -> None:
+    """Give background MCP discovery a short, bounded chance to finish.
+
+    Does nothing when no discovery thread was started or when it has
+    already exited; otherwise joins it for at most ``timeout`` seconds.
+
+    Args:
+        timeout: Maximum number of seconds to wait for the thread.
+    """
+    worker = _mcp_discovery_thread
+    if worker is not None and worker.is_alive():
+        worker.join(timeout=timeout)
--- a/hermes_cli/memory_setup.py
+++ b/hermes_cli/memory_setup.py
@ -452,7 +452,11 @@ def memory_command(args) -> None:
    """Route memory subcommands."""
    sub = getattr(args, "memory_command", None)
    if sub == "setup":
-        cmd_setup(args)
+        provider = getattr(args, "provider", None)
+        if provider:
+            cmd_setup_provider(provider)
+        else:
+            cmd_setup(args)
    elif sub == "status":
        cmd_status(args)
    else:
--- a/hermes_cli/model_catalog.py
+++ b/hermes_cli/model_catalog.py
@ -64,6 +64,15 @@ logger = logging.getLogger(__name__)
 DEFAULT_CATALOG_URL = (
    "https://hermes-agent.nousresearch.com/docs/api/model-catalog.json"
 )
+# Fallback fetch chain. The Docusaurus site is served through Vercel, which
+# occasionally returns HTTP 403 + x-vercel-mitigated: challenge for non-
+# browser clients (urllib, curl). When that happens the disk cache goes
+# stale and new model releases never reach the picker. The raw GitHub URL
+# is the same manifest published from the same repo and is not bot-gated,
+# so we fall through to it whenever the primary URL fails.
+DEFAULT_CATALOG_FALLBACK_URLS: tuple[str, ...] = (
+    "https://raw.githubusercontent.com/NousResearch/hermes-agent/main/website/static/api/model-catalog.json",
+)
 DEFAULT_TTL_HOURS = 24
 DEFAULT_FETCH_TIMEOUT = 8.0
 SUPPORTED_SCHEMA_VERSION = 1
@ -139,6 +148,31 @@ def _fetch_manifest(url: str, timeout: float) -> dict[str, Any] | None:
    return data


+def _fetch_manifest_with_fallback(
+    primary_url: str,
+    timeout: float,
+    fallback_urls: tuple[str, ...] = DEFAULT_CATALOG_FALLBACK_URLS,
+) -> dict[str, Any] | None:
+    """Fetch the manifest from ``primary_url``, falling back in order.
+
+    Args:
+        primary_url: URL attempted first.
+        timeout: Per-request timeout passed through to ``_fetch_manifest``.
+        fallback_urls: URLs tried, in order, once the primary has failed.
+            Empty entries and entries equal to ``primary_url`` are skipped
+            so a primary already pointing at a fallback is not re-fetched.
+
+    Returns:
+        The first non-``None`` result from ``_fetch_manifest``, or ``None``
+        when every candidate fails.
+    """
+    manifest = _fetch_manifest(primary_url, timeout)
+    if manifest is not None:
+        return manifest
+
+    # Lazy filter: candidates are still visited strictly in declared order.
+    candidates = (url for url in fallback_urls if url and url != primary_url)
+    for candidate in candidates:
+        manifest = _fetch_manifest(candidate, timeout)
+        if manifest is None:
+            continue
+        logger.info("model catalog primary URL failed; using fallback %s", candidate)
+        return manifest
+    return None
+
+
 def _validate_manifest(data: Any) -> bool:
    """Return True when ``data`` matches the minimum manifest shape."""
    if not isinstance(data, dict):
@ -235,7 +269,7 @@ def get_catalog(*, force_refresh: bool = False) -> dict[str, Any]:
        return disk_data

    # Need to (re)fetch. If it fails, fall back to any stale disk copy.
-    fetched = _fetch_manifest(cfg["url"], DEFAULT_FETCH_TIMEOUT)
+    fetched = _fetch_manifest_with_fallback(cfg["url"], DEFAULT_FETCH_TIMEOUT)
    if fetched is not None:
        _write_disk_cache(fetched)
        new_disk_data, new_mtime = _read_disk_cache()
--- a/hermes_cli/model_switch.py
+++ b/hermes_cli/model_switch.py
@ -277,19 +277,6 @@ class ModelSwitchResult:
    capabilities: Optional[ModelCapabilities] = None
    model_info: Optional[ModelInfo] = None
    is_global: bool = False
-
-
-@dataclass
-class CustomAutoResult:
-    """Result of switching to bare 'custom' provider with auto-detect."""
-
-    success: bool
-    model: str = ""
-    base_url: str = ""
-    api_key: str = ""
-    error_message: str = ""
-
-
 # ---------------------------------------------------------------------------
 # Flag parsing
 # ---------------------------------------------------------------------------
@ -1085,8 +1072,7 @@ def list_authenticated_providers(
    from hermes_cli.auth import PROVIDER_REGISTRY
    from hermes_cli.models import (
        OPENROUTER_MODELS, _PROVIDER_MODELS,
-        _MODELS_DEV_PREFERRED, _merge_with_models_dev, provider_model_ids,
-        cached_provider_model_ids,
+        _MODELS_DEV_PREFERRED, _merge_with_models_dev, cached_provider_model_ids,
        get_curated_nous_model_ids,
    )

@ -1570,24 +1556,21 @@ def list_authenticated_providers(

    # --- 4. Saved custom providers from config ---
    # Each ``custom_providers`` entry represents one model under a named
-    # provider. Entries sharing the same endpoint (``base_url`` + ``api_key``)
-    # are grouped into a single picker row, so e.g. four Ollama entries
-    # pointing at ``http://localhost:11434/v1`` with per-model display names
-    # ("Ollama — GLM 5.1", "Ollama — Qwen3-coder", ...) appear as one
+    # provider. Entries sharing the same endpoint, credential identity, and
+    # wire protocol are grouped into a single picker row, so e.g. four Ollama
+    # entries pointing at ``http://localhost:11434/v1`` with per-model display
+    # names ("Ollama — GLM 5.1", "Ollama — Qwen3-coder", ...) appear as one
    # "Ollama" row with four models inside instead of four near-duplicates
-    # that differ only by suffix. Entries with distinct endpoints still
-    # produce separate rows.
-    #
-    # When the grouped endpoint matches ``current_base_url`` the group's
-    # slug becomes ``current_provider`` so that selecting a model from the
-    # picker flows back through the runtime provider that already holds
-    # valid credentials — no re-resolution needed.
+    # that differ only by suffix. Same-host entries with different ``key_env``
+    # or ``api_mode`` remain distinct providers.
    if custom_providers and isinstance(custom_providers, list):
        from collections import OrderedDict

-        # Key by (base_url, api_key) instead of slug: names frequently
-        # differ per model ("Ollama — X") while the endpoint stays the
-        # same. Slug-based grouping left them as separate rows.
+        # Key by endpoint + credential identity + wire protocol instead of
+        # slug: names frequently differ per model ("Ollama — X") while the
+        # endpoint stays the same.  Keep same-host providers with distinct
+        # env-backed credentials or API protocols separate so picker selection
+        # cannot route through the wrong credential/mode pair.
        groups: "OrderedDict[tuple, dict]" = OrderedDict()
        for entry in custom_providers:
            if not isinstance(entry, dict):
@ -1602,9 +1585,23 @@ def list_authenticated_providers(
            ).strip().rstrip("/")
            if not raw_name or not api_url:
                continue
-            api_key = (entry.get("api_key") or "").strip()
+            inline_api_key = (entry.get("api_key") or "").strip()
+            key_env = (entry.get("key_env") or "").strip()
+            api_key = inline_api_key or (
+                os.environ.get(key_env, "").strip() if key_env else ""
+            )
+            api_mode = str(
+                entry.get("api_mode")
+                or entry.get("transport")
+                or ""
+            ).strip().lower()
+            credential_identity = (
+                inline_api_key
+                if inline_api_key
+                else (f"env:{key_env}" if key_env else "")
+            )

-            group_key = (api_url, api_key)
+            group_key = (api_url, credential_identity, api_mode)
            if group_key not in groups:
                # Strip per-model suffix so "Ollama — GLM 5.1" becomes
                # "Ollama" for the grouped row. Em dash is the convention
@ -1617,29 +1614,16 @@ def list_authenticated_providers(
                        break
                if not display_name:
                    display_name = raw_name
-                # If this endpoint matches the currently active one, use
-                # ``current_provider`` as the slug so picker-driven switches
-                # route through the live credential pipeline.
-                if (
-                    current_base_url
-                    and api_url == current_base_url.strip().rstrip("/")
-                ):
-                    # Guard against bare "custom" slug left by a prior
-                    # failed switch — always resolve to the canonical
-                    # custom:<name> form.  (GH #17478)
-                    slug = (
-                        current_provider
-                        if current_provider and current_provider != "custom"
-                        else custom_provider_slug(display_name)
-                    )
-                else:
-                    slug = custom_provider_slug(display_name)
+                slug = custom_provider_slug(display_name)
                groups[group_key] = {
                    "slug": slug,
                    "name": display_name,
                    "api_url": api_url,
+                    "api_key": api_key,
                    "models": [],
                }
+            elif api_key and not groups[group_key].get("api_key"):
+                groups[group_key]["api_key"] = api_key

            # The singular ``model:`` field only holds the currently
            # active model. Hermes's own writer (main.py::_save_custom_provider)
@ -1661,8 +1645,16 @@ def list_authenticated_providers(
                        groups[group_key]["models"].append(m)

        _section4_emitted_slugs: set = set()
-        for grp_key, grp in groups.items():
-            api_url, api_key = grp_key
+        _current_base_url_norm = str(current_base_url or "").strip().rstrip("/").lower()
+        _current_base_url_group_count = sum(
+            1
+            for _grp in groups.values()
+            if _current_base_url_norm
+            and str(_grp["api_url"]).strip().rstrip("/").lower() == _current_base_url_norm
+        )
+        for grp in groups.values():
+            api_url = grp["api_url"]
+            api_key = grp.get("api_key", "")
            slug = grp["slug"]
            # If the slug is already claimed by a built-in / overlay /
            # user-provider row (sections 1-3), skip this custom group
@ -1735,8 +1727,10 @@ def list_authenticated_providers(
                "slug": slug,
                "name": grp["name"],
                "is_current": slug == current_provider or (
-                    bool(current_base_url)
-                    and _grp_url_norm == current_base_url.strip().rstrip("/").lower()
+                    current_provider == "custom"
+                    and bool(_current_base_url_norm)
+                    and _grp_url_norm == _current_base_url_norm
+                    and _current_base_url_group_count == 1
                ),
                "is_user_defined": True,
                "models": grp["models"],
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@ -49,11 +49,11 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
    ("xiaomi/mimo-v2.5-pro",                   ""),
    ("tencent/hy3-preview",                    ""),
    ("google/gemini-3-pro-image-preview",      ""),
-    ("google/gemini-3-flash-preview",          ""),
+    ("google/gemini-3.5-flash",                ""),
    ("google/gemini-3.1-pro-preview",          ""),
    ("google/gemini-3.1-flash-lite-preview",   ""),
    ("qwen/qwen3.6-35b-a3b",                   ""),
-    ("stepfun/step-3.5-flash",                 ""),
+    ("stepfun/step-3.7-flash",                 ""),
    ("minimax/minimax-m2.7",                   ""),
    ("z-ai/glm-5.1",                           ""),
    ("x-ai/grok-4.20",                         ""),
@ -156,11 +156,11 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "xiaomi/mimo-v2.5-pro",
        "tencent/hy3-preview",
        "google/gemini-3-pro-preview",
-        "google/gemini-3-flash-preview",
+        "google/gemini-3.5-flash",
        "google/gemini-3.1-pro-preview",
        "google/gemini-3.1-flash-lite-preview",
        "qwen/qwen3.6-35b-a3b",
-        "stepfun/step-3.5-flash",
+        "stepfun/step-3.7-flash",
        "minimax/minimax-m2.7",
        "z-ai/glm-5.1",
        "x-ai/grok-4.3",
@ -484,41 +484,6 @@ def _is_model_free(model_id: str, pricing: dict[str, dict[str, str]]) -> bool:
 # ---------------------------------------------------------------------------
 # Nous Portal account tier detection
 # ---------------------------------------------------------------------------
-
-def fetch_nous_account_tier(access_token: str, portal_base_url: str = "") -> dict[str, Any]:
-    """Fetch the user's Nous Portal account/subscription info.
-
-    Calls ``<portal>/api/oauth/account`` with the OAuth access token.
-
-    Returns the parsed JSON dict on success, e.g.::
-
-        {
-            "subscription": {
-                "plan": "Plus",
-                "tier": 2,
-                "monthly_charge": 20,
-                "credits_remaining": 1686.60,
-                ...
-            },
-            ...
-        }
-
-    Returns an empty dict on any failure (network, auth, parse).
-    """
-    base = (portal_base_url or "https://portal.nousresearch.com").rstrip("/")
-    url = f"{base}/api/oauth/account"
-    headers = {
-        "Authorization": f"Bearer {access_token}",
-        "Accept": "application/json",
-    }
-    try:
-        req = urllib.request.Request(url, headers=headers)
-        with urllib.request.urlopen(req, timeout=8) as resp:
-            return json.loads(resp.read().decode())
-    except Exception:
-        return {}
-
-
 def is_nous_free_tier(account_info: dict[str, Any]) -> bool:
    """Return True if the account info indicates a free (unpaid) tier.

@ -971,6 +936,105 @@ _PROVIDER_LABELS = {p.slug: p.label for p in CANONICAL_PROVIDERS}
 _PROVIDER_LABELS["custom"] = "Custom endpoint"  # special case: not a named provider


+# ---------------------------------------------------------------------------
+# Provider groups — DISPLAY ONLY
+#
+# Some vendors expose several Hermes provider slugs (one per endpoint /
+# auth method: global API, China API, OAuth coding plan, ...). Listing every
+# slug as a top-level row in the interactive `hermes model` / setup wizard /
+# Telegram `/model` pickers makes that list long and noisy.
+#
+# These groups fold related slugs under one top-level row in INTERACTIVE
+# PICKERS only. They do NOT change ``CANONICAL_PROVIDERS``, slug identity,
+# the ``--provider`` flag, ``/model <provider:model>``, or any typed path —
+# every member slug remains individually addressable. Grouping is a pure
+# display affordance; ``group_providers()`` is the single fold used by all
+# three picker surfaces so they stay consistent.
+#
+#   group_id -> (display_label, [member_slug, ...])
+#
+# Member order is the order shown inside the group submenu.
+# ---------------------------------------------------------------------------
+PROVIDER_GROUPS: dict[str, tuple[str, list[str]]] = {
+    "kimi":     ("Kimi / Moonshot", ["kimi-coding", "kimi-coding-cn"]),
+    "minimax":  ("MiniMax",         ["minimax", "minimax-oauth", "minimax-cn"]),
+    "xai":      ("xAI Grok",        ["xai", "xai-oauth"]),
+    "google":   ("Google Gemini",   ["gemini", "google-gemini-cli"]),
+    "openai":   ("OpenAI",          ["openai-codex", "openai-api"]),
+    "opencode": ("OpenCode",        ["opencode-zen", "opencode-go"]),
+    "copilot":  ("GitHub Copilot",  ["copilot", "copilot-acp"]),
+}
+
+# Reverse index: member slug -> group_id. Built once at import.
+_SLUG_TO_GROUP: dict[str, str] = {
+    slug: gid for gid, (_label, members) in PROVIDER_GROUPS.items() for slug in members
+}
+
+
+def provider_group_for_slug(slug: str) -> str:
+    """Look up the display group of a provider slug.
+
+    The slug is stringified, stripped and lower-cased before the lookup; a
+    falsy slug is treated as the empty string.
+
+    Returns:
+        The ``group_id`` the slug belongs to, or ``""`` when it is ungrouped.
+    """
+    normalized = str(slug or "").strip().lower()
+    return _SLUG_TO_GROUP.get(normalized, "")
+
+
+def group_providers(slugs):
+    """Fold a flat ordered slug iterable into picker rows by provider group.
+
+    DISPLAY ONLY. Used by every interactive picker (``hermes model``, the
+    setup wizard, the Telegram ``/model`` keyboard) so grouping is identical
+    across surfaces.
+
+    ``slugs`` may be any iterable, including a one-shot iterator or
+    generator: it is walked exactly once. Every slug is normalised
+    (stringified, stripped, lower-cased) before any comparison, so group
+    membership and de-duplication always agree with each other.
+
+    Each returned row is a dict::
+
+        {"kind": "single", "slug": <slug>}                       # ungrouped, or
+                                                                  # 1-member group
+        {"kind": "group", "group_id": <gid>, "label": <label>,
+         "members": [<slug>, ...]}                                # 2+ members
+
+    Rules:
+      * A group row appears at the position of its FIRST present member, in
+        the input order. Subsequent members fold into that row (and are not
+        emitted again).
+      * Member order inside a group follows ``PROVIDER_GROUPS`` declaration,
+        restricted to the members actually present in ``slugs``.
+      * A group reduced to a single present member degrades to a ``single``
+        row — no pointless one-item submenu.
+      * Slugs not in any group pass through as ``single`` rows, order
+        preserved.
+      * Duplicate slugs in the input are ignored after first sight.
+      * Empty / falsy slugs are dropped.
+    """
+    # Normalise and de-duplicate in a single pass. Materialising the result
+    # means (a) a generator argument is not exhausted before the row loop
+    # runs, and (b) presence checks below use the same normalised form as
+    # the row loop, so a mixed-case or padded slug cannot be lost.
+    ordered: list[str] = []
+    seen: set[str] = set()
+    for slug in slugs:
+        s = str(slug or "").strip().lower()
+        if not s or s in seen:
+            continue
+        seen.add(s)
+        ordered.append(s)
+
+    rows = []
+    emitted_groups: set[str] = set()
+    for s in ordered:
+        gid = _SLUG_TO_GROUP.get(s, "")
+        if not gid:
+            rows.append({"kind": "single", "slug": s})
+            continue
+        if gid in emitted_groups:
+            continue  # already folded at the first member's position
+        emitted_groups.add(gid)
+        label, declared = PROVIDER_GROUPS[gid]
+        # Present members, in declaration order. ``s`` itself is always
+        # among them, so the list is never empty.
+        members = [m for m in declared if m in seen]
+        if len(members) <= 1:
+            rows.append({"kind": "single", "slug": members[0] if members else s})
+        else:
+            rows.append(
+                {"kind": "group", "group_id": gid, "label": label, "members": members}
+            )
+    return rows
+
+
 _PROVIDER_ALIASES = {
    "glm": "zai",
    "z-ai": "zai",
@ -1223,68 +1287,6 @@ def _format_price_per_mtok(per_token_str: str) -> str:
    return f"${per_m:.2f}"


-def format_model_pricing_table(
-    models: list[tuple[str, str]],
-    pricing_map: dict[str, dict[str, str]],
-    current_model: str = "",
-    indent: str = "      ",
-) -> list[str]:
-    """Build a column-aligned model+pricing table for terminal display.
-
-    Returns a list of pre-formatted lines ready to print.
-    *models* is ``[(model_id, description), ...]``.
-    """
-    if not models:
-        return []
-
-    # Build rows: (model_id, input_price, output_price, cache_price, is_current)
-    rows: list[tuple[str, str, str, str, bool]] = []
-    has_cache = False
-    for mid, _desc in models:
-        is_cur = mid == current_model
-        p = pricing_map.get(mid)
-        if p:
-            inp = _format_price_per_mtok(p.get("prompt", ""))
-            out = _format_price_per_mtok(p.get("completion", ""))
-            cache_read = p.get("input_cache_read", "")
-            cache = _format_price_per_mtok(cache_read) if cache_read else ""
-            if cache:
-                has_cache = True
-        else:
-            inp, out, cache = "", "", ""
-        rows.append((mid, inp, out, cache, is_cur))
-
-    name_col = max(len(r[0]) for r in rows) + 2
-    # Compute price column widths from the actual data so decimals align
-    price_col = max(
-        max((len(r[1]) for r in rows if r[1]), default=4),
-        max((len(r[2]) for r in rows if r[2]), default=4),
-        3,  # minimum: "In" / "Out" header
-    )
-    cache_col = max(
-        max((len(r[3]) for r in rows if r[3]), default=4),
-        5,  # minimum: "Cache" header
-    ) if has_cache else 0
-    lines: list[str] = []
-
-    # Header
-    if has_cache:
-        lines.append(f"{indent}{'Model':<{name_col}} {'In':>{price_col}}  {'Out':>{price_col}}  {'Cache':>{cache_col}}  /Mtok")
-        lines.append(f"{indent}{'-' * name_col} {'-' * price_col}  {'-' * price_col}  {'-' * cache_col}")
-    else:
-        lines.append(f"{indent}{'Model':<{name_col}} {'In':>{price_col}}  {'Out':>{price_col}}  /Mtok")
-        lines.append(f"{indent}{'-' * name_col} {'-' * price_col}  {'-' * price_col}")
-
-    for mid, inp, out, cache, is_cur in rows:
-        marker = "  ← current" if is_cur else ""
-        if has_cache:
-            lines.append(f"{indent}{mid:<{name_col}} {inp:>{price_col}}  {out:>{price_col}}  {cache:>{cache_col}}{marker}")
-        else:
-            lines.append(f"{indent}{mid:<{name_col}} {inp:>{price_col}}  {out:>{price_col}}{marker}")
-
-    return lines
-
-
 def fetch_models_with_pricing(
    api_key: str | None = None,
    base_url: str = "https://openrouter.ai/api",
--- a/hermes_cli/nous_account.py
+++ b/hermes_cli/nous_account.py
@ -4,6 +4,7 @@ from __future__ import annotations

 import hashlib
 import json
+import threading
 import time
 import urllib.request
 from dataclasses import dataclass
@ -15,6 +16,7 @@ NousAccountInfoSource = Literal["jwt", "account_api", "inference_key", "none", "

 _ACCOUNT_INFO_CACHE_TTL = 60
 _account_info_cache: tuple[str, float, "NousPortalAccountInfo"] | None = None
+_ACCOUNT_INFO_CACHE_LOCK = threading.Lock()


@dataclass(frozen=True)
@ -302,10 +304,11 @@ def _fresh_account_info(
        portal_base_url = _portal_base_url(refreshed_state) or portal_base_url
        cache_key = _cache_key(access_token, portal_base_url)

-        if not force_fresh and _account_info_cache is not None:
-            cached_key, cached_at, cached_info = _account_info_cache
-            if cached_key == cache_key and (time.monotonic() - cached_at) < _ACCOUNT_INFO_CACHE_TTL:
-                return cached_info
+        with _ACCOUNT_INFO_CACHE_LOCK:
+            if not force_fresh and _account_info_cache is not None:
+                cached_key, cached_at, cached_info = _account_info_cache
+                if cached_key == cache_key and (time.monotonic() - cached_at) < _ACCOUNT_INFO_CACHE_TTL:
+                    return cached_info

        payload = _fetch_nous_account_info(access_token, portal_base_url)
        if not payload:
@ -327,7 +330,8 @@ def _fresh_account_info(
            state=refreshed_state,
            portal_base_url=portal_base_url,
        )
-        _account_info_cache = (cache_key, time.monotonic(), info)
+        with _ACCOUNT_INFO_CACHE_LOCK:
+            _account_info_cache = (cache_key, time.monotonic(), info)
        return info
    except Exception as exc:
        return _error_info(
--- a/hermes_cli/nous_subscription.py
+++ b/hermes_cli/nous_subscription.py
@ -71,12 +71,16 @@ class NousSubscriptionFeatures:
    def browser(self) -> NousFeatureState:
        return self.features["browser"]

+    @property
+    def video_gen(self) -> NousFeatureState:
+        return self.features["video_gen"]
+
    @property
    def modal(self) -> NousFeatureState:
        return self.features["modal"]

    def items(self) -> Iterable[NousFeatureState]:
-        ordered = ("web", "image_gen", "tts", "browser", "modal")
+        ordered = ("web", "image_gen", "video_gen", "tts", "browser", "modal")
        for key in ordered:
            yield self.features[key]

@ -255,6 +259,7 @@ def get_nous_subscription_features(

    web_tool_enabled = _toolset_enabled(config, "web")
    image_tool_enabled = _toolset_enabled(config, "image_gen")
+    video_tool_enabled = _toolset_enabled(config, "video_gen")
    tts_tool_enabled = _toolset_enabled(config, "tts")
    browser_tool_enabled = _toolset_enabled(config, "browser")
    modal_tool_enabled = _toolset_enabled(config, "terminal")
@ -289,6 +294,8 @@ def get_nous_subscription_features(
    browser_use_gateway = _uses_gateway(browser_cfg)
    image_gen_cfg = config.get("image_gen") if isinstance(config.get("image_gen"), dict) else {}
    image_use_gateway = _uses_gateway(image_gen_cfg)
+    video_gen_cfg = config.get("video_gen") if isinstance(config.get("video_gen"), dict) else {}
+    video_use_gateway = _uses_gateway(video_gen_cfg)

    direct_exa = bool(get_env_value("EXA_API_KEY"))
    direct_firecrawl = bool(get_env_value("FIRECRAWL_API_KEY") or get_env_value("FIRECRAWL_API_URL"))
@ -296,6 +303,7 @@ def get_nous_subscription_features(
    direct_tavily = bool(get_env_value("TAVILY_API_KEY"))
    direct_searxng = bool(get_env_value("SEARXNG_URL"))
    direct_fal = fal_key_is_configured()
+    direct_fal_video = direct_fal  # same FAL_KEY; separate var so use_gateway is independent
    direct_openai_tts = bool(resolve_openai_audio_api_key())
    direct_elevenlabs = bool(get_env_value("ELEVENLABS_API_KEY"))
    direct_camofox = bool(get_env_value("CAMOFOX_URL"))
@ -311,6 +319,8 @@ def get_nous_subscription_features(
        direct_tavily = False
    if image_use_gateway:
        direct_fal = False
+    if video_use_gateway:
+        direct_fal_video = False
    if tts_use_gateway:
        direct_openai_tts = False
        direct_elevenlabs = False
@ -320,6 +330,8 @@ def get_nous_subscription_features(

    managed_web_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("firecrawl")
    managed_image_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("fal-queue")
+    # Video gen uses the same fal-queue gateway as image gen.
+    managed_video_available = managed_image_available
    managed_tts_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("openai-audio")
    managed_browser_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("browser-use")
    managed_modal_available = managed_tools_flag and nous_auth_present and is_managed_tool_gateway_ready("modal")
@ -357,6 +369,10 @@ def get_nous_subscription_features(
    image_active = bool(image_tool_enabled and (image_managed or direct_fal))
    image_available = bool(managed_image_available or direct_fal)

+    video_managed = video_tool_enabled and managed_video_available and not direct_fal_video
+    video_active = bool(video_tool_enabled and (video_managed or direct_fal_video))
+    video_available = bool(managed_video_available or direct_fal_video)
+
    tts_current_provider = tts_provider or "edge"
    tts_managed = (
        tts_tool_enabled
@ -451,6 +467,18 @@ def get_nous_subscription_features(
            current_provider="FAL" if direct_fal else ("Nous Subscription" if image_managed else ""),
            explicit_configured=direct_fal,
        ),
+        "video_gen": NousFeatureState(
+            key="video_gen",
+            label="Video generation",
+            included_by_default=False,
+            available=video_available,
+            active=video_active,
+            managed_by_nous=video_managed,
+            direct_override=video_active and not video_managed,
+            toolset_enabled=video_tool_enabled,
+            current_provider="FAL" if direct_fal_video else ("Nous Subscription" if video_managed else ""),
+            explicit_configured=direct_fal_video,
+        ),
        "tts": NousFeatureState(
            key="tts",
            label="OpenAI TTS",
@ -559,8 +587,22 @@ def apply_nous_managed_defaults(
        changed.add("browser")

    if "image_gen" in selected_toolsets and not fal_key_is_configured():
+        image_cfg = config.get("image_gen")
+        if not isinstance(image_cfg, dict):
+            image_cfg = {}
+            config["image_gen"] = image_cfg
+        image_cfg["use_gateway"] = True
        changed.add("image_gen")

+    if "video_gen" in selected_toolsets and not fal_key_is_configured():
+        video_cfg = config.get("video_gen")
+        if not isinstance(video_cfg, dict):
+            video_cfg = {}
+            config["video_gen"] = video_cfg
+        video_cfg["provider"] = "fal"
+        video_cfg["use_gateway"] = True
+        changed.add("video_gen")
+
    return changed


@ -571,6 +613,7 @@ def apply_nous_managed_defaults(
 _GATEWAY_TOOL_LABELS = {
    "web": "Web search & extract (Firecrawl)",
    "image_gen": "Image generation (FAL)",
+    "video_gen": "Video generation (FAL)",
    "tts": "Text-to-speech (OpenAI TTS)",
    "browser": "Browser automation (Browser Use)",
 }
@ -578,6 +621,7 @@ _GATEWAY_TOOL_LABELS = {

 def _get_gateway_direct_credentials() -> Dict[str, bool]:
    """Return a dict of tool_key -> has_direct_credentials."""
+    fal_direct = fal_key_is_configured()
    return {
        "web": bool(
            get_env_value("FIRECRAWL_API_KEY")
@ -586,7 +630,8 @@ def _get_gateway_direct_credentials() -> Dict[str, bool]:
            or get_env_value("TAVILY_API_KEY")
            or get_env_value("EXA_API_KEY")
        ),
-        "image_gen": fal_key_is_configured(),
+        "image_gen": fal_direct,
+        "video_gen": fal_direct,
        "tts": bool(
            resolve_openai_audio_api_key()
            or get_env_value("ELEVENLABS_API_KEY")
@ -601,11 +646,12 @@ def _get_gateway_direct_credentials() -> Dict[str, bool]:
 _GATEWAY_DIRECT_LABELS = {
    "web": "Firecrawl/Exa/Parallel/Tavily key",
    "image_gen": "FAL key",
+    "video_gen": "FAL key",
    "tts": "OpenAI/ElevenLabs key",
    "browser": "Browser Use/Browserbase key",
 }

-_ALL_GATEWAY_KEYS = ("web", "image_gen", "tts", "browser")
+_ALL_GATEWAY_KEYS = ("web", "image_gen", "video_gen", "tts", "browser")


 def get_gateway_eligible_tools(
@ -646,6 +692,7 @@ def get_gateway_eligible_tools(
    opted_in = {
        "web": _uses_gateway(config.get("web")),
        "image_gen": _uses_gateway(config.get("image_gen")),
+        "video_gen": _uses_gateway(config.get("video_gen")),
        "tts": _uses_gateway(config.get("tts")),
        "browser": _uses_gateway(config.get("browser")),
    }
@ -714,6 +761,15 @@ def apply_gateway_defaults(
        image_cfg["use_gateway"] = True
        changed.add("image_gen")

+    if "video_gen" in tool_keys:
+        video_cfg = config.get("video_gen")
+        if not isinstance(video_cfg, dict):
+            video_cfg = {}
+            config["video_gen"] = video_cfg
+        video_cfg["provider"] = "fal"
+        video_cfg["use_gateway"] = True
+        changed.add("video_gen")
+
    return changed


--- a/hermes_cli/oneshot.py
+++ b/hermes_cli/oneshot.py
@ -174,28 +174,55 @@ def run_oneshot(
    # Redirect stderr AND stdout to devnull for the entire call tree.
    # We'll print the final response to the real stdout at the end.
    real_stdout = sys.stdout
+    real_stderr = sys.stderr
    devnull = open(os.devnull, "w", encoding="utf-8")

+    response: Optional[str] = None
+    failure: BaseException | None = None
    try:
        with redirect_stdout(devnull), redirect_stderr(devnull):
-            response = _run_agent(
-                prompt,
-                model=model,
-                provider=provider,
-                toolsets=explicit_toolsets,
-                use_config_toolsets=use_config_toolsets,
-            )
+            try:
+                response = _run_agent(
+                    prompt,
+                    model=model,
+                    provider=provider,
+                    toolsets=explicit_toolsets,
+                    use_config_toolsets=use_config_toolsets,
+                )
+            except BaseException as exc:  # noqa: BLE001
+                # Capture anything that escapes the agent (including OSError
+                # from prompt_toolkit/Vt100 when stdout is a non-TTY pipe,
+                # KeyboardInterrupt, SystemExit, etc.) so we can surface it on
+                # the real stderr instead of crashing past the redirect with a
+                # traceback that the caller never sees. A silent exit in a
+                # cron / SSH / subprocess context is the worst failure mode.
+                # See #30623.
+                failure = exc
    finally:
        try:
            devnull.close()
        except Exception:
            pass

-    if response:
-        real_stdout.write(response)
-        if not response.endswith("\n"):
-            real_stdout.write("\n")
-        real_stdout.flush()
+    if failure is not None:
+        # Re-raise control-flow exceptions so the parent handles them as usual
+        # (Ctrl-C / explicit sys.exit() inside the agent).
+        if isinstance(failure, (KeyboardInterrupt, SystemExit)):
+            raise failure
+        real_stderr.write(f"hermes -z: agent failed: {failure}\n")
+        real_stderr.flush()
+        return 1
+
+    if not (response or "").strip():
+        real_stderr.write("hermes -z: no final response was produced; treating the run as failed.\n")
+        real_stderr.flush()
+        return 1
+
+    assert response is not None  # narrowed by the empty-response guard above
+    real_stdout.write(response)
+    if not response.endswith("\n"):
+        real_stdout.write("\n")
+    real_stdout.flush()
    return 0


--- a/hermes_cli/partial_compress.py
+++ b/hermes_cli/partial_compress.py
@ -0,0 +1,235 @@
+"""Boundary-aware partial compression — "summarize up to here".
+
+Inspired by Claude Code's Rewind menu "Summarize up to here" action
+(v2.1.139–v2.1.142, Week 20, May 2026):
+https://code.claude.com/docs/en/whats-new/2026-w20
+
+Hermes already has ``/compress`` (full-history compaction) and an
+automatic token-budget tail-protection heuristic inside
+``ContextCompressor``. What was missing is *user-chosen* boundary
+control: "fold everything before this point into a summary, but keep
+my most recent N exchanges exactly as they are." That is the value of
+the Claude Code feature — the user decides the compression boundary
+instead of leaving it to the token-budget heuristic.
+
+This module owns the pure, side-effect-free split logic so both the
+CLI (``cli.py::_manual_compress``) and the gateway
+(``gateway/run.py::_handle_compress_command``) share one
+implementation. The slash-command surfaces handle compression of the
+*head* via the existing ``_compress_context`` pipeline (preserving all
+the session-rotation / lock / memory-notify machinery) and then
+re-append the verbatim *tail* returned here.
+
+Design notes / invariants honored:
+
+* **Role alternation.** The compressed head ends with summary/handoff
+  content (assistant- or user-role, possibly a trailing todo snapshot).
+  The verbatim tail must begin with a ``user`` message so the rejoined
+  history keeps the user↔assistant alternation that providers validate.
+  :func:`split_history_for_partial_compress` snaps the tail boundary
+  backwards to the nearest ``user`` turn so the rejoin is always legal.
+
+* **No silent context mutation.** This is a manual, user-invoked
+  action. It rotates the session exactly like ``/compress`` does (via
+  the caller), so the prompt-cache reset is explicit and expected, not
+  silent.
+
+* **Conservative defaults.** ``keep_last`` counts *exchanges* (a user
+  turn plus its following assistant/tool turns), defaulting to 2. The
+  split never compresses if doing so would leave nothing in the head.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Tuple
+
+#: Default number of recent exchanges to preserve verbatim when the user
+#: runs ``/compress here`` without an explicit count.
+DEFAULT_KEEP_LAST = 2
+
+#: Hard ceiling so a fat-fingered ``/compress here 9999`` doesn't turn
+#: into a no-op surprise — clamp instead.
+MAX_KEEP_LAST = 100
+
+
+def parse_partial_compress_args(
+    raw_args: str,
+) -> Tuple[bool, int, Optional[str]]:
+    """Parse the argument string after ``/compress``.
+
+    Recognizes the boundary-aware forms (case-insensitive, matched on
+    whitespace-separated tokens):
+
+    * ``here``            → partial compress, keep ``DEFAULT_KEEP_LAST``
+    * ``here 4``          → partial compress, keep 4 exchanges
+    * ``--keep 4``        → partial compress, keep 4 exchanges
+                            (``-k 4`` and ``--keep=4`` are equivalent)
+    * ``--keep``          → partial compress, keep ``DEFAULT_KEEP_LAST``
+    * ``up to here``      → alias for ``here`` (matches Claude Code's
+                            menu label "Summarize up to here")
+
+    Keep counts are normalized by :func:`_coerce_keep`.
+
+    Anything else is treated as a focus topic for the existing full
+    ``/compress <focus>`` behavior and is returned exactly as typed
+    (minus surrounding whitespace).
+
+    Returns ``(partial, keep_last, focus_topic)``:
+
+    * ``partial`` — True when a boundary-aware form was requested.
+    * ``keep_last`` — exchanges to preserve verbatim (only meaningful
+      when ``partial`` is True).
+    * ``focus_topic`` — focus string for full compression, or None.
+      Always None when ``partial`` is True (the two modes are exclusive;
+      a focused partial compress is not a documented Claude Code
+      behavior and would muddy the UX).
+    """
+    text = (raw_args or "").strip()
+    if not text:
+        return False, DEFAULT_KEEP_LAST, None
+
+    tokens = text.lower().split()
+
+    # Normalize the "up to here" alias to "here". The match is on whole
+    # tokens, and ``text`` is never rewritten, so a focus topic that only
+    # *begins* with those characters (e.g. "up to hereafter") reaches the
+    # caller intact, while extra whitespace between the words is tolerated.
+    if tokens[:3] == ["up", "to", "here"]:
+        tokens = tokens[2:]
+
+    first = tokens[0] if tokens else ""
+    has_count = len(tokens) >= 2
+
+    # Form: here [N]
+    if first == "here":
+        keep = _coerce_keep(tokens[1]) if has_count else DEFAULT_KEEP_LAST
+        return True, keep, None
+
+    # Form: --keep [N] / -k [N]. A missing count falls back to the default,
+    # mirroring bare ``here`` and ``--keep=`` instead of silently turning
+    # the flag itself into a focus topic.
+    if first in ("--keep", "-k"):
+        keep = _coerce_keep(tokens[1]) if has_count else DEFAULT_KEEP_LAST
+        return True, keep, None
+
+    # Form: --keep=N
+    if first.startswith("--keep="):
+        return True, _coerce_keep(first.split("=", 1)[1]), None
+
+    # Otherwise: full compression with this as the focus topic.
+    return False, DEFAULT_KEEP_LAST, text or None
+
+
+def _coerce_keep(value: str) -> int:
+    """Turn a keep-count token into an int within [1, MAX_KEEP_LAST].
+
+    A token that ``int()`` cannot parse yields ``DEFAULT_KEEP_LAST``;
+    parsed values outside the range are clamped to the nearest bound.
+    """
+    try:
+        parsed = int(value)
+    except (TypeError, ValueError):
+        return DEFAULT_KEEP_LAST
+    # Clamp from above first, then from below.
+    return max(1, min(parsed, MAX_KEEP_LAST))
+
+
+def split_history_for_partial_compress(
+    history: List[Dict[str, Any]],
+    keep_last: int,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Cut ``history`` into a ``(head, tail)`` pair for partial compression.
+
+    ``tail`` holds the most recent ``keep_last`` exchanges and is kept
+    verbatim; ``head`` is everything before it and is what the caller
+    summarizes.
+
+    Exchanges are counted by ``user``-role messages: the tail starts at
+    the ``keep_last``-th most recent ``user`` message (or at the earliest
+    ``user`` message when fewer exist), so the tail always opens on a
+    ``user`` turn. ``keep_last`` values below 1 are treated as 1.
+
+    Returns ``(head, tail)``. Two situations return a copy of the whole
+    history with an empty tail, which tells the caller to fall back to
+    full compression or report "nothing to do":
+
+    * there is no ``user`` message at all, or
+    * the tail would begin at index 0, leaving nothing to summarize.
+
+    An empty ``history`` returns ``([], [])``.
+    """
+    keep_last = max(keep_last, 1)
+
+    if not history:
+        return [], []
+
+    # Scan from the newest message towards the oldest, remembering the
+    # position of each user turn until enough of them have been seen.
+    # ``boundary`` therefore ends up at the earliest user turn we keep.
+    boundary: Optional[int] = None
+    still_needed = keep_last
+    cursor = len(history)
+    while cursor > 0 and still_needed > 0:
+        cursor -= 1
+        if history[cursor].get("role") == "user":
+            boundary = cursor
+            still_needed -= 1
+
+    # ``None`` → no user turns (degenerate history); ``0`` → the tail would
+    # swallow everything. Either way a partial compress would only rotate
+    # the session for no benefit, so hand the full history back instead.
+    if boundary is None or boundary == 0:
+        return list(history), []
+
+    return history[:boundary], history[boundary:]
+
+
+def rejoin_compressed_head_and_tail(
+    compressed_head: List[Dict[str, Any]],
+    tail: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Join a compressed head to the verbatim tail without producing a
+    user→user or assistant→assistant adjacency at the seam.
+
+    The head compressor's output shape is not contractually guaranteed
+    (a plugin context engine could end on a ``user`` turn, or return a
+    single summary message), so the seam is inspected rather than
+    trusted. When the last head message and the first tail message share
+    the same ``user``/``assistant`` role:
+
+    * if both contents are plain strings, the tail's first message text
+      is appended to the head's last message (joined by a blank line) and
+      the now-redundant tail message is removed;
+    * otherwise (structured/multimodal content) both messages are kept
+      and an empty-content turn of the opposite role is inserted between
+      them, so no data is lost.
+
+    Any other role pairing — including consecutive ``tool`` messages,
+    which are a legal repetition (parallel tool results) — is joined
+    unchanged. If either input is empty, a copy of the other is returned.
+    Neither input list, nor any message dict in them, is mutated.
+    """
+    if not tail:
+        return list(compressed_head)
+    if not compressed_head:
+        return list(tail)
+
+    joined_head = list(compressed_head)
+    remaining_tail = list(tail)
+
+    seam_left = joined_head[-1]
+    seam_right = remaining_tail[0]
+    left_role = seam_left.get("role")
+    right_role = seam_right.get("role")
+
+    # Legal seam: the roles differ, or the repeated role is not one that
+    # must alternate.
+    if left_role != right_role or left_role not in ("user", "assistant"):
+        return joined_head + remaining_tail
+
+    left_content = seam_left.get("content")
+    right_content = seam_right.get("content")
+
+    if isinstance(left_content, str) and isinstance(right_content, str):
+        # Both sides are plain text: fold the tail's opener into a copy of
+        # the head's last message and drop the standalone duplicate turn.
+        joined_head[-1] = {
+            **seam_left,
+            "content": f"{left_content}\n\n{right_content}",
+        }
+        return joined_head + remaining_tail[1:]
+
+    # Structured/multimodal content cannot be string-merged safely. Keep
+    # both messages and separate them with a minimal bridging turn of the
+    # opposite role so the seam alternates.
+    bridge_role = "user" if left_role == "assistant" else "assistant"
+    joined_head.append({"role": bridge_role, "content": ""})
+    return joined_head + remaining_tail
--- a/hermes_cli/plugins.py
+++ b/hermes_cli/plugins.py
@ -34,7 +34,6 @@ so plugin-defined tools appear alongside the built-in tools.
 from __future__ import annotations

 import asyncio
-import importlib
 import importlib.metadata
 import importlib.util
 import inspect
--- a/hermes_cli/plugins_cmd.py
+++ b/hermes_cli/plugins_cmd.py
@ -10,6 +10,7 @@ rendered with Rich Markdown.  Otherwise a default confirmation is shown.
 from __future__ import annotations

 import functools
+import json
 import logging
 import os
 import shutil
@ -810,7 +811,29 @@ def _discover_all_plugins() -> list:
    return list(seen.values())


-def cmd_list() -> None:
+def _plugin_status(name: str, enabled: set, disabled: set) -> str:
+    """Map a plugin name to its user-facing activation state.
+
+    Returns ``"disabled"`` when the name is in ``disabled`` (this wins
+    even if the name is also in ``enabled``), ``"enabled"`` when it is
+    only in ``enabled``, and ``"not enabled"`` otherwise.
+    """
+    if name in disabled:
+        return "disabled"
+    return "enabled" if name in enabled else "not enabled"
+
+
+def _filter_plugin_entries(entries: list, args: Any, enabled: set, disabled: set) -> list:
+    """Narrow plugin entries according to ``hermes plugins list`` flags.
+
+    Flags are read from ``args`` with ``getattr`` and default to off, so
+    ``args`` may be ``None``:
+
+    * ``no_bundled`` or ``user`` — drop entries whose source (index 3)
+      is ``"bundled"``.
+    * ``enabled`` — keep only entries whose name (index 0) has status
+      ``"enabled"`` according to :func:`_plugin_status`.
+
+    With no flag set, ``entries`` is returned as-is (same object).
+    """
+    result = entries
+
+    hide_bundled = getattr(args, "no_bundled", False) or getattr(args, "user", False)
+    if hide_bundled:
+        result = [row for row in result if row[3] != "bundled"]
+
+    if getattr(args, "enabled", False):
+        kept = []
+        for row in result:
+            if _plugin_status(row[0], enabled, disabled) == "enabled":
+                kept.append(row)
+        result = kept
+
+    return result
+
+
+def cmd_list(args: Any | None = None) -> None:
    """List all plugins (bundled + user) with enabled/disabled state."""
    from rich.console import Console
    from rich.table import Table
@ -824,6 +847,31 @@ def cmd_list() -> None:

    enabled = _get_enabled_set()
    disabled = _get_disabled_set()
+    entries = _filter_plugin_entries(entries, args, enabled, disabled)
+
+    if getattr(args, "json", False):
+        payload = [
+            {
+                "name": name,
+                "status": _plugin_status(name, enabled, disabled),
+                "version": str(version),
+                "description": description,
+                "source": source,
+            }
+            for name, version, description, source, _dir in entries
+        ]
+        print(json.dumps(payload, indent=2))
+        return
+
+    if getattr(args, "plain", False):
+        for name, version, _description, source, _dir in entries:
+            status = _plugin_status(name, enabled, disabled)
+            print(f"{status:12} {source:8} {str(version):8} {name}")
+        return
+
+    if not entries:
+        console.print("[dim]No plugins matched the selected filters.[/dim]")
+        return

    table = Table(title="Plugins", show_lines=False)
    table.add_column("Name", style="bold")
@ -833,9 +881,10 @@ def cmd_list() -> None:
    table.add_column("Source", style="dim")

    for name, version, description, source, _dir in entries:
-        if name in disabled:
+        status_name = _plugin_status(name, enabled, disabled)
+        if status_name == "disabled":
            status = "[red]disabled[/red]"
-        elif name in enabled:
+        elif status_name == "enabled":
            status = "[green]enabled[/green]"
        else:
            status = "[yellow]not enabled[/yellow]"
@ -844,6 +893,7 @@ def cmd_list() -> None:
    console.print()
    console.print(table)
    console.print()
+    console.print("[dim]Compact view:[/dim] hermes plugins list --plain --no-bundled")
    console.print("[dim]Interactive toggle:[/dim] hermes plugins")
    console.print("[dim]Enable/disable:[/dim] hermes plugins enable/disable <name>")
    console.print("[dim]Plugins are opt-in by default — only 'enabled' plugins load.[/dim]")
@ -1110,7 +1160,7 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
                stdscr.addnstr(0, 0, "Plugins", max_x - 1, hattr)
                stdscr.addnstr(
                    1, 0,
-                    "  \u2191\u2193 navigate  SPACE toggle  ENTER configure/confirm  ESC done",
+                    "  ↑↓/j/k navigate  PgUp/PgDn page  SPACE toggle  ENTER configure/confirm  ESC done",
                    max_x - 1, curses.A_DIM,
                )
            except curses.error:
@ -1150,7 +1200,9 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
                        pass
                    y += 1

-                for i in range(n_plugins):
+                plugin_start = scroll_offset
+                plugin_stop = min(n_plugins, scroll_offset + max(visible_rows, 0))
+                for i in range(plugin_start, plugin_stop):
                    if y >= max_y - 1:
                        break
                    check = "\u2713" if i in chosen else " "
@ -1208,6 +1260,16 @@ def _run_composite_ui(curses, plugin_names, plugin_labels, plugin_selected,
            elif key in {curses.KEY_DOWN, ord("j")}:
                if total_items > 0:
                    cursor = (cursor + 1) % total_items
+            elif key in {curses.KEY_NPAGE, ord("f")}:
+                if total_items > 0:
+                    cursor = min(total_items - 1, cursor + max(1, max_y - 5))
+            elif key in {curses.KEY_PPAGE, ord("b")}:
+                if total_items > 0:
+                    cursor = max(0, cursor - max(1, max_y - 5))
+            elif key == curses.KEY_HOME:
+                cursor = 0
+            elif key == curses.KEY_END:
+                cursor = max(0, total_items - 1)
            elif key == ord(" "):
                if cursor < n_plugins:
                    # Toggle general plugin
@ -1649,7 +1711,7 @@ def plugins_command(args) -> None:
    elif action == "disable":
        cmd_disable(args.name)
    elif action in {"list", "ls"}:
-        cmd_list()
+        cmd_list(args)
    elif action is None:
        cmd_toggle()
    else:
--- a/Show more
+++ b/Show more