fix(moa): restore prompt caching for the aggregator and advisors (#57675)

Two caching holes made MoA re-bill essentially its entire input stream:

1. AGGREGATOR: anthropic_prompt_cache_policy() judged the agent's own model/provider — on the MoA path those are the virtual preset name and 'moa', which match no caching branch, so _use_prompt_caching was False and the acting aggregator (Claude on OpenRouter) ran with ZERO cache_control breakpoints. Measured on identical opus-4.8 sessions: 85% cache share solo vs 2% via MoA — ~30M re-billed input tokens on one 132-task benchmark run.
   Fix: when provider == 'moa', resolve the policy from the preset's real aggregator slot (provider/model/base_url/api_mode via resolve_runtime_provider).

2. ADVISORS: _run_reference never applied cache_control at all, and Anthropic caching is opt-in per request — Claude advisors served 0 cache reads across 1,227 benchmark calls (11.5M re-billed input tokens) even though the advisory view is append-only across iterations (stable prefix; the synthetic end marker is last so it never pollutes it).
   Fix: _maybe_apply_advisor_cache_control() reuses the SAME policy function and SAME system_and_3 layout as the main loop, judged on the advisor slot's own resolved runtime — advisor requests are now decorated exactly like an acting agent on that provider. Auto-caching routes (OpenAI-family) are left untouched by policy.

Live-verified on the wire (per-iteration opus+gpt5.5 preset, 4 fan-outs): claude advisor fan-out 2-3 cache_write=2161/2344, fan-out 4 cache_read=2206 / fresh_in=2; aggregator session cache share 84%/77% (vs 2%/0% before). Sub-1024-token prompts correctly stay uncached (Anthropic minimum).
2026-07-03 04:08:48 -07:00 · 2026-07-03 04:08:48 -07:00 · 22c5048d9c
commit 22c5048d9c
parent 87ae4ae94b
2 changed files with 95 additions and 0 deletions
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy(
    eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
    eff_model = (model if model is not None else agent.model) or ""

+    # MoA virtual provider: the agent's model/provider are the preset name and
+    # "moa" — neither matches any caching branch, so the ACTING AGGREGATOR
+    # (often Claude on OpenRouter) silently lost prompt caching entirely
+    # (measured: 85% cache share solo vs 2% on the identical model via MoA —
+    # tens of millions of re-billed input tokens per benchmark run). Resolve
+    # the policy from the preset's real aggregator slot instead.
+    if eff_provider.strip().lower() == "moa":
+        try:
+            from hermes_cli.config import load_config as _load_moa_cfg
+            from hermes_cli.moa_config import resolve_moa_preset
+            from hermes_cli.runtime_provider import resolve_runtime_provider
+
+            _preset = resolve_moa_preset(
+                _load_moa_cfg().get("moa") or {}, eff_model or None
+            )
+            _agg = _preset.get("aggregator") or {}
+            _agg_provider = str(_agg.get("provider") or "").strip()
+            _agg_model = str(_agg.get("model") or "").strip()
+            if _agg_provider and _agg_model:
+                _agg_base_url = ""
+                _agg_api_mode = ""
+                try:
+                    _rt = resolve_runtime_provider(
+                        requested=_agg_provider, target_model=_agg_model
+                    )
+                    _agg_base_url = _rt.get("base_url") or ""
+                    _agg_api_mode = _rt.get("api_mode") or ""
+                except Exception:
+                    pass
+                return anthropic_prompt_cache_policy(
+                    agent,
+                    provider=_agg_provider,
+                    base_url=_agg_base_url,
+                    api_mode=_agg_api_mode,
+                    model=_agg_model,
+                )
+        except Exception as _moa_exc:  # pragma: no cover - defensive
+            logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc)
+        return False, False
+
    model_lower = eff_model.lower()
    provider_lower = eff_provider.lower()
    is_claude = "claude" in model_lower
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
    return out


+def _maybe_apply_advisor_cache_control(
+    messages: list[dict[str, Any]],
+    runtime: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Decorate an advisor request with cache_control when its route honors it.
+
+    The decision is made by ``anthropic_prompt_cache_policy`` and the markers
+    are placed by ``apply_anthropic_cache_control`` (system_and_3 layout) —
+    the same pair the main agent loop uses, so no MoA-specific caching logic
+    can drift. Only ``provider``, ``base_url``, ``api_mode`` and ``model`` are
+    read from ``runtime``; a missing or falsy value is passed as "".
+
+    Returns ``messages`` itself, undecorated, when the policy declines or when
+    anything here raises (failure is logged at debug level); otherwise returns
+    whatever ``apply_anthropic_cache_control`` produces.
+    """
+    try:
+        from types import SimpleNamespace
+
+        from agent.agent_runtime_helpers import anthropic_prompt_cache_policy
+        from agent.prompt_caching import apply_anthropic_cache_control
+
+        # The policy falls back to agent.* only for kwargs left as None. Every
+        # kwarg below is a str, so this blank stub is a placeholder: the slot
+        # is judged on its own runtime (assumes no other agent.* reads).
+        stub = SimpleNamespace(provider="", base_url="", api_mode="", model="")
+        should_cache, native_layout = anthropic_prompt_cache_policy(
+            stub,
+            provider=runtime.get("provider") or "",
+            base_url=runtime.get("base_url") or "",
+            api_mode=runtime.get("api_mode") or "",
+            model=runtime.get("model") or "",
+        )
+        if not should_cache:
+            return messages
+        return apply_anthropic_cache_control(
+            messages, native_anthropic=native_layout
+        )
+    except Exception as exc:  # pragma: no cover - decoration must never break a call
+        logger.debug("advisor cache_control decoration skipped: %s", exc)
+        return messages
+
+
 def _run_reference(
    slot: dict[str, str],
    ref_messages: list[dict[str, Any]],
@@ -214,6 +257,18 @@ def _run_reference(
        # trimmed view (_reference_messages) already strips the agent's own
        # system prompt, so this is the only system message the reference sees.
        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
+        # Apply the same Anthropic-style prompt-caching decoration the main
+        # agent loop applies (system_and_3 breakpoints). The advisory view is
+        # append-only across iterations (new turns append before the trailing
+        # synthetic marker), so on cache-honoring routes (Claude via
+        # OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix
+        # replays iteration N's cached prefix. Without this, Claude advisors
+        # served ZERO cache reads across an entire benchmark run (measured:
+        # 0/1227 calls, 11.5M re-billed input tokens) because Anthropic
+        # caching is opt-in per request. OpenAI-family advisors are untouched
+        # (their caching is automatic; markers are ignored harmlessly, but we
+        # only decorate when the policy says the route honors them).
+        messages = _maybe_apply_advisor_cache_control(messages, runtime)
        response = call_llm(
            task="moa_reference",
            messages=messages,