From 22c5048d9c6a3d6e3d6c786ef014a0998ca2a0c3 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 3 Jul 2026 04:08:48 -0700
Subject: [PATCH] fix(moa): restore prompt caching for the aggregator and
 advisors (#57675)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two caching holes made MoA re-bill essentially its entire input stream:

1. AGGREGATOR: anthropic_prompt_cache_policy() judged the agent's own
   model/provider — on the MoA path those are the virtual preset name and
   'moa', which match no caching branch, so _use_prompt_caching was False
   and the acting aggregator (Claude on OpenRouter) ran with ZERO
   cache_control breakpoints. Measured on identical opus-4.8 sessions:
   85% cache share solo vs 2% via MoA — ~30M re-billed input tokens on one
   132-task benchmark run. Fix: when provider == 'moa', resolve the policy
   from the preset's real aggregator slot (provider/model/base_url/api_mode
   via resolve_runtime_provider).

2. ADVISORS: _run_reference never applied cache_control at all, and
   Anthropic caching is opt-in per request — Claude advisors served 0
   cache reads across 1,227 benchmark calls (11.5M re-billed input tokens)
   even though the advisory view is append-only across iterations (stable
   prefix; the synthetic end marker is last so it never pollutes it). Fix:
   _maybe_apply_advisor_cache_control() reuses the SAME policy function and
   SAME system_and_3 layout as the main loop, judged on the advisor slot's
   own resolved runtime — advisor requests are now decorated exactly like
   an acting agent on that provider. Auto-caching routes (OpenAI-family)
   are left untouched by policy.

Live-verified on the wire (per-iteration opus+gpt5.5 preset, 4 fan-outs):
the Claude advisor reported cache_write=2161/2344 on fan-outs 2-3 and
cache_read=2206 with fresh_in=2 on fan-out 4; the aggregator's session
cache share was 84%/77% (vs 2%/0% before). Prompts under 1,024 tokens
correctly stay uncached (below Anthropic's minimum cacheable length).
---
 agent/agent_runtime_helpers.py | 40 +++++++++++++++++++++++++
 agent/moa_loop.py              | 55 ++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py
index 18ed3102c..08c0052bc 100644
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy(
     eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
     eff_model = (model if model is not None else agent.model) or ""
 
+    # MoA virtual provider: the agent's model/provider are the preset name and
+    # "moa". Neither matches any caching branch below, so without this block
+    # the acting aggregator (often Claude on OpenRouter) silently runs with
+    # no prompt caching at all (measured: 85% cache share solo vs 2% on the
+    # identical model via MoA — tens of millions of re-billed input tokens per
+    # benchmark run). Resolve the policy from the preset's real aggregator slot.
+    if eff_provider.strip().lower() == "moa":
+        try:
+            from hermes_cli.config import load_config as _load_moa_cfg
+            from hermes_cli.moa_config import resolve_moa_preset
+            from hermes_cli.runtime_provider import resolve_runtime_provider
+
+            _preset = resolve_moa_preset(
+                _load_moa_cfg().get("moa") or {}, eff_model or None
+            )
+            _agg = _preset.get("aggregator") or {}
+            _agg_provider = str(_agg.get("provider") or "").strip()
+            _agg_model = str(_agg.get("model") or "").strip()
+            if _agg_provider and _agg_model:
+                _agg_base_url = ""
+                _agg_api_mode = ""
+                try:
+                    _rt = resolve_runtime_provider(
+                        requested=_agg_provider, target_model=_agg_model
+                    )
+                    _agg_base_url = _rt.get("base_url") or ""
+                    _agg_api_mode = _rt.get("api_mode") or ""
+                except Exception:
+                    pass
+                return anthropic_prompt_cache_policy(
+                    agent,
+                    provider=_agg_provider,
+                    base_url=_agg_base_url,
+                    api_mode=_agg_api_mode,
+                    model=_agg_model,
+                )
+        except Exception as _moa_exc:  # pragma: no cover - defensive
+            logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc)
+        return False, False
+
     model_lower = eff_model.lower()
     provider_lower = eff_provider.lower()
     is_claude = "claude" in model_lower
diff --git a/agent/moa_loop.py b/agent/moa_loop.py
index 439698444..ccaebda8f 100644
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
     return out
 
 
+def _maybe_apply_advisor_cache_control(
+    messages: list[dict[str, Any]],
+    runtime: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Decorate an advisor request with cache_control when its route honors it.
+
+    Reuses the SAME policy function as the main agent loop
+    (``anthropic_prompt_cache_policy``), evaluated on the advisor slot's own
+    provider/base_url/api_mode/model from ``runtime``, and the SAME
+    breakpoint layout (``apply_anthropic_cache_control``, system_and_3), so
+    advisor calls are decorated exactly like an acting agent on that provider
+    would be — no MoA-specific caching logic to drift.
+
+    Returns ``messages`` unchanged when the policy says the route doesn't
+    honor markers, or when anything in the resolution or decoration raises.
+    """
+    try:
+        from types import SimpleNamespace
+
+        from agent.agent_runtime_helpers import anthropic_prompt_cache_policy
+        from agent.prompt_caching import apply_anthropic_cache_control
+
+        # The policy function consults agent.* only as a fallback for kwargs
+        # left as None. Every kwarg below is passed non-None (`or ""`), so
+        # this blank stub keeps the advisor judged purely on its own runtime.
+        stub = SimpleNamespace(provider="", base_url="", api_mode="", model="")
+        should_cache, native_layout = anthropic_prompt_cache_policy(
+            stub,
+            provider=runtime.get("provider") or "",
+            base_url=runtime.get("base_url") or "",
+            api_mode=runtime.get("api_mode") or "",
+            model=runtime.get("model") or "",
+        )
+        if not should_cache:
+            return messages
+        return apply_anthropic_cache_control(
+            messages, native_anthropic=native_layout
+        )
+    except Exception as exc:  # pragma: no cover - decoration must never break a call
+        logger.debug("advisor cache_control decoration skipped: %s", exc)
+        return messages
+
+
 def _run_reference(
     slot: dict[str, str],
     ref_messages: list[dict[str, Any]],
@@ -214,6 +257,18 @@ def _run_reference(
         # trimmed view (_reference_messages) already strips the agent's own
         # system prompt, so this is the only system message the reference sees.
         messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
+        # Apply the same Anthropic-style prompt-caching decoration the main
+        # agent loop applies (system_and_3 breakpoints). The advisory view is
+        # append-only across iterations (new turns append before the trailing
+        # synthetic marker), so on cache-honoring routes (Claude via
+        # OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's request
+        # begins with the prefix that iteration N already cached. Without this,
+        # Claude advisors served ZERO cache reads across an entire benchmark
+        # run (measured: 0/1227 calls, 11.5M re-billed input tokens) because
+        # Anthropic caching is opt-in per request. OpenAI-family advisors stay
+        # undecorated: their caching is automatic (markers would be harmlessly
+        # ignored), and we decorate only routes the policy says honor them.
+        messages = _maybe_apply_advisor_cache_control(messages, runtime)
         response = call_llm(
             task="moa_reference",
             messages=messages,