fix(moa): restore prompt caching for the aggregator and advisors (#57675)

Two caching holes made MoA re-bill essentially its entire input stream:

1. AGGREGATOR: anthropic_prompt_cache_policy() judged the agent's own
   model/provider — on the MoA path those are the virtual preset name and
   'moa', which match no caching branch, so _use_prompt_caching was False
   and the acting aggregator (Claude on OpenRouter) ran with ZERO
   cache_control breakpoints. Measured on identical opus-4.8 sessions:
   85% cache share solo vs 2% via MoA — ~30M re-billed input tokens on one
   132-task benchmark run. Fix: when provider == 'moa', resolve the policy
   from the preset's real aggregator slot (provider/model/base_url/api_mode
   via resolve_runtime_provider).

2. ADVISORS: _run_reference never applied cache_control at all, and
   Anthropic caching is opt-in per request — Claude advisors served 0
   cache reads across 1,227 benchmark calls (11.5M re-billed input tokens)
   even though the advisory view is append-only across iterations (stable
   prefix; the synthetic end marker is last so it never pollutes it). Fix:
   _maybe_apply_advisor_cache_control() reuses the SAME policy function and
   SAME system_and_3 layout as the main loop, judged on the advisor slot's
   own resolved runtime — advisor requests are now decorated exactly like
   an acting agent on that provider. Auto-caching routes (OpenAI-family)
   are left untouched by policy.

Live-verified on the wire (per-iteration opus+gpt5.5 preset, 4 fan-outs):
the Claude advisor wrote the cache on fan-outs 2-3 (cache_write=2161/2344)
and read it back on fan-out 4 (cache_read=2206, fresh_in=2); aggregator
session cache share 84%/77% (vs 2%/0% before). Sub-1024-token prompts
correctly stay uncached (Anthropic minimum).
This commit is contained in:
Teknium 2026-07-03 04:08:48 -07:00 committed by GitHub
parent 87ae4ae94b
commit 22c5048d9c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 95 additions and 0 deletions

View file

@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy(
eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
eff_model = (model if model is not None else agent.model) or ""
# MoA virtual provider: the agent's model/provider are the preset name and
# "moa" — neither matches any caching branch, so the ACTING AGGREGATOR
# (often Claude on OpenRouter) silently lost prompt caching entirely
# (measured: 85% cache share solo vs 2% on the identical model via MoA —
# tens of millions of re-billed input tokens per benchmark run). Resolve
# the policy from the preset's real aggregator slot instead.
if eff_provider.strip().lower() == "moa":
try:
from hermes_cli.config import load_config as _load_moa_cfg
from hermes_cli.moa_config import resolve_moa_preset
from hermes_cli.runtime_provider import resolve_runtime_provider
_preset = resolve_moa_preset(
_load_moa_cfg().get("moa") or {}, eff_model or None
)
_agg = _preset.get("aggregator") or {}
_agg_provider = str(_agg.get("provider") or "").strip()
_agg_model = str(_agg.get("model") or "").strip()
if _agg_provider and _agg_model:
_agg_base_url = ""
_agg_api_mode = ""
try:
_rt = resolve_runtime_provider(
requested=_agg_provider, target_model=_agg_model
)
_agg_base_url = _rt.get("base_url") or ""
_agg_api_mode = _rt.get("api_mode") or ""
except Exception:
pass
return anthropic_prompt_cache_policy(
agent,
provider=_agg_provider,
base_url=_agg_base_url,
api_mode=_agg_api_mode,
model=_agg_model,
)
except Exception as _moa_exc: # pragma: no cover - defensive
logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc)
return False, False
model_lower = eff_model.lower()
provider_lower = eff_provider.lower()
is_claude = "claude" in model_lower

View file

@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
return out
def _maybe_apply_advisor_cache_control(
    messages: list[dict[str, Any]],
    runtime: dict[str, Any],
) -> list[dict[str, Any]]:
    """Add cache_control markers to an advisor request if its route uses them.

    The decision is delegated to ``anthropic_prompt_cache_policy`` -- the
    same policy function the main agent loop uses -- evaluated against the
    advisor slot's own provider / base_url / api_mode / model. When the
    policy approves, the markers are placed by
    ``apply_anthropic_cache_control`` (the ``system_and_3`` layout), so
    advisor requests are decorated the same way an acting agent on that
    provider would be and there is no MoA-specific caching logic to drift.

    Args:
        messages: The advisor request messages.
        runtime: The advisor slot's resolved runtime. The ``provider``,
            ``base_url``, ``api_mode`` and ``model`` keys are read; a
            missing or falsy value is passed to the policy as ``""``.

    Returns:
        The result of ``apply_anthropic_cache_control`` when the policy
        enables caching; otherwise ``messages`` unchanged. Any exception
        raised along the way is logged at debug level and ``messages`` is
        returned unchanged -- decoration must never break a call.
    """
    try:
        from types import SimpleNamespace

        from agent.agent_runtime_helpers import anthropic_prompt_cache_policy
        from agent.prompt_caching import apply_anthropic_cache_control

        # The policy only consults agent.* as a fallback for keyword
        # arguments that were not supplied. Every one is supplied below, so
        # an all-blank placeholder keeps the verdict tied to this slot's
        # resolved runtime alone.
        placeholder_agent = SimpleNamespace(
            provider="", base_url="", api_mode="", model=""
        )
        slot_overrides = {
            field: runtime.get(field) or ""
            for field in ("provider", "base_url", "api_mode", "model")
        }
        cache_enabled, use_native_layout = anthropic_prompt_cache_policy(
            placeholder_agent, **slot_overrides
        )
        if cache_enabled:
            return apply_anthropic_cache_control(
                messages, native_anthropic=use_native_layout
            )
        return messages
    except Exception as exc:  # pragma: no cover - decoration must never break a call
        logger.debug("advisor cache_control decoration skipped: %s", exc)
        return messages
def _run_reference(
slot: dict[str, str],
ref_messages: list[dict[str, Any]],
@ -214,6 +257,18 @@ def _run_reference(
# trimmed view (_reference_messages) already strips the agent's own
# system prompt, so this is the only system message the reference sees.
messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
# Apply the same Anthropic-style prompt-caching decoration the main
# agent loop applies (system_and_3 breakpoints). The advisory view is
# append-only across iterations (new turns append before the trailing
# synthetic marker), so on cache-honoring routes (Claude via
# OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix
# replays iteration N's cached prefix. Without this, Claude advisors
# served ZERO cache reads across an entire benchmark run (measured:
# 0/1227 calls, 11.5M re-billed input tokens) because Anthropic
# caching is opt-in per request. OpenAI-family advisors are untouched
# (their caching is automatic; markers are ignored harmlessly, but we
# only decorate when the policy says the route honors them).
messages = _maybe_apply_advisor_cache_control(messages, runtime)
response = call_llm(
task="moa_reference",
messages=messages,