From 22c5048d9c6a3d6e3d6c786ef014a0998ca2a0c3 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 3 Jul 2026 04:08:48 -0700 Subject: [PATCH] fix(moa): restore prompt caching for the aggregator and advisors (#57675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two caching holes made MoA re-bill essentially its entire input stream: 1. AGGREGATOR: anthropic_prompt_cache_policy() judged the agent's own model/provider — on the MoA path those are the virtual preset name and 'moa', which match no caching branch, so _use_prompt_caching was False and the acting aggregator (Claude on OpenRouter) ran with ZERO cache_control breakpoints. Measured on identical opus-4.8 sessions: 85% cache share solo vs 2% via MoA — ~30M re-billed input tokens on one 132-task benchmark run. Fix: when provider == 'moa', resolve the policy from the preset's real aggregator slot (provider/model/base_url/api_mode via resolve_runtime_provider). 2. ADVISORS: _run_reference never applied cache_control at all, and Anthropic caching is opt-in per request — Claude advisors served 0 cache reads across 1,227 benchmark calls (11.5M re-billed input tokens) even though the advisory view is append-only across iterations (stable prefix; the synthetic end marker is last so it never pollutes it). Fix: _maybe_apply_advisor_cache_control() reuses the SAME policy function and SAME system_and_3 layout as the main loop, judged on the advisor slot's own resolved runtime — advisor requests are now decorated exactly like an acting agent on that provider. Auto-caching routes (OpenAI-family) are left untouched by policy. Live-verified on the wire (per-iteration opus+gpt5.5 preset, 4 fan-outs): claude advisor fan-out 2-3 cache_write=2161/2344, fan-out 4 cache_read=2206 / fresh_in=2; aggregator session cache share 84%/77% (vs 2%/0% before). Sub-1024-token prompts correctly stay uncached (Anthropic minimum). --- agent/agent_runtime_helpers.py | 40 +++++++++++++++++++++++++ agent/moa_loop.py | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index 18ed3102c..08c0052bc 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy( eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "") eff_model = (model if model is not None else agent.model) or "" + # MoA virtual provider: the agent's model/provider are the preset name and + # "moa" — neither matches any caching branch, so the ACTING AGGREGATOR + # (often Claude on OpenRouter) silently lost prompt caching entirely + # (measured: 85% cache share solo vs 2% on the identical model via MoA — + # tens of millions of re-billed input tokens per benchmark run). Resolve + # the policy from the preset's real aggregator slot instead. + if eff_provider.strip().lower() == "moa": + try: + from hermes_cli.config import load_config as _load_moa_cfg + from hermes_cli.moa_config import resolve_moa_preset + from hermes_cli.runtime_provider import resolve_runtime_provider + + _preset = resolve_moa_preset( + _load_moa_cfg().get("moa") or {}, eff_model or None + ) + _agg = _preset.get("aggregator") or {} + _agg_provider = str(_agg.get("provider") or "").strip() + _agg_model = str(_agg.get("model") or "").strip() + if _agg_provider and _agg_model: + _agg_base_url = "" + _agg_api_mode = "" + try: + _rt = resolve_runtime_provider( + requested=_agg_provider, target_model=_agg_model + ) + _agg_base_url = _rt.get("base_url") or "" + _agg_api_mode = _rt.get("api_mode") or "" + except Exception: + pass + return anthropic_prompt_cache_policy( + agent, + provider=_agg_provider, + base_url=_agg_base_url, + api_mode=_agg_api_mode, + model=_agg_model, + ) + except Exception as _moa_exc: # pragma: no cover - defensive + logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc) + return False, False + model_lower = eff_model.lower() provider_lower = eff_provider.lower() is_claude = "claude" in model_lower diff --git a/agent/moa_loop.py b/agent/moa_loop.py index 439698444..ccaebda8f 100644 --- a/agent/moa_loop.py +++ b/agent/moa_loop.py @@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]: return out +def _maybe_apply_advisor_cache_control( + messages: list[dict[str, Any]], + runtime: dict[str, Any], +) -> list[dict[str, Any]]: + """Decorate an advisor request with cache_control when its route honors it. + + Reuses the SAME policy function as the main agent loop + (``anthropic_prompt_cache_policy``) resolved against the advisor slot's + own provider/base_url/api_mode/model, and the SAME breakpoint layout + (``apply_anthropic_cache_control``, system_and_3). This keeps advisor + calls decorated exactly like an acting agent on that provider would be — + no MoA-specific caching logic to drift. + + Returns the messages unchanged on any resolution error or when the + policy says the route doesn't honor markers. + """ + try: + from types import SimpleNamespace + + from agent.agent_runtime_helpers import anthropic_prompt_cache_policy + from agent.prompt_caching import apply_anthropic_cache_control + + # The policy function reads agent.* only as fallbacks for kwargs we + # don't pass; provide a stub so an advisor slot is judged purely on + # its own resolved runtime. + stub = SimpleNamespace(provider="", base_url="", api_mode="", model="") + should_cache, native_layout = anthropic_prompt_cache_policy( + stub, + provider=runtime.get("provider") or "", + base_url=runtime.get("base_url") or "", + api_mode=runtime.get("api_mode") or "", + model=runtime.get("model") or "", + ) + if not should_cache: + return messages + return apply_anthropic_cache_control( + messages, native_anthropic=native_layout + ) + except Exception as exc: # pragma: no cover - decoration must never break a call + logger.debug("advisor cache_control decoration skipped: %s", exc) + return messages + + def _run_reference( slot: dict[str, str], ref_messages: list[dict[str, Any]], @@ -214,6 +257,18 @@ def _run_reference( # trimmed view (_reference_messages) already strips the agent's own # system prompt, so this is the only system message the reference sees. messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages] + # Apply the same Anthropic-style prompt-caching decoration the main + # agent loop applies (system_and_3 breakpoints). The advisory view is + # append-only across iterations (new turns append before the trailing + # synthetic marker), so on cache-honoring routes (Claude via + # OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix + # replays iteration N's cached prefix. Without this, Claude advisors + # served ZERO cache reads across an entire benchmark run (measured: + # 0/1227 calls, 11.5M re-billed input tokens) because Anthropic + # caching is opt-in per request. OpenAI-family advisors are untouched + # (their caching is automatic; markers are ignored harmlessly, but we + # only decorate when the policy says the route honors them). + messages = _maybe_apply_advisor_cache_control(messages, runtime) response = call_llm( task="moa_reference", messages=messages,