fix(moa): restore prompt caching for the aggregator and advisors (#57675)
Two caching holes made MoA re-bill essentially its entire input stream:

1. AGGREGATOR: anthropic_prompt_cache_policy() judged the agent's own model/provider — on the MoA path those are the virtual preset name and 'moa', which match no caching branch, so _use_prompt_caching was False and the acting aggregator (Claude on OpenRouter) ran with ZERO cache_control breakpoints. Measured on identical opus-4.8 sessions: 85% cache share solo vs 2% via MoA — ~30M re-billed input tokens on one 132-task benchmark run.
   Fix: when provider == 'moa', resolve the policy from the preset's real aggregator slot (provider/model/base_url/api_mode via resolve_runtime_provider).

2. ADVISORS: _run_reference never applied cache_control at all, and Anthropic caching is opt-in per request — Claude advisors served 0 cache reads across 1,227 benchmark calls (11.5M re-billed input tokens) even though the advisory view is append-only across iterations (stable prefix; the synthetic end marker is last so it never pollutes it).
   Fix: _maybe_apply_advisor_cache_control() reuses the SAME policy function and SAME system_and_3 layout as the main loop, judged on the advisor slot's own resolved runtime — advisor requests are now decorated exactly like an acting agent on that provider. Auto-caching routes (OpenAI-family) are left untouched by policy.

Live-verified on the wire (per-iteration opus+gpt5.5 preset, 4 fan-outs): claude advisor fan-out 2-3 cache_write=2161/2344, fan-out 4 cache_read=2206 / fresh_in=2; aggregator session cache share 84%/77% (vs 2%/0% before). Sub-1024-token prompts correctly stay uncached (Anthropic minimum).
This commit is contained in:
parent
87ae4ae94b
commit
22c5048d9c
2 changed files with 95 additions and 0 deletions
|
|
@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy(
|
|||
eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
|
||||
eff_model = (model if model is not None else agent.model) or ""
|
||||
|
||||
# MoA virtual provider: the agent's model/provider are the preset name and
|
||||
# "moa" — neither matches any caching branch, so the ACTING AGGREGATOR
|
||||
# (often Claude on OpenRouter) silently lost prompt caching entirely
|
||||
# (measured: 85% cache share solo vs 2% on the identical model via MoA —
|
||||
# tens of millions of re-billed input tokens per benchmark run). Resolve
|
||||
# the policy from the preset's real aggregator slot instead.
|
||||
if eff_provider.strip().lower() == "moa":
|
||||
try:
|
||||
from hermes_cli.config import load_config as _load_moa_cfg
|
||||
from hermes_cli.moa_config import resolve_moa_preset
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||
|
||||
_preset = resolve_moa_preset(
|
||||
_load_moa_cfg().get("moa") or {}, eff_model or None
|
||||
)
|
||||
_agg = _preset.get("aggregator") or {}
|
||||
_agg_provider = str(_agg.get("provider") or "").strip()
|
||||
_agg_model = str(_agg.get("model") or "").strip()
|
||||
if _agg_provider and _agg_model:
|
||||
_agg_base_url = ""
|
||||
_agg_api_mode = ""
|
||||
try:
|
||||
_rt = resolve_runtime_provider(
|
||||
requested=_agg_provider, target_model=_agg_model
|
||||
)
|
||||
_agg_base_url = _rt.get("base_url") or ""
|
||||
_agg_api_mode = _rt.get("api_mode") or ""
|
||||
except Exception:
|
||||
pass
|
||||
return anthropic_prompt_cache_policy(
|
||||
agent,
|
||||
provider=_agg_provider,
|
||||
base_url=_agg_base_url,
|
||||
api_mode=_agg_api_mode,
|
||||
model=_agg_model,
|
||||
)
|
||||
except Exception as _moa_exc: # pragma: no cover - defensive
|
||||
logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc)
|
||||
return False, False
|
||||
|
||||
model_lower = eff_model.lower()
|
||||
provider_lower = eff_provider.lower()
|
||||
is_claude = "claude" in model_lower
|
||||
|
|
|
|||
|
|
@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
|
|||
return out
|
||||
|
||||
|
||||
def _maybe_apply_advisor_cache_control(
    messages: list[dict[str, Any]],
    runtime: dict[str, Any],
) -> list[dict[str, Any]]:
    """Add cache_control markers to an advisor request if its route supports them.

    The decision is delegated to ``anthropic_prompt_cache_policy`` (the policy
    function the main agent loop uses), evaluated against the advisor slot's
    own ``provider`` / ``base_url`` / ``api_mode`` / ``model`` taken from
    *runtime*. When the policy enables caching, the breakpoints are placed by
    ``apply_anthropic_cache_control`` (the system_and_3 layout), so an advisor
    call is decorated the same way an acting agent on that provider would be
    and no MoA-specific caching rules exist that could drift.

    Args:
        messages: The advisor request messages.
        runtime: The advisor slot's resolved runtime; missing or falsy
            entries are passed to the policy as empty strings.

    Returns:
        The result of ``apply_anthropic_cache_control`` when the policy
        enables caching; otherwise *messages* itself. Any exception raised
        while resolving the policy or decorating is logged at debug level and
        *messages* is returned unchanged.
    """
    try:
        from types import SimpleNamespace

        from agent.agent_runtime_helpers import anthropic_prompt_cache_policy
        from agent.prompt_caching import apply_anthropic_cache_control

        policy_fields = ("provider", "base_url", "api_mode", "model")

        # Every field the policy would otherwise read from the agent is passed
        # explicitly, so the agent argument is just a blank placeholder and
        # the slot is judged purely on its own resolved runtime.
        slot_kwargs = {name: runtime.get(name) or "" for name in policy_fields}
        blank_agent = SimpleNamespace(**{name: "" for name in policy_fields})

        cache_enabled, use_native_layout = anthropic_prompt_cache_policy(
            blank_agent, **slot_kwargs
        )
        if cache_enabled:
            return apply_anthropic_cache_control(
                messages, native_anthropic=use_native_layout
            )
        return messages
    except Exception as err:  # pragma: no cover - decoration must never break a call
        logger.debug("advisor cache_control decoration skipped: %s", err)
        return messages
|
||||
|
||||
|
||||
def _run_reference(
|
||||
slot: dict[str, str],
|
||||
ref_messages: list[dict[str, Any]],
|
||||
|
|
@ -214,6 +257,18 @@ def _run_reference(
|
|||
# trimmed view (_reference_messages) already strips the agent's own
|
||||
# system prompt, so this is the only system message the reference sees.
|
||||
messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
|
||||
# Apply the same Anthropic-style prompt-caching decoration the main
|
||||
# agent loop applies (system_and_3 breakpoints). The advisory view is
|
||||
# append-only across iterations (new turns append before the trailing
|
||||
# synthetic marker), so on cache-honoring routes (Claude via
|
||||
# OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix
|
||||
# replays iteration N's cached prefix. Without this, Claude advisors
|
||||
# served ZERO cache reads across an entire benchmark run (measured:
|
||||
# 0/1227 calls, 11.5M re-billed input tokens) because Anthropic
|
||||
# caching is opt-in per request. OpenAI-family advisors are untouched
|
||||
# (their caching is automatic; markers are ignored harmlessly, but we
|
||||
# only decorate when the policy says the route honors them).
|
||||
messages = _maybe_apply_advisor_cache_control(messages, runtime)
|
||||
response = call_llm(
|
||||
task="moa_reference",
|
||||
messages=messages,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue