diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index 18ed3102c..08c0052bc 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -1478,6 +1478,46 @@ def anthropic_prompt_cache_policy( eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "") eff_model = (model if model is not None else agent.model) or "" + # MoA virtual provider: the agent's model/provider are the preset name and + # "moa" — neither matches any caching branch, so the ACTING AGGREGATOR + # (often Claude on OpenRouter) silently lost prompt caching entirely + # (measured: 85% cache share solo vs 2% on the identical model via MoA — + # tens of millions of re-billed input tokens per benchmark run). Resolve + # the policy from the preset's real aggregator slot instead. + if eff_provider.strip().lower() == "moa": + try: + from hermes_cli.config import load_config as _load_moa_cfg + from hermes_cli.moa_config import resolve_moa_preset + from hermes_cli.runtime_provider import resolve_runtime_provider + + _preset = resolve_moa_preset( + _load_moa_cfg().get("moa") or {}, eff_model or None + ) + _agg = _preset.get("aggregator") or {} + _agg_provider = str(_agg.get("provider") or "").strip() + _agg_model = str(_agg.get("model") or "").strip() + if _agg_provider and _agg_model: + _agg_base_url = "" + _agg_api_mode = "" + try: + _rt = resolve_runtime_provider( + requested=_agg_provider, target_model=_agg_model + ) + _agg_base_url = _rt.get("base_url") or "" + _agg_api_mode = _rt.get("api_mode") or "" + except Exception: + pass + return anthropic_prompt_cache_policy( + agent, + provider=_agg_provider, + base_url=_agg_base_url, + api_mode=_agg_api_mode, + model=_agg_model, + ) + except Exception as _moa_exc: # pragma: no cover - defensive + logger.debug("MoA aggregator cache-policy resolution failed: %s", _moa_exc) + return False, False + model_lower = eff_model.lower() provider_lower = eff_provider.lower() is_claude = "claude" in model_lower diff --git a/agent/moa_loop.py b/agent/moa_loop.py index 439698444..ccaebda8f 100644 --- a/agent/moa_loop.py +++ b/agent/moa_loop.py @@ -173,6 +173,49 @@ def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]: return out +def _maybe_apply_advisor_cache_control( + messages: list[dict[str, Any]], + runtime: dict[str, Any], +) -> list[dict[str, Any]]: + """Decorate an advisor request with cache_control when its route honors it. + + Reuses the SAME policy function as the main agent loop + (``anthropic_prompt_cache_policy``) resolved against the advisor slot's + own provider/base_url/api_mode/model, and the SAME breakpoint layout + (``apply_anthropic_cache_control``, system_and_3). This keeps advisor + calls decorated exactly like an acting agent on that provider would be — + no MoA-specific caching logic to drift. + + Returns the messages unchanged on any resolution error or when the + policy says the route doesn't honor markers. + """ + try: + from types import SimpleNamespace + + from agent.agent_runtime_helpers import anthropic_prompt_cache_policy + from agent.prompt_caching import apply_anthropic_cache_control + + # The policy function reads agent.* only as fallbacks for kwargs we + # don't pass; provide a stub so an advisor slot is judged purely on + # its own resolved runtime. + stub = SimpleNamespace(provider="", base_url="", api_mode="", model="") + should_cache, native_layout = anthropic_prompt_cache_policy( + stub, + provider=runtime.get("provider") or "", + base_url=runtime.get("base_url") or "", + api_mode=runtime.get("api_mode") or "", + model=runtime.get("model") or "", + ) + if not should_cache: + return messages + return apply_anthropic_cache_control( + messages, native_anthropic=native_layout + ) + except Exception as exc: # pragma: no cover - decoration must never break a call + logger.debug("advisor cache_control decoration skipped: %s", exc) + return messages + + def _run_reference( slot: dict[str, str], ref_messages: list[dict[str, Any]], @@ -214,6 +257,18 @@ def _run_reference( # trimmed view (_reference_messages) already strips the agent's own # system prompt, so this is the only system message the reference sees. messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages] + # Apply the same Anthropic-style prompt-caching decoration the main + # agent loop applies (system_and_3 breakpoints). The advisory view is + # append-only across iterations (new turns append before the trailing + # synthetic marker), so on cache-honoring routes (Claude via + # OpenRouter/native, MiniMax, Qwen/DashScope) iteration N+1's prefix + # replays iteration N's cached prefix. Without this, Claude advisors + # served ZERO cache reads across an entire benchmark run (measured: + # 0/1227 calls, 11.5M re-billed input tokens) because Anthropic + # caching is opt-in per request. OpenAI-family advisors are untouched + # (their caching is automatic; markers are ignored harmlessly, but we + # only decorate when the policy says the route honors them). + messages = _maybe_apply_advisor_cache_control(messages, runtime) response = call_llm( task="moa_reference", messages=messages,