fix(moa): append reference block at end of aggregator prompt for KV-cache reuse

The MoA aggregator received the per-turn reference block merged into the most
recent `user` message. In an agentic tool loop that message is the original
task near the top of the context (everything after it is assistant/tool turns),
so injecting text that changes every iteration diverges the prompt prefix early.
The server's KV cache then cannot be reused and the entire conversation
re-prefills on every tool-loop step — full prefill each step, which dominates
latency on long contexts.

Append the reference block at the end of the prompt instead (merging into the
last message only when it is already a trailing user turn, i.e. plain chat).
This keeps the [system][task][tool-history] prefix stable and cache-reusable so
only the new block re-prefills, and gives the aggregator the references with
recency. Extracted as `_attach_reference_guidance` with unit tests.

Measured on a local llama.cpp aggregator over a long agentic task: KV-cache
reuse on follow-up steps went from ~0.3% to ~93-95% and per-step prefill on an
~80k-token context dropped from ~44s to <1s, with no change to output.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Jeff Watts 2026-06-29 12:00:29 -04:00 committed by Teknium
parent 49cb06c07a
commit a2d6f05d1b
2 changed files with 66 additions and 6 deletions

View file

@ -561,6 +561,28 @@ def aggregate_moa_context(
)
def _attach_reference_guidance(agg_messages: list[dict[str, Any]], guidance: str) -> None:
"""Attach the per-turn reference block at the END of the aggregator prompt.
The reference text differs on every tool-loop iteration. In an agentic loop
the most recent ``user`` message is the *original task* sitting near the TOP
of the context (everything after it is assistant/tool turns), so merging the
turn-varying reference block into it diverges the prompt prefix early the
server's KV cache cannot be reused and the entire conversation re-prefills on
every step (full prefill each tool call, dominating latency on long contexts).
Appending at the very end keeps the ``[system][task][tool-history]`` prefix
stable and cache-reusable (only the new block re-prefills), and gives the
aggregator the references with recency. Merge into the last message only when
it is already a trailing string ``user`` turn (plain chat still at the end).
"""
last = agg_messages[-1] if agg_messages else None
if last is not None and last.get("role") == "user" and isinstance(last.get("content"), str):
last["content"] = last["content"] + "\n\n" + guidance
else:
agg_messages.append({"role": "user", "content": guidance})
class MoAChatCompletions:
"""OpenAI-chat-compatible facade where the aggregator is the acting model."""
@ -784,12 +806,7 @@ class MoAChatCompletions:
"answer the user directly or call tools as needed.\n\n"
f"{joined}"
)
for msg in reversed(agg_messages):
if msg.get("role") == "user" and isinstance(msg.get("content"), str):
msg["content"] = msg["content"] + "\n\n" + guidance
break
else:
agg_messages.append({"role": "user", "content": guidance})
_attach_reference_guidance(agg_messages, guidance)
if aggregator.get("provider") == "moa":
raise RuntimeError("MoA aggregator cannot be another MoA preset")

View file

@ -1013,3 +1013,46 @@ moa:
facade.consume_and_save_trace(session_id="sess-off")
assert not (home / "moa-traces").exists()
def test_reference_guidance_appended_at_end_in_tool_loop():
    """A tool-loop prompt gets the reference block as a new trailing turn.

    When the conversation ends on assistant/tool turns, the latest user turn is
    the original task near the top of the context. Folding the volatile
    per-turn reference block into it would change the prompt prefix early and
    defeat the server's KV-cache reuse, forcing the whole conversation to
    re-prefill on every tool-loop step.
    """
    from agent.moa_loop import _attach_reference_guidance

    conversation = [
        {"role": "system", "content": "system prompt"},
        {"role": "user", "content": "ORIGINAL TASK"},
        {"role": "assistant", "content": "", "tool_calls": [{"id": "1"}]},
        {"role": "tool", "content": "tool result", "tool_call_id": "1"},
    ]

    _attach_reference_guidance(conversation, "REFERENCE BLOCK")

    # The top-of-context task turn must be left alone so the prefix stays
    # reusable from one step to the next.
    task_turn = conversation[1]
    assert task_turn["content"] == "ORIGINAL TASK"

    # The block arrives as its own user turn at the very end of the prompt.
    tail = conversation[-1]
    assert tail["role"] == "user"
    assert tail["content"] == "REFERENCE BLOCK"
    assert len(conversation) == 5
def test_reference_guidance_merges_into_trailing_user_in_plain_chat():
    """A plain chat prompt ends on a user turn, so the block is merged into it."""
    from agent.moa_loop import _attach_reference_guidance

    chat = [
        {"role": "system", "content": "system prompt"},
        {"role": "user", "content": "hello"},
    ]

    _attach_reference_guidance(chat, "REFERENCE BLOCK")

    # The message count is unchanged: the block is folded into the final user
    # turn, which is still the end of the prompt.
    assert len(chat) == 2
    final_turn = chat[-1]
    assert final_turn["role"] == "user"
    assert final_turn["content"] == "hello\n\nREFERENCE BLOCK"