fix(moa): append reference block at end of aggregator prompt for KV-cache reuse

Previously, the MoA aggregator received the per-turn reference block merged into the most recent `user` message. In an agentic tool loop that message is the original task near the top of the context (everything after it is assistant/tool turns), so injecting text that changes on every iteration makes the prompt prefix diverge early. The server's KV cache then cannot be reused, and the entire conversation is re-prefilled on every tool-loop step, which dominates latency on long contexts. Append the reference block at the end of the prompt instead, merging it into the last message only when that message is already a trailing string-content `user` turn (i.e. plain chat). This keeps the [system][task][tool-history] prefix stable and cache-reusable so that only the new block is re-prefilled, and it places the references at the end of the prompt, where they are the most recent context the aggregator sees. Extracted as `_attach_reference_guidance` with unit tests. Measured on a local llama.cpp aggregator over a long agentic task: KV-cache reuse on follow-up steps went from ~0.3% to ~93-95%, and per-step prefill on an ~80k-token context dropped from ~44s to <1s, with no change to output. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 12:00:29 -04:00 · 2026-06-29 12:00:29 -04:00 · a2d6f05d1b
commit a2d6f05d1b
parent 49cb06c07a
2 changed files with 66 additions and 6 deletions
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@ -561,6 +561,28 @@ def aggregate_moa_context(
    )


+def _attach_reference_guidance(agg_messages: list[dict[str, Any]], guidance: str) -> None:
+    """Attach the per-turn reference block at the END of the aggregator prompt.
+
+    The reference text differs on every tool-loop iteration. In an agentic loop
+    the most recent ``user`` message is the *original task* sitting near the TOP
+    of the context (everything after it is assistant/tool turns), so merging the
+    turn-varying reference block into it diverges the prompt prefix early — the
+    server's KV cache cannot be reused and the entire conversation re-prefills on
+    every step (full prefill each tool call, dominating latency on long contexts).
+
+    Appending at the very end keeps the ``[system][task][tool-history]`` prefix
+    stable and cache-reusable (only the new block re-prefills), and gives the
+    aggregator the references with recency. Merge into the last message only when
+    it is already a trailing string ``user`` turn (plain chat — still at the end).
+    """
+    last = agg_messages[-1] if agg_messages else None
+    if last is not None and last.get("role") == "user" and isinstance(last.get("content"), str):
+        last["content"] = last["content"] + "\n\n" + guidance
+    else:
+        agg_messages.append({"role": "user", "content": guidance})
+
+
 class MoAChatCompletions:
    """OpenAI-chat-compatible facade where the aggregator is the acting model."""

@ -784,12 +806,7 @@ class MoAChatCompletions:
                "answer the user directly or call tools as needed.\n\n"
                f"{joined}"
            )
-            for msg in reversed(agg_messages):
-                if msg.get("role") == "user" and isinstance(msg.get("content"), str):
-                    msg["content"] = msg["content"] + "\n\n" + guidance
-                    break
-            else:
-                agg_messages.append({"role": "user", "content": guidance})
+            _attach_reference_guidance(agg_messages, guidance)

        if aggregator.get("provider") == "moa":
            raise RuntimeError("MoA aggregator cannot be another MoA preset")
--- a/tests/run_agent/test_moa_loop_mode.py
+++ b/tests/run_agent/test_moa_loop_mode.py
@ -1013,3 +1013,46 @@ moa:
    facade.consume_and_save_trace(session_id="sess-off")

    assert not (home / "moa-traces").exists()
+
+
+def test_reference_guidance_appended_at_end_in_tool_loop():
+    """In an agentic loop the reference block must land at the END of the prompt.
+
+    The most recent user turn is the original task near the top of the context;
+    merging the per-turn (volatile) reference block into it would diverge the
+    prompt prefix early and defeat the server's KV-cache reuse, forcing a full
+    re-prefill of the whole conversation on every tool-loop step.
+    """
+    from agent.moa_loop import _attach_reference_guidance
+
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "ORIGINAL TASK"},
+        {"role": "assistant", "content": "", "tool_calls": [{"id": "1"}]},
+        {"role": "tool", "content": "tool result", "tool_call_id": "1"},
+    ]
+    _attach_reference_guidance(messages, "REFERENCE BLOCK")
+
+    # The original (top-of-context) user turn is untouched, so the prefix stays
+    # cache-reusable across steps.
+    assert messages[1]["content"] == "ORIGINAL TASK"
+    # The reference block is appended as a new trailing turn, not merged upstream.
+    assert messages[-1]["role"] == "user"
+    assert messages[-1]["content"] == "REFERENCE BLOCK"
+    assert len(messages) == 5
+
+
+def test_reference_guidance_merges_into_trailing_user_in_plain_chat():
+    """Plain chat ends on the user turn, so the block merges there (still at end)."""
+    from agent.moa_loop import _attach_reference_guidance
+
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "hello"},
+    ]
+    _attach_reference_guidance(messages, "REFERENCE BLOCK")
+
+    # No extra message; the block joins the trailing user turn (which is the end).
+    assert len(messages) == 2
+    assert messages[-1]["role"] == "user"
+    assert messages[-1]["content"] == "hello\n\nREFERENCE BLOCK"