diff --git a/agent/moa_loop.py b/agent/moa_loop.py
index 149142503..015bc23ac 100644
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -561,6 +561,28 @@ def aggregate_moa_context(
     )
 
 
+def _attach_reference_guidance(agg_messages: list[dict[str, Any]], guidance: str) -> None:
+    """Place the turn-specific reference block at the tail of ``agg_messages``.
+
+    ``guidance`` changes on every tool-loop iteration, so it is kept out of any
+    earlier message: editing a message near the top would alter the prompt
+    prefix and prevent the server from reusing its KV cache for the unchanged
+    ``[system][task][tool-history]`` part of the conversation.
+
+    When the final message is a ``user`` turn whose content is a plain string,
+    ``guidance`` is joined onto it after a blank line. Otherwise (empty list,
+    non-``user`` tail, or non-string content) a new ``user`` message carrying
+    only ``guidance`` is appended. The list is modified in place.
+    """
+    tail = agg_messages[-1] if agg_messages else None
+    # Agentic loop, empty list, or structured content: add a fresh trailing turn.
+    if tail is None or tail.get("role") != "user" or not isinstance(tail.get("content"), str):
+        agg_messages.append({"role": "user", "content": guidance})
+        return
+    # Plain chat: the conversation already ends on a string user turn.
+    tail["content"] = "\n\n".join((tail["content"], guidance))
+
+
 class MoAChatCompletions:
     """OpenAI-chat-compatible facade where the aggregator is the acting model."""
 
@@ -784,12 +806,7 @@ class MoAChatCompletions:
                 "answer the user directly or call tools as needed.\n\n"
                 f"{joined}"
             )
-            for msg in reversed(agg_messages):
-                if msg.get("role") == "user" and isinstance(msg.get("content"), str):
-                    msg["content"] = msg["content"] + "\n\n" + guidance
-                    break
-            else:
-                agg_messages.append({"role": "user", "content": guidance})
+            _attach_reference_guidance(agg_messages, guidance)
 
         if aggregator.get("provider") == "moa":
             raise RuntimeError("MoA aggregator cannot be another MoA preset")
diff --git a/tests/run_agent/test_moa_loop_mode.py b/tests/run_agent/test_moa_loop_mode.py
index 33103c5ff..8e93ad53d 100644
--- a/tests/run_agent/test_moa_loop_mode.py
+++ b/tests/run_agent/test_moa_loop_mode.py
@@ -1013,3 +1013,46 @@ moa:
     facade.consume_and_save_trace(session_id="sess-off")
 
     assert not (home / "moa-traces").exists()
+
+
+def test_reference_guidance_appended_at_end_in_tool_loop():
+    """A tool-loop history gets the reference block as a new final message.
+
+    Here the only user turn is the original task, second in the list and
+    followed by an assistant tool call and its tool result. The helper must
+    leave that task turn alone and add the block as a separate trailing user
+    turn, so the messages before it are identical from one step to the next.
+    """
+    from agent.moa_loop import _attach_reference_guidance
+
+    reference = "REFERENCE BLOCK"
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "ORIGINAL TASK"},
+        {"role": "assistant", "content": "", "tool_calls": [{"id": "1"}]},
+        {"role": "tool", "content": "tool result", "tool_call_id": "1"},
+    ]
+    _attach_reference_guidance(messages, reference)
+
+    # The task turn near the top keeps its exact text.
+    assert messages[1]["content"] == "ORIGINAL TASK"
+    # One message was added at the tail: a user turn holding only the block.
+    final = messages[-1]
+    assert (final["role"], final["content"]) == ("user", reference)
+    assert len(messages) == 5
+
+
+def test_reference_guidance_merges_into_trailing_user_in_plain_chat():
+    """A history that ends on a string user turn absorbs the block in place."""
+    from agent.moa_loop import _attach_reference_guidance
+
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "hello"},
+    ]
+    _attach_reference_guidance(messages, "REFERENCE BLOCK")
+
+    # Still two messages: the block is joined onto the final user turn.
+    assert len(messages) == 2
+    final = messages[-1]
+    assert (final["role"], final["content"]) == ("user", "hello\n\nREFERENCE BLOCK")