diff --git a/agent/moa_loop.py b/agent/moa_loop.py
index 149142503..015bc23ac 100644
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -561,6 +561,28 @@ def aggregate_moa_context(
     )
 
 
+def _attach_reference_guidance(agg_messages: list[dict[str, Any]], guidance: str) -> None:
+    """Attach the per-turn reference block at the END of the aggregator prompt.
+
+    Mutates ``agg_messages`` in place and returns ``None``: ``guidance`` is joined
+    (after a blank line) onto the last message when that is a ``user`` turn with
+    string content (plain chat — still at the end); otherwise, including for an
+    empty list, a new trailing ``user`` message holding ``guidance`` is appended.
+
+    The reference text differs on every tool-loop iteration. In an agentic loop
+    the most recent ``user`` message is the *original task* near the TOP of the
+    context, so merging the block into it diverges the prompt prefix early: the
+    server's KV cache cannot be reused and every step re-prefills the whole
+    conversation, dominating latency on long contexts. Appending keeps the
+    ``[system][task][tool-history]`` prefix stable and puts the references last.
+    """
+    last = agg_messages[-1] if agg_messages else None
+    if last is not None and last.get("role") == "user" and isinstance(last.get("content"), str):
+        last["content"] = last["content"] + "\n\n" + guidance
+    else:
+        agg_messages.append({"role": "user", "content": guidance})
+
+
 class MoAChatCompletions:
     """OpenAI-chat-compatible facade where the aggregator is the acting model."""
 
@@ -784,12 +806,7 @@ class MoAChatCompletions:
                 "answer the user directly or call tools as needed.\n\n"
                 f"{joined}"
             )
-            for msg in reversed(agg_messages):
-                if msg.get("role") == "user" and isinstance(msg.get("content"), str):
-                    msg["content"] = msg["content"] + "\n\n" + guidance
-                    break
-            else:
-                agg_messages.append({"role": "user", "content": guidance})
+            _attach_reference_guidance(agg_messages, guidance)
 
         if aggregator.get("provider") == "moa":
             raise RuntimeError("MoA aggregator cannot be another MoA preset")
diff --git a/tests/run_agent/test_moa_loop_mode.py b/tests/run_agent/test_moa_loop_mode.py
index 33103c5ff..8e93ad53d 100644
--- a/tests/run_agent/test_moa_loop_mode.py
+++ b/tests/run_agent/test_moa_loop_mode.py
@@ -1013,3 +1013,46 @@ moa:
     facade.consume_and_save_trace(session_id="sess-off")
 
     assert not (home / "moa-traces").exists()
+
+
+def test_reference_guidance_appended_at_end_in_tool_loop():
+    """In an agentic loop the reference block must land at the END of the prompt.
+
+    The most recent user turn is the original task near the top of the context;
+    merging the per-turn (volatile) reference block into it would diverge the
+    prompt prefix early and defeat the server's KV-cache reuse, forcing a full
+    re-prefill of the whole conversation on every tool-loop step.
+    """
+    from agent.moa_loop import _attach_reference_guidance
+
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "ORIGINAL TASK"},
+        {"role": "assistant", "content": "", "tool_calls": [{"id": "1"}]},
+        {"role": "tool", "content": "tool result", "tool_call_id": "1"},
+    ]
+    _attach_reference_guidance(messages, "REFERENCE BLOCK")
+
+    # The original (top-of-context) user turn is untouched, so the prefix stays
+    # cache-reusable across steps.
+    assert messages[1]["content"] == "ORIGINAL TASK"
+    # The reference block is appended as a new trailing user turn, not merged earlier.
+    assert messages[-1]["role"] == "user"
+    assert messages[-1]["content"] == "REFERENCE BLOCK"
+    assert len(messages) == 5
+
+
+def test_reference_guidance_merges_into_trailing_user_in_plain_chat():
+    """Plain chat ends on the user turn, so the block merges there (still at end)."""
+    from agent.moa_loop import _attach_reference_guidance
+
+    messages = [
+        {"role": "system", "content": "system prompt"},
+        {"role": "user", "content": "hello"},
+    ]
+    _attach_reference_guidance(messages, "REFERENCE BLOCK")
+
+    # No extra message: the block joins the trailing user turn after a blank line.
+    assert len(messages) == 2
+    assert messages[-1]["role"] == "user"
+    assert messages[-1]["content"] == "hello\n\nREFERENCE BLOCK"