feat(moa): add reference_max_tokens to cap advisor output and cut turn latency (#56756)

MoA per-turn latency is dominated by advisor GENERATION: turn wall time correlates ~0.88 with output tokens and ~-0.03 with input tokens (measured over 52 turns). Each turn waits for the slowest advisor to finish writing, and advisors were uncapped — writing multi-thousand-token essays the aggregator only needs the gist of. Add an opt-in per-preset reference_max_tokens knob (mirrors reference_temperature) that caps ADVISOR output only; the acting aggregator is never capped. Default None = uncapped, so existing presets are byte-for-byte unchanged (no regression). Wired through both MoA execution paths (MoAChatCompletions.create and aggregate_moa_context). E2E: same task, closed preset uncapped vs reference_max_tokens=600 -> 59s to 33s (~44% faster), final answer identical/correct. - hermes_cli/moa_config.py: _coerce_int_or_none helper + reference_max_tokens in _normalize_preset/_default_preset/flattened view - agent/moa_loop.py: read preset.reference_max_tokens, pass to reference fan-out - agent/conversation_loop.py: pass reference_max_tokens on the per-turn path - tests + docs
2026-07-02 00:16:35 -07:00 · 2026-07-02 00:16:35 -07:00 · 543d305bbb
commit 543d305bbb
parent 9be39de0f2
5 changed files with 117 additions and 5 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -856,6 +856,7 @@ def run_conversation(
                    aggregator=moa_config.get("aggregator") or {},
                    temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
                    aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
+                    max_tokens=moa_config.get("reference_max_tokens"),
                )
                if _moa_context:
                    for _msg in reversed(api_messages):
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@ -715,10 +715,17 @@ class MoAChatCompletions:
        # aggregator's spend (often the bulk of the turn) is silently dropped
        # and the session cost reflects advisor fan-out only.
        self.last_aggregator_slot = dict(aggregator) if aggregator else None
-        # MoA does not cap reference or aggregator output: each model uses its
-        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
-        # (it never caps by default), so a long aggregator synthesis is never
-        # truncated and providers that reject max_tokens don't 400.
+        # By default MoA does not cap reference or aggregator output: each model
+        # uses its own maximum (max_tokens=None → call_llm omits the parameter,
+        # so a long aggregator synthesis is never truncated and providers that
+        # reject max_tokens don't 400). A preset MAY set reference_max_tokens to
+        # cap ADVISOR output only — advisor generation is the dominant MoA
+        # latency (turn latency correlates ~0.88 with output tokens), and the
+        # aggregator only needs the gist of each advisor's judgement, so a cap
+        # (e.g. 600) measurably cuts per-turn wall time (~44% on a sample task).
+        # The acting aggregator is never capped here (its output is the
+        # user-visible answer).
+        reference_max_tokens = preset.get("reference_max_tokens")
        temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
        aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)

@ -762,7 +769,7 @@ class MoAChatCompletions:
                reference_models,
                ref_messages,
                temperature=temperature,
-                max_tokens=None,
+                max_tokens=reference_max_tokens,
            )
            self._ref_cache_key = _cache_key
            self._ref_cache_outputs = list(reference_outputs)
--- a/hermes_cli/moa_config.py
+++ b/hermes_cli/moa_config.py
@ -42,6 +42,24 @@ def _coerce_int(value: Any, default: int) -> int:
            return default


+def _coerce_int_or_none(value: Any) -> int | None:
+    """Coerce to a positive int, or None when unset/blank/invalid/non-positive.
+
+    Used for optional caps (e.g. reference_max_tokens) where None means 'no
+    cap'. Bools and infinities (YAML ``true`` / ``.inf``) count as invalid.
+    """
+    if value is None or value == "" or isinstance(value, bool):
+        return None  # int(True) == 1 would otherwise become a 1-token cap
+    try:
+        n = int(value)
+    except (TypeError, ValueError, OverflowError):
+        try:
+            n = int(float(value))  # accepts float-formatted strings like "600.0"
+        except (TypeError, ValueError, OverflowError):  # int(inf) overflows
+            return None
+    return n if n > 0 else None
+
+
 def _clean_slot(slot: Any) -> dict[str, str] | None:
    if not isinstance(slot, dict):
        return None
@ -66,6 +84,7 @@ def _default_preset() -> dict[str, Any]:
        "reference_temperature": 0.6,
        "aggregator_temperature": 0.4,
        "max_tokens": 4096,
+        "reference_max_tokens": None,
        "enabled": True,
    }

@ -94,6 +113,15 @@ def _normalize_preset(raw: Any) -> dict[str, Any]:
        "reference_temperature": _coerce_float(raw.get("reference_temperature"), 0.6),
        "aggregator_temperature": _coerce_float(raw.get("aggregator_temperature"), 0.4),
        "max_tokens": _coerce_int(raw.get("max_tokens"), 4096),
+        # Optional cap on how much each reference ADVISOR may generate per turn.
+        # None (default) = uncapped: advisors write full-length advice, matching
+        # prior behavior so existing presets are unchanged. Set a value (e.g.
+        # 600) to make advisors give concise advice — the dominant MoA latency
+        # is advisor generation (turn latency correlates ~0.88 with output
+        # tokens), and the aggregator only needs the gist of each advisor's
+        # judgement, so a cap cuts per-turn wall time (~44% on a sample task).
+        # It does NOT cap the acting aggregator (the user-visible answer).
+        "reference_max_tokens": _coerce_int_or_none(raw.get("reference_max_tokens")),
    }


@ -139,6 +167,7 @@ def normalize_moa_config(raw: Any) -> dict[str, Any]:
        "reference_temperature": active["reference_temperature"],
        "aggregator_temperature": active["aggregator_temperature"],
        "max_tokens": active["max_tokens"],
+        "reference_max_tokens": active.get("reference_max_tokens"),
        "enabled": active["enabled"],
    }

--- a/tests/hermes_cli/test_moa_config.py
+++ b/tests/hermes_cli/test_moa_config.py
@ -235,3 +235,46 @@ def test_moa_provider_rejected_case_insensitive():

    assert cfg["presets"]["p"]["aggregator"]["provider"] != "moa"
    assert cfg["presets"]["p"]["aggregator"] == DEFAULT_MOA_AGGREGATOR
+
+
+def _preset(**extra):
+    """Return a config whose sole, default preset ``p`` is the base slots plus *extra*."""
+    opus = {"provider": "openrouter", "model": "anthropic/claude-opus-4.8"}
+    # Copy the slot per use so the advisor and aggregator entries never alias.
+    preset = {"reference_models": [dict(opus)], "aggregator": dict(opus)}
+    preset.update(extra)
+    return {"default_preset": "p", "presets": {"p": preset}}
+
+
+def test_reference_max_tokens_defaults_to_none_uncapped():
+    """A preset that omits reference_max_tokens resolves it to None, i.e. the
+    advisors stay uncapped exactly as they were before the knob existed."""
+    resolved = resolve_moa_preset(_preset(), "p")
+    assert resolved["reference_max_tokens"] is None
+
+
+def test_reference_max_tokens_positive_value_preserved():
+    """An explicit positive cap survives resolve_moa_preset unchanged."""
+    config = _preset(reference_max_tokens=600)
+    assert resolve_moa_preset(config, "p")["reference_max_tokens"] == 600
+
+
+def test_reference_max_tokens_invalid_falls_back_to_none():
+    """Zero, negative, non-numeric, blank and None caps all resolve to None
+    (uncapped) instead of raising or producing a meaningless limit."""
+    for bad in (0, -5, "abc", "", None):
+        resolved = resolve_moa_preset(_preset(reference_max_tokens=bad), "p")
+        assert resolved["reference_max_tokens"] is None, bad
+
+
+def test_reference_max_tokens_string_number_coerced():
+    """A numeric string such as '600' (hand-edited config.yaml) becomes int 600."""
+    config = _preset(reference_max_tokens="600")
+    assert resolve_moa_preset(config, "p")["reference_max_tokens"] == 600
+
+
+def test_reference_max_tokens_in_flattened_view():
+    """normalize_moa_config surfaces the active preset's reference_max_tokens
+    at the top level of the flattened compatibility view."""
+    flattened = normalize_moa_config(_preset(reference_max_tokens=750))
+    assert flattened["reference_max_tokens"] == 750
--- a/website/docs/user-guide/features/mixture-of-agents.md
+++ b/website/docs/user-guide/features/mixture-of-agents.md
@ -97,6 +97,38 @@ Default preset:
 - reference: `openrouter:deepseek/deepseek-v4-pro`
 - aggregator / acting model: `openrouter:anthropic/claude-opus-4.8`

+### Tuning advisor speed with `reference_max_tokens`
+
+Each turn, MoA runs the reference models (advisors) in parallel and then the
+aggregator acts. Advisor generation is the dominant per-turn latency — turn
+wall time correlates strongly with how many tokens the advisors emit, because
+the turn waits for the slowest advisor to finish writing. By default advisors
+are **uncapped** (`reference_max_tokens` unset), so they may write long,
+essay-length advice.
+
+Set `reference_max_tokens` on a preset to cap how many tokens each advisor may
+generate. The cap is a hard limit (longer advice is cut off, not condensed), but
+the aggregator only needs the gist of each advisor's judgement, so a cap (e.g.
+`600`) measurably cuts per-turn wall time with little quality impact. It caps
+**advisors only** — the aggregator's user-visible answer is never capped.
+
+```yaml
+moa:
+  presets:
+    fast:
+      reference_models:
+        - provider: openrouter
+          model: anthropic/claude-opus-4.8
+        - provider: openrouter
+          model: openai/gpt-5.5
+      aggregator:
+        provider: openrouter
+        model: anthropic/claude-opus-4.8
+      reference_max_tokens: 600   # concise advice → faster turns
+```
+
+Leave it unset (or `0`/blank) to keep the prior uncapped behavior.
+
 ## Terminal preset management

 ```bash