diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index be8036253..011d6cc97 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -856,6 +856,7 @@ def run_conversation(
                     aggregator=moa_config.get("aggregator") or {},
                     temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
                     aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
+                    max_tokens=moa_config.get("reference_max_tokens"),
                 )
                 if _moa_context:
                     for _msg in reversed(api_messages):
diff --git a/agent/moa_loop.py b/agent/moa_loop.py
index 3b7c5532d..d487e13e5 100644
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -715,10 +715,17 @@ class MoAChatCompletions:
         # aggregator's spend (often the bulk of the turn) is silently dropped
         # and the session cost reflects advisor fan-out only.
         self.last_aggregator_slot = dict(aggregator) if aggregator else None
-        # MoA does not cap reference or aggregator output: each model uses its
-        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
-        # (it never caps by default), so a long aggregator synthesis is never
-        # truncated and providers that reject max_tokens don't 400.
+        # By default MoA does not cap reference or aggregator output: each model
+        # uses its own maximum (max_tokens=None → call_llm omits the parameter,
+        # so a long aggregator synthesis is never truncated and providers that
+        # reject max_tokens don't 400). A preset MAY set reference_max_tokens to
+        # cap ADVISOR output only — advisor generation is the dominant MoA
+        # latency (turn latency correlates ~0.88 with output tokens), and the
+        # aggregator only needs the gist of each advisor's judgement, so a cap
+        # (e.g. 600) measurably cuts per-turn wall time (~44% on a sample task).
+        # The acting aggregator is never capped here (its output is the
+        # user-visible answer).
+        reference_max_tokens = preset.get("reference_max_tokens")
         temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
         aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)
 
@@ -762,7 +769,7 @@ class MoAChatCompletions:
                 reference_models,
                 ref_messages,
                 temperature=temperature,
-                max_tokens=None,
+                max_tokens=reference_max_tokens,
             )
             self._ref_cache_key = _cache_key
             self._ref_cache_outputs = list(reference_outputs)
diff --git a/hermes_cli/moa_config.py b/hermes_cli/moa_config.py
index b4e261917..db644e5f7 100644
--- a/hermes_cli/moa_config.py
+++ b/hermes_cli/moa_config.py
@@ -42,6 +42,24 @@ def _coerce_int(value: Any, default: int) -> int:
             return default
 
 
+def _coerce_int_or_none(value: Any) -> int | None:
+    """Coerce to a positive int, or None when unset/blank/invalid/non-positive.
+
+    Used for optional caps (e.g. reference_max_tokens) where None means
+    'no cap' — the safe default that preserves prior uncapped behavior.
+    Booleans and non-finite numbers (YAML ``true`` / ``.inf``) are invalid too.
+    """
+    if value is None or value == "" or isinstance(value, bool):
+        return None
+    for convert in (int, lambda raw: int(float(raw))):  # "600", then "6e2"
+        try:
+            n = convert(value)
+        except (TypeError, ValueError, OverflowError):  # int(inf) overflows
+            continue
+        return n if n > 0 else None
+    return None
+
+
 def _clean_slot(slot: Any) -> dict[str, str] | None:
     if not isinstance(slot, dict):
         return None
@@ -66,6 +84,7 @@ def _default_preset() -> dict[str, Any]:
         "reference_temperature": 0.6,
         "aggregator_temperature": 0.4,
         "max_tokens": 4096,
+        "reference_max_tokens": None,
         "enabled": True,
     }
 
@@ -94,6 +113,15 @@ def _normalize_preset(raw: Any) -> dict[str, Any]:
         "reference_temperature": _coerce_float(raw.get("reference_temperature"), 0.6),
         "aggregator_temperature": _coerce_float(raw.get("aggregator_temperature"), 0.4),
         "max_tokens": _coerce_int(raw.get("max_tokens"), 4096),
+        # Optional cap on how much each reference ADVISOR may generate per turn.
+        # None (default) = uncapped: advisors write full-length advice, matching
+        # prior behavior so existing presets are unchanged. Set a value (e.g.
+        # 600) to make advisors give concise advice — the dominant MoA latency
+        # is advisor generation (turn latency correlates ~0.88 with output
+        # tokens), and the aggregator only needs the gist of each advisor's
+        # judgement, so capping roughly halves per-turn wall time. Does NOT cap
+        # the acting aggregator (its output is the user-visible answer).
+        "reference_max_tokens": _coerce_int_or_none(raw.get("reference_max_tokens")),
     }
 
 
@@ -139,6 +167,7 @@ def normalize_moa_config(raw: Any) -> dict[str, Any]:
         "reference_temperature": active["reference_temperature"],
         "aggregator_temperature": active["aggregator_temperature"],
         "max_tokens": active["max_tokens"],
+        "reference_max_tokens": active.get("reference_max_tokens"),
         "enabled": active["enabled"],
     }
 
diff --git a/tests/hermes_cli/test_moa_config.py b/tests/hermes_cli/test_moa_config.py
index e04bc6389..accdfc95b 100644
--- a/tests/hermes_cli/test_moa_config.py
+++ b/tests/hermes_cli/test_moa_config.py
@@ -235,3 +235,46 @@ def test_moa_provider_rejected_case_insensitive():
 
     assert cfg["presets"]["p"]["aggregator"]["provider"] != "moa"
     assert cfg["presets"]["p"]["aggregator"] == DEFAULT_MOA_AGGREGATOR
+
+
+def _preset(**extra):
+    """Return a raw MoA config with one preset ``p`` (also the default).
+
+    ``extra`` keys are merged over a minimal reference/aggregator pair."""
+    slot = {"provider": "openrouter", "model": "anthropic/claude-opus-4.8"}
+    merged = {"reference_models": [dict(slot)], "aggregator": dict(slot), **extra}
+    return {"default_preset": "p", "presets": {"p": merged}}
+
+
+def test_reference_max_tokens_defaults_to_none_uncapped():
+    """A preset that omits reference_max_tokens resolves to None (no cap), so
+    pre-existing presets keep uncapped advisors — no silent regression."""
+    resolved = resolve_moa_preset(_preset(), "p")
+    assert resolved["reference_max_tokens"] is None
+
+
+def test_reference_max_tokens_positive_value_preserved():
+    """A positive cap comes back from resolve_moa_preset unchanged."""
+    resolved = resolve_moa_preset(_preset(reference_max_tokens=600), "p")
+    assert resolved["reference_max_tokens"] == 600
+
+
+def test_reference_max_tokens_invalid_falls_back_to_none():
+    """Caps that are non-positive or non-numeric resolve to None (uncapped)
+    instead of crashing or pinning advisors to a meaningless limit."""
+    for bad in (0, -5, "abc", "", None):
+        resolved = resolve_moa_preset(_preset(reference_max_tokens=bad), "p")
+        assert resolved["reference_max_tokens"] is None, bad
+
+
+def test_reference_max_tokens_string_number_coerced():
+    """A numeric string such as '600' (hand-edited config.yaml) becomes an int."""
+    resolved = resolve_moa_preset(_preset(reference_max_tokens="600"), "p")
+    assert resolved["reference_max_tokens"] == 600
+
+
+def test_reference_max_tokens_in_flattened_view():
+    """normalize_moa_config's flattened compatibility view (dashboard/desktop
+    callers) carries the active preset's reference_max_tokens."""
+    flattened = normalize_moa_config(_preset(reference_max_tokens=750))
+    assert flattened["reference_max_tokens"] == 750
diff --git a/website/docs/user-guide/features/mixture-of-agents.md b/website/docs/user-guide/features/mixture-of-agents.md
index ca60d2db3..88ec36d9e 100644
--- a/website/docs/user-guide/features/mixture-of-agents.md
+++ b/website/docs/user-guide/features/mixture-of-agents.md
@@ -97,6 +97,38 @@ Default preset:
 - reference: `openrouter:deepseek/deepseek-v4-pro`
 - aggregator / acting model: `openrouter:anthropic/claude-opus-4.8`
 
+### Tuning advisor speed with `reference_max_tokens`
+
+Each turn, MoA runs the reference models (advisors) in parallel and then the
+aggregator acts. Advisor generation is the dominant per-turn latency — turn
+wall time correlates strongly with how many tokens the advisors emit, because
+the turn waits for the slowest advisor to finish writing. By default advisors
+are **uncapped** (`reference_max_tokens` unset), so they may write long,
+essay-length advice.
+
+Set `reference_max_tokens` on a preset to cap advisor output. The cap is a hard
+`max_tokens` limit: longer advice is cut off, not condensed. The aggregator only
+needs the gist of each advisor's judgement, so a cap (e.g. `600`) measurably
+cuts per-turn wall time with little quality impact. It caps **advisors only** —
+the acting aggregator's output (the user-visible answer) is never capped.
+
+```yaml
+moa:
+  presets:
+    fast:
+      reference_models:
+        - provider: openrouter
+          model: anthropic/claude-opus-4.8
+        - provider: openrouter
+          model: openai/gpt-5.5
+      aggregator:
+        provider: openrouter
+        model: anthropic/claude-opus-4.8
+      reference_max_tokens: 600   # concise advice → faster turns
+```
+
+Leave it unset (or `0`/blank) to keep the prior uncapped behavior.
+
 ## Terminal preset management
 
 ```bash