diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index be8036253..011d6cc97 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -856,6 +856,7 @@ def run_conversation( aggregator=moa_config.get("aggregator") or {}, temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6), aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4), + max_tokens=moa_config.get("reference_max_tokens"), ) if _moa_context: for _msg in reversed(api_messages): diff --git a/agent/moa_loop.py b/agent/moa_loop.py index 3b7c5532d..d487e13e5 100644 --- a/agent/moa_loop.py +++ b/agent/moa_loop.py @@ -715,10 +715,17 @@ class MoAChatCompletions: # aggregator's spend (often the bulk of the turn) is silently dropped # and the session cost reflects advisor fan-out only. self.last_aggregator_slot = dict(aggregator) if aggregator else None - # MoA does not cap reference or aggregator output: each model uses its - # own maximum. Passing max_tokens=None makes call_llm omit the parameter - # (it never caps by default), so a long aggregator synthesis is never - # truncated and providers that reject max_tokens don't 400. + # By default MoA does not cap reference or aggregator output: each model + # uses its own maximum (max_tokens=None → call_llm omits the parameter, + # so a long aggregator synthesis is never truncated and providers that + # reject max_tokens don't 400). A preset MAY set reference_max_tokens to + # cap ADVISOR output only — advisor generation is the dominant MoA + # latency (turn latency correlates ~0.88 with output tokens), and the + # aggregator only needs the gist of each advisor's judgement, so a cap + # (e.g. 600) measurably cuts per-turn wall time (~44% on a sample task). + # The acting aggregator is never capped here (its output is the + # user-visible answer). 
+ reference_max_tokens = preset.get("reference_max_tokens") temperature = float(preset.get("reference_temperature", 0.6) or 0.6) aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4) @@ -762,7 +769,7 @@ class MoAChatCompletions: reference_models, ref_messages, temperature=temperature, - max_tokens=None, + max_tokens=reference_max_tokens, ) self._ref_cache_key = _cache_key self._ref_cache_outputs = list(reference_outputs) diff --git a/hermes_cli/moa_config.py b/hermes_cli/moa_config.py index b4e261917..db644e5f7 100644 --- a/hermes_cli/moa_config.py +++ b/hermes_cli/moa_config.py @@ -42,6 +42,24 @@ def _coerce_int(value: Any, default: int) -> int: return default +def _coerce_int_or_none(value: Any) -> int | None: + """Coerce to a positive int, or None when unset/blank/invalid/non-positive. + + Used for optional caps (e.g. reference_max_tokens) where None means + 'no cap' — the safe default that preserves prior uncapped behavior. + """ + if value is None or value == "": + return None + try: + n = int(value) + except (TypeError, ValueError): + try: + n = int(float(value)) + except (TypeError, ValueError): + return None + return n if n > 0 else None + + def _clean_slot(slot: Any) -> dict[str, str] | None: if not isinstance(slot, dict): return None @@ -66,6 +84,7 @@ def _default_preset() -> dict[str, Any]: "reference_temperature": 0.6, "aggregator_temperature": 0.4, "max_tokens": 4096, + "reference_max_tokens": None, "enabled": True, } @@ -94,6 +113,15 @@ def _normalize_preset(raw: Any) -> dict[str, Any]: "reference_temperature": _coerce_float(raw.get("reference_temperature"), 0.6), "aggregator_temperature": _coerce_float(raw.get("aggregator_temperature"), 0.4), "max_tokens": _coerce_int(raw.get("max_tokens"), 4096), + # Optional cap on how much each reference ADVISOR may generate per turn. 
def _preset(**extra):
    """Build a raw MoA config holding one preset ``p`` with *extra* overrides."""
    preset_body = {
        "reference_models": [{"provider": "openrouter", "model": "anthropic/claude-opus-4.8"}],
        "aggregator": {"provider": "openrouter", "model": "anthropic/claude-opus-4.8"},
        **extra,
    }
    return {"default_preset": "p", "presets": {"p": preset_body}}


def test_reference_max_tokens_defaults_to_none_uncapped():
    """A preset that never mentions reference_max_tokens resolves to None, so
    presets written before the option existed stay uncapped."""
    resolved = resolve_moa_preset(_preset(), "p")
    assert resolved["reference_max_tokens"] is None


def test_reference_max_tokens_positive_value_preserved():
    """An explicit positive cap survives resolve_moa_preset unchanged."""
    resolved = resolve_moa_preset(_preset(reference_max_tokens=600), "p")
    assert resolved["reference_max_tokens"] == 600


def test_reference_max_tokens_invalid_falls_back_to_none():
    """Zero, negative, non-numeric, blank and None caps all resolve to None
    (uncapped) instead of a nonsense limit or an exception."""
    rejected_values = (0, -5, "abc", "", None)
    for bad in rejected_values:
        resolved = resolve_moa_preset(_preset(reference_max_tokens=bad), "p")
        assert resolved["reference_max_tokens"] is None, bad


def test_reference_max_tokens_string_number_coerced():
    """A numeric string such as '600' (hand-edited config.yaml) becomes an int."""
    resolved = resolve_moa_preset(_preset(reference_max_tokens="600"), "p")
    assert resolved["reference_max_tokens"] == 600


def test_reference_max_tokens_in_flattened_view():
    """normalize_moa_config's flattened top-level view carries the active
    preset's reference_max_tokens."""
    flattened = normalize_moa_config(_preset(reference_max_tokens=750))
    assert flattened["reference_max_tokens"] == 750
+ +Set `reference_max_tokens` on a preset to cap advisor output and give concise +advice instead. The aggregator only needs the gist of each advisor's +judgement, so a cap (e.g. `600`) measurably cuts per-turn wall time with little +quality impact. It caps **advisors only** — the acting aggregator's output (the +user-visible answer) is never capped. + +```yaml +moa: + presets: + fast: + reference_models: + - provider: openrouter + model: anthropic/claude-opus-4.8 + - provider: openrouter + model: openai/gpt-5.5 + aggregator: + provider: openrouter + model: anthropic/claude-opus-4.8 + reference_max_tokens: 600 # concise advice → faster turns +``` + +Leave it unset (or `0`/blank) to keep the prior uncapped behavior. + ## Terminal preset management ```bash