feat(moa): add reference_max_tokens to cap advisor output and cut turn latency (#56756)

MoA per-turn latency is dominated by advisor GENERATION: turn wall time
correlates ~0.88 with output tokens and ~-0.03 with input tokens (measured over
52 turns). Each turn waits for the slowest advisor to finish writing, and
advisors were uncapped — writing multi-thousand-token essays the aggregator
only needs the gist of.

Add an opt-in per-preset reference_max_tokens knob (mirrors reference_temperature)
that caps ADVISOR output only; the acting aggregator is never capped. Default
None = uncapped, so existing presets are byte-for-byte unchanged (no regression).
Wired through both MoA execution paths (MoAChatCompletions.create and
aggregate_moa_context).

E2E: same task, closed preset uncapped vs reference_max_tokens=600 -> 59s to 33s
(~44% faster), final answer identical/correct.

- hermes_cli/moa_config.py: _coerce_int_or_none helper + reference_max_tokens
  in _normalize_preset/_default_preset/flattened view
- agent/moa_loop.py: read preset.reference_max_tokens, pass to reference fan-out
- agent/conversation_loop.py: pass reference_max_tokens on the per-turn path
- tests + docs
This commit is contained in:
Teknium 2026-07-02 00:16:35 -07:00 committed by GitHub
parent 9be39de0f2
commit 543d305bbb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 117 additions and 5 deletions

View file

@ -856,6 +856,7 @@ def run_conversation(
aggregator=moa_config.get("aggregator") or {},
temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
max_tokens=moa_config.get("reference_max_tokens"),
)
if _moa_context:
for _msg in reversed(api_messages):

View file

@ -715,10 +715,17 @@ class MoAChatCompletions:
# aggregator's spend (often the bulk of the turn) is silently dropped
# and the session cost reflects advisor fan-out only.
self.last_aggregator_slot = dict(aggregator) if aggregator else None
# MoA does not cap reference or aggregator output: each model uses its
# own maximum. Passing max_tokens=None makes call_llm omit the parameter
# (it never caps by default), so a long aggregator synthesis is never
# truncated and providers that reject max_tokens don't 400.
# By default MoA does not cap reference or aggregator output: each model
# uses its own maximum (max_tokens=None → call_llm omits the parameter,
# so a long aggregator synthesis is never truncated and providers that
# reject max_tokens don't 400). A preset MAY set reference_max_tokens to
# cap ADVISOR output only — advisor generation is the dominant MoA
# latency (turn latency correlates ~0.88 with output tokens), and the
# aggregator only needs the gist of each advisor's judgement, so a cap
# (e.g. 600) measurably cuts per-turn wall time (~44% on a sample task).
# The acting aggregator is never capped here (its output is the
# user-visible answer).
reference_max_tokens = preset.get("reference_max_tokens")
temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)
@ -762,7 +769,7 @@ class MoAChatCompletions:
reference_models,
ref_messages,
temperature=temperature,
max_tokens=None,
max_tokens=reference_max_tokens,
)
self._ref_cache_key = _cache_key
self._ref_cache_outputs = list(reference_outputs)

View file

@ -42,6 +42,24 @@ def _coerce_int(value: Any, default: int) -> int:
return default
def _coerce_int_or_none(value: Any) -> int | None:
"""Coerce to a positive int, or None when unset/blank/invalid/non-positive.
Used for optional caps (e.g. reference_max_tokens) where None means
'no cap' the safe default that preserves prior uncapped behavior.
"""
if value is None or value == "":
return None
try:
n = int(value)
except (TypeError, ValueError):
try:
n = int(float(value))
except (TypeError, ValueError):
return None
return n if n > 0 else None
def _clean_slot(slot: Any) -> dict[str, str] | None:
if not isinstance(slot, dict):
return None
@ -66,6 +84,7 @@ def _default_preset() -> dict[str, Any]:
"reference_temperature": 0.6,
"aggregator_temperature": 0.4,
"max_tokens": 4096,
"reference_max_tokens": None,
"enabled": True,
}
@ -94,6 +113,15 @@ def _normalize_preset(raw: Any) -> dict[str, Any]:
"reference_temperature": _coerce_float(raw.get("reference_temperature"), 0.6),
"aggregator_temperature": _coerce_float(raw.get("aggregator_temperature"), 0.4),
"max_tokens": _coerce_int(raw.get("max_tokens"), 4096),
# Optional cap on how much each reference ADVISOR may generate per turn.
# None (default) = uncapped: advisors write full-length advice, matching
# prior behavior so existing presets are unchanged. Set a value (e.g.
# 600) to make advisors give concise advice — the dominant MoA latency
# is advisor generation (turn latency correlates ~0.88 with output
# tokens), and the aggregator only needs the gist of each advisor's
# judgement, so capping roughly halves per-turn wall time. Does NOT cap
# the acting aggregator (its output is the user-visible answer).
"reference_max_tokens": _coerce_int_or_none(raw.get("reference_max_tokens")),
}
@ -139,6 +167,7 @@ def normalize_moa_config(raw: Any) -> dict[str, Any]:
"reference_temperature": active["reference_temperature"],
"aggregator_temperature": active["aggregator_temperature"],
"max_tokens": active["max_tokens"],
"reference_max_tokens": active.get("reference_max_tokens"),
"enabled": active["enabled"],
}

View file

@ -235,3 +235,46 @@ def test_moa_provider_rejected_case_insensitive():
assert cfg["presets"]["p"]["aggregator"]["provider"] != "moa"
assert cfg["presets"]["p"]["aggregator"] == DEFAULT_MOA_AGGREGATOR
def _preset(**extra):
base = {
"reference_models": [{"provider": "openrouter", "model": "anthropic/claude-opus-4.8"}],
"aggregator": {"provider": "openrouter", "model": "anthropic/claude-opus-4.8"},
}
base.update(extra)
return {"default_preset": "p", "presets": {"p": base}}
def test_reference_max_tokens_defaults_to_none_uncapped():
    """A preset that omits reference_max_tokens resolves to None (no cap), so
    existing presets keep their prior uncapped advisor behavior -- no silent
    regression."""
    resolved = resolve_moa_preset(_preset(), "p")
    cap = resolved["reference_max_tokens"]
    assert cap is None
def test_reference_max_tokens_positive_value_preserved():
    """A positive cap survives resolve_moa_preset unchanged."""
    raw_config = _preset(reference_max_tokens=600)
    resolved = resolve_moa_preset(raw_config, "p")
    assert resolved["reference_max_tokens"] == 600
def test_reference_max_tokens_invalid_falls_back_to_none():
    """Non-positive, non-numeric, blank and None caps all resolve to None
    (uncapped) instead of a nonsense cap or a crash."""
    invalid_caps = (0, -5, "abc", "", None)
    for candidate in invalid_caps:
        raw_config = _preset(reference_max_tokens=candidate)
        resolved = resolve_moa_preset(raw_config, "p")
        # The candidate is the assert message so a failure names the input.
        assert resolved["reference_max_tokens"] is None, candidate
def test_reference_max_tokens_string_number_coerced():
    """A numeric string such as '600' (e.g. from a hand-edited config.yaml)
    is coerced to the int 600."""
    raw_config = _preset(reference_max_tokens="600")
    resolved = resolve_moa_preset(raw_config, "p")
    assert resolved["reference_max_tokens"] == 600
def test_reference_max_tokens_in_flattened_view():
    """The flattened top-level view returned by normalize_moa_config exposes
    the default preset's reference_max_tokens."""
    raw_config = _preset(reference_max_tokens=750)
    flattened = normalize_moa_config(raw_config)
    assert flattened["reference_max_tokens"] == 750

View file

@ -97,6 +97,38 @@ Default preset:
- reference: `openrouter:deepseek/deepseek-v4-pro`
- aggregator / acting model: `openrouter:anthropic/claude-opus-4.8`
### Tuning advisor speed with `reference_max_tokens`
Each turn, MoA runs the reference models (advisors) in parallel and then the
aggregator acts. Advisor generation is the dominant per-turn latency — turn
wall time correlates strongly with how many tokens the advisors emit, because
the turn waits for the slowest advisor to finish writing. By default advisors
are **uncapped** (`reference_max_tokens` unset), so they may write long,
essay-length advice.
Set `reference_max_tokens` on a preset to cap advisor output and give concise
advice instead. The aggregator only needs the gist of each advisor's
judgement, so a cap (e.g. `600`) measurably cuts per-turn wall time with little
quality impact. It caps **advisors only** — the acting aggregator's output (the
user-visible answer) is never capped.
```yaml
moa:
presets:
fast:
reference_models:
- provider: openrouter
model: anthropic/claude-opus-4.8
- provider: openrouter
model: openai/gpt-5.5
aggregator:
provider: openrouter
model: anthropic/claude-opus-4.8
reference_max_tokens: 600 # concise advice → faster turns
```
Leave it unset (or `0`/blank) to keep the prior uncapped behavior.
## Terminal preset management
```bash