feat(moa): add reference_max_tokens to cap advisor output and cut turn latency (#56756)
MoA per-turn latency is dominated by advisor GENERATION: turn wall time correlates ~0.88 with output tokens and ~-0.03 with input tokens (measured over 52 turns). Each turn waits for the slowest advisor to finish writing, and advisors were uncapped — writing multi-thousand-token essays the aggregator only needs the gist of. Add an opt-in per-preset reference_max_tokens knob (mirrors reference_temperature) that caps ADVISOR output only; the acting aggregator is never capped. Default None = uncapped, so existing presets are byte-for-byte unchanged (no regression). Wired through both MoA execution paths (MoAChatCompletions.create and aggregate_moa_context). E2E: same task, closed preset uncapped vs reference_max_tokens=600 -> 59s to 33s (~44% faster), final answer identical/correct. - hermes_cli/moa_config.py: _coerce_int_or_none helper + reference_max_tokens in _normalize_preset/_default_preset/flattened view - agent/moa_loop.py: read preset.reference_max_tokens, pass to reference fan-out - agent/conversation_loop.py: pass reference_max_tokens on the per-turn path - tests + docs
This commit is contained in:
parent
9be39de0f2
commit
543d305bbb
5 changed files with 117 additions and 5 deletions
|
|
@ -856,6 +856,7 @@ def run_conversation(
|
|||
aggregator=moa_config.get("aggregator") or {},
|
||||
temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
|
||||
aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
|
||||
max_tokens=moa_config.get("reference_max_tokens"),
|
||||
)
|
||||
if _moa_context:
|
||||
for _msg in reversed(api_messages):
|
||||
|
|
|
|||
|
|
@ -715,10 +715,17 @@ class MoAChatCompletions:
|
|||
# aggregator's spend (often the bulk of the turn) is silently dropped
|
||||
# and the session cost reflects advisor fan-out only.
|
||||
self.last_aggregator_slot = dict(aggregator) if aggregator else None
|
||||
# MoA does not cap reference or aggregator output: each model uses its
|
||||
# own maximum. Passing max_tokens=None makes call_llm omit the parameter
|
||||
# (it never caps by default), so a long aggregator synthesis is never
|
||||
# truncated and providers that reject max_tokens don't 400.
|
||||
# By default MoA does not cap reference or aggregator output: each model
|
||||
# uses its own maximum (max_tokens=None → call_llm omits the parameter,
|
||||
# so a long aggregator synthesis is never truncated and providers that
|
||||
# reject max_tokens don't 400). A preset MAY set reference_max_tokens to
|
||||
# cap ADVISOR output only — advisor generation is the dominant MoA
|
||||
# latency (turn latency correlates ~0.88 with output tokens), and the
|
||||
# aggregator only needs the gist of each advisor's judgement, so a cap
|
||||
# (e.g. 600) measurably cuts per-turn wall time (~44% on a sample task).
|
||||
# The acting aggregator is never capped here (its output is the
|
||||
# user-visible answer).
|
||||
reference_max_tokens = preset.get("reference_max_tokens")
|
||||
temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
|
||||
aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)
|
||||
|
||||
|
|
@ -762,7 +769,7 @@ class MoAChatCompletions:
|
|||
reference_models,
|
||||
ref_messages,
|
||||
temperature=temperature,
|
||||
max_tokens=None,
|
||||
max_tokens=reference_max_tokens,
|
||||
)
|
||||
self._ref_cache_key = _cache_key
|
||||
self._ref_cache_outputs = list(reference_outputs)
|
||||
|
|
|
|||
|
|
@ -42,6 +42,24 @@ def _coerce_int(value: Any, default: int) -> int:
|
|||
return default
|
||||
|
||||
|
||||
def _coerce_int_or_none(value: Any) -> int | None:
|
||||
"""Coerce to a positive int, or None when unset/blank/invalid/non-positive.
|
||||
|
||||
Used for optional caps (e.g. reference_max_tokens) where None means
|
||||
'no cap' — the safe default that preserves prior uncapped behavior.
|
||||
"""
|
||||
if value is None or value == "":
|
||||
return None
|
||||
try:
|
||||
n = int(value)
|
||||
except (TypeError, ValueError):
|
||||
try:
|
||||
n = int(float(value))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
return n if n > 0 else None
|
||||
|
||||
|
||||
def _clean_slot(slot: Any) -> dict[str, str] | None:
|
||||
if not isinstance(slot, dict):
|
||||
return None
|
||||
|
|
@ -66,6 +84,7 @@ def _default_preset() -> dict[str, Any]:
|
|||
"reference_temperature": 0.6,
|
||||
"aggregator_temperature": 0.4,
|
||||
"max_tokens": 4096,
|
||||
"reference_max_tokens": None,
|
||||
"enabled": True,
|
||||
}
|
||||
|
||||
|
|
@ -94,6 +113,15 @@ def _normalize_preset(raw: Any) -> dict[str, Any]:
|
|||
"reference_temperature": _coerce_float(raw.get("reference_temperature"), 0.6),
|
||||
"aggregator_temperature": _coerce_float(raw.get("aggregator_temperature"), 0.4),
|
||||
"max_tokens": _coerce_int(raw.get("max_tokens"), 4096),
|
||||
# Optional cap on how much each reference ADVISOR may generate per turn.
|
||||
# None (default) = uncapped: advisors write full-length advice, matching
|
||||
# prior behavior so existing presets are unchanged. Set a value (e.g.
|
||||
# 600) to make advisors give concise advice — the dominant MoA latency
|
||||
# is advisor generation (turn latency correlates ~0.88 with output
|
||||
# tokens), and the aggregator only needs the gist of each advisor's
|
||||
# judgement, so capping cuts per-turn wall time (~44% on a sample task). Does NOT cap
|
||||
# the acting aggregator (its output is the user-visible answer).
|
||||
"reference_max_tokens": _coerce_int_or_none(raw.get("reference_max_tokens")),
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -139,6 +167,7 @@ def normalize_moa_config(raw: Any) -> dict[str, Any]:
|
|||
"reference_temperature": active["reference_temperature"],
|
||||
"aggregator_temperature": active["aggregator_temperature"],
|
||||
"max_tokens": active["max_tokens"],
|
||||
"reference_max_tokens": active.get("reference_max_tokens"),
|
||||
"enabled": active["enabled"],
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -235,3 +235,46 @@ def test_moa_provider_rejected_case_insensitive():
|
|||
|
||||
assert cfg["presets"]["p"]["aggregator"]["provider"] != "moa"
|
||||
assert cfg["presets"]["p"]["aggregator"] == DEFAULT_MOA_AGGREGATOR
|
||||
|
||||
|
||||
def _preset(**extra):
|
||||
base = {
|
||||
"reference_models": [{"provider": "openrouter", "model": "anthropic/claude-opus-4.8"}],
|
||||
"aggregator": {"provider": "openrouter", "model": "anthropic/claude-opus-4.8"},
|
||||
}
|
||||
base.update(extra)
|
||||
return {"default_preset": "p", "presets": {"p": base}}
|
||||
|
||||
|
||||
def test_reference_max_tokens_defaults_to_none_uncapped():
    """A preset that never mentions reference_max_tokens must resolve to None
    (no cap), so presets written before the knob existed keep uncapped
    advisors."""
    resolved = resolve_moa_preset(_preset(), "p")
    assert resolved["reference_max_tokens"] is None
|
||||
|
||||
|
||||
def test_reference_max_tokens_positive_value_preserved():
    """An explicit positive cap survives resolve_moa_preset unchanged."""
    config = _preset(reference_max_tokens=600)
    resolved = resolve_moa_preset(config, "p")
    assert resolved["reference_max_tokens"] == 600
|
||||
|
||||
|
||||
def test_reference_max_tokens_invalid_falls_back_to_none():
    """Zero, negative, non-numeric, blank and None caps all resolve to None
    (uncapped) instead of producing a nonsense limit or raising."""
    invalid_caps = (0, -5, "abc", "", None)
    for bad in invalid_caps:
        resolved = resolve_moa_preset(_preset(reference_max_tokens=bad), "p")
        # The offending input is the assertion message so a failure names it.
        assert resolved["reference_max_tokens"] is None, bad
|
||||
|
||||
|
||||
def test_reference_max_tokens_string_number_coerced():
    """A quoted number (e.g. '600' in a hand-edited config.yaml) is coerced
    to the int 600."""
    config = _preset(reference_max_tokens="600")
    resolved = resolve_moa_preset(config, "p")
    assert resolved["reference_max_tokens"] == 600
|
||||
|
||||
|
||||
def test_reference_max_tokens_in_flattened_view():
    """normalize_moa_config surfaces the active preset's reference_max_tokens
    at the top level of the flattened compatibility view."""
    flattened = normalize_moa_config(_preset(reference_max_tokens=750))
    assert flattened["reference_max_tokens"] == 750
|
||||
|
|
|
|||
|
|
@ -97,6 +97,38 @@ Default preset:
|
|||
- reference: `openrouter:deepseek/deepseek-v4-pro`
|
||||
- aggregator / acting model: `openrouter:anthropic/claude-opus-4.8`
|
||||
|
||||
### Tuning advisor speed with `reference_max_tokens`
|
||||
|
||||
Each turn, MoA runs the reference models (advisors) in parallel and then the
|
||||
aggregator acts. Advisor generation is the dominant per-turn latency — turn
|
||||
wall time correlates strongly with how many tokens the advisors emit, because
|
||||
the turn waits for the slowest advisor to finish writing. By default advisors
|
||||
are **uncapped** (`reference_max_tokens` unset), so they may write long,
|
||||
essay-length advice.
|
||||
|
||||
Set `reference_max_tokens` on a preset to cap how many tokens each advisor may
generate per turn. The cap is passed to the advisor calls as `max_tokens`, so it
is a hard limit: advice that would run longer is cut off at the cap, not
condensed. The aggregator only needs the gist of each advisor's judgement, so a
cap (e.g. `600`) measurably cuts per-turn wall time with little quality impact.
It caps **advisors only** — the acting aggregator's output (the user-visible
answer) is never capped.
|
||||
|
||||
```yaml
|
||||
moa:
|
||||
presets:
|
||||
fast:
|
||||
reference_models:
|
||||
- provider: openrouter
|
||||
model: anthropic/claude-opus-4.8
|
||||
- provider: openrouter
|
||||
model: openai/gpt-5.5
|
||||
aggregator:
|
||||
provider: openrouter
|
||||
model: anthropic/claude-opus-4.8
|
||||
reference_max_tokens: 600 # concise advice → faster turns
|
||||
```
|
||||
|
||||
Leave it unset (or `0`/blank) to keep the prior uncapped behavior.
|
||||
|
||||
## Terminal preset management
|
||||
|
||||
```bash
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue