fix(moa): raise aux timeouts to 900s and give the Codex aux path a stable prompt_cache_key (#56395)
Two independent MoA auxiliary-call fixes. (1) #53866 — auxiliary.moa_reference.timeout and auxiliary.moa_aggregator.timeout were 600s while moa_agent was 120s. Raise both to 900s so a genuinely long reference/aggregator turn (mixed providers, deep reasoning, long tool chains) has headroom instead of being cut off mid-generation. (2) #53735 — _CodexCompletionsAdapter (the Codex/Responses auxiliary path used by the MoA acting-aggregator, compression, web_extract, session_search, etc.) never set prompt_cache_key, so it stayed cache-cold while the main Responses transport (agent/transports/codex.py) was warm. Derive the same content-addressed key via the shared _content_cache_key(instructions, tools) helper and set it on the aux Responses request, with the same host guards the main transport uses (xAI carries the key in extra_body; GitHub/Copilot opts out of cache-key routing). Tests: 5 new prompt_cache_key cases (key set and prefixed, stable across an identical prefix, differs on different instructions, skipped for xAI and GitHub hosts). tests/agent/test_auxiliary_client.py: 279 pass; tests/hermes_cli/test_config.py: 130 pass.
This commit is contained in:
parent
aa605b66c8
commit
eae3700b16
3 changed files with 124 additions and 2 deletions
|
|
@ -884,6 +884,32 @@ class _CodexCompletionsAdapter:
|
|||
if converted:
|
||||
resp_kwargs["tools"] = converted
|
||||
|
||||
# Stable prompt-cache routing for the Codex/Responses aux path, mirroring
|
||||
# the main transport (agent/transports/codex.py::build_kwargs, which sets
|
||||
# prompt_cache_key = _content_cache_key(instructions, tools)). Without
|
||||
# this, MoA acting-aggregator and other auxiliary Responses calls stay
|
||||
# cache-cold while the main Responses transport is warm (issue #53735).
|
||||
# The key is content-addressed from the static prefix (instructions +
|
||||
# tool schemas) so it stays warm across turns/fires. Guard the top-level
|
||||
# field the same way the main transport does: xAI Responses takes the
|
||||
# key in extra_body (not top-level) and GitHub/Copilot Responses opts
|
||||
# out of cache-key routing entirely — for those hosts, skip it here.
|
||||
try:
|
||||
from agent.transports.codex import _content_cache_key
|
||||
from utils import base_url_host_matches
|
||||
|
||||
_host_src = str(getattr(self._client, "base_url", "") or "")
|
||||
_is_xai = base_url_host_matches(_host_src, "x.ai") or base_url_host_matches(_host_src, "api.x.ai")
|
||||
_is_github = base_url_host_matches(_host_src, "githubcopilot.com")
|
||||
if not _is_xai and not _is_github and "prompt_cache_key" not in resp_kwargs:
|
||||
_cache_key = _content_cache_key(instructions, resp_kwargs.get("tools"))
|
||||
if _cache_key:
|
||||
resp_kwargs["prompt_cache_key"] = _cache_key
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Codex auxiliary: prompt_cache_key derivation skipped", exc_info=True
|
||||
)
|
||||
|
||||
# Stream and collect the response
|
||||
text_parts: List[str] = []
|
||||
tool_calls_raw: List[Any] = []
|
||||
|
|
|
|||
|
|
@ -1624,7 +1624,7 @@ DEFAULT_CONFIG = {
|
|||
"model": "",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 600,
|
||||
"timeout": 900,
|
||||
"extra_body": {},
|
||||
},
|
||||
"moa_aggregator": {
|
||||
|
|
@ -1632,7 +1632,7 @@ DEFAULT_CONFIG = {
|
|||
"model": "",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 600,
|
||||
"timeout": 900,
|
||||
"extra_body": {},
|
||||
},
|
||||
},
|
||||
|
|
|
|||
|
|
@ -3604,6 +3604,102 @@ class TestCodexAdapterReasoningTranslation:
|
|||
assert captured.get("include") == ["reasoning.encrypted_content"]
|
||||
|
||||
|
||||
class TestCodexAdapterPromptCacheKey:
    """Prompt-cache routing on the Codex/Responses auxiliary path.

    ``_CodexCompletionsAdapter`` is expected to send a stable,
    content-addressed ``prompt_cache_key``, matching the main transport
    (agent/transports/codex.py).  Regression coverage for issue #53735: the
    adapter never set the key, so MoA acting-aggregator and other auxiliary
    Responses calls stayed cache-cold.
    """

    @staticmethod
    def _messages(system_text, user_text):
        """Return a two-message chat: one system turn followed by one user turn."""
        return [
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ]

    @staticmethod
    def _build_adapter(base_url="https://chatgpt.com/backend-api/codex"):
        """Build an adapter over a fake Responses client.

        Returns ``(adapter, captured_kwargs)``.  ``captured_kwargs`` is a dict
        that gets updated with the keyword arguments of every
        ``responses.create`` call made through the fake client, so tests can
        inspect the request the adapter assembled.
        """
        from types import SimpleNamespace
        from agent.auxiliary_client import _CodexCompletionsAdapter

        # Canned stream: created -> one finished assistant message -> completed.
        usage = SimpleNamespace(input_tokens=1, output_tokens=1, total_tokens=2)
        final_response = SimpleNamespace(
            status="completed",
            id="resp_test",
            usage=usage,
        )
        assistant_item = SimpleNamespace(
            type="message",
            role="assistant",
            status="completed",
            content=[SimpleNamespace(type="output_text", text="hi")],
        )
        stream_events = [
            SimpleNamespace(type="response.created"),
            SimpleNamespace(type="response.output_item.done", item=assistant_item),
            SimpleNamespace(type="response.completed", response=final_response),
        ]

        class _FakeCreateStream:
            def __iter__(self):
                return iter(stream_events)

            def close(self):
                pass

        seen_kwargs = {}

        def _record_and_stream(**request_kwargs):
            seen_kwargs.update(request_kwargs)
            return _FakeCreateStream()

        fake_client = MagicMock()
        fake_client.base_url = base_url
        fake_client.responses.create = _record_and_stream
        return _CodexCompletionsAdapter(fake_client, "gpt-5.5"), seen_kwargs

    def test_cache_key_set_and_prefixed(self):
        """A default-host request carries a string key with the ``pck_`` prefix."""
        adapter, captured = self._build_adapter()
        adapter.create(messages=self._messages("You are helpful.", "hi"))
        key = captured.get("prompt_cache_key")
        assert isinstance(key, str)
        assert key.startswith("pck_")

    def test_cache_key_stable_across_identical_prefix(self):
        """Same instructions + tools → same key (content-addressed, not per-call)."""
        first_adapter, first_kwargs = self._build_adapter()
        first_adapter.create(messages=self._messages("SYS", "first"))
        second_adapter, second_kwargs = self._build_adapter()
        second_adapter.create(
            messages=self._messages("SYS", "second — different user turn")
        )
        # Only the user turn differs; the static prefix (instructions) is the
        # same, so both requests must land in the same warm cache bucket.
        assert first_kwargs["prompt_cache_key"] == second_kwargs["prompt_cache_key"]

    def test_cache_key_differs_on_different_instructions(self):
        """Changing the system instructions changes the routing key."""
        adapter_a, kwargs_a = self._build_adapter()
        adapter_a.create(messages=self._messages("SYS-A", "x"))
        adapter_b, kwargs_b = self._build_adapter()
        adapter_b.create(messages=self._messages("SYS-B", "x"))
        assert kwargs_a["prompt_cache_key"] != kwargs_b["prompt_cache_key"]

    def test_cache_key_skipped_for_xai_host(self):
        """xAI Responses takes the key in extra_body, not top-level — skip here."""
        adapter, captured = self._build_adapter(base_url="https://api.x.ai/v1")
        adapter.create(messages=self._messages("SYS", "hi"))
        assert "prompt_cache_key" not in captured

    def test_cache_key_skipped_for_github_copilot_host(self):
        """GitHub/Copilot Responses opts out of cache-key routing entirely."""
        adapter, captured = self._build_adapter(
            base_url="https://api.githubcopilot.com"
        )
        adapter.create(messages=self._messages("SYS", "hi"))
        assert "prompt_cache_key" not in captured
|
||||
|
||||
|
||||
class TestVisionAutoSkipsKimiCoding:
|
||||
"""_resolve_auto vision branch skips providers that have no vision on
|
||||
their main endpoint (e.g. Kimi Coding Plan /coding) and falls through
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue