From eae3700b168500ef300a677ebcb6ebb2f8f6b837 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 1 Jul 2026 06:02:40 -0700
Subject: [PATCH] fix(moa): raise aux timeouts to 900s and give the Codex aux path a stable prompt_cache_key (#56395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent MoA auxiliary-call fixes:

#53866 — auxiliary.moa_reference.timeout and
auxiliary.moa_aggregator.timeout were 600s while moa_agent was 120s.
Raise both to 900s so a genuinely long reference/aggregator turn (mixed
providers, deep reasoning, long tool chains) has headroom instead of
being cut mid-generation.

#53735 — _CodexCompletionsAdapter (the Codex/Responses auxiliary path
used by the MoA acting-aggregator, compression, web_extract,
session_search, etc.) never set prompt_cache_key, so it stayed
cache-cold while the MAIN Responses transport
(agent/transports/codex.py) was warm. Derive the same content-addressed
key via the shared _content_cache_key(instructions, tools) helper and
set it on the aux Responses request, with the same host guards the main
transport uses (xAI carries the key in extra_body; GitHub/Copilot opts
out of cache-key routing).

Tests: 5 new prompt_cache_key cases (set+prefixed, stable across
identical prefix, differs on different instructions, skipped for
xai/github hosts). tests/agent/test_auxiliary_client.py 279 pass;
tests/hermes_cli/test_config.py 130 pass.
--- agent/auxiliary_client.py | 26 ++++++++ hermes_cli/config.py | 4 +- tests/agent/test_auxiliary_client.py | 96 ++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 2 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index ca42734d0..8ed7b5aab 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -884,6 +884,32 @@ class _CodexCompletionsAdapter: if converted: resp_kwargs["tools"] = converted + # Stable prompt-cache routing for the Codex/Responses aux path, mirroring + # the main transport (agent/transports/codex.py::build_kwargs, which sets + # prompt_cache_key = _content_cache_key(instructions, tools)). Without + # this, MoA acting-aggregator and other auxiliary Responses calls stay + # cache-cold while the main Responses transport is warm (issue #53735). + # The key is content-addressed from the static prefix (instructions + + # tool schemas) so it stays warm across turns/fires. Guard the top-level + # field the same way the main transport does: xAI Responses takes the + # key in extra_body (not top-level) and GitHub/Copilot Responses opts + # out of cache-key routing entirely — for those hosts, skip it here. 
+ try: + from agent.transports.codex import _content_cache_key + from utils import base_url_host_matches + + _host_src = str(getattr(self._client, "base_url", "") or "") + _is_xai = base_url_host_matches(_host_src, "x.ai") or base_url_host_matches(_host_src, "api.x.ai") + _is_github = base_url_host_matches(_host_src, "githubcopilot.com") + if not _is_xai and not _is_github and "prompt_cache_key" not in resp_kwargs: + _cache_key = _content_cache_key(instructions, resp_kwargs.get("tools")) + if _cache_key: + resp_kwargs["prompt_cache_key"] = _cache_key + except Exception: + logger.debug( + "Codex auxiliary: prompt_cache_key derivation skipped", exc_info=True + ) + # Stream and collect the response text_parts: List[str] = [] tool_calls_raw: List[Any] = [] diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 4b2b47658..17f506a5d 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1624,7 +1624,7 @@ DEFAULT_CONFIG = { "model": "", "base_url": "", "api_key": "", - "timeout": 600, + "timeout": 900, "extra_body": {}, }, "moa_aggregator": { @@ -1632,7 +1632,7 @@ DEFAULT_CONFIG = { "model": "", "base_url": "", "api_key": "", - "timeout": 600, + "timeout": 900, "extra_body": {}, }, }, diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index e66618e4d..5230f4b1c 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -3604,6 +3604,102 @@ class TestCodexAdapterReasoningTranslation: assert captured.get("include") == ["reasoning.encrypted_content"] +class TestCodexAdapterPromptCacheKey: + """_CodexCompletionsAdapter emits a stable content-addressed prompt_cache_key + on the Codex/Responses aux path, matching the main transport + (agent/transports/codex.py). Regression for issue #53735: MoA acting- + aggregator and other auxiliary Responses calls stayed cache-cold because + the adapter never set prompt_cache_key. 
+ """ + + @staticmethod + def _build_adapter(base_url="https://chatgpt.com/backend-api/codex"): + from agent.auxiliary_client import _CodexCompletionsAdapter + from types import SimpleNamespace + + message_item = SimpleNamespace( + type="message", role="assistant", status="completed", + content=[SimpleNamespace(type="output_text", text="hi")], + ) + events = [ + SimpleNamespace(type="response.created"), + SimpleNamespace(type="response.output_item.done", item=message_item), + SimpleNamespace( + type="response.completed", + response=SimpleNamespace( + status="completed", id="resp_test", + usage=SimpleNamespace(input_tokens=1, output_tokens=1, total_tokens=2), + ), + ), + ] + + class _FakeCreateStream: + def __iter__(self): return iter(events) + def close(self): pass + + captured_kwargs = {} + + def _create(**kwargs): + captured_kwargs.update(kwargs) + return _FakeCreateStream() + + real_client = MagicMock() + real_client.base_url = base_url + real_client.responses.create = _create + adapter = _CodexCompletionsAdapter(real_client, "gpt-5.5") + return adapter, captured_kwargs + + def test_cache_key_set_and_prefixed(self): + adapter, captured = self._build_adapter() + adapter.create(messages=[ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "hi"}, + ]) + key = captured.get("prompt_cache_key") + assert isinstance(key, str) and key.startswith("pck_") + + def test_cache_key_stable_across_identical_prefix(self): + """Same instructions + tools → same key (content-addressed, not per-call).""" + a1, c1 = self._build_adapter() + a1.create(messages=[ + {"role": "system", "content": "SYS"}, + {"role": "user", "content": "first"}, + ]) + a2, c2 = self._build_adapter() + a2.create(messages=[ + {"role": "system", "content": "SYS"}, + {"role": "user", "content": "second — different user turn"}, + ]) + # User-turn content differs but the static prefix (instructions) matches, + # so the routing key is identical → same warm cache bucket. 
+ assert c1["prompt_cache_key"] == c2["prompt_cache_key"] + + def test_cache_key_differs_on_different_instructions(self): + a1, c1 = self._build_adapter() + a1.create(messages=[{"role": "system", "content": "SYS-A"}, {"role": "user", "content": "x"}]) + a2, c2 = self._build_adapter() + a2.create(messages=[{"role": "system", "content": "SYS-B"}, {"role": "user", "content": "x"}]) + assert c1["prompt_cache_key"] != c2["prompt_cache_key"] + + def test_cache_key_skipped_for_xai_host(self): + """xAI Responses takes the key in extra_body, not top-level — skip here.""" + adapter, captured = self._build_adapter(base_url="https://api.x.ai/v1") + adapter.create(messages=[ + {"role": "system", "content": "SYS"}, + {"role": "user", "content": "hi"}, + ]) + assert "prompt_cache_key" not in captured + + def test_cache_key_skipped_for_github_copilot_host(self): + """GitHub/Copilot Responses opts out of cache-key routing entirely.""" + adapter, captured = self._build_adapter(base_url="https://api.githubcopilot.com") + adapter.create(messages=[ + {"role": "system", "content": "SYS"}, + {"role": "user", "content": "hi"}, + ]) + assert "prompt_cache_key" not in captured + + class TestVisionAutoSkipsKimiCoding: """_resolve_auto vision branch skips providers that have no vision on their main endpoint (e.g. Kimi Coding Plan /coding) and falls through