From eae3700b168500ef300a677ebcb6ebb2f8f6b837 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 1 Jul 2026 06:02:40 -0700
Subject: [PATCH] fix(moa): raise aux timeouts to 900s and give the Codex aux
 path a stable prompt_cache_key (#56395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent MoA auxiliary-call fixes:

#53866 — auxiliary.moa_reference.timeout and auxiliary.moa_aggregator.timeout
were 600s while moa_agent was 120s. Raise both to 900s so a genuinely long
reference/aggregator turn (mixed providers, deep reasoning, long tool chains)
has headroom instead of being cut mid-generation.

#53735 — _CodexCompletionsAdapter (the Codex/Responses auxiliary path used by
the MoA acting-aggregator, compression, web_extract, session_search, etc.)
never set prompt_cache_key, so it stayed cache-cold while the MAIN Responses
transport (agent/transports/codex.py) was warm. Derive the same
content-addressed key via the shared _content_cache_key(instructions, tools)
helper and set it on the aux Responses request, with the same host guards the
main transport uses (xAI carries the key in extra_body; GitHub/Copilot opts out
of cache-key routing).

Tests: 5 new prompt_cache_key cases (set+prefixed, stable across identical
prefix, differs on different instructions, skipped for xai/github hosts).
tests/agent/test_auxiliary_client.py 279 pass; tests/hermes_cli/test_config.py
130 pass.
---
 agent/auxiliary_client.py            | 26 ++++++++
 hermes_cli/config.py                 |  4 +-
 tests/agent/test_auxiliary_client.py | 96 ++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index ca42734d0..8ed7b5aab 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -884,6 +884,32 @@ class _CodexCompletionsAdapter:
             if converted:
                 resp_kwargs["tools"] = converted
 
+        # Stable prompt-cache routing for the Codex/Responses aux path, mirroring
+        # the main transport (agent/transports/codex.py::build_kwargs, which sets
+        # prompt_cache_key = _content_cache_key(instructions, tools)). Without
+        # this, MoA acting-aggregator and other auxiliary Responses calls stay
+        # cache-cold while the main Responses transport is warm (issue #53735).
+        # The key is content-addressed from the static prefix (instructions +
+        # tool schemas) so it stays warm across turns/fires. Guard the top-level
+        # field the same way the main transport does: xAI Responses takes the
+        # key in extra_body (not top-level) and GitHub/Copilot Responses opts
+        # out of cache-key routing entirely — for those hosts, skip it here.
+        try:
+            from agent.transports.codex import _content_cache_key
+            from utils import base_url_host_matches
+
+            _host_src = str(getattr(self._client, "base_url", "") or "")
+            _is_xai = base_url_host_matches(_host_src, "x.ai") or base_url_host_matches(_host_src, "api.x.ai")
+            _is_github = base_url_host_matches(_host_src, "githubcopilot.com")
+            if not _is_xai and not _is_github and "prompt_cache_key" not in resp_kwargs:
+                _cache_key = _content_cache_key(instructions, resp_kwargs.get("tools"))
+                if _cache_key:
+                    resp_kwargs["prompt_cache_key"] = _cache_key
+        except Exception:
+            logger.debug(
+                "Codex auxiliary: prompt_cache_key derivation skipped", exc_info=True
+            )
+
         # Stream and collect the response
         text_parts: List[str] = []
         tool_calls_raw: List[Any] = []
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 4b2b47658..17f506a5d 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -1624,7 +1624,7 @@ DEFAULT_CONFIG = {
             "model": "",
             "base_url": "",
             "api_key": "",
-            "timeout": 600,
+            "timeout": 900,
             "extra_body": {},
         },
         "moa_aggregator": {
@@ -1632,7 +1632,7 @@ DEFAULT_CONFIG = {
             "model": "",
             "base_url": "",
             "api_key": "",
-            "timeout": 600,
+            "timeout": 900,
             "extra_body": {},
         },
     },
diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py
index e66618e4d..5230f4b1c 100644
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -3604,6 +3604,102 @@ class TestCodexAdapterReasoningTranslation:
         assert captured.get("include") == ["reasoning.encrypted_content"]
 
 
+class TestCodexAdapterPromptCacheKey:
+    """_CodexCompletionsAdapter emits a stable content-addressed prompt_cache_key
+    on the Codex/Responses aux path, matching the main transport
+    (agent/transports/codex.py). Regression for issue #53735: MoA acting-
+    aggregator and other auxiliary Responses calls stayed cache-cold because
+    the adapter never set prompt_cache_key.
+    """
+
+    @staticmethod
+    def _build_adapter(base_url="https://chatgpt.com/backend-api/codex"):
+        from agent.auxiliary_client import _CodexCompletionsAdapter
+        from types import SimpleNamespace
+
+        message_item = SimpleNamespace(
+            type="message", role="assistant", status="completed",
+            content=[SimpleNamespace(type="output_text", text="hi")],
+        )
+        events = [
+            SimpleNamespace(type="response.created"),
+            SimpleNamespace(type="response.output_item.done", item=message_item),
+            SimpleNamespace(
+                type="response.completed",
+                response=SimpleNamespace(
+                    status="completed", id="resp_test",
+                    usage=SimpleNamespace(input_tokens=1, output_tokens=1, total_tokens=2),
+                ),
+            ),
+        ]
+
+        class _FakeCreateStream:
+            def __iter__(self): return iter(events)
+            def close(self): pass
+
+        captured_kwargs = {}
+
+        def _create(**kwargs):
+            captured_kwargs.update(kwargs)
+            return _FakeCreateStream()
+
+        real_client = MagicMock()
+        real_client.base_url = base_url
+        real_client.responses.create = _create
+        adapter = _CodexCompletionsAdapter(real_client, "gpt-5.5")
+        return adapter, captured_kwargs
+
+    def test_cache_key_set_and_prefixed(self):
+        adapter, captured = self._build_adapter()
+        adapter.create(messages=[
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "hi"},
+        ])
+        key = captured.get("prompt_cache_key")
+        assert isinstance(key, str) and key.startswith("pck_")
+
+    def test_cache_key_stable_across_identical_prefix(self):
+        """Same instructions + tools → same key (content-addressed, not per-call)."""
+        a1, c1 = self._build_adapter()
+        a1.create(messages=[
+            {"role": "system", "content": "SYS"},
+            {"role": "user", "content": "first"},
+        ])
+        a2, c2 = self._build_adapter()
+        a2.create(messages=[
+            {"role": "system", "content": "SYS"},
+            {"role": "user", "content": "second — different user turn"},
+        ])
+        # User-turn content differs but the static prefix (instructions) matches,
+        # so the routing key is identical → same warm cache bucket.
+        assert c1["prompt_cache_key"] == c2["prompt_cache_key"]
+
+    def test_cache_key_differs_on_different_instructions(self):
+        a1, c1 = self._build_adapter()
+        a1.create(messages=[{"role": "system", "content": "SYS-A"}, {"role": "user", "content": "x"}])
+        a2, c2 = self._build_adapter()
+        a2.create(messages=[{"role": "system", "content": "SYS-B"}, {"role": "user", "content": "x"}])
+        assert c1["prompt_cache_key"] != c2["prompt_cache_key"]
+
+    def test_cache_key_skipped_for_xai_host(self):
+        """xAI Responses takes the key in extra_body, not top-level — skip here."""
+        adapter, captured = self._build_adapter(base_url="https://api.x.ai/v1")
+        adapter.create(messages=[
+            {"role": "system", "content": "SYS"},
+            {"role": "user", "content": "hi"},
+        ])
+        assert "prompt_cache_key" not in captured
+
+    def test_cache_key_skipped_for_github_copilot_host(self):
+        """GitHub/Copilot Responses opts out of cache-key routing entirely."""
+        adapter, captured = self._build_adapter(base_url="https://api.githubcopilot.com")
+        adapter.create(messages=[
+            {"role": "system", "content": "SYS"},
+            {"role": "user", "content": "hi"},
+        ])
+        assert "prompt_cache_key" not in captured
+
+
 class TestVisionAutoSkipsKimiCoding:
     """_resolve_auto vision branch skips providers that have no vision on
     their main endpoint (e.g. Kimi Coding Plan /coding) and falls through