fix(agent): preserve final_response on failure returns

AIAgent.run_conversation() promises a dict with final_response, but 16
terminal-failure branches returned dicts that either omitted the key or
set it to None. Callers that index result['final_response'] directly
(run_agent.py chat() + the __main__ printer) turn a real provider/context
failure into an opaque KeyError instead of surfacing the actionable error.

Every offending branch already carried usable 'error' text, so this
mirrors that text into final_response for all 16 sites (8 that omitted the
key, 8 that returned None). Adds an AST regression test that fails if any
run_conversation() dict return omits final_response or sets it to a literal
None, and tightens the invalid-response test to assert final_response == error.
This commit is contained in:
Omar Baradei 2026-07-01 01:16:20 -07:00 committed by Teknium
parent 43edbae638
commit 053424c486
2 changed files with 89 additions and 30 deletions

View file

@ -1454,11 +1454,13 @@ def run_conversation(
agent._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
logger.error(f"{agent.log_prefix}Invalid API response after {max_retries} retries.")
agent._persist_session(messages, conversation_history)
_final_response = f"Invalid API response after {max_retries} retries: {_failure_hint}"
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Invalid API response after {max_retries} retries: {_failure_hint}",
"error": _final_response,
"failed": True # Mark as failure for filtering
}
@ -1891,18 +1893,19 @@ def run_conversation(
)
agent._cleanup_task_resources(effective_task_id)
agent._persist_session(messages, conversation_history)
_final_response = (
"Stream repeatedly dropped mid tool-call (network); "
"the tool was not executed"
if _is_stub_stall
else "Response truncated due to output length limit"
)
return {
"final_response": None,
"final_response": _final_response,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": (
"Stream repeatedly dropped mid tool-call (network); "
"the tool was not executed"
if _is_stub_stall
else "Response truncated due to output length limit"
),
"error": _final_response,
}
# If we have prior messages, roll back to last complete state
@ -1914,7 +1917,7 @@ def run_conversation(
agent._persist_session(messages, conversation_history)
return {
"final_response": None,
"final_response": "Response truncated due to output length limit",
"messages": rolled_back_messages,
"api_calls": api_call_count,
"completed": False,
@ -1927,7 +1930,7 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix}❌ First response truncated - cannot recover", force=True)
agent._persist_session(messages, conversation_history)
return {
"final_response": None,
"final_response": "First response truncated due to output length limit",
"messages": messages,
"api_calls": api_call_count,
"completed": False,
@ -2917,15 +2920,17 @@ def run_conversation(
f"auto-compaction disabled — not compressing."
)
agent._persist_session(messages, conversation_history)
_final_response = (
"Context overflow and auto-compaction is disabled "
"(compression.enabled: false). Run /compress to compact manually, "
"/new to start fresh, or switch to a larger-context model."
)
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": (
"Context overflow and auto-compaction is disabled "
"(compression.enabled: false). Run /compress to compact manually, "
"/new to start fresh, or switch to a larger-context model."
),
"error": _final_response,
"partial": True,
"failed": True,
"compaction_disabled": True,
@ -3200,11 +3205,13 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logger.error(f"{agent.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
agent._persist_session(messages, conversation_history)
_final_response = f"Request payload too large: max compression attempts ({max_compression_attempts}) reached."
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
"error": _final_response,
"partial": True,
"failed": True,
"compression_exhausted": True,
@ -3244,11 +3251,13 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logger.error(f"{agent.log_prefix}413 payload too large. Cannot compress further.")
agent._persist_session(messages, conversation_history)
_final_response = "Request payload too large (413). Cannot compress further."
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": "Request payload too large (413). Cannot compress further.",
"error": _final_response,
"partial": True,
"failed": True,
"compression_exhausted": True,
@ -3297,11 +3306,13 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logger.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
agent._persist_session(messages, conversation_history)
_final_response = f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached."
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
"error": _final_response,
"partial": True,
"failed": True,
"compression_exhausted": True,
@ -3336,14 +3347,16 @@ def run_conversation(
f"(max_tokens over provider cap): {error_msg[:200]}"
)
agent._persist_session(messages, conversation_history)
_final_response = (
"max_tokens exceeds the provider's output cap for this model. "
"Lower model.max_tokens in config.yaml."
)
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": (
"max_tokens exceeds the provider's output cap for this model. "
"Lower model.max_tokens in config.yaml."
),
"error": _final_response,
"partial": True,
"failed": True,
}
@ -3405,11 +3418,13 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
logger.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
agent._persist_session(messages, conversation_history)
_final_response = f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached."
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
"error": _final_response,
"partial": True,
"failed": True,
"compression_exhausted": True,
@ -3448,11 +3463,13 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} 💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True)
logger.error(f"{agent.log_prefix}Context length exceeded: {new_tokens:,} tokens. Cannot compress further.")
agent._persist_session(messages, conversation_history)
_final_response = f"Context length exceeded ({new_tokens:,} tokens). Cannot compress further."
return {
"final_response": _final_response,
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded ({new_tokens:,} tokens). Cannot compress further.",
"error": _final_response,
"partial": True,
"failed": True,
"compression_exhausted": True,
@ -3668,7 +3685,7 @@ def run_conversation(
error_detail=_nonretryable_summary,
)
return {
"final_response": None,
"final_response": _nonretryable_summary,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
@ -4125,7 +4142,7 @@ def run_conversation(
agent._persist_session(messages, conversation_history)
return {
"final_response": None,
"final_response": "Incomplete REASONING_SCRATCHPAD after 2 retries",
"messages": rolled_back_messages,
"api_calls": api_call_count,
"completed": False,
@ -4185,7 +4202,7 @@ def run_conversation(
agent._codex_incomplete_retries = 0
agent._persist_session(messages, conversation_history)
return {
"final_response": None,
"final_response": "Codex response remained incomplete after 3 continuation attempts",
"messages": messages,
"api_calls": api_call_count,
"completed": False,
@ -4231,13 +4248,14 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
agent._invalid_tool_retries = 0
agent._persist_session(messages, conversation_history)
_final_response = f"Model generated invalid tool call: {invalid_preview}"
return {
"final_response": None,
"final_response": _final_response,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": f"Model generated invalid tool call: {invalid_preview}"
"error": _final_response
}
assistant_msg = agent._build_assistant_message(assistant_message, finish_reason)
@ -4321,7 +4339,7 @@ def run_conversation(
agent._cleanup_task_resources(effective_task_id)
agent._persist_session(messages, conversation_history)
return {
"final_response": None,
"final_response": "Response truncated due to output length limit",
"messages": messages,
"api_calls": api_call_count,
"completed": False,

View file

@ -55,6 +55,46 @@ def test_is_destructive_command_treats_install_as_mutating():
assert run_agent._is_destructive_command("install template.env .env") is True
def test_run_conversation_dict_returns_include_final_response():
"""Structurally enforce final_response on dict returns from run_conversation().
This parses source, including nested helpers, so it requires the .py file
to be available. It guards key presence and literal None values; runtime
tests still cover branch-specific values.
"""
from agent import conversation_loop
try:
source = inspect.getsource(conversation_loop.run_conversation)
except OSError as exc:
pytest.skip(f"run_conversation source is unavailable: {exc}")
tree = ast.parse(source)
missing = []
literal_none = []
for node in ast.walk(tree):
if not isinstance(node, ast.Return) or not isinstance(node.value, ast.Dict):
continue
keys = [
key.value if isinstance(key, ast.Constant) else None
for key in node.value.keys
]
if "final_response" not in keys:
missing.append(node.lineno)
continue
value = node.value.values[keys.index("final_response")]
if isinstance(value, ast.Constant) and value.value is None:
literal_none.append(node.lineno)
assert missing == [], (
"run_conversation() dict returns must preserve the final_response "
f"contract; missing at source-local lines {missing}"
)
assert literal_none == [], (
"run_conversation() dict returns must expose actionable final_response "
f"text instead of literal None; literal None at source-local lines {literal_none}"
)
@pytest.fixture()
def agent():
"""Minimal AIAgent with mocked OpenAI client and tool loading."""
@ -5296,6 +5336,7 @@ class TestRetryExhaustion:
assert result.get("failed") is True
assert "error" in result
assert "Invalid API response" in result["error"]
assert result.get("final_response") == result["error"]
def test_content_filter_refusal_surfaced_not_retried(self, agent):
"""A model refusal must be surfaced immediately, NOT laundered into