From 3a122ba4acaabec5768ceddb46da82e43c382d7c Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 2 Jul 2026 13:52:42 -0700
Subject: [PATCH] fix(usage): capture reasoning_tokens from
 completion_tokens_details on chat_completions (#57340)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

normalize_usage only read output_tokens_details.reasoning_tokens (the
Responses API shape). Chat Completions providers — OpenAI, OpenRouter,
DeepSeek, and every OpenAI-compatible proxy — report it under
completion_tokens_details.reasoning_tokens, so reasoning_tokens was 0 for
every chat_completions reasoning model: hidden thinking was invisible in
session accounting, MoA traces, and the eval's per-task token columns.

Measured impact (HermesBench MoA run on deepseek-v4-flash, 4,828 advisor
calls): reasoning_tokens showed 0 everywhere while individual calls burned
up to 21.5K hidden thinking tokens to emit ~500 visible tokens. Verified
live against OpenRouter: deepseek-v4-flash returns
completion_tokens_details.reasoning_tokens=61 for a 74-completion-token
call; the field was simply never read.

Responses-shape reads are unchanged; the new read is a fallback that only
fires when the Responses-shape read yielded no reasoning tokens (details
missing, or reasoning_tokens absent or 0).
---
 agent/usage_pricing.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/agent/usage_pricing.py b/agent/usage_pricing.py
index 32338caff..d7b56a9fa 100644
--- a/agent/usage_pricing.py
+++ b/agent/usage_pricing.py
@@ -820,9 +820,22 @@ def normalize_usage(
         input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens)
 
     reasoning_tokens = 0
+    # Responses API shape: output_tokens_details.reasoning_tokens.
+    # Chat Completions shape (OpenAI, OpenRouter, DeepSeek, etc.):
+    # completion_tokens_details.reasoning_tokens. Reading only the former
+    # left reasoning_tokens=0 for every chat_completions reasoning model —
+    # hidden thinking was invisible in session accounting even though it
+    # dominates output spend on models like deepseek-v4-flash (measured:
+    # single calls burning 21K reasoning tokens to emit 500 visible tokens).
     output_details = getattr(response_usage, "output_tokens_details", None)
     if output_details:
         reasoning_tokens = _to_int(getattr(output_details, "reasoning_tokens", 0))
+    if not reasoning_tokens:
+        completion_details = getattr(response_usage, "completion_tokens_details", None)
+        if completion_details:
+            reasoning_tokens = _to_int(
+                getattr(completion_details, "reasoning_tokens", 0)
+            )
 
     return CanonicalUsage(
         input_tokens=input_tokens,