From 3a122ba4acaabec5768ceddb46da82e43c382d7c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 2 Jul 2026 13:52:42 -0700 Subject: [PATCH] fix(usage): capture reasoning_tokens from completion_tokens_details on chat_completions (#57340) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit normalize_usage only read output_tokens_details.reasoning_tokens (the Responses API shape). Chat Completions providers — OpenAI, OpenRouter, DeepSeek, and every OpenAI-compatible proxy — report it under completion_tokens_details.reasoning_tokens, so reasoning_tokens was 0 for every chat_completions reasoning model: hidden thinking was invisible in session accounting, MoA traces, and the eval's per-task token columns. Measured impact (HermesBench MoA run on deepseek-v4-flash, 4,828 advisor calls): reasoning_tokens showed 0 everywhere while individual calls burned up to 21.5K hidden thinking tokens to emit ~500 visible tokens. Verified live against OpenRouter: deepseek-v4-flash returns completion_tokens_details.reasoning_tokens=61 for a 74-completion-token call; the field was simply never read. Responses-shape reads are unchanged; the new read only fires when the Responses shape yielded nothing. --- agent/usage_pricing.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/agent/usage_pricing.py b/agent/usage_pricing.py index 32338caff..d7b56a9fa 100644 --- a/agent/usage_pricing.py +++ b/agent/usage_pricing.py @@ -820,9 +820,22 @@ def normalize_usage( input_tokens = max(0, prompt_total - cache_read_tokens - cache_write_tokens) reasoning_tokens = 0 + # Responses API shape: output_tokens_details.reasoning_tokens. + # Chat Completions shape (OpenAI, OpenRouter, DeepSeek, etc.): + # completion_tokens_details.reasoning_tokens. Reading only the former + # left reasoning_tokens=0 for every chat_completions reasoning model — + # hidden thinking was invisible in session accounting even though it + # dominates output spend on models like deepseek-v4-flash (measured: + # single calls burning 21K reasoning tokens to emit 500 visible tokens). output_details = getattr(response_usage, "output_tokens_details", None) if output_details: reasoning_tokens = _to_int(getattr(output_details, "reasoning_tokens", 0)) + if not reasoning_tokens: + completion_details = getattr(response_usage, "completion_tokens_details", None) + if completion_details: + reasoning_tokens = _to_int( + getattr(completion_details, "reasoning_tokens", 0) + ) return CanonicalUsage( input_tokens=input_tokens,