@@ -83,6 +83,16 @@ class RunStats:
8383 total_reasoning_tokens: Total reasoning tokens across all calls.
8484 total_tokens: Total tokens across all calls.
8585 total_response_time_ms: Total response time in milliseconds.
86+ total_cost: Total cost across all LLM calls.
87+ avg_cost_per_call: Average cost per LLM call that reported a cost.
88+ total_upstream_inference_cost: Total upstream inference cost.
89+ total_upstream_prompt_cost: Total upstream prompt cost.
90+ total_upstream_completion_cost: Total upstream completion cost.
91+ providers_used: List of unique providers used during the run.
92+ reasoning_calls: Number of calls that included reasoning content.
93+ avg_reasoning_content_length: Average reasoning content length over calls that included reasoning content.
94+ total_reasoning_content_length: Total length of all reasoning content.
95+ request_ids: List of request IDs for tracing and debugging.
8696 """
8797
8898 # Game Performance
@@ -114,6 +124,24 @@ class RunStats:
114124 total_tokens : int = 0
115125 total_response_time_ms : float = 0.0
116126
127+ # Cost Tracking
128+ total_cost : float = 0.0
129+ avg_cost_per_call : float = 0.0
130+ total_upstream_inference_cost : float = 0.0
131+ total_upstream_prompt_cost : float = 0.0
132+ total_upstream_completion_cost : float = 0.0
133+
134+ # Provider Tracking
135+ providers_used : list [str ] = field (default_factory = list )
136+
137+ # Reasoning Analysis
138+ reasoning_calls : int = 0
139+ avg_reasoning_content_length : float = 0.0
140+ total_reasoning_content_length : int = 0
141+
142+ # Request Tracking
143+ request_ids : list [str ] = field (default_factory = list )
144+
117145
118146@dataclass
119147class RunStatsCollector :
@@ -361,6 +389,12 @@ def calculate_stats(self) -> RunStats:
361389 output_tokens = []
362390 reasoning_tokens = []
363391 total_tokens = []
392+ costs = []
393+ upstream_inference_costs = []
394+ upstream_prompt_costs = []
395+ upstream_completion_costs = []
396+ reasoning_content_lengths = []
397+ providers = []
364398
365399 with open (responses_path , "r" ) as f :
366400 for line in f :
@@ -374,14 +408,68 @@ def calculate_stats(self) -> RunStats:
374408 message = body .get ("choices" , [{}])[0 ].get ("message" , {})
375409 usage = body .get ("usage" , {})
376410
411+ # Token tracking
377412 if "prompt_tokens" in usage :
378413 input_tokens .append (usage ["prompt_tokens" ])
379414 if "completion_tokens" in usage :
380415 output_tokens .append (usage ["completion_tokens" ])
381416 if "reasoning_tokens" in usage :
382417 reasoning_tokens .append (usage ["reasoning_tokens" ])
418+ elif usage .get ("completion_tokens_details" , {}).get (
419+ "reasoning_tokens"
420+ ):
421+ reasoning_tokens .append (
422+ usage ["completion_tokens_details" ]["reasoning_tokens" ]
423+ )
383424 if "total_tokens" in usage :
384425 total_tokens .append (usage ["total_tokens" ])
426+
427+ # Cost tracking
428+ if "cost" in usage and usage ["cost" ] is not None :
429+ costs .append (usage ["cost" ])
430+ cost_details = usage .get ("cost_details" , {})
431+ if (
432+ "upstream_inference_cost" in cost_details
433+ and cost_details ["upstream_inference_cost" ] is not None
434+ ):
435+ upstream_inference_costs .append (
436+ cost_details ["upstream_inference_cost" ]
437+ )
438+ if (
439+ "upstream_inference_prompt_cost" in cost_details
440+ and cost_details ["upstream_inference_prompt_cost" ]
441+ is not None
442+ ):
443+ upstream_prompt_costs .append (
444+ cost_details ["upstream_inference_prompt_cost" ]
445+ )
446+ if (
447+ "upstream_inference_completions_cost" in cost_details
448+ and cost_details ["upstream_inference_completions_cost" ]
449+ is not None
450+ ):
451+ upstream_completion_costs .append (
452+ cost_details ["upstream_inference_completions_cost" ]
453+ )
454+
455+ # Provider tracking
456+ if "provider" in body :
457+ provider = body ["provider" ]
458+ providers .append (provider )
459+ if provider not in stats .providers_used :
460+ stats .providers_used .append (provider )
461+
462+ # Request ID tracking
463+ request_id = response .get ("response" , {}).get ("request_id" )
464+ if request_id :
465+ stats .request_ids .append (request_id )
466+
467+ # Reasoning content analysis
468+ reasoning_content = message .get ("reasoning_content" , "" )
469+ if reasoning_content :
470+ stats .reasoning_calls += 1
471+ reasoning_content_lengths .append (len (reasoning_content ))
472+
385473 if message .get ("tool_calls" ) is None :
386474 stats .invalid_responses += 1
387475
@@ -407,4 +495,19 @@ def calculate_stats(self) -> RunStats:
407495 sum (total_tokens ) / len (total_tokens ) if total_tokens else 0.0
408496 )
409497
498+ # Calculate cost totals and averages
499+ stats .total_cost = sum (costs )
500+ stats .avg_cost_per_call = sum (costs ) / len (costs ) if costs else 0.0
501+ stats .total_upstream_inference_cost = sum (upstream_inference_costs )
502+ stats .total_upstream_prompt_cost = sum (upstream_prompt_costs )
503+ stats .total_upstream_completion_cost = sum (upstream_completion_costs )
504+
505+ # Calculate reasoning content averages
506+ stats .total_reasoning_content_length = sum (reasoning_content_lengths )
507+ stats .avg_reasoning_content_length = (
508+ sum (reasoning_content_lengths ) / len (reasoning_content_lengths )
509+ if reasoning_content_lengths
510+ else 0.0
511+ )
512+
410513 return stats
0 commit comments