@@ -607,149 +607,196 @@ def _get_efficiency_rating(self, metrics: AggregatedMetrics) -> str:
def generate_leaderboard(
    self, output_dir: Path = Path("benchmark_results")
) -> None:
    """Generate leaderboard and detailed analysis files organized by version/strategy.

    Groups ``self.aggregated_metrics`` by version and then by strategy and
    writes, for every (version, strategy) pair:

    * ``<output_dir>/v<version>/<strategy>/leaderboard.json`` -- the models of
      that strategy ranked by ``performance_score`` (highest first);
    * ``<output_dir>/v<version>/<strategy>/<model>.json`` -- the full metric
      breakdown and the individual runs of one model.

    ``/`` in strategy and model names is replaced with ``-`` so that each
    name maps to a single path component. Afterwards summary counts and the
    five best entries across all versions and strategies are printed.

    Args:
        output_dir: Root directory for the generated files. It is created,
            including missing parent directories, if it does not exist.
    """
    print("Generating leaderboard...")

    # parents=True so that a nested, not-yet-existing output path works.
    output_dir.mkdir(parents=True, exist_ok=True)

    def write_json(path: Path, payload: dict) -> None:
        # Single place that controls encoding and formatting of all outputs.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    # Group metrics by version and strategy
    version_strategy_groups = defaultdict(lambda: defaultdict(list))
    for metrics in self.aggregated_metrics.values():
        version_strategy_groups[metrics.version][metrics.strategy].append(metrics)

    # Process each version
    for version, strategy_groups in version_strategy_groups.items():
        version_dir = output_dir / f"v{version}"
        version_dir.mkdir(parents=True, exist_ok=True)

        # Process each strategy within the version
        for strategy, metrics_list in strategy_groups.items():
            # Sanitize like the model filename below: a "/" would otherwise
            # turn the strategy into nested directories.
            strategy_dir = version_dir / strategy.replace("/", "-")
            strategy_dir.mkdir(parents=True, exist_ok=True)

            # Sort metrics within this strategy by performance score
            sorted_metrics = sorted(
                metrics_list,
                key=lambda x: x.performance_score,
                reverse=True,
            )

            # Generate strategy-specific leaderboard entries (rank is 1-based)
            leaderboard_entries = [
                LeaderboardEntry(
                    rank=rank,
                    version=metrics.version,
                    model=metrics.model,
                    strategy=metrics.strategy,
                    performance_score=metrics.performance_score,
                    win_rate=metrics.win_rate,
                    avg_final_ante=metrics.avg_final_ante,
                    avg_duration_seconds=metrics.avg_duration_seconds,
                    total_runs=metrics.total_runs,
                    completion_rate=metrics.completion_rate,
                    efficiency_rating=self._get_efficiency_rating(metrics),
                )
                for rank, metrics in enumerate(sorted_metrics, 1)
            ]

            # Write strategy-specific leaderboard.json. Version and strategy
            # are stored once at the top level, not repeated per entry.
            leaderboard_data = {
                "generated_at": datetime.now().isoformat(),
                "version": version,
                "strategy": strategy,
                "total_entries": len(leaderboard_entries),
                "total_runs_analyzed": sum(m.total_runs for m in metrics_list),
                "entries": [
                    {
                        "rank": entry.rank,
                        "model": entry.model,
                        "performance_score": round(entry.performance_score, 2),
                        "win_rate": round(entry.win_rate, 3),
                        "avg_final_ante": round(entry.avg_final_ante, 2),
                        "avg_duration_seconds": round(
                            entry.avg_duration_seconds, 1
                        ),
                        "total_runs": entry.total_runs,
                        "completion_rate": round(entry.completion_rate, 3),
                        "efficiency_rating": entry.efficiency_rating,
                    }
                    for entry in leaderboard_entries
                ],
            }

            leaderboard_file = strategy_dir / "leaderboard.json"
            write_json(leaderboard_file, leaderboard_data)

            # Write individual model files within the strategy directory
            for metrics in sorted_metrics:
                model_filename = metrics.model.replace("/", "-") + ".json"
                model_file = strategy_dir / model_filename

                # Descriptive fields come from the first run's config; fall
                # back to the defaults below when there are no runs at all.
                # NOTE(review): assumes raw_config is a dict -- confirm.
                raw_config = metrics.runs[0].raw_config if metrics.runs else {}

                model_data = {
                    "version": metrics.version,
                    "model": metrics.model,
                    "strategy": metrics.strategy,
                    "name": raw_config.get("name", "Unknown Name"),
                    "description": raw_config.get(
                        "description", "Unknown Description"
                    ),
                    "author": raw_config.get("author", "BalatroBench"),
                    "tags": raw_config.get("tags", []),
                    "summary": {
                        "performance_score": round(metrics.performance_score, 2),
                        "efficiency_rating": self._get_efficiency_rating(metrics),
                        "total_runs": metrics.total_runs,
                        "completed_runs": metrics.completed_runs,
                        "completion_rate": round(metrics.completion_rate, 3),
                    },
                    "performance_metrics": {
                        "win_rate": round(metrics.win_rate, 3),
                        "avg_final_ante": round(metrics.avg_final_ante, 2),
                        "avg_final_money": round(metrics.avg_final_money, 1),
                        "avg_peak_money": round(metrics.avg_peak_money, 1),
                        "avg_duration_seconds": round(
                            metrics.avg_duration_seconds, 1
                        ),
                    },
                    "llm_metrics": {
                        "avg_success_rate": round(metrics.avg_success_rate, 3),
                        "avg_response_time": round(metrics.avg_response_time, 3),
                        "avg_total_tokens": round(metrics.avg_total_tokens, 1),
                        "avg_tokens_per_request": round(
                            metrics.avg_tokens_per_request, 1
                        ),
                        "avg_parsing_error_rate": round(
                            metrics.avg_parsing_error_rate, 3
                        ),
                        "avg_timeout_error_rate": round(
                            metrics.avg_timeout_error_rate, 3
                        ),
                    },
                    "consistency_metrics": {
                        "std_final_ante": round(metrics.std_final_ante, 2),
                        "std_final_money": round(metrics.std_final_money, 1),
                        "std_duration": round(metrics.std_duration, 1),
                    },
                    "efficiency_metrics": {
                        "tokens_per_ante": round(metrics.tokens_per_ante, 1),
                        "seconds_per_ante": round(metrics.seconds_per_ante, 1),
                        "money_efficiency": round(metrics.money_efficiency, 4),
                    },
                    "strategy_metrics": {
                        "avg_shop_purchases": round(metrics.avg_shop_purchases, 1),
                        "avg_jokers_acquired": round(
                            metrics.avg_jokers_acquired, 1
                        ),
                        "avg_consumables_used": round(
                            metrics.avg_consumables_used, 1
                        ),
                        "avg_blinds_skipped": round(metrics.avg_blinds_skipped, 1),
                    },
                    "individual_runs": [
                        {
                            "seed": run.seed,
                            "deck": run.deck,
                            "stake": run.stake,
                            "completed": run.completed,
                            "won": run.won,
                            "final_ante": run.final_ante,
                            "final_money": run.final_money,
                            "peak_money": run.peak_money,
                            "duration_seconds": run.duration_seconds,
                            "total_tokens": run.total_tokens,
                            "success_rate": round(run.success_rate, 3),
                            "parsing_errors": run.parsing_errors,
                            "timeout_errors": run.timeout_errors,
                        }
                        for run in metrics.runs
                    ],
                }

                write_json(model_file, model_data)

            print(f"Generated strategy leaderboard: {leaderboard_file}")

    # Generate global summary statistics
    total_entries = len(self.aggregated_metrics)
    total_runs = len(self.run_metrics)

    print(
        f"Generated benchmark results with {total_entries} model/strategy combinations"
    )
    print(f"Total runs analyzed: {total_runs}")
    print(f"Results organized by version/strategy in: {output_dir}/")

    # Print top performers across all strategies for immediate feedback.
    # This ranking is global, unlike the per-strategy ranks written to disk.
    all_sorted_metrics = sorted(
        self.aggregated_metrics.values(),
        key=lambda x: x.performance_score,
        reverse=True,
    )

    print("\n Top 5 Performers Overall:")
    for position, metrics in enumerate(all_sorted_metrics[:5], 1):
        print(
            f"{position}. {metrics.model} ({metrics.strategy}, v{metrics.version}) - "
            f"Score: {metrics.performance_score:.1f}, "
            f"Win Rate: {metrics.win_rate:.1%}, "
            f"Avg Ante: {metrics.avg_final_ante:.1f}"
        )
754801
755802
@@ -766,4 +813,3 @@ def run_benchmark_analysis(
766813 analyzer .generate_leaderboard (output_dir )
767814
768815 print ("\n Benchmark analysis complete!" )
769-
0 commit comments