Skip to content

Commit 68d011b

Browse files
committed
feat: add additional fields to config.json for community metadata
1 parent 9c02208 commit 68d011b

5 files changed

Lines changed: 199 additions & 144 deletions

File tree

src/balatrollm/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,13 @@
44

55
import argparse
66
import asyncio
7-
import json
87
import os
98
import sys
109
from pathlib import Path
1110

11+
from .benchmark import run_benchmark_analysis
1212
from .bot import LLMBot, setup_logging
1313
from .config import Config
14-
from .benchmark import run_benchmark_analysis
1514

1615

1716
def main() -> None:

src/balatrollm/benchmark.py

Lines changed: 178 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -607,149 +607,196 @@ def _get_efficiency_rating(self, metrics: AggregatedMetrics) -> str:
607607
def generate_leaderboard(
608608
self, output_dir: Path = Path("benchmark_results")
609609
) -> None:
610-
"""Generate leaderboard and detailed analysis files."""
610+
"""Generate leaderboard and detailed analysis files organized by version/strategy."""
611611
print("Generating leaderboard...")
612612

613613
output_dir.mkdir(exist_ok=True)
614614

615-
# Sort by performance score
616-
sorted_metrics = sorted(
617-
self.aggregated_metrics.values(),
618-
key=lambda x: x.performance_score,
619-
reverse=True,
620-
)
615+
# Group metrics by version and strategy
616+
version_strategy_groups = defaultdict(lambda: defaultdict(list))
617+
for metrics in self.aggregated_metrics.values():
618+
version_strategy_groups[metrics.version][metrics.strategy].append(metrics)
619+
620+
# Process each version
621+
for version, strategy_groups in version_strategy_groups.items():
622+
version_dir = output_dir / f"v{version}"
623+
version_dir.mkdir(exist_ok=True)
624+
625+
# Process each strategy within the version
626+
for strategy, metrics_list in strategy_groups.items():
627+
strategy_dir = version_dir / strategy
628+
strategy_dir.mkdir(exist_ok=True)
629+
630+
# Sort metrics within this strategy by performance score
631+
sorted_metrics = sorted(
632+
metrics_list,
633+
key=lambda x: x.performance_score,
634+
reverse=True,
635+
)
621636

622-
# Generate leaderboard entries
623-
leaderboard_entries = []
624-
for rank, metrics in enumerate(sorted_metrics, 1):
625-
entry = LeaderboardEntry(
626-
rank=rank,
627-
version=metrics.version,
628-
model=metrics.model,
629-
strategy=metrics.strategy,
630-
performance_score=metrics.performance_score,
631-
win_rate=metrics.win_rate,
632-
avg_final_ante=metrics.avg_final_ante,
633-
avg_duration_seconds=metrics.avg_duration_seconds,
634-
total_runs=metrics.total_runs,
635-
completion_rate=metrics.completion_rate,
636-
efficiency_rating=self._get_efficiency_rating(metrics),
637-
)
638-
leaderboard_entries.append(entry)
639-
640-
# Write leaderboard.json
641-
leaderboard_data = {
642-
"generated_at": datetime.now().isoformat(),
643-
"total_entries": len(leaderboard_entries),
644-
"total_runs_analyzed": len(self.run_metrics),
645-
"entries": [
646-
{
647-
"rank": entry.rank,
648-
"version": entry.version,
649-
"model": entry.model,
650-
"strategy": entry.strategy,
651-
"performance_score": round(entry.performance_score, 2),
652-
"win_rate": round(entry.win_rate, 3),
653-
"avg_final_ante": round(entry.avg_final_ante, 2),
654-
"avg_duration_seconds": round(entry.avg_duration_seconds, 1),
655-
"total_runs": entry.total_runs,
656-
"completion_rate": round(entry.completion_rate, 3),
657-
"efficiency_rating": entry.efficiency_rating,
637+
# Generate strategy-specific leaderboard entries
638+
leaderboard_entries = []
639+
for rank, metrics in enumerate(sorted_metrics, 1):
640+
entry = LeaderboardEntry(
641+
rank=rank,
642+
version=metrics.version,
643+
model=metrics.model,
644+
strategy=metrics.strategy,
645+
performance_score=metrics.performance_score,
646+
win_rate=metrics.win_rate,
647+
avg_final_ante=metrics.avg_final_ante,
648+
avg_duration_seconds=metrics.avg_duration_seconds,
649+
total_runs=metrics.total_runs,
650+
completion_rate=metrics.completion_rate,
651+
efficiency_rating=self._get_efficiency_rating(metrics),
652+
)
653+
leaderboard_entries.append(entry)
654+
655+
# Write strategy-specific leaderboard.json
656+
leaderboard_data = {
657+
"generated_at": datetime.now().isoformat(),
658+
"version": version,
659+
"strategy": strategy,
660+
"total_entries": len(leaderboard_entries),
661+
"total_runs_analyzed": sum(m.total_runs for m in metrics_list),
662+
"entries": [
663+
{
664+
"rank": entry.rank,
665+
"model": entry.model,
666+
"performance_score": round(entry.performance_score, 2),
667+
"win_rate": round(entry.win_rate, 3),
668+
"avg_final_ante": round(entry.avg_final_ante, 2),
669+
"avg_duration_seconds": round(
670+
entry.avg_duration_seconds, 1
671+
),
672+
"total_runs": entry.total_runs,
673+
"completion_rate": round(entry.completion_rate, 3),
674+
"efficiency_rating": entry.efficiency_rating,
675+
}
676+
for entry in leaderboard_entries
677+
],
658678
}
659-
for entry in leaderboard_entries
660-
],
661-
}
662679

663-
leaderboard_file = output_dir / "leaderboard.json"
664-
with open(leaderboard_file, "w") as f:
665-
json.dump(leaderboard_data, f, indent=2)
666-
667-
# Write detailed files for each entry
668-
for metrics in sorted_metrics:
669-
entry_name = (
670-
f"{metrics.version}_{metrics.model}_{metrics.strategy}".replace(
671-
"/", "-"
672-
)
673-
)
674-
entry_file = output_dir / f"{entry_name}.json"
675-
676-
entry_data = {
677-
"version": metrics.version,
678-
"model": metrics.model,
679-
"strategy": metrics.strategy,
680-
"summary": {
681-
"performance_score": round(metrics.performance_score, 2),
682-
"efficiency_rating": self._get_efficiency_rating(metrics),
683-
"total_runs": metrics.total_runs,
684-
"completed_runs": metrics.completed_runs,
685-
"completion_rate": round(metrics.completion_rate, 3),
686-
},
687-
"performance_metrics": {
688-
"win_rate": round(metrics.win_rate, 3),
689-
"avg_final_ante": round(metrics.avg_final_ante, 2),
690-
"avg_final_money": round(metrics.avg_final_money, 1),
691-
"avg_peak_money": round(metrics.avg_peak_money, 1),
692-
"avg_duration_seconds": round(metrics.avg_duration_seconds, 1),
693-
},
694-
"llm_metrics": {
695-
"avg_success_rate": round(metrics.avg_success_rate, 3),
696-
"avg_response_time": round(metrics.avg_response_time, 3),
697-
"avg_total_tokens": round(metrics.avg_total_tokens, 1),
698-
"avg_tokens_per_request": round(metrics.avg_tokens_per_request, 1),
699-
"avg_parsing_error_rate": round(metrics.avg_parsing_error_rate, 3),
700-
"avg_timeout_error_rate": round(metrics.avg_timeout_error_rate, 3),
701-
},
702-
"consistency_metrics": {
703-
"std_final_ante": round(metrics.std_final_ante, 2),
704-
"std_final_money": round(metrics.std_final_money, 1),
705-
"std_duration": round(metrics.std_duration, 1),
706-
},
707-
"efficiency_metrics": {
708-
"tokens_per_ante": round(metrics.tokens_per_ante, 1),
709-
"seconds_per_ante": round(metrics.seconds_per_ante, 1),
710-
"money_efficiency": round(metrics.money_efficiency, 4),
711-
},
712-
"strategy_metrics": {
713-
"avg_shop_purchases": round(metrics.avg_shop_purchases, 1),
714-
"avg_jokers_acquired": round(metrics.avg_jokers_acquired, 1),
715-
"avg_consumables_used": round(metrics.avg_consumables_used, 1),
716-
"avg_blinds_skipped": round(metrics.avg_blinds_skipped, 1),
717-
},
718-
"individual_runs": [
719-
{
720-
"seed": run.seed,
721-
"deck": run.deck,
722-
"stake": run.stake,
723-
"completed": run.completed,
724-
"won": run.won,
725-
"final_ante": run.final_ante,
726-
"final_money": run.final_money,
727-
"peak_money": run.peak_money,
728-
"duration_seconds": run.duration_seconds,
729-
"total_tokens": run.total_tokens,
730-
"success_rate": round(run.success_rate, 3),
731-
"parsing_errors": run.parsing_errors,
732-
"timeout_errors": run.timeout_errors,
680+
leaderboard_file = strategy_dir / "leaderboard.json"
681+
with open(leaderboard_file, "w") as f:
682+
json.dump(leaderboard_data, f, indent=2)
683+
684+
# Write individual model files within the strategy directory
685+
for metrics in sorted_metrics:
686+
model_filename = metrics.model.replace("/", "-") + ".json"
687+
model_file = strategy_dir / model_filename
688+
689+
model_data = {
690+
"version": metrics.version,
691+
"model": metrics.model,
692+
"strategy": metrics.strategy,
693+
"name": metrics.runs[0].raw_config.get("name", "Unknown Name"),
694+
"description": metrics.runs[0].raw_config.get(
695+
"description", "Unknown Description"
696+
),
697+
"author": metrics.runs[0].raw_config.get(
698+
"author", "BalatroBench"
699+
),
700+
"tags": metrics.runs[0].raw_config.get("tags", []),
701+
"summary": {
702+
"performance_score": round(metrics.performance_score, 2),
703+
"efficiency_rating": self._get_efficiency_rating(metrics),
704+
"total_runs": metrics.total_runs,
705+
"completed_runs": metrics.completed_runs,
706+
"completion_rate": round(metrics.completion_rate, 3),
707+
},
708+
"performance_metrics": {
709+
"win_rate": round(metrics.win_rate, 3),
710+
"avg_final_ante": round(metrics.avg_final_ante, 2),
711+
"avg_final_money": round(metrics.avg_final_money, 1),
712+
"avg_peak_money": round(metrics.avg_peak_money, 1),
713+
"avg_duration_seconds": round(
714+
metrics.avg_duration_seconds, 1
715+
),
716+
},
717+
"llm_metrics": {
718+
"avg_success_rate": round(metrics.avg_success_rate, 3),
719+
"avg_response_time": round(metrics.avg_response_time, 3),
720+
"avg_total_tokens": round(metrics.avg_total_tokens, 1),
721+
"avg_tokens_per_request": round(
722+
metrics.avg_tokens_per_request, 1
723+
),
724+
"avg_parsing_error_rate": round(
725+
metrics.avg_parsing_error_rate, 3
726+
),
727+
"avg_timeout_error_rate": round(
728+
metrics.avg_timeout_error_rate, 3
729+
),
730+
},
731+
"consistency_metrics": {
732+
"std_final_ante": round(metrics.std_final_ante, 2),
733+
"std_final_money": round(metrics.std_final_money, 1),
734+
"std_duration": round(metrics.std_duration, 1),
735+
},
736+
"efficiency_metrics": {
737+
"tokens_per_ante": round(metrics.tokens_per_ante, 1),
738+
"seconds_per_ante": round(metrics.seconds_per_ante, 1),
739+
"money_efficiency": round(metrics.money_efficiency, 4),
740+
},
741+
"strategy_metrics": {
742+
"avg_shop_purchases": round(metrics.avg_shop_purchases, 1),
743+
"avg_jokers_acquired": round(
744+
metrics.avg_jokers_acquired, 1
745+
),
746+
"avg_consumables_used": round(
747+
metrics.avg_consumables_used, 1
748+
),
749+
"avg_blinds_skipped": round(metrics.avg_blinds_skipped, 1),
750+
},
751+
"individual_runs": [
752+
{
753+
"seed": run.seed,
754+
"deck": run.deck,
755+
"stake": run.stake,
756+
"completed": run.completed,
757+
"won": run.won,
758+
"final_ante": run.final_ante,
759+
"final_money": run.final_money,
760+
"peak_money": run.peak_money,
761+
"duration_seconds": run.duration_seconds,
762+
"total_tokens": run.total_tokens,
763+
"success_rate": round(run.success_rate, 3),
764+
"parsing_errors": run.parsing_errors,
765+
"timeout_errors": run.timeout_errors,
766+
}
767+
for run in metrics.runs
768+
],
733769
}
734-
for run in metrics.runs
735-
],
736-
}
737770

738-
with open(entry_file, "w") as f:
739-
json.dump(entry_data, f, indent=2)
771+
with open(model_file, "w") as f:
772+
json.dump(model_data, f, indent=2)
773+
774+
print(f"Generated strategy leaderboard: {leaderboard_file}")
775+
776+
# Generate global summary statistics
777+
total_entries = len(self.aggregated_metrics)
778+
total_runs = len(self.run_metrics)
779+
780+
print(
781+
f"Generated benchmark results with {total_entries} model/strategy combinations"
782+
)
783+
print(f"Total runs analyzed: {total_runs}")
784+
print(f"Results organized by version/strategy in: {output_dir}/")
740785

741-
print(f"Generated leaderboard with {len(leaderboard_entries)} entries")
742-
print(f"Results saved to {output_dir}/")
743-
print(f"Leaderboard: {leaderboard_file}")
786+
# Print top performers across all strategies for immediate feedback
787+
all_sorted_metrics = sorted(
788+
self.aggregated_metrics.values(),
789+
key=lambda x: x.performance_score,
790+
reverse=True,
791+
)
744792

745-
# Print top 5 for immediate feedback
746-
print("\nTop 5 Performers:")
747-
for i, entry in enumerate(leaderboard_entries[:5]):
793+
print("\nTop 5 Performers Overall:")
794+
for i, metrics in enumerate(all_sorted_metrics[:5]):
748795
print(
749-
f"{entry.rank}. {entry.model} ({entry.strategy}) - "
750-
f"Score: {entry.performance_score:.1f}, "
751-
f"Win Rate: {entry.win_rate:.1%}, "
752-
f"Avg Ante: {entry.avg_final_ante:.1f}"
796+
f"{i + 1}. {metrics.model} ({metrics.strategy}, v{metrics.version}) - "
797+
f"Score: {metrics.performance_score:.1f}, "
798+
f"Win Rate: {metrics.win_rate:.1%}, "
799+
f"Avg Ante: {metrics.avg_final_ante:.1f}"
753800
)
754801

755802

@@ -766,4 +813,3 @@ def run_benchmark_analysis(
766813
analyzer.generate_leaderboard(output_dir)
767814

768815
print("\nBenchmark analysis complete!")
769-

src/balatrollm/bot.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,12 @@ async def _initialize_game_run(
293293
"challenge": challenge,
294294
"started_at": datetime.now().isoformat(),
295295
"balatrollm_version": self.project_version,
296+
# Community metadata fields with defaults
297+
"name": "Unknown Name",
298+
"description": "Unknown Description",
299+
"author": "BalatroBench",
300+
"version": self.project_version,
301+
"tags": [],
296302
}
297303

298304
self.data_collector = RunDataCollector(run_dir=run_dir, run_config=run_config)

src/balatrollm/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ def from_config_file(cls, config_path: str) -> "Config":
3131
config_file = Path(config_path)
3232
if not config_file.exists():
3333
raise FileNotFoundError(f"Config file not found: {config_file}")
34-
34+
3535
with config_file.open() as f:
3636
config_data = json.load(f)
37-
37+
3838
# Map the config.json fields to Config fields
3939
# config.json uses 'base_url' and 'strategy' (updated field names)
4040
return cls(

0 commit comments

Comments
 (0)