#!/usr/bin/env python3
"""
Create Enhanced Crunchyroll Awards Data with TMDb IDs

This script creates a comprehensive awards JSON file with TMDb IDs
for integration with Aphrodite's workflow.
"""

import json
import requests
import time
import re
from typing import Dict, List, Optional
from datetime import datetime

class AwardsDataCreator:
    """Build the Crunchyroll Anime Awards dataset and enrich it with TMDb IDs.

    The award winners themselves are a static table embedded in
    ``get_comprehensive_awards_data``.  Each distinct winner is then looked
    up on TMDb (TV search, and movie search where appropriate) so that the
    resulting JSON can be matched against media by TMDb ID.
    """

    # Seconds to pause after *every* TMDb request (client-side rate limiting).
    RATE_LIMIT_DELAY = 0.25
    # Seconds before a TMDb request is abandoned, so a stalled connection
    # cannot hang the whole run.
    REQUEST_TIMEOUT = 10

    # Punctuation that is dropped / replaced when generating search variants.
    _PUNCTUATION = re.compile(r'[!?:.]')

    # Hand-maintained alternative titles for winners that TMDb may list
    # under a different name.
    _TITLE_ALIASES = {
        "Demon Slayer: Kimetsu no Yaiba": [
            "Demon Slayer",
            "Kimetsu no Yaiba"
        ],
        "Attack on Titan": [
            "Shingeki no Kyojin"
        ],
        "My Hero Academia": [
            "Boku no Hero Academia"
        ],
        "Yuri!!! on Ice": [
            "Yuri on Ice"
        ],
        "DAN DA DAN": [
            "Dandadan"
        ],
        "Makeine: Too Many Losing Heroines!": [
            "Makeine",
            "Too Many Losing Heroines"
        ]
    }

    # Season/arc/theme suffixes stripped from winner text.  Order matters:
    # the specific patterns must run before the generic ones that would
    # otherwise eat part of their text ("Arc$" before "Hashira Training Arc",
    # "Final Season.*" before "The Final.*").
    _NOISE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\bHashira Training Arc',
            r'\bEntertainment District Arc',
            r'\bThe Final.*',
            r'\bFinal Season.*',
            r'\bSeason \d+',
            r'\bPart \d+',
            r'\bArc$',
            r'\bOpening Theme.*',
            r'\bEnding Theme.*',
        )
    )

    # (substring, canonical title) pairs; the first substring found wins.
    _CANONICAL_NAMES = (
        ('Demon Slayer', 'Demon Slayer: Kimetsu no Yaiba'),
        ('My Hero Academia', 'My Hero Academia'),
        ('Jujutsu Kaisen', 'Jujutsu Kaisen'),
        ('Attack on Titan', 'Attack on Titan'),
    )

    def __init__(self, tmdb_api_key: str):
        """Store the TMDb credentials and open a reusable HTTP session.

        Args:
            tmdb_api_key: TMDb v3 API key sent as the ``api_key`` query
                parameter on every search request.
        """
        self.tmdb_api_key = tmdb_api_key
        self.tmdb_base_url = "https://api.themoviedb.org/3"
        self.session = requests.Session()

    def get_comprehensive_awards_data(self) -> Dict:
        """Return the static table of known award winners, keyed by year.

        Returns:
            A dict mapping the ceremony year (as a string) to a dict with
            ``year``, ``ceremony_number``, a ``categories`` mapping of
            category name to winner title, and optional ceremony details.
        """
        return {
            "2025": {
                "year": 2025,
                "ceremony_date": "2025-05-25",
                "location": "Grand Prince Hotel Shin Takanawa, Tokyo, Japan",
                "ceremony_number": 9,
                "vote_count": "51 million",
                "categories": {
                    "Anime of the Year": "Solo Leveling",
                    "Film of the Year": "Look Back",
                    "Best New Series": "Solo Leveling",
                    "Best Action": "Solo Leveling",
                    "Best Continuing Series": "Demon Slayer: Kimetsu no Yaiba",
                    "Best Animation": "Demon Slayer: Kimetsu no Yaiba",
                    "Best Background Art": "Frieren: Beyond Journey's End",
                    "Best Slice of Life": "Makeine: Too Many Losing Heroines!",
                    "Best Opening Sequence": "DAN DA DAN",
                    "Best Anime Song": "DAN DA DAN",
                    "Best Score": "Solo Leveling",
                    "Best Ending Sequence": "Solo Leveling",
                    "Global Impact Award": "Attack on Titan"
                }
            },
            "2024": {
                "year": 2024,
                "ceremony_number": 8,
                "categories": {
                    "Anime of the Year": "Jujutsu Kaisen",
                    "Best Animation": "Demon Slayer: Kimetsu no Yaiba",
                    "Best Action": "Jujutsu Kaisen"
                }
            },
            "2023": {
                "year": 2023,
                "ceremony_number": 7,
                "location": "Grand Prince Hotel New Takanawa, Tokyo, Japan",
                "categories": {
                    "Anime of the Year": "Cyberpunk: Edgerunners"
                }
            },
            "2022": {
                "year": 2022,
                "ceremony_number": 6,
                "categories": {
                    "Anime of the Year": "Demon Slayer: Kimetsu no Yaiba"
                }
            },
            "2021": {
                "year": 2021,
                "ceremony_number": 5,
                "categories": {
                    "Anime of the Year": "Jujutsu Kaisen"
                }
            },
            "2020": {
                "year": 2020,
                "ceremony_number": 4,
                "categories": {
                    "Anime of the Year": "Demon Slayer: Kimetsu no Yaiba"
                }
            },
            "2019": {
                "year": 2019,
                "ceremony_number": 3,
                "categories": {
                    "Anime of the Year": "Demon Slayer: Kimetsu no Yaiba"
                }
            },
            "2018": {
                "year": 2018,
                "ceremony_number": 2,
                "categories": {
                    "Anime of the Year": "My Hero Academia",
                    "Best Hero": "My Hero Academia",
                    "Best Villain": "My Hero Academia",
                    "Best Boy": "My Hero Academia",
                    "Best Girl": "My Hero Academia",
                    "Best Opening": "My Hero Academia",
                    "Best Animation": "My Hero Academia",
                    "Best Action": "My Hero Academia"
                }
            },
            "2017": {
                "year": 2017,
                "ceremony_number": 1,
                "location": "California, United States",
                "categories": {
                    "Anime of the Year": "Yuri!!! on Ice",
                    "Best Boy": "Yuri!!! on Ice",
                    "Best Opening": "Yuri!!! on Ice",
                    "Best Ending": "Yuri!!! on Ice",
                    "Best Animation": "Yuri!!! on Ice",
                    "Best Couple": "Yuri!!! on Ice",
                    "Most Heartwarming Scene": "Yuri!!! on Ice"
                }
            }
        }

    def _search_tmdb(self, endpoint: str, query: str, name_key: str,
                     date_key: str, error_label: str) -> Optional[Dict]:
        """Run one TMDb search and summarise the first hit.

        Args:
            endpoint: Search endpoint suffix, ``"tv"`` or ``"movie"``.
            query: Title text to search for.
            name_key: Key in a TMDb result holding the display title
                (``"name"`` for TV, ``"title"`` for movies).
            date_key: Key holding the release date; it is also used as the
                key in the returned dict.
            error_label: Text inserted into the error message on failure.

        Returns:
            A dict with ``tmdb_id``, ``tmdb_name``, the date key and
            ``overview`` for the first result, or ``None`` when the request
            fails, is rejected, or has no results.
        """
        url = f"{self.tmdb_base_url}/search/{endpoint}"
        params = {
            "api_key": self.tmdb_api_key,
            "query": query,
            "language": "en-US"
        }

        try:
            response = self.session.get(
                url, params=params, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                # Surface auth/quota problems instead of reporting "no match".
                print(f"  TMDb returned HTTP {response.status_code} for '{query}'")
                return None

            results = response.json().get("results")
            if not results:
                return None

            top_hit = results[0]
            return {
                "tmdb_id": top_hit["id"],
                "tmdb_name": top_hit[name_key],
                date_key: top_hit.get(date_key),
                "overview": top_hit.get("overview", "")
            }
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f"Error searching {error_label} for '{query}': {e}")
            return None
        finally:
            # Throttle after every request, including successful ones.
            time.sleep(self.RATE_LIMIT_DELAY)

    def search_tmdb_tv(self, anime_name: str) -> Optional[Dict]:
        """Search TMDb TV shows for ``anime_name``.

        Returns:
            A dict with ``tmdb_id``, ``tmdb_name``, ``first_air_date`` and
            ``overview`` for the first result, or ``None`` if nothing was
            found or the request failed.
        """
        return self._search_tmdb(
            "tv", anime_name, "name", "first_air_date", "TMDb"
        )

    def search_tmdb_movie(self, anime_name: str) -> Optional[Dict]:
        """Search TMDb movies for ``anime_name``.

        Returns:
            A dict with ``tmdb_id``, ``tmdb_name``, ``release_date`` and
            ``overview`` for the first result, or ``None`` if nothing was
            found or the request failed.
        """
        return self._search_tmdb(
            "movie", anime_name, "title", "release_date", "TMDb movies"
        )

    def get_title_variants(self, title: str) -> List[str]:
        """Generate search variants for better TMDb matching.

        The variants are returned in a stable order: the exact title first,
        then the title with ``! ? : .`` removed, then with that punctuation
        replaced by spaces (whitespace collapsed), then any known aliases.
        Duplicates and empty strings are dropped.

        Args:
            title: Cleaned anime title.

        Returns:
            Ordered list of distinct, non-empty search strings.
        """
        without_punctuation = self._PUNCTUATION.sub('', title)
        spaced_punctuation = self._PUNCTUATION.sub(' ', title)

        variants = [title]
        # Collapse whitespace so "Yuri!!! on Ice" does not yield "Yuri    on Ice".
        variants.extend(
            " ".join(candidate.split())
            for candidate in (without_punctuation, spaced_punctuation)
        )
        variants.extend(self._TITLE_ALIASES.get(title, ()))

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # search order (and the stored JSON) is the same on every run.
        return [variant for variant in dict.fromkeys(variants) if variant]

    def find_tmdb_data(self, anime_name: str, is_movie: bool = False) -> Optional[Dict]:
        """Find TMDb data for an anime, trying each title variant in turn.

        Args:
            anime_name: Cleaned anime title.
            is_movie: Search the movie endpoint instead of the TV endpoint.

        Returns:
            The first non-empty search result, or ``None`` if no variant
            matched.
        """
        print(f"Searching TMDb for: {anime_name}")

        search = self.search_tmdb_movie if is_movie else self.search_tmdb_tv

        for variant in self.get_title_variants(anime_name):
            print(f"  Trying variant: {variant}")
            result = search(variant)
            if result:
                print(f"  ✅ Found: {result['tmdb_name']} (ID: {result['tmdb_id']})")
                return result

        print(f"  ❌ No TMDb match found for {anime_name}")
        return None

    def _collect_winners(self, awards_by_year: Dict) -> Dict:
        """Group every award by its cleaned winner title."""
        anime_winners: Dict = {}

        for year_data in awards_by_year.values():
            for category, winner in year_data.get("categories", {}).items():
                if not winner or not isinstance(winner, str):
                    continue

                anime_name = self.clean_anime_name(winner)
                if not anime_name:
                    continue

                entry = anime_winners.get(anime_name)
                if entry is None:
                    entry = anime_winners[anime_name] = {
                        "name": anime_name,
                        "awards": [],
                        "identifiers": {
                            "tmdb_tv_id": None,
                            "tmdb_movie_id": None,
                            "tmdb_tv_data": None,
                            "tmdb_movie_data": None
                        },
                        "search_variants": self.get_title_variants(anime_name)
                    }

                entry["awards"].append({
                    "year": year_data["year"],
                    "category": category,
                    "raw_winner_text": winner
                })

        return anime_winners

    def _attach_tmdb_ids(self, anime_winners: Dict) -> None:
        """Look up each winner on TMDb and store the IDs in place."""
        for anime_name, anime_data in anime_winners.items():
            identifiers = anime_data["identifiers"]

            # A "Film of the Year" win marks the title as a movie.
            is_movie = any(
                award["category"] == "Film of the Year"
                for award in anime_data["awards"]
            )

            tv_data = self.find_tmdb_data(anime_name, is_movie=False)
            if tv_data:
                identifiers["tmdb_tv_id"] = tv_data["tmdb_id"]
                identifiers["tmdb_tv_data"] = tv_data

            # Search movies if it's a film or if no TV result was found.
            if is_movie or not tv_data:
                movie_data = self.find_tmdb_data(anime_name, is_movie=True)
                if movie_data:
                    identifiers["tmdb_movie_id"] = movie_data["tmdb_id"]
                    identifiers["tmdb_movie_data"] = movie_data

    @staticmethod
    def _build_statistics(anime_winners: Dict) -> Dict:
        """Count how many winners have TV, movie, or no TMDb IDs."""
        id_sets = [entry["identifiers"] for entry in anime_winners.values()]
        return {
            "total_unique_anime": len(anime_winners),
            "anime_with_tmdb_tv": sum(1 for ids in id_sets if ids["tmdb_tv_id"]),
            "anime_with_tmdb_movie": sum(1 for ids in id_sets if ids["tmdb_movie_id"]),
            "anime_without_tmdb": sum(
                1 for ids in id_sets
                if not ids["tmdb_tv_id"] and not ids["tmdb_movie_id"]
            )
        }

    def create_enhanced_awards_data(self) -> Dict:
        """Create comprehensive awards data with TMDb IDs.

        Returns:
            A dict with four sections: ``metadata`` (provenance and the
            ceremony count / year range derived from the awards table),
            ``awards_by_year`` (the raw table), ``anime_winners`` (per-title
            awards and TMDb identifiers) and ``statistics`` (match counts).
        """
        print("Creating enhanced Crunchyroll Awards data...")

        awards_by_year = self.get_comprehensive_awards_data()
        anime_winners = self._collect_winners(awards_by_year)

        print(f"\nSearching TMDb for {len(anime_winners)} anime...")
        self._attach_tmdb_ids(anime_winners)

        # Derive the coverage figures from the table so they cannot drift
        # out of date when a ceremony is added.
        years = [
            year_data["year"]
            for year_data in awards_by_year.values()
            if "year" in year_data
        ]
        years_covered = f"{min(years)}-{max(years)}" if years else ""

        return {
            "metadata": {
                "source": "Crunchyroll Anime Awards with TMDb integration",
                "created_date": datetime.now().isoformat(),
                "note": "Enhanced data with TMDb IDs for Aphrodite integration",
                "total_ceremonies": len(awards_by_year),
                "years_covered": years_covered,
                "tmdb_api_used": True
            },
            "awards_by_year": awards_by_year,
            "anime_winners": anime_winners,
            "statistics": self._build_statistics(anime_winners)
        }

    def clean_anime_name(self, winner_text: str) -> str:
        """Clean and extract the anime name from raw winner text.

        Steps: take the text after the last ``" - "`` (assumed to separate
        an artist from the anime for song awards), drop parenthetical
        notes, strip season/part/arc/theme suffixes, then map known
        franchises to a single canonical title.

        Args:
            winner_text: Raw winner string from the awards table.

        Returns:
            The cleaned title, or ``""`` when the input is empty.
        """
        if not winner_text:
            return ""

        anime_name = winner_text.strip()

        # Remove artist/performer names (for songs): keep the last segment.
        if " - " in anime_name:
            anime_name = anime_name.rpartition(" - ")[2].strip()

        # Remove parenthetical information.
        anime_name = re.sub(r'\([^)]*\)', '', anime_name).strip()

        for pattern in self._NOISE_PATTERNS:
            trimmed = pattern.sub('', anime_name)
            if trimmed != anime_name:
                # Drop a separator left dangling by the removal,
                # e.g. "Title: Season 2" -> "Title".
                anime_name = trimmed.strip().rstrip(':,').strip()

        # Removals can leave doubled spaces in the middle of the title.
        anime_name = " ".join(anime_name.split())

        # Handle special cases: collapse franchise entries to one title.
        for needle, canonical in self._CANONICAL_NAMES:
            if needle in anime_name:
                return canonical

        return anime_name


def main():
    """Create the enhanced awards data, write it to disk and print a summary.

    Configuration can be overridden through the environment:

    * ``TMDB_API_KEY`` - TMDb v3 API key to use for the lookups.
    * ``APHRODITE_AWARDS_OUTPUT`` - path of the JSON file to write.

    When a variable is unset or empty, the value that was previously
    hard-coded here is used, so existing invocations keep working.  Any
    failure is reported on stdout rather than raised.
    """
    import os

    # NOTE(review): this fallback is a credential committed to source
    # control. Prefer supplying TMDB_API_KEY via the environment; the
    # literal should be rotated and removed once callers have migrated.
    tmdb_api_key = (
        os.environ.get("TMDB_API_KEY")
        or "0b2dc1bbbed569c9f97b2c54c7d167d2"
    )
    # The default is a machine-specific absolute path; override it on any
    # other machine.
    output_file = (
        os.environ.get("APHRODITE_AWARDS_OUTPUT")
        or "E:/programming/aphrodite/crunchyroll_anime_awards_enhanced.json"
    )

    creator = AwardsDataCreator(tmdb_api_key)

    try:
        # Create enhanced data
        enhanced_data = creator.create_enhanced_awards_data()

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(enhanced_data, f, indent=2, ensure_ascii=False)

        # Print summary
        stats = enhanced_data["statistics"]
        banner = '=' * 50
        print(f"\n{banner}")
        print("ENHANCED CRUNCHYROLL AWARDS DATA CREATED")
        print(banner)
        print(f"Total unique anime: {stats['total_unique_anime']}")
        print(f"With TMDb TV ID: {stats['anime_with_tmdb_tv']}")
        print(f"With TMDb Movie ID: {stats['anime_with_tmdb_movie']}")
        print(f"Without TMDb ID: {stats['anime_without_tmdb']}")
        print(f"\nSaved to: {output_file}")

        # Show anime without TMDb IDs
        if stats['anime_without_tmdb'] > 0:
            print("\nAnime without TMDb matches:")
            for name, data in enhanced_data["anime_winners"].items():
                identifiers = data["identifiers"]
                if not identifiers["tmdb_tv_id"] and not identifiers["tmdb_movie_id"]:
                    print(f"  - {name}")

        print("\n✅ Enhanced awards data ready for Aphrodite integration!")

    except Exception as e:
        # Top-level boundary of the script: report the failure and exit
        # normally, as the script has always done.
        print(f"❌ Error creating enhanced data: {e}")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()