Skip to content

Commit 1cc125a

Browse files
committed
add agg stage comments
1 parent 0362be7 commit 1cc125a

1 file changed

Lines changed: 129 additions & 35 deletions

File tree

server/python/src/routers/movies.py

Lines changed: 129 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ async def get_all_movies(
165165

166166
@router.post("/", response_model=SuccessResponse[Movie], status_code=201)
167167
async def create_movie(movie: CreateMovieRequest):
168-
# Pydantic will automatically validate the structure
168+
# Pydantic automatically validates the structure
169169
movie_data = movie.model_dump(by_alias=True, exclude_none=True)
170170

171171
movies_collection = get_collection("movies")
@@ -423,8 +423,29 @@ async def aggregate_movies_recent_commented(
423423
details="The provided movie_id is not a valid ObjectId"
424424
)
425425

426-
# Add lookup and additional pipeline stages
426+
# Add a multi-stage aggregation that:
427+
# 1. Filters movies by valid year range
428+
# 2. Joins with comments collection (like SQL JOIN)
429+
# 3. Filters to only movies that have comments
430+
# 4. Sorts comments by date and extracts most recent ones
431+
# 5. Sorts movies by their most recent comment date
432+
# 6. Shapes the final output with transformed comment structure
433+
434+
pipeline = [
435+
# STAGE 1: $match - Initial Filter
436+
# Filter movies to only those with valid year data
437+
# Tip: Use $match early to reduce the initial dataset for better performance
438+
{
439+
"$match": {
440+
"year": {"$type": "number", "$gte": 1800, "$lte": 2030}
441+
}
442+
}
443+
]
444+
445+
# Add remaining pipeline stages
427446
pipeline.extend([
447+
# STAGE 2: $lookup - Join with the 'comments' Collection
448+
# This gives each movie document a 'comments' array containing all its comments
428449
{
429450
"$lookup": {
430451
"from": "comments",
@@ -433,59 +454,74 @@ async def aggregate_movies_recent_commented(
433454
"as": "comments"
434455
}
435456
},
457+
# STAGE 3: $match - Filter Movies with at Least One Comment
458+
# This reduces the dataset to only movies with user engagement
436459
{
437460
"$match": {
438461
"comments": {"$ne": []}
439462
}
440463
},
464+
# STAGE 4: $addFields - Add New Computed Fields
441465
{
442466
"$addFields": {
467+
# Add computed field 'recentComments' that extracts only the N most recent comments (up to 'limit')
443468
"recentComments": {
444469
"$slice": [
445470
{
446471
"$sortArray": {
447472
"input": "$comments",
448-
"sortBy": {"date": -1}
473+
"sortBy": {"date": -1} # -1 = descending (newest first)
449474
}
450475
},
451-
limit
476+
limit # Number of comments to keep
452477
]
453478
},
479+
# Add computed field 'mostRecentCommentDate' that gets the date of the most recent comment (to use in the next $sort stage)
454480
"mostRecentCommentDate": {
455481
"$max": "$comments.date"
456482
}
457483
}
458484
},
485+
# STAGE 5: $sort - Sort Movies by Most Recent Comment Date
459486
{
460487
"$sort": {"mostRecentCommentDate": -1}
461488
},
489+
# STAGE 6: $limit - Restrict Result Set Size
490+
# - If querying single movie: return up to 50 results
491+
# - If querying all movies: return up to 20 results
492+
# Tip: This prevents overwhelming the client with too much data
462493
{
463-
"$limit": limit
494+
"$limit": 50 if movie_id else 20
464495
},
496+
# STAGE 7: $project - Shape Final Response Output
465497
{
466498
"$project": {
499+
# Include basic movie fields
467500
"title": 1,
468501
"year": 1,
469502
"genres": 1,
503+
"_id": 1,
504+
# Extract nested field: imdb.rating -> imdbRating
470505
"imdbRating": "$imdb.rating",
506+
# Use $map to reshape computed 'recentComments' field with cleaner field names
471507
"recentComments": {
472508
"$map": {
473509
"input": "$recentComments",
474510
"as": "comment",
475511
"in": {
476-
"userName": "$$comment.name",
477-
"userEmail": "$$comment.email",
478-
"text": "$$comment.text",
479-
"date": "$$comment.date"
512+
"userName": "$$comment.name", # Rename: name -> userName
513+
"userEmail": "$$comment.email", # Rename: email -> userEmail
514+
"text": "$$comment.text", # Keep: text
515+
"date": "$$comment.date" # Keep: date
480516
}
481517
}
482518
},
483-
"totalComments": {"$size": "$comments"},
484-
"_id": 1
519+
# Calculate the total number of comments into 'totalComments' (not just 'recentComments')
520+
# Used in display (e.g., "Showing 5 of 127 comments")
521+
"totalComments": {"$size": "$comments"}
485522
}
486523
}
487524
])
488-
489525
# Execute the aggregation
490526
try:
491527
results = await execute_aggregation(pipeline)
@@ -520,32 +556,48 @@ async def aggregate_movies_recent_commented(
520556

521557
@router.get("/api/movies/reportingByYear", response_model=SuccessResponse[List[dict]])
522558
async def aggregate_movies_by_year():
523-
# Define aggregation pipeline to group movies by year
559+
# Define aggregation pipeline to group movies by year with statistics
560+
# This pipeline demonstrates grouping, statistical calculations, and data cleaning
561+
562+
# Add a multi-stage aggregation that:
563+
# 1. Filters movies by valid year range (data quality filter)
564+
# 2. Groups movies by release year and calculates statistics per year
565+
# 3. Shapes the final output with clean field names and rounded averages
566+
# 4. Sorts results by year (newest first) for reverse-chronological presentation
567+
524568
pipeline = [
569+
# STAGE 1: $match - Data Quality Filter
525570
# Clean data: ensure year is a numeric value within a reasonable range
571+
# Tip: Filter early to reduce dataset size and improve performance
526572
{
527573
"$match": {
528574
"year": {"$type": "number", "$gte": 1800, "$lte": 2030}
529575
}
530576
},
531-
# Group by year and calculate statistics
577+
578+
# STAGE 2: $group - Aggregate Movies by Year
579+
# Group all movies by their release year and calculate various statistics
532580
{
533581
"$group": {
534-
"_id": "$year",
535-
"movieCount": {"$sum": 1},
582+
"_id": "$year", # Group by year field
583+
"movieCount": {"$sum": 1}, # Count total movies per year
584+
585+
# Calculate average rating (only for valid numeric ratings)
536586
"averageRating": {
537587
"$avg": {
538588
"$cond": [
539589
{"$and": [
540-
{"$ne": ["$imdb.rating", None]},
541-
{"$ne": ["$imdb.rating", ""]},
542-
{"$eq": [{"$type": "$imdb.rating"}, "double"]}
590+
{"$ne": ["$imdb.rating", None]}, # Not null
591+
{"$ne": ["$imdb.rating", ""]}, # Not empty string
592+
{"$eq": [{"$type": "$imdb.rating"}, "double"]} # Is numeric
543593
]},
544-
"$imdb.rating",
545-
"$$REMOVE"
594+
"$imdb.rating", # Include valid IMDB ratings
595+
"$$REMOVE" # Exclude invalid IMDB ratings
546596
]
547597
}
548598
},
599+
600+
# Find highest rating for the year (same validation as average rating)
549601
"highestRating": {
550602
"$max": {
551603
"$cond": [
@@ -559,6 +611,8 @@ async def aggregate_movies_by_year():
559611
]
560612
}
561613
},
614+
615+
# Find lowest rating for the year (same validation as average and highest rating)
562616
"lowestRating": {
563617
"$min": {
564618
"$cond": [
@@ -572,21 +626,29 @@ async def aggregate_movies_by_year():
572626
]
573627
}
574628
},
629+
630+
# Sum total votes across all movies in the year
575631
"totalVotes": {"$sum": "$imdb.votes"}
576632
}
577633
},
634+
635+
# STAGE 3: $project - Shape Final Output
636+
# Transform the grouped data into a clean, readable format
578637
{
579638
"$project": {
580-
"year": "$_id",
639+
"year": "$_id", # Rename _id back to year because grouping was done by year but values were stored in _id
581640
"movieCount": 1,
582-
"averageRating": {"$round": ["$averageRating", 2]},
641+
"averageRating": {"$round": ["$averageRating", 2]}, # Round to 2 decimal places
583642
"highestRating": 1,
584643
"lowestRating": 1,
585644
"totalVotes": 1,
586-
"_id": 0
645+
"_id": 0 # Exclude the _id field from output
587646
}
588647
},
589-
{"$sort": {"year": -1}}
648+
649+
# STAGE 4: $sort - Sort by Year (Newest First)
650+
# Sort results in descending order to show most recent years first
651+
{"$sort": {"year": -1}} # -1 = descending order
590652
]
591653

592654
# Execute the aggregation
@@ -619,37 +681,69 @@ async def aggregate_movies_by_year():
619681
async def aggregate_directors_most_movies(
620682
limit: int = Query(default=20, ge=1, le=100)
621683
):
622-
# Define aggregation pipeline to find directors with most movies
684+
# Define aggregation pipeline to find directors with the most movies
685+
# This pipeline demonstrates array unwinding, filtering, and ranking
686+
687+
# Add a multi-stage aggregation that:
688+
# 1. Filters movies with valid directors and year data (data quality filter)
689+
# 2. Unwinds directors array to create separate documents per director
690+
# 3. Cleans director names by filtering out null/empty names
691+
# 4. Groups movies by individual director and calculates statistics per director
692+
# 5. Sorts directors based on movie count
693+
# 6. Limits results to top N directors
694+
# 7. Shapes the final output with clean field names and rounded averages
695+
623696
pipeline = [
697+
# STAGE 1: $match - Initial Data Quality Filter
698+
# Filter movies that have director information and valid years
624699
{
625700
"$match": {
626-
"directors": {"$exists": True, "$ne": None, "$ne": []},
627-
"year": {"$type": "number", "$gte": 1800, "$lte": 2030}
701+
"directors": {"$exists": True, "$ne": None, "$ne": []}, # Has directors array
702+
"year": {"$type": "number", "$gte": 1800, "$lte": 2030} # Valid year range
628703
}
629704
},
705+
706+
# STAGE 2: $unwind - Flatten Directors Array
707+
# Convert each movie's directors array into separate documents
708+
# Example: Movie with ["Director A", "Director B"] becomes 2 documents
630709
{
631710
"$unwind": "$directors"
632711
},
712+
713+
# STAGE 3: $match - Clean Director Names
714+
# Filter out any null or empty director names after unwinding
633715
{
634716
"$match": {
635717
"directors": {"$ne": None, "$ne": ""}
636718
}
637719
},
720+
721+
# STAGE 4: $group - Aggregate by Director
722+
# Group all movies by director name and calculate statistics
638723
{
639724
"$group": {
640-
"_id": "$directors",
641-
"movieCount": {"$sum": 1},
642-
"averageRating": {"$avg": "$imdb.rating"}
725+
"_id": "$directors", # Group by individual director name
726+
"movieCount": {"$sum": 1}, # Count movies per director
727+
"averageRating": {"$avg": "$imdb.rating"} # Average rating of director's movies
643728
}
644729
},
645-
{"$sort": {"movieCount": -1}},
730+
731+
# STAGE 5: $sort - Rank Directors by Movie Count
732+
# Sort directors by number of movies (highest first)
733+
{"$sort": {"movieCount": -1}}, # -1 = descending (most movies first)
734+
735+
# STAGE 6: $limit - Restrict Results
736+
# Limit to top N directors based on user input
646737
{"$limit": limit},
738+
739+
# STAGE 7: $project - Shape Final Output
740+
# Transform the grouped data into a clean, readable format
647741
{
648742
"$project": {
649-
"director": "$_id",
743+
"director": "$_id", # Rename _id to director
650744
"movieCount": 1,
651-
"averageRating": {"$round": ["$averageRating", 2]},
652-
"_id": 0
745+
"averageRating": {"$round": ["$averageRating", 2]}, # Round to 2 decimal places
746+
"_id": 0 # Exclude the _id field from output
653747
}
654748
}
655749
]

0 commit comments

Comments
 (0)