@@ -165,7 +165,7 @@ async def get_all_movies(
165165
166166@router .post ("/" , response_model = SuccessResponse [Movie ], status_code = 201 )
167167async def create_movie (movie : CreateMovieRequest ):
168- # Pydantic will automatically validate the structure
168+ # Pydantic automatically validates the structure
169169 movie_data = movie .model_dump (by_alias = True , exclude_none = True )
170170
171171 movies_collection = get_collection ("movies" )
@@ -423,8 +423,29 @@ async def aggregate_movies_recent_commented(
423423 details = "The provided movie_id is not a valid ObjectId"
424424 )
425425
426- # Add lookup and additional pipeline stages
426+ # Add a multi-stage aggregation that:
427+ # 1. Filters movies by valid year range
428+ # 2. Joins with comments collection (like SQL JOIN)
429+ # 3. Filters to only movies that have comments
430+ # 4. Sorts comments by date and extracts most recent ones
431+ # 5. Sorts movies by their most recent comment date
432+ # 6. Shapes the final output with transformed comment structure
433+
434+ pipeline = [
435+ # STAGE 1: $match - Initial Filter
436+ # Filter movies to only those with valid year data
437+ # Tip: Use $match early to reduce the initial dataset for better performance
438+ {
439+ "$match" : {
440+ "year" : {"$type" : "number" , "$gte" : 1800 , "$lte" : 2030 }
441+ }
442+ }
443+ ]
444+
445+ # Add remaining pipeline stages
427446 pipeline .extend ([
447+ # STAGE 2: $lookup - Join with the 'comments' Collection
448+ # This gives each movie document a 'comments' array containing all its comments
428449 {
429450 "$lookup" : {
430451 "from" : "comments" ,
@@ -433,59 +454,74 @@ async def aggregate_movies_recent_commented(
433454 "as" : "comments"
434455 }
435456 },
457+ # STAGE 3: $match - Filter Movies with at Least One Comment
458+ # This reduces the dataset to only movies with user engagement
436459 {
437460 "$match" : {
438461 "comments" : {"$ne" : []}
439462 }
440463 },
464+ # STAGE 4: $addFields - Add New Computed Fields
441465 {
442466 "$addFields" : {
467+ # Add computed field 'recentComments' that extracts only the N most recent comments (up to 'limit')
443468 "recentComments" : {
444469 "$slice" : [
445470 {
446471 "$sortArray" : {
447472 "input" : "$comments" ,
448- "sortBy" : {"date" : - 1 }
473+ "sortBy" : {"date" : - 1 } # -1 = descending (newest first)
449474 }
450475 },
451- limit
476+ limit # Number of comments to keep
452477 ]
453478 },
479+ # Add computed field 'mostRecentCommentDate' that gets the date of the most recent comment (to use in the next $sort stage)
454480 "mostRecentCommentDate" : {
455481 "$max" : "$comments.date"
456482 }
457483 }
458484 },
485+ # STAGE 5: $sort - Sort Movies by Most Recent Comment Date
459486 {
460487 "$sort" : {"mostRecentCommentDate" : - 1 }
461488 },
489+ # STAGE 6: $limit - Restrict Result Set Size
490+ # - If querying single movie: return up to 50 results
491+ # - If querying all movies: return up to 20 results
492+ # Tip: This prevents overwhelming the client with too much data
462493 {
463- "$limit" : limit
494+ "$limit" : 50 if movie_id else 20
464495 },
496+ # STAGE 7: $project - Shape Final Response Output
465497 {
466498 "$project" : {
499+ # Include basic movie fields
467500 "title" : 1 ,
468501 "year" : 1 ,
469502 "genres" : 1 ,
503+ "_id" : 1 ,
504+ # Extract nested field: imdb.rating -> imdbRating
470505 "imdbRating" : "$imdb.rating" ,
506+ # Use $map to reshape computed 'recentComments' field with cleaner field names
471507 "recentComments" : {
472508 "$map" : {
473509 "input" : "$recentComments" ,
474510 "as" : "comment" ,
475511 "in" : {
476- "userName" : "$$comment.name" ,
477- "userEmail" : "$$comment.email" ,
478- "text" : "$$comment.text" ,
479- "date" : "$$comment.date"
512+ "userName" : "$$comment.name" , # Rename: name -> userName
513+ "userEmail" : "$$comment.email" , # Rename: email -> userEmail
514+ "text" : "$$comment.text" , # Keep: text
515+ "date" : "$$comment.date" # Keep: date
480516 }
481517 }
482518 },
483- "totalComments" : {"$size" : "$comments" },
484- "_id" : 1
519+ # Calculate the total number of comments into 'totalComments' (not just 'recentComments')
520+ # Used in display (e.g., "Showing 5 of 127 comments")
521+ "totalComments" : {"$size" : "$comments" }
485522 }
486523 }
487524 ])
488-
489525 # Execute the aggregation
490526 try :
491527 results = await execute_aggregation (pipeline )
@@ -520,32 +556,48 @@ async def aggregate_movies_recent_commented(
520556
521557@router .get ("/api/movies/reportingByYear" , response_model = SuccessResponse [List [dict ]])
522558async def aggregate_movies_by_year ():
523- # Define aggregation pipeline to group movies by year
559+ # Define aggregation pipeline to group movies by year with statistics
560+ # This pipeline demonstrates grouping, statistical calculations, and data cleaning
561+
562+ # Add a multi-stage aggregation that:
563+ # 1. Filters movies by valid year range (data quality filter)
564+ # 2. Groups movies by release year and calculates statistics per year
565+ # 3. Shapes the final output with clean field names and rounded averages
566+ # 4. Sorts results by year (newest first) for chronological presentation
567+
524568 pipeline = [
569+ # STAGE 1: $match - Data Quality Filter
525570 # Clean data: ensure year is numeric and within a reasonable range
571+ # Tip: Filter early to reduce dataset size and improve performance
526572 {
527573 "$match" : {
528574 "year" : {"$type" : "number" , "$gte" : 1800 , "$lte" : 2030 }
529575 }
530576 },
531- # Group by year and calculate statistics
577+
578+ # STAGE 2: $group - Aggregate Movies by Year
579+ # Group all movies by their release year and calculate various statistics
532580 {
533581 "$group" : {
534- "_id" : "$year" ,
535- "movieCount" : {"$sum" : 1 },
582+ "_id" : "$year" , # Group by year field
583+ "movieCount" : {"$sum" : 1 }, # Count total movies per year
584+
585+ # Calculate average rating (only for ratings stored as BSON doubles; other types are skipped)
536586 "averageRating" : {
537587 "$avg" : {
538588 "$cond" : [
539589 {"$and" : [
540- {"$ne" : ["$imdb.rating" , None ]},
541- {"$ne" : ["$imdb.rating" , "" ]},
542- {"$eq" : [{"$type" : "$imdb.rating" }, "double" ]}
590+ {"$ne" : ["$imdb.rating" , None ]}, # Not null
591+ {"$ne" : ["$imdb.rating" , "" ]}, # Not empty string
592+ {"$eq" : [{"$type" : "$imdb.rating" }, "double" ]} # Is numeric
543593 ]},
544- "$imdb.rating" ,
545- "$$REMOVE"
594+ "$imdb.rating" , # Include valid IMDB ratings
595+ "$$REMOVE" # Exclude invalid IMDB ratings
546596 ]
547597 }
548598 },
599+
600+ # Find highest rating for the year (same validation as average rating)
549601 "highestRating" : {
550602 "$max" : {
551603 "$cond" : [
@@ -559,6 +611,8 @@ async def aggregate_movies_by_year():
559611 ]
560612 }
561613 },
614+
615+ # Find lowest rating for the year (same validation as average and highest rating)
562616 "lowestRating" : {
563617 "$min" : {
564618 "$cond" : [
@@ -572,21 +626,29 @@ async def aggregate_movies_by_year():
572626 ]
573627 }
574628 },
629+
630+ # Sum total votes across all movies in the year
575631 "totalVotes" : {"$sum" : "$imdb.votes" }
576632 }
577633 },
634+
635+ # STAGE 3: $project - Shape Final Output
636+ # Transform the grouped data into a clean, readable format
578637 {
579638 "$project" : {
580- "year" : "$_id" ,
639+ "year" : "$_id" , # Rename _id back to year because grouping was done by year but values were stored in _id
581640 "movieCount" : 1 ,
582- "averageRating" : {"$round" : ["$averageRating" , 2 ]},
641+ "averageRating" : {"$round" : ["$averageRating" , 2 ]}, # Round to 2 decimal places
583642 "highestRating" : 1 ,
584643 "lowestRating" : 1 ,
585644 "totalVotes" : 1 ,
586- "_id" : 0
645+ "_id" : 0 # Exclude the _id field from output
587646 }
588647 },
589- {"$sort" : {"year" : - 1 }}
648+
649+ # STAGE 4: $sort - Sort by Year (Newest First)
650+ # Sort results in descending order to show most recent years first
651+ {"$sort" : {"year" : - 1 }} # -1 = descending order
590652 ]
591653
592654 # Execute the aggregation
@@ -619,37 +681,69 @@ async def aggregate_movies_by_year():
619681async def aggregate_directors_most_movies (
620682 limit : int = Query (default = 20 , ge = 1 , le = 100 )
621683):
622- # Define aggregation pipeline to find directors with most movies
684+ # Define aggregation pipeline to find directors with the most movies
685+ # This pipeline demonstrates array unwinding, filtering, and ranking
686+
687+ # Add a multi-stage aggregation that:
688+ # 1. Filters movies with valid directors and year data (data quality filter)
689+ # 2. Unwinds directors array to create separate documents per director
690+ # 3. Cleans director names by filtering out null/empty names
691+ # 4. Groups movies by individual director and calculates statistics per director
692+ # 5. Sorts directors based on movie count
693+ # 6. Limits results to top N directors
694+ # 7. Shapes the final output with clean field names and rounded averages
695+
623696 pipeline = [
697+ # STAGE 1: $match - Initial Data Quality Filter
698+ # Filter movies that have director information and valid years
624699 {
625700 "$match" : {
626- "directors" : {"$exists" : True , "$ne" : None , "$ne" : []},
627- "year" : {"$type" : "number" , "$gte" : 1800 , "$lte" : 2030 }
701+ "directors" : {"$exists" : True , "$ne" : None , "$ne" : []}, # Has directors array
702+ "year" : {"$type" : "number" , "$gte" : 1800 , "$lte" : 2030 } # Valid year range
628703 }
629704 },
705+
706+ # STAGE 2: $unwind - Flatten Directors Array
707+ # Convert each movie's directors array into separate documents
708+ # Example: Movie with ["Director A", "Director B"] becomes 2 documents
630709 {
631710 "$unwind" : "$directors"
632711 },
712+
713+ # STAGE 3: $match - Clean Director Names
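714+ # Intended to filter out null or empty director names after unwinding. NOTE(review): the "$ne" key is repeated in the dict below, so Python keeps only the last value ("") and null names are not excluded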
633715 {
634716 "$match" : {
635717 "directors" : {"$ne" : None , "$ne" : "" }
636718 }
637719 },
720+
721+ # STAGE 4: $group - Aggregate by Director
722+ # Group all movies by director name and calculate statistics
638723 {
639724 "$group" : {
640- "_id" : "$directors" ,
641- "movieCount" : {"$sum" : 1 },
642- "averageRating" : {"$avg" : "$imdb.rating" }
725+ "_id" : "$directors" , # Group by individual director name
726+ "movieCount" : {"$sum" : 1 }, # Count movies per director
727+ "averageRating" : {"$avg" : "$imdb.rating" } # Average rating of director's movies
643728 }
644729 },
645- {"$sort" : {"movieCount" : - 1 }},
730+
731+ # STAGE 5: $sort - Rank Directors by Movie Count
732+ # Sort directors by number of movies (highest first)
733+ {"$sort" : {"movieCount" : - 1 }}, # -1 = descending (most movies first)
734+
735+ # STAGE 6: $limit - Restrict Results
736+ # Limit to top N directors based on user input
646737 {"$limit" : limit },
738+
739+ # STAGE 7: $project - Shape Final Output
740+ # Transform the grouped data into a clean, readable format
647741 {
648742 "$project" : {
649- "director" : "$_id" ,
743+ "director" : "$_id" , # Rename _id to director
650744 "movieCount" : 1 ,
651- "averageRating" : {"$round" : ["$averageRating" , 2 ]},
652- "_id" : 0
745+ "averageRating" : {"$round" : ["$averageRating" , 2 ]}, # Round to 2 decimal places
746+ "_id" : 0 # Exclude the _id field from output
653747 }
654748 }
655749 ]
0 commit comments