@@ -863,7 +863,7 @@ public List<VectorSearchResult> vectorSearchMovies(String query, Integer limit)
863863 // Step 1: Get movie IDs and scores from embedded_movies (which has the vector embeddings)
864864 List <ObjectId > movieIds = new ArrayList <>();
865865 Map <String , Double > scoreMap = new HashMap <>();
866-
866+
867867 mongoTemplate .getCollection ("embedded_movies" )
868868 .aggregate (aggregationPipeline )
869869 .forEach (doc -> {
@@ -873,44 +873,81 @@ public List<VectorSearchResult> vectorSearchMovies(String query, Integer limit)
873873 });
874874
875875 // Step 2: Fetch complete movie data from the movies collection (for CRUD compatibility)
876+ // Use aggregation to safely handle dirty data in the year field
876877 List <VectorSearchResult > results = new ArrayList <>();
878+
877879 if (!movieIds .isEmpty ()) {
878- Query movieQuery = new Query (Criteria .where ("_id" ).in (movieIds ));
879- List <Movie > movies = mongoTemplate .find (movieQuery , Movie .class );
880-
881- // Log if there's a mismatch between collections
882- if (movies .size () != movieIds .size ()) {
883- System .out .println ("Warning: Found " + movieIds .size () +
884- " movies in embedded_movies but only " + movies .size () +
885- " in movies collection for vector search" );
886- }
887-
888- // Convert to VectorSearchResult with scores preserved
889- for (Movie movie : movies ) {
890- String movieIdStr = movie .getId ().toString ();
891- Double score = scoreMap .get (movieIdStr );
892-
893- if (score != null ) { // Only include movies that have vector scores
894- VectorSearchResult result = VectorSearchResult .builder ()
895- .id (movieIdStr )
896- .title (movie .getTitle ())
897- .plot (movie .getPlot ())
898- .poster (movie .getPoster ())
899- .year (movie .getYear ())
900- .genres (movie .getGenres ())
901- .directors (movie .getDirectors ())
902- .cast (movie .getCast ())
903- .score (score )
904- .build ();
905- results .add (result );
906- }
907- }
880+ // Build aggregation pipeline that nulls out any year value not stored as an int
881+ Document matchStage = new Document ("$match" , new Document ("_id" , new Document ("$in" , movieIds )));
882+
883+ // Project stage: pass year through only when it is already an int; dirty values become null
884+ Document projectStage2 = new Document ("$project" , new Document ()
885+ .append ("_id" , 1 )
886+ .append ("title" , 1 )
887+ .append ("plot" , 1 )
888+ .append ("poster" , 1 )
889+ .append ("genres" , 1 )
890+ .append ("directors" , 1 )
891+ .append ("cast" , 1 )
892+ // Keep year only when its BSON type is "int"; strings and other dirty values are replaced with null (no conversion is attempted)
893+ .append ("year" , new Document ("$cond" , new Document ()
894+ .append ("if" , new Document ("$and" , java .util .Arrays .asList (
895+ new Document ("$ne" , java .util .Arrays .asList ("$year" , null )),
896+ new Document ("$eq" , java .util .Arrays .asList (new Document ("$type" , "$year" ), "int" ))
897+ )))
898+ .append ("then" , "$year" )
899+ .append ("else" , null )
900+ ))
901+ );
902+
903+ List <Document > moviePipeline = List .of (matchStage , projectStage2 );
904+
905+ // Execute aggregation and manually build VectorSearchResult objects
906+ mongoTemplate .getCollection ("movies" ).aggregate (moviePipeline )
907+ .forEach (doc -> {
908+ ObjectId movieIdObj = doc .getObjectId ("_id" );
909+ if (movieIdObj == null ) {
910+ return ;
911+ }
912+
913+ String movieIdStr = movieIdObj .toString ();
914+ Double score = scoreMap .get (movieIdStr );
915+
916+ if (score != null ) { // Only include movies that have vector scores
917+ // Safely get list fields, defaulting to null if not present
918+ List <String > genres = doc .getList ("genres" , String .class );
919+ List <String > directors = doc .getList ("directors" , String .class );
920+ List <String > cast = doc .getList ("cast" , String .class );
921+
922+ VectorSearchResult result = VectorSearchResult .builder ()
923+ .id (movieIdStr )
924+ .title (doc .getString ("title" ))
925+ .plot (doc .getString ("plot" ))
926+ .poster (doc .getString ("poster" ))
927+ .year (doc .getInteger ("year" )) // Will be null for dirty data
928+ .genres (genres )
929+ .directors (directors )
930+ .cast (cast )
931+ .score (score )
932+ .build ();
933+ results .add (result );
934+ }
935+ });
908936 }
909937
910938 return results ;
911939
940+ } catch (IOException e ) {
941+ // Handle Voyage AI API errors
942+ String errorMsg = e .getMessage () != null ? e .getMessage () : "Network error calling Voyage AI API" ;
943+ throw new DatabaseOperationException ("Error performing vector search: " + errorMsg );
944+ } catch (InterruptedException e ) {
945+ Thread .currentThread ().interrupt ();
946+ throw new DatabaseOperationException ("Vector search was interrupted" );
912947 } catch (Exception e ) {
913- throw new DatabaseOperationException ("Error performing vector search: " + e .getMessage ());
948+ // Handle other errors (e.g., MongoDB errors, parsing errors)
949+ String errorMsg = e .getMessage () != null ? e .getMessage () : e .getClass ().getSimpleName ();
950+ throw new DatabaseOperationException ("Error performing vector search: " + errorMsg );
914951 }
915952 }
916953
@@ -956,7 +993,26 @@ private List<Double> generateVoyageEmbedding(String text, String apiKey) throws
956993 // Parse the JSON response to extract the embedding
957994 ObjectMapper mapper = new ObjectMapper ();
958995 JsonNode root = mapper .readTree (response .body ());
959- JsonNode embeddingNode = root .path ("data" ).get (0 ).path ("embedding" );
996+
997+ // Validate response structure
998+ if (!root .has ("data" )) {
999+ throw new IOException ("Invalid Voyage AI API response: missing 'data' field. Response: " + response .body ());
1000+ }
1001+
1002+ JsonNode dataNode = root .get ("data" );
1003+ if (dataNode == null || !dataNode .isArray () || dataNode .size () == 0 ) {
1004+ throw new IOException ("Invalid Voyage AI API response: 'data' field is empty or not an array. Response: " + response .body ());
1005+ }
1006+
1007+ JsonNode firstElement = dataNode .get (0 );
1008+ if (firstElement == null || !firstElement .has ("embedding" )) {
1009+ throw new IOException ("Invalid Voyage AI API response: missing 'embedding' field. Response: " + response .body ());
1010+ }
1011+
1012+ JsonNode embeddingNode = firstElement .get ("embedding" );
1013+ if (embeddingNode == null || !embeddingNode .isArray ()) {
1014+ throw new IOException ("Invalid Voyage AI API response: 'embedding' is not an array. Response: " + response .body ());
1015+ }
9601016
9611017 // Convert the embedding to a List<Double>
9621018 List <Double > embedding = new ArrayList <>();
0 commit comments