11package edu .harvard .iq .dataverse .search ;
22
33import edu .harvard .iq .dataverse .DataFile ;
4+ import edu .harvard .iq .dataverse .DataFileServiceBean ;
45import edu .harvard .iq .dataverse .Dataset ;
56import edu .harvard .iq .dataverse .DatasetServiceBean ;
67import edu .harvard .iq .dataverse .DatasetVersion ;
@@ -53,6 +54,8 @@ public class SolrIndexServiceBean {
5354 @ EJB
5455 SearchPermissionsServiceBean searchPermissionsService ;
5556 @ EJB
57+ DataFileServiceBean dataFileService ;
58+ @ EJB
5659 DataverseServiceBean dataverseService ;
5760 @ EJB
5861 DatasetServiceBean datasetService ;
@@ -365,8 +368,30 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
365368 indexPermissionsForOneDvObject (definitionPoint );
366369 numObjects ++;
367370
368- // Process the dataset's files in a new transaction
369- self .indexDatasetFilesInNewTransaction (dataset .getId (), counter , fileQueryMin );
371+ /**
372+ * Prepare the data needed for the new transaction. For performance reasons, indexDatasetFilesInNewTransaction does not merge the dataset or versions into the new transaction (we only read info, there
373+ * are no changes to write). However, there are two ways the code here is used. In one case, indexing content and permissions, the versions and fileMetadatas in them are already loaded. In the other
374+ * case, indexing permissions only, the fileMetadatas are not yet loaded, and we may need them, but only if there are fewer than fileQueryMin. For each version that will get reindexed (at most two of
375+ * them), the code below does a lightweight query to see how many fileMetadatas exist in it and, if it is equal to or below fileQueryMin, calls getFileMetadatas().size() to assure they are loaded
376+ * (before we pass the version into a new transaction where it will be detached and fileMetadatas can't be loaded). Calling getFileMetadas.size() should be lightweight when the fileMetadatas are
377+ * loaded (first case) and done only when needed for the second case.
378+ *
379+ **/
380+ Map <DatasetVersion .VersionState , Boolean > desiredCards = searchPermissionsService .getDesiredCards (dataset );
381+ List <DatasetVersion > versionsToIndex = new ArrayList <>();
382+ for (DatasetVersion version : versionsToReIndexPermissionsFor (dataset )) {
383+ if (desiredCards .get (version .getVersionState ())) {
384+ int fileCount = dataFileService .findCountByDatasetVersionId (version .getId ()).intValue ();
385+ if (fileCount <= fileQueryMin ) {
386+ // IMPORTANT: This triggers the loading of fileMetadatas within the current transaction
387+ version .getFileMetadatas ().size ();
388+ }
389+ versionsToIndex .add (version );
390+ }
391+ }
392+
393+ // Process the dataset's files in a new transaction, passing the pre-loaded data
394+ self .indexDatasetFilesInNewTransaction (versionsToIndex , counter , fileQueryMin );
370395 } else {
371396 // For other types (like files), just index in a new transaction
372397 indexPermissionsForOneDvObject (definitionPoint );
@@ -399,15 +424,11 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[]
399424 }
400425
401426 @ TransactionAttribute (TransactionAttributeType .REQUIRES_NEW )
402- public void indexDatasetFilesInNewTransaction (Long datasetId , final int [] fileCounter , int fileQueryMin ) {
403- Dataset dataset = datasetService .find (datasetId );
404- if (dataset != null ) {
405- Map <DatasetVersion .VersionState , Boolean > desiredCards = searchPermissionsService .getDesiredCards (dataset );
406- for (DatasetVersion version : versionsToReIndexPermissionsFor (dataset )) {
407- if (desiredCards .get (version .getVersionState ())) {
408- processDatasetVersionFiles (version , fileCounter , fileQueryMin );
409- }
410- }
427+ public void indexDatasetFilesInNewTransaction (List <DatasetVersion > versions , final int [] fileCounter , int fileQueryMin ) {
428+ for (DatasetVersion version : versions ) {
429+ // The version object is detached, but its fileMetadatas collection is already loaded.
430+ // We only need its ID and state, which are available.
431+ processDatasetVersionFiles (version , fileCounter , fileQueryMin );
411432 }
412433 }
413434
@@ -421,22 +442,24 @@ private void processDatasetVersionFiles(DatasetVersion version,
421442 // Process files in batches of 100
422443 int batchSize = 100 ;
423444
424- if (version .getFileMetadatas (). size () > fileQueryMin ) {
445+ if (dataFileService . findCountByDatasetVersionId ( version .getId ()). intValue () > fileQueryMin ) {
425446 // For large datasets, use a more efficient SQL query
426- Stream <DataFileProxy > fileStream = getDataFileInfoForPermissionIndexing (version .getId ());
447+ try ( Stream <DataFileProxy > fileStream = getDataFileInfoForPermissionIndexing (version .getId ())) {
427448
428- // Process files in batches to avoid memory issues
429- fileStream .forEach (fileInfo -> {
430- filesToReindexAsBatch .add (fileInfo );
431- fileCounter [0 ]++;
449+ // Process files in batches to avoid memory issues
450+ fileStream .forEach (fileInfo -> {
451+ filesToReindexAsBatch .add (fileInfo );
452+ fileCounter [0 ]++;
432453
433- if (filesToReindexAsBatch .size () >= batchSize ) {
434- reindexFilesInBatches (filesToReindexAsBatch , cachedPerms , versionId , solrIdEnd );
435- filesToReindexAsBatch .clear ();
436- }
437- });
454+ if (filesToReindexAsBatch .size () >= batchSize ) {
455+ reindexFilesInBatches (filesToReindexAsBatch , cachedPerms , versionId , solrIdEnd );
456+ filesToReindexAsBatch .clear ();
457+ }
458+ });
459+ }
438460 } else {
439461 // For smaller datasets, process files directly
462+ // We only call getFileMetadatas() in the case where we know they have already been loaded
440463 for (FileMetadata fmd : version .getFileMetadatas ()) {
441464 DataFileProxy fileProxy = new DataFileProxy (fmd );
442465 filesToReindexAsBatch .add (fileProxy );
0 commit comments