Skip to content

Commit 1cc3391

Browse files
authored
Merge pull request #12094 from QualitativeDataRepository/indexingperf3
Permission indexing creates unused, orphan perm docs for unchanged files in draft versions
2 parents 0b78ff8 + da704b9 commit 1cc3391

4 files changed

Lines changed: 129 additions & 53 deletions

File tree

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
(assuming the earlier PRs have been merged, there will be a section on indexing improvements already)
2+
This release also avoids creating unused Solr entries for files in drafts of new versions of published datasets (decreasing the Solr db size and thereby improving performance).

src/main/java/edu/harvard/iq/dataverse/FileMetadata.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,15 @@
6767
*/
6868
@Table(indexes = {@Index(columnList="datafile_id"), @Index(columnList="datasetversion_id")} )
6969
@NamedNativeQuery(
70-
name = "FileMetadata.compareFileMetadata",
70+
name = "FileMetadata.getDatafilesWithChangedMetadata",
7171
query = "WITH fm_categories AS (" +
7272
" SELECT fmd.filemetadatas_id, " +
7373
" STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " +
7474
" FROM FileMetadata_DataFileCategory fmd " +
7575
" JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " +
7676
" GROUP BY fmd.filemetadatas_id " +
7777
") " +
78-
"SELECT fm1.id " +
78+
"SELECT fm1.datafile_id AS id " +
7979
"FROM FileMetadata fm1 " +
8080
"LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " +
8181
" AND fm2.datasetversion_id = ?1 " +
@@ -93,11 +93,11 @@
9393
" ) " +
9494
" ) " +
9595
" )",
96-
resultSetMapping = "IdToLongMapping"
96+
resultSetMapping = "IdToIntegerMapping"
9797
)
9898
/* When this mapping was to Long.class, Postgres was still returning an Integer, causing indexing failures - see #11776 */
9999
@SqlResultSetMapping(
100-
name = "IdToLongMapping",
100+
name = "IdToIntegerMapping",
101101
columns = @ColumnResult(name = "id", type = Integer.class)
102102
)
103103
@Entity

src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -602,10 +602,24 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
602602
writeDebugInfo(debug, dataset);
603603
}
604604
if (doNormalSolrDocCleanUp) {
605+
List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
605606
try {
606607
solrIdsOfDocsToDelete = findFilesOfParentDataset(dataset.getId());
607608
logger.fine("Existing file docs: " + String.join(", ", solrIdsOfDocsToDelete));
608609
if (!solrIdsOfDocsToDelete.isEmpty()) {
610+
if (!latestVersion.isDraft()) {
611+
// After publication, we need to delete old draft perm docs
612+
// For the first draft, a perm doc will exist for each file
613+
// For subsequent drafts, perm docs should only exist for new files/those with changed metadata
614+
// This code adds the ids of draft perm docs for all files - if the docs don't exist, Solr will just ignore them
615+
for (String fileDocId : solrIdsOfDocsToDelete) {
616+
if (!fileDocId.endsWith(draftSuffix)) {
617+
solrIdsOfPermissionDocsToDelete.add(fileDocId + draftSuffix + discoverabilityPermissionSuffix);
618+
}
619+
}
620+
621+
logger.fine("Existing permission docs: " + String.join(", ", solrIdsOfPermissionDocsToDelete));
622+
}
609623
// We keep the latest version's docs unless it is deaccessioned and there is no
610624
// published/released version
611625
// So skip the loop removing those docs from the delete list except in that case
@@ -649,7 +663,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
649663
logger.fine("Solr docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));
650664

651665
if (!solrIdsOfDocsToDelete.isEmpty()) {
652-
List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
666+
653667
for (String file : solrIdsOfDocsToDelete) {
654668
// Also remove associated permission docs
655669
solrIdsOfPermissionDocsToDelete.add(file + discoverabilityPermissionSuffix);
@@ -1416,7 +1430,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
14161430
long maxSize = maxFTIndexingSize != null ? maxFTIndexingSize.longValue() : Long.MAX_VALUE;
14171431

14181432
List<String> filesIndexed = new ArrayList<>();
1419-
final List<Long> changedFileMetadataIds = new ArrayList<>();
1433+
final List<Long> changedFileIds = new ArrayList<>();
14201434
if (datasetVersion != null) {
14211435
List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
14221436
List<FileMetadata> rfm = new ArrayList<>();
@@ -1427,42 +1441,17 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
14271441
fileMap.put(released.getDataFile().getId(), released);
14281442
}
14291443

1430-
Query query = em.createNamedQuery("FileMetadata.compareFileMetadata", Long.class);
1431-
query.setParameter(1, dataset.getReleasedVersion().getId());
1432-
query.setParameter(2, datasetVersion.getId());
1433-
1434-
/*
1435-
* When the query was configured to return Long, it was returning Integer. The query has been changed to return Integer now. The code here is robust if that changes in the future.
1436-
*/
1437-
List<Object> queryResults = query.getResultList();
1438-
for (Object result : queryResults) {
1439-
if (result != null) {
1440-
// Ensure we're adding Long objects to the list
1441-
if (result instanceof Integer intResult) {
1442-
logger.finest("Converted Integer result to Long: " + result);
1443-
changedFileMetadataIds.add(Long.valueOf(intResult));
1444-
} else if (result instanceof Long longResult) {
1445-
// Already a Long, add directly
1446-
logger.finest("Added existing Long to list: " + result);
1447-
changedFileMetadataIds.add(longResult);
1448-
} else {
1449-
// If it's not a Long, convert it to one via String
1450-
try {
1451-
changedFileMetadataIds.add(Long.valueOf(result.toString()));
1452-
logger.finest("Converted non-Long result to Long: " + result + " of type " + result.getClass().getName());
1453-
} catch (NumberFormatException e) {
1454-
logger.warning("Could not convert query result to Long: " + result);
1455-
}
1456-
}
1457-
}
1458-
}
1444+
solrIndexService.populateChangedFileIds(
1445+
dataset.getReleasedVersion().getId(),
1446+
datasetVersion.getId(),
1447+
changedFileIds);
14591448
logger.fine(
14601449
"We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions.");
14611450
} else if (datasetVersion.isDraft()) {
14621451
// Add all file metadata ids to changedFileMetadataIds
1463-
changedFileMetadataIds.addAll(
1452+
changedFileIds.addAll(
14641453
fileMetadatas.stream()
1465-
.map(FileMetadata::getId)
1454+
.map(fm -> fm.getDataFile().getId())
14661455
.collect(Collectors.toList())
14671456
);
14681457
}
@@ -1526,7 +1515,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
15261515
}
15271516
boolean indexThisFile = false;
15281517

1529-
if (indexThisMetadata && (isReleasedVersion || changedFileMetadataIds.contains(fileMetadata.getId()))) {
1518+
if (indexThisMetadata && (isReleasedVersion || changedFileIds.contains(datafile.getId()))) {
15301519
indexThisFile = true;
15311520
} else if (indexThisMetadata) {
15321521
// Draft version, file is not new or all file metadata matches the released version

src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java

Lines changed: 100 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import jakarta.json.JsonObjectBuilder;
3737
import jakarta.persistence.EntityManager;
3838
import jakarta.persistence.PersistenceContext;
39+
import jakarta.persistence.Query;
3940

4041
import org.apache.solr.client.solrj.SolrServerException;
4142
import org.apache.solr.common.SolrInputDocument;
@@ -410,58 +411,142 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[]
410411
indexPermissionsForOneDvObject(dataset);
411412

412413
// Process files for this dataset
413-
for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) {
414-
processDatasetVersionFiles(version, fileCounter, fileQueryMin);
414+
Set<DatasetVersion> versions = datasetVersionsToBuildCardsFor(dataset);
415+
final List<Long> changedFileIds = new ArrayList<>();
416+
if(versions.size()>1) {
417+
Long releasedVersionId = null;
418+
Long draftVersionId = null;
419+
420+
for (DatasetVersion version : versions) {
421+
if (version.isReleased()) {
422+
releasedVersionId = version.getId();
423+
} else if (version.isDraft()) {
424+
draftVersionId = version.getId();
425+
}
426+
}
427+
428+
populateChangedFileIds(
429+
releasedVersionId,
430+
draftVersionId,
431+
changedFileIds
432+
);
433+
}
434+
for (DatasetVersion version : versions) {
435+
processDatasetVersionFiles(version, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null);
415436
}
416437
}
417438
}
418439
}
419440

420441
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
421442
public void indexDatasetFilesInNewTransaction(List<DatasetVersion> versions, final int[] fileCounter, int fileQueryMin) {
443+
final List<Long> changedFileIds = new ArrayList<>();
444+
if(versions.size()>1) {
445+
Long releasedVersionId = versions.get(versions.get(0).isReleased() ? 0 : 1).getId();
446+
Long draftVersionId = versions.get(versions.get(0).isReleased() ? 1 : 0).getId();
447+
448+
populateChangedFileIds(
449+
releasedVersionId,
450+
draftVersionId,
451+
changedFileIds
452+
);
453+
}
422454
for (DatasetVersion version : versions) {
423455
// The version object is detached, but its fileMetadatas collection is already loaded.
424456
// We only need its ID and state, which are available.
425-
processDatasetVersionFiles(version, fileCounter, fileQueryMin);
457+
processDatasetVersionFiles(version, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null);
426458
}
427459
}
428460

461+
/**
462+
* Retrieves the IDs of datafiles whose metadata has changed between the released version
463+
* and the draft version of a dataset.
464+
*
465+
* @param releasedVersionId the ID of the released dataset version
466+
* @param draftVersionId the ID of the draft dataset version
467+
* @param changedFileIds the list to populate with the IDs of datafiles whose metadata changed
468+
*/
469+
public void populateChangedFileIds(Long releasedVersionId, Long draftVersionId, List<Long> changedFileIds) {
470+
Query query = em.createNamedQuery("FileMetadata.getDatafilesWithChangedMetadata", Long.class);
471+
query.setParameter(1, releasedVersionId);
472+
query.setParameter(2, draftVersionId);
473+
474+
/*
475+
* When the query was configured to return Long, it was returning Integer.
476+
* The query has been changed to return Integer now. The code here is robust
477+
* if that changes in the future.
478+
*/
479+
List<Object> queryResults = query.getResultList();
480+
for (Object result : queryResults) {
481+
if (result != null) {
482+
// Ensure we're adding Long objects to the list
483+
if (result instanceof Integer intResult) {
484+
logger.finest("Converted Integer result to Long: " + result);
485+
changedFileIds.add(Long.valueOf(intResult));
486+
} else if (result instanceof Long longResult) {
487+
// Already a Long, add directly
488+
logger.finest("Added existing Long to list: " + result);
489+
changedFileIds.add(longResult);
490+
} else {
491+
// If it's not a Long, convert it to one via String
492+
try {
493+
changedFileIds.add(Long.valueOf(result.toString()));
494+
logger.finest("Converted non-Long result to Long: " + result + " of type " + result.getClass().getName());
495+
} catch (NumberFormatException e) {
496+
logger.warning("Could not convert query result to Long: " + result);
497+
}
498+
}
499+
}
500+
}
501+
logger.fine("Found " + changedFileIds.size() + " datafiles whose metadata has changed between versions " + releasedVersionId + " and " + draftVersionId);
502+
}
503+
429504
private void processDatasetVersionFiles(DatasetVersion version,
430-
final int[] fileCounter, int fileQueryMin) {
505+
final int[] fileCounter, int fileQueryMin, List<Long> changedFileIds) {
431506
List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
432507
String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
433508
Long versionId = version.getId();
434509
List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
435510

511+
// If the version is draft and there is a released version,
512+
// we only need perm docs for the files with filemetadata changes == those in changedFileIds
513+
436514
// Process files in batches of 100
437515
int batchSize = 100;
438516

439517
if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) {
440518
// For large datasets, use a more efficient SQL query
519+
// ToDo - only get the ones in finalFileIdsToReindex
441520
try (Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId())) {
442521

443522
// Process files in batches to avoid memory issues
444523
fileStream.forEach(fileInfo -> {
445-
filesToReindexAsBatch.add(fileInfo);
446-
fileCounter[0]++;
447-
448-
if (filesToReindexAsBatch.size() >= batchSize) {
449-
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
450-
filesToReindexAsBatch.clear();
524+
// Only add files that need reindexing
525+
if (changedFileIds == null || changedFileIds.contains(fileInfo.getFileId())) {
526+
filesToReindexAsBatch.add(fileInfo);
527+
fileCounter[0]++;
528+
529+
if (filesToReindexAsBatch.size() >= batchSize) {
530+
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
531+
filesToReindexAsBatch.clear();
532+
}
451533
}
452534
});
453535
}
454536
} else {
455537
// For smaller datasets, process files directly
456538
// We only call getFileMetadatas() in the case where we know they have already been loaded
457539
for (FileMetadata fmd : version.getFileMetadatas()) {
540+
// Only add files that need reindexing
458541
DataFileProxy fileProxy = new DataFileProxy(fmd);
459-
filesToReindexAsBatch.add(fileProxy);
460-
fileCounter[0]++;
542+
if (changedFileIds == null || changedFileIds.contains(fileProxy.getFileId())) {
543+
filesToReindexAsBatch.add(fileProxy);
544+
fileCounter[0]++;
461545

462-
if (filesToReindexAsBatch.size() >= batchSize) {
463-
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
464-
filesToReindexAsBatch.clear();
546+
if (filesToReindexAsBatch.size() >= batchSize) {
547+
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
548+
filesToReindexAsBatch.clear();
549+
}
465550
}
466551
}
467552
}

0 commit comments

Comments
 (0)