|
36 | 36 | import jakarta.json.JsonObjectBuilder; |
37 | 37 | import jakarta.persistence.EntityManager; |
38 | 38 | import jakarta.persistence.PersistenceContext; |
| 39 | +import jakarta.persistence.Query; |
39 | 40 |
|
40 | 41 | import org.apache.solr.client.solrj.SolrServerException; |
41 | 42 | import org.apache.solr.common.SolrInputDocument; |
@@ -410,58 +411,142 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[] |
410 | 411 | indexPermissionsForOneDvObject(dataset); |
411 | 412 |
|
412 | 413 | // Process files for this dataset |
413 | | - for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { |
414 | | - processDatasetVersionFiles(version, fileCounter, fileQueryMin); |
| 414 | + Set<DatasetVersion> versions = datasetVersionsToBuildCardsFor(dataset); |
| 415 | + final List<Long> changedFileIds = new ArrayList<>(); |
| 416 | + if(versions.size()>1) { |
| 417 | + Long releasedVersionId = null; |
| 418 | + Long draftVersionId = null; |
| 419 | + |
| 420 | + for (DatasetVersion version : versions) { |
| 421 | + if (version.isReleased()) { |
| 422 | + releasedVersionId = version.getId(); |
| 423 | + } else if (version.isDraft()) { |
| 424 | + draftVersionId = version.getId(); |
| 425 | + } |
| 426 | + } |
| 427 | + |
| 428 | + populateChangedFileIds( |
| 429 | + releasedVersionId, |
| 430 | + draftVersionId, |
| 431 | + changedFileIds |
| 432 | + ); |
| 433 | + } |
| 434 | + for (DatasetVersion version : versions) { |
| 435 | + processDatasetVersionFiles(version, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null); |
415 | 436 | } |
416 | 437 | } |
417 | 438 | } |
418 | 439 | } |
419 | 440 |
|
420 | 441 | @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) |
421 | 442 | public void indexDatasetFilesInNewTransaction(List<DatasetVersion> versions, final int[] fileCounter, int fileQueryMin) { |
| 443 | + final List<Long> changedFileIds = new ArrayList<>(); |
| 444 | + if(versions.size()>1) { |
| 445 | + Long releasedVersionId = versions.get(versions.get(0).isReleased() ? 0 : 1).getId(); |
| 446 | + Long draftVersionId = versions.get(versions.get(0).isReleased() ? 1 : 0).getId(); |
| 447 | + |
| 448 | + populateChangedFileIds( |
| 449 | + releasedVersionId, |
| 450 | + draftVersionId, |
| 451 | + changedFileIds |
| 452 | + ); |
| 453 | + } |
422 | 454 | for (DatasetVersion version : versions) { |
423 | 455 | // The version object is detached, but its fileMetadatas collection is already loaded. |
424 | 456 | // We only need its ID and state, which are available. |
425 | | - processDatasetVersionFiles(version, fileCounter, fileQueryMin); |
| 457 | + processDatasetVersionFiles(version, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null); |
426 | 458 | } |
427 | 459 | } |
428 | 460 |
|
| 461 | + /** |
| 462 | + * Retrieves the IDs of file metadatas that have changed between the released version |
| 463 | + * and the draft version of a dataset. |
| 464 | + * |
| 465 | + * @param releasedVersionId the ID of the released dataset version |
| 466 | + * @param draftVersionId the ID of the draft dataset version |
| 467 | + * @param changedFileMetadataIds the list to populate with changed file metadata IDs |
| 468 | + */ |
| 469 | + public void populateChangedFileIds(Long releasedVersionId, Long draftVersionId, List<Long> changedFileIds) { |
| 470 | + Query query = em.createNamedQuery("FileMetadata.getDatafilesWithChangedMetadata", Long.class); |
| 471 | + query.setParameter(1, releasedVersionId); |
| 472 | + query.setParameter(2, draftVersionId); |
| 473 | + |
| 474 | + /* |
| 475 | + * When the query was configured to return Long, it was returning Integer. |
| 476 | + * The query has been changed to return Integer now. The code here is robust |
| 477 | + * if that changes in the future. |
| 478 | + */ |
| 479 | + List<Object> queryResults = query.getResultList(); |
| 480 | + for (Object result : queryResults) { |
| 481 | + if (result != null) { |
| 482 | + // Ensure we're adding Long objects to the list |
| 483 | + if (result instanceof Integer intResult) { |
| 484 | + logger.finest("Converted Integer result to Long: " + result); |
| 485 | + changedFileIds.add(Long.valueOf(intResult)); |
| 486 | + } else if (result instanceof Long longResult) { |
| 487 | + // Already a Long, add directly |
| 488 | + logger.finest("Added existing Long to list: " + result); |
| 489 | + changedFileIds.add(longResult); |
| 490 | + } else { |
| 491 | + // If it's not a Long, convert it to one via String |
| 492 | + try { |
| 493 | + changedFileIds.add(Long.valueOf(result.toString())); |
| 494 | + logger.finest("Converted non-Long result to Long: " + result + " of type " + result.getClass().getName()); |
| 495 | + } catch (NumberFormatException e) { |
| 496 | + logger.warning("Could not convert query result to Long: " + result); |
| 497 | + } |
| 498 | + } |
| 499 | + } |
| 500 | + } |
| 501 | + logger.fine("Found " + changedFileIds.size() + " datafiles whose metadata has changed between versions " + releasedVersionId + " and " + draftVersionId); |
| 502 | + } |
| 503 | + |
429 | 504 | private void processDatasetVersionFiles(DatasetVersion version, |
430 | | - final int[] fileCounter, int fileQueryMin) { |
| 505 | + final int[] fileCounter, int fileQueryMin, List<Long> changedFileIds) { |
431 | 506 | List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); |
432 | 507 | String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); |
433 | 508 | Long versionId = version.getId(); |
434 | 509 | List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>(); |
435 | 510 |
|
| 511 | + // If the version is a draft and there is also a released version,
| 512 | + // we only need perm docs for the files whose metadata changed, i.e. those whose IDs are in changedFileIds
| 513 | + |
436 | 514 | // Process files in batches of 100 |
437 | 515 | int batchSize = 100; |
438 | 516 |
|
439 | 517 | if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) { |
440 | 518 | // For large datasets, use a more efficient SQL query |
| 519 | + // ToDo - only fetch the files whose IDs are in changedFileIds
441 | 520 | try (Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId())) { |
442 | 521 |
|
443 | 522 | // Process files in batches to avoid memory issues |
444 | 523 | fileStream.forEach(fileInfo -> { |
445 | | - filesToReindexAsBatch.add(fileInfo); |
446 | | - fileCounter[0]++; |
447 | | - |
448 | | - if (filesToReindexAsBatch.size() >= batchSize) { |
449 | | - reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); |
450 | | - filesToReindexAsBatch.clear(); |
| 524 | + // Only add files that need reindexing |
| 525 | + if (changedFileIds == null || changedFileIds.contains(fileInfo.getFileId())) { |
| 526 | + filesToReindexAsBatch.add(fileInfo); |
| 527 | + fileCounter[0]++; |
| 528 | + |
| 529 | + if (filesToReindexAsBatch.size() >= batchSize) { |
| 530 | + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); |
| 531 | + filesToReindexAsBatch.clear(); |
| 532 | + } |
451 | 533 | } |
452 | 534 | }); |
453 | 535 | } |
454 | 536 | } else { |
455 | 537 | // For smaller datasets, process files directly |
456 | 538 | // We only call getFileMetadatas() in the case where we know they have already been loaded |
457 | 539 | for (FileMetadata fmd : version.getFileMetadatas()) { |
| 540 | + // Only add files that need reindexing |
458 | 541 | DataFileProxy fileProxy = new DataFileProxy(fmd); |
459 | | - filesToReindexAsBatch.add(fileProxy); |
460 | | - fileCounter[0]++; |
| 542 | + if (changedFileIds == null || changedFileIds.contains(fileProxy.getFileId())) { |
| 543 | + filesToReindexAsBatch.add(fileProxy); |
| 544 | + fileCounter[0]++; |
461 | 545 |
|
462 | | - if (filesToReindexAsBatch.size() >= batchSize) { |
463 | | - reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); |
464 | | - filesToReindexAsBatch.clear(); |
| 546 | + if (filesToReindexAsBatch.size() >= batchSize) { |
| 547 | + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); |
| 548 | + filesToReindexAsBatch.clear(); |
| 549 | + } |
465 | 550 | } |
466 | 551 | } |
467 | 552 | } |
|
0 commit comments