diff --git a/conf/solr/solrconfig.xml b/conf/solr/solrconfig.xml index 97965bd77d7..003b71c85c1 100644 --- a/conf/solr/solrconfig.xml +++ b/conf/solr/solrconfig.xml @@ -238,7 +238,7 @@ have some sort of hard autoCommit to limit the log size. --> - ${solr.autoCommit.maxTime:30000} + ${solr.autoCommit.maxTime:300000} false diff --git a/doc/release-notes/11374-indexing-improvement.md b/doc/release-notes/11374-indexing-improvement.md new file mode 100644 index 00000000000..b852146d110 --- /dev/null +++ b/doc/release-notes/11374-indexing-improvement.md @@ -0,0 +1,5 @@ +### Solr Indexing speed improved + +The performance of Solr indexing has been significantly improved, particularly for datasets with many files. + +A new dataverse.solr.min-files-to-use-proxy microprofile setting can be used to further improve performance/lower memory requirements for datasets with many files (e.g. 500+) (defaults to Integer.MAX_VALUE, disabling use of the new functionality) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index c1790302de9..bbc141cca91 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2689,6 +2689,17 @@ when using it to configure your core name! Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_PATH``. +dataverse.solr.min-files-to-use-proxy ++++++++++++++++++++++++++++++++++++++ + +Specifies when to use a smaller datafile proxy object for the purposes of dataset indexing. This can lower memory requirements +and improve performance when reindexing large datasets (e.g. those with hundreds or thousands of files). (Creating the proxy may slightly slow indexing datasets with only a few files.) + +This setting represents a number of files for which the datafile proxy should be used. By default, this is set to Integer.MAX_VALUE which disables using the proxy. 
+A recommended value would be ~1000 but the optimal value may vary depending on details of your installation. + +Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``. + dataverse.solr.concurrency.max-async-indexes ++++++++++++++++++++++++++++++++++++++++++++ diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index 01c1a48e117..45604a5472b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -13,6 +13,7 @@ import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker; import edu.harvard.iq.dataverse.ingest.IngestReport; import edu.harvard.iq.dataverse.ingest.IngestRequest; +import edu.harvard.iq.dataverse.search.SolrIndexServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.ShapefileHandler; @@ -23,6 +24,7 @@ import java.util.Objects; import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Date; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -50,6 +52,26 @@ @NamedQuery(name="DataFile.findDataFileThatReplacedId", query="SELECT s.id FROM DataFile s WHERE s.previousDataFileId=:identifier") }) +@NamedNativeQuery( + name = "DataFile.getDataFileInfoForPermissionIndexing", + query = "SELECT fm.label, df.id, dvo.publicationDate " + + "FROM filemetadata fm " + + "JOIN datafile df ON fm.datafile_id = df.id " + + "JOIN dvobject dvo ON df.id = dvo.id " + + "WHERE fm.datasetversion_id = ?", + resultSetMapping = "DataFileInfoMapping" + ) + @SqlResultSetMapping( + name = "DataFileInfoMapping", + classes = @ConstructorResult( + targetClass = SolrIndexServiceBean.DataFileProxy.class, + columns = { + @ColumnResult(name = "label", type = String.class), + @ColumnResult(name = "id", type = Long.class), + @ColumnResult(name = 
"publicationDate", type = Date.class) + } + ) + ) @Entity @Table(indexes = {@Index(columnList="ingeststatus") , @Index(columnList="checksumvalue") diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataset.java b/src/main/java/edu/harvard/iq/dataverse/Dataset.java index 79c64d03d60..fd3f8333768 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataset.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataset.java @@ -20,10 +20,12 @@ import java.util.Objects; import java.util.Set; import jakarta.persistence.CascadeType; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; import jakarta.persistence.NamedStoredProcedureQuery; @@ -31,6 +33,7 @@ import jakarta.persistence.OneToOne; import jakarta.persistence.OrderBy; import jakarta.persistence.ParameterMode; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.StoredProcedureParameter; import jakarta.persistence.Table; import jakarta.persistence.Temporal; @@ -71,6 +74,23 @@ @NamedQuery(name = "Dataset.countAll", query = "SELECT COUNT(ds) FROM Dataset ds") }) +@NamedNativeQuery( + name = "Dataset.findAllOrSubsetOrderByFilesOwned", + query = "SELECT DISTINCT CAST(o.id AS BIGINT) as id, COUNT(f.id) as numFiles " + + "FROM dvobject o " + + "LEFT JOIN dvobject f ON f.owner_id = o.id " + + "WHERE o.dtype = 'Dataset' " + + "AND (? = false OR o.indexTime IS NULL) " + + "GROUP BY o.id " + + "ORDER BY numfiles ASC, id", + resultSetMapping = "DatasetIdMapping" + ) +@SqlResultSetMapping( + name = "DatasetIdMapping", + columns = { + @ColumnResult(name = "id", type = Long.class) + } +) /* Below is the database stored procedure for getting a string dataset id. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 9a8c43668cb..202800d027b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -279,32 +279,9 @@ public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { SEK - 11/09/2021 */ - String skipClause = skipIndexed ? "AND o.indexTime is null " : ""; - Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " + - "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " - + skipClause - + " group by o.id " - + "ORDER BY count(f.id) asc, o.id"); - - List queryResults; - queryResults = query.getResultList(); - - List retVal = new ArrayList(); - for (Object[] result : queryResults) { - Long dsId; - if (result[0] != null) { - try { - dsId = Long.parseLong(result[0].toString()) ; - } catch (Exception ex) { - dsId = null; - } - if (dsId == null) { - continue; - } - retVal.add(dsId); - } - } - return retVal; + return em.createNamedQuery("Dataset.findAllOrSubsetOrderByFilesOwned", Long.class) + .setParameter(1, skipIndexed) + .getResultList(); } /** diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index 461c8b14e46..ca3e2d67263 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -25,6 +25,7 @@ import jakarta.json.Json; import jakarta.json.JsonArrayBuilder; import jakarta.persistence.Column; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; @@ -35,8 +36,10 @@ import jakarta.persistence.JoinTable; import jakarta.persistence.ManyToMany; import jakarta.persistence.ManyToOne; +import 
jakarta.persistence.NamedNativeQuery; import jakarta.persistence.OneToMany; import jakarta.persistence.OrderBy; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.Table; import jakarta.persistence.Transient; import jakarta.persistence.Version; @@ -62,6 +65,39 @@ * @author skraffmiller */ @Table(indexes = {@Index(columnList="datafile_id"), @Index(columnList="datasetversion_id")} ) +@NamedNativeQuery( + name = "FileMetadata.compareFileMetadata", + query = "WITH fm_categories AS (" + + " SELECT fmd.filemetadatas_id, " + + " STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " + + " FROM FileMetadata_DataFileCategory fmd " + + " JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " + + " GROUP BY fmd.filemetadatas_id " + + ") " + + "SELECT fm1.id " + + "FROM FileMetadata fm1 " + + "LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " + + " AND fm2.datasetversion_id = ?1 " + + "LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " + + "LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " + + "WHERE fm1.datasetversion_id = ?2 " + + " AND (fm2.id IS NULL " + + " OR (fm1.datafile_id = fm2.datafile_id " + + " AND (fm2.description IS DISTINCT FROM fm1.description " + + " OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " + + " OR fm2.label != fm1.label " + + " OR fm2.restricted IS DISTINCT FROM fm1.restricted " + + " OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " + + " OR fc1.categories IS DISTINCT FROM fc2.categories " + + " ) " + + " ) " + + " )", + resultSetMapping = "IdToLongMapping" + ) +@SqlResultSetMapping( + name = "IdToLongMapping", + columns = @ColumnResult(name = "id", type = Long.class) + ) @Entity public class FileMetadata implements Serializable { private static final long serialVersionUID = 1L; diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index e0dea739edc..9a73cad7877 
100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -64,26 +64,41 @@ When there are changes (after v4.19)to the file metadata data model this method if (newFileMetadata.getDataFile() == null && originalFileMetadata != null){ //File Deleted - updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); + if (details) { + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); + } return false; } - - if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){ + + if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null){ //File Added - if (!details) return false; - retVal = false; - updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); - } - - //Check to see if File replaced - if (originalFileMetadata != null && - newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){ - if (!details) return false; - updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); + if (!details) { + return false; + } retVal = false; + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); } - if ( originalFileMetadata != null) { + if (originalFileMetadata != null) { + // Check to see if File replaced + if (newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null && !this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())) { + if (!details) + return false; + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); + retVal = 
false; + } + + /* + * Get Restriction Differences + */ + if (originalFileMetadata.isRestricted() != newFileMetadata.isRestricted()) { + if (details) { + String value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); + updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); + } + retVal = false; + } + if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel())); @@ -94,10 +109,8 @@ When there are changes (after v4.19)to the file metadata data model this method BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0); retVal = false; } - } - //Description differences - if ( originalFileMetadata != null) { + //Description differences if (newFileMetadata.getDescription() != null && originalFileMetadata.getDescription() != null && !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) { @@ -134,9 +147,7 @@ When there are changes (after v4.19)to the file metadata data model this method BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0); retVal = false; } - } - //Provenance Description differences - if ( originalFileMetadata != null) { + //Provenance Description differences if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty()) && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty()) && !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) { @@ -173,8 +184,6 @@ When there are changes (after v4.19)to the file metadata data model this 
method BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0); retVal = false; } - } - if (originalFileMetadata != null) { /* get Tags differences */ @@ -188,7 +197,9 @@ When there are changes (after v4.19)to the file metadata data model this method } if (!value1.equals(value2)) { - if (!details) return false; + if (!details) { + return false; + } int added = 0; int deleted = 0; @@ -223,16 +234,7 @@ When there are changes (after v4.19)to the file metadata data model this method } retVal = false; } - - /* - Get Restriction Differences - */ - value1 = originalFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); - value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); - if (!value1.equals(value2)) { - updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); - retVal = false; - } + } return retVal; } diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 88acc1916cf..8ac2aabdfa4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -2,6 +2,7 @@ import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.DataverseRole; +import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.RoleAssignee; import edu.harvard.iq.dataverse.authorization.groups.Group; import edu.harvard.iq.dataverse.authorization.groups.GroupServiceBean; @@ -27,6 +28,7 @@ import jakarta.ejb.Stateless; import jakarta.inject.Named; import 
jakarta.persistence.EntityManager; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.PersistenceContext; import org.apache.commons.lang3.StringUtils; @@ -395,6 +397,15 @@ public List filterRoleAssignees(String query, DvObject dvObject, L return roleAssigneeList; } + + + public List findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) { + int bitpos = 63 - permission.ordinal(); + return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class) + .setParameter(1, bitpos) + .setParameter(2, objectId) + .getResultList(); + } private void msg(String s) { //System.out.println(s); diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index df004fe1357..eed7fe07637 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -5,6 +5,7 @@ import java.util.Objects; import jakarta.persistence.CascadeType; import jakarta.persistence.Column; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; @@ -12,8 +13,11 @@ import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQueries; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.Table; import jakarta.persistence.UniqueConstraint; @@ -52,6 +56,33 @@ @NamedQuery( name = "RoleAssignment.deleteAllByAssigneeIdentifier_Definition_PointId_RoleType", query = "DELETE FROM RoleAssignment r WHERE r.assigneeIdentifier=:assigneeIdentifier AND r.role.id=:roleId and r.definitionPoint.id=:definitionPointId") }) +@NamedNativeQueries({ + @NamedNativeQuery( + name = 
"RoleAssignment.findAssigneesWithPermissionOnDvObject", + query = "WITH RECURSIVE owner_hierarchy(id, owner_id, permissionroot) AS ( " + + " SELECT dvo.id, dvo.owner_id, COALESCE(dv.permissionroot, false) " + + " FROM dvobject dvo " + + " LEFT JOIN dataverse dv ON dvo.id = dv.id " + + " WHERE dvo.id = ?2 " + + " UNION ALL " + + " SELECT dvo.id, dvo.owner_id, dv.permissionroot " + + " FROM dvobject dvo " + + " LEFT JOIN dataverse dv ON dvo.id = dv.id " + + " JOIN owner_hierarchy oh ON dvo.id = oh.owner_id " + + " WHERE NOT oh.permissionroot " + + ") " + + "SELECT DISTINCT ra.assigneeidentifier " + + "FROM roleassignment ra " + + "JOIN dataverserole dr ON ra.role_id = dr.id " + + "JOIN owner_hierarchy oh ON ra.definitionpoint_id = oh.id " + + "WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1'", + resultSetMapping = "AssigneeIdentifierMapping" + ) +}) +@SqlResultSetMapping( + name = "AssigneeIdentifierMapping", + columns = @ColumnResult(name = "assigneeidentifier") + ) public class RoleAssignment implements java.io.Serializable { @Id @GeneratedValue(strategy = GenerationType.IDENTITY) diff --git a/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java b/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java index 209ffd93fe3..136bd1b7bae 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java @@ -3,34 +3,33 @@ import edu.harvard.iq.dataverse.FileMetadata; import java.util.Collection; +import java.util.HashMap; +import java.util.Map; public class VariableMetadataUtil { public static boolean compareVariableMetadata(FileMetadata fmdo, FileMetadata fmdn) { Collection vmlo = fmdo.getVariableMetadatas(); Collection vmln = fmdn.getVariableMetadatas(); - - int count = 0; + if (vmlo.size() != vmln.size()) { return false; - } else { - for (VariableMetadata vmo : vmlo) { - for (VariableMetadata vmn : vmln) { - if 
(vmo.getDataVariable().getId().equals(vmn.getDataVariable().getId())) { - count++; - if (!compareVarMetadata(vmo, vmn)) { - return false; - } - } - } - } } - if (count == vmlo.size()) { - return true; - } else { - return false; + + Map vmnMap = new HashMap<>(); + for (VariableMetadata vmn : vmln) { + vmnMap.put(vmn.getDataVariable().getId(), vmn); } - + + for (VariableMetadata vmo : vmlo) { + Long id = vmo.getDataVariable().getId(); + VariableMetadata vmn = vmnMap.get(id); + if (vmn == null || !compareVarMetadata(vmo, vmn)) { + return false; + } + } + + return true; } public static boolean compareVarMetadata(VariableMetadata vmOld, VariableMetadata vmNew) { diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 3f7a7bb3363..6e47d1938c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -175,7 +175,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo // List dataverses = dataverseService.findAllOrSubset(numPartitions, partitionId, skipIndexed); // Note: no support for "partitions" in this experimental branch. // The method below returns the ids of all the unindexed dataverses. 
- List dataverseIds = dataverseIds = dataverseService.findDataverseIdsForIndexing(skipIndexed); + List dataverseIds = dataverseService.findDataverseIdsForIndexing(skipIndexed); int dataverseIndexCount = 0; int dataverseFailureCount = 0; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index a8e6c0661d7..a38a8679209 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -4,6 +4,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.DataFileTag; +import edu.harvard.iq.dataverse.DataTable; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldCompoundValue; @@ -29,6 +30,7 @@ import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.PermissionServiceBean; import edu.harvard.iq.dataverse.Retention; +import edu.harvard.iq.dataverse.TermsOfUseAndAccess; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean; import edu.harvard.iq.dataverse.batch.util.LoggingUtil; @@ -39,7 +41,6 @@ import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.datavariable.VariableMetadata; import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil; -import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.search.IndexableDataset.DatasetState; import edu.harvard.iq.dataverse.settings.FeatureFlags; @@ -69,6 +70,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Future; import java.util.concurrent.Semaphore; +import 
java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.logging.Level; import java.util.logging.Logger; @@ -87,6 +89,7 @@ import jakarta.json.JsonObject; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; +import jakarta.persistence.Query; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -153,9 +156,6 @@ public class IndexServiceBean { @EJB DataFileServiceBean dataFileService; - @EJB - VariableServiceBean variableService; - @EJB DatasetFieldServiceBean datasetFieldService; @@ -339,9 +339,7 @@ public Future indexDataverse(Dataverse dataverse, boolean processPaths) @TransactionAttribute(REQUIRES_NEW) public void indexDatasetInNewTransaction(Long datasetId) { //Dataset dataset) { boolean doNormalSolrDocCleanUp = false; - Dataset dataset = datasetService.findDeep(datasetId); - asyncIndexDataset(dataset, doNormalSolrDocCleanUp); - dataset = null; + asyncIndexDataset(datasetId, doNormalSolrDocCleanUp); } // The following two variables are only used in the synchronized getNextToIndex method and do not need to be synchronized themselves @@ -430,6 +428,26 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { } } + @Asynchronous + public void asyncIndexDataset(Long datasetId, boolean doNormalSolrDocCleanUp) { + //Initialize dataset here for logging (LoggingUtil) purposes + Dataset dataset = null; + try { + acquirePermitFromSemaphore(); + dataset = datasetService.find(datasetId); + doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); + } catch (InterruptedException e) { + String failureLogText = "Indexing failed: interrupted. 
You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetId.toString(); + failureLogText += "\r\n" + e.getLocalizedMessage(); + if(dataset==null) { + dataset = new Dataset(); + dataset.setId(datasetId); + } + LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset); + } finally { + ASYNC_INDEX_SEMAPHORE.release(); + } + } private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { Long id = dataset.getId(); Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (ongoing index job will reindex the newest version after current indexing finishes) @@ -1374,87 +1392,131 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set filesIndexed = new ArrayList<>(); + final List changedFileMetadataIds = new ArrayList<>(); if (datasetVersion != null) { List fileMetadatas = datasetVersion.getFileMetadatas(); - List releasedFileMetadatas = new ArrayList<>(); + List rfm = new ArrayList<>(); Map fileMap = new HashMap<>(); - boolean checkForDuplicateMetadata = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { - checkForDuplicateMetadata = true; - releasedFileMetadatas = dataset.getReleasedVersion().getFileMetadatas(); - for(FileMetadata released: releasedFileMetadatas){ + rfm = dataset.getReleasedVersion().getFileMetadatas(); + for (FileMetadata released : rfm) { fileMap.put(released.getDataFile().getId(), released); } + + Query query = em.createNamedQuery("FileMetadata.compareFileMetadata", Long.class); + query.setParameter(1, dataset.getReleasedVersion().getId()); + query.setParameter(2, datasetVersion.getId()); + + changedFileMetadataIds.addAll(query.getResultList()); logger.fine( "We are indexing a draft version of a dataset that has a released version. 
We'll be checking file metadatas if they are exact clones of the released versions."); + } else if (datasetVersion.isDraft()) { + // Add all file metadata ids to changedFileMetadataIds + changedFileMetadataIds.addAll( + fileMetadatas.stream() + .map(FileMetadata::getId) + .collect(Collectors.toList()) + ); } - LocalDate embargoEndDate=null; - LocalDate retentionEndDate=null; - final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? - dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); + + AtomicReference embargoEndDateRef = new AtomicReference<>(null); + AtomicReference retentionEndDateRef = new AtomicReference<>(null); + final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); final Long datasetId = dataset.getId(); final String datasetGlobalId = dataset.getGlobalId().toString(); - for (FileMetadata fileMetadata : fileMetadatas) { + final String parentTitle = parentDatasetTitle; + + AutoDetectParser ap = null; + ParseContext ct = null; + if (doFullTextIndexing) { + ap = new AutoDetectParser(); + ct = new ParseContext(); + } + final AutoDetectParser autoParser = ap; + final ParseContext context = ct; + + Set datasetPublicationStatuses = new HashSet(); + if (dataset.getReleasedVersion() == null && !dataset.isHarvested()) { + datasetPublicationStatuses.add(UNPUBLISHED_STRING); + } + + if (datasetVersion.isInReview()) { + datasetPublicationStatuses.add(IN_REVIEW_STRING); + } + + if (indexableDataset.getDatasetState().equals(DatasetState.PUBLISHED)) { + datasetPublicationStatuses.add(PUBLISHED_STRING); + } else { + if (indexableDataset.getDatasetState().equals(DatasetState.WORKING_COPY)) { + datasetPublicationStatuses.add(DRAFT_STRING); + } + } + + String datasetVersionId = datasetVersion.getId().toString(); + boolean indexThisMetadata = indexableDataset.isFilesShouldBeIndexed(); + boolean 
isReleasedVersion = datasetVersion.isReleased(); + + String datasetPersistentURL = dataset.getPersistentURL(); + boolean isHarvested = dataset.isHarvested(); + long startTime = System.currentTimeMillis(); + fileMetadatas.stream().forEach(fileMetadata -> { + DataFile datafile = fileMetadata.getDataFile(); + Embargo emb = datafile.getEmbargo(); LocalDate end = null; - LocalDate start = null; - Embargo emb= fileMetadata.getDataFile().getEmbargo(); - if(emb!=null) { - end = emb.getDateAvailable(); - if(embargoEndDate==null || end.isAfter(embargoEndDate)) { - embargoEndDate=end; - } + if (emb != null) { + final LocalDate endDate = emb.getDateAvailable(); + embargoEndDateRef.updateAndGet(current -> (current == null || endDate.isAfter(current)) ? endDate : current); + end = endDate; } - Retention ret= fileMetadata.getDataFile().getRetention(); - if(ret!=null) { - start = ret.getDateUnavailable(); - if(retentionEndDate==null || start.isBefore(retentionEndDate)) { - retentionEndDate=start; - } + Retention ret = datafile.getRetention(); + LocalDate start = null; + if (ret != null) { + final LocalDate startDate = ret.getDateUnavailable(); + retentionEndDateRef.updateAndGet(current -> (current == null || startDate.isBefore(current)) ? 
startDate : current); + start = startDate; } - boolean indexThisMetadata = indexableDataset.isFilesShouldBeIndexed(); - if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { + boolean indexThisFile = false; + + if (indexThisMetadata && (isReleasedVersion || changedFileMetadataIds.contains(fileMetadata.getId()))) { + indexThisFile = true; + } else if (indexThisMetadata) { logger.fine("Checking if this file metadata is a duplicate."); - FileMetadata getFromMap = fileMap.get(fileMetadata.getDataFile().getId()); + FileMetadata getFromMap = fileMap.get(datafile.getId()); if (getFromMap != null) { - if ((fileMetadata.getDataFile().isRestricted() == getFromMap.getDataFile().isRestricted())) { - if (fileMetadata.contentEquals(getFromMap) - && VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { - indexThisMetadata = false; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } else { - logger.fine("This file metadata has changed since the released version; we want to index it!"); - } - } else { - logger.fine("This file's restricted status has changed since the released version; we want to index it!"); + if (!VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { + indexThisFile = true; + logger.fine("This file metadata has changed since the released version; we want to index it!"); } } - } - if (indexThisMetadata) { + } + if (indexThisFile) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); - Long fileEntityId = fileMetadata.getDataFile().getId(); + Long fileEntityId = datafile.getId(); datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId); datafileSolrInputDocument.addField(SearchFields.DATAVERSE_VERSION_INDEXED_BY, dataverseVersion); datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId); - datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL()); + 
datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, datasetPersistentURL); datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); datafileSolrInputDocument.addField(SearchFields.CATEGORY_OF_DATAVERSE, dvIndexableCategoryName); - if(end!=null) { - datafileSolrInputDocument.addField(SearchFields.EMBARGO_END_DATE, end.toEpochDay()); + if (end != null) { + datafileSolrInputDocument.addField(SearchFields.EMBARGO_END_DATE, end.toEpochDay()); } - if(start!=null) { + if (start != null) { datafileSolrInputDocument.addField(SearchFields.RETENTION_END_DATE, start.toEpochDay()); } /* Full-text indexing using Apache Tika */ if (doFullTextIndexing) { - if (!dataset.isHarvested() && !fileMetadata.getDataFile().isRestricted() - && !fileMetadata.getDataFile().isFilePackage() - && fileMetadata.getDataFile().getRetention() == null) { + long fileSize = datafile.getFilesize(); + if (!isHarvested && !datafile.isRestricted() + && !datafile.isFilePackage() + && fileSize != 0 && fileSize <= maxSize + && datafile.getRetention() == null) { StorageIO accessObject = null; InputStream instream = null; ContentHandler textHandler = null; try { - accessObject = DataAccess.getStorageIO(fileMetadata.getDataFile(), + accessObject = DataAccess.getStorageIO(datafile, new DataAccessRequest()); if (accessObject != null) { accessObject.open(); @@ -1463,37 +1525,35 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, s)); String fileSolrDocId = solrDocIdentifierFile + fileEntityId; - if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) { - fileSolrDocId = solrDocIdentifierFile + fileEntityId; - datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); + indexableDataset.getDatasetState(); + if (datasetPublicationStatuses.contains(PUBLISHED_STRING)) { if (FeatureFlags.ADD_PUBLICOBJECT_SOLR_FIELD.enabled()) { 
datafileSolrInputDocument.addField(SearchFields.PUBLIC_OBJECT, true); } - // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString); addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset); - // has this published file been deleted from the current draft version? + // has this published file been deleted from the current draft version? if (datafilesInDraftVersion != null && !datafilesInDraftVersion.contains(datafile.getId())) { datafileSolrInputDocument.addField(SearchFields.FILE_DELETED, true); } - } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) { - fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix(); - datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); + } else { + indexableDataset.getDatasetState(); + if (datasetPublicationStatuses.contains(DRAFT_STRING)) { + fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix(); + } } datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType()); - datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType()); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, datafile.getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, datafile.getContentType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, datafile.getFriendlyType()); // For the file type facets, we have a property file that maps mime types // to facet-friendly names; "application/fits" should become "FITS", etc.: - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, 
FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile())); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile())); - datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize()); - if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) { + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getIndexableFacetFileType(datafile)); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getIndexableFacetFileType(datafile)); + datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, datafile.getFilesize()); + if (DataFile.ChecksumType.MD5.equals(datafile.getChecksumType())) { /** * @todo Someday we should probably deprecate this * FILE_MD5 in favor of a combination of * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE. */ - datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue()); + datafileSolrInputDocument.addField(SearchFields.FILE_MD5, datafile.getChecksumValue()); } - datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString()); - datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue()); - datafileSolrInputDocument.addField(SearchFields.FILE_RESTRICTED, fileMetadata.getDataFile().isRestricted()); + datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, datafile.getChecksumType().toString()); + datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, datafile.getChecksumValue()); + datafileSolrInputDocument.addField(SearchFields.FILE_RESTRICTED, datafile.isRestricted()); datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription()); - 
GlobalId filePid = fileMetadata.getDataFile().getGlobalId(); + GlobalId filePid = datafile.getGlobalId(); datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID, (filePid != null) ? filePid.toString() : null); - datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf()); + datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataFile.getOwner().getOwner().getName()); @@ -1662,24 +1707,24 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variables = fileMetadata.getDataFile().getDataTable().getDataVariables(); - Long observations = fileMetadata.getDataFile().getDataTable().getCaseQuantity(); - datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); + DataTable dtable = datafile.getDataTable(); + if (dtable != null) { + List variables = dtable.getDataVariables(); + Long observations = dtable.getCaseQuantity(); datafileSolrInputDocument.addField(SearchFields.VARIABLE_COUNT, variables.size()); - + datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); + datafileSolrInputDocument.addField(SearchFields.UNF, dtable.getUnf()); + + Map variableMap = null; - List variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId()); + Collection variablesByMetadata = fileMetadata.getVariableMetadatas(); + + variableMap = variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); - variableMap = - variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); - - for (DataVariable var : variables) { // Hard-coded search fields, for now: // TODO: eventually: review, decide how datavariables should @@ -1690,19 +1735,19 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set d try { solrClientIndexService.getSolrClient().add(docs.getDocuments()); } catch 
(SolrServerException | IOException ex) { + logger.warning("Check process-failures logs re: " + ex.getLocalizedMessage()); if (ex.getCause() instanceof SolrServerException) { throw new SolrServerException(ex); } else if (ex.getCause() instanceof IOException) { @@ -1922,7 +1974,7 @@ private void addLicenseToSolrDoc(SolrInputDocument solrInputDocument, DatasetVer if (datasetVersion != null && datasetVersion.getTermsOfUseAndAccess() != null) { //test to see if the terms of use are the default set in 5.10 - if so and there's no license then don't add license to solr doc. //fixes 10513 - if (datasetVersionService.isVersionDefaultCustomTerms(datasetVersion)){ + if(TermsOfUseAndAccess.DEFAULT_NOTERMS.equals(datasetVersion.getTermsOfUseAndAccess().getTermsOfUse())) { return; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 0dd2153f75b..c25a462efab 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse.search; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; @@ -20,7 +21,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.logging.Logger; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; @@ -52,9 +52,6 @@ public class SearchPermissionsServiceBean { @EJB SettingsServiceBean settingsService; - LinkedHashMap roleAssigneeCache = new LinkedHashMap<>(100, 0.7f, true); - private static final int MAX_CACHE_SIZE = 2000; - /** * @todo Should we make a PermStrings object? Probably. 
* @@ -65,65 +62,29 @@ public List findDataversePerms(Dataverse dataverse) { if (hasBeenPublished(dataverse)) { permStrings.add(IndexServiceBean.getPublicGroupString()); } -// permStrings.addAll(findDirectAssignments(dataverse)); -// permStrings.addAll(findImplicitAssignments(dataverse)); permStrings.addAll(findDvObjectPerms(dataverse)); return permStrings; } - + public List findDatasetVersionPerms(DatasetVersion version) { List perms = new ArrayList<>(); if (version.isReleased()) { perms.add(IndexServiceBean.getPublicGroupString()); } -// perms.addAll(findDirectAssignments(version.getDataset())); -// perms.addAll(findImplicitAssignments(version.getDataset())); + perms.addAll(findDvObjectPerms(version.getDataset())); return perms; } public List findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); - resetRoleAssigneeCache(); - Set roleAssignments = rolesSvc.rolesAssignments(dvObject); - for (RoleAssignment roleAssignment : roleAssignments) { - logger.fine("role assignment on dvObject " + dvObject.getId() + ": " + roleAssignment.getAssigneeIdentifier()); - if (roleAssignment.getRole().permissions().contains(getRequiredSearchPermission(dvObject))) { - RoleAssignee userOrGroup = getRoleAssignee(roleAssignment.getAssigneeIdentifier()); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); - if (indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } - } - } - resetRoleAssigneeCache(); - return permStrings; - } + Permission p = getRequiredSearchPermission(dvObject); - private void resetRoleAssigneeCache() { - roleAssigneeCache.clear(); - } - - private RoleAssignee getRoleAssignee(String idtf) { - RoleAssignee ra = roleAssigneeCache.get(idtf); - if (ra != null) { - return ra; - } - ra = roleAssigneeService.getRoleAssignee(idtf); - roleAssigneeCache.put(idtf, ra); - if (roleAssigneeCache.size() > MAX_CACHE_SIZE) { - 
roleAssigneeCache.remove(roleAssigneeCache.keySet().iterator().next()); - } - return ra; - } - - @Deprecated - private List findDirectAssignments(DvObject dvObject) { - List permStrings = new ArrayList<>(); - List roleAssignees = findWhoHasDirectAssignments(dvObject); - for (RoleAssignee roleAssignee : roleAssignees) { - logger.fine("user or group (findDirectAssignments): " + roleAssignee.getIdentifier()); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(roleAssignee); + List assigneeIdStrings = null; + assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); + for (String id : assigneeIdStrings) { + RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); + String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); if (indexableUserOrGroupPermissionString != null) { permStrings.add(indexableUserOrGroupPermissionString); } @@ -131,46 +92,6 @@ private List findDirectAssignments(DvObject dvObject) { return permStrings; } - @Deprecated - private List findWhoHasDirectAssignments(DvObject dvObject) { - List emptyList = new ArrayList<>(); - List peopleWhoCanSearch = emptyList; - resetRoleAssigneeCache(); - - List assignmentsOn = permissionService.assignmentsOn(dvObject); - for (RoleAssignment roleAssignment : assignmentsOn) { - if (roleAssignment.getRole().permissions().contains(getRequiredSearchPermission(dvObject))) { - RoleAssignee userOrGroup = getRoleAssignee(roleAssignment.getAssigneeIdentifier()); - if (userOrGroup != null) { - peopleWhoCanSearch.add(userOrGroup); - } - } - } - resetRoleAssigneeCache(); - return peopleWhoCanSearch; - } - - @Deprecated - private List findImplicitAssignments(DvObject dvObject) { - List permStrings = new ArrayList<>(); - DvObject parent = dvObject.getOwner(); - while (parent != null) { - if (respectPermissionRoot()) { - if (parent.isEffectivelyPermissionRoot()) { - return permStrings; - } - } - if 
(parent.isInstanceofDataverse()) { - permStrings.addAll(findDirectAssignments(parent)); - } else if (parent.isInstanceofDataset()) { - // files get discoverability from their parent dataset - permStrings.addAll(findDirectAssignments(parent)); - } - parent = parent.getOwner(); - } - return permStrings; - } - public Map getDesiredCards(Dataset dataset) { Map desiredCards = new LinkedHashMap<>(); DatasetVersion latestVersion = dataset.getLatestVersion(); @@ -230,13 +151,6 @@ private Permission getRequiredSearchPermission(DvObject dvObject) { } - @Deprecated - private boolean respectPermissionRoot() { - boolean safeDefaultIfKeyNotFound = true; - // see javadoc of the key - return settingsService.isTrueForKey(SettingsServiceBean.Key.SearchRespectPermissionRoot, safeDefaultIfKeyNotFound); - } - /** * From a Solr perspective we can't just index any string when we go to do * the JOIN to enforce security. (Maybe putting quotes around the string at diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 2b4f08807ef..64679b05beb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -4,15 +4,19 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DatasetVersionServiceBean; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseRoleServiceBean; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.FileMetadata; +import edu.harvard.iq.dataverse.settings.JvmSettings; + import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Date; import 
java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -20,13 +24,17 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Stream; + import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.inject.Named; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; +import jakarta.persistence.EntityManager; +import jakarta.persistence.PersistenceContext; + import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrInputDocument; @Named @@ -44,39 +52,18 @@ public class SolrIndexServiceBean { @EJB DatasetServiceBean datasetService; @EJB + DatasetVersionServiceBean datasetVersionService; + @EJB DataverseRoleServiceBean rolesSvc; @EJB SolrClientIndexService solrClientService; + + @PersistenceContext(unitName = "VDCNet-ejbPU") + private EntityManager em; public static String numRowsClearedByClearAllIndexTimes = "numRowsClearedByClearAllIndexTimes"; public static String messageString = "message"; - /** - * @deprecated Now that MyData has shipped in 4.1 we have no plans to change - * the unpublishedDataRelatedToMeModeEnabled boolean to false. We should - * probably remove the boolean altogether to simplify the code. - * - * This non-default mode changes the behavior of the "Data Related To Me" - * feature to be more like "**Unpublished** Data Related to Me" after you - * have changed this boolean to true and run "index all". - * - * The "Data Related to Me" feature relies on *always* indexing permissions - * regardless of if the DvObject is published or not. - * - * In "Unpublished Data Related to Me" mode, we first check if the DvObject - * is published. If it's published, we set the search permissions to *only* - * contain "group_public", which is quick and cheap to do. 
If the DvObject - * in question is *not* public, we perform the expensive operation of - * rooting around in the system to determine who should be able to - * "discover" the unpublished version of DvObject. By default this mode is - * *not* enabled. If you want to enable it, change the boolean to true and - * run "index all". - * - * See also https://github.com/IQSS/dataverse/issues/50 - */ - @Deprecated - private boolean unpublishedDataRelatedToMeModeEnabled = true; - public List determineSolrDocs(DvObject dvObject) { List emptyList = new ArrayList<>(); if (dvObject == null) { @@ -90,9 +77,18 @@ public List determineSolrDocs(DvObject dvObject) { List datasetSolrDocs = constructDatasetSolrDocs((Dataset) dvObject); solrDocs.addAll(datasetSolrDocs); } else if (dvObject.isInstanceofDataFile()) { - Map> permStringByDatasetVersion = new HashMap<>(); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) dvObject, permStringByDatasetVersion); - solrDocs.addAll(fileSolrDocs); + DataFile datafile = (DataFile) dvObject; + Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); + Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); + for (DatasetVersion version : datasetVersions) { + if(desiredCards.containsKey(version.getVersionState()) && desiredCards.get(version.getVersionState()) && datafile.isInDatasetVersion(version)) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); + solrDocs.add(fileSolrDoc); + } + } } else { logger.info("Unexpected DvObject: " + dvObject.getClass().getName()); } @@ -119,12 +115,8 @@ private List determineSolrDocsForFilesFromDataset(Map.Entry perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { 
- if (dataverse.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDataversePerms(dataverse); - } + if (dataverse.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDataversePerms(dataverse); } @@ -147,51 +139,16 @@ private List constructDatasetSolrDocs(Dataset dataset) { return solrDocs; } - // private List constructDatafileSolrDocs(DataFile dataFile) { - private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion) { - List datafileSolrDocs = new ArrayList<>(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner()); - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataFile.getOwner())) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - /* - * Since datasetVersionFileIsAttachedTo should be a draft or the most recent - * released one, it could be more efficient to stop the search through - * FileMetadatas after those two (versus continuing through all prior versions - * as in isInDatasetVersion). Alternately, perhaps filesToReIndexPermissionsFor - * should not combine the list of files for the different datsetversions into a - * single list to start with. 
- */ - if (cardShouldExist && dataFile.isInDatasetVersion(datasetVersionFileIsAttachedTo)) { - String solrIdStart = IndexServiceBean.solrDocIdentifierFile + dataFile.getId(); - String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); - String solrId = solrIdStart + solrIdEnd; - List perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - List cachedPerms = null; - if (permStringByDatasetVersion != null) { - cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - } - if (cachedPerms != null) { - logger.finest("reusing cached perms for file " + dataFile.getId()); - perms = cachedPerms; - } else if (datasetVersionFileIsAttachedTo.isReleased()) { - logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - // go to the well (slow) - logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId()); - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } - } else { - // This should never be executed per the deprecation notice on the boolean. 
- perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(dataFile.getId().toString(), solrId, datasetVersionFileIsAttachedTo.getId(), dataFile.getDisplayName(), perms); - datafileSolrDocs.add(dataFileSolrDoc); - } + private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List cachedPerms, long versionId, String solrIdEnd) { + String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileProxy.getFileId(); + String solrId = solrIdStart + solrIdEnd; + List perms = new ArrayList<>(); + assert(cachedPerms != null); + if (cachedPerms != null) { + logger.finest("reusing cached perms for file " + fileProxy.getFileId()); + perms = cachedPerms; } - - return datafileSolrDocs; + return new DvObjectSolrDoc(fileProxy.getFileId().toString(), solrId, versionId, fileProxy.getName(), perms); } private List constructDatafileSolrDocsFromDataset(Dataset dataset) { @@ -201,12 +158,8 @@ private List constructDatafileSolrDocsFromDataset(Dataset datas boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); if (cardShouldExist) { List perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - if (datasetVersionFileIsAttachedTo.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } + if (datasetVersionFileIsAttachedTo.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); } @@ -244,12 +197,8 @@ private DvObjectSolrDoc makeDatasetSolrDoc(DatasetVersion version) { String solrId = solrIdStart + solrIdEnd; String name = version.getTitle(); List perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - if (version.isReleased()) { - 
perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(version); - } + if (version.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDatasetVersionPerms(version); } @@ -275,7 +224,7 @@ public IndexResponse indexAllPermissions() { Map> filesPerDataset = new HashMap<>(); List allExceptFiles = dvObjectService.findAll(); for (DvObject dvObject : allExceptFiles) { - logger.info("determining definition points for dvobject id " + dvObject.getId()); + logger.fine("determining definition points for dvobject id " + dvObject.getId()); if (dvObject.isInstanceofDataFile()) { Long dataset = dvObject.getOwner().getId(); Long datafile = dvObject.getId(); @@ -303,9 +252,9 @@ public IndexResponse indexAllPermissions() { } for (DvObjectSolrDoc dvObjectSolrDoc : definitionPoints) { - logger.info("creating solr doc in memory for " + dvObjectSolrDoc.getSolrId()); + logger.fine("creating solr doc in memory for " + dvObjectSolrDoc.getSolrId()); SolrInputDocument solrInputDocument = SearchUtil.createSolrDoc(dvObjectSolrDoc); - logger.info("adding to list of docs to index " + dvObjectSolrDoc.getSolrId()); + logger.fine("adding to list of docs to index " + dvObjectSolrDoc.getSolrId()); docs.add(solrInputDocument); } try { @@ -363,17 +312,7 @@ private void persistToSolr(Collection docs) throws SolrServer /** * @todo Do something with these responses from Solr. 
*/ - UpdateResponse addResponse = solrClientService.getSolrClient().add(docs); - } - - public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) { - DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId); - if (definitionPoint == null) { - logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId); - return null; - } else { - return indexPermissionsOnSelfAndChildren(definitionPoint); - } + solrClientService.getSolrClient().add(docs); } /** @@ -386,8 +325,8 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint null"); return null; } - - List filesToReindexAsBatch = new ArrayList<>(); + int fileQueryMin= JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE); + List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary * for revoke but not necessarily grant. @@ -395,8 +334,9 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) // We don't create a Solr "primary/content" doc for the root dataverse // so don't create a Solr "permission" doc either. 
- int i = 0; + final int[] counter = {0}; int numObjects = 0; + long globalStartTime = System.currentTimeMillis(); if (definitionPoint.isInstanceofDataverse()) { Dataverse selfDataverse = (Dataverse) definitionPoint; if (!selfDataverse.equals(dataverseService.findRootDataverse())) { @@ -407,31 +347,77 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (Dataset dataset : directChildDatasetsOfDvDefPoint) { indexPermissionsForOneDvObject(dataset); numObjects++; - for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { - filesToReindexAsBatch.add(datafile); - i++; - if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); - filesToReindexAsBatch.clear(); - } - if (i % 1000 == 0) { - logger.fine("Progress: " +i + " files permissions reindexed"); + + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); + long startTime = System.currentTimeMillis(); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + if (desiredCards.get(version.getVersionState())) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + for (FileMetadata fmd : version.getFileMetadatas()) { + DataFileProxy fileProxy = new DataFileProxy(fmd); + // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice + filesToReindexAsBatch.add(fileProxy); + counter[0]++; + if (counter[0] % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + } + if (counter[0] % 1000 == 0) { + logger.fine("Progress: " + counter[0] + "files permissions reindexed"); + } + } + + // Re-index any remaining files in the datasetversion (so that verionId, etc. 
remain constants for all files in the batch) + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + logger.info("Progress : dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } } - logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); } } else if (definitionPoint.isInstanceofDataset()) { indexPermissionsForOneDvObject(definitionPoint); numObjects++; // index files Dataset dataset = (Dataset) definitionPoint; - for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { - filesToReindexAsBatch.add(datafile); - i++; - if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + if (desiredCards.get(version.getVersionState())) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + if (version.getFileMetadatas().size() > fileQueryMin) { + // For large datasets, use a more efficient SQL query instead of loading all file metadata objects + getDataFileInfoForPermissionIndexing(version.getId()).forEach(fileInfo -> { + filesToReindexAsBatch.add(fileInfo); + counter[0]++; + + if (counter[0] % 100 == 0) { + long startTime = System.currentTimeMillis(); + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + logger.fine("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + } + }); + } else { + version.getFileMetadatas().stream() + .forEach(fmd -> { + DataFileProxy fileProxy = new DataFileProxy(fmd); + filesToReindexAsBatch.add(fileProxy); + counter[0]++; + if (counter[0] % 100 == 0) { + long startTime = 
System.currentTimeMillis(); + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + logger.fine("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + } + }); + } + // Re-index any remaining files in the dataset version (versionId, etc. remain constants for all files in the batch) + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); filesToReindexAsBatch.clear(); } + } } else { indexPermissionsForOneDvObject(definitionPoint); @@ -441,86 +427,45 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) /** * @todo Error handling? What to do with response? * - * @todo Should update timestamps, probably, even thought these are - * files, see https://github.com/IQSS/dataverse/issues/2421 + * @todo Should update timestamps, probably, even thought these are files, see + * https://github.com/IQSS/dataverse/issues/2421 */ - reindexFilesInBatches(filesToReindexAsBatch); - logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections"); + logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects); } - private String reindexFilesInBatches(List filesToReindexPermissionsFor) { + private String reindexFilesInBatches(List filesToReindexAsBatch, List cachedPerms, Long versionId, String solrIdEnd) { List docs = new ArrayList<>(); - Map> byParentId = new HashMap<>(); - Map> permStringByDatasetVersion = new HashMap<>(); - int i = 0; try { - for (DataFile file : filesToReindexPermissionsFor) { - Dataset dataset = (Dataset) file.getOwner(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - for (DatasetVersion 
datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - if (cardShouldExist) { - List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - if (cachedPermission == null) { - logger.finest("no cached permission! Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { - Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); - if (datasetVersionId != null) { - permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions()); - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); - i++; - } - } - } else { - logger.finest("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); - i++; - } - } - if (i % 20 == 0) { - persistToSolr(docs); - docs = new ArrayList<>(); - } - } - } - Long parent = file.getOwner().getId(); - List existingList = byParentId.get(parent); - if (existingList == null) { - List empty = new ArrayList<>(); - byParentId.put(parent, empty); - } else { - List updatedList = existingList; - updatedList.add(file.getId()); - byParentId.put(parent, updatedList); - } + // Assume all files have the same owner + if (filesToReindexAsBatch.isEmpty()) { + return "No files to reindex"; } + for (DataFileProxy file : filesToReindexAsBatch) { + + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd); + SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); + docs.add(solrDoc); + } persistToSolr(docs); - return " " + 
filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents "; + return " " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents "; } catch (SolrServerException | IOException ex) { - return " tried to reindex " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex; + return " tried to reindex " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex; } } - private List filesToReIndexPermissionsFor(Dataset dataset) { - List filesToReindexPermissionsFor = new ArrayList<>(); + private List versionsToReIndexPermissionsFor(Dataset dataset) { + List versionsToReindexPermissionsFor = new ArrayList<>(); Map desiredCards = searchPermissionsService.getDesiredCards(dataset); for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { boolean cardShouldExist = desiredCards.get(version.getVersionState()); if (cardShouldExist) { - for (FileMetadata fileMetadata : version.getFileMetadatas()) { - filesToReindexPermissionsFor.add(fileMetadata.getDataFile()); - } + versionsToReindexPermissionsFor.add(version); } } - return filesToReindexPermissionsFor; + return versionsToReindexPermissionsFor; } public IndexResponse deleteMultipleSolrIds(List solrIdsToDelete) { @@ -540,7 +485,7 @@ public IndexResponse deleteMultipleSolrIds(List solrIdsToDelete) { public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServerException, IOException { JsonObjectBuilder response = Json.createObjectBuilder(); - logger.info("attempting to delete all Solr documents before a complete re-index"); + logger.fine("attempting to delete all Solr documents before a complete re-index"); solrClientService.getSolrClient().deleteByQuery("*:*"); int numRowsAffected = dvObjectService.clearAllIndexTimes(); response.add(numRowsClearedByClearAllIndexTimes, numRowsAffected); @@ -571,4 
+516,63 @@ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { return indexingRequired; } + public Stream getDataFileInfoForPermissionIndexing(Long id) { + return em.createNamedQuery("DataFile.getDataFileInfoForPermissionIndexing", DataFileProxy.class) + .setParameter(1, id) + .getResultStream(); + } + + /** + * A lightweight proxy for DataFile objects used during permission indexing. This class avoids loading the full DataFile entity from the database when only basic properties are needed for indexing, + * improving performance for large datasets. + */ + public static class DataFileProxy { + + private final Long id; + private final String name; + private final boolean released; + + /** + * Creates a new DataFileProxy from a FileMetadata. + * + * @param fmd + * The file metadata to proxy; the proxy copies the + * datafile's id and released state from + * fmd.getDataFile() and uses the metadata's label + * as the proxy's name. Only these lightweight + * fields are retained, so the full DataFile entity + * need not remain loaded while building Solr + * permission documents. + */ + public DataFileProxy(FileMetadata fmd) { + DataFile df = fmd.getDataFile(); + this.id = df.getId(); + this.name = fmd.getLabel(); + this.released = df.isReleased(); + } + + public DataFileProxy(String label, Long id, Date publicationDate) { + this.id = id; + this.name = label; + this.released = publicationDate != null; + } + + public boolean isReleased() { + return released; + } + + public Long getFileId() { + return id; + } + + public String getName() { + return name; + } + + public DataFile getMinimalDataFile() { + DataFile df = new DataFile(); + df.setId(id); + return df; + } + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index bc32e250be5..afc698b418b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -67,11 +67,13 @@ public enum JvmSettings { SOLR_PROT(SCOPE_SOLR, "protocol"), 
SOLR_CORE(SCOPE_SOLR, "core"), SOLR_PATH(SCOPE_SOLR, "path"), + MIN_FILES_TO_USE_PROXY(SCOPE_SOLR, "min-files-to-use-proxy"), + // INDEX CONCURENCY SCOPE_SOLR_CONCURENCY(SCOPE_SOLR, "concurrency"), MAX_ASYNC_INDEXES(SCOPE_SOLR_CONCURENCY, "max-async-indexes"), - + // RSERVE CONNECTION SCOPE_RSERVE(PREFIX, "rserve"), RSERVE_HOST(SCOPE_RSERVE, "host"), diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 5b0a178969b..6d96ad4abf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -168,27 +168,6 @@ public enum Key { * to from the footer. */ ApplicationPrivacyPolicyUrl, - /** - * A boolean defining if indexing and search should respect the concept - * of "permission root". - * - *

- * - * If we ignore permissionRoot at index time, we should blindly give - * search ("discoverability") access to people and group who have access - * defined in a parent dataverse, all the way back to the root. - * - *

- * - * If we respect permissionRoot, this means that the dataverse being - * indexed is an island of permissions all by itself. We should not look - * to its parent to see if more people and groups might be able to - * search the DvObjects within it. We would assume no implicit - * inheritance of permissions. In this mode, all permissions must be - * explicitly defined on DvObjects. No implied inheritance. - * - */ - SearchRespectPermissionRoot, /** * Solr hostname and port, such as "localhost:8983". * @deprecated New installations should not use this database setting, but use {@link JvmSettings#SOLR_HOST} diff --git a/src/main/resources/META-INF/persistence.xml b/src/main/resources/META-INF/persistence.xml index e6224dcdf01..151410c04c2 100644 --- a/src/main/resources/META-INF/persistence.xml +++ b/src/main/resources/META-INF/persistence.xml @@ -18,6 +18,19 @@ + + + + + + + + + + + + + diff --git a/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java index 2b54a4b12cd..eda9b995db5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java @@ -126,6 +126,7 @@ private IndexableDataset createIndexableDataset() { final Dataset dataset = MocksFactory.makeDataset(); dataset.setGlobalId(new GlobalId(AbstractDOIProvider.DOI_PROTOCOL,"10.666", "FAKE/fake", "/", AbstractDOIProvider.DOI_RESOLVER_URL, null)); final DatasetVersion datasetVersion = dataset.getCreateVersion(null); + datasetVersion.setId(1L); DatasetField field = createCVVField("language", "English", false); datasetVersion.getDatasetFields().add(field); final IndexableDataset indexableDataset = new IndexableDataset(datasetVersion);