Merge pull request #21624 from davelopez/25.1_more_dataverse_hardening

bgruening · web-flow · commit f753f5b4ba37 · 2026-01-20T15:28:47.000+01:00
[25.1] Harden Dataverse integration
diff --git a/lib/galaxy/files/sources/_rdm.py b/lib/galaxy/files/sources/_rdm.py
@@ -23,13 +23,13 @@
 
 
 class RDMFileSourceTemplateConfiguration(BaseFileSourceTemplateConfiguration):
-    token: Union[str, TemplateExpansion]
-    public_name: Union[str, TemplateExpansion]
+    token: Optional[Union[str, TemplateExpansion]] = None  # no longer required; defaults to None
+    public_name: Optional[Union[str, TemplateExpansion]] = None  # no longer required; defaults to None
 
 
 class RDMFileSourceConfiguration(BaseFileSourceConfiguration):
-    token: str
-    public_name: str
+    token: Optional[str] = None  # no longer required; defaults to None
+    public_name: Optional[str] = None  # no longer required; defaults to None
 
 
 class ContainerAndFileIdentifier(NamedTuple):
@@ -51,7 +51,7 @@ class RDMRepositoryInteractor:
     """
 
     def __init__(self, repository_url: str, plugin: "RDMFilesSource"):
-        self._repository_url = repository_url
+        self._repository_url = self._strip_last_slash(repository_url)  # stored with one trailing "/" removed
         self._plugin = plugin
 
     @property
@@ -138,6 +138,12 @@ def download_file_from_container(
         """
         raise NotImplementedError()
 
+    def _strip_last_slash(self, url: str) -> str:
+        """Return ``url`` minus a single trailing "/", or unchanged if it has none."""
+        if url.endswith("/"):
+            return url[:-1]  # drops exactly one character, so "a//" becomes "a/"
+        return url
+
 
 class RDMFilesSource(BaseFilesSource[RDMFileSourceTemplateConfiguration, RDMFileSourceConfiguration]):
     """Base class for Research Data Management (RDM) file sources.
diff --git a/lib/galaxy/files/sources/dataverse.py b/lib/galaxy/files/sources/dataverse.py
@@ -13,6 +13,7 @@
 
 from galaxy.exceptions import (
     AuthenticationRequired,
+    MessageException,
     ObjectNotFound,
 )
 from galaxy.files.models import (
@@ -105,6 +106,7 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
         - doi:10.70122/FK2/AVNCLL (persistent ID)
         - doi:10.70122/FK2/DIG2DG/AVNCLL (persistent ID)
         - doi:10.70122/FK2/DIG2DG/id:12345 (database ID)
+        - doi:10.5072/FK2/doi:10.70122/AVNCLL (persistent ID)
         - perma:BSC/3ST00L/id:9056 (database ID)
         """
         if not source_path.startswith("/"):
@@ -125,20 +127,54 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
                 f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>/<file_identifier>'."
             )
 
-        file_id_part = parts[-1]
-        dataset_id = "/".join(parts[:-1])
+        dataset_id, file_id_part = self._split_dataset_and_file_pid(parts)
 
         # The file identifier can be either:
         # - A persistent ID suffix (e.g., 'AVNCLL' -> full ID is 'doi:10.70122/FK2/DIG2DG/AVNCLL')
         # - A database ID with 'id:' prefix (e.g., 'id:12345' -> file_identifier is 'id:12345')
         if file_id_part.startswith("id:"):
             # Database ID format - keep the 'id:' prefix as the file identifier
             file_id = file_id_part
+        elif re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:.*", file_id_part):
+            # Full persistent identifier (e.g. doi:, hdl:, ark:, or custom PID providers).
+            # Files in Dataverse may have their own independent persistent IDs that are
+            # not hierarchically related to the dataset persistent ID.
+            file_id = file_id_part
         else:
-            # Persistent ID format - construct full persistent ID
+            # Dataset-scoped persistent ID suffix - construct full persistent ID
             file_id = f"{dataset_id}/{file_id_part}"
         return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id)
 
+    @staticmethod
+    def _split_dataset_and_file_pid(parts: list[str]) -> tuple[str, str]:
+        """
+        Split a Dataverse source path into dataset ID and file identifier parts.
+
+        Dataverse file-level persistent IDs may themselves contain slashes and are not
+        necessarily hierarchically related to the dataset persistent ID. For example:
+
+            /doi:10.57745/I8EUTL/doi:10.57745/L7SOAJ
+
+        In this case:
+            dataset_id = doi:10.57745/I8EUTL
+            file_id    = doi:10.57745/L7SOAJ
+
+        Returns ``(dataset_id, file_id_part)`` built by re-joining ``parts`` with "/". Only the
+        penultimate segment is tested for a scheme prefix, so a file PID spans at most two segments.
+        """
+        # Default: last segment is the file identifier
+        file_id_part = parts[-1]
+        dataset_id = "/".join(parts[:-1])
+
+        # Heuristic: if the penultimate segment starts a URI scheme (e.g. doi:, hdl:, ark:),
+        # then the file persistent ID spans the last two segments.
+        pid_scheme_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:")
+        if len(parts) >= 3 and pid_scheme_re.match(parts[-2]):  # >= 3 leaves a non-empty dataset ID
+            file_id_part = f"{parts[-2]}/{parts[-1]}"
+            dataset_id = "/".join(parts[:-2])
+
+        return dataset_id, file_id_part
+
+
     def get_container_id_from_path(self, source_path: str) -> str:
         return self.parse_path(source_path, container_id_only=True).container_id
 
@@ -336,14 +372,14 @@ def create_draft_file_container(
         collection_payload = self._prepare_collection_data(title, public_name, user_email)
         collection = self._create_collection(":root", collection_payload, context)
         if not collection or "data" not in collection or "alias" not in collection["data"]:
-            raise Exception("Could not create collection in Dataverse or response has an unexpected format.")
+            raise MessageException("Could not create collection in Dataverse or response has an unexpected format.")
         collection_alias = collection["data"]["alias"]
 
         # Prepare and create the dataset
         dataset_payload = self._prepare_dataset_data(title, public_name, user_email)
         dataset = self._create_dataset(collection_alias, dataset_payload, context)
         if not dataset or "data" not in dataset:
-            raise Exception("Could not create dataset in Dataverse or response has an unexpected format.")
+            raise MessageException("Could not create dataset in Dataverse or response has an unexpected format.")
 
         dataset["data"]["name"] = title
         return dataset["data"]
@@ -421,14 +457,18 @@ def _download_file(
                     f"Authentication required to download file from '{download_file_content_url}'. "
                     f"Please provide a valid API token in your user preferences."
                 )
-            # TODO: We can only download files from published datasets for now
-            if e.code in [403, 404]:
+            if e.code == 403:
+                # Permission denied: dataset may be unpublished or user lacks access rights
+                raise ObjectNotFound(
+                    f"Access forbidden when downloading file from '{download_file_content_url}'. "
+                    f"You may not have permission to access this file, or the dataset is not published."
+                )
+            if e.code == 404:
                 raise ObjectNotFound(
                     f"File not found at '{download_file_content_url}'. "
                     f"Please make sure the dataset and file exist and are published."
                 )
-            else:
-                raise
+            raise
 
     def _get_datasets_from_response(self, response: dict) -> list[RemoteDirectory]:
         rval: list[RemoteDirectory] = []
@@ -494,7 +534,7 @@ def _ensure_response_has_expected_status_code(self, response, expected_status_co
             error_message = self._get_response_error_message(response)
             if response.status_code == 403:
                 self._raise_auth_required(error_message)
-            raise Exception(
+            raise MessageException(
                 f"Request to {response.url} failed with status code {response.status_code}: {error_message}"
             )