Skip to content

Commit f753f5b

Browse files
authored
Merge pull request #21624 from davelopez/25.1_more_dataverse_hardening
[25.1] Harden Dataverse integration
2 parents cb3a9c9 + e201d59 commit f753f5b

2 files changed

Lines changed: 61 additions & 15 deletions

File tree

lib/galaxy/files/sources/_rdm.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@
2323

2424

2525
class RDMFileSourceTemplateConfiguration(BaseFileSourceTemplateConfiguration):
26-
token: Union[str, TemplateExpansion]
27-
public_name: Union[str, TemplateExpansion]
26+
token: Optional[Union[str, TemplateExpansion]] = None
27+
public_name: Optional[Union[str, TemplateExpansion]] = None
2828

2929

3030
class RDMFileSourceConfiguration(BaseFileSourceConfiguration):
31-
token: str
32-
public_name: str
31+
token: Optional[str] = None
32+
public_name: Optional[str] = None
3333

3434

3535
class ContainerAndFileIdentifier(NamedTuple):
@@ -51,7 +51,7 @@ class RDMRepositoryInteractor:
5151
"""
5252

5353
def __init__(self, repository_url: str, plugin: "RDMFilesSource"):
54-
self._repository_url = repository_url
54+
self._repository_url = self._strip_last_slash(repository_url)
5555
self._plugin = plugin
5656

5757
@property
@@ -138,6 +138,12 @@ def download_file_from_container(
138138
"""
139139
raise NotImplementedError()
140140

141+
def _strip_last_slash(self, url: str) -> str:
142+
"""Utility method to strip the last slash from a URL if present."""
143+
if url.endswith("/"):
144+
return url[:-1]
145+
return url
146+
141147

142148
class RDMFilesSource(BaseFilesSource[RDMFileSourceTemplateConfiguration, RDMFileSourceConfiguration]):
143149
"""Base class for Research Data Management (RDM) file sources.

lib/galaxy/files/sources/dataverse.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from galaxy.exceptions import (
1515
AuthenticationRequired,
16+
MessageException,
1617
ObjectNotFound,
1718
)
1819
from galaxy.files.models import (
@@ -105,6 +106,7 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
105106
- doi:10.70122/FK2/AVNCLL (persistent ID)
106107
- doi:10.70122/FK2/DIG2DG/AVNCLL (persistent ID)
107108
- doi:10.70122/FK2/DIG2DG/id:12345 (database ID)
109+
- doi:10.5072/FK2/doi:10.70122/AVNCLL (persistent ID)
108110
- perma:BSC/3ST00L/id:9056 (database ID)
109111
"""
110112
if not source_path.startswith("/"):
@@ -125,20 +127,54 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
125127
f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>/<file_identifier>'."
126128
)
127129

128-
file_id_part = parts[-1]
129-
dataset_id = "/".join(parts[:-1])
130+
dataset_id, file_id_part = self._split_dataset_and_file_pid(parts)
130131

131132
# The file identifier can be either:
132133
# - A persistent ID suffix (e.g., 'AVNCLL' -> full ID is 'doi:10.70122/FK2/DIG2DG/AVNCLL')
133134
# - A database ID with 'id:' prefix (e.g., 'id:12345' -> file_identifier is 'id:12345')
134135
if file_id_part.startswith("id:"):
135136
# Database ID format - keep the 'id:' prefix as the file identifier
136137
file_id = file_id_part
138+
elif re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:.*", file_id_part):
139+
# Full persistent identifier (e.g. doi:, hdl:, ark:, or custom PID providers).
140+
# Files in Dataverse may have their own independent persistent IDs that are
141+
# not hierarchically related to the dataset persistent ID.
142+
file_id = file_id_part
137143
else:
138-
# Persistent ID format - construct full persistent ID
144+
# Dataset-scoped persistent ID suffix - construct full persistent ID
139145
file_id = f"{dataset_id}/{file_id_part}"
140146
return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id)
141147

148+
@staticmethod
149+
def _split_dataset_and_file_pid(parts: list[str]) -> tuple[str, str]:
150+
"""
151+
Split a Dataverse source path into dataset ID and file identifier parts.
152+
153+
Dataverse file-level persistent IDs may themselves contain slashes and are not
154+
necessarily hierarchically related to the dataset persistent ID. For example:
155+
156+
/doi:10.57745/I8EUTL/doi:10.57745/L7SOAJ
157+
158+
In this case:
159+
dataset_id = doi:10.57745/I8EUTL
160+
file_id = doi:10.57745/L7SOAJ
161+
162+
This helper detects such cases by recognizing URI-scheme prefixes in path segments
163+
and grouping them accordingly.
164+
"""
165+
# Default: last segment is the file identifier
166+
file_id_part = parts[-1]
167+
dataset_id = "/".join(parts[:-1])
168+
169+
# Heuristic: if the penultimate segment starts a URI scheme (e.g. doi:, hdl:, ark:),
170+
# then the file persistent ID spans the last two segments.
171+
pid_scheme_re = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:")
172+
if len(parts) >= 3 and pid_scheme_re.match(parts[-2]):
173+
file_id_part = f"{parts[-2]}/{parts[-1]}"
174+
dataset_id = "/".join(parts[:-2])
175+
176+
return dataset_id, file_id_part
177+
142178
def get_container_id_from_path(self, source_path: str) -> str:
143179
return self.parse_path(source_path, container_id_only=True).container_id
144180

@@ -336,14 +372,14 @@ def create_draft_file_container(
336372
collection_payload = self._prepare_collection_data(title, public_name, user_email)
337373
collection = self._create_collection(":root", collection_payload, context)
338374
if not collection or "data" not in collection or "alias" not in collection["data"]:
339-
raise Exception("Could not create collection in Dataverse or response has an unexpected format.")
375+
raise MessageException("Could not create collection in Dataverse or response has an unexpected format.")
340376
collection_alias = collection["data"]["alias"]
341377

342378
# Prepare and create the dataset
343379
dataset_payload = self._prepare_dataset_data(title, public_name, user_email)
344380
dataset = self._create_dataset(collection_alias, dataset_payload, context)
345381
if not dataset or "data" not in dataset:
346-
raise Exception("Could not create dataset in Dataverse or response has an unexpected format.")
382+
raise MessageException("Could not create dataset in Dataverse or response has an unexpected format.")
347383

348384
dataset["data"]["name"] = title
349385
return dataset["data"]
@@ -421,14 +457,18 @@ def _download_file(
421457
f"Authentication required to download file from '{download_file_content_url}'. "
422458
f"Please provide a valid API token in your user preferences."
423459
)
424-
# TODO: We can only download files from published datasets for now
425-
if e.code in [403, 404]:
460+
if e.code == 403:
461+
# Permission denied: dataset may be unpublished or user lacks access rights
462+
raise ObjectNotFound(
463+
f"Access forbidden when downloading file from '{download_file_content_url}'. "
464+
f"You may not have permission to access this file, or the dataset is not published."
465+
)
466+
if e.code == 404:
426467
raise ObjectNotFound(
427468
f"File not found at '{download_file_content_url}'. "
428469
f"Please make sure the dataset and file exist and are published."
429470
)
430-
else:
431-
raise
471+
raise
432472

433473
def _get_datasets_from_response(self, response: dict) -> list[RemoteDirectory]:
434474
rval: list[RemoteDirectory] = []
@@ -494,7 +534,7 @@ def _ensure_response_has_expected_status_code(self, response, expected_status_co
494534
error_message = self._get_response_error_message(response)
495535
if response.status_code == 403:
496536
self._raise_auth_required(error_message)
497-
raise Exception(
537+
raise MessageException(
498538
f"Request to {response.url} failed with status code {response.status_code}: {error_message}"
499539
)
500540

0 commit comments

Comments
 (0)