Skip to content

Commit 73e1b77

Browse files
committed
Supports Dataverse file access via both persistent and database IDs
Improves Dataverse integration by handling file access for instances where FilePIDsEnabled is false and files use database IDs instead of persistent IDs. Enhances path parsing, access URL construction, plugin URI generation, and file listing to robustly differentiate between persistent and database identifiers. Refines error handling for authentication and not-found scenarios to improve reliability and clarity.
1 parent a360903 commit 73e1b77

1 file changed

Lines changed: 103 additions & 24 deletions

File tree

lib/galaxy/files/sources/dataverse.py

Lines changed: 103 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111

1212
from typing_extensions import TypedDict
1313

14-
from galaxy.exceptions import AuthenticationRequired
14+
from galaxy.exceptions import (
15+
AuthenticationRequired,
16+
ObjectNotFound,
17+
)
1518
from galaxy.files.models import (
1619
AnyRemoteEntry,
1720
Entry,
@@ -35,11 +38,7 @@
3538
requests,
3639
stream_to_open_named_file,
3740
)
38-
39-
40-
class NotFoundException(Exception):
41-
def __init__(self, message):
42-
super().__init__(message)
41+
from galaxy.util.user_agent import get_default_headers
4342

4443

4544
class DataverseDataset(TypedDict):
@@ -97,27 +96,54 @@ def parse_path(self, source_path: str, container_id_only: bool = False) -> Conta
9796
The source path must either have the format '/<dataset_id>' or '/<file_id>' where <file_id> extends <dataset_id> with either a persistent-ID suffix or a database ID of the form 'id:<number>'.
9897
If container_id_only is True, the source path must have the format '/<dataset_id>' and an empty file_id will be returned.
9998
100-
Example dataset_id:
99+
Example dataset_id (DOI format):
101100
doi:10.70122/FK2/DIG2DG
102101
103-
Example file_id:
102+
Example dataset_id (perma format):
103+
perma:BSC/3ST00L
104+
105+
Example file_id (persistent ID - when FilePIDsEnabled is true):
104106
doi:10.70122/FK2/DIG2DG/AVNCLL
107+
perma:BSC/3ST00L/AVNCLL
108+
109+
Example file_id (database ID - when FilePIDsEnabled is false):
110+
doi:10.70122/FK2/DIG2DG/id:12345
111+
perma:BSC/3ST00L/id:9056
105112
"""
106113
if not source_path.startswith("/"):
107114
raise ValueError(f"Invalid source path: '{source_path}'. Must start with '/'.")
108115

109-
parts = source_path[1:].split("/", 3)
110-
dataset_id = "/".join(parts[:3])
116+
path_without_slash = source_path[1:]
117+
118+
# Determine the identifier scheme and expected parts count
119+
# perma: identifiers have 2 parts (e.g., perma:BSC/3ST00L)
120+
# doi: identifiers have 3 parts (e.g., doi:10.70122/FK2/DIG2DG)
121+
is_perma_scheme = path_without_slash.startswith("perma:")
122+
dataset_parts_count = 2 if is_perma_scheme else 3
123+
124+
# Split path to extract dataset and file parts
125+
parts = path_without_slash.split("/", dataset_parts_count)
126+
dataset_id = "/".join(parts[:dataset_parts_count])
111127

112128
if container_id_only:
113-
if len(parts) != 3:
129+
if len(parts) < dataset_parts_count:
114130
raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>'.")
115131
return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier="")
116132

117-
if len(parts) != 4:
133+
expected_parts = dataset_parts_count + 1
134+
if len(parts) != expected_parts:
118135
raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<file_id>'.")
119136

120-
file_id = dataset_id + "/" + parts[3]
137+
# The file identifier can be either:
138+
# - A persistent ID suffix (e.g., 'AVNCLL' -> full ID is 'doi:10.70122/FK2/DIG2DG/AVNCLL')
139+
# - A database ID with 'id:' prefix (e.g., 'id:12345' -> file_identifier is 'id:12345')
140+
file_id_part = parts[dataset_parts_count]
141+
if file_id_part.startswith("id:"):
142+
# Database ID format - keep the 'id:' prefix as the file identifier
143+
file_id = file_id_part
144+
else:
145+
# Persistent ID format - construct full persistent ID
146+
file_id = f"{dataset_id}/{file_id_part}"
121147
return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id)
122148

123149
def get_container_id_from_path(self, source_path: str) -> str:
@@ -166,7 +192,7 @@ def _realize_to(
166192
dataset_id, file_id = self.parse_path(source_path)
167193
try:
168194
self.repository.download_file_from_container(dataset_id, file_id, native_path, context)
169-
except NotFoundException:
195+
except ObjectNotFound:
170196
filename = file_id.split("/")[-1]
171197
is_zip_file = self._is_zip_archive(filename)
172198
if is_zip_file:
@@ -176,6 +202,8 @@ def _realize_to(
176202
# So, if a zip is not found, we suppose we are trying to reimport an archived history
177203
# and make an API call to Dataverse to download the dataset as a zip.
178204
self.repository._download_dataset_as_zip(dataset_id, native_path, context)
205+
else:
206+
raise
179207

180208
def _is_zip_archive(self, file_name: str) -> bool:
181209
return file_name.endswith(".zip")
@@ -199,9 +227,21 @@ def api_base_url(self) -> str:
199227
def search_url(self) -> str:
200228
return f"{self.api_base_url}/search"
201229

202-
def file_access_url(self, file_id: str) -> str:
203-
encoded_file_id = quote(file_id, safe="")
204-
return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={encoded_file_id}"
230+
def file_access_url(self, file_identifier: str) -> str:
231+
"""Build the access URL for a file.
232+
233+
The file_identifier can be either:
234+
- A persistent ID (DOI) like 'doi:10.70122/FK2/DIG2DG/AVNCLL'
235+
- A database ID prefixed with 'id:' like 'id:12345'
236+
"""
237+
if file_identifier.startswith("id:"):
238+
# Use database ID for access
239+
database_id = file_identifier[3:] # Remove 'id:' prefix
240+
return f"{self.api_base_url}/access/datafile/{database_id}"
241+
else:
242+
# Use persistent ID for access
243+
encoded_file_id = quote(file_identifier, safe="")
244+
return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={encoded_file_id}"
205245

206246
def download_dataset_as_zip_url(self, dataset_id: str) -> str:
207247
return f"{self.api_base_url}/access/dataset/:persistentId/?persistentId={dataset_id}"
@@ -222,7 +262,29 @@ def public_dataset_url(self, dataset_id: str) -> str:
222262
return f"{self.repository_url}/dataset.xhtml?persistentId={dataset_id}"
223263

224264
def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str:
225-
return f"{self.plugin.get_uri_root()}/{f'{file_identifier}' if file_identifier else f'{dataset_id}'}"
265+
"""Build a plugin URI for a dataset or file.
266+
267+
For datasets: dataverse://source/doi:10.70122/FK2/DIG2DG
268+
For files: dataverse://source/doi:10.70122/FK2/DIG2DG/AVNCLL (persistent ID)
269+
dataverse://source/doi:10.70122/FK2/DIG2DG/id:12345 (database ID)
270+
"""
271+
if file_identifier:
272+
# For files, we need both the dataset_id and file_identifier in the path
273+
# Extract just the file-specific part from the file_identifier
274+
if file_identifier.startswith("id:"):
275+
# Database ID format: keep as is (e.g., 'id:12345')
276+
file_part = file_identifier
277+
elif "/" in file_identifier and file_identifier.startswith(dataset_id):
278+
# Full persistent ID format: extract just the suffix after dataset_id
279+
# e.g., 'doi:10.70122/FK2/DIG2DG/AVNCLL' -> 'AVNCLL'
280+
file_part = file_identifier[len(dataset_id) + 1 :]
281+
else:
282+
# Already just the suffix
283+
file_part = file_identifier
284+
return f"{self.plugin.get_uri_root()}/{dataset_id}/{file_part}"
285+
else:
286+
# For datasets, just use the dataset_id
287+
return f"{self.plugin.get_uri_root()}/{dataset_id}"
226288

227289
def _is_api_url(self, url: str) -> bool:
228290
return "/api/" in url
@@ -348,11 +410,11 @@ def _download_file(
348410
download_file_content_url: str,
349411
context: FilesSourceRuntimeContext[RDMFileSourceConfiguration],
350412
):
351-
headers = {}
413+
headers = get_default_headers()
352414

353415
if self._is_api_url(download_file_content_url):
354416
# pass the token as a header only when using the API
355-
headers = self._get_request_headers(context)
417+
headers.update(self._get_request_headers(context))
356418
try:
357419
req = urllib.request.Request(download_file_content_url, headers=headers)
358420
with urllib.request.urlopen(req, timeout=DEFAULT_SOCKET_TIMEOUT) as page:
@@ -361,11 +423,19 @@ def _download_file(
361423
page, f.fileno(), file_path, source_encoding=get_charset_from_http_headers(page.headers)
362424
)
363425
except HTTPError as e:
426+
if e.code == 401:
427+
raise AuthenticationRequired(
428+
f"Authentication required to download file from '{download_file_content_url}'. "
429+
f"Please provide a valid API token in your user preferences."
430+
)
364431
# TODO: We can only download files from published datasets for now
365-
if e.code in [401, 403, 404]:
366-
raise NotFoundException(
367-
f"Cannot download file from URL '{file_path}'. Please make sure the dataset and/or file exists and it is public."
432+
if e.code in [403, 404]:
433+
raise ObjectNotFound(
434+
f"File not found at '{download_file_content_url}'. "
435+
f"Please make sure the dataset and file exist and are published."
368436
)
437+
else:
438+
raise
369439

370440
def _get_datasets_from_response(self, response: dict) -> list[RemoteDirectory]:
371441
rval: list[RemoteDirectory] = []
@@ -384,7 +454,16 @@ def _get_files_from_response(self, dataset_id: str, response: dict) -> list[Remo
384454
rval: list[RemoteFile] = []
385455
for entry in response:
386456
dataFile = entry.get("dataFile")
387-
uri = self.to_plugin_uri(dataset_id, dataFile.get("persistentId"))
457+
# Use persistentId if available, otherwise fall back to database id
458+
# The database id is prefixed with 'id:' to distinguish from DOI-based persistent IDs
459+
file_persistent_id = dataFile.get("persistentId")
460+
if file_persistent_id:
461+
file_identifier = file_persistent_id
462+
else:
463+
# Fallback to database id when persistentId is not available
464+
# (e.g., when FilePIDsEnabled is false on the Dataverse instance)
465+
file_identifier = f"id:{dataFile.get('id')}"
466+
uri = self.to_plugin_uri(dataset_id, file_identifier)
388467
rval.append(
389468
RemoteFile(
390469
name=dataFile.get("filename"),

0 commit comments

Comments
 (0)