1111
1212from typing_extensions import TypedDict
1313
14- from galaxy .exceptions import AuthenticationRequired
14+ from galaxy .exceptions import (
15+ AuthenticationRequired ,
16+ ObjectNotFound ,
17+ )
1518from galaxy .files .models import (
1619 AnyRemoteEntry ,
1720 Entry ,
3538 requests ,
3639 stream_to_open_named_file ,
3740)
38-
39-
40- class NotFoundException (Exception ):
41- def __init__ (self , message ):
42- super ().__init__ (message )
41+ from galaxy .util .user_agent import get_default_headers
4342
4443
4544class DataverseDataset (TypedDict ):
def parse_path(self, source_path: str, container_id_only: bool = False) -> "ContainerAndFileIdentifier":
    """Parse a source path into a dataset (container) id and an optional file id.

    The source path must have the format '/<dataset_id>' or '/<file_id>',
    where <dataset_id> is a prefix of <file_id>. When ``container_id_only``
    is True, only the '/<dataset_id>' form is expected and an empty
    file identifier is returned.

    Example dataset ids:
        doi:10.70122/FK2/DIG2DG          (DOI scheme)
        perma:BSC/3ST00L                 (perma scheme)

    Example file ids (persistent ID - when FilePIDsEnabled is true):
        doi:10.70122/FK2/DIG2DG/AVNCLL
        perma:BSC/3ST00L/AVNCLL

    Example file ids (database ID - when FilePIDsEnabled is false):
        doi:10.70122/FK2/DIG2DG/id:12345
        perma:BSC/3ST00L/id:9056
    """
    if not source_path.startswith("/"):
        raise ValueError(f"Invalid source path: '{source_path}'. Must start with '/'.")

    remainder = source_path[1:]

    # perma: dataset ids are made of 2 slash-separated parts
    # (perma:BSC/3ST00L); doi: ids are made of 3 (doi:10.70122/FK2/DIG2DG).
    n_dataset_parts = 2 if remainder.startswith("perma:") else 3

    pieces = remainder.split("/", n_dataset_parts)
    dataset_id = "/".join(pieces[:n_dataset_parts])

    if container_id_only:
        if len(pieces) < n_dataset_parts:
            raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<dataset_id>'.")
        return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier="")

    if len(pieces) != n_dataset_parts + 1:
        raise ValueError(f"Invalid source path: '{source_path}'. Expected format: '/<file_id>'.")

    # The trailing piece is either a database id carrying an 'id:' prefix
    # (kept as-is, e.g. 'id:12345') or a persistent-id suffix that is
    # re-joined with the dataset id (e.g. 'AVNCLL' -> 'doi:.../AVNCLL').
    tail = pieces[n_dataset_parts]
    file_id = tail if tail.startswith("id:") else f"{dataset_id}/{tail}"
    return ContainerAndFileIdentifier(container_id=dataset_id, file_identifier=file_id)
122148
123149 def get_container_id_from_path (self , source_path : str ) -> str :
@@ -166,7 +192,7 @@ def _realize_to(
166192 dataset_id , file_id = self .parse_path (source_path )
167193 try :
168194 self .repository .download_file_from_container (dataset_id , file_id , native_path , context )
169- except NotFoundException :
195+ except ObjectNotFound :
170196 filename = file_id .split ("/" )[- 1 ]
171197 is_zip_file = self ._is_zip_archive (filename )
172198 if is_zip_file :
@@ -176,6 +202,8 @@ def _realize_to(
176202 # So, if a zip is not found, we suppose we are trying to reimport an archived history
177203 # and make an API call to Dataverse to download the dataset as a zip.
178204 self .repository ._download_dataset_as_zip (dataset_id , native_path , context )
205+ else :
206+ raise
179207
180208 def _is_zip_archive (self , file_name : str ) -> bool :
181209 return file_name .endswith (".zip" )
@@ -199,9 +227,21 @@ def api_base_url(self) -> str:
def search_url(self) -> str:
    """URL of the repository's Search API endpoint."""
    return self.api_base_url + "/search"
201229
def file_access_url(self, file_identifier: str) -> str:
    """Build the API URL that serves a file's content.

    ``file_identifier`` is either a persistent ID (DOI) such as
    'doi:10.70122/FK2/DIG2DG/AVNCLL', or a database ID prefixed with
    'id:' such as 'id:12345'.
    """
    if file_identifier.startswith("id:"):
        # Database id: drop the 'id:' marker and address the file directly.
        return f"{self.api_base_url}/access/datafile/{file_identifier[3:]}"
    # Persistent id: pass it, fully percent-encoded, as a query parameter.
    return f"{self.api_base_url}/access/datafile/:persistentId?persistentId={quote(file_identifier, safe='')}"
205245
def download_dataset_as_zip_url(self, dataset_id: str) -> str:
    """API URL that streams an entire dataset as one zip archive."""
    endpoint = f"{self.api_base_url}/access/dataset/:persistentId/"
    return f"{endpoint}?persistentId={dataset_id}"
def public_dataset_url(self, dataset_id: str) -> str:
    """Human-facing landing-page URL for a dataset on the Dataverse site."""
    return self.repository_url + "/dataset.xhtml?persistentId=" + dataset_id
223263
def to_plugin_uri(self, dataset_id: str, file_identifier: Optional[str] = None) -> str:
    """Build a plugin URI for a dataset or one of its files.

    For datasets: dataverse://source/doi:10.70122/FK2/DIG2DG
    For files:    dataverse://source/doi:10.70122/FK2/DIG2DG/AVNCLL   (persistent ID)
                  dataverse://source/doi:10.70122/FK2/DIG2DG/id:12345 (database ID)
    """
    uri_root = self.plugin.get_uri_root()
    if not file_identifier:
        # For datasets, just use the dataset_id.
        return f"{uri_root}/{dataset_id}"
    # For files, the path carries both the dataset_id and the
    # file-specific part extracted from the file_identifier.
    if file_identifier.startswith("id:"):
        # Database ID format: keep as-is (e.g. 'id:12345').
        file_part = file_identifier
    elif file_identifier.startswith(f"{dataset_id}/"):
        # Full persistent ID: keep only the suffix after the dataset id,
        # e.g. 'doi:10.70122/FK2/DIG2DG/AVNCLL' -> 'AVNCLL'.
        # Requiring the trailing '/' (not just startswith(dataset_id))
        # avoids mis-slicing ids that merely share a prefix with dataset_id.
        file_part = file_identifier[len(dataset_id) + 1 :]
    else:
        # Already just the file-specific suffix.
        file_part = file_identifier
    return f"{uri_root}/{dataset_id}/{file_part}"
226288
227289 def _is_api_url (self , url : str ) -> bool :
228290 return "/api/" in url
@@ -348,11 +410,11 @@ def _download_file(
348410 download_file_content_url : str ,
349411 context : FilesSourceRuntimeContext [RDMFileSourceConfiguration ],
350412 ):
351- headers = {}
413+ headers = get_default_headers ()
352414
353415 if self ._is_api_url (download_file_content_url ):
354416 # pass the token as a header only when using the API
355- headers = self ._get_request_headers (context )
417+ headers . update ( self ._get_request_headers (context ) )
356418 try :
357419 req = urllib .request .Request (download_file_content_url , headers = headers )
358420 with urllib .request .urlopen (req , timeout = DEFAULT_SOCKET_TIMEOUT ) as page :
@@ -361,11 +423,19 @@ def _download_file(
361423 page , f .fileno (), file_path , source_encoding = get_charset_from_http_headers (page .headers )
362424 )
363425 except HTTPError as e :
426+ if e .code == 401 :
427+ raise AuthenticationRequired (
428+ f"Authentication required to download file from '{ download_file_content_url } '. "
429+ f"Please provide a valid API token in your user preferences."
430+ )
364431 # TODO: We can only download files from published datasets for now
365- if e .code in [401 , 403 , 404 ]:
366- raise NotFoundException (
367- f"Cannot download file from URL '{ file_path } '. Please make sure the dataset and/or file exists and it is public."
432+ if e .code in [403 , 404 ]:
433+ raise ObjectNotFound (
434+ f"File not found at '{ download_file_content_url } '. "
435+ f"Please make sure the dataset and file exist and are published."
368436 )
437+ else :
438+ raise
369439
370440 def _get_datasets_from_response (self , response : dict ) -> list [RemoteDirectory ]:
371441 rval : list [RemoteDirectory ] = []
@@ -384,7 +454,16 @@ def _get_files_from_response(self, dataset_id: str, response: dict) -> list[Remo
384454 rval : list [RemoteFile ] = []
385455 for entry in response :
386456 dataFile = entry .get ("dataFile" )
387- uri = self .to_plugin_uri (dataset_id , dataFile .get ("persistentId" ))
457+ # Use persistentId if available, otherwise fall back to database id
458+ # The database id is prefixed with 'id:' to distinguish from DOI-based persistent IDs
459+ file_persistent_id = dataFile .get ("persistentId" )
460+ if file_persistent_id :
461+ file_identifier = file_persistent_id
462+ else :
463+ # Fallback to database id when persistentId is not available
464+ # (e.g., when FilePIDsEnabled is false on the Dataverse instance)
465+ file_identifier = f"id:{ dataFile .get ('id' )} "
466+ uri = self .to_plugin_uri (dataset_id , file_identifier )
388467 rval .append (
389468 RemoteFile (
390469 name = dataFile .get ("filename" ),
0 commit comments