@@ -34,124 +34,68 @@ def get_cdi_file_content(site_url, file_id):
# Try to find and return existing CDI file
def find_cdi_file():
    """Return the content of the dataset's existing DDI-CDI file, or None.

    Scans the dataset's file list for a file whose MIME type is
    application/ld+json with a ddialliance.org DDI-CDI profile, downloads
    the first match, and returns its raw content. Returns None when no
    such file exists or none can be downloaded.

    NOTE(review): relies on the module-global ``x`` (the exporter's input
    data) and on the helpers ``get_dataset_files`` /
    ``get_cdi_file_content`` defined elsewhere in this file.
    """
    site_url = None
    dataset_id = None
    files = []

    # First, try to get the site URL from the ORE export (we always need
    # it for downloading files).
    if 'datasetORE' in x and 'ore:describes' in x['datasetORE']:
        describes = x['datasetORE']['ore:describes']
        if '@id' in describes:
            dataset_url = describes['@id']

            # Extract the base URL (everything before /dataset.xhtml,
            # /citation or /api).
            if '/dataset.xhtml' in dataset_url:
                site_url = dataset_url.split('/dataset.xhtml')[0]
            elif '/citation' in dataset_url:
                site_url = dataset_url.split('/citation')[0]
            elif '/api/' in dataset_url:
                site_url = dataset_url.split('/api/')[0]
            elif dataset_url.startswith('http'):
                # A bare DOI / persistent URL without a recognizable path:
                # fall back to the configured server URL, or localhost.
                if 'config' in x and 'server-url' in x['config']:
                    site_url = x['config']['server-url']
                elif 'path' in x:
                    # NOTE(review): hard-coded fallback; assumes a local
                    # deployment when only a request path is available --
                    # confirm against the deployment context.
                    site_url = 'http://localhost:8080'

    # Try to get the dataset ID from datasetJson.
    if 'datasetJson' in x and 'id' in x['datasetJson']:
        dataset_id = x['datasetJson']['id']

    # Prefer the file list already present in the export input...
    if 'datasetJson' in x:
        dataset_json = x['datasetJson']
        if 'datasetVersion' in dataset_json and 'files' in dataset_json['datasetVersion']:
            files = dataset_json['datasetVersion']['files']

    # ...and only fall back to fetching it from the API when it is missing.
    if not files and site_url and dataset_id:
        files = get_dataset_files(site_url, dataset_id)

    # Without a site URL we cannot download anything, so stop early instead
    # of calling the download helper with None.
    if not site_url:
        return None

    # Find CDI files - ONLY files with the exact DDI-CDI MIME type.
    for file_info in files:
        try:
            datafile = file_info['dataFile']
            content_type = str(datafile['contentType']).lower()
            file_id = datafile['id']
        except (KeyError, TypeError):
            # Skip files that don't have the expected structure.
            continue

        # Check for application/ld+json with a DDI-CDI profile
        # (case-insensitive).
        if ('application/ld+json' in content_type
                and 'ddialliance.org' in content_type
                and 'ddi-cdi' in content_type):
            # Found a CDI file - download and return it immediately.
            try:
                content = get_cdi_file_content(site_url, file_id)
            except Exception:
                # Best effort: a failed download of one candidate must not
                # abort the whole export; try the next file instead.
                continue
            if content:
                return content

    return None
156100
157101# Helper functions
@@ -188,8 +132,15 @@ def get_compound_values(fields, field_name):
# Try to get existing CDI file first
existing_cdi = find_cdi_file()
if existing_cdi:
    # Parse the downloaded JSON-LD content and use it as the result.
    try:
        res = json.loads(existing_cdi)
    except (TypeError, ValueError):
        # Not valid JSON (json.JSONDecodeError is a ValueError subclass):
        # discard it and fall back to generating the metadata below.
        res = {}
        existing_cdi = None
143+ if not existing_cdi :
193144 # Generate CDI JSON-LD from dataset metadata
194145 res = {}
195146
@@ -1142,5 +1093,5 @@ def get_citation_compound(field_name):
11421093 if date_info :
11431094 pass # Add to an annotation or note entity if needed
11441095
1145- # Set the graph
1146- res ['@graph' ] = graph
1096+ # Set the graph
1097+ res ['@graph' ] = graph
0 commit comments