Skip to content

Commit 1f5f1db

Browse files
committed
Refactor CDI file detection to simplify MIME type checks and improve error handling
1 parent b75ca73 commit 1f5f1db

2 files changed

Lines changed: 52 additions & 101 deletions

File tree

examples/cdi-exporter/config.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,6 @@
22
"formatName": "cdi",
33
"harvestable": true,
44
"availableToUsers": true,
5-
"mediaType": "application/ld+json; profile=\"http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://ddialliance.org/specification/ddi-cdi/1.0\"",
5+
"mediaType": "application/ld+json",
66
"displayName": "DDI-CDI (Cross Domain Integration)"
77
}

examples/cdi-exporter/transformer.py

Lines changed: 51 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -34,124 +34,68 @@ def get_cdi_file_content(site_url, file_id):
3434
# Try to find and return existing CDI file
3535
def find_cdi_file():
3636
"""Find the latest CDI file with application/ld+json MIME type"""
37+
import sys
3738

3839
# Extract site URL and dataset ID from the input data
3940
site_url = None
4041
dataset_id = None
42+
files = []
4143

42-
# Try to get site URL from ORE export
44+
# First, try to get site URL from ORE export (we always need this for downloading files)
4345
if 'datasetORE' in x and 'ore:describes' in x['datasetORE']:
4446
describes = x['datasetORE']['ore:describes']
4547
if '@id' in describes:
4648
dataset_url = describes['@id']
47-
# Extract base URL (everything before /dataset.xhtml or /api)
49+
50+
# Extract base URL (everything before /dataset.xhtml or /api or /citation)
4851
if '/dataset.xhtml' in dataset_url:
4952
site_url = dataset_url.split('/dataset.xhtml')[0]
5053
elif '/citation' in dataset_url:
5154
site_url = dataset_url.split('/citation')[0]
55+
elif '/api/' in dataset_url:
56+
site_url = dataset_url.split('/api/')[0]
57+
# If it's just a DOI or persistent URL without path, we need to extract from config or path
58+
elif dataset_url.startswith('http'):
59+
# Try to get site URL from config or path
60+
if 'config' in x and 'server-url' in x['config']:
61+
site_url = x['config']['server-url']
62+
elif 'path' in x:
63+
# Path might be something like /api/datasets/export...
64+
# We can assume localhost:8080 or extract from request context
65+
site_url = 'http://localhost:8080'
5266

5367
# Try to get dataset ID from datasetJson
5468
if 'datasetJson' in x and 'id' in x['datasetJson']:
5569
dataset_id = x['datasetJson']['id']
5670

57-
if not site_url or not dataset_id:
58-
return None
71+
# First, try to get files from the input data (already available in export)
72+
if 'datasetJson' in x:
73+
dataset_json = x['datasetJson']
74+
if 'datasetVersion' in dataset_json and 'files' in dataset_json['datasetVersion']:
75+
files = dataset_json['datasetVersion']['files']
5976

60-
# Get list of files
61-
files = get_dataset_files(site_url, dataset_id)
77+
# If no files in input data, try to fetch from API
78+
if not files and site_url and dataset_id:
79+
files = get_dataset_files(site_url, dataset_id)
6280

6381
# Find CDI files - ONLY files with exact DDI-CDI MIME type
64-
cdi_files = []
65-
try:
66-
for file_info in files:
67-
try:
68-
# Safe access for Jython/Java HashMap
69-
datafile = None
70-
try:
71-
datafile = file_info.get('dataFile')
72-
except:
73-
try:
74-
datafile = file_info['dataFile']
75-
except:
76-
continue
77-
78-
if not datafile:
79-
continue
80-
81-
# Get content type safely
82-
content_type = ''
83-
try:
84-
content_type = datafile.get('contentType', '')
85-
except:
86-
try:
87-
content_type = datafile['contentType']
88-
except:
89-
content_type = ''
90-
91-
if not content_type:
92-
continue
93-
94-
# Check for EXACT MIME type: application/ld+json with DDI-CDI profile
95-
# Must match: application/ld+json with ddialliance.org profile (case-insensitive)
96-
content_type_str = str(content_type).lower()
97-
is_cdi_mime = (
98-
'application/ld+json' in content_type_str and
99-
'profile=' in content_type_str and
100-
'ddialliance.org' in content_type_str and
101-
'ddi-cdi' in content_type_str
102-
)
82+
for file_info in files:
83+
try:
84+
datafile = file_info['dataFile']
85+
content_type = str(datafile['contentType']).lower()
86+
87+
# Check for application/ld+json with DDI-CDI profile (case-insensitive)
88+
if 'application/ld+json' in content_type and 'ddialliance.org' in content_type and 'ddi-cdi' in content_type:
89+
file_id = datafile['id']
10390

104-
if is_cdi_mime:
105-
# Safely extract file metadata
106-
file_id = None
107-
filename = ''
108-
create_date = ''
109-
110-
try:
111-
file_id = datafile.get('id')
112-
except:
113-
try:
114-
file_id = datafile['id']
115-
except:
116-
pass
117-
118-
try:
119-
filename = datafile.get('filename', '')
120-
except:
121-
try:
122-
filename = datafile['filename']
123-
except:
124-
pass
125-
126-
try:
127-
create_date = datafile.get('createDate', '')
128-
except:
129-
try:
130-
create_date = datafile['createDate']
131-
except:
132-
pass
133-
134-
if file_id:
135-
cdi_files.append({
136-
'id': file_id,
137-
'filename': filename,
138-
'createDate': create_date
139-
})
140-
except:
141-
pass
142-
except:
143-
pass
144-
145-
# Sort by creation date (newest first) and get the most recent
146-
if cdi_files:
147-
cdi_files.sort(key=lambda f: f.get('createDate', ''), reverse=True)
148-
latest_file = cdi_files[0]
149-
150-
# Download and return the content
151-
content = get_cdi_file_content(site_url, latest_file['id'])
152-
if content:
153-
return content
154-
91+
# Found a CDI file - download and return it immediately
92+
content = get_cdi_file_content(site_url, file_id)
93+
if content:
94+
return content
95+
except Exception as e:
96+
# Skip files that don't have the expected structure
97+
pass
98+
15599
return None
156100

157101
# Helper functions
@@ -188,8 +132,15 @@ def get_compound_values(fields, field_name):
188132
# Try to get existing CDI file first
189133
existing_cdi = find_cdi_file()
190134
if existing_cdi:
191-
res = existing_cdi
192-
else:
135+
# Parse the JSON content and assign to res
136+
try:
137+
res = json.loads(existing_cdi)
138+
except:
139+
# If parsing fails, fall back to generating metadata
140+
res = {}
141+
existing_cdi = None
142+
143+
if not existing_cdi:
193144
# Generate CDI JSON-LD from dataset metadata
194145
res = {}
195146

@@ -1142,5 +1093,5 @@ def get_citation_compound(field_name):
11421093
if date_info:
11431094
pass # Add to an annotation or note entity if needed
11441095

1145-
# Set the graph
1146-
res['@graph'] = graph
1096+
# Set the graph
1097+
res['@graph'] = graph

0 commit comments

Comments
 (0)