Skip to content

Commit c83edc2

Browse files
authored
Merge pull request #18 from gdcc/tsv
replace "text/tsv" (invalid) with "text/tab-separated-values" (valid)
2 parents 34504a7 + 4398895 commit c83edc2

8 files changed

Lines changed: 344 additions & 49 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
- Avoid file mismatches by matching on storageIdentifier rather than relying on order in array. See https://github.com/gdcc/exporter-croissant/pull/17
55
- Include "extract column" object per field, suggest testing with https://huggingface.co/spaces/JoaquinVanschoren/croissant-checker . See https://github.com/gdcc/exporter-croissant/pull/16
6+
- Replace "text/tsv" (invalid) with "text/tab-separated-values" (valid). See https://github.com/gdcc/exporter-croissant/pull/18
67
- Upgrade to mlcroissant 1.0.17 in validation script. See https://github.com/gdcc/exporter-croissant/pull/15
78

89
0.1.4

src/main/java/io/gdcc/spi/export/croissant/CroissantExporter.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,22 @@ public void exportDataset(ExportDataProvider dataProvider, OutputStream outputSt
228228
if (filename == null) {
229229
filename = StringEscapeUtils.escapeHtml4(fileDetails.getString("filename"));
230230
}
231-
String fileFormat = fileDetails.getString("originalFileFormat", null);
231+
String fileFormat = null;
232+
// Use the original file format, if available, since that's where the
233+
// contentUrl will point.
234+
String originalFileFormat = fileDetails.getString("originalFileFormat", null);
235+
if (originalFileFormat != null) {
236+
if ("text/tsv".equals(originalFileFormat)) {
237+
// "text/tsv" is an internal format used by Dataverse, while
238+
// "text/tab-separated-values" is the official IANA format
239+
// that we present to the outside world.
240+
// See https://github.com/IQSS/dataverse/issues/11505 and
241+
// https://www.iana.org/assignments/media-types/media-types.xhtml
242+
fileFormat = "text/tab-separated-values";
243+
} else {
244+
fileFormat = originalFileFormat;
245+
}
246+
}
232247
if (fileFormat == null) {
233248
fileFormat = fileDetails.getString("contentType");
234249
}

src/test/resources/max/expected/max-croissant.json

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@
8181
],
8282
"license": "http://creativecommons.org/publicdomain/zero/1.0",
8383
"datePublished": "2024-05-01",
84-
"dateModified": "2024-05-01",
84+
"dateModified": "2025-05-21",
8585
"includedInDataCatalog": {
8686
"@type": "DataCatalog",
8787
"name": "Root",
@@ -91,7 +91,7 @@
9191
"@type": "Organization",
9292
"name": "Root"
9393
},
94-
"version": "1.0",
94+
"version": "3.0",
9595
"citeAs": "@data{FK2/VQTYHD_2024,author = {Durbin, Philip and IQSS},publisher = {Root},title = {Max Schema.org},year = {2024},url = {https://doi.org/10.5072/FK2/VQTYHD}}",
9696
"funder": [
9797
{
@@ -119,6 +119,16 @@
119119
"2023-01-01/2023-12-31"
120120
],
121121
"distribution": [
122+
{
123+
"@type": "cr:FileObject",
124+
"@id": "data.tsv",
125+
"name": "data.tsv",
126+
"encodingFormat": "text/tab-separated-values",
127+
"md5": "3663d6a436ac00f5541a7336d6fa18c9",
128+
"contentSize": "33",
129+
"description": "",
130+
"contentUrl": "https://beta.dataverse.org/api/access/datafile/26646?format=original"
131+
},
122132
{
123133
"@type": "cr:FileObject",
124134
"@id": "doc/README.md",
@@ -129,5 +139,57 @@
129139
"description": "Additional documentation.",
130140
"contentUrl": "https://beta.dataverse.org/api/access/datafile/26148"
131141
}
142+
],
143+
"recordSet": [
144+
{
145+
"@type": "cr:RecordSet",
146+
"field": [
147+
{
148+
"@type": "cr:Field",
149+
"name": "foo",
150+
"description": "foo",
151+
"dataType": "sc:Text",
152+
"source": {
153+
"@id": "1287",
154+
"fileObject": {
155+
"@id": "data.tsv"
156+
},
157+
"extract": {
158+
"column": "foo"
159+
}
160+
}
161+
},
162+
{
163+
"@type": "cr:Field",
164+
"name": "bar",
165+
"description": "bar",
166+
"dataType": "sc:Integer",
167+
"source": {
168+
"@id": "1285",
169+
"fileObject": {
170+
"@id": "data.tsv"
171+
},
172+
"extract": {
173+
"column": "bar"
174+
}
175+
}
176+
},
177+
{
178+
"@type": "cr:Field",
179+
"name": "baz",
180+
"description": "baz",
181+
"dataType": "sc:Integer",
182+
"source": {
183+
"@id": "1286",
184+
"fileObject": {
185+
"@id": "data.tsv"
186+
},
187+
"extract": {
188+
"column": "baz"
189+
}
190+
}
191+
}
192+
]
193+
}
132194
]
133195
}
Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,77 @@
11
<?xml version="1.0" encoding="UTF-8"?>
2-
<resource xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"
3-
xmlns="http://datacite.org/schema/kernel-4"
4-
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2+
<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.5/metadata.xsd">
53
<identifier identifierType="DOI">10.5072/FK2/VQTYHD</identifier>
6-
<creators><creator><creatorName>Durbin, Philip</creatorName><nameIdentifier schemeURI="https://orcid.org/" nameIdentifierScheme="ORCID">0000-0002-9528-9470</nameIdentifier><affiliation>(Harvard University)</affiliation></creator><creator><creatorName>IQSS</creatorName><affiliation>(Harvard University)</affiliation></creator></creators>
4+
<creators>
5+
<creator>
6+
<creatorName nameType="Personal">Durbin, Philip</creatorName>
7+
<givenName>Philip</givenName>
8+
<familyName>Durbin</familyName>
9+
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="https://orcid.org">https://orcid.org/0000-0002-9528-9470</nameIdentifier>
10+
<affiliation>Harvard University</affiliation>
11+
</creator>
12+
<creator>
13+
<creatorName nameType="Personal">IQSS</creatorName>
14+
<affiliation>Harvard University</affiliation>
15+
</creator>
16+
</creators>
717
<titles>
818
<title>Max Schema.org</title>
919
</titles>
1020
<publisher>Root</publisher>
1121
<publicationYear>2024</publicationYear>
22+
<subjects>
23+
<subject>Social Sciences</subject>
24+
<subject>Other</subject>
25+
<subject>foo</subject>
26+
<subject>bar</subject>
27+
</subjects>
28+
<contributors>
29+
<contributor contributorType="ContactPerson">
30+
<contributorName nameType="Personal">Durbin, Philip</contributorName>
31+
<givenName>Philip</givenName>
32+
<familyName>Durbin</familyName>
33+
</contributor>
34+
</contributors>
35+
<dates>
36+
<date dateType="Submitted">2024-05-01</date>
37+
<date dateType="Available">2024-05-01</date>
38+
<date dateType="Updated">2025-05-21</date>
39+
<date dateType="Other" dateInformation="Time period covered by the data">2023-01-01/2023-12-31</date>
40+
</dates>
1241
<resourceType resourceTypeGeneral="Dataset"/>
42+
<relatedIdentifiers>
43+
<relatedIdentifier relationType="IsSupplementTo" relatedIdentifierType="DOI">10.5281/ZENODO.10843668</relatedIdentifier>
44+
</relatedIdentifiers>
45+
<sizes>
46+
<size>34</size>
47+
<size>21865</size>
48+
<size>27</size>
49+
</sizes>
50+
<formats>
51+
<format>text/markdown</format>
52+
<format>text/tab-separated-values</format>
53+
<format>text/tab-separated-values</format>
54+
</formats>
55+
<version>3.0</version>
56+
<rightsList>
57+
<rights rightsURI="info:eu-repo/semantics/openAccess"/>
58+
<rights rightsURI="http://creativecommons.org/publicdomain/zero/1.0" rightsIdentifier="CC0-1.0" rightsIdentifierScheme="SPDX" schemeURI="https://spdx.org/licenses/" xml:lang="en">Creative Commons CC0 1.0 Universal Public Domain Dedication.</rights>
59+
</rightsList>
1360
<descriptions>
1461
<description descriptionType="Abstract">Exercising fields used by `schema.org` exporter.</description>
1562
</descriptions>
16-
<contributors><contributor contributorType="ContactPerson"><contributorName>Durbin, Philip</contributorName></contributor></contributors>
63+
<geoLocations>
64+
<geoLocation>
65+
<geoLocationPlace>United States, MA,, Cambridge,, Harvard Square,</geoLocationPlace>
66+
</geoLocation>
67+
</geoLocations>
68+
<fundingReferences>
69+
<fundingReference>
70+
<funderName>NSF</funderName>
71+
</fundingReference>
72+
<fundingReference>
73+
<funderName>NIH</funderName>
74+
<awardNumber>3OT2DB000004-01S3</awardNumber>
75+
</fundingReference>
76+
</fundingReferences>
1777
</resource>

src/test/resources/max/in/datasetFileDetails.json

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,96 @@
11
[
2+
{
3+
"id": 26646,
4+
"persistentId": "",
5+
"filename": "data.tab",
6+
"contentType": "text/tab-separated-values",
7+
"friendlyType": "Tab-Delimited",
8+
"filesize": 27,
9+
"storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8",
10+
"originalFileFormat": "text/tsv",
11+
"originalFormatLabel": "Tab-Separated Values",
12+
"originalFileSize": 33,
13+
"originalFileName": "data.tsv",
14+
"UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==",
15+
"rootDataFileId": -1,
16+
"md5": "3663d6a436ac00f5541a7336d6fa18c9",
17+
"checksum": {
18+
"type": "MD5",
19+
"value": "3663d6a436ac00f5541a7336d6fa18c9"
20+
},
21+
"tabularData": true,
22+
"creationDate": "2025-05-21",
23+
"publicationDate": "2025-05-21",
24+
"fileAccessRequest": true,
25+
"restricted": false,
26+
"fileMetadataId": 32509,
27+
"dataTables": [
28+
{
29+
"varQuantity": 3,
30+
"caseQuantity": 3,
31+
"UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==",
32+
"dataVariables": [
33+
{
34+
"id": 1287,
35+
"name": "foo",
36+
"label": "foo",
37+
"weighted": false,
38+
"variableIntervalType": "discrete",
39+
"variableFormatType": "CHARACTER",
40+
"isOrderedCategorical": false,
41+
"fileOrder": 0,
42+
"UNF": "UNF:6:FWBO/a1GcxDnM3fNLdzrHw==",
43+
"variableMetadata": []
44+
},
45+
{
46+
"id": 1285,
47+
"name": "bar",
48+
"label": "bar",
49+
"weighted": false,
50+
"variableIntervalType": "discrete",
51+
"variableFormatType": "NUMERIC",
52+
"isOrderedCategorical": false,
53+
"fileOrder": 1,
54+
"UNF": "UNF:6:AvELPR5QTaBbnq6S22Msow==",
55+
"variableMetadata": [],
56+
"summaryStatistics": {
57+
"mode": ".",
58+
"invd": "0.0",
59+
"min": "1.0",
60+
"stdev": "1.0",
61+
"max": "3.0",
62+
"vald": "3.0",
63+
"mean": "2.0",
64+
"medn": "2.0"
65+
}
66+
},
67+
{
68+
"id": 1286,
69+
"name": "baz",
70+
"label": "baz",
71+
"weighted": false,
72+
"variableIntervalType": "discrete",
73+
"variableFormatType": "NUMERIC",
74+
"isOrderedCategorical": false,
75+
"fileOrder": 2,
76+
"UNF": "UNF:6:WkRUZjFbozW1nFYiqMGWeQ==",
77+
"variableMetadata": [],
78+
"summaryStatistics": {
79+
"mean": "20.0",
80+
"mode": ".",
81+
"min": "10.0",
82+
"max": "30.0",
83+
"invd": "0.0",
84+
"stdev": "10.0",
85+
"vald": "3.0",
86+
"medn": "20.0"
87+
}
88+
}
89+
]
90+
}
91+
],
92+
"varGroups": []
93+
},
294
{
395
"id": 26148,
496
"persistentId": "",
@@ -19,7 +111,7 @@
19111
"publicationDate": "2024-05-01",
20112
"fileAccessRequest": true,
21113
"restricted": false,
22-
"fileMetadataId": 32052,
114+
"fileMetadataId": 32511,
23115
"varGroups": []
24116
}
25117
]

0 commit comments

Comments
 (0)