Skip to content

Commit 5dc8b47

Browse files
authored
Croissant 1.1 (summary statistics) (#12214)
* bump "conformsTo" to http://mlcommons.org/croissant/1.1 #12014 * switch to non-HTTPS for schema.org #12014 "For namespace URLs, we use http:// (that's an issue discussed at length in the schema.org community, and you can probably find an older issue about it.)" -- mlcommons/croissant#929 (review) * remove wikidata namespace (unused) #12014 The wikidata namespace is unused and doesn't appear in the 1.0 or 1.1 spec. (It was in a titanic example for 1.0). Remove. * add summary/descriptive statistics #12014 https://docs.mlcommons.org/croissant/docs/croissant-spec-1.1.html#application-representing-descriptive-statistics * link to example Croissant file from list of formats #12014 * add Croissant 1.1 backward incompatible changes to changelog #12014 * add release note snippet for Croissant 1.1 #12014
1 parent 512e97f commit 5dc8b47

14 files changed

Lines changed: 834 additions & 71 deletions

File tree

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
### Croissant 1.1 (Summary Statistics)
2+
3+
The Croissant metadata export format has been updated from version 1.0 to 1.1.
4+
5+
Summary statistics (mean, min, max, etc.) are now included for tabular files that were successfully ingested.
6+
7+
You can download an example Croissant file from the [Supported Metadata Export Formats](https://guides.dataverse.org/en/latest/user/dataset-management.html#supported-metadata-export-formats) section of the guides.
8+
9+
Minor backward-incompatible changes were made, which are noted below.
10+
11+
See #12014 and #12214
12+
13+
## Backward Incompatible Changes
14+
15+
Generally speaking, see the [API Changelog](https://guides.dataverse.org/en/latest/api/changelog.html) for a list of backward-incompatible API changes.
16+
17+
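The backward-incompatible changes in the `croissant` format (the `conformsTo` version bump to 1.1, the switch of the schema.org namespace URLs from `https` to `http`, and the removal of the unused `wd` namespace) are noted in the [API changelog](https://guides.dataverse.org/en/latest/api/changelog.html).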
18+
19+
## Upgrade Instructions
20+
21+
1. Re-export metadata export formats
22+
23+
We re-export because the Croissant format was updated.
24+
25+
`curl http://localhost:8080/api/admin/metadata/reExportAll`

doc/sphinx-guides/source/api/changelog.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ This API changelog is experimental and we would love feedback on its usefulness.
77
:local:
88
:depth: 1
99

10+
v6.11
11+
-----
12+
13+
- The Croissant :ref:`metadata export format <metadata-export-formats>` has been updated from version 1.0 to 1.1, which is reflected in the ``conformsTo`` property. The ``@vocab`` and ``sc`` entries in ``@context`` now use ``http://schema.org/`` instead of ``https://schema.org/``, as `recommended <https://github.com/mlcommons/croissant/pull/929#pullrequestreview-3079137662>`_. The unused ``wd`` (Wikidata) namespace has been dropped from ``@context``.
14+
1015
v6.10
1116
-----
1217
- The following GET APIs will now return ``400`` if a required Guestbook Response is not supplied. A Guestbook Response can be passed to these APIs in the JSON body using a POST call. See the notes under :ref:`basic-file-access` and :ref:`download-by-dataset-by-version` for details.

doc/sphinx-guides/source/user/dataset-management.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Supported Metadata Export Formats
2828

2929
Once a dataset has been published, its metadata can be exported in a variety of other metadata standards and formats, which help make datasets more :doc:`discoverable </admin/discoverability>` and usable in other systems, such as other data repositories. On each dataset page's metadata tab, the following exports are available:
3030

31-
- Croissant
31+
- Croissant (example: :download:`max-croissant.json <../../../../src/test/resources/croissant/max/expected/max-croissant.json>`)
3232
- Dublin Core
3333
- DDI (Data Documentation Initiative Codebook 2.5)
3434
- DDI HTML Codebook (A more human-readable, HTML version of the DDI Codebook 2.5 metadata export)

src/main/java/edu/harvard/iq/dataverse/export/croissant/CroissantExportUtil.java

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ public static void exportDataset(
3131
{
3232
"@context": {
3333
"@language": "en",
34-
"@vocab": "https://schema.org/",
34+
"@vocab": "http://schema.org/",
3535
"citeAs": "cr:citeAs",
3636
"column": "cr:column",
3737
"conformsTo": "dct:conformsTo",
@@ -46,6 +46,7 @@ public static void exportDataset(
4646
"@type": "@vocab"
4747
},
4848
"dct": "http://purl.org/dc/terms/",
49+
"ddi-stats": "http://rdf-vocabulary.ddialliance.org/cv/SummaryStatisticType/2.1.2/",
4950
"examples": {
5051
"@id": "cr:examples",
5152
"@type": "@json"
@@ -69,12 +70,11 @@ public static void exportDataset(
6970
"repeated": "cr:repeated",
7071
"replace": "cr:replace",
7172
"samplingRate": "cr:samplingRate",
72-
"sc": "https://schema.org/",
73+
"sc": "http://schema.org/",
7374
"separator": "cr:separator",
7475
"source": "cr:source",
7576
"subField": "cr:subField",
76-
"transform": "cr:transform",
77-
"wd": "https://www.wikidata.org/wiki/"
77+
"transform": "cr:transform"
7878
}
7979
}
8080
""";
@@ -84,7 +84,7 @@ public static void exportDataset(
8484
}
8585

8686
job.add("@type", "sc:Dataset");
87-
job.add("conformsTo", "http://mlcommons.org/croissant/1.0");
87+
job.add("conformsTo", "http://mlcommons.org/croissant/1.1");
8888

8989
JsonObject datasetJson = dataProvider.getDatasetJson();
9090

@@ -261,6 +261,13 @@ public static void exportDataset(
261261
int varQuantity = dataTableObject.getInt("varQuantity");
262262
// Emitted below as the value of the record set's "/count" annotation
263263
int caseQuantity = dataTableObject.getInt("caseQuantity");
264+
recordSetContent.add(
265+
"cr:annotation",
266+
Json.createObjectBuilder()
267+
.add("@type", "cr:Field")
268+
.add("name", fileId.toString() + "/count")
269+
.add("value", caseQuantity)
270+
.add("dataType", "http://www.wikidata.org/entity/Q4049983"));
264271
JsonArray dataVariables = dataTableObject.getJsonArray("dataVariables");
265272
JsonArrayBuilder fieldSetArray = Json.createArrayBuilder();
266273
for (JsonValue dataVariableValue : dataVariables) {
@@ -278,6 +285,8 @@ public static void exportDataset(
278285
dataVariableObject.getString("variableFormatType");
279286
String variableIntervalType =
280287
dataVariableObject.getString("variableIntervalType");
288+
JsonObject variableSummaryStatistics =
289+
dataVariableObject.getJsonObject("summaryStatistics");
281290
String dataType = null;
282291
/**
283292
* There are only two variableFormatType types on the Dataverse side:
@@ -293,7 +302,129 @@ public static void exportDataset(
293302
default:
294303
break;
295304
}
296-
fieldSetArray.add(
305+
JsonArrayBuilder annotationsBuilder = Json.createArrayBuilder();
306+
if (variableSummaryStatistics != null) {
307+
// Same order as upstream: MEAN, MEDN, MODE, MIN, MAX, STDEV, VALD, INVD
308+
annotationsBuilder
309+
.add(
310+
Json.createObjectBuilder()
311+
// We're aware that an @id of
312+
// "data/stata13-auto.dta/price/mean"
313+
// looks nice but won't validate if there's
314+
// whitespace in the filename.
315+
// We've asked for guidance here:
316+
// https://github.com/mlcommons/croissant/issues/639#issuecomment-3792179493
317+
.add(
318+
"@id",
319+
fileId.toString()
320+
+ "/"
321+
+ variableName
322+
// The spec gives "mean" as an
323+
// example but we'll use
324+
// ArithmeticMean from
325+
// https://rdf-vocabulary.ddialliance.org/ddi-cv/SummaryStatisticType/2.1.2/SummaryStatisticType.html
326+
+ "/ArithmeticMean")
327+
.add(
328+
"value",
329+
variableSummaryStatistics.getString(
330+
"mean"))
331+
.add("dataType", "ddi-stats:7975ed0"))
332+
.add(
333+
Json.createObjectBuilder()
334+
.add(
335+
"@id",
336+
fileId.toString()
337+
+ "/"
338+
+ variableName
339+
+ "/Median")
340+
.add(
341+
"value",
342+
variableSummaryStatistics.getString(
343+
"medn"))
344+
.add("dataType", "ddi-stats:66851a3")
345+
.add("equivalentProperty", "sc:median"))
346+
.add(
347+
Json.createObjectBuilder()
348+
.add(
349+
"@id",
350+
fileId.toString()
351+
+ "/"
352+
+ variableName
353+
+ "/Mode")
354+
.add(
355+
"value",
356+
variableSummaryStatistics.getString(
357+
"mode"))
358+
.add("dataType", "ddi-stats:650be61"))
359+
.add(
360+
Json.createObjectBuilder()
361+
.add(
362+
"@id",
363+
fileId.toString()
364+
+ "/"
365+
+ variableName
366+
+ "/Minimum")
367+
.add(
368+
"value",
369+
variableSummaryStatistics.getString(
370+
"min"))
371+
.add("dataType", "ddi-stats:a1d0ec6")
372+
.add("equivalentProperty", "sc:minValue"))
373+
.add(
374+
Json.createObjectBuilder()
375+
.add(
376+
"@id",
377+
fileId.toString()
378+
+ "/"
379+
+ variableName
380+
+ "/Maximum")
381+
.add(
382+
"value",
383+
variableSummaryStatistics.getString(
384+
"max"))
385+
.add("dataType", "ddi-stats:8321e79")
386+
.add("equivalentProperty", "sc:maxValue"))
387+
.add(
388+
Json.createObjectBuilder()
389+
.add(
390+
"@id",
391+
fileId.toString()
392+
+ "/"
393+
+ variableName
394+
+ "/StandardDeviation")
395+
.add(
396+
"value",
397+
variableSummaryStatistics.getString(
398+
"stdev"))
399+
.add("dataType", "ddi-stats:690ab50"))
400+
.add(
401+
Json.createObjectBuilder()
402+
.add(
403+
"@id",
404+
fileId.toString()
405+
+ "/"
406+
+ variableName
407+
+ "/ValidCases")
408+
.add(
409+
"value",
410+
variableSummaryStatistics.getString(
411+
"vald"))
412+
.add("dataType", "ddi-stats:c646dd8"))
413+
.add(
414+
Json.createObjectBuilder()
415+
.add(
416+
"@id",
417+
fileId.toString()
418+
+ "/"
419+
+ variableName
420+
+ "/InvalidCases")
421+
.add(
422+
"value",
423+
variableSummaryStatistics.getString(
424+
"invd"))
425+
.add("dataType", "ddi-stats:6459c62"));
426+
}
427+
JsonObjectBuilder fieldBuilder =
297428
Json.createObjectBuilder()
298429
.add("@type", "cr:Field")
299430
.add("name", variableName)
@@ -312,7 +443,12 @@ public static void exportDataset(
312443
Json.createObjectBuilder()
313444
.add(
314445
"column",
315-
variableName))));
446+
variableName)));
447+
JsonArray annotations = annotationsBuilder.build();
448+
if (!annotations.isEmpty()) {
449+
fieldBuilder.add("annotation", annotations);
450+
}
451+
fieldSetArray.add(fieldBuilder);
316452
}
317453
recordSetContent.add("field", fieldSetArray);
318454
recordSet.add(recordSetContent);

0 commit comments

Comments
 (0)