Commit 7df17dc

Merge pull request #9175 from GlobalDataverseCommunityConsortium/DANS-external_exporters
DANS - Exporters in external jars
2 parents e2eb6d9 + 4322d50

53 files changed

Lines changed: 1810 additions & 866 deletions

.github/workflows/container_app_pr.yml

Lines changed: 11 additions & 1 deletion

@@ -57,8 +57,18 @@ jobs:
        run: |
          echo "IMAGE_TAG=$(echo "${{ github.event.client_payload.pull_request.head.ref }}" | tr '\\/_:&+,;#*' '-')" >> $GITHUB_ENV

+      # Necessary to split as otherwise the submodules are not available (deploy skips install)
+      - name: Build app container image with local architecture and submodules (profile will skip tests)
+        run: >
+          mvn -B -f modules/dataverse-parent
+          -P ct -pl edu.harvard.iq:dataverse -am
+          install
       - name: Deploy multi-arch application container image
-        run: mvn -Pct deploy -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }} -Ddocker.registry=ghcr.io -Ddocker.platforms=${{ env.PLATFORMS }}
+        run: >
+          mvn
+          -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }}
+          ${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
+          -P ct deploy

       - uses: marocchino/sticky-pull-request-comment@v2
         with:

.github/workflows/container_app_push.yml

Lines changed: 20 additions & 17 deletions

@@ -39,18 +39,16 @@ jobs:
        uses: actions/setup-java@v3
        with:
          java-version: "11"
-          distribution: 'adopt'
-      - name: Cache Maven packages
-        uses: actions/cache@v3
-        with:
-          path: ~/.m2
-          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-          restore-keys: ${{ runner.os }}-m2
+          distribution: temurin
+          cache: maven

-      - name: Build app container image with local architecture
-        run: mvn -Pct package
+      - name: Build app container image with local architecture and submodules (profile will skip tests)
+        run: >
+          mvn -B -f modules/dataverse-parent
+          -P ct -pl edu.harvard.iq:dataverse -am
+          install

-      # TODO: add smoke / integration testing here
+      # TODO: add smoke / integration testing here (add "-Pct -DskipIntegrationTests=false")

  hub-description:
    needs: build

@@ -100,12 +98,7 @@ jobs:
      - uses: actions/setup-java@v3
        with:
          java-version: "11"
-          distribution: 'adopt'
-      - uses: actions/cache@v3
-        with:
-          path: ~/.m2
-          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-          restore-keys: ${{ runner.os }}-m2
+          distribution: temurin

      # Depending on context, we push to different targets. Login accordingly.
      - if: ${{ github.event_name != 'pull_request' }}

@@ -136,8 +129,18 @@ jobs:
          echo "IMAGE_TAG=$(echo "$GITHUB_HEAD_REF" | tr '\\/_:&+,;#*' '-')" >> $GITHUB_ENV
          echo "REGISTRY='-Ddocker.registry=ghcr.io'" >> $GITHUB_ENV

+      # Necessary to split as otherwise the submodules are not available (deploy skips install)
+      - name: Build app container image with local architecture and submodules (profile will skip tests)
+        run: >
+          mvn -B -f modules/dataverse-parent
+          -P ct -pl edu.harvard.iq:dataverse -am
+          install
       - name: Deploy multi-arch application container image
-        run: mvn -Pct deploy -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }} ${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
+        run: >
+          mvn
+          -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }}
+          ${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
+          -P ct deploy

      - uses: marocchino/sticky-pull-request-comment@v2
        if: ${{ github.event_name == 'pull_request' }}

.github/workflows/maven_unit_test.yml

Lines changed: 27 additions & 11 deletions

@@ -6,11 +6,15 @@ on:
      - "**.java"
      - "pom.xml"
      - "modules/**/pom.xml"
+      - "!modules/container-base/**"
+      - "!modules/dataverse-spi/**"
  pull_request:
    paths:
      - "**.java"
      - "pom.xml"
      - "modules/**/pom.xml"
+      - "!modules/container-base/**"
+      - "!modules/dataverse-spi/**"

jobs:
  unittest:

@@ -33,25 +37,37 @@ jobs:
    continue-on-error: ${{ matrix.experimental }}
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up JDK ${{ matrix.jdk }}
-        uses: actions/setup-java@v2
+        uses: actions/setup-java@v3
        with:
          java-version: ${{ matrix.jdk }}
-          distribution: 'adopt'
-      - name: Cache Maven packages
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2
-          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
-          restore-keys: ${{ runner.os }}-m2
+          distribution: temurin
+          cache: maven
+
+      # The reason why we use "install" here is that we want the submodules to be available in the next step.
+      # Also, we can cache them this way for jobs triggered by this one.
      - name: Build with Maven
-        run: mvn -DcompilerArgument=-Xlint:unchecked -Dtarget.java.version=${{ matrix.jdk }} -P all-unit-tests clean test
+        run: >
+          mvn -B -f modules/dataverse-parent
+          -Dtarget.java.version=${{ matrix.jdk }}
+          -DcompilerArgument=-Xlint:unchecked -P all-unit-tests
+          -pl edu.harvard.iq:dataverse -am
+          install
+
      - name: Maven Code Coverage
        env:
          CI_NAME: github
          COVERALLS_SECRET: ${{ secrets.GITHUB_TOKEN }}
-        run: mvn -V -B jacoco:report coveralls:report -DrepoToken=${COVERALLS_SECRET} -DpullRequest=${{ github.event.number }}
+        # The coverage commit is sometimes flaky. Don't bail out just because this optional step failed.
+        continue-on-error: true
+        run: >
+          mvn -B
+          -DrepoToken=${COVERALLS_SECRET} -DpullRequest=${{ github.event.number }}
+          jacoco:report coveralls:report
+
+      # We don't want to cache the WAR file, so delete it
+      - run: rm -rf ~/.m2/repository/edu/harvard/iq/dataverse
  push-app-img:
    name: Publish App Image
    permissions:

.github/workflows/spi_release.yml

Lines changed: 94 additions & 0 deletions

name: Dataverse SPI

on:
  push:
    branches:
      - "develop"
    paths:
      - "modules/dataverse-spi/**"
  pull_request:
    branches:
      - "develop"
    paths:
      - "modules/dataverse-spi/**"

jobs:
  # Note: Pushing packages to Maven Central requires access to secrets, which pull requests from remote forks
  # don't have. Skip in these cases.
  check-secrets:
    name: Check for Secrets Availability
    runs-on: ubuntu-latest
    outputs:
      available: ${{ steps.secret-check.outputs.available }}
    steps:
      - id: secret-check
        # perform secret check & put boolean result as an output
        shell: bash
        run: |
          if [ "${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}" != '' ]; then
            echo "available=true" >> $GITHUB_OUTPUT;
          else
            echo "available=false" >> $GITHUB_OUTPUT;
          fi

  snapshot:
    name: Release Snapshot
    needs: check-secrets
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request' && needs.check-secrets.outputs.available == 'true'
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'adopt'
          server-id: ossrh
          server-username: MAVEN_USERNAME
          server-password: MAVEN_PASSWORD
      - uses: actions/cache@v2
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2

      - name: Deploy Snapshot
        run: mvn -f modules/dataverse-spi -Dproject.version.suffix="-PR${{ github.event.number }}-SNAPSHOT" deploy
        env:
          MAVEN_USERNAME: ${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}
          MAVEN_PASSWORD: ${{ secrets.DATAVERSEBOT_SONATYPE_TOKEN }}

  release:
    name: Release
    needs: check-secrets
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && needs.check-secrets.outputs.available == 'true'
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'adopt'
      - uses: actions/cache@v2
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2

      # Running setup-java again overwrites the settings.xml - IT'S MANDATORY TO DO THIS SECOND SETUP!!!
      - name: Set up Maven Central Repository
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'adopt'
          server-id: ossrh
          server-username: MAVEN_USERNAME
          server-password: MAVEN_PASSWORD
          gpg-private-key: ${{ secrets.DATAVERSEBOT_GPG_KEY }}
          gpg-passphrase: MAVEN_GPG_PASSPHRASE

      - name: Sign + Publish Release
        run: mvn -f modules/dataverse-spi -P release deploy
        env:
          MAVEN_USERNAME: ${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}
          MAVEN_PASSWORD: ${{ secrets.DATAVERSEBOT_SONATYPE_TOKEN }}
          MAVEN_GPG_PASSPHRASE: ${{ secrets.DATAVERSEBOT_GPG_PASSWORD }}

Lines changed: 11 additions & 0 deletions

## Ability to Create New Exporters

It is now possible for third parties to develop and share code to provide new metadata export formats for Dataverse. Export formats can be made available via the Dataverse UI and API or configured for use in Harvesting. Dataverse now provides developers with a separate dataverse-spi JAR file that contains the Java interfaces and classes required to create a new metadata Exporter. Once a new Exporter has been created and packaged as a JAR file, administrators can use it by specifying a local directory for third-party Exporters, dropping the Exporter JAR there, and restarting Payara. This mechanism also allows new Exporters to replace any of Dataverse's existing metadata export formats.

## Backward Incompatibilities

Care should be taken when replacing Dataverse's internal metadata export formats, as third-party code, including other third-party Exporters, may depend on the contents of those export formats. When replacing an existing format, one must also remember to delete the cached metadata export files or run the reExport command for the metadata exports of existing datasets to be updated.

## New JVM/MicroProfile Settings

`dataverse.spi.exporters.directory` - specifies a directory, readable by the Dataverse server. Any Exporter JAR files placed in this directory will be read by Dataverse and used to add/replace the specified metadata format.
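
For example (the directory path is illustrative), the setting can be supplied as a JVM option via Payara's `asadmin` tool or, following the usual MicroProfile convention, as an environment variable:

```
./asadmin create-jvm-options '-Ddataverse.spi.exporters.directory=/var/lib/dataverse/exporters'
# or, equivalently:
export DATAVERSE_SPI_EXPORTERS_DIRECTORY=/var/lib/dataverse/exporters
```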

doc/sphinx-guides/source/developers/index.rst

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@
    contain the root `toctree` directive.

 Developer Guide
-=======================================================
+===============

 **Contents:**

@@ -27,6 +27,7 @@ Developer Guide
    deployment
    containers
    making-releases
+   metadataexport
    tools
    unf/index
    make-data-count

doc/sphinx-guides/source/developers/metadataexport.rst

Lines changed: 88 additions & 0 deletions

=======================
Metadata Export Formats
=======================

.. contents:: |toctitle|
    :local:

Introduction
------------

Dataverse ships with a number of metadata export formats available for published datasets. A given metadata export
format may be available for user download (via the UI and API) and/or be available for use in Harvesting between
Dataverse instances.

As of v5.14, Dataverse provides a mechanism for third-party developers to create new metadata Exporters that implement
new metadata formats or that replace existing formats. All the necessary dependencies are packaged in an interface JAR file
available from Maven Central. Developers can distribute their new Exporters as JAR files which can be dynamically loaded
into Dataverse instances - see :ref:`external-exporters`. Developers are encouraged to make their Exporter code available
via https://github.com/gdcc/dataverse-exporters (or minimally, to list their existence in the README there).

Exporter Basics
---------------

New Exporters must implement the ``io.gdcc.spi.export.Exporter`` interface. The interface includes a few methods through
which the Exporter provides Dataverse with the format it produces, a display name, the format's mimetype, and whether the
format is for download and/or harvesting use, etc. It also includes a main
``exportDataset(ExportDataProvider dataProvider, OutputStream outputStream)`` method through which the Exporter receives
metadata about the given dataset (via the ``ExportDataProvider``, described further below) and writes its output (as an
OutputStream).

Exporters that create an XML format must implement the ``io.gdcc.spi.export.XMLExporter`` interface (which extends the
Exporter interface). XMLExporter adds a few methods through which the XMLExporter provides information to Dataverse about
the XML namespace and version being used.

Exporters also need to use the ``@AutoService(Exporter.class)`` annotation, which makes the class discoverable as an
Exporter implementation.
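
Putting these pieces together, here is a minimal sketch of a custom Exporter. The class, package, and format names are
hypothetical; the accessor-method and ``ExportException`` constructor signatures follow the descriptions above but are
assumptions that should be checked against the dataverse-spi javadoc, and ``getDatasetJson()`` is assumed to return a
JSON object whose ``toString()`` yields JSON text.

.. code-block:: java

    package org.example.exporters;

    import com.google.auto.service.AutoService;
    import io.gdcc.spi.export.ExportDataProvider;
    import io.gdcc.spi.export.ExportException;
    import io.gdcc.spi.export.Exporter;

    import java.io.IOException;
    import java.io.OutputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Locale;

    // Registers the class so Dataverse's ServiceLoader-based lookup can find it in the JAR
    @AutoService(Exporter.class)
    public class MyJsonExporter implements Exporter {

        @Override
        public String getFormatName() {
            // Internal format name; reusing a built-in format's name would replace that format
            return "myJSON";
        }

        @Override
        public String getDisplayName(Locale locale) {
            // Label shown in the dataset page's Metadata Export menu
            return "MyJSON in " + locale.getLanguage();
        }

        @Override
        public Boolean isHarvestable() {
            return false; // not offered as a harvesting format
        }

        @Override
        public Boolean isAvailableToUsers() {
            return true; // offered for download via the UI and API
        }

        @Override
        public String getMediaType() {
            return "application/json"; // MediaType.APPLICATION_JSON from the ws.rs API also works
        }

        @Override
        public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream)
                throws ExportException {
            try {
                // Wrap the built-in JSON serialization in a new JSON object under the key "inputJson"
                String wrapped = "{\"inputJson\":" + dataProvider.getDatasetJson().toString() + "}";
                outputStream.write(wrapped.getBytes(StandardCharsets.UTF_8));
                outputStream.flush();
            } catch (IOException e) {
                throw new ExportException("myJSON export failed", e);
            }
        }
    }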

The ``ExportDataProvider`` interface provides several methods through which your Exporter can receive dataset and file
metadata in various formats. Your Exporter would parse the information in one or more of these inputs to retrieve the
values needed to generate its output format.

The most important methods/input formats are:

- ``getDatasetJson()`` - metadata in the internal Dataverse JSON format used in the native API and available via the built-in JSON metadata export.
- ``getDatasetORE()`` - metadata in the OAI_ORE format available as a built-in metadata format and as used in Dataverse's BagIT-based Archiving capability.
- ``getDatasetFileDetails()`` - detailed file-level metadata for ingested tabular files.

The first two of these provide near-complete metadata about the dataset along with the metadata common to all files. This
includes all metadata entries from all metadata blocks, PIDs, tags, licenses and custom terms, etc. Almost all built-in
exporters today use the JSON input. The newer OAI_ORE export, which is JSON-LD-based, provides a flatter structure and
references metadata terms by their external vocabulary ids (e.g. http://purl.org/dc/terms/title), which may make it a
preferable starting point in some cases.

The last method above provides a new JSON-formatted serialization of the variable-level file metadata Dataverse generates
during ingest of tabular files. To date, this information has only been included in the built-in DDI export, as the content
of a ``dataDscr`` element. (Hence inspecting the ``edu.harvard.iq.dataverse.export.DDIExporter`` and related classes would
be a good way to explore how the JSON is structured.)

The interface also provides:

- ``getDatasetSchemaDotOrg()`` and
- ``getDataCiteXml()``.

These provide subsets of metadata in the indicated formats. They may be useful starting points if your Exporter will, for
example, only add one or two additional fields to the given format.

If an Exporter cannot create a requested metadata format for some reason, it should throw an ``io.gdcc.spi.export.ExportException``.

Building an Exporter
--------------------

The example at https://github.com/gdcc/dataverse-exporters provides a Maven pom.xml file suitable for building an
Exporter JAR file, and that repository provides additional development guidance.

There are four dependencies needed to build an Exporter, shown in the pom.xml sketch after this list:

- ``io.gdcc:dataverse-spi``, the library containing the interfaces discussed above and the ExportException class
- ``com.google.auto.service:auto-service``, which provides the @AutoService annotation
- ``jakarta.json:jakarta.json-api``, for JSON classes
- ``jakarta.ws.rs:jakarta.ws.rs-api``, which provides a MediaType enumeration for specifying mime types
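
The corresponding ``<dependencies>`` section might look like the sketch below. The version numbers are illustrative only
(check Maven Central for current releases), and the ``provided`` scope reflects the assumption that Dataverse supplies
these libraries at runtime.

.. code-block:: xml

    <dependencies>
        <dependency>
            <groupId>io.gdcc</groupId>
            <artifactId>dataverse-spi</artifactId>
            <version>1.0.0</version><!-- illustrative version -->
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.google.auto.service</groupId>
            <artifactId>auto-service</artifactId>
            <version>1.0.1</version><!-- illustrative version -->
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>jakarta.json</groupId>
            <artifactId>jakarta.json-api</artifactId>
            <version>1.1.6</version><!-- illustrative version -->
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>jakarta.ws.rs</groupId>
            <artifactId>jakarta.ws.rs-api</artifactId>
            <version>2.1.6</version><!-- illustrative version -->
            <scope>provided</scope>
        </dependency>
    </dependencies>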

Specifying a Prerequisite Export
--------------------------------

An advanced feature of the Exporter mechanism allows a new Exporter to specify that it requires, as input,
the output of another Exporter. An example of this is the built-in HTMLExporter, which requires the output
of the DDI XML Exporter to produce an HTML document with the same DDI content.

This is configured by providing the metadata format name via the ``Exporter.getPrerequisiteFormatName()`` method.
When this method returns a non-empty format name, Dataverse will provide the requested format to the Exporter via
the ``ExportDataProvider.getPrerequisiteInputStream()`` method.
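
A minimal sketch of this pattern follows. The class and format names are hypothetical, and it assumes - based on the
"non-empty" wording above - that both prerequisite-related methods return ``Optional`` values; verify the exact
signatures against the dataverse-spi javadoc.

.. code-block:: java

    package org.example.exporters;

    import com.google.auto.service.AutoService;
    import io.gdcc.spi.export.ExportDataProvider;
    import io.gdcc.spi.export.ExportException;
    import io.gdcc.spi.export.Exporter;

    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.Locale;
    import java.util.Optional;

    @AutoService(Exporter.class)
    public class DdiHtmlExporter implements Exporter {

        @Override public String getFormatName() { return "ddi_html"; }
        @Override public String getDisplayName(Locale locale) { return "DDI as HTML"; }
        @Override public Boolean isHarvestable() { return false; }
        @Override public Boolean isAvailableToUsers() { return true; }
        @Override public String getMediaType() { return "text/html"; }

        @Override
        public Optional<String> getPrerequisiteFormatName() {
            // Ask Dataverse to run the DDI exporter first and hand us its output
            return Optional.of("ddi");
        }

        @Override
        public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream)
                throws ExportException {
            InputStream ddiXml = dataProvider.getPrerequisiteInputStream()
                    .orElseThrow(() -> new ExportException("Prerequisite DDI input not available"));
            // ... transform the DDI XML into HTML and write it to outputStream ...
        }
    }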

Developers and administrators deploying Exporters using this mechanism should be aware that, since metadata formats
can be changed by other Exporters, the InputStream received may not hold the expected metadata. Developers should clearly
document their compatibility with the built-in or third-party Exporters they support as prerequisites.

doc/sphinx-guides/source/installation/advanced.rst

Lines changed: 26 additions & 0 deletions

.. _external-exporters:

Installing External Metadata Exporters
++++++++++++++++++++++++++++++++++++++

As of Dataverse Software 5.14, Dataverse supports the use of external Exporters as a way to add additional metadata
export formats to Dataverse or replace the built-in formats. This should be considered an **experimental** capability
in that the mechanism is expected to evolve and using it may require additional effort when upgrading to new Dataverse
versions.

This capability is enabled by specifying a directory in which Dataverse should look for third-party Exporters. See
:ref:`dataverse.spi.exporters.directory`.

See :doc:`/developers/metadataexport` for details about how to develop new Exporters.

A minimal example Exporter is available at https://github.com/gdcc/dataverse-exporters. The community is encouraged to
add additional exporters (and/or links to exporters elsewhere) in this repository. Once you have downloaded the
dataverse-spi-export-examples-1.0.0.jar (or other exporter JAR), installed it in the directory specified above, and
restarted your Payara server, the new exporter should be available.
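
For example, a typical installation, with an illustrative directory path, might look like the following, setting the JVM
option via Payara's standard ``asadmin`` mechanism::

    mkdir -p /var/lib/dataverse/exporters
    cp dataverse-spi-export-examples-1.0.0.jar /var/lib/dataverse/exporters/
    ./asadmin create-jvm-options '-Ddataverse.spi.exporters.directory=/var/lib/dataverse/exporters'
    ./asadmin restart-domain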

The example dataverse-spi-export-examples-1.0.0.jar replaces the ``JSON`` export with a ``MyJSON in <locale>`` version
that just wraps the existing JSON export object in a new JSON object with the key ``inputJson`` containing the original
JSON. (Note that the ``MyJSON in <locale>`` label will appear in the dataset Metadata Export download menu immediately,
but the content for already published datasets will only be updated after you delete the cached exports and/or use a
reExport API call (see :ref:`batch-exports-through-the-api`).)
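
For instance, assuming the admin API is reachable on localhost, all published datasets can be re-exported with::

    curl http://localhost:8080/api/admin/metadata/reExportAll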
