Skip to content

Commit c90d7cb

Browse files
authored
DD-2236 Add support for external large objects (#27)
Added endpoints for editing the itemstore directly Added draft version of DANS RDA BagPack Profile/1.1.0
1 parent 046517a commit c90d7cb

25 files changed

Lines changed: 872 additions & 22 deletions

docs/item-store-api.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
About using the Item Store API
2+
===============================
3+
4+
In normal usage the object import interface, described on [the main page](./index.md), is pretty much everything you need. However, there are times when it
5+
may be necessary to edit the layer store on a lower level. As described in the documentation of the
6+
underlying [dans-layer-store-lib]{:target=_blank}, the layer store is conceptually just a file/folder
7+
hierarchy, or **item store**. The service contains API endpoints that let you perform editing operations directly on the item store.
8+
9+
!!! warning "Corruption of the OCFL repository is possible"
10+
11+
Be aware that, when modifying the item store this way, it is possible to corrupt the OCFL repository. That is why these end-points must be enabled explicitly
12+
to become available. You can do so by editing the `config.yml` file and restarting the service. It is recommended to reset all the item store endpoints
13+
to "disabled" after you have finished using them to prevent accidental use. Also, make sure that no imports are running concurrently when doing these
14+
low-level edits. You should shut down `dd-transfer-to-vault` before starting your edits and check that there are no pending imports in the queue.
15+
16+
[dans-layer-store-lib]: https://dans-knaw.github.io/dans-layer-store-lib/#the-itemstore-interface

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ repo_url: https://github.com/DANS-KNAW/dd-data-vault
2424
nav:
2525
- Manual:
2626
- Description: index.md
27+
- Item store API: item-store-api.md
2728
- Installation: installation.md
2829
- Configuration: config.md
2930
- API: to-api.md

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
</parent>
2727

2828
<artifactId>dd-data-vault</artifactId>
29-
<version>4.0.6-SNAPSHOT</version>
29+
<version>4.1.0-SNAPSHOT</version>
3030

3131
<name>DD Data Vault</name>
3232
<url>https://github.com/DANS-KNAW/dd-data-vault</url>
@@ -36,10 +36,10 @@
3636
<properties>
3737
<!-- TODO: move to dd-parent -->
3838
<commons-validator.version>1.7</commons-validator.version>
39-
<dans-ocfl-extensions.version>1.0.0</dans-ocfl-extensions.version>
39+
<dans-ocfl-extensions.version>1.1.0</dans-ocfl-extensions.version>
4040
<dans-ocfl-java-extensions-lib.version>2.0.0</dans-ocfl-java-extensions-lib.version>
4141
<dans-layer-store-lib.version>2.0.0</dans-layer-store-lib.version>
42-
<dd-data-vault-api.version>1.0.0</dd-data-vault-api.version>
42+
<dd-data-vault-api.version>1.1.0</dd-data-vault-api.version>
4343
<dans-java-utils.version>2.11.0</dans-java-utils.version>
4444
<main-class>nl.knaw.dans.datavault.DdDataVaultApplication</main-class>
4545
</properties>

src/main/assembly/dist/cfg/config.yml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ dataVault:
6969
#
7070
#
7171
ocflRepository:
72-
workDir: /data/vault/tmp
72+
workDir: /data/vault/tmp/ocfl
7373
#
7474
# A directory containing documentation files for the root extensions. These files will be copied to the OCFL repository root when it is initialized.
7575
# Note that this directory must not contain subdirectories.
@@ -88,6 +88,28 @@ dataVault:
8888
jsonPath: $.propertyRegistry.dataset-version.required
8989
value: true
9090

91+
#
92+
# Checks to perform on the root extension after the OCFL Storage Root has been created. It is recommended to keep these checks on always, except when trying to recover from an error created when using the itemstore API endpoints.
93+
#
94+
rootExtensionsInitChecks:
95+
# Check that the property registry is valid
96+
propertyRegistry: true
97+
# Check that the packaging format registry is valid
98+
packagingFormatRegistry: true
99+
100+
#
101+
# Enable/disable low-level editing end-points. Since changes made through these end-points can potentially corrupt the OCFL structure, it is recommended to keep them disabled
102+
# except temporarily, when needed to fix problems or when file deletion is required by law.
103+
#
104+
itemstore:
105+
workDir: /data/vault/tmp/itemstore
106+
enableEndpoints:
107+
copyDirectoryInto: false
108+
copyFileTo: false
109+
createDirectory: false
110+
deleteDirectory: false
111+
deleteFiles: false
112+
91113
#
92114
# Settings for the layer store in which the data vault stores its data.
93115
#

src/main/assembly/dist/cfg/ocfl-root-extensions/packaging-format-registry/packaging_format_inventory.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
"name": "DANS RDA BagPack Profile",
1010
"version": "1.0.0",
1111
"summary": "Extension of the RDA BagPack Profile for DANS"
12+
},
13+
"2a820be7e8c8671d7281ca5cea7c3712": {
14+
"name": "DANS RDA BagPack Profile",
15+
"version": "1.1.0",
16+
"summary": "Extension of the RDA BagPack Profile for DANS"
1217
}
1318
}
1419
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
446fbfb4049e490def5a89e1c2bf4e0eeba3b470758aaf39f297788d3355f5dfa715f9295b560dd9e4e4508da57952d15baafc9b822c305ff474828138af2129 packaging_format_inventory.json
1+
a2ff8fca02cc8409b6e5a6fc4dab1b268cf01ca5c19ccac2d1ce6844115b72baef90134aeb5790ef5afa8e8fef9898168a24e9e18b27f03d3afd5a51cae56eba packaging_format_inventory.json
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
{
2+
3+
"BagIt-Profile-Info": {
4+
"BagIt-Profile-Identifier": "https://doi.org/10.17026/e948-0r32",
5+
"Source-Organization": "dans.knaw.nl",
6+
"Contact-Name": "DANS-KNAW",
7+
"Contact-Email": "info@dans.knaw.nl",
8+
"External-Description": "DANS BagPack Profile",
9+
"Version": "1.0.0"
10+
},
11+
"Bag-Info": {
12+
"Source-Organization": {
13+
"required": true
14+
},
15+
"Contact-Name": {
16+
"required": false
17+
},
18+
"Contact-Email": {
19+
"required": true
20+
},
21+
"External-Description": {
22+
"required": true
23+
},
24+
"Internal-Sender-Identifier": {
25+
"required": true
26+
},
27+
"Bagging-Date": {
28+
"required": false
29+
},
30+
"Contact-Phone": {
31+
"required": false
32+
},
33+
"External-Identifier": {
34+
"required": false
35+
},
36+
"Bag-Size": {
37+
"required": false
38+
},
39+
"Payload-Oxum": {
40+
"required": false
41+
},
42+
"Source-Identifier": {
43+
"required": false
44+
}
45+
},
46+
"Manifests-Required": [
47+
"sha1"
48+
],
49+
"Allow-Fetch.txt": true,
50+
"Serialization": "optional",
51+
"Accept-Serialization": [
52+
"application/zip"
53+
],
54+
"Accept-BagIt-Version": [
55+
"0.97",
56+
"1.0"
57+
],
58+
"Tag-Manifests-Required": [],
59+
"Tag-Files-Required": [
60+
"metadata/datacite.xml",
61+
"metadata/pid-mapping.txt",
62+
"metadata/oai-ore.jsonld"
63+
]
64+
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
DANS BagPack Profile v1.1.0
2+
===========================
3+
4+
Introduction
5+
------------
6+
7+
### Version
8+
9+
* Document version: 1.1.0
10+
* Publication date: n/a
11+
12+
### Status
13+
14+
The status of this document is DRAFT.
15+
16+
### Changes
17+
18+
#### Changed from version 1.0.0 to 1.1.0
19+
20+
Changed requirement 1.1 to also allow "holey bags" for support of external large objects. This change is backwards compatible because bags that were valid
21+
under the previous versions of this specification remain so.
22+
23+
### Scope
24+
25+
This document specifies what constitutes an acceptable DANS BagPack. This includes all the requirements for a bag to be successfully processed by the DANS Data
26+
Vault ingest workflow.
27+
28+
### Overview and Conventions
29+
30+
#### Keywords
31+
32+
The keywords "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
33+
interpreted as described in [RFC 2119]{:target=_blank}.
34+
35+
The key word "SHOULD" is also used to specify requirements that are impossible or impractical to check by the archival organization (i.e., DANS). The client
36+
should do its best to meet these requirements but not rely on their being validated by the archival organization.
37+
38+
#### Subdivisions
39+
40+
The requirements are subdivided into the following sections:
41+
42+
* RDA BagPack Related - requirements that refer back to the [RDA BagPack]{:target=_blank} specifications. If a bag only needs to comply with the RDA BagPack
43+
specifications, then it should be sufficient to only check this section.
44+
* Extra Requirements for DANS BagPack - requirements that are specific to the DANS BagPack Profile, and which are in addition to the RDA BagPack requirements.
45+
46+
The sections are numbered and may have numbered subsections. The requirements themselves are stated as numbered rules. Rules may have parts that are labeled
47+
with letters: (a), (b), (c), etc. To uniquely identify a specific rule, use the notation
48+
49+
```
50+
<section-nr>[.<subsection-nr>].<rule-nr> [(<letter>)]
51+
```
52+
53+
Example: `2.3.4 (e)` means part **e** of the fourth rule in subsection 3 of section 2.
54+
55+
#### XML namespaces
56+
57+
When referring to XML element or attribute names or attribute values that have a prefix (such as `schema:name`) an element in a certain namespace is intended.
58+
The table below lists the mapping from prefix to namespace. In the actual document, the namespace may be bound to a different prefix, or be the default
59+
namespace.
60+
61+
| Prefix | Namespace URI | Namespace documentation |
62+
|-----------|---------------------------------------------------------------------|--------------------------------------------------|
63+
| `schema` | `http://schema.org/` | [schema.org]{:target=_blank} |
64+
| `dvcore` | `https://dataverse.org/schema/core#` | Dataverse metadata elements |
65+
| `vaultMd` | `https://schemas.dans.knaw.nl/metadatablock/dansDataVaultMetadata#` | [DANS Data Vault Metadata block]{:target=_blank} |
66+
67+
Requirements
68+
------------
69+
70+
### 1. RDA BagPack Related
71+
72+
The following items are required by the [RDA BagPack]{:target=_blank} specifications:
73+
74+
1. One of the following MUST hold:
75+
* the DANS BagPack is a valid bag, according to [BagIt v1.0]{:target=_blank} or [BagIt v0.97]{:target=_blank}
76+
* the DANS BagPack is a holey bag (i.e., a bag with a [fetch.txt]{:target=_blank} file listing the missing files and their fetch URLs). The files to be
77+
fetched MUST be downloadable from the given URL or obtainable from a well-known location and have the checksums listed in the payload manifests.
78+
"Obtainable from a well-known location" means that the repository containing the bag documents how to map the fetch-URL or a checksum for the file to
79+
the location where the file data is stored.
80+
2. (a) A DANS BagPack MUST contain a file `metadata/datacite.xml`; (b) this file MUST be valid according to the
81+
[DataCite schema version 4.0 or later]{:target=_blank}, except for the requirement that there MUST be a DOI present: a DOI is not required for a DANS
82+
BagPack; (c) [DataCite's recommended properties]{:target=_blank} SHOULD be present.
83+
3. Other files besides `datacite.xml` MAY be present in the `metadata` folder.
84+
85+
### 2. Extra Requirements for DANS BagPack
86+
87+
The following items are required by the DANS BagPack Profile, in addition to the requirements of RDA BagPack:
88+
89+
1. The `bag-info.txt` file SHOULD contain an element `BagIt-Profile-Identifier` set to the identifier of the [DANS BagPack BagIt Profile]{:target=_blank}:
90+
`https://doi.org/10.17026/e948-0r32`.
91+
2. (a) The bag MUST conform to the [DANS BagPack BagIt Profile]{:target=_blank} (even if the `BagIt-Profile-Identifier` element pointing to it is missing). (b)
92+
The bag SHOULD conform to any other BagIt profiles declared in the `BagIt-Profile-Identifier` element.
93+
3. There MUST be a file called `metadata/pid-mapping.txt`: the structure of this file MUST be rows formatted as `<identifier> <referenced object>`, where
94+
`<identifier>` is a unique URI and `<referenced object>` is the path to the file relative to the root of the bag, and both are separated by one or more
95+
spaces. One of the lines MAY be a mapping from the dataset DOI to a folder directly under the `data` folder.
96+
4. (a) There MUST be a `metadata/oai-ore.jsonld` file which MUST be a valid JSON-LD 1.0 or higher document; (b) The object described in the
97+
document MUST have the attribute `vaultMd:dansBagId` whose value is a URN:UUID. (c) The `ore:AggregatedResource`s of the `ore:Aggregation` MUST have the
98+
following attributes: (i) `@id` whose value is a URI; (ii) `schema:name`; (iii) `dvcore:restricted`, with value true or false.
99+
5. There MUST be a one-to-one mapping between the files in the `data` folder and the files described in the Aggregation contained in the `oai-ore.jsonld` file:
100+
(a) all identifiers found in 2.4(c)(i) MUST be present in the left column of `pid-mapping.txt`; (b) the set of paths pointing to files found in the right
101+
column of `pid-mapping.txt` MUST be equal to the set of paths of files present in the `data` folder (relative to the bag root).
102+
103+
[RFC 2119]: {{ rfc_2119 }}
104+
[BagIt v1.0]: {{ bagit }}
105+
[fetch.txt]: {{ fetch_txt }}
106+
[BagIt v0.97]: {{ bagit_0_97 }}
107+
[RDA BagPack]: {{ rda_bagpack }}
108+
[DataCite schema version 4.0 or later]: {{ datacite_4_0 }}
109+
[DANS BagPack BagIt Profile]: {{ dans_bagpack_bagit_profile }}
110+
[DataCite's recommended properties]: {{ levels_of_obligation }}
111+
[schema.org]: {{ schema_org }}
112+
[DANS Data Vault Metadata block]: {{ dans_data_vault_metadata_block }}

src/main/assembly/dist/cfg/ocfl-root-extensions/property-registry/config.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,24 @@
2929
}
3030
}
3131
},
32+
"external-large-objects": {
33+
"description": "If present, this version of the object requires files from the companion large object store (LOB-store). The LOB-store is located in the same directory as the Vault Storage Root and has the same name, but with the extension '.lobs'. The large objects have as base names their digest in hexadecimal.",
34+
"type": "object",
35+
"required": false,
36+
"properties": {
37+
"checksum-algorithm": {
38+
"description": "The name of the algorithm used to calculate the digests of the large objects. One of the names given in the OCFL specification or its extensions must be used.",
39+
"type": "string",
40+
"required": true
41+
},
42+
"lobs": {
43+
"description": "The list of large objects that are required by this object version. Each entry is the hexadecimal digest of the large object file in the LOB-store, calculated using the algorithm given above in 'checksum-algorithm'. How to combine the large objects with the object version is defined in the packaging format of the object version.",
44+
"type": "array",
45+
"itemType": "string",
46+
"required": true
47+
}
48+
}
49+
},
3250
"retention-end-date": {
3351
"description": "The date after which the object version must be deleted.",
3452
"type": "string",

0 commit comments

Comments
 (0)