Skip to content

Commit a084f18

Browse files
authored
Merge pull request #250 from simleo/detached_crates
Support for detached crates
2 parents 17a8ce1 + ba531a6 commit a084f18

12 files changed

Lines changed: 816 additions & 140 deletions

File tree

README.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,68 @@ article = crate.dereference("paper.pdf")
293293

294294
## Advanced features
295295

296+
### Detached crates
297+
298+
[RO-Crate 1.2](https://www.researchobject.org/ro-crate/whats-changed-in-1-2) introduces the concept of _detached_ RO-Crates, which have no defined root directory: in detached crates, the metadata is accessed independently, for instance via an API or from a standalone metadata file. By contrast, "traditional" crates that describe a payload of files and directories contained in a root directory are called _attached_.
299+
300+
Both detached and attached crates can have a root data entity with an absolute URI as `@id`. To create an RO-Crate whose root data entity `@id` is different from the default `./`, use the `root_dataset_id` argument in the constructor:
301+
302+
```python
303+
from rocrate.rocrate import ROCrate
304+
305+
url = "http://example.com/crate/"
306+
crate = ROCrate(root_dataset_id=url)
307+
```
308+
309+
In detached crates, _all_ data entities must be web-based, i.e., have an absolute URI as `@id`:
310+
311+
```python
312+
file_1 = crate.add_file(f"{url}file_1") # http://example.com/crate/file_1
313+
```
314+
315+
The [recommended way](https://www.researchobject.org/ro-crate/specification/1.2/structure.html#types-of-ro-crate) to store a detached crate on disk is to write a single metadata file called `${prefix}-ro-crate-metadata.json`, where `${prefix}` is a variable. The library supports this through the `write_detached` method, which takes an arbitrary path as its argument (a warning is issued if the path does not follow the above pattern):
316+
317+
```python
318+
crate.write_detached("/tmp/example-ro-crate-metadata.json")
319+
```
320+
321+
One of the ways to consume a detached crate is to read the metadata from a local file. For instance, to read the crate that we just wrote:
322+
323+
```python
324+
read_crate = ROCrate("/tmp/example-ro-crate-metadata.json")
325+
read_file_1 = read_crate.dereference(f"{url}file_1")
326+
```
327+
328+
This also works with a local `file://` URI:
329+
330+
```python
331+
read_crate = ROCrate("file:///tmp/example-ro-crate-metadata.json")
332+
```
333+
334+
and with a remote URI:
335+
336+
```python
337+
base = "https://raw.githubusercontent.com/ResearchObject/ro-crate-py/master/test/test-data/"
338+
read_crate = ROCrate(f"{base}detached-ro-crate-metadata.json")
339+
assert read_crate.root_dataset.id == base
340+
sample_file = read_crate.dereference(f"{base}sample_file.txt")
341+
test_file_galaxy = read_crate.dereference(f"{base}test_file_galaxy.txt")
342+
```
343+
344+
Another way to read a detached crate is to pass a JSON dictionary with the RO-Crate metadata directly to `ROCrate`. For instance:
345+
346+
```python
347+
import json
348+
from rocrate.rocrate import ROCrate
349+
350+
with open("/tmp/example-ro-crate-metadata.json") as f:
351+
metadata = json.load(f)
352+
crate = ROCrate(metadata)
353+
```
354+
355+
In the above example we read the metadata from a local file, but you could get it from an API endpoint or any other source.
356+
357+
296358
### Subcrates
297359

298360
An RO-Crate can contain one or more nested RO-Crates. For instance, consider the following layout:

rocrate/metadata.py

Lines changed: 23 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,20 @@
2222
# limitations under the License.
2323

2424
import json
25+
import re
2526
import warnings
27+
import urllib.request
28+
29+
import requests
2630

2731
from .model.metadata import BASENAME, LEGACY_BASENAME
32+
from .utils import is_url
33+
34+
# https://www.researchobject.org/ro-crate/specification/1.2/structure
35+
# "If stored in a file... the filename SHOULD be..."
36+
# https://www.researchobject.org/ro-crate/specification/1.2/data-entities
37+
# "It is NOT RECOMMENDED to resolve a relative root identifier..."
38+
# Matches URIs/paths ending in "ro-crate-metadata.json" or
# "ro-crate-metadata.jsonld", preceded by "/" (attached layout) or "-"
# (detached "${prefix}-" layout). The dot is escaped so that only a literal
# "." is accepted before the extension.
MD_PATTERN = re.compile(r".*[/-]ro-crate-metadata\.json(ld)?$")
2839

2940

3041
def read_metadata(metadata_path):
@@ -36,6 +47,16 @@ def read_metadata(metadata_path):
3647
"""
3748
if isinstance(metadata_path, dict):
3849
metadata = metadata_path
50+
elif is_url(str(metadata_path)):
51+
if not MD_PATTERN.match(metadata_path):
52+
warnings.warn(f"URI {metadata_path} should follow the pattern {MD_PATTERN.pattern!r}")
53+
if metadata_path.startswith("file:"):
54+
with urllib.request.urlopen(metadata_path) as resp:
55+
metadata = json.load(resp)
56+
else:
57+
with requests.get(metadata_path) as resp:
58+
resp.raise_for_status()
59+
metadata = resp.json()
3960
else:
4061
with open(metadata_path, 'r', encoding='utf-8') as f:
4162
metadata = json.load(f)
@@ -69,48 +90,8 @@ def find_root_entity_id(entities):
6990
Return a tuple of the corresponding identifiers (descriptor, root).
7091
If the entities are not found, raise KeyError. If they are found,
7192
but they don't satisfy the required constraints, raise ValueError.
72-
73-
In the general case, the metadata file descriptor id can be an
74-
absolute URI whose last path segment is "ro-crate-metadata.json[ld]".
75-
Since there can be more than one such id in the crate, we need to
76-
choose among the corresponding (descriptor, root) entity pairs. First, we
77-
exclude those that don't satisfy other constraints, such as the
78-
descriptor entity being of type CreativeWork, etc.; if this doesn't
79-
leave us with a single pair, we try to pick one with a
80-
heuristic. Suppose we are left with the (m1, r1) and (m2, r2) pairs:
81-
if r1 is the actual root of this crate, then m2 and r2 are regular
82-
files in it, and as such they must appear in r1's hasPart; r2,
83-
however, is not required to have a hasPart property listing other
84-
files. Thus, we look for a pair whose root entity "contains" all
85-
descriptor entities from other pairs. If there is no such pair, or there
86-
is more than one, we just return an arbitrary pair.
87-
8893
"""
8994
descriptor = entities.get(BASENAME, entities.get(LEGACY_BASENAME))
90-
if descriptor:
91-
return _check_descriptor(descriptor, entities)
92-
candidates = []
93-
for id_, e in entities.items():
94-
basename = id_.rsplit("/", 1)[-1]
95-
if basename == BASENAME or basename == LEGACY_BASENAME:
96-
try:
97-
candidates.append(_check_descriptor(e, entities))
98-
except ValueError:
99-
pass
100-
if not candidates:
95+
if not descriptor:
10196
raise KeyError("Metadata file descriptor not found")
102-
elif len(candidates) == 1:
103-
return candidates[0]
104-
else:
105-
warnings.warn("Multiple metadata file descriptors, will pick one with a heuristic")
106-
descriptor_ids = set(_[0] for _ in candidates)
107-
for m_id, r_id in candidates:
108-
try:
109-
root = entities[r_id]
110-
part_ids = set(_["@id"] for _ in root["hasPart"])
111-
except KeyError:
112-
continue
113-
if part_ids >= descriptor_ids - {m_id}:
114-
# if True for more than one candidate, this pick is arbitrary
115-
return m_id, r_id
116-
return candidates[0] # fall back to arbitrary pick
97+
return _check_descriptor(descriptor, entities)

rocrate/model/dataset.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from urllib.parse import unquote
3232

3333
from .file_or_dir import FileOrDir
34-
from ..utils import is_url, iso_now, Mode
34+
from ..utils import as_list, is_url, iso_now, Mode
3535

3636

3737
class Dataset(FileOrDir):
@@ -122,17 +122,32 @@ def _stream_folder_from_url(self, chunk_size=8192):
122122
with urlopen(self.source) as _:
123123
self._jsonld['sdDatePublished'] = iso_now()
124124
else:
125-
base = self.source.rstrip("/")
125+
if is_url(self.id):
126+
relative_dest_uri = self.get("localPath") or self.id
127+
else:
128+
relative_dest_uri = self.id
129+
if is_url(relative_dest_uri):
130+
if relative_dest_uri.startswith(self.crate.root_dataset.id):
131+
relative_dest_uri = relative_dest_uri[len(self.crate.root_dataset.id):]
132+
else:
133+
relative_dest_uri = relative_dest_uri.rsplit("/", 1)[-1]
134+
out_dir_path = Path(unquote(relative_dest_uri))
135+
126136
for entry in self._jsonld.get("hasPart", []):
127137
try:
128138
part = entry["@id"]
129-
if is_url(part) or part.startswith("/"):
130-
raise RuntimeError(f"'{self.source}': part '{part}' is not a relative path")
131-
part_uri = f"{base}/{part}"
132-
rel_out_path = Path(self.id) / part
133-
139+
if not is_url(part):
140+
raise RuntimeError(f"'{self.source}' is a URL, but part '{part}' is not a URL")
141+
rel_out_path = out_dir_path / part.rsplit("/", 1)[-1]
142+
if part_entity := self.crate.get(part):
143+
if "Dataset" in as_list(part_entity.type):
144+
continue
145+
# override with file localPath if set
146+
if "File" in as_list(part_entity.type):
147+
if file_local_path := part_entity.get("localPath"):
148+
rel_out_path = file_local_path
134149
is_empty = True
135-
with urlopen(part_uri) as response:
150+
with urlopen(part) as response:
136151
while chunk := response.read(chunk_size):
137152
is_empty = False
138153
yield str(rel_out_path), chunk

rocrate/model/file.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,19 @@ def _copy_file(self, path, out_file_path):
7272
self._jsonld['contentSize'] = str(out_file_path.stat().st_size)
7373

7474
def write(self, base_path):
75-
out_file_path = Path(base_path) / unquote(self.id)
75+
if self.fetch_remote and is_url(str(self.source)):
76+
if is_url(self.id):
77+
relative_dest_uri = self.get("localPath") or self.id
78+
else:
79+
relative_dest_uri = self.id
80+
if is_url(relative_dest_uri):
81+
if relative_dest_uri.startswith(self.crate.root_dataset.id):
82+
relative_dest_uri = relative_dest_uri[len(self.crate.root_dataset.id):]
83+
else:
84+
relative_dest_uri = relative_dest_uri.rsplit("/", 1)[-1]
85+
else:
86+
relative_dest_uri = self.id
87+
out_file_path = Path(base_path) / unquote(relative_dest_uri)
7688
if isinstance(self.source, (BytesIO, StringIO)) or is_url(str(self.source)):
7789
self._write_from_stream(out_file_path)
7890
elif self.source is None:

rocrate/model/metadata.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525

2626
import json
2727
from pathlib import Path
28+
import re
29+
import warnings
2830

2931
from .file import File
3032
from .dataset import Dataset
@@ -38,6 +40,7 @@
3840
DEFAULT_VERSION = "1.2"
3941
BASENAME = "ro-crate-metadata.json"
4042
LEGACY_BASENAME = "ro-crate-metadata.jsonld"
43+
# Recommended file name for a detached crate's metadata:
# "${prefix}-ro-crate-metadata.json". The dot is escaped so that only a
# literal "." is accepted before the "json" extension.
DETACHED_MD_NAME = re.compile(r".*-ro-crate-metadata\.json$")
4144

4245
WORKFLOW_PROFILE = "https://w3id.org/workflowhub/workflow-ro-crate/1.0"
4346

@@ -95,9 +98,15 @@ def _has_writeable_stream(self):
9598
return True
9699

97100
def write(self, dest_base):
98-
write_path = Path(dest_base) / self.id
101+
write_path = Path(dest_base) / self.id.rsplit("/", 1)[-1]
99102
super()._write_from_stream(write_path)
100103

104+
def write_detached(self, path):
105+
if not DETACHED_MD_NAME.match(str(path)):
106+
warnings.warn(f"{path} should follow the pattern {DETACHED_MD_NAME.pattern!r}")
107+
path = Path(path)
108+
super()._write_from_stream(path)
109+
101110
@property
102111
def root(self) -> Dataset:
103112
return self.crate.root_dataset

rocrate/model/preview.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
from jinja2 import Template
3030
from .file import File
31+
from ..utils import is_url
3132

3233

3334
class Preview(File):
@@ -99,6 +100,8 @@ def stream(self, chunk_size=8192):
99100
yield self.id, str.encode(self.generate_html(), encoding='utf-8')
100101

101102
def _has_writeable_stream(self):
103+
if is_url(str(self.source)):
104+
return self.fetch_remote
102105
return True
103106

104107
def write(self, dest_base):

0 commit comments

Comments
 (0)