Skip to content

Commit b0a54fd

Browse files
authored
Merge pull request #285 from gdcc/284-unparsed-metadata-records
Adding a mechanism for obtaining harvested metadata records unparsed, as Strings
2 parents 2ab1ea8 + f88a6cc commit b0a54fd

7 files changed

Lines changed: 185 additions & 14 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ mvn spotless:check
6868

6969
## Release notes
7070

71-
### v5.2.3
71+
### v5.3.0
7272

7373
#### 🌟 FEATURES
74-
- (none)
74+
- Added an API to the Metadata model class that allows record metadata to be retrieved as an unparsed string (Shoutout to @landreev for #285!)
7575

7676
#### 💔 BREAKING CHANGES
7777
- (none)

xoai-common/src/main/java/io/gdcc/xoai/model/oaipmh/results/record/Metadata.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import io.gdcc.xoai.model.xoai.XOAIMetadata;
1212
import io.gdcc.xoai.xml.CopyElement;
1313
import io.gdcc.xoai.xml.EchoElement;
14+
import io.gdcc.xoai.xml.StringElement;
1415
import io.gdcc.xoai.xml.XmlWritable;
1516
import io.gdcc.xoai.xml.XmlWriter;
1617
import io.gdcc.xoai.xmlio.exceptions.XmlWriteException;
@@ -45,6 +46,10 @@ public Metadata(final CopyElement value) {
4546
this.element = value;
4647
}
4748

49+
/**
 * Creates a metadata container backed by a {@link StringElement}, i.e. by metadata that is
 * held as a raw, unparsed string.
 *
 * @param value the string element to use as this metadata's content
 */
public Metadata(final StringElement value) {
    element = value;
}
52+
4853
/**
4954
* If this metadata element needs to be passed through an {@link io.gdcc.xoai.xml.XSLPipeline},
5055
* this is indicated by "true". When this metadata element consists of pregenerated data, which
@@ -78,6 +83,19 @@ public XOAIMetadata getXoaiMetadata() {
7883
else return null;
7984
}
8085

86+
/**
87+
* Returns the unparsed representation of the metadata as a string if the current element is an
88+
* instance of {@code StringElement}. If the element is not an instance of {@code
89+
* StringElement}, this method will return {@code null}.
90+
*
91+
* @return the unparsed string representation of the metadata, or {@code null} if the element
92+
* does not support unparsed string representation.
93+
*/
94+
public String asUnparsedString() {
95+
if (element instanceof StringElement) return ((StringElement) element).asUnparsedString();
96+
else return null;
97+
}
98+
8199
/**
82100
* This is here for Dataverse 4/5 backward compatibility.
83101
*
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package io.gdcc.xoai.xml;
2+
3+
import io.gdcc.xoai.xmlio.exceptions.XmlWriteException;
4+
import java.io.IOException;
5+
import java.nio.charset.StandardCharsets;
6+
import javax.xml.stream.XMLStreamException;
7+
8+
public class StringElement implements XmlWritable {
9+
private final String xmlString;
10+
11+
public StringElement(final String xmlString) {
12+
this.xmlString = xmlString;
13+
}
14+
15+
public String asUnparsedString() {
16+
return xmlString;
17+
}
18+
19+
@Override
20+
public void write(final XmlWriter writer) throws XmlWriteException {
21+
if (xmlString != null) {
22+
// This replicates the same approach used in the CopyElement:
23+
// we transfer the raw, unparsed string unmodified into the output
24+
// stream of the writer
25+
try {
26+
// Make the XmlWriter think we want to write a value, so it prints ">" of the
27+
// containing
28+
// element to stream
29+
// This is somewhat hacky, but there is no other possibility to trick the StAX API
30+
// into
31+
// this.
32+
writer.writeCharacters("");
33+
// Flush the XmlWriter to make sure any preceding tags are written out
34+
writer.flush();
35+
// Now let's write the actual content:
36+
writer.getOutputStream().write(xmlString.getBytes(StandardCharsets.UTF_8));
37+
// And flush stream & writer after the operation - again
38+
writer.flush();
39+
} catch (XMLStreamException | IOException e) {
40+
throw new XmlWriteException(e);
41+
}
42+
} else {
43+
throw new XmlWriteException("Cannot write XML since the string is null");
44+
}
45+
}
46+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package io.gdcc.xoai.xml;
2+
3+
import static org.junit.jupiter.api.Assertions.assertEquals;
4+
5+
import java.io.ByteArrayOutputStream;
6+
import org.junit.jupiter.api.Test;
7+
8+
public class StringElementTest {
9+
private final String xmlString =
10+
"<oai_dc:dc xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\""
11+
+ " xmlns:dc=\"http://purl.org/dc/elements/1.1/\""
12+
+ " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
13+
+ " xsi:schemaLocation=\"http://www.openarchives.org/OAI/2.0/oai_dc/ "
14+
+ " http://www.openarchives.org/OAI/2.0/oai_dc.xsd\">\n"
15+
+ " <dc:title>Article Title-additional CDATA</dc:title>\n"
16+
+ "</oai_dc:dc>";
17+
18+
@Test
19+
public void rawUnparsedMetadata() throws Exception {
20+
assertEquals(xmlString, new StringElement(xmlString).asUnparsedString());
21+
}
22+
23+
@Test
24+
public void stringElementWrite() throws Exception {
25+
26+
StringElement stringElement = new StringElement(xmlString);
27+
28+
final ByteArrayOutputStream resultStream = new ByteArrayOutputStream();
29+
30+
XmlWriter writer = new XmlWriter(resultStream);
31+
writer.writeStartDocument();
32+
writer.writeStartElement("metadata");
33+
writer.write(stringElement);
34+
writer.writeEndElement();
35+
writer.writeEndDocument();
36+
37+
String resultString = resultStream.toString();
38+
39+
String expectedOutput =
40+
"<?xml version='1.0' encoding='UTF-8'?><metadata>" + xmlString + "</metadata>";
41+
42+
assertEquals(expectedOutput, resultString);
43+
}
44+
}

xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/model/Context.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ public class Context {
2424
private final Map<String, Transformer> metadataTransformers = new HashMap<>();
2525
private String baseUrl;
2626
private Granularity granularity;
27+
private boolean saveUnparsedMetadata = false;
2728
private OAIClient client;
2829

2930
public Context() {
@@ -91,6 +92,29 @@ public OAIClient getClient() {
9192
return client;
9293
}
9394

95+
/**
96+
* Should this harvester skip parsing the "metadata" sections of oai records in the bodies of
97+
* GetRecord and ListRecords responses, and cache and make them available as unparsed Strings
98+
* instead.
99+
*
100+
* @return boolean
101+
*/
102+
public boolean isSaveUnparsedMetadata() {
103+
return this.saveUnparsedMetadata;
104+
}
105+
106+
/**
107+
* Instruct this harvester not to attempt to parse the "metadata" sections of oai records in the
108+
* bodies of GetRecord and ListRecords responses, but cache and make them available as Strings
109+
* instead.
110+
*
111+
* @return A Context
112+
*/
113+
public Context withSaveUnparsedMetadata() {
114+
this.saveUnparsedMetadata = true;
115+
return this;
116+
}
117+
94118
public enum KnownTransformer {
95119
OAI_DC("to_xoai/oai_dc.xsl");
96120

xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/parsers/RecordParser.java

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import io.gdcc.xoai.model.oaipmh.results.record.Metadata;
2121
import io.gdcc.xoai.serviceprovider.exceptions.InternalHarvestException;
2222
import io.gdcc.xoai.serviceprovider.model.Context;
23+
import io.gdcc.xoai.xml.StringElement;
2324
import io.gdcc.xoai.xml.XSLPipeline;
2425
import io.gdcc.xoai.xmlio.XmlReader;
2526
import io.gdcc.xoai.xmlio.exceptions.XmlReaderException;
@@ -47,18 +48,27 @@ public Record parse(XmlReader reader) throws XmlReaderException {
4748
if (!record.getHeader().isDeleted()) {
4849
reader.next(elementName(localPart(equalTo("metadata")))).next(aStartElement());
4950
String content = reader.retrieveCurrentAsString();
50-
ByteArrayInputStream inputStream =
51-
new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
52-
XSLPipeline pipeline =
53-
new XSLPipeline(inputStream, true)
54-
.apply(context.getMetadataTransformer(metadataPrefix));
55-
56-
if (context.hasTransformer()) pipeline.apply(context.getTransformer());
57-
58-
try {
59-
record.withMetadata(new Metadata(new MetadataParser().parse(pipeline.process())));
60-
} catch (TransformerException e) {
61-
throw new InternalHarvestException("Unable to process transformer", e);
51+
52+
if (this.context.isSaveUnparsedMetadata()) {
53+
record.withMetadata(new Metadata(new StringElement(content)));
54+
} else {
55+
ByteArrayInputStream inputStream =
56+
new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
57+
58+
XSLPipeline pipeline =
59+
new XSLPipeline(inputStream, true)
60+
.apply(context.getMetadataTransformer(metadataPrefix));
61+
62+
if (context.hasTransformer()) {
63+
pipeline.apply(context.getTransformer());
64+
}
65+
66+
try {
67+
record.withMetadata(
68+
new Metadata(new MetadataParser().parse(pipeline.process())));
69+
} catch (TransformerException e) {
70+
throw new InternalHarvestException("Unable to process transformer", e);
71+
}
6272
}
6373
}
6474

xoai-service-provider/src/test/java/io/gdcc/xoai/serviceprovider/parsers/RecordParserTest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
package io.gdcc.xoai.serviceprovider.parsers;
1010

1111
import static org.junit.jupiter.api.Assertions.assertEquals;
12+
import static org.junit.jupiter.api.Assertions.assertNull;
1213

1314
import io.gdcc.xoai.model.oaipmh.results.Record;
1415
import io.gdcc.xoai.serviceprovider.model.Context;
@@ -69,4 +70,32 @@ public void cdataParsing() throws Exception {
6970
"Article Title-additional CDATA",
7071
record.getMetadata().getXoaiMetadata().searcher().findOne("dc.title"));
7172
}
73+
74+
@Test
75+
public void rawUnparsedMetadata() throws Exception {
76+
parser = new RecordParser(context, "oai_dc");
77+
XmlReader reader = new XmlReader(input);
78+
Record record = parser.parse(reader);
79+
80+
assertNull(record.getMetadata().asUnparsedString());
81+
82+
context = context.withSaveUnparsedMetadata();
83+
parser = new RecordParser(context, "");
84+
input = getClass().getClassLoader().getResourceAsStream("test/oai_dc-CDATA.xml");
85+
86+
reader = new XmlReader(input);
87+
record = parser.parse(reader);
88+
89+
assertNull(record.getMetadata().getXoaiMetadata());
90+
91+
assertEquals(
92+
"<oai_dc:dc xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\""
93+
+ " xmlns:dc=\"http://purl.org/dc/elements/1.1/\""
94+
+ " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
95+
+ " xsi:schemaLocation=\"http://www.openarchives.org/OAI/2.0/oai_dc/ "
96+
+ " http://www.openarchives.org/OAI/2.0/oai_dc.xsd\">\n"
97+
+ " <dc:title>Article Title-additional CDATA</dc:title>\n"
98+
+ "</oai_dc:dc>",
99+
record.getMetadata().asUnparsedString());
100+
}
72101
}

0 commit comments

Comments
 (0)