Skip to content

Commit e214aae

Browse files
committed
Write folia metadata
1 parent 9ec2735 commit e214aae

6 files changed

Lines changed: 158 additions & 0 deletions

File tree

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package org.ivdnt.galahad.formats.folia
2+
3+
import org.ivdnt.galahad.annotations.Annotation
4+
import org.ivdnt.galahad.export.DocumentExport
5+
import org.ivdnt.galahad.formats.xml.PrettyXMLWriter
6+
import org.ivdnt.galahad.util.ifNullOrBlank
7+
import org.ivdnt.galahad.util.withoutFormatExt
8+
import java.text.SimpleDateFormat
9+
10+
/**
 * Emits the `<metadata type="native">` section of a FoLiA export.
 *
 * The section consists of three parts, written in this order:
 *  1. `<annotations>`: declarations of the annotation types used in the document,
 *  2. `<provenance>`: a single `<processor>` entry identified by the tagger id,
 *  3. `<meta>` entries for the title and the language.
 *
 * All values are read from [export] once, when the instance is constructed.
 *
 * @property writer the XML writer that receives the output.
 * @property export the document export supplying document, layer, corpus, tagger and user data.
 */
class FoliaMetadataWriter(val writer: PrettyXMLWriter, val export: DocumentExport) {
    /** `withoutFormatExt` of the document's uploaded file; written as the `title` meta entry. */
    val title = export.document.uploadedFile.withoutFormatExt

    /** Id of the exported layer. Not written by this class. */
    val pid = export.layer.id

    /** Name from the corpus metadata. Not written by this class. */
    val corpusName = export.corpus.mutableMetadata.name

    /** Source name from the corpus metadata, or a placeholder text when null or blank. Not written by this class. */
    val sourceName = export.corpus.mutableMetadata.sourceName.ifNullOrBlank { "!No source name defined!" }

    /** Source URL from the corpus metadata as a string, or a placeholder text when null or blank. Not written by this class. */
    val sourceURL = export.corpus.mutableMetadata.sourceURL?.toString().ifNullOrBlank { "!No source URL defined!" }

    /** String form of the corpus metadata's `eraFrom`. Not written by this class. */
    val eraFrom = export.corpus.mutableMetadata.eraFrom.toString()

    /** String form of the corpus metadata's `eraTo`. Not written by this class. */
    val eraTo = export.corpus.mutableMetadata.eraTo.toString()

    /** Language from the corpus metadata, or a placeholder text when null or blank; written as the `language` meta entry. */
    val language = export.corpus.mutableMetadata.language.ifNullOrBlank { "!No language defined!" }

    /** Language code from the corpus metadata. Not written by this class. */
    val langCode = export.corpus.mutableMetadata.langCode

    /** Construction-time date formatted as `yyyy-MM-dd`. Not written by this class. */
    val today = SimpleDateFormat("yyyy-MM-dd").format(System.currentTimeMillis())

    /** Annotations of the exported layer; they decide which optional declarations are written. */
    val annotations = export.layer.annotations

    /** Id of the tagger; used as annotation set, annotator reference and processor id/name. */
    val taggerName = export.tagger.id

    /** Writes the complete `<metadata>` element: declarations, provenance, then meta entries. */
    fun write() {
        val metadataAttributes = mapOf("type" to "native")
        writer.wrapIn("metadata", metadataAttributes) {
            writeAnnotations()
            writeProvenance()
            writeMeta()
        }
    }

    /**
     * Writes the `<annotations>` element.
     *
     * The text, paragraph, sentence and token declarations are always written. The
     * lemma, pos, entity and dependency declarations follow, in that order, each only
     * when the layer carries the matching annotation. POS and UPOS both map to `pos`.
     */
    private fun writeAnnotations() {
        writer.wrapIn("annotations") {
            // Structural declarations are unconditional and carry no attributes.
            for (structural in listOf("text", "paragraph", "sentence", "token")) {
                writer.writeEmptyElement("$structural-annotation")
            }

            // Declaration name paired with whether the layer carries that annotation.
            val linguistic = listOf(
                "lemma" to (Annotation.LEMMA in annotations),
                "pos" to (Annotation.POS in annotations || Annotation.UPOS in annotations),
                "entity" to (Annotation.NER in annotations),
                "dependency" to (Annotation.DEPREL in annotations),
            )
            for ((layer, present) in linguistic) {
                if (present) writeAnnotation(layer)
            }
        }
    }

    /**
     * Writes one `<NAME-annotation set="...">` declaration with a nested `<annotator>`.
     *
     * @param annotation the declaration name without the `-annotation` suffix, e.g. `lemma`.
     */
    private fun writeAnnotation(annotation: String) {
        val declaration = "$annotation-annotation"
        writer.wrapIn(declaration, mapOf("set" to taggerName)) {
            // The annotator refers to the processor written in the provenance block.
            writer.writeEmptyElement("annotator", mapOf("processor" to taggerName))
        }
    }

    /** Writes the `<provenance>` element holding a single `<processor>` for the tagger. */
    private fun writeProvenance() {
        val processorAttributes = mapOf(
            "xml:id" to taggerName,
            "name" to taggerName,
            "type" to "auto",
            "src" to "https://github.com/instituutnederlandsetaal/galahad-taggers-dockerized",
            "host" to "https://galahad.ivdnt.org",
            "user" to export.user.id,
        )
        writer.wrapIn("provenance") {
            writer.writeEmptyElement("processor", processorAttributes)
        }
    }

    /** Writes the `<meta>` entries: first `title`, then `language`. */
    private fun writeMeta() {
        val titleId = "id" to "title"
        val languageId = "id" to "language"
        writer.writeElement("meta", titleId, title)
        writer.writeElement("meta", languageId, language)
    }
}

server/src/main/kotlin/org/ivdnt/galahad/formats/folia/FoliaWriter.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class FoliaWriter(export: DocumentExport) : LayerWriter(export) {
2222
writer.writeAttribute("", "version", "2.5.3")
2323
writer.writeAttribute(XMLConstants.XML_NS_URI, "id", export.layer.id)
2424

25+
FoliaMetadataWriter(writer, export).write()
26+
2527
documents.forEach { doc ->
2628
writer.writeStartElement("text")
2729
writer.writeAttribute(XMLConstants.XML_NS_URI, "id", doc.id)

server/src/test/resources/formats/naf/converter/karel_en_martijn.folia.xml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,30 @@
11
<?xml version='1.0' encoding='UTF-8'?>
22
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xml="http://www.w3.org/XML/1998/namespace" generator="galahad.ivdnt.org" version="2.5.3" xml:id="e51560ff-81a2-4ddd-ba04-c7eb07af6d2b">
3+
<metadata type="native">
4+
<annotations>
5+
<text-annotation/>
6+
<paragraph-annotation/>
7+
<sentence-annotation/>
8+
<token-annotation/>
9+
<lemma-annotation set="sourceLayer">
10+
<annotator processor="sourceLayer"/>
11+
</lemma-annotation>
12+
<pos-annotation set="sourceLayer">
13+
<annotator processor="sourceLayer"/>
14+
</pos-annotation>
15+
<entity-annotation set="sourceLayer">
16+
<annotator processor="sourceLayer"/>
17+
</entity-annotation>
18+
<dependency-annotation set="sourceLayer">
19+
<annotator processor="sourceLayer"/>
20+
</dependency-annotation>
21+
</annotations>
22+
<provenance>
23+
<processor xml:id="sourceLayer" name="sourceLayer" type="auto" src="https://github.com/instituutnederlandsetaal/galahad-taggers-dockerized" host="https://galahad.ivdnt.org" user="user"/>
24+
</provenance>
25+
<meta id="title">karel_en_martijn</meta>
26+
<meta id="language">Dutch</meta>
27+
</metadata>
328
<text xml:id="d1">
429
<p xml:id="d1.p1">
530
<s xml:id="d1.p1.s1">

server/src/test/resources/formats/shared-converter/karel_en_martijn.folia.xml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,30 @@
11
<?xml version='1.0' encoding='UTF-8'?>
22
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xml="http://www.w3.org/XML/1998/namespace" generator="galahad.ivdnt.org" version="2.5.3" xml:id="e51560ff-81a2-4ddd-ba04-c7eb07af6d2b">
3+
<metadata type="native">
4+
<annotations>
5+
<text-annotation/>
6+
<paragraph-annotation/>
7+
<sentence-annotation/>
8+
<token-annotation/>
9+
<lemma-annotation set="sourceLayer">
10+
<annotator processor="sourceLayer"/>
11+
</lemma-annotation>
12+
<pos-annotation set="sourceLayer">
13+
<annotator processor="sourceLayer"/>
14+
</pos-annotation>
15+
<entity-annotation set="sourceLayer">
16+
<annotator processor="sourceLayer"/>
17+
</entity-annotation>
18+
<dependency-annotation set="sourceLayer">
19+
<annotator processor="sourceLayer"/>
20+
</dependency-annotation>
21+
</annotations>
22+
<provenance>
23+
<processor xml:id="sourceLayer" name="sourceLayer" type="auto" src="https://github.com/instituutnederlandsetaal/galahad-taggers-dockerized" host="https://galahad.ivdnt.org" user="user"/>
24+
</provenance>
25+
<meta id="title">karel_en_martijn</meta>
26+
<meta id="language">Dutch</meta>
27+
</metadata>
328
<text xml:id="karel">
429
<p xml:id="karel.p1">
530
<s xml:id="karel.p1.s1">

server/src/test/resources/formats/tsv/converter/karel_en_martijn.folia.xml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,30 @@
11
<?xml version='1.0' encoding='UTF-8'?>
22
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xml="http://www.w3.org/XML/1998/namespace" generator="galahad.ivdnt.org" version="2.5.3" xml:id="e51560ff-81a2-4ddd-ba04-c7eb07af6d2b">
3+
<metadata type="native">
4+
<annotations>
5+
<text-annotation/>
6+
<paragraph-annotation/>
7+
<sentence-annotation/>
8+
<token-annotation/>
9+
<lemma-annotation set="sourceLayer">
10+
<annotator processor="sourceLayer"/>
11+
</lemma-annotation>
12+
<pos-annotation set="sourceLayer">
13+
<annotator processor="sourceLayer"/>
14+
</pos-annotation>
15+
<entity-annotation set="sourceLayer">
16+
<annotator processor="sourceLayer"/>
17+
</entity-annotation>
18+
<dependency-annotation set="sourceLayer">
19+
<annotator processor="sourceLayer"/>
20+
</dependency-annotation>
21+
</annotations>
22+
<provenance>
23+
<processor xml:id="sourceLayer" name="sourceLayer" type="auto" src="https://github.com/instituutnederlandsetaal/galahad-taggers-dockerized" host="https://galahad.ivdnt.org" user="user"/>
24+
</provenance>
25+
<meta id="title">karel_en_martijn</meta>
26+
<meta id="language">Dutch</meta>
27+
</metadata>
328
<text xml:id="d1">
429
<p xml:id="d1.p1">
530
<s xml:id="d1.p1.s1">

server/src/test/resources/formats/txt/converter/karel_en_martijn.folia.xml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
<?xml version='1.0' encoding='UTF-8'?>
22
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xml="http://www.w3.org/XML/1998/namespace" generator="galahad.ivdnt.org" version="2.5.3" xml:id="e51560ff-81a2-4ddd-ba04-c7eb07af6d2b">
3+
<metadata type="native">
4+
<annotations>
5+
<text-annotation/>
6+
<paragraph-annotation/>
7+
<sentence-annotation/>
8+
<token-annotation/>
9+
</annotations>
10+
<provenance>
11+
<processor xml:id="sourceLayer" name="sourceLayer" type="auto" src="https://github.com/instituutnederlandsetaal/galahad-taggers-dockerized" host="https://galahad.ivdnt.org" user="user"/>
12+
</provenance>
13+
<meta id="title">karel_en_martijn</meta>
14+
<meta id="language">Dutch</meta>
15+
</metadata>
316
<text xml:id="d1">
417
<p xml:id="d1.p1">
518
<s xml:id="d1.p1.s1">

0 commit comments

Comments
 (0)