Skip to content

Commit 1dbccff

Browse files
committed
Added NER reading to TEI and Folia
1 parent e700f54 commit 1dbccff

33 files changed

Lines changed: 984 additions & 326 deletions

server/src/main/kotlin/org/ivdnt/galahad/annotations/AnnotationReader.kt

Lines changed: 5 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ abstract class AnnotationReader {
77
protected val paragraphs: MutableList<ParagraphLayer> = mutableListOf()
88
protected val sentences: MutableList<SentenceLayer> = mutableListOf()
99
protected val terms: MutableList<Term> = mutableListOf()
10-
protected val spans: MutableMap<Annotation, Array<TermSpan>> = mutableMapOf()
10+
protected val spans: MutableMap<Annotation, MutableList<TermSpan>> = mutableMapOf()
1111

1212
protected var offset: Int = 0
1313

@@ -17,9 +17,9 @@ abstract class AnnotationReader {
1717
protected var wordID: String? = null
1818

1919
protected fun docID(): String = docID ?: "d$dIndex"
20-
protected fun parID(): String = parID ?: "p$pIndex"
21-
protected fun sentID(): String = sentID ?: "s$sIndex"
22-
protected fun wordID(): String = wordID ?: "w$wIndex"
20+
protected fun parID(): String = parID ?: "${docID()}.p$pIndex"
21+
protected fun sentID(): String = sentID ?: "${parID()}.s$sIndex"
22+
protected fun wordID(): String = wordID ?: "${sentID()}.w$wIndex"
2323

2424
private val wIndex: Int get() = terms.size + 1
2525
private val sIndex: Int get() = sentences.size + 1
@@ -47,27 +47,11 @@ abstract class AnnotationReader {
4747
protected open fun newSentence() {
4848
newWordform()
4949
if (terms.isNotEmpty()) {
50-
extractSpans()
51-
sentences.add(SentenceLayer(sentID(), terms.toTypedArray(), spans.toMap()))
50+
sentences.add(SentenceLayer(sentID(), terms.toTypedArray(), spans.mapValues { it.value.toTypedArray() }.toMap()))
5251
terms.clear()
5352
spans.clear()
5453
}
5554
}
5655

5756
protected open fun newWordform() {}
58-
59-
private fun extractSpans() {
60-
// loop through all the terms and turn the NER into a span
61-
val indices = mutableListOf<Int>()
62-
terms.forEachIndexed { i, t ->
63-
t.ner?.let {
64-
indices += i
65-
}
66-
}
67-
if (indices.isNotEmpty()) {
68-
val ners = mutableListOf<TermSpan>()
69-
ners += TermSpan(indices, terms[indices.first()].annotationHead(Annotation.NER)!!)
70-
spans[Annotation.NER] = ners.toTypedArray()
71-
}
72-
}
7357
}
Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
package org.ivdnt.galahad.annotations
22

3-
data class TermSpan(
4-
val indices: List<Int>,
3+
class TermSpan(
4+
val indices: IntArray,
55
val value: String,
66
)

server/src/main/kotlin/org/ivdnt/galahad/documents/DocumentMetadata.kt

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,6 @@ data class DocumentMetadata(
2222
val layerSummary: LayerSummary,
2323
/** Last modified timestamp in milliseconds. */
2424
val lastModified: Long,
25-
/** UUID of the document. */
26-
val uuid: String,
2725
/** Annotation types in the source layer. */
2826
val annotations: Set<Annotation>,
2927
) {
@@ -41,7 +39,6 @@ data class DocumentMetadata(
4139
layerPreview = file.layer.preview,
4240
layerSummary = file.layer.summary,
4341
lastModified = System.currentTimeMillis(),
44-
uuid = file.layer.id,
4542
annotations = file.layer.terms.flatMap { it.annotations.keys }.toSet()
4643
)
4744
}

server/src/main/kotlin/org/ivdnt/galahad/export/CmdiMetadata.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ class CmdiMetadata(val export: DocumentExport) {
3030
private val month = SimpleDateFormat("MM").format(now)
3131
private val day = SimpleDateFormat("dd").format(now)
3232
private val date = "$year-$month-$day"
33-
private val uuid = export.document.metadata.uuid
33+
private val uuid = export.layer.id
3434
private val tagset = Tagset.readOrNull(export.tagger)?.longName.ifNullOrBlank { "!No tagset defined!" }
3535
private val tagger = export.tagger
3636
private val language = corpus.language.ifNullOrBlank { "Dutch" }

server/src/main/kotlin/org/ivdnt/galahad/export/CorpusExport.kt

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@ import org.ivdnt.galahad.corpora.Corpus
77
import org.ivdnt.galahad.documents.Document
88
import org.ivdnt.galahad.documents.DocumentFormat
99
import org.ivdnt.galahad.documents.Documents
10-
import org.ivdnt.galahad.jobs.Job
1110
import org.ivdnt.galahad.exceptions.MergeNotImplementedException
11+
import org.ivdnt.galahad.jobs.Job
1212
import org.ivdnt.galahad.taggers.Tagger
1313
import org.ivdnt.galahad.util.FileMapper
1414
import org.ivdnt.galahad.util.createZipFile
@@ -17,11 +17,11 @@ import java.io.OutputStream
1717
class CorpusExport private constructor(
1818
val corpus: Corpus,
1919
val job: Job,
20-
val user: User,
2120
val format: DocumentFormat,
22-
val posHeadOnly: Boolean,
21+
val user: User,
2322
val tagger: Tagger,
2423
val shouldMerge: Boolean,
24+
val posHeadOnly: Boolean,
2525
) : Logging {
2626
private fun mergeFormatMatches(it: Document, format: DocumentFormat): Boolean {
2727
var otherFormat = it.metadata.format
@@ -54,10 +54,9 @@ class CorpusExport private constructor(
5454
* Maps all [Document] found in [Documents] to the desired [DocumentFormat] and zips them. [formatMapper] should perform the mapping.
5555
*/
5656
fun export(out: OutputStream) {
57-
val documents = corpus.documents.readAll().filter { DocumentExport.create(this, it).layer != Layer.EMPTY }
58-
val seq: Sequence<FileMapper> =
59-
documents.asSequence().map { doc -> doc.name to { out -> formatMapper(doc, out) } }
60-
val seqCmdi: Sequence<FileMapper> = documents.asSequence().map { doc ->
57+
val docs = corpus.documents.readAll().filter { DocumentExport.create(this, it).layer != Layer.EMPTY }
58+
val seq: Sequence<FileMapper> = docs.asSequence().map { doc -> doc.name to { out -> formatMapper(doc, out) } }
59+
val seqCmdi: Sequence<FileMapper> = docs.asSequence().map { doc ->
6160
"metadata/CMDI-${doc.uploadedFile.nameWithoutExtension}.xml" to { out ->
6261
DocumentExport.create(
6362
this, doc
@@ -74,17 +73,17 @@ class CorpusExport private constructor(
7473
corpus: Corpus,
7574
jobName: String,
7675
format: DocumentFormat,
77-
posHeadOnly: Boolean,
7876
user: User,
79-
shouldMerge: Boolean
77+
shouldMerge: Boolean,
78+
posHeadOnly: Boolean
8079
): CorpusExport = CorpusExport(
8180
corpus = corpus,
8281
job = corpus.jobs.readOrThrow(jobName),
83-
user = user,
8482
format = format,
85-
posHeadOnly = posHeadOnly,
83+
user = user,
8684
tagger = Tagger.readOrThrow(jobName, corpus),
87-
shouldMerge = shouldMerge
85+
shouldMerge = shouldMerge,
86+
posHeadOnly = posHeadOnly
8887
)
8988
}
9089
}

server/src/main/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReader.kt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -117,9 +117,9 @@ class ConlluReader(
117117
for (column in indices.keys) {
118118
getColumn(column, fields)?.let { annotations[column] = it }
119119
}
120-
terms += Term("${sentID()}.${wordID()}", offset, annotations, spaceAfter)
120+
terms += Term(wordID(), offset, annotations, spaceAfter)
121121
offset += fields[1].length
122-
if (spaceAfter) offset++ // add space after
122+
if (spaceAfter) offset++ // space after
123123
}
124124

125125
companion object {

server/src/main/kotlin/org/ivdnt/galahad/formats/folia/FoliaReader.kt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,16 @@ class FoliaReader(
1212
override val wordTags: Array<String> = WORD_TAGS
1313
override val ignorableTags: Array<String> = IGNORABLE_TAGS
1414
override val wordDataTags: Array<String> = WORD_DATA_TAGS
15+
override val spanTags: Array<String> = SPAN_TAGS
16+
override val spanDataTags: Array<String> = SPAN_DATA_TAGS
1517

1618
override fun parseAttrs() {
1719
when (reader.localName) {
1820
"pos" -> pos = reader.getAttributeValue(null, "class")
1921
"lemma" -> lemma = reader.getAttributeValue(null, "class")
2022
"w" -> spaceAfter = reader.getAttributeValue(null, "space") != "no"
23+
"entity" -> spanValue = reader.getAttributeValue(null, "class")
24+
"wref" -> reader.getAttributeValue(null, "id")?.let { spanTargets += it }
2125
}
2226
}
2327

@@ -30,5 +34,7 @@ class FoliaReader(
3034
private val WORD_TAGS = arrayOf("w")
3135
private val WORD_DATA_TAGS = arrayOf("w", "lemma", "pos")
3236
private val IGNORABLE_TAGS = arrayOf("morphology", "note", "figure", "comment", "original", "suggestion")
37+
private val SPAN_TAGS = arrayOf("entity")
38+
private val SPAN_DATA_TAGS = arrayOf("wref", "entity")
3339
}
3440
}

server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafConverter.kt

Lines changed: 56 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -25,61 +25,11 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
2525
addRaw(xml, root)
2626
addText(xml, root)
2727
addTerms(xml, root)
28-
addEntities(xml, root)
29-
30-
XmlUtil.transformer.transform(DOMSource(root), StreamResult(out))
31-
}
32-
33-
private fun addTerms(xml: Document, root: Element) {
34-
val terms = xml.createElement("terms")
35-
root.appendChild(terms)
36-
export.layer.terms.forEachIndexed { i, it ->
37-
val term = xml.createElement("term").apply {
38-
setAttribute("id", "t$i")
39-
}
40-
it.lemma?.let { term.setAttribute("lemma", it) }
41-
it.pos?.let { term.setAttribute("pos", it) }
42-
terms.appendChild(term)
43-
44-
// target span
45-
val target = xml.createElement("target").apply {
46-
setAttribute("id", it.id)
47-
}
48-
val span = xml.createElement("span").apply {
49-
appendChild(target)
50-
}
51-
term.appendChild(span)
52-
}
53-
}
54-
55-
private fun addText(xml: Document, root: Element) {
56-
val text = xml.createElement("text")
57-
root.appendChild(text)
58-
val paragraphs = export.layer.documents.flatMap { it.paragraphs.asSequence() }
59-
var iSent = 1
60-
paragraphs.forEachIndexed { iPar, paragraph ->
61-
paragraph.sentences.forEach { sentence ->
62-
sentence.terms.forEach { t ->
63-
val wf = xml.createElement("wf").apply {
64-
setAttribute("id", t.id)
65-
setAttribute("offset", t.offset.toString())
66-
setAttribute("length", t.token.length.toString())
67-
setAttribute("sent", iSent.toString())
68-
setAttribute("para", iPar.toString())
69-
textContent = t.token
70-
}
71-
text.appendChild(wf)
72-
}
73-
iSent++
74-
}
28+
if (Annotation.NER in export.tagger.annotations) {
29+
addEntities(xml, root)
7530
}
76-
}
7731

78-
private fun addRaw(xml: Document, root: Element) {
79-
val cdata = xml.createCDATASection(export.layer.toString())
80-
val raw = xml.createElement("raw")
81-
raw.appendChild(cdata)
82-
root.appendChild(raw)
32+
XmlUtil.transformer.transform(DOMSource(root), StreamResult(out))
8333
}
8434

8535
private fun addNafHeader(xml: Document, root: Element) {
@@ -95,7 +45,7 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
9545
}
9646
nafHeader.appendChild(fileDesc)
9747
val public = xml.createElement("public").apply {
98-
setAttribute("publicId", export.document.metadata.uuid.toString())
48+
setAttribute("publicId", export.layer.id)
9949
}
10050
nafHeader.appendChild(public)
10151

@@ -130,6 +80,58 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
13080
}
13181
}
13282

83+
private fun addRaw(xml: Document, root: Element) {
84+
val cdata = xml.createCDATASection(export.layer.toString())
85+
val raw = xml.createElement("raw")
86+
raw.appendChild(cdata)
87+
root.appendChild(raw)
88+
}
89+
90+
private fun addText(xml: Document, root: Element) {
91+
val text = xml.createElement("text")
92+
root.appendChild(text)
93+
val paragraphs = export.layer.documents.flatMap { it.paragraphs.asSequence() }
94+
var iSent = 1
95+
paragraphs.forEachIndexed { iPar, paragraph ->
96+
paragraph.sentences.forEach { sentence ->
97+
sentence.terms.forEach { t ->
98+
val wf = xml.createElement("wf").apply {
99+
setAttribute("id", t.id)
100+
setAttribute("offset", t.offset.toString())
101+
setAttribute("length", t.token.length.toString())
102+
setAttribute("sent", iSent.toString())
103+
setAttribute("para", iPar.toString())
104+
textContent = t.token
105+
}
106+
text.appendChild(wf)
107+
}
108+
iSent++
109+
}
110+
}
111+
}
112+
113+
private fun addTerms(xml: Document, root: Element) {
114+
val terms = xml.createElement("terms")
115+
root.appendChild(terms)
116+
export.layer.terms.forEachIndexed { i, it ->
117+
val term = xml.createElement("term").apply {
118+
setAttribute("id", "t$i")
119+
}
120+
it.lemma?.let { term.setAttribute("lemma", it) }
121+
it.pos?.let { term.setAttribute("pos", it) }
122+
terms.appendChild(term)
123+
124+
// target span
125+
val target = xml.createElement("target").apply {
126+
setAttribute("id", it.id)
127+
}
128+
val span = xml.createElement("span").apply {
129+
appendChild(target)
130+
}
131+
term.appendChild(span)
132+
}
133+
}
134+
133135
private fun addEntities(xml: Document, root: Element) {
134136
val entities = xml.createElement("entities")
135137
root.appendChild(entities)

server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiConverter.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ class TeiConverter(export: DocumentExport) : LayerConverter(export) {
4040
// <name type="ORG">
4141
// <w>...</w>
4242
// </name>
43-
ners?.firstOrNull<TermSpan> { termI == it.indices.first<Int>() }?.let {
43+
ners?.firstOrNull<TermSpan> { termI == it.indices.first() }?.let {
4444
writer.writeStartElement("name")
4545
writer.writeAttribute("type", it.value)
4646
}

server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiMetadata.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ class TeiMetadata(
1919
) : XmlUtil(xml) {
2020

2121
/** GaLAHaD-generated UUID */
22-
private val internalPid: String = export.document.metadata.uuid.toString()
22+
private val internalPid: String = export.layer.id
2323

2424
/**
2525
* Return the title of the document as described in titleStmt,

0 commit comments

Comments
 (0)