instituutnederlandsetaal
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/annotations/Annotations.kt b/server/src/main/kotlin/org/ivdnt/galahad/annotations/Annotations.kt
Lines changed: 0 additions & 1 deletion
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/annotations/Term.kt b/server/src/main/kotlin/org/ivdnt/galahad/annotations/Term.kt
Lines changed: 11 additions & 11 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/annotations/TermSpan.kt b/server/src/main/kotlin/org/ivdnt/galahad/annotations/TermSpan.kt
Lines changed: 3 additions & 1 deletion
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/LineReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/LineReader.kt
Lines changed: 24 additions & 0 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReader.kt
Lines changed: 11 additions & 5 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafConverter.kt
Lines changed: 4 additions & 5 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafReader.kt
Lines changed: 28 additions & 9 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt
Lines changed: 3 additions & 3 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvReader.kt
Lines changed: 12 additions & 12 deletions
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt
Lines changed: 5 additions & 5 deletions
@@ -24,4 +24,3 @@ enum class Annotation(@JsonValue val value: String) {
     }
 }
 
-typealias Annotations = Map<Annotation, String?>
@@ -6,7 +6,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore
 class Term(
     val id: String,
     val offset: Int,
-    val annotations: Annotations,
+    val annotations: Map<Annotation, String?>,
     spaceAfter: Boolean? = null
 ) {
     val spaceAfter: Boolean? = if (spaceAfter == false) false else null
@@ -58,26 +58,26 @@ class Term(
      * The head of [annotation]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)"
      * or "VG" for "VG|neven" or ORG for B-ORG.
      */
-    fun annotationHead(annotationType: Annotation): String? {
+    fun annotationHead(annotation: Annotation): String? {
         // get annotation
-        val annotation = annotations[annotationType] ?: return null
+        val value = annotations[annotation] ?: return null
         // for NER
-        if (annotationType == Annotation.NER) {
-            if ('-' in annotation) {
-                return annotation.split('-')[1]
+        if (annotation == Annotation.NER) {
+            if ('-' in value) {
+                return value.split('-')[1]
             }
         }
         // for POS & UPOS
-        else if (annotationType in posAnnotations) {
-            return if (isMulti(annotationType)) {
+        else if (annotation in posAnnotations) {
+            return if (isMulti(annotation)) {
                 // Split on + and transform each part
-                annotation.split("+").joinToString("+") { singlePosToHead(it) }
+                value.split("+").joinToString("+") { singlePosToHead(it) }
             } else {
-                singlePosToHead(annotation)
+                singlePosToHead(value)
             }
         }
         // else leave as is
-        return annotation
+        return value
     }
 
     companion object {
 
@@ -3,4 +3,6 @@ package org.ivdnt.galahad.annotations
 class TermSpan(
     val indices: IntArray,
     val value: String,
-)
+) {
+    constructor(indices: List<Int>, value: String) : this(indices.toIntArray(), value)
+}
@@ -0,0 +1,24 @@
+package org.ivdnt.galahad.formats
+
+import org.ivdnt.galahad.annotations.Annotation
+import org.ivdnt.galahad.annotations.AnnotationReader
+import org.ivdnt.galahad.annotations.TermSpan
+
+/**
+ * Base class for line-oriented readers whose tokens carry NER as per-token IOB tags
+ * ("B-…" opens an entity, "I-…" continues it). At every sentence boundary the tags of the
+ * collected terms are folded into NER spans.
+ */
+abstract class LineReader : AnnotationReader() {
+    /**
+     * Builds the NER spans of the current sentence from the IOB tags on `terms`, stores them
+     * under [Annotation.NER] in `spans` when at least one span was found, and then defers to
+     * the inherited sentence handling.
+     *
+     * A "B-" tag always opens a new span. An "I-" tag extends the most recent span only when
+     * that span ended on the directly preceding term; an "I-" tag with nothing to continue
+     * (first tag of the sentence, or preceded by untagged terms) opens a new span instead of
+     * failing or being glued onto a distant entity.
+     */
+    override fun newSentence() {
+        val nerSpans = buildList<Pair<String, MutableList<Int>>> {
+            terms.forEachIndexed { i, t ->
+                val ner = t.ner ?: return@forEachIndexed
+                val continues = ner.startsWith("I-")
+                // anything that is neither B- nor I- (e.g. "O") is outside of every entity
+                if (!continues && !ner.startsWith("B-")) return@forEachIndexed
+
+                // the span this term can be appended to, if any: only for I- tags, and only
+                // when the latest span ends on the term right before this one
+                val open = lastOrNull()?.second?.takeIf { continues && it.last() == i - 1 }
+                if (open != null) {
+                    open.add(i)
+                } else {
+                    // the head is the entity type without its IOB prefix, e.g. "ORG" for "B-ORG"
+                    val head = t.annotationHead(Annotation.NER) ?: ner.substringAfter('-')
+                    add(head to mutableListOf(i))
+                }
+            }
+        }
+        // leave `spans` untouched when the sentence holds no entities at all
+        if (nerSpans.isNotEmpty()) {
+            spans[Annotation.NER] = nerSpans.map { (value, indices) -> TermSpan(indices, value) }.toMutableList()
+        }
+        super.newSentence()
+    }
+}
@@ -4,33 +4,37 @@ import org.ivdnt.galahad.annotations.Annotation
 import org.ivdnt.galahad.annotations.AnnotationReader
 import org.ivdnt.galahad.annotations.Layer
 import org.ivdnt.galahad.annotations.Term
+import org.ivdnt.galahad.formats.LineReader
 import java.io.File
 
 class ConlluReader(
     val file: File
-) : AnnotationReader() {
+) : LineReader() {
     private val ignorableMultiWordIds: MutableSet<String> = mutableSetOf()
 
+    private val String.id: String?
+        get() = idRegex.find(this)?.groupValues?.get(1) // 0 is the whole match
+
     override fun read(): Layer {
         file.forEachLine {
             when {
                 it.startsWith("# newdoc") -> {
                     newDocument()
                     // get ID last, so we don't overwrite it while creating a new unit
-                    docID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "d${documents.size + 1}"
+                    docID = it.id
                 }
 
                 it.startsWith("# newpar") -> {
                     newParagraph()
                     // get ID last, so we don't overwrite it while creating a new unit
-                    parID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "p${paragraphs.size + 1}"
+                    parID = it.id
                 }
 
                 it.startsWith("# sent_id") || it.isBlank() -> {
                     newSentence()
-                    sentID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "s${sentences.size + 1}"
+                    // get ID last, so we don't overwrite it while creating a new unit
+                    sentID = it.id
                 }
-
                 !it.startsWith("#") -> {
                     newWord(it)
                 }
@@ -125,6 +129,8 @@ class ConlluReader(
     companion object {
         /** Supported names for the ner attribute in the MISC column. */
         private val nerAttrNames: List<String> = listOf("NamedEntity", "ner")
+        private val idRegex = Regex("id = (\\S+)")
+
 
         private val indices: Map<Annotation, Int> = mapOf(
             Annotation.TOKEN to 1,
 
@@ -4,6 +4,7 @@ import org.ivdnt.galahad.annotations.Annotation
 import org.ivdnt.galahad.export.DocumentExport
 import org.ivdnt.galahad.export.LayerConverter
 import org.ivdnt.galahad.util.XmlUtil
+import org.ivdnt.galahad.util.ifNullOrBlank
 import org.w3c.dom.Document
 import org.w3c.dom.Element
 import java.io.OutputStream
@@ -51,11 +52,9 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
 
         val lp = xml.createElement("lp").apply {
             setAttribute("name", export.tagger.id)
-            setAttribute("version", export.tagger.version)
             setAttribute("timestamp", now.toString())
-            setAttribute("beginTimestamp", now.toString())
-            setAttribute("endTimestamp", now.toString())
             setAttribute("hostname", "https://galahad.ivdnt.org")
+            export.tagger.version.ifBlank { null }?.let{ setAttribute("version", it) }
         }
         val lpTerms = xml.createElement("linguisticProcessors").apply {
             setAttribute("layer", "terms")
@@ -100,7 +99,7 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
                         setAttribute("offset", t.offset.toString())
                         setAttribute("length", t.token.length.toString())
                         setAttribute("sent", iSent.toString())
-                        setAttribute("para", iPar.toString())
+                        setAttribute("para", (iPar+1).toString())
                         textContent = t.token
                     }
                     text.appendChild(wf)
@@ -115,7 +114,7 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
         root.appendChild(terms)
         export.layer.terms.forEachIndexed { i, it ->
             val term = xml.createElement("term").apply {
-                setAttribute("id", "t$i")
+                setAttribute("id", "t${i+1}")
             }
             it.lemma?.let { term.setAttribute("lemma", it) }
             it.pos?.let { term.setAttribute("pos", it) }
 
@@ -1,13 +1,12 @@
 package org.ivdnt.galahad.formats.naf
 
+import org.ivdnt.galahad.annotations.*
 import org.ivdnt.galahad.annotations.Annotation
-import org.ivdnt.galahad.annotations.AnnotationReader
-import org.ivdnt.galahad.annotations.Layer
-import org.ivdnt.galahad.annotations.Term
 import org.ivdnt.galahad.util.XmlUtil
 import org.ivdnt.galahad.util.childElements
 import org.ivdnt.galahad.util.childOrNull
 import java.io.File
+import java.util.*
 
 typealias WordformID = String
 typealias TermID = String
@@ -40,10 +39,11 @@ class NafReader(file: File) : AnnotationReader() {
     private val nafEntities = root.childOrNull("entities")?.childElements?.map {
         NafEntity(
             type = it.getAttribute("type").ifEmpty { null },
-            references = it.childElements.map { it.childElements.map { it.getAttribute("id") }.toList() }.toList()
+            references = it.childOrNull("references")?.childElements?.map { it.childElements.map { it.getAttribute("id") }.toList() }?.toList()!!
         )
-    }
-    private val id = root.childOrNull("nafHeader")?.childOrNull("public")?.getAttribute("publicId").orEmpty().ifEmpty { null }
+    }?.toList()
+    private val id: String = root.childOrNull("nafHeader")?.childOrNull("public")?.getAttribute("publicId").orEmpty()
+        .ifEmpty { UUID.randomUUID().toString() }
 
     override fun read(): Layer {
         // group wordforms paragraph, then sentence, then sort by offset in sentence
@@ -70,17 +70,36 @@ class NafReader(file: File) : AnnotationReader() {
 
                     // space after
                     val nextWordform = sent.getOrNull(i + 1)
-                    val spaceAfter = nextWordform?.offset == wordform.offset + wordform.token.length
+                    val spaceAfter = nextWordform?.offset != (wordform.offset + wordform.token.length)
 
                     terms += Term(wordform.id, wordform.offset, annotations, spaceAfter)
                 }
-                // TODO sentence level spans
+                // collect all spans that refer to one of the terms in this sentence
+                val termIds = terms.map { it.id }
+                val nerSpans = nafEntities?.flatMap { e -> e.references.map { e.type!! to it } }
+                nerSpans?.filter { (_, ids) -> ids.any { it in termIds } }?.ifEmpty { null }
+                    ?.map { (value, ids) -> TermSpan(ids.map { id -> sent.indexOfFirst { it.id == id } }, value) }
+                    ?.toMutableList()?.let { spans[Annotation.NER] = it }
+
                 newSentence()
             }
             newParagraph()
         }
         newDocument()
-        return Layer(documents.toTypedArray())
+        return Layer(documents.toTypedArray(), id)
+    }
+
+    /**
+     * Writes the NER spans of the current sentence back onto its terms as IOB tags before
+     * handing over to the inherited sentence handling: the first term of a span gets
+     * "B-<value>", every following term "I-<value>".
+     *
+     * Span indices that do not point at a term of this sentence are skipped. They occur as -1
+     * when an entity references an id that is not part of the sentence, and indexing `terms`
+     * with them would throw.
+     */
+    override fun newSentence() {
+        spans[Annotation.NER]?.forEach { span ->
+            // Note the difference between spanI and termI; e.g. span.indices = [4, 5] gives
+            // (spanI, termI) = (0, 4) and (1, 5)
+            span.indices.filter { it in terms.indices }.forEachIndexed { spanI, termI ->
+                val t = terms[termI]
+                val iob = (if (spanI == 0) "B-" else "I-") + span.value
+                // terms are replaced rather than mutated: the annotations map is read-only
+                terms[termI] = Term(t.id, t.offset, t.annotations + (Annotation.NER to iob), t.spaceAfter)
+            }
+        }
+        super.newSentence()
     }
 
     data class NafWordform(
 
@@ -19,11 +19,11 @@ class TeiReader(
     override fun parseAttrs() {
         when (reader.localName) {
             in wordDataTags -> {
-                lemma = reader.getAttributeValue(null, "lemma")?.takeIf { it.isNotBlank() }
-                pos = reader.getAttributeValue(null, "pos")?.takeIf { it.isNotBlank() } ?: reader.getAttributeValue(
+                lemma = reader.getAttributeValue(null, "lemma")?.ifBlank { null }
+                pos = reader.getAttributeValue(null, "pos")?.ifBlank { null } ?: reader.getAttributeValue(
                     null,
                     "type"
-                )?.takeIf { it.isNotBlank() }
+                )?.ifBlank { null }
                 spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both")
                 // if spanValue is not null, it means we are in a span tag
                 if (spanValue != null) {
 
@@ -1,14 +1,13 @@
 package org.ivdnt.galahad.formats.tsv
 
+import org.ivdnt.galahad.annotations.*
 import org.ivdnt.galahad.annotations.Annotation
-import org.ivdnt.galahad.annotations.AnnotationReader
-import org.ivdnt.galahad.annotations.Layer
-import org.ivdnt.galahad.annotations.Term
+import org.ivdnt.galahad.formats.LineReader
 import java.io.File
 
 class TsvReader(
     val file: File
-) : AnnotationReader() {
+) : LineReader() {
     private val columnIndices: MutableMap<Annotation, Int> = mutableMapOf()
     private var lastLineWasBlank: Boolean = false
 
@@ -53,8 +52,8 @@ class TsvReader(
                 .firstOrNull { (_, names) ->
                     names.any { name -> header.equals(name, ignoreCase = true) }
                     // if it exists, register the index
-                }?.let { (annotationType, _) ->
-                    columnIndices[annotationType] = index
+                }?.let { (annotation, _) ->
+                    columnIndices[annotation] = index
                 }
         }
     }
@@ -72,18 +71,19 @@ class TsvReader(
         val values: List<String> = line.split("\t")
 
         // Retrieve values
-        val mutAnnot: Map<Annotation, String> = buildMap {
+        val annotations: Map<Annotation, String> = buildMap {
             for (column in columnIndices.entries) {
-                getColumn(column.value, values)?.let { put(column.key, it) }
+                columnOrNull(column.value, values)?.let { put(column.key, it) }
             }
         }
-        terms += Term(wordID(), offset, mutAnnot)
-        offset += mutAnnot[Annotation.TOKEN]?.length ?: 0
+        Term(wordID(), offset, annotations).also {
+            terms += it
+            offset += it.token.length
+        }
     }
 
     // Retrieves a column with bounds checking.
-    private fun getColumn(index: Int, values: List<String>): String? =
-        values.getOrNull(index)?.takeIf { it.isNotBlank() }
+    private fun columnOrNull(i: Int, values: List<String>): String? = values.getOrNull(i)?.ifBlank { null }
 
     companion object {
         val columnNames: Map<Annotation, List<String>> = mapOf(
 
@@ -23,7 +23,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
     protected val reader: XMLStreamReader by lazy { XmlUtil.inputFactory.createXMLStreamReader(stream) }
 
     private val currentXmlID: String?
-        get() = reader.getAttributeValue(XMLConstants.XML_NS_URI, "id")?.takeIf { it.isNotBlank() }
+        get() = reader.getAttributeValue(XMLConstants.XML_NS_URI, "id")?.ifBlank { null }
     private var ignoring: Boolean = false
 
     abstract val spanTags: Array<String>
@@ -92,7 +92,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
 
     fun newSpan() {
         if (spanValue == null) return
-        val indices = spanTargets.map { id -> terms.indexOfFirst { t -> t.id == id } }.toIntArray()
+        val indices = spanTargets.map { id -> terms.indexOfFirst { t -> t.id == id } }
         spans.getOrPut(Annotation.NER, ::mutableListOf) += TermSpan(indices, spanValue!!)
         spanValue = null
         spanTargets.clear()
@@ -103,7 +103,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
     private fun shouldIgnore(): Boolean = ignoring.also { ignoring = reader.localName in ignorableTags }
 
     private fun parseChars() {
-        val words = reader.text.takeIf { it.isNotBlank() }?.split(whitespace) ?: emptyList()
+        val words = reader.text.ifBlank { null }?.split(whitespace) ?: emptyList()
         for ((j, word) in words.withIndex()) {
             if (j > 0) newWordform()
             literal += word
@@ -113,8 +113,8 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
     override fun newWordform() {
         if (literal.isBlank()) return
         val annotations = buildMap {
-            lemma?.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
-            pos?.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
+            lemma?.ifBlank { null }?.let { put(Annotation.LEMMA, it) }
+            pos?.ifBlank { null }?.let { put(Annotation.POS, it) }
             put(Annotation.TOKEN, literal)
         }
         terms += Term(wordID(), offset, annotations, spaceAfter)
Original file line number	Diff line number	Diff line change
`@@ -24,4 +24,3 @@ enum class Annotation(@JsonValue val value: String) {`
`24`	`24`	`}`
`25`	`25`	`}`
`26`	`26`
`27`		`-typealias Annotations = Map<Annotation, String?>`