make tei reader use xml reader

PrinsINT · PrinsINT · commit 8fe7e22fb1b0 · 2025-04-17T14:40:36.000+02:00
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/folia/AaltoFoliaReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/folia/AaltoFoliaReader.kt
@@ -15,8 +15,8 @@ class AaltoFoliaReader(
 
     override fun parseWordData() {
         when (reader.localName) {
-            "pos" -> pos = reader.getAttributeValue(null, "class") ?: ""
-            "lemma" -> lemma = reader.getAttributeValue(null, "class") ?: ""
+            "pos" -> pos = reader.getAttributeValue(null, "class")
+            "lemma" -> lemma = reader.getAttributeValue(null, "class")
             "w" -> spaceAfter = reader.getAttributeValue(null, "space") != "no"
         }
     }
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/tei/AaltoTeiReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/tei/AaltoTeiReader.kt
@@ -1,170 +1,29 @@
 package org.ivdnt.galahad.formats.tei
 
-import org.codehaus.stax2.XMLEventReader2
-import org.ivdnt.galahad.annotations.Annotation
-import org.ivdnt.galahad.annotations.AnnotationReader
-import org.ivdnt.galahad.annotations.Layer
-import org.ivdnt.galahad.annotations.Term
-import org.ivdnt.galahad.util.XmlUtil
+import org.ivdnt.galahad.formats.xml.XmlReader
 import java.io.BufferedInputStream
-import javax.xml.XMLConstants
-import javax.xml.namespace.QName
-import javax.xml.stream.XMLStreamConstants
-import javax.xml.stream.events.StartElement
 
 class AaltoTeiReader(
     stream: BufferedInputStream,
-) : AnnotationReader() {
-    private val reader: XMLEventReader2 by lazy { XmlUtil.inputFactory.createXMLEventReader(stream) as XMLEventReader2 }
-    private var literal: String = ""
-    private var ignoring: Boolean = false
-
-    override fun read(): Layer {
-        parseTopLevelTextNodes()
-        return Layer(documents.toTypedArray())
-    }
-
-    /**
-     * Recursively enter each node and if it is top level <text> node,
-     * i.e. a <text> node that is not contained in another <text> node, parse it.
-     */
-    private fun parseTopLevelTextNodes() {
-        while (reader.hasNextEvent()) {
-            val event = reader.nextEvent()
-            when (event.eventType) {
-                XMLStreamConstants.START_ELEMENT -> {
-                    val el = event.asStartElement()
-                    if (el.name.localPart == "text") {
-                        docID =
-                            el.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() }
-                        parseNodesIntoDocument()
-                        newDocument()
-                    }
-                }
-            }
-        }
-    }
-
-    /**
-     * Parse a <text> node and its children into an Layer.
-     */
-    private fun parseNodesIntoDocument() {
-        while (reader.hasNextEvent()) {
-            val event = reader.nextEvent()
-            when (event.eventType) {
-                XMLStreamConstants.START_ELEMENT -> {
-                    val e = event.asStartElement()
-                    val tag = e.name.localPart
-
-                    if (IGNORABLE_TAGS.contains(tag) || ignoring) {
-                        ignoring = true
-                        continue
-                    }
-
-                    val id = e.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() }
-
-                    // handle text outside of a paragraph/sentence when we are currently at a new <p>/<s>.
-                    // E.g.: <text> blabla <p> blabla </p> blabla </text>
-                    newSentenceOrParagraph(tag, id)
-
-                    if (tag == "w" || tag == "pc") {
-                        wordID = id
-                        val lemma = e?.getAttributeByName(QName("lemma"))?.value?.takeIf { it.isNotBlank() }
-                        val pos = e?.getAttributeByName(QName("pos"))?.value?.takeIf { it.isNotBlank() } ?: e?.getAttributeByName(
-                            QName(
-                                "type"
-                            )
-                        )?.value?.takeIf { it.isNotBlank() }
-                    }
-                }
-
-                XMLStreamConstants.CHARACTERS -> {
-                    if (ignoring) continue
-                    val e = event.asCharacters()
-
-                    val words = e.data.trim().split(whitespace)
-                    for ((j, word) in words.withIndex()) {
-                        if (j > 0) newWordform()
-                        literal += word
-                    }
-                }
-
-                XMLStreamConstants.END_ELEMENT -> {
-                    val e = event.asEndElement()
-
-                    val tag = e.name.localPart
-                    if (IGNORABLE_TAGS.contains(tag)) {
-                        ignoring = false
-                    }
-                    if (ignoring) continue
-                    if (tag == "w") {
-                        newWordform()
-                    }
-                }
-            }
-        }
-    }
-
-    private fun newSentenceOrParagraph(tag: String, id: String?) {
-        if (tag in PARAGRAPH_TAGS) {
-            parID = id
-            newParagraph()
-        } else if (tag in SENTENCE_TAGS) {
-            sentID = id
-            newSentence()
-        }
-    }
-
-    override fun newSentence() {
-        newWordform()
-        super.newSentence()
-    }
-
-    private fun newWordform() {
-        if (literal.isBlank()) return
-        val term = Term(wordID(), offset, mapOf(Annotation.TOKEN to literal))
-        terms.add(term)
-        offset += literal.length
-        literal = ""
-    }
-
-    private fun newWordform(tag: String? = null, id: String? = null, e: StartElement? = null) {
-        if (literal.isBlank()) return
-
-        val annotations = mutableMapOf<Annotation, String>()
-
-
-//        lemma?.let { annotations[Annotation.LEMMA] = it }
-//        pos?.let { annotations[Annotation.POS] = it }
-        if (tag == "pc") annotations[Annotation.POS] = "PC"
-        annotations[Annotation.TOKEN] = literal
-
-        terms += Term(wordID(), offset, annotations, spaceAfter(e))
-        offset += literal.length
-        literal = ""
-    }
-
-    /**
-     * No space after if:
-     * - join="right" or "both" on this element
-     * - No space between this and the next element (inline xml)
-     * - join="left" or "both" on the next element, skipping any next text nodes
-     *
-     * Else, space after.
-     */
-    private fun spaceAfter(e: StartElement?): Boolean {
-        // join="right" or "both" on this element
-        val join = e?.getAttributeByName(QName("join"))?.value
-        return join !in arrayOf("right", "both")
-//        val next = reader.peek()
-//        if (next.eventType ==
-//        val nextEvent = reader.nextTag()
-//        val nextJoin = reader.getAttributeValue(null, "join")
-//        return !(nextJoin == "left" || nextJoin == "both")
+) : XmlReader(stream) {
+    override val documentTags: Array<String> = DOCUMENT_TAGS
+    override val paragraphTags: Array<String> = PARAGRAPH_TAGS
+    override val sentenceTags: Array<String> = SENTENCE_TAGS
+    override val wordTags: Array<String> = WORD_TAGS
+    override val ignorableTags: Array<String> = IGNORABLE_TAGS
+    override val wordDataTags: Array<String> = WORD_DATA_TAGS
+
+    override fun parseWordData() {
+        lemma = reader.getAttributeValue(null, "lemma")?.takeIf { it.isNotBlank() }
+        pos = reader.getAttributeValue(null, "pos")?.takeIf { it.isNotBlank() }
+            ?: reader.getAttributeValue(null, "type")?.takeIf { it.isNotBlank() }
+        spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both")
     }
 
     companion object {
-        private val whitespace: Regex = Regex("""\s+""")
+        private val DOCUMENT_TAGS = arrayOf("text")
+        private val WORD_TAGS = arrayOf("w", "pc")
+        private val WORD_DATA_TAGS = arrayOf("w", "pc")
         private val PARAGRAPH_TAGS = arrayOf(
             "text", // top most <text> defines a document, any other <text> is treated as a paragraph
             "body",
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt
@@ -11,8 +11,8 @@ import javax.xml.stream.XMLStreamConstants
 import javax.xml.stream.XMLStreamReader
 
 abstract class XmlReader(stream: InputStream) : AnnotationReader() {
-    protected var pos: String = ""
-    protected var lemma: String = ""
+    protected var pos: String? = null
+    protected var lemma: String? = null
     protected var literal: String = ""
     protected var spaceAfter: Boolean = true
     protected val reader: XMLStreamReader by lazy { XmlUtil.inputFactory.createXMLStreamReader(stream) }
@@ -89,8 +89,8 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
     private fun newWordform() {
         if (literal.isBlank()) return
         val annotations = buildMap {
-            lemma.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
-            pos.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
+            lemma?.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
+            pos?.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
             put(Annotation.TOKEN, literal)
         }
         terms += Term(wordID(), offset, annotations, spaceAfter)

Original file line number	Diff line number	Diff line change
`@@ -15,8 +15,8 @@ class AaltoFoliaReader(`
`15`	`15`
`16`	`16`	`override fun parseWordData() {`
`17`	`17`	`when (reader.localName) {`
`18`		`- "pos" -> pos = reader.getAttributeValue(null, "class") ?: ""`
`19`		`- "lemma" -> lemma = reader.getAttributeValue(null, "class") ?: ""`
	`18`	`+ "pos" -> pos = reader.getAttributeValue(null, "class")`
	`19`	`+ "lemma" -> lemma = reader.getAttributeValue(null, "class")`
`20`	`20`	`"w" -> spaceAfter = reader.getAttributeValue(null, "space") != "no"`
`21`	`21`	`}`
`22`	`22`	`}`