Skip to content

Commit 8fe7e22

Browse files
committed
make tei reader use xml reader
1 parent d0625fa commit 8fe7e22

3 files changed

Lines changed: 23 additions & 164 deletions

File tree

server/src/main/kotlin/org/ivdnt/galahad/formats/folia/AaltoFoliaReader.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ class AaltoFoliaReader(
1515

1616
override fun parseWordData() {
1717
when (reader.localName) {
18-
"pos" -> pos = reader.getAttributeValue(null, "class") ?: ""
19-
"lemma" -> lemma = reader.getAttributeValue(null, "class") ?: ""
18+
"pos" -> pos = reader.getAttributeValue(null, "class")
19+
"lemma" -> lemma = reader.getAttributeValue(null, "class")
2020
"w" -> spaceAfter = reader.getAttributeValue(null, "space") != "no"
2121
}
2222
}

server/src/main/kotlin/org/ivdnt/galahad/formats/tei/AaltoTeiReader.kt

Lines changed: 17 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -1,170 +1,29 @@
11
package org.ivdnt.galahad.formats.tei
22

3-
import org.codehaus.stax2.XMLEventReader2
4-
import org.ivdnt.galahad.annotations.Annotation
5-
import org.ivdnt.galahad.annotations.AnnotationReader
6-
import org.ivdnt.galahad.annotations.Layer
7-
import org.ivdnt.galahad.annotations.Term
8-
import org.ivdnt.galahad.util.XmlUtil
3+
import org.ivdnt.galahad.formats.xml.XmlReader
94
import java.io.BufferedInputStream
10-
import javax.xml.XMLConstants
11-
import javax.xml.namespace.QName
12-
import javax.xml.stream.XMLStreamConstants
13-
import javax.xml.stream.events.StartElement
145

156
class AaltoTeiReader(
167
stream: BufferedInputStream,
17-
) : AnnotationReader() {
18-
private val reader: XMLEventReader2 by lazy { XmlUtil.inputFactory.createXMLEventReader(stream) as XMLEventReader2 }
19-
private var literal: String = ""
20-
private var ignoring: Boolean = false
21-
22-
override fun read(): Layer {
23-
parseTopLevelTextNodes()
24-
return Layer(documents.toTypedArray())
25-
}
26-
27-
/**
28-
* Recursively enter each node and, if it is a top-level <text> node,
29-
* i.e. a <text> node that is not contained in another <text> node, parse it.
30-
*/
31-
private fun parseTopLevelTextNodes() {
32-
while (reader.hasNextEvent()) {
33-
val event = reader.nextEvent()
34-
when (event.eventType) {
35-
XMLStreamConstants.START_ELEMENT -> {
36-
val el = event.asStartElement()
37-
if (el.name.localPart == "text") {
38-
docID =
39-
el.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() }
40-
parseNodesIntoDocument()
41-
newDocument()
42-
}
43-
}
44-
}
45-
}
46-
}
47-
48-
/**
49-
* Parse a <text> node and its children into a Layer.
50-
*/
51-
private fun parseNodesIntoDocument() {
52-
while (reader.hasNextEvent()) {
53-
val event = reader.nextEvent()
54-
when (event.eventType) {
55-
XMLStreamConstants.START_ELEMENT -> {
56-
val e = event.asStartElement()
57-
val tag = e.name.localPart
58-
59-
if (IGNORABLE_TAGS.contains(tag) || ignoring) {
60-
ignoring = true
61-
continue
62-
}
63-
64-
val id = e.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() }
65-
66-
// handle text outside of a paragraph/sentence when we are currently at a new <p>/<s>.
67-
// E.g.: <text> blabla <p> blabla </p> blabla </text>
68-
newSentenceOrParagraph(tag, id)
69-
70-
if (tag == "w" || tag == "pc") {
71-
wordID = id
72-
val lemma = e?.getAttributeByName(QName("lemma"))?.value?.takeIf { it.isNotBlank() }
73-
val pos = e?.getAttributeByName(QName("pos"))?.value?.takeIf { it.isNotBlank() } ?: e?.getAttributeByName(
74-
QName(
75-
"type"
76-
)
77-
)?.value?.takeIf { it.isNotBlank() }
78-
}
79-
}
80-
81-
XMLStreamConstants.CHARACTERS -> {
82-
if (ignoring) continue
83-
val e = event.asCharacters()
84-
85-
val words = e.data.trim().split(whitespace)
86-
for ((j, word) in words.withIndex()) {
87-
if (j > 0) newWordform()
88-
literal += word
89-
}
90-
}
91-
92-
XMLStreamConstants.END_ELEMENT -> {
93-
val e = event.asEndElement()
94-
95-
val tag = e.name.localPart
96-
if (IGNORABLE_TAGS.contains(tag)) {
97-
ignoring = false
98-
}
99-
if (ignoring) continue
100-
if (tag == "w") {
101-
newWordform()
102-
}
103-
}
104-
}
105-
}
106-
}
107-
108-
private fun newSentenceOrParagraph(tag: String, id: String?) {
109-
if (tag in PARAGRAPH_TAGS) {
110-
parID = id
111-
newParagraph()
112-
} else if (tag in SENTENCE_TAGS) {
113-
sentID = id
114-
newSentence()
115-
}
116-
}
117-
118-
override fun newSentence() {
119-
newWordform()
120-
super.newSentence()
121-
}
122-
123-
private fun newWordform() {
124-
if (literal.isBlank()) return
125-
val term = Term(wordID(), offset, mapOf(Annotation.TOKEN to literal))
126-
terms.add(term)
127-
offset += literal.length
128-
literal = ""
129-
}
130-
131-
private fun newWordform(tag: String? = null, id: String? = null, e: StartElement? = null) {
132-
if (literal.isBlank()) return
133-
134-
val annotations = mutableMapOf<Annotation, String>()
135-
136-
137-
// lemma?.let { annotations[Annotation.LEMMA] = it }
138-
// pos?.let { annotations[Annotation.POS] = it }
139-
if (tag == "pc") annotations[Annotation.POS] = "PC"
140-
annotations[Annotation.TOKEN] = literal
141-
142-
terms += Term(wordID(), offset, annotations, spaceAfter(e))
143-
offset += literal.length
144-
literal = ""
145-
}
146-
147-
/**
148-
* No space after if:
149-
* - join="right" or "both" on this element
150-
* - No space between this and the next element (inline xml)
151-
* - join="left" or "both" on the next element, skipping any next text nodes
152-
*
153-
* Else, space after.
154-
*/
155-
private fun spaceAfter(e: StartElement?): Boolean {
156-
// join="right" or "both" on this element
157-
val join = e?.getAttributeByName(QName("join"))?.value
158-
return join !in arrayOf("right", "both")
159-
// val next = reader.peek()
160-
// if (next.eventType ==
161-
// val nextEvent = reader.nextTag()
162-
// val nextJoin = reader.getAttributeValue(null, "join")
163-
// return !(nextJoin == "left" || nextJoin == "both")
8+
) : XmlReader(stream) {
9+
override val documentTags: Array<String> = DOCUMENT_TAGS
10+
override val paragraphTags: Array<String> = PARAGRAPH_TAGS
11+
override val sentenceTags: Array<String> = SENTENCE_TAGS
12+
override val wordTags: Array<String> = WORD_TAGS
13+
override val ignorableTags: Array<String> = IGNORABLE_TAGS
14+
override val wordDataTags: Array<String> = WORD_DATA_TAGS
15+
16+
override fun parseWordData() {
17+
lemma = reader.getAttributeValue(null, "lemma")?.takeIf { it.isNotBlank() }
18+
pos = reader.getAttributeValue(null, "pos")?.takeIf { it.isNotBlank() }
19+
?: reader.getAttributeValue(null, "type")?.takeIf { it.isNotBlank() }
20+
spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both")
16421
}
16522

16623
companion object {
167-
private val whitespace: Regex = Regex("""\s+""")
24+
private val DOCUMENT_TAGS = arrayOf("text")
25+
private val WORD_TAGS = arrayOf("w", "pc")
26+
private val WORD_DATA_TAGS = arrayOf("w", "pc")
16827
private val PARAGRAPH_TAGS = arrayOf(
16928
"text", // top most <text> defines a document, any other <text> is treated as a paragraph
17029
"body",

server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ import javax.xml.stream.XMLStreamConstants
1111
import javax.xml.stream.XMLStreamReader
1212

1313
abstract class XmlReader(stream: InputStream) : AnnotationReader() {
14-
protected var pos: String = ""
15-
protected var lemma: String = ""
14+
protected var pos: String? = null
15+
protected var lemma: String? = null
1616
protected var literal: String = ""
1717
protected var spaceAfter: Boolean = true
1818
protected val reader: XMLStreamReader by lazy { XmlUtil.inputFactory.createXMLStreamReader(stream) }
@@ -89,8 +89,8 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
8989
private fun newWordform() {
9090
if (literal.isBlank()) return
9191
val annotations = buildMap {
92-
lemma.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
93-
pos.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
92+
lemma?.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
93+
pos?.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
9494
put(Annotation.TOKEN, literal)
9595
}
9696
terms += Term(wordID(), offset, annotations, spaceAfter)

0 commit comments

Comments
 (0)