|
1 | 1 | package org.ivdnt.galahad.formats.tei |
2 | 2 |
|
3 | | -import org.codehaus.stax2.XMLEventReader2 |
4 | | -import org.ivdnt.galahad.annotations.Annotation |
5 | | -import org.ivdnt.galahad.annotations.AnnotationReader |
6 | | -import org.ivdnt.galahad.annotations.Layer |
7 | | -import org.ivdnt.galahad.annotations.Term |
8 | | -import org.ivdnt.galahad.util.XmlUtil |
| 3 | +import org.ivdnt.galahad.formats.xml.XmlReader |
9 | 4 | import java.io.BufferedInputStream |
10 | | -import javax.xml.XMLConstants |
11 | | -import javax.xml.namespace.QName |
12 | | -import javax.xml.stream.XMLStreamConstants |
13 | | -import javax.xml.stream.events.StartElement |
14 | 5 |
|
15 | 6 | class AaltoTeiReader( |
16 | 7 | stream: BufferedInputStream, |
17 | | -) : AnnotationReader() { |
18 | | - private val reader: XMLEventReader2 by lazy { XmlUtil.inputFactory.createXMLEventReader(stream) as XMLEventReader2 } |
19 | | - private var literal: String = "" |
20 | | - private var ignoring: Boolean = false |
21 | | - |
22 | | - override fun read(): Layer { |
23 | | - parseTopLevelTextNodes() |
24 | | - return Layer(documents.toTypedArray()) |
25 | | - } |
26 | | - |
27 | | - /** |
28 | | - * Recursively enter each node and, if it is a top-level <text> node, |
29 | | - * i.e. a <text> node that is not contained in another <text> node, parse it. |
30 | | - */ |
31 | | - private fun parseTopLevelTextNodes() { |
32 | | - while (reader.hasNextEvent()) { |
33 | | - val event = reader.nextEvent() |
34 | | - when (event.eventType) { |
35 | | - XMLStreamConstants.START_ELEMENT -> { |
36 | | - val el = event.asStartElement() |
37 | | - if (el.name.localPart == "text") { |
38 | | - docID = |
39 | | - el.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() } |
40 | | - parseNodesIntoDocument() |
41 | | - newDocument() |
42 | | - } |
43 | | - } |
44 | | - } |
45 | | - } |
46 | | - } |
47 | | - |
48 | | - /** |
49 | | - * Parse a <text> node and its children into a Layer. |
50 | | - */ |
51 | | - private fun parseNodesIntoDocument() { |
52 | | - while (reader.hasNextEvent()) { |
53 | | - val event = reader.nextEvent() |
54 | | - when (event.eventType) { |
55 | | - XMLStreamConstants.START_ELEMENT -> { |
56 | | - val e = event.asStartElement() |
57 | | - val tag = e.name.localPart |
58 | | - |
59 | | - if (IGNORABLE_TAGS.contains(tag) || ignoring) { |
60 | | - ignoring = true |
61 | | - continue |
62 | | - } |
63 | | - |
64 | | - val id = e.getAttributeByName(QName(XMLConstants.XML_NS_URI, "id"))?.value?.takeIf { it.isNotBlank() } |
65 | | - |
66 | | - // handle text outside of a paragraph/sentence when we are currently at a new <p>/<s>. |
67 | | - // E.g.: <text> blabla <p> blabla </p> blabla </text> |
68 | | - newSentenceOrParagraph(tag, id) |
69 | | - |
70 | | - if (tag == "w" || tag == "pc") { |
71 | | - wordID = id |
72 | | - val lemma = e?.getAttributeByName(QName("lemma"))?.value?.takeIf { it.isNotBlank() } |
73 | | - val pos = e?.getAttributeByName(QName("pos"))?.value?.takeIf { it.isNotBlank() } ?: e?.getAttributeByName( |
74 | | - QName( |
75 | | - "type" |
76 | | - ) |
77 | | - )?.value?.takeIf { it.isNotBlank() } |
78 | | - } |
79 | | - } |
80 | | - |
81 | | - XMLStreamConstants.CHARACTERS -> { |
82 | | - if (ignoring) continue |
83 | | - val e = event.asCharacters() |
84 | | - |
85 | | - val words = e.data.trim().split(whitespace) |
86 | | - for ((j, word) in words.withIndex()) { |
87 | | - if (j > 0) newWordform() |
88 | | - literal += word |
89 | | - } |
90 | | - } |
91 | | - |
92 | | - XMLStreamConstants.END_ELEMENT -> { |
93 | | - val e = event.asEndElement() |
94 | | - |
95 | | - val tag = e.name.localPart |
96 | | - if (IGNORABLE_TAGS.contains(tag)) { |
97 | | - ignoring = false |
98 | | - } |
99 | | - if (ignoring) continue |
100 | | - if (tag == "w") { |
101 | | - newWordform() |
102 | | - } |
103 | | - } |
104 | | - } |
105 | | - } |
106 | | - } |
107 | | - |
108 | | - private fun newSentenceOrParagraph(tag: String, id: String?) { |
109 | | - if (tag in PARAGRAPH_TAGS) { |
110 | | - parID = id |
111 | | - newParagraph() |
112 | | - } else if (tag in SENTENCE_TAGS) { |
113 | | - sentID = id |
114 | | - newSentence() |
115 | | - } |
116 | | - } |
117 | | - |
118 | | - override fun newSentence() { |
119 | | - newWordform() |
120 | | - super.newSentence() |
121 | | - } |
122 | | - |
123 | | - private fun newWordform() { |
124 | | - if (literal.isBlank()) return |
125 | | - val term = Term(wordID(), offset, mapOf(Annotation.TOKEN to literal)) |
126 | | - terms.add(term) |
127 | | - offset += literal.length |
128 | | - literal = "" |
129 | | - } |
130 | | - |
131 | | - private fun newWordform(tag: String? = null, id: String? = null, e: StartElement? = null) { |
132 | | - if (literal.isBlank()) return |
133 | | - |
134 | | - val annotations = mutableMapOf<Annotation, String>() |
135 | | - |
136 | | - |
137 | | -// lemma?.let { annotations[Annotation.LEMMA] = it } |
138 | | -// pos?.let { annotations[Annotation.POS] = it } |
139 | | - if (tag == "pc") annotations[Annotation.POS] = "PC" |
140 | | - annotations[Annotation.TOKEN] = literal |
141 | | - |
142 | | - terms += Term(wordID(), offset, annotations, spaceAfter(e)) |
143 | | - offset += literal.length |
144 | | - literal = "" |
145 | | - } |
146 | | - |
147 | | - /** |
148 | | - * No space after if: |
149 | | - * - join="right" or "both" on this element |
150 | | - * - No space between this and the next element (inline xml) |
151 | | - * - join="left" or "both" on the next element, skipping any next text nodes |
152 | | - * |
153 | | - * Else, space after. |
154 | | - */ |
155 | | - private fun spaceAfter(e: StartElement?): Boolean { |
156 | | - // join="right" or "both" on this element |
157 | | - val join = e?.getAttributeByName(QName("join"))?.value |
158 | | - return join !in arrayOf("right", "both") |
159 | | -// val next = reader.peek() |
160 | | -// if (next.eventType == |
161 | | -// val nextEvent = reader.nextTag() |
162 | | -// val nextJoin = reader.getAttributeValue(null, "join") |
163 | | -// return !(nextJoin == "left" || nextJoin == "both") |
| 8 | +) : XmlReader(stream) { |
| 9 | + override val documentTags: Array<String> = DOCUMENT_TAGS |
| 10 | + override val paragraphTags: Array<String> = PARAGRAPH_TAGS |
| 11 | + override val sentenceTags: Array<String> = SENTENCE_TAGS |
| 12 | + override val wordTags: Array<String> = WORD_TAGS |
| 13 | + override val ignorableTags: Array<String> = IGNORABLE_TAGS |
| 14 | + override val wordDataTags: Array<String> = WORD_DATA_TAGS |
| 15 | + |
| 16 | + override fun parseWordData() { |
| 17 | + lemma = reader.getAttributeValue(null, "lemma")?.takeIf { it.isNotBlank() } |
| 18 | + pos = reader.getAttributeValue(null, "pos")?.takeIf { it.isNotBlank() } |
| 19 | + ?: reader.getAttributeValue(null, "type")?.takeIf { it.isNotBlank() } |
| 20 | + spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both") |
164 | 21 | } |
165 | 22 |
|
166 | 23 | companion object { |
167 | | - private val whitespace: Regex = Regex("""\s+""") |
| 24 | + private val DOCUMENT_TAGS = arrayOf("text") |
| 25 | + private val WORD_TAGS = arrayOf("w", "pc") |
| 26 | + private val WORD_DATA_TAGS = arrayOf("w", "pc") |
168 | 27 | private val PARAGRAPH_TAGS = arrayOf( |
169 | 28 | "text", // top most <text> defines a document, any other <text> is treated as a paragraph |
170 | 29 | "body", |
|
0 commit comments