Skip to content

Commit 5d00ece

Browse files
committed
ignore whitespace only within a w-tag
1 parent 0afee27 commit 5d00ece

5 files changed

Lines changed: 75 additions & 5 deletions

File tree

server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
3030
private var ignoring: Boolean = false
3131
private var currentDepth: Int = 0
3232
private var ignoreDepth: Int? = null
33+
private var insideWordTag: Boolean = false
3334

3435
abstract val nerTags: Array<String>
3536
abstract val documentTags: Array<String>
@@ -66,7 +67,7 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
6667
in documentTags -> docID = currentXmlID
6768
in paragraphTags -> parID = currentXmlID
6869
in sentenceTags -> sentID = currentXmlID
69-
in wordTags -> wordID = currentXmlID
70+
in wordTags -> { insideWordTag = true; wordID = currentXmlID }
7071
}
7172
}
7273

@@ -170,14 +171,22 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
170171
}
171172

172173
private fun parseChars() {
173-
val words = reader.text.ifBlank { null }?.trim()?.split(whitespace) ?: emptyList()
174-
for ((j, word) in words.withIndex()) {
175-
if (j > 0) newWordform()
176-
literal += word
174+
// if we are within a word tag, simply add all text, given that there are no (valid) word boundaries
175+
if (insideWordTag) {
176+
reader.text.ifBlank { null }?.trim()?.let { literal += it}
177+
}
178+
// Outside a word tag, treat spaces as new words
179+
else {
180+
val words = reader.text.ifBlank { null }?.split(whitespace) ?: emptyList()
181+
for ((j, word) in words.withIndex()) {
182+
if (j > 0) newWordform()
183+
literal += word
184+
}
177185
}
178186
}
179187

180188
override fun newWordform() {
189+
insideWordTag = false
181190
if (literal.isBlank()) return
182191
val annotations = buildMap {
183192
lemma?.ifBlank { null }?.let { put(Annotation.LEMMA, it) }

server/src/test/kotlin/org/ivdnt/galahad/formats/tei/TeiReaderTest.kt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,11 @@ internal class TeiReaderTest : ReaderTest() {
2626
assertLayerAndText("formats/tei/reader/mixed-tags/")
2727
}
2828

29+
@Test
30+
fun `Import TEI with whitespace in w tags`() {
31+
assertLayerAndText("formats/tei/reader/space-in-tag/")
32+
}
33+
2934
@Test
3035
fun `Import TEI with w-tags without spaces in between`() {
3136
assertLayerAndText("formats/tei/reader/nospaces")
Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
<TEI>
2+
<text>
3+
<p>
4+
<w lemma="fraai"><join n="mw_fraai"/> fraaie</w>
5+
<w lemma="historie"> historie <join n="mw_historie"/></w>
6+
<w lemma="en"> ende <join n="mw_en"/></w>
7+
<w lemma="alwaar"><join n="mw_alwaar"/> alwaer</w>
8+
</p>
9+
</text>
10+
</TEI>
Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
{
2+
"documents" : [ {
3+
"id" : "d1",
4+
"paragraphs" : [ {
5+
"id" : "d1.p1",
6+
"sentences" : [ {
7+
"id" : "d1.p1.s1",
8+
"terms" : [ {
9+
"annotations" : {
10+
"token" : "fraaie",
11+
"lemma" : "fraai",
12+
"group" : "mw_fraai"
13+
},
14+
"id" : "d1.p1.s1.w1",
15+
"offset" : 0
16+
}, {
17+
"annotations" : {
18+
"token" : "historie",
19+
"lemma" : "historie",
20+
"group" : "mw_historie"
21+
},
22+
"id" : "d1.p1.s1.w2",
23+
"offset" : 7
24+
}, {
25+
"annotations" : {
26+
"token" : "ende",
27+
"lemma" : "en",
28+
"group" : "mw_en"
29+
},
30+
"id" : "d1.p1.s1.w3",
31+
"offset" : 16
32+
}, {
33+
"annotations" : {
34+
"token" : "alwaer",
35+
"lemma" : "alwaar",
36+
"group" : "mw_alwaar"
37+
},
38+
"id" : "d1.p1.s1.w4",
39+
"offset" : 21
40+
} ]
41+
} ]
42+
} ]
43+
} ],
44+
"id" : "UUID"
45+
}
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
fraaie historie ende alwaer

0 commit comments

Comments (0)