Skip to content

Commit 86cdb1c

Browse files
committed
Txt reader: trim leading and trailing whitespace
1 parent 1fb2044 commit 86cdb1c

5 files changed

Lines changed: 318 additions & 1 deletion

File tree

server/src/main/kotlin/org/ivdnt/galahad/formats/txt/TxtReader.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class TxtReader(
1818
file.forEachLine {
1919
if (it.isNotBlank()) {
2020
// split on whitespace
21-
for (word in it.split(Regex("""\s+"""))) {
21+
for (word in it.trim().split(Regex("""\s+"""))) {
2222
terms += Term(wordID(), offset, mapOf(Annotation.TOKEN to word))
2323
offset += word.length + 1 // +1 for space/LF
2424
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package org.ivdnt.galahad.formats.txt
2+
3+
import org.ivdnt.galahad.documents.DocumentFormat
4+
import org.ivdnt.galahad.exceptions.DocumentInvalidException
5+
import org.ivdnt.galahad.formats.ReaderTest
6+
import org.ivdnt.galahad.formats.tsv.TsvFile
7+
import org.ivdnt.galahad.util.TestUtil
8+
import org.junit.jupiter.api.Test
9+
import org.junit.jupiter.api.assertThrows
10+
11+
/**
 * Reader tests for the plain-text document format.
 *
 * Extends [ReaderTest] and fixes [format] to [DocumentFormat.Txt].
 */
class TxtReaderTest : ReaderTest() {
12+
override val format: DocumentFormat = DocumentFormat.Txt
13+
14+
// Covers the `it.trim()` call added to TxtReader in this same commit.
@Test
15+
fun `Ignore trailing whitespace before and after sentence`() {
16+
// Runs the inherited check against the "formats/txt/reader/whitespace" resource folder.
// NOTE(review): assertLayerAndText is defined outside this view (presumably in ReaderTest);
// confirm which fixture files it compares.
assertLayerAndText("formats/txt/reader/whitespace")
17+
}
18+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
Fraaie historie ende alwaer.
2+
Magh 'k u vertellen, hoirt naer.
3+
4+
't Was op enen avondstonde.
5+
Dat koning Carel slaepen beghonde.
6+
7+
"Martijn, slaepstu? slaept dijn sin?"
8+
Sprec! hebstu gheen spreken in?
9+
10+
Du dinkes mi verdoren.
11+
Dune achtes meer no min
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
{
2+
"documents" : [ {
3+
"id" : "d1",
4+
"paragraphs" : [ {
5+
"id" : "d1.p1",
6+
"sentences" : [ {
7+
"id" : "d1.p1.s1",
8+
"terms" : [ {
9+
"annotations" : {
10+
"token" : "Fraaie"
11+
},
12+
"id" : "d1.p1.s1.w1",
13+
"offset" : 0
14+
}, {
15+
"annotations" : {
16+
"token" : "historie"
17+
},
18+
"id" : "d1.p1.s1.w2",
19+
"offset" : 7
20+
}, {
21+
"annotations" : {
22+
"token" : "ende"
23+
},
24+
"id" : "d1.p1.s1.w3",
25+
"offset" : 16
26+
}, {
27+
"annotations" : {
28+
"token" : "alwaer."
29+
},
30+
"id" : "d1.p1.s1.w4",
31+
"offset" : 21
32+
} ]
33+
}, {
34+
"id" : "d1.p1.s2",
35+
"terms" : [ {
36+
"annotations" : {
37+
"token" : "Magh"
38+
},
39+
"id" : "d1.p1.s2.w1",
40+
"offset" : 29
41+
}, {
42+
"annotations" : {
43+
"token" : "'k"
44+
},
45+
"id" : "d1.p1.s2.w2",
46+
"offset" : 34
47+
}, {
48+
"annotations" : {
49+
"token" : "u"
50+
},
51+
"id" : "d1.p1.s2.w3",
52+
"offset" : 37
53+
}, {
54+
"annotations" : {
55+
"token" : "vertellen,"
56+
},
57+
"id" : "d1.p1.s2.w4",
58+
"offset" : 39
59+
}, {
60+
"annotations" : {
61+
"token" : "hoirt"
62+
},
63+
"id" : "d1.p1.s2.w5",
64+
"offset" : 50
65+
}, {
66+
"annotations" : {
67+
"token" : "naer."
68+
},
69+
"id" : "d1.p1.s2.w6",
70+
"offset" : 56
71+
} ]
72+
} ]
73+
}, {
74+
"id" : "d1.p2",
75+
"sentences" : [ {
76+
"id" : "d1.p2.s1",
77+
"terms" : [ {
78+
"annotations" : {
79+
"token" : "'t"
80+
},
81+
"id" : "d1.p2.s1.w1",
82+
"offset" : 62
83+
}, {
84+
"annotations" : {
85+
"token" : "Was"
86+
},
87+
"id" : "d1.p2.s1.w2",
88+
"offset" : 65
89+
}, {
90+
"annotations" : {
91+
"token" : "op"
92+
},
93+
"id" : "d1.p2.s1.w3",
94+
"offset" : 69
95+
}, {
96+
"annotations" : {
97+
"token" : "enen"
98+
},
99+
"id" : "d1.p2.s1.w4",
100+
"offset" : 72
101+
}, {
102+
"annotations" : {
103+
"token" : "avondstonde."
104+
},
105+
"id" : "d1.p2.s1.w5",
106+
"offset" : 77
107+
} ]
108+
}, {
109+
"id" : "d1.p2.s2",
110+
"terms" : [ {
111+
"annotations" : {
112+
"token" : "Dat"
113+
},
114+
"id" : "d1.p2.s2.w1",
115+
"offset" : 90
116+
}, {
117+
"annotations" : {
118+
"token" : "koning"
119+
},
120+
"id" : "d1.p2.s2.w2",
121+
"offset" : 94
122+
}, {
123+
"annotations" : {
124+
"token" : "Carel"
125+
},
126+
"id" : "d1.p2.s2.w3",
127+
"offset" : 101
128+
}, {
129+
"annotations" : {
130+
"token" : "slaepen"
131+
},
132+
"id" : "d1.p2.s2.w4",
133+
"offset" : 107
134+
}, {
135+
"annotations" : {
136+
"token" : "beghonde."
137+
},
138+
"id" : "d1.p2.s2.w5",
139+
"offset" : 115
140+
} ]
141+
} ]
142+
}, {
143+
"id" : "d1.p3",
144+
"sentences" : [ {
145+
"id" : "d1.p3.s1",
146+
"terms" : [ {
147+
"annotations" : {
148+
"token" : "\"Martijn,"
149+
},
150+
"id" : "d1.p3.s1.w1",
151+
"offset" : 125
152+
}, {
153+
"annotations" : {
154+
"token" : "slaepstu?"
155+
},
156+
"id" : "d1.p3.s1.w2",
157+
"offset" : 135
158+
}, {
159+
"annotations" : {
160+
"token" : "slaept"
161+
},
162+
"id" : "d1.p3.s1.w3",
163+
"offset" : 145
164+
}, {
165+
"annotations" : {
166+
"token" : "dijn"
167+
},
168+
"id" : "d1.p3.s1.w4",
169+
"offset" : 152
170+
}, {
171+
"annotations" : {
172+
"token" : "sin?\""
173+
},
174+
"id" : "d1.p3.s1.w5",
175+
"offset" : 157
176+
} ]
177+
}, {
178+
"id" : "d1.p3.s2",
179+
"terms" : [ {
180+
"annotations" : {
181+
"token" : "Sprec!"
182+
},
183+
"id" : "d1.p3.s2.w1",
184+
"offset" : 163
185+
}, {
186+
"annotations" : {
187+
"token" : "hebstu"
188+
},
189+
"id" : "d1.p3.s2.w2",
190+
"offset" : 170
191+
}, {
192+
"annotations" : {
193+
"token" : "gheen"
194+
},
195+
"id" : "d1.p3.s2.w3",
196+
"offset" : 177
197+
}, {
198+
"annotations" : {
199+
"token" : "spreken"
200+
},
201+
"id" : "d1.p3.s2.w4",
202+
"offset" : 183
203+
}, {
204+
"annotations" : {
205+
"token" : "in?"
206+
},
207+
"id" : "d1.p3.s2.w5",
208+
"offset" : 191
209+
} ]
210+
} ]
211+
}, {
212+
"id" : "d1.p4",
213+
"sentences" : [ {
214+
"id" : "d1.p4.s1",
215+
"terms" : [ {
216+
"annotations" : {
217+
"token" : "Du"
218+
},
219+
"id" : "d1.p4.s1.w1",
220+
"offset" : 195
221+
}, {
222+
"annotations" : {
223+
"token" : "dinkes"
224+
},
225+
"id" : "d1.p4.s1.w2",
226+
"offset" : 198
227+
}, {
228+
"annotations" : {
229+
"token" : "mi"
230+
},
231+
"id" : "d1.p4.s1.w3",
232+
"offset" : 205
233+
}, {
234+
"annotations" : {
235+
"token" : "verdoren."
236+
},
237+
"id" : "d1.p4.s1.w4",
238+
"offset" : 208
239+
} ]
240+
}, {
241+
"id" : "d1.p4.s2",
242+
"terms" : [ {
243+
"annotations" : {
244+
"token" : "Dune"
245+
},
246+
"id" : "d1.p4.s2.w1",
247+
"offset" : 218
248+
}, {
249+
"annotations" : {
250+
"token" : "achtes"
251+
},
252+
"id" : "d1.p4.s2.w2",
253+
"offset" : 223
254+
}, {
255+
"annotations" : {
256+
"token" : "meer"
257+
},
258+
"id" : "d1.p4.s2.w3",
259+
"offset" : 230
260+
}, {
261+
"annotations" : {
262+
"token" : "no"
263+
},
264+
"id" : "d1.p4.s2.w4",
265+
"offset" : 235
266+
}, {
267+
"annotations" : {
268+
"token" : "min"
269+
},
270+
"id" : "d1.p4.s2.w5",
271+
"offset" : 238
272+
} ]
273+
} ]
274+
} ]
275+
} ],
276+
"id" : "UUID"
277+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
Fraaie historie ende alwaer.
2+
Magh 'k u vertellen, hoirt naer.
3+
4+
't Was op enen avondstonde.
5+
Dat koning Carel slaepen beghonde.
6+
7+
"Martijn, slaepstu? slaept dijn sin?"
8+
Sprec! hebstu gheen spreken in?
9+
10+
Du dinkes mi verdoren.
11+
Dune achtes meer no min

0 commit comments

Comments
 (0)