File tree Expand file tree Collapse file tree
main/kotlin/org/ivdnt/galahad/formats/txt
kotlin/org/ivdnt/galahad/formats/txt
resources/formats/txt/reader/whitespace Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -18,7 +18,7 @@ class TxtReader(
1818 file.forEachLine {
1919 if (it.isNotBlank()) {
2020 // split on whitespace
21- for (word in it.split(Regex (""" \s+""" ))) {
21+ for (word in it.trim(). split(Regex (""" \s+""" ))) {
2222 terms + = Term (wordID(), offset, mapOf (Annotation .TOKEN to word))
2323 offset + = word.length + 1 // +1 for space/LF
2424 }
Original file line number Diff line number Diff line change 1+ package org.ivdnt.galahad.formats.txt
2+
3+ import org.ivdnt.galahad.documents.DocumentFormat
4+ import org.ivdnt.galahad.exceptions.DocumentInvalidException
5+ import org.ivdnt.galahad.formats.ReaderTest
6+ import org.ivdnt.galahad.formats.tsv.TsvFile
7+ import org.ivdnt.galahad.util.TestUtil
8+ import org.junit.jupiter.api.Test
9+ import org.junit.jupiter.api.assertThrows
10+
11+ class TxtReaderTest : ReaderTest () {
12+ override val format: DocumentFormat = DocumentFormat .Txt
13+
14+ @Test
15+ fun `Ignore trailing whitespace before and after sentence` () {
16+ assertLayerAndText(" formats/txt/reader/whitespace" )
17+ }
18+ }
Original file line number Diff line number Diff line change 1+ Fraaie historie ende alwaer.
2+ Magh 'k u vertellen, hoirt naer.
3+
4+ 't Was op enen avondstonde.
5+ Dat koning Carel slaepen beghonde.
6+
7+ "Martijn, slaepstu? slaept dijn sin?"
8+ Sprec! hebstu gheen spreken in?
9+
10+ Du dinkes mi verdoren.
11+ Dune achtes meer no min
Original file line number Diff line number Diff line change 1+ {
2+ "documents" : [ {
3+ "id" : " d1" ,
4+ "paragraphs" : [ {
5+ "id" : " d1.p1" ,
6+ "sentences" : [ {
7+ "id" : " d1.p1.s1" ,
8+ "terms" : [ {
9+ "annotations" : {
10+ "token" : " Fraaie"
11+ },
12+ "id" : " d1.p1.s1.w1" ,
13+ "offset" : 0
14+ }, {
15+ "annotations" : {
16+ "token" : " historie"
17+ },
18+ "id" : " d1.p1.s1.w2" ,
19+ "offset" : 7
20+ }, {
21+ "annotations" : {
22+ "token" : " ende"
23+ },
24+ "id" : " d1.p1.s1.w3" ,
25+ "offset" : 16
26+ }, {
27+ "annotations" : {
28+ "token" : " alwaer."
29+ },
30+ "id" : " d1.p1.s1.w4" ,
31+ "offset" : 21
32+ } ]
33+ }, {
34+ "id" : " d1.p1.s2" ,
35+ "terms" : [ {
36+ "annotations" : {
37+ "token" : " Magh"
38+ },
39+ "id" : " d1.p1.s2.w1" ,
40+ "offset" : 29
41+ }, {
42+ "annotations" : {
43+ "token" : " 'k"
44+ },
45+ "id" : " d1.p1.s2.w2" ,
46+ "offset" : 34
47+ }, {
48+ "annotations" : {
49+ "token" : " u"
50+ },
51+ "id" : " d1.p1.s2.w3" ,
52+ "offset" : 37
53+ }, {
54+ "annotations" : {
55+ "token" : " vertellen,"
56+ },
57+ "id" : " d1.p1.s2.w4" ,
58+ "offset" : 39
59+ }, {
60+ "annotations" : {
61+ "token" : " hoirt"
62+ },
63+ "id" : " d1.p1.s2.w5" ,
64+ "offset" : 50
65+ }, {
66+ "annotations" : {
67+ "token" : " naer."
68+ },
69+ "id" : " d1.p1.s2.w6" ,
70+ "offset" : 56
71+ } ]
72+ } ]
73+ }, {
74+ "id" : " d1.p2" ,
75+ "sentences" : [ {
76+ "id" : " d1.p2.s1" ,
77+ "terms" : [ {
78+ "annotations" : {
79+ "token" : " 't"
80+ },
81+ "id" : " d1.p2.s1.w1" ,
82+ "offset" : 62
83+ }, {
84+ "annotations" : {
85+ "token" : " Was"
86+ },
87+ "id" : " d1.p2.s1.w2" ,
88+ "offset" : 65
89+ }, {
90+ "annotations" : {
91+ "token" : " op"
92+ },
93+ "id" : " d1.p2.s1.w3" ,
94+ "offset" : 69
95+ }, {
96+ "annotations" : {
97+ "token" : " enen"
98+ },
99+ "id" : " d1.p2.s1.w4" ,
100+ "offset" : 72
101+ }, {
102+ "annotations" : {
103+ "token" : " avondstonde."
104+ },
105+ "id" : " d1.p2.s1.w5" ,
106+ "offset" : 77
107+ } ]
108+ }, {
109+ "id" : " d1.p2.s2" ,
110+ "terms" : [ {
111+ "annotations" : {
112+ "token" : " Dat"
113+ },
114+ "id" : " d1.p2.s2.w1" ,
115+ "offset" : 90
116+ }, {
117+ "annotations" : {
118+ "token" : " koning"
119+ },
120+ "id" : " d1.p2.s2.w2" ,
121+ "offset" : 94
122+ }, {
123+ "annotations" : {
124+ "token" : " Carel"
125+ },
126+ "id" : " d1.p2.s2.w3" ,
127+ "offset" : 101
128+ }, {
129+ "annotations" : {
130+ "token" : " slaepen"
131+ },
132+ "id" : " d1.p2.s2.w4" ,
133+ "offset" : 107
134+ }, {
135+ "annotations" : {
136+ "token" : " beghonde."
137+ },
138+ "id" : " d1.p2.s2.w5" ,
139+ "offset" : 115
140+ } ]
141+ } ]
142+ }, {
143+ "id" : " d1.p3" ,
144+ "sentences" : [ {
145+ "id" : " d1.p3.s1" ,
146+ "terms" : [ {
147+ "annotations" : {
148+ "token" : " \" Martijn,"
149+ },
150+ "id" : " d1.p3.s1.w1" ,
151+ "offset" : 125
152+ }, {
153+ "annotations" : {
154+ "token" : " slaepstu?"
155+ },
156+ "id" : " d1.p3.s1.w2" ,
157+ "offset" : 135
158+ }, {
159+ "annotations" : {
160+ "token" : " slaept"
161+ },
162+ "id" : " d1.p3.s1.w3" ,
163+ "offset" : 145
164+ }, {
165+ "annotations" : {
166+ "token" : " dijn"
167+ },
168+ "id" : " d1.p3.s1.w4" ,
169+ "offset" : 152
170+ }, {
171+ "annotations" : {
172+ "token" : " sin?\" "
173+ },
174+ "id" : " d1.p3.s1.w5" ,
175+ "offset" : 157
176+ } ]
177+ }, {
178+ "id" : " d1.p3.s2" ,
179+ "terms" : [ {
180+ "annotations" : {
181+ "token" : " Sprec!"
182+ },
183+ "id" : " d1.p3.s2.w1" ,
184+ "offset" : 163
185+ }, {
186+ "annotations" : {
187+ "token" : " hebstu"
188+ },
189+ "id" : " d1.p3.s2.w2" ,
190+ "offset" : 170
191+ }, {
192+ "annotations" : {
193+ "token" : " gheen"
194+ },
195+ "id" : " d1.p3.s2.w3" ,
196+ "offset" : 177
197+ }, {
198+ "annotations" : {
199+ "token" : " spreken"
200+ },
201+ "id" : " d1.p3.s2.w4" ,
202+ "offset" : 183
203+ }, {
204+ "annotations" : {
205+ "token" : " in?"
206+ },
207+ "id" : " d1.p3.s2.w5" ,
208+ "offset" : 191
209+ } ]
210+ } ]
211+ }, {
212+ "id" : " d1.p4" ,
213+ "sentences" : [ {
214+ "id" : " d1.p4.s1" ,
215+ "terms" : [ {
216+ "annotations" : {
217+ "token" : " Du"
218+ },
219+ "id" : " d1.p4.s1.w1" ,
220+ "offset" : 195
221+ }, {
222+ "annotations" : {
223+ "token" : " dinkes"
224+ },
225+ "id" : " d1.p4.s1.w2" ,
226+ "offset" : 198
227+ }, {
228+ "annotations" : {
229+ "token" : " mi"
230+ },
231+ "id" : " d1.p4.s1.w3" ,
232+ "offset" : 205
233+ }, {
234+ "annotations" : {
235+ "token" : " verdoren."
236+ },
237+ "id" : " d1.p4.s1.w4" ,
238+ "offset" : 208
239+ } ]
240+ }, {
241+ "id" : " d1.p4.s2" ,
242+ "terms" : [ {
243+ "annotations" : {
244+ "token" : " Dune"
245+ },
246+ "id" : " d1.p4.s2.w1" ,
247+ "offset" : 218
248+ }, {
249+ "annotations" : {
250+ "token" : " achtes"
251+ },
252+ "id" : " d1.p4.s2.w2" ,
253+ "offset" : 223
254+ }, {
255+ "annotations" : {
256+ "token" : " meer"
257+ },
258+ "id" : " d1.p4.s2.w3" ,
259+ "offset" : 230
260+ }, {
261+ "annotations" : {
262+ "token" : " no"
263+ },
264+ "id" : " d1.p4.s2.w4" ,
265+ "offset" : 235
266+ }, {
267+ "annotations" : {
268+ "token" : " min"
269+ },
270+ "id" : " d1.p4.s2.w5" ,
271+ "offset" : 238
272+ } ]
273+ } ]
274+ } ]
275+ } ],
276+ "id" : " UUID"
277+ }
Original file line number Diff line number Diff line change 1+ Fraaie historie ende alwaer.
2+ Magh 'k u vertellen, hoirt naer.
3+
4+ 't Was op enen avondstonde.
5+ Dat koning Carel slaepen beghonde.
6+
7+ "Martijn, slaepstu? slaept dijn sin?"
8+ Sprec! hebstu gheen spreken in?
9+
10+ Du dinkes mi verdoren.
11+ Dune achtes meer no min
You can’t perform that action at this time.
0 commit comments