Skip to content

Commit c3e3160

Browse files
committed
support plain text ner spans
1 parent 95035b7 commit c3e3160

5 files changed

Lines changed: 325 additions & 2080 deletions

File tree

server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,6 @@ class TeiReader(
2626
"type"
2727
)?.ifBlank { null }
2828
spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both")
29-
// if nerValue is not null, we are inside a NER span tag
30-
if (nerValue != null) {
31-
nerTargets += terms.size
32-
//nerTargets.add(reader.getAttributeValue(XMLConstants.XML_NS_URI, "id"))
33-
}
3429
}
3530

3631
in GROUP_TAGS -> {

server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,18 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
6262
while (reader.hasNext()) {
6363
when (reader.next()) {
6464
XMLStreamConstants.START_ELEMENT -> if (!shouldIgnore()) {
65+
// Entering a word tag or a NER tag closes the previous wordform
66+
if (reader.localName in wordTags || reader.localName in nerTags) {
67+
newWordform()
68+
insideWordTag = reader.localName in wordTags
69+
}
70+
// Parse the new tag
6571
parseAttrs()
6672
when (reader.localName) {
6773
in documentTags -> docID = currentXmlID
6874
in paragraphTags -> parID = currentXmlID
6975
in sentenceTags -> sentID = currentXmlID
70-
in wordTags -> { insideWordTag = true; wordID = currentXmlID }
76+
in wordTags -> { wordID = currentXmlID }
7177
}
7278
}
7379

@@ -143,6 +149,7 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
143149
}
144150

145151
private fun newSpan() {
152+
newWordform();
146153
if (nerValue == null) return
147154
spans.getOrPut(Annotation.NER, ::mutableListOf) += TermSpan(nerTargets, nerValue!!)
148155
nerValue = null
@@ -194,6 +201,10 @@ abstract class XmlReader(stream: InputStream) : LayerReader() {
194201
group?.ifBlank { null }?.let { put(Annotation.GROUP, it) }
195202
put(Annotation.TOKEN, literal)
196203
}
204+
// If inside a NER span, register this term's index as one of the span's targets
205+
if (nerValue != null) {
206+
nerTargets += terms.size
207+
}
197208
terms += Term(wordID(), offset, annotations, spaceAfter)
198209
offset += literal.length
199210
if (spaceAfter) offset++
Lines changed: 123 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -1,177 +1,126 @@
1-
<TEI xmlns:egXML="http://www.tei-c.org/ns/Examples" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dump="http://www.masereeuw.nl/xslt/dump-xml" xmlns:ivdnt="http://www.ivdnt.org/xslt/namespaces" xmlns="http://www.tei-c.org/ns/1.0">
1+
<?xml version='1.0' encoding='UTF-8'?>
2+
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xml="http://www.w3.org/XML/1998/namespace" xml:id="e51560ff-81a2-4ddd-ba04-c7eb07af6d2b">
23
<teiHeader>
4+
<fileDesc>
5+
<titleStmt>
6+
<title>karel_en_martijn</title>
7+
<respStmt>
8+
<resp>linguistic annotation by GaLAHaD (https://galahad.ivdnt.org)</resp>
9+
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName>
10+
<orgName xml:lang="en">Dutch Language Institute</orgName>
11+
</respStmt>
12+
<respStmt>
13+
<resp>exported as tei-p5 by GaLAHaD (https://galahad.ivdnt.org)</resp>
14+
<orgName xml:lang="nl">Instituut voor de Nederlandse Taal</orgName>
15+
<orgName xml:lang="en">Dutch Language Institute</orgName>
16+
</respStmt>
17+
</titleStmt>
18+
<publicationStmt>
19+
<publisher>!Needs to be filled in!</publisher>
20+
<idno type="sourceID">karel_en_martijn</idno>
21+
<idno type="GaLAHaDPersistentIdentifier">e51560ff-81a2-4ddd-ba04-c7eb07af6d2b_tei</idno>
22+
</publicationStmt>
23+
<notesStmt>
24+
<note resp="GaLAHaD" type="corpusName">testCorpus</note>
25+
<note resp="GaLAHaD" type="sourceCollection">source name</note>
26+
<note resp="GaLAHaD" type="sourceCollectionURL">http://source.url</note>
27+
</notesStmt>
28+
<sourceDesc>
29+
<ab>
30+
<idno type="sourceID">karel_en_martijn</idno>
31+
</ab>
32+
<ab type="date">
33+
<date from="1200" to="1300"/>
34+
</ab>
35+
</sourceDesc>
36+
</fileDesc>
37+
<encodingDesc>
38+
<appInfo resp="GaLAHaD">
39+
<application xml:id="sourceLayer" ident="sourceLayer" version="">
40+
<label>POS-tagger and lemmatiser</label>
41+
<ptr target=""/>
42+
</application>
43+
</appInfo>
44+
<editorialDecl resp="GaLAHaD">
45+
<interpretation xml:id="A0001">
46+
<ab type="linguisticAnnotation" subtype="POS-tagging_lemmatisation">
47+
<interGrp type="annotationStyle">
48+
<interp>inline</interp>
49+
</interGrp>
50+
<interGrp type="Documentation">
51+
<interp/>
52+
</interGrp>
53+
<interGrp type="annotationSet">
54+
<interp>TDN-Core</interp>
55+
</interGrp>
56+
<interGrp type="annotationDescription">
57+
<interp>The file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.</interp>
58+
</interGrp>
59+
<interGrp type="annotationFormat">
60+
<interp>TEI xml</interp>
61+
</interGrp>
62+
</ab>
63+
<ab type="linguisticAnnotation" subtype="POS-tagging_lemmatisationProvenance1">
64+
<interGrp type="annotationMode">
65+
<interp>automatically annotated</interp>
66+
</interGrp>
67+
<interpGrp type="processor">
68+
<interp sameAs="#sourceLayer"/>
69+
</interpGrp>
70+
<date from="2025-07-30" to="2025-07-30"/>
71+
</ab>
72+
</interpretation>
73+
</editorialDecl>
74+
</encodingDesc>
75+
<profileDesc>
76+
<langUsage>
77+
<language ident="nld">
78+
Dutch
79+
<interGrp type="dominantLanguage">
80+
<interp>true</interp>
81+
</interGrp>
82+
</language>
83+
</langUsage>
84+
</profileDesc>
385
</teiHeader>
4-
<text>
5-
<body>
6-
<p>n<hi rend="bold">e</hi>t <milestone n="1va" unit="fol"/> <hi rend="bold">w</hi><hi rend="bold">a</hi><hi rend="bold">s</hi> ik <hi rend="bold">n</hi>aar sc<hi rend="bold">h</hi><hi rend="bold">o</hi>o<hi rend="bold">l</hi> heen en t<hi rend="bold">e</hi>r<hi rend="bold">u</hi>g w<hi rend="bold">eze</hi>n lope<hi rend="bold">n</hi> <hi rend="bold">enz</hi>o</p>
7-
8-
<pc lemma="onzin">.</pc>
9-
<pc pos="onzin">.</pc>
10-
<pc>.</pc>
11-
<w lemma="onzin">scholen</w>
12-
<w pos="onzin">scholen</w>
13-
14-
<w> scholen </w>
15-
<w>scho len</w>
16-
<w>scho
17-
len</w>
18-
19-
<p><w>schrij<hi>ven</hi>de</w></p>
20-
21-
<p>
22-
electu<p>arien zijnde</p>
23-
</p>
24-
25-
<p>
26-
electu<p>
27-
arien</p>
28-
</p>
29-
30-
<q>
31-
electu<p>arien</p>
32-
</q>
33-
34-
<q>
35-
electu<p>
36-
arien</p>
37-
</q>
38-
39-
<p><hi rend="bold">test</hi>en</p>
40-
41-
<p><hi rend="bold">test</hi>en woord</p>
42-
43-
<p>test<hi rend="bold">en</hi></p>
44-
<p>woord test<hi rend="bold">en</hi></p>
45-
46-
<p>woord <hi rend="bold">test</hi>en woord</p>
47-
<p>woord test<hi rend="bold">en</hi> woord</p>
48-
49-
<p>test<hi>en </hi></p>
50-
<p><hi> test</hi>en</p>
51-
52-
53-
<p><hi rend="bold">cba fed<expan>abc def</expan>ghi jkl</hi>mno</p>
54-
<p>
55-
abc <hi class="bold">def ghi<hi class="italic">jkl</hi><hi class="red"><hi class="sub">mno</hi> pqr</hi>stu xyz</hi>abc def
56-
</p>
57-
<p>
58-
To ob<hi class="italic">ey</hi> or n<hi class="bold">o</hi>t to be,<br />that is the <hi class="bold">one <hi class="red">question</hi> that y<hi class="italic">re</hi>main</hi>ed.
59-
</p>
60-
61-
<!-- 2 levels deep-->
62-
<p><hi rend="bold"><expan>abc</expan></hi></p>
63-
<p><hi rend="bold"><expan>abc</expan></hi>def</p>
64-
<p>abc<hi rend="bold"><expan>def</expan></hi></p>
65-
66-
<p><hi rend="bold">abc<expan>def</expan></hi></p>
67-
<p>abc<hi rend="bold">def<expan>ghi</expan></hi></p>
68-
<p><hi rend="bold">def<expan>ghi</expan></hi>jkl</p>
69-
<p>abc<hi rend="bold">def<expan>ghi</expan></hi>jkl</p>
70-
71-
<p><hi rend="bold"><expan>abc</expan>def</hi></p>
72-
<p><hi rend="bold"><expan>abc</expan>def</hi>ghi</p>
73-
<p>abc<hi rend="bold"><expan>def</expan>ghi</hi></p>
74-
<p>abc<hi rend="bold"><expan>def</expan>ghi</hi>jkl</p>
75-
76-
<p><hi rend="bold">cba<expan>abc</expan>def</hi></p>
77-
<p><hi rend="bold">cba<expan>abc</expan>def</hi>ghi</p>
78-
<p>abc<hi rend="bold">cba<expan>def</expan>ghi</hi></p>
79-
<p>abc<hi rend="bold">cba<expan>def</expan>ghi</hi>jkl</p>
80-
81-
82-
<p>
83-
<w xml:id="enge022vand01_01.TEI.2.text.body.div.lg.7633.s.1.w.11" pos="PD(type=d-p,subtype=art,position=prenom)" lemma="de">die</w>
84-
<w xml:id="enge022vand01_01.TEI.2.text.body.div.lg.7633.s.1.w.12" pos="NOU-C(number=sg)" n="mw_288" lemma="droogheid">droecheit<join n="mw_288"/>
85-
</w>
86-
<w pos="NOU-C(number=pl)" xml:id="w.185" lemma="rebel" lexicon="molex"><seg>rebellen</seg></w><pc xml:id="pc.000005" type="." lemma="e." pos="LET">.</pc>
87-
</p>
88-
89-
<p>
90-
scholen.
91-
</p>
92-
93-
<p>
94-
wilt v<expan>er</expan>wittigen.
95-
96-
Eersame vrome
97-
</p>
98-
99-
<p><milestone n="1va" unit="fol"/>test</p>
100-
101-
<p>
102-
Copie
103-
104-
Wert belast en<expan>de</expan> bevolen
105-
</p>
106-
107-
<p>
108-
andere offic<expan ana="#add2expan" resp="ed">i</expan>eren.
109-
110-
Concludeert daeromme
111-
</p>
112-
113-
<p>
114-
na de <ref target="http://de-wit.net/bronnen/histo/politieke_ordonnantie_holland-1580.htm">Ordonnantie en Placaaten van de Politie</ref>, haare af kundinge
115-
</p>
116-
117-
118-
<p><hi rend="italic">Hoe Walewein Lancelote bescudde en enen camp voor hem vacht. </hi></p>
119-
120-
<p>ter putie<expan>ntie<expan ana="#add2expan" resp="ed">?</expan></expan> van mijn heer</p>
121-
122-
<p>alhier <del>noe</del> hoechnoedich</p>
123-
124-
<p>met noch <gap reason="illegible"/>dlik<expan ana="#add2expan" resp="ed">?</expan> uuyt desen</p>
125-
126-
<p>hopluyden <del>oftmeden<expan>de</expan></del> knechten</p>
127-
128-
<p>
129-
naemen van de K<expan ana="#add2expan" resp="ed">eyserlicke</expan> M<expan ana="#add2expan" resp="ed">ajestey</expan>t de jus<expan ana="#add2expan" resp="ed">t</expan>icie om vergiffenisse.
130-
</p>
131-
132-
133-
<p>nyet en es, <del><gap extent="word" reason="illegible"/></del>
134-
maer der selver</p> <!--the newline here is important-->
135-
136-
<p>ik loop <hi>naar</hi>
137-
school</p> <!--the newline here is important-->
138-
139-
<p>ik loop naar
140-
<hi>huis</hi></p> <!--the newline here is important-->
141-
142-
<p>stadt Sutphen
143-
<add place="margin"> R<expan>ecept</expan>ae 16 Martii</add></p>
144-
145-
<p>frunde<expan>n</expan> <add place="onder"> R<expan>ecept</expan>ae 16 Martii</add></p>
146-
147-
<p>mir das <gap reason="illegible"/>ring, das sonst</p>
148-
149-
<p>test-en</p>
150-
151-
<p>genige voirs<expan>chreven</expan> <add place="begin">dan dat oick die fia<expan>n</expan>dt ahm gesyn sich v<expan>er</expan>sammelen und sines arlisztes gewelttig moth sijn,</add> vifftig <del>oders</del></p>
152-
153-
<!--hoe moet dit überhaupt-->
154-
<p>blievet froeme dat l<gap reason="illegible"/> g<gap reason="illegible"/><del> i</del> in gesontheid</p>
155-
156-
<p>wij <add place="begin">in dese weszhae zeer verfallene und geoppressirde nots <del>ke</del> ghiene provisi va<expan>n</expan> holt</add> bij</p>
157-
158-
<p>gunstige here, wij <del>hebben hierbevoerens</del> hadden</p>
159-
160-
<p><add place="margin"> An d<expan>en</expan> scholtis to Zutphe<expan>n</expan> of sijne<expan>n</expan> stathold<expan>er</expan></add> Erentfeste</p>
161-
162-
<p>welcker voirs<expan>creuen</expan> <add place="left"> LVj </add><del>LXj</del> rydders die</p>
163-
164-
<p>desse breiff. <add place="left">vermeldet</add> Soe is</p>
165-
166-
167-
<p>vyfftich <del>auerkomen en<expan>de</expan></del><expan/> met</p>
168-
169-
<p>pater toe <hi rend="font-weight:bold">Aelsum </hi>(bij Akkrum), ende</p>
170-
171-
<p>Mijnen gans goetwijlleghen dijens[t] na allen vermoeghen altijt tibi honorem<ref target="N044"><hi rend="font-size:8pt;font-weight:bold"> * </hi></ref> . Eerbare,</p>
172-
173-
<p>ghemeynten va<damage>n<supplied resp="#Stapel">den </supplied>l</damage>ande van breuites vita<damage>m </damage>m<ex>agist</ex>rum</p>
174-
175-
</body>
176-
</text>
86+
<text xml:id="karel">
87+
<body>
88+
<p xml:id="karel.p1">
89+
<s xml:id="karel.p1.s1">
90+
Fraaie historie ende alw<hi>ae</hi>r
91+
</s>
92+
<s xml:id="karel.p1.s2">
93+
Magh 'k u vertellen<pc>,</pc> hoirt naer<pc>.</pc>
94+
</s>
95+
</p>
96+
<p xml:id="karel.p2">
97+
<s xml:id="karel.p2.s1">
98+
't Was op enen <w>avond<hi>stonde</hi></w>.
99+
</s>
100+
<s xml:id="karel.p2.s2">
101+
Dat <name type="PER">koning Carel</name> slaepen <w>be<hi>gh</hi>onde</w><pc>.</pc>
102+
</s>
103+
</p>
104+
</body>
105+
</text>
106+
<text xml:id="martijn">
107+
<body>
108+
<p xml:id="martijn.p1">
109+
<s xml:id="martijn.p1.s1">
110+
"<name type="PER"><w>Martijn</w></name>, slaepstu<pc>?</pc> <w>slaept</w> dijn <w>sin</w>?<pc>"</pc>
111+
</s>
112+
<s xml:id="martijn.p1.s2">
113+
<w>Sprec!</w><w>hebstu</w><w>gheen</w><w>spreken</w>in<pc>?</pc>
114+
</s>
115+
</p>
116+
<p xml:id="martijn.p2">
117+
<s xml:id="martijn.p2.s1">
118+
Du dinkes <w>mi</w> <hi>ver</hi>doren.
119+
</s>
120+
<s xml:id="martijn.p2.s2">
121+
Du<hi>ne</hi> achtes meer no min
122+
</s>
123+
</p>
124+
</body>
125+
</text>
177126
</TEI>

0 commit comments

Comments
 (0)