Skip to content

Commit e7b9a9c

Browse files
committed
Completed NER test cases
1 parent 1dbccff commit e7b9a9c

163 files changed

Lines changed: 7061 additions & 67 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

server/src/main/kotlin/org/ivdnt/galahad/annotations/Annotations.kt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,3 @@ enum class Annotation(@JsonValue val value: String) {
2424
}
2525
}
2626

27-
typealias Annotations = Map<Annotation, String?>

server/src/main/kotlin/org/ivdnt/galahad/annotations/Term.kt

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore
66
class Term(
77
val id: String,
88
val offset: Int,
9-
val annotations: Annotations,
9+
val annotations: Map<Annotation, String?>,
1010
spaceAfter: Boolean? = null
1111
) {
1212
val spaceAfter: Boolean? = if (spaceAfter == false) false else null
@@ -58,26 +58,26 @@ class Term(
5858
* The head of [annotation]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)"
5959
* or "VG" for "VG|neven" or ORG for B-ORG.
6060
*/
61-
fun annotationHead(annotationType: Annotation): String? {
61+
fun annotationHead(annotation: Annotation): String? {
6262
// get annotation
63-
val annotation = annotations[annotationType] ?: return null
63+
val value = annotations[annotation] ?: return null
6464
// for NER
65-
if (annotationType == Annotation.NER) {
66-
if ('-' in annotation) {
67-
return annotation.split('-')[1]
65+
if (annotation == Annotation.NER) {
66+
if ('-' in value) {
67+
return value.split('-')[1]
6868
}
6969
}
7070
// for POS & UPOS
71-
else if (annotationType in posAnnotations) {
72-
return if (isMulti(annotationType)) {
71+
else if (annotation in posAnnotations) {
72+
return if (isMulti(annotation)) {
7373
// Split on + and transform each part
74-
annotation.split("+").joinToString("+") { singlePosToHead(it) }
74+
value.split("+").joinToString("+") { singlePosToHead(it) }
7575
} else {
76-
singlePosToHead(annotation)
76+
singlePosToHead(value)
7777
}
7878
}
7979
// else leave as is
80-
return annotation
80+
return value
8181
}
8282

8383
companion object {

server/src/main/kotlin/org/ivdnt/galahad/annotations/TermSpan.kt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ package org.ivdnt.galahad.annotations
33
class TermSpan(
44
val indices: IntArray,
55
val value: String,
6-
)
6+
) {
7+
constructor(indices: List<Int>, value: String) : this(indices.toIntArray(), value)
8+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package org.ivdnt.galahad.formats
2+
3+
import org.ivdnt.galahad.annotations.Annotation
4+
import org.ivdnt.galahad.annotations.AnnotationReader
5+
import org.ivdnt.galahad.annotations.TermSpan
6+
7+
abstract class LineReader : AnnotationReader() {
8+
/**
9+
* Overrides newSentence to insert NER spans before the sentence is closed.
10+
*/
11+
override fun newSentence() {
12+
buildList<Pair<String, MutableList<Int>>> {
13+
terms.forEachIndexed { i, t ->
14+
if (t.ner?.startsWith("B-") == true) {
15+
add(t.annotationHead(org.ivdnt.galahad.annotations.Annotation.NER)!! to mutableListOf(i))
16+
} else if (t.ner?.startsWith("I-") == true) {
17+
last().second.add(i)
18+
}
19+
}
20+
}.ifEmpty { null }?.map { (value, indices) -> TermSpan(indices, value) }
21+
?.let { spans[Annotation.NER] = it.toMutableList() }
22+
super.newSentence()
23+
}
24+
}

server/src/main/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReader.kt

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,37 @@ import org.ivdnt.galahad.annotations.Annotation
44
import org.ivdnt.galahad.annotations.AnnotationReader
55
import org.ivdnt.galahad.annotations.Layer
66
import org.ivdnt.galahad.annotations.Term
7+
import org.ivdnt.galahad.formats.LineReader
78
import java.io.File
89

910
class ConlluReader(
1011
val file: File
11-
) : AnnotationReader() {
12+
) : LineReader() {
1213
private val ignorableMultiWordIds: MutableSet<String> = mutableSetOf()
1314

15+
private val String.id: String?
16+
get() = idRegex.find(this)?.groupValues?.get(1) // 0 is the whole match
17+
1418
override fun read(): Layer {
1519
file.forEachLine {
1620
when {
1721
it.startsWith("# newdoc") -> {
1822
newDocument()
1923
// get ID last, so we don't overwrite it while creating a new unit
20-
docID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "d${documents.size + 1}"
24+
docID = it.id
2125
}
2226

2327
it.startsWith("# newpar") -> {
2428
newParagraph()
2529
// get ID last, so we don't overwrite it while creating a new unit
26-
parID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "p${paragraphs.size + 1}"
30+
parID = it.id
2731
}
2832

2933
it.startsWith("# sent_id") || it.isBlank() -> {
3034
newSentence()
31-
sentID = Regex("id = (\\S+)").find(it)?.groupValues?.get(1) ?: "s${sentences.size + 1}"
35+
// get ID last, so we don't overwrite it while creating a new unit
36+
sentID = it.id
3237
}
33-
3438
!it.startsWith("#") -> {
3539
newWord(it)
3640
}
@@ -125,6 +129,8 @@ class ConlluReader(
125129
companion object {
126130
/** Supported names for the ner attribute in the MISC column. */
127131
private val nerAttrNames: List<String> = listOf("NamedEntity", "ner")
132+
private val idRegex = Regex("id = (\\S+)")
133+
128134

129135
private val indices: Map<Annotation, Int> = mapOf(
130136
Annotation.TOKEN to 1,

server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafConverter.kt

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import org.ivdnt.galahad.annotations.Annotation
44
import org.ivdnt.galahad.export.DocumentExport
55
import org.ivdnt.galahad.export.LayerConverter
66
import org.ivdnt.galahad.util.XmlUtil
7+
import org.ivdnt.galahad.util.ifNullOrBlank
78
import org.w3c.dom.Document
89
import org.w3c.dom.Element
910
import java.io.OutputStream
@@ -51,11 +52,9 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
5152

5253
val lp = xml.createElement("lp").apply {
5354
setAttribute("name", export.tagger.id)
54-
setAttribute("version", export.tagger.version)
5555
setAttribute("timestamp", now.toString())
56-
setAttribute("beginTimestamp", now.toString())
57-
setAttribute("endTimestamp", now.toString())
5856
setAttribute("hostname", "https://galahad.ivdnt.org")
57+
export.tagger.version.ifBlank { null }?.let{ setAttribute("version", it) }
5958
}
6059
val lpTerms = xml.createElement("linguisticProcessors").apply {
6160
setAttribute("layer", "terms")
@@ -100,7 +99,7 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
10099
setAttribute("offset", t.offset.toString())
101100
setAttribute("length", t.token.length.toString())
102101
setAttribute("sent", iSent.toString())
103-
setAttribute("para", iPar.toString())
102+
setAttribute("para", (iPar+1).toString())
104103
textContent = t.token
105104
}
106105
text.appendChild(wf)
@@ -115,7 +114,7 @@ class NafConverter(export: DocumentExport) : LayerConverter(export) {
115114
root.appendChild(terms)
116115
export.layer.terms.forEachIndexed { i, it ->
117116
val term = xml.createElement("term").apply {
118-
setAttribute("id", "t$i")
117+
setAttribute("id", "t${i+1}")
119118
}
120119
it.lemma?.let { term.setAttribute("lemma", it) }
121120
it.pos?.let { term.setAttribute("pos", it) }

server/src/main/kotlin/org/ivdnt/galahad/formats/naf/NafReader.kt

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
package org.ivdnt.galahad.formats.naf
22

3+
import org.ivdnt.galahad.annotations.*
34
import org.ivdnt.galahad.annotations.Annotation
4-
import org.ivdnt.galahad.annotations.AnnotationReader
5-
import org.ivdnt.galahad.annotations.Layer
6-
import org.ivdnt.galahad.annotations.Term
75
import org.ivdnt.galahad.util.XmlUtil
86
import org.ivdnt.galahad.util.childElements
97
import org.ivdnt.galahad.util.childOrNull
108
import java.io.File
9+
import java.util.*
1110

1211
typealias WordformID = String
1312
typealias TermID = String
@@ -40,10 +39,11 @@ class NafReader(file: File) : AnnotationReader() {
4039
private val nafEntities = root.childOrNull("entities")?.childElements?.map {
4140
NafEntity(
4241
type = it.getAttribute("type").ifEmpty { null },
43-
references = it.childElements.map { it.childElements.map { it.getAttribute("id") }.toList() }.toList()
42+
references = it.childOrNull("references")?.childElements?.map { it.childElements.map { it.getAttribute("id") }.toList() }?.toList()!!
4443
)
45-
}
46-
private val id = root.childOrNull("nafHeader")?.childOrNull("public")?.getAttribute("publicId").orEmpty().ifEmpty { null }
44+
}?.toList()
45+
private val id: String = root.childOrNull("nafHeader")?.childOrNull("public")?.getAttribute("publicId").orEmpty()
46+
.ifEmpty { UUID.randomUUID().toString() }
4747

4848
override fun read(): Layer {
4949
// group wordforms by paragraph, then by sentence, then sort by offset within each sentence
@@ -70,17 +70,36 @@ class NafReader(file: File) : AnnotationReader() {
7070

7171
// space after
7272
val nextWordform = sent.getOrNull(i + 1)
73-
val spaceAfter = nextWordform?.offset == wordform.offset + wordform.token.length
73+
val spaceAfter = nextWordform?.offset != (wordform.offset + wordform.token.length)
7474

7575
terms += Term(wordform.id, wordform.offset, annotations, spaceAfter)
7676
}
77-
// TODO sentence level spans
77+
// collect all spans that refer to one of the terms in this sentence
78+
val termIds = terms.map { it.id }
79+
val nerSpans = nafEntities?.flatMap { e -> e.references.map { e.type!! to it } }
80+
nerSpans?.filter { (_, ids) -> ids.any { it in termIds } }?.ifEmpty { null }
81+
?.map { (value, ids) -> TermSpan(ids.map { id -> sent.indexOfFirst { it.id == id } }, value) }
82+
?.toMutableList()?.let { spans[Annotation.NER] = it }
83+
7884
newSentence()
7985
}
8086
newParagraph()
8187
}
8288
newDocument()
83-
return Layer(documents.toTypedArray())
89+
return Layer(documents.toTypedArray(), id)
90+
}
91+
92+
override fun newSentence() {
93+
// edit the NER value of the terms if spans are present
94+
spans[Annotation.NER]?.forEach { span ->
95+
span.indices.forEachIndexed { spanI, termI ->
96+
// Note the difference between spanI and termI; e.g. for span.indices = [4, 5] the (spanI, termI) pairs are (0, 4) and (1, 5)
97+
val t = terms[termI]
98+
val iob = (if (spanI == 0) "B-" else "I-") + span.value
99+
terms[termI] = Term(t.id, t.offset, t.annotations + (Annotation.NER to iob), t.spaceAfter)
100+
}
101+
}
102+
super.newSentence()
84103
}
85104

86105
data class NafWordform(

server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ class TeiReader(
1919
override fun parseAttrs() {
2020
when (reader.localName) {
2121
in wordDataTags -> {
22-
lemma = reader.getAttributeValue(null, "lemma")?.takeIf { it.isNotBlank() }
23-
pos = reader.getAttributeValue(null, "pos")?.takeIf { it.isNotBlank() } ?: reader.getAttributeValue(
22+
lemma = reader.getAttributeValue(null, "lemma")?.ifBlank { null }
23+
pos = reader.getAttributeValue(null, "pos")?.ifBlank { null } ?: reader.getAttributeValue(
2424
null,
2525
"type"
26-
)?.takeIf { it.isNotBlank() }
26+
)?.ifBlank { null }
2727
spaceAfter = reader.getAttributeValue(null, "join") !in arrayOf("right", "both")
2828
// if spanValue is not null, it means we are in a span tag
2929
if (spanValue != null) {

server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvReader.kt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
package org.ivdnt.galahad.formats.tsv
22

3+
import org.ivdnt.galahad.annotations.*
34
import org.ivdnt.galahad.annotations.Annotation
4-
import org.ivdnt.galahad.annotations.AnnotationReader
5-
import org.ivdnt.galahad.annotations.Layer
6-
import org.ivdnt.galahad.annotations.Term
5+
import org.ivdnt.galahad.formats.LineReader
76
import java.io.File
87

98
class TsvReader(
109
val file: File
11-
) : AnnotationReader() {
10+
) : LineReader() {
1211
private val columnIndices: MutableMap<Annotation, Int> = mutableMapOf()
1312
private var lastLineWasBlank: Boolean = false
1413

@@ -53,8 +52,8 @@ class TsvReader(
5352
.firstOrNull { (_, names) ->
5453
names.any { name -> header.equals(name, ignoreCase = true) }
5554
// if it exists, register the index
56-
}?.let { (annotationType, _) ->
57-
columnIndices[annotationType] = index
55+
}?.let { (annotation, _) ->
56+
columnIndices[annotation] = index
5857
}
5958
}
6059
}
@@ -72,18 +71,19 @@ class TsvReader(
7271
val values: List<String> = line.split("\t")
7372

7473
// Retrieve values
75-
val mutAnnot: Map<Annotation, String> = buildMap {
74+
val annotations: Map<Annotation, String> = buildMap {
7675
for (column in columnIndices.entries) {
77-
getColumn(column.value, values)?.let { put(column.key, it) }
76+
columnOrNull(column.value, values)?.let { put(column.key, it) }
7877
}
7978
}
80-
terms += Term(wordID(), offset, mutAnnot)
81-
offset += mutAnnot[Annotation.TOKEN]?.length ?: 0
79+
Term(wordID(), offset, annotations).also {
80+
terms += it
81+
offset += it.token.length
82+
}
8283
}
8384

8485
// Retrieves a column with bounds checking.
85-
private fun getColumn(index: Int, values: List<String>): String? =
86-
values.getOrNull(index)?.takeIf { it.isNotBlank() }
86+
private fun columnOrNull(i: Int, values: List<String>): String? = values.getOrNull(i)?.ifBlank { null }
8787

8888
companion object {
8989
val columnNames: Map<Annotation, List<String>> = mapOf(

server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
2323
protected val reader: XMLStreamReader by lazy { XmlUtil.inputFactory.createXMLStreamReader(stream) }
2424

2525
private val currentXmlID: String?
26-
get() = reader.getAttributeValue(XMLConstants.XML_NS_URI, "id")?.takeIf { it.isNotBlank() }
26+
get() = reader.getAttributeValue(XMLConstants.XML_NS_URI, "id")?.ifBlank { null }
2727
private var ignoring: Boolean = false
2828

2929
abstract val spanTags: Array<String>
@@ -92,7 +92,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
9292

9393
fun newSpan() {
9494
if (spanValue == null) return
95-
val indices = spanTargets.map { id -> terms.indexOfFirst { t -> t.id == id } }.toIntArray()
95+
val indices = spanTargets.map { id -> terms.indexOfFirst { t -> t.id == id } }
9696
spans.getOrPut(Annotation.NER, ::mutableListOf) += TermSpan(indices, spanValue!!)
9797
spanValue = null
9898
spanTargets.clear()
@@ -103,7 +103,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
103103
private fun shouldIgnore(): Boolean = ignoring.also { ignoring = reader.localName in ignorableTags }
104104

105105
private fun parseChars() {
106-
val words = reader.text.takeIf { it.isNotBlank() }?.split(whitespace) ?: emptyList()
106+
val words = reader.text.ifBlank { null }?.split(whitespace) ?: emptyList()
107107
for ((j, word) in words.withIndex()) {
108108
if (j > 0) newWordform()
109109
literal += word
@@ -113,8 +113,8 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
113113
override fun newWordform() {
114114
if (literal.isBlank()) return
115115
val annotations = buildMap {
116-
lemma?.takeIf { it.isNotBlank() }?.let { put(Annotation.LEMMA, it) }
117-
pos?.takeIf { it.isNotBlank() }?.let { put(Annotation.POS, it) }
116+
lemma?.ifBlank { null }?.let { put(Annotation.LEMMA, it) }
117+
pos?.ifBlank { null }?.let { put(Annotation.POS, it) }
118118
put(Annotation.TOKEN, literal)
119119
}
120120
terms += Term(wordID(), offset, annotations, spaceAfter)

0 commit comments

Comments
 (0)