Skip to content

Commit c342d9b

Browse files
committed
More NER support
1 parent f315721 commit c342d9b

48 files changed

Lines changed: 955 additions & 254 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

server/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,5 +54,6 @@ dependencies {
5454

5555
tasks.withType<Test> {
5656
environment(mapOf("profile" to "dev"))
57+
systemProperty("line.separator", "\n")
5758
useJUnitPlatform()
5859
}

server/src/main/kotlin/org/ivdnt/galahad/annotations/AnnotationReader.kt

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
package org.ivdnt.galahad.annotations
22

3+
import kotlin.collections.plus
4+
35
abstract class AnnotationReader {
46
val layer: Layer by lazy { read() }
57

@@ -16,9 +18,9 @@ abstract class AnnotationReader {
1618
protected var sentID: String? = null
1719
protected var wordID: String? = null
1820

19-
private fun docID(): String = docID ?: "d$dIndex"
20-
private fun parID(): String = parID ?: "p$pIndex"
21-
private fun sentID(): String = sentID ?: "s$sIndex"
21+
protected fun docID(): String = docID ?: "d$dIndex"
22+
protected fun parID(): String = parID ?: "p$pIndex"
23+
protected fun sentID(): String = sentID ?: "s$sIndex"
2224
protected fun wordID(): String = wordID ?: "w$wIndex"
2325

2426
private val wIndex: Int get() = terms.size + 1
@@ -46,6 +48,19 @@ abstract class AnnotationReader {
4648

4749
protected open fun newSentence() {
4850
if (terms.isNotEmpty()) {
51+
// loop through all the terms and turn the NER into a span
52+
val indices = mutableListOf<Int>()
53+
terms.forEachIndexed { i, t ->
54+
t.ner?.let {
55+
indices += i
56+
}
57+
}
58+
if (indices.isNotEmpty()) {
59+
val ners = mutableListOf<TermSpan>()
60+
ners += TermSpan(indices, terms[indices.first()].annotationHead(Annotation.NER)!!)
61+
spans[Annotation.NER] = ners.toTypedArray()
62+
}
63+
4964
sentences.add(SentenceLayer(sentID(), terms.toTypedArray(), spans.toMap()))
5065
terms.clear()
5166
spans.clear()

server/src/main/kotlin/org/ivdnt/galahad/annotations/Layer.kt

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
package org.ivdnt.galahad.annotations
22

33
import com.fasterxml.jackson.annotation.JsonIgnore
4+
import java.util.UUID
45

56
const val SOURCE_LAYER_NAME: String = "sourceLayer"
67

@@ -9,7 +10,8 @@ const val SOURCE_LAYER_NAME: String = "sourceLayer"
910
* Those may be split into paragraphs, sentences, etc.
1011
*/
1112
class Layer(
12-
val documents: Array<DocumentLayer>
13+
val documents: Array<DocumentLayer>,
14+
val id: String = UUID.randomUUID().toString(),
1315
) {
1416
@get:JsonIgnore
1517
val spans: Map<Annotation, Sequence<TermSpan>> by lazy {
@@ -23,10 +25,13 @@ class Layer(
2325
}
2426
}
2527
}
28+
2629
@get:JsonIgnore
2730
val summary: LayerSummary by lazy { LayerSummary(tokens = terms.count()) }
31+
2832
@get:JsonIgnore
2933
val preview: LayerPreview by lazy { LayerPreview(terms.take(LAYER_PREVIEW_LENGTH).toList()) }
34+
3035
@get:JsonIgnore
3136
val terms: Sequence<Term> by lazy {
3237
documents.asSequence().flatMap { document ->
@@ -41,7 +46,7 @@ class Layer(
4146
override fun toString(): String = documents.joinToString("\n\n") + "\n" // Unix convention EOF
4247

4348
companion object {
44-
val EMPTY: Layer = Layer(emptyArray())
49+
val EMPTY: Layer = Layer(emptyArray(), "")
4550
}
4651
}
4752

@@ -64,5 +69,10 @@ class SentenceLayer(
6469
val terms: Array<Term>,
6570
val spans: Map<Annotation, Array<TermSpan>>,
6671
) {
67-
override fun toString(): String = terms.joinToString("") { it.token + (if (it.spaceAfter == false) "" else " ") }
72+
override fun toString(): String = buildString {
73+
terms.forEachIndexed { i, t ->
74+
append(t.token)
75+
if (i != terms.lastIndex) append(t.space)
76+
}
77+
}
6878
}

server/src/main/kotlin/org/ivdnt/galahad/annotations/Term.kt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,9 @@ class Term(
1111
) {
1212
val spaceAfter: Boolean? = if (spaceAfter == false) false else null
1313

14+
@get:JsonIgnore
15+
val space: String = if (spaceAfter == false) "" else " "
16+
1417
@get:JsonIgnore
1518
val token: String = annotations[Annotation.TOKEN]!!
1619

server/src/main/kotlin/org/ivdnt/galahad/app/User.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ class User(
1010
companion object {
1111
private const val USERNAME: String = "user"
1212
private val ADMIN_FILE: File = File("data/admins/admins.txt")
13-
private val DEFAULT_USER: User get() = User(id = USERNAME, isAdmin = isAdmin(USERNAME))
13+
val DEFAULT_USER: User get() = User(id = USERNAME, isAdmin = isAdmin(USERNAME))
1414

1515
private fun isAdmin(username: String): Boolean {
1616
if (!ADMIN_FILE.exists()) return false // When no admins are set, no one is admin by default

server/src/main/kotlin/org/ivdnt/galahad/corpora/documents/Document.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,7 @@ class Document(
8080
// First try to access the layer. If the file is invalid, this will throw.
8181
val sourceLayer = internalFile.layer
8282
// Set sourceLayer as job. Note that if we threw, we don't unnecessarily create a job folder, keeping the disk clean.
83-
corpus.jobs.createOrThrow(SOURCE_LAYER_NAME).jobDocuments.createOrThrow(doc.name).layer = sourceLayer
83+
corpus.jobs.createOrThrow(SOURCE_LAYER_NAME).setLayer(doc.name, sourceLayer)
8484
// plaintext
8585
ThreadPoolUtil.pool.execute {
8686
doc.plaintextFile.writeText(internalFile.plaintext)

server/src/main/kotlin/org/ivdnt/galahad/corpora/documents/DocumentMetadata.kt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ data class DocumentMetadata(
2424
/** Last modified timestamp in milliseconds. */
2525
val lastModified: Long,
2626
/** UUID of the document. */
27-
val uuid: UUID,
27+
val uuid: String,
2828
/** Annotation types in the source layer. */
2929
val annotations: Set<Annotation>,
3030
) {
@@ -42,7 +42,7 @@ data class DocumentMetadata(
4242
layerPreview = file.layer.preview,
4343
layerSummary = file.layer.summary,
4444
lastModified = System.currentTimeMillis(),
45-
uuid = UUID.randomUUID(),
45+
uuid = file.layer.id,
4646
annotations = file.layer.terms.flatMap { it.annotations.keys }.toSet()
4747
)
4848
}

server/src/main/kotlin/org/ivdnt/galahad/corpora/jobs/Job.kt

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -113,11 +113,12 @@ class Job(
113113
override fun set() = JobMetadata.create(this@Job)
114114
}
115115

116-
fun layer(doc: Document): Layer = layer(doc.name)
117-
fun layer(key: String): Layer = jobDocuments.readOrNull(key)?.layer ?: Layer.EMPTY
118-
fun setLayerForKey(key: String, layer: Layer) {
116+
fun getLayer(doc: Document): Layer = getLayer(doc.name)
117+
fun getLayer(key: String): Layer = jobDocuments.readOrNull(key)?.layer ?: Layer.EMPTY
118+
fun setLayer(key: String, layer: Layer) {
119119
jobDocuments.createOrThrow(key).layer = layer
120120
}
121+
fun setLayer(doc: Document, layer: Layer): Unit = setLayer(doc.name, layer)
121122

122123
//////////////////////////////////////////////////////
123124
// TODO: check everything below
@@ -256,7 +257,7 @@ class Job(
256257
}
257258

258259
companion object {
259-
private val mapper: ObjectMapper by lazy { ObjectMapper() }
260+
private val mapper: ObjectMapper = ObjectMapper()
260261

261262
private fun <T : Any> taggerRequest(
262263
job: Job, route: String, method: HttpMethod, type: Class<T>,

server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,8 @@ val PUNCTUATION: Array<Char> = arrayOf(',', '.', '?', '!', ':', ';', ')', '(', '
1717
* (Still, aggregating these matches is up to the (corpus/documents) evaluation classes)
1818
*/
1919
open class LayerComparison(
20-
private val hypothesisLayer: Layer,
21-
private val referenceLayer: Layer,
20+
hypothesisLayer: Layer,
21+
referenceLayer: Layer,
2222
private val layerFilter: LayerFilter? = null,
2323
) {
2424
constructor(export: DocumentExport) : this(export.layer, export.sourceLayer)

server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/CorpusConfusion.kt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@ class CorpusConfusion(
3939
corpus.documents.readAll().forEach {
4040
add(
4141
DocumentConfusion(
42-
hypothesisJob.layer(it),
43-
referenceJob.layer(it),
42+
hypothesisJob.getLayer(it),
43+
referenceJob.getLayer(it),
4444
layerFilter,
4545
annotation,
4646
)

0 commit comments

Comments
 (0)