Skip to content

Commit d0625fa

Browse files
committed
Generic xml stream reader and "contains() -> in" refactor
1 parent cf94932 commit d0625fa

31 files changed

Lines changed: 468 additions & 204 deletions

server/src/main/kotlin/org/ivdnt/galahad/annotations/AnnotationReader.kt

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
package org.ivdnt.galahad.annotations
22

3-
import java.io.File
4-
5-
abstract class AnnotationReader(
6-
protected val file: File
7-
) {
3+
abstract class AnnotationReader {
84
val layer: Layer by lazy { read() }
95

106
protected val documents: MutableList<DocumentLayer> = mutableListOf()
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package org.ivdnt.galahad.annotations
2+
3+
import org.ivdnt.galahad.evaluation.comparison.nextOrNull
4+
5+
/**
 * [AnnotationReader] that walks the terms of [reference] and pairs them with the terms of
 * [hypothesis], re-offsetting each matching hypothesis term via `alignedTo`.
 *
 * NOTE(review): unfinished, see the TODO inside [read].
 */
class LayerAligner(
6+
val hypothesis: Layer,
7+
val reference: Layer,
8+
): AnnotationReader() {
9+
/**
 * Visits every reference term (documents -> paragraphs -> sentences -> terms) while stepping
 * through the hypothesis terms with one shared iterator.
 *
 * @return a [Layer] built from `documents`.
 */
override fun read(): Layer {
10+
// A single iterator over all hypothesis terms, shared across the nested loops below.
val hypoIter = hypothesis.terms.iterator()
11+
12+
reference.documents.forEach { doc ->
13+
doc.paragraphs.forEach { paragraph ->
14+
paragraph.sentences.forEach { sentence ->
15+
sentence.terms.forEach { refTerm ->
16+
// TODO: implement alignment logic
17+
// Exactly one hypothesis term is consumed per reference term, whether or not it matches;
// once the hypothesis runs out, nextOrNull() yields null and nothing happens.
hypoIter.nextOrNull()?.also { hypoTerm ->
18+
// Only identical tokens are kept; a hypothesis term with a different token is dropped.
if (hypoTerm.token == refTerm.token) {
19+
// NOTE(review): `terms` is not declared in this class — presumably inherited or
// otherwise in scope; TODO confirm what collection this appends to.
terms += hypoTerm.alignedTo(refTerm)
20+
}
21+
}
22+
}
23+
}
24+
}
25+
}
26+
// NOTE(review): nothing in this method adds to `documents`, so the aligned terms collected
// above do not appear to reach the returned layer — confirm once the TODO is resolved.
return Layer(documents.toTypedArray())
27+
}
28+
}

server/src/main/kotlin/org/ivdnt/galahad/annotations/Term.kt

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ class Term(
3232
@get:JsonIgnore
3333
val ner: String? = annotations[Annotation.NER]
3434

35+
/**
36+
* Returns a term with the same data, except its offset is aligned to that of [refTerm].
37+
*/
38+
fun alignedTo(refTerm: Term): Term {
    // Only the offset is taken from refTerm; id, annotations and spaceAfter stay this term's own.
    val referenceOffset = refTerm.offset
    return Term(id, referenceOffset, annotations, spaceAfter)
}
3539

3640
fun isMulti(annotation: Annotation): Boolean = annotations[annotation]?.contains("+") == true
3741

@@ -56,15 +60,15 @@ class Term(
5660
val annotation = annotations[annotationType] ?: return null
5761
// for NER
5862
if (annotationType == Annotation.NER) {
59-
if (annotation.contains('-')) {
63+
if ('-' in annotation) {
6064
return annotation.split('-')[1]
6165
}
6266
}
6367
// for POS & UPOS
64-
else if (listOf(Annotation.POS, Annotation.UPOS).contains(annotationType)) {
68+
else if (annotationType in posAnnotations) {
6569
return if (isMulti(annotationType)) {
6670
// Split on + and transform each part
67-
annotation.split("+").map { singlePosToHead(it) }.joinToString("+")
71+
annotation.split("+").joinToString("+") { singlePosToHead(it) }
6872
} else {
6973
singlePosToHead(annotation)
7074
}
@@ -75,10 +79,11 @@ class Term(
7579

7680
companion object {
7781
val EMPTY: Term = Term("", 0, mapOf(Annotation.TOKEN to ""))
82+
private val posAnnotations: Array<Annotation> = arrayOf(Annotation.POS, Annotation.UPOS)
83+
private val posHeadSeparators: Array<Char> = arrayOf('(', '|')
7884

79-
fun missingName(annotation: Annotation): String =
80-
// simply uppercase and prepend "NO_"
81-
"NO_${annotation.value.uppercase()}"
85+
// simply uppercase and prepend "NO_"
86+
fun missingName(annotation: Annotation): String {
    // Placeholder name: "NO_" followed by the upper-cased annotation value.
    val upperCasedValue = annotation.value.uppercase()
    return "NO_$upperCasedValue"
}
8287

8388
/** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */
8489
fun features(pos: String?): String? {
@@ -91,9 +96,8 @@ class Term(
9196
}
9297

9398
fun singlePosToHead(pos: String): String {
94-
val separators = listOf('(', '|')
95-
for (separator in separators) {
96-
if (pos.contains(separator)) {
99+
for (separator in posHeadSeparators) {
100+
if (separator in pos) {
97101
val head = pos.split(separator)[0]
98102
// presumably head won't be empty, but this way we could
99103
// parse something like (VRB) if anyone would ever use that

server/src/main/kotlin/org/ivdnt/galahad/app/Galahad.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class Galahad {
9191
.description("Generating Linguistic Annotations for Historical Dutch")
9292
.contact(Contact().name("GaLAHaD GitHub").url("https://github.com/instituutnederlandsetaal/galahad"))
9393
)
94-
if (application_profile.contains("prod")) {
94+
if ("prod" in application_profile) {
9595
api = api.servers(listOf(Server().url("/galahad/api").description("GaLAHaD API")))
9696
}
9797
return api

server/src/main/kotlin/org/ivdnt/galahad/app/User.kt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,9 @@ class User(
1212
private val ADMIN_FILE: File = File("data/admins/admins.txt")
1313
private val DEFAULT_USER: User get() = User(id = USERNAME, isAdmin = isAdmin(USERNAME))
1414

15-
private fun isAdmin(string: String): Boolean {
15+
private fun isAdmin(username: String): Boolean {
1616
if (!ADMIN_FILE.exists()) return false // When no admins are set, no one is admin by default
17-
return ADMIN_FILE.readLines().map { it.trim() }
18-
.contains(string) // Otherwise only declared admins are admins
17+
return username in ADMIN_FILE.readLines().map { it.trim() } // Otherwise only declared admins are admins
1918
}
2019

2120
fun fromRequest(request: HttpServletRequest?): User {

server/src/main/kotlin/org/ivdnt/galahad/corpora/MutableCorpusMetadata.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ open class MutableCorpusMetadata(
3838
* Whether the user is in the list of collaborators of this corpus.
3939
* Note that this is not the same as having write access: use [hasWriteAccess].
4040
*/
41-
fun isCollaborator(user: User): Boolean = collaborators.contains(user.id) == true
41+
fun isCollaborator(user: User): Boolean {
    // Membership is decided purely by the user's id.
    val userId = user.id
    return userId in collaborators
}
4242

4343
/**
4444
* Whether the user is in the list of viewers of this corpus.
4545
* Note that this is not the same as having read access: use [hasReadAccess].
4646
*/
47-
fun isViewer(user: User): Boolean = viewers.contains(user.id) == true
47+
fun isViewer(user: User): Boolean {
    // Membership is decided purely by the user's id.
    val userId = user.id
    return userId in viewers
}
4848

4949
/** To have write access, you need to be an owner, collaborator or admin. */
5050
fun hasWriteAccess(user: User): Boolean {
@@ -165,7 +165,7 @@ open class MutableCorpusMetadata(
165165
newMeta.viewers.remove(newMeta.owner)
166166

167167
// Remove collaborators from list of viewers
168-
newMeta.viewers.removeIf { newMeta.collaborators.contains(it) }
168+
newMeta.viewers.removeIf { it in newMeta.collaborators }
169169

170170
return newMeta
171171
}

server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import org.ivdnt.galahad.export.DocumentExport
88
/** Returns the next [Term] of this iterator, or `null` when the iterator has no more elements. */
fun Iterator<Term>.nextOrNull(): Term? {
    // Guard first so next() is only called when an element is available.
    if (!hasNext()) return null
    return next()
}
99

1010
// Some hardcoded punctuation
11-
val PUNCTUATION: List<String> = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"")
11+
// Characters treated as punctuation; truncatePC tests a string's last character against this array.
val PUNCTUATION: Array<Char> = arrayOf(',', '.', '?', '!', ':', ';', ')', '(', '\'', '"')
1212

1313
/**
1414
* Match the [Layer.terms] of two layers based on their [WordForm] position (offset and length)
@@ -148,7 +148,7 @@ open class LayerComparison(
148148
}
149149

150150
fun truncatePC(str: String): String {
151-
return if (str.isNotEmpty() && PUNCTUATION.contains(str.last().toString())) {
151+
return if (str.isNotEmpty() && str.last() in PUNCTUATION) {
152152
str.slice(0 until str.lastIndex)
153153
} else {
154154
str

server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/Confusion.kt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,11 @@ open class Confusion(private val truncate: Boolean = true, val annotation: Annot
100100
private fun add(pos1: String, pos2: String, evaluationEntry: EvaluationEntry) {
101101
when {
102102
// Complex pos are mapped to a single category
103-
pos1.contains('+') -> add(MULTIPLE_POS, pos2, evaluationEntry)
104-
pos2.contains('+') -> add(pos1, MULTIPLE_POS, evaluationEntry)
103+
'+' in pos1 -> add(MULTIPLE_POS, pos2, evaluationEntry)
104+
'+' in pos2 -> add(pos1, MULTIPLE_POS, evaluationEntry)
105105
// Non-alphabetical pos are mapped to a single category "other"
106-
pos1.contains(Regex(OTHER_POS_REGEX)) -> add(OTHER_POS, pos2, evaluationEntry)
107-
pos2.contains(Regex(OTHER_POS_REGEX)) -> add(pos1, OTHER_POS, evaluationEntry)
106+
Regex(OTHER_POS_REGEX) in pos1 -> add(OTHER_POS, pos2, evaluationEntry)
107+
Regex(OTHER_POS_REGEX) in pos2 -> add(pos1, OTHER_POS, evaluationEntry)
108108
// Otherwise a simple merge
109109
else -> matrix.merge(Pair(pos1, pos2), evaluationEntry) { a, b -> EvaluationEntry.add(a, b, truncate) }
110110
}

server/src/main/kotlin/org/ivdnt/galahad/export/CorpusExport.kt

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@ import org.ivdnt.galahad.exceptions.MergeNotImplementedException
1212
import org.ivdnt.galahad.taggers.Tagger
1313
import org.ivdnt.galahad.util.FileMapper
1414
import org.ivdnt.galahad.util.createZipFile
15-
import java.io.File
1615
import java.io.OutputStream
17-
import java.nio.file.Files
18-
import kotlin.io.path.createTempDirectory
1916

2017
class CorpusExport private constructor(
2118
val corpus: Corpus,
@@ -26,7 +23,6 @@ class CorpusExport private constructor(
2623
val tagger: Tagger,
2724
val shouldMerge: Boolean,
2825
) : Logging {
29-
3026
private fun mergeFormatMatches(
3127
it: Document, format: DocumentFormat,
3228
): Boolean {
@@ -38,7 +34,6 @@ class CorpusExport private constructor(
3834
return otherFormat == format
3935
}
4036

41-
4237
private fun formatMapper(doc: Document, out: OutputStream) {
4338
try {
4439
// Document conversions.
@@ -64,8 +59,16 @@ class CorpusExport private constructor(
6459
out: OutputStream,
6560
) {
6661
val documents = corpus.documents.readAll().filter { DocumentExport.create(this, it).layer != Layer.EMPTY }
67-
val seq: Sequence<FileMapper> = documents.asSequence().map { doc -> doc.name to { out -> formatMapper(doc, out) } }
68-
val seqCmdi: Sequence<FileMapper> = documents.asSequence().map { doc -> "metadata/CMDI-${doc.uploadedFile.nameWithoutExtension}.xml" to { out -> DocumentExport.create(this, doc).cmdi(out) } }
62+
val seq: Sequence<FileMapper> =
63+
documents.asSequence().map { doc -> doc.name to { out -> formatMapper(doc, out) } }
64+
val seqCmdi: Sequence<FileMapper> = documents.asSequence().map { doc ->
65+
"metadata/CMDI-${doc.uploadedFile.nameWithoutExtension}.xml" to { out ->
66+
DocumentExport.create(
67+
this,
68+
doc
69+
).cmdi(out)
70+
}
71+
}
6972
createZipFile(seq + seqCmdi, out, includeCMDI = true)
7073
}
7174

server/src/main/kotlin/org/ivdnt/galahad/export/LayerConverter.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import org.ivdnt.galahad.formats.tsv.TsvConverter
1111
import org.ivdnt.galahad.formats.txt.TxtConverter
1212
import java.io.OutputStream
1313

14-
abstract class LayerConverter(protected val export: DocumentExport) {
14+
abstract class LayerConverter protected constructor(protected val export: DocumentExport) {
1515
abstract fun convert(out: OutputStream)
1616

1717
companion object {

0 commit comments

Comments
 (0)