Skip to content

Commit bb143f9

Browse files
committed
Added TSV & Conllu merge tests
1 parent afe66f8 commit bb143f9

21 files changed

Lines changed: 636 additions & 22 deletions

File tree

server/src/main/kotlin/org/ivdnt/galahad/formats/conllu/ConlluMerger.kt

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import org.ivdnt.galahad.formats.tsv.TsvMerger
1212
class ConlluMerger(
1313
export: DocumentExport,
1414
) : TsvMerger(export) {
15+
override val emptyValue: String = "_"
16+
1517
override val columnIndices: MutableMap<Annotation, Int> = mutableMapOf(
1618
Annotation.TOKEN to 1,
1719
Annotation.LEMMA to 2,
@@ -22,21 +24,29 @@ class ConlluMerger(
2224
Annotation.NER to 9, // see GetNer
2325
)
2426

27+
override fun replaceColumns(columns: MutableList<String>) {
28+
replaceMisc(columns)
29+
super.replaceColumns(columns)
30+
}
31+
32+
private fun replaceMisc(columns: MutableList<String>) {
33+
// construct MISC by combining the NER annotation (NamedEntity=...) and SpaceAfter=No
34+
val term: Term = termComparisons[termIndex].hyp
35+
val ner: String? = term.annotations[Annotation.NER]?.let { "NamedEntity=$it" }
36+
val spaceAfter: String? = if (term.spaceAfter == false) "SpaceAfter=No" else null
37+
val miscField: String = listOfNotNull(spaceAfter, ner).joinToString("|")
38+
columns[columnIndices[Annotation.NER]!!] = miscField.ifEmpty { "_" }
39+
}
40+
2541
override fun mergeSingleColumn(
2642
columns: MutableList<String>,
2743
annotation: Annotation,
2844
columnIndex: Int,
2945
) {
3046
when (annotation) {
31-
Annotation.NER -> { // TODO if no NER still export spaceAfter
32-
// construct MISC by combining NER and MISC
33-
val term: Term = termComparisons[termIndex].hyp
34-
val ner: String? = term.annotations[Annotation.NER]?.let { "NamedEntity=$it" }
35-
val spaceAfter: String? = if (term.spaceAfter == false) "SpaceAfter=No" else null
36-
val miscField: String = listOfNotNull(ner, spaceAfter).joinToString("|")
37-
columns[columnIndex] = miscField.ifEmpty { "_" }
47+
Annotation.NER -> {
48+
// handled in replaceMisc
3849
}
39-
4050
Annotation.UPOS -> {
4151
// Split UPOS into head and features
4252
val term: Term = termComparisons[termIndex].hyp
@@ -45,7 +55,6 @@ class ConlluMerger(
4555
columns[3] = head
4656
columns[5] = features
4757
}
48-
4958
else -> super.mergeSingleColumn(columns, annotation, columnIndex)
5059
}
5160
}

server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvMerger.kt

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ open class TsvMerger(
1010
export: DocumentExport,
1111
) : LayerMerger(export) {
1212
protected open val columnIndices: MutableMap<Annotation, Int> = mutableMapOf()
13-
override fun merge(out: OutputStream): Unit = merge(PrintWriter(out.bufferedWriter()))
13+
override fun merge(out: OutputStream): Unit = merge(PrintWriter(out))
1414
protected var termIndex: Int = 0
15+
private var extraColumns: MutableList<Annotation> = mutableListOf()
16+
protected open val emptyValue: String = ""
1517

1618
/**
1719
* Merge the uploaded raw file with the tagger layer. Header indices are already determined by TSVFile.
@@ -20,18 +22,25 @@ open class TsvMerger(
2022
fun merge(out: PrintWriter) {
2123
export.document.uploadedFile.forEachLine { line ->
2224
if (columnIndices.isEmpty()) {
23-
getColumnIndices(line.split("\t"))
24-
} else {
25-
if (line.isBlank()) {
26-
out.println()
25+
val headers = line.split("\t")
26+
getColumnIndices(headers)
27+
// Print header with any extra columns
28+
if (columnIndices.isNotEmpty()) {
29+
out.println((headers + extraColumns).joinToString("\t"))
2730
}
31+
} else if (!line.startsWith("#") && line.isNotBlank()) {
2832
val columns = line.split("\t").toMutableList()
29-
// Swap out pos & lemma, keep the rest.
33+
// Add extra columns.
34+
columns.addAll(List(extraColumns.size) { "" })
35+
// Swap out merging annotations, keep the rest.
3036
replaceColumns(columns)
31-
out.println(columns.joinToString("\t") + "\n")
37+
out.println(columns.joinToString("\t"))
3238
termIndex++
39+
} else {
40+
out.println(line)
3341
}
3442
}
43+
out.flush()
3544
}
3645

3746
private fun getColumnIndices(
@@ -47,15 +56,23 @@ open class TsvMerger(
4756
columnIndices[annotation] = index
4857
}
4958
}
59+
if (headers.isEmpty()) return // This line was not yet the header.
60+
// Add any missing columns.
61+
Annotation.order(export.tagger.annotationSet).forEach { annotation ->
62+
if (columnIndices[annotation] == null) {
63+
columnIndices[annotation] = headers.size + extraColumns.size
64+
extraColumns.add(annotation)
65+
}
66+
}
5067
}
5168

5269
/*
5370
* Replace annotations in their previously indexed columns.
5471
*/
55-
private fun replaceColumns(
72+
protected open fun replaceColumns(
5673
columns: MutableList<String>,
5774
) {
58-
export.tagger.annotationSet.forEach { annot ->
75+
export.tagger.annotationSet.filter { it != Annotation.TOKEN }.forEach { annot ->
5976
val index = columnIndices[annot] ?: return@forEach // Skip if not in the file.
6077
mergeSingleColumn(columns, annot, index)
6178
}
@@ -67,6 +84,6 @@ open class TsvMerger(
6784
columnIndex: Int,
6885
) {
6986
val term = termComparisons[termIndex].hyp
70-
columns[columnIndex] = term.annotationOrMissing(annotation)
87+
columns[columnIndex] = term.annotations[annotation] ?: emptyValue
7188
}
7289
}

server/src/test/kotlin/org/ivdnt/galahad/documents/DocumentMetadataTest.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class DocumentMetadataTest {
1818

1919
@Test
2020
fun `Properties for an unannotated file`() {
21-
val path = "formats/shared-converter/input.txt"
21+
val path = "formats/shared/converter/input.txt"
2222
val file = TestUtil.get(path)
2323
val plaintext = file.readText()
2424
val doc = TestUtil.getDoc(path)

server/src/test/kotlin/org/ivdnt/galahad/formats/ConverterTest.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ open class ConverterTest {
1919
private val fileName = "karel_en_martijn"
2020
private val uuid = "e51560ff-81a2-4ddd-ba04-c7eb07af6d2b"
2121

22-
protected open val folder: String = "shared-converter"
22+
protected open val folder: String = "shared/converter"
2323

2424
/** Whether to override the [Layer].id for the sake of a consistent test.
2525
* For formats like tsv that don't define an id at the root node. */
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package org.ivdnt.galahad.formats
2+
3+
import org.ivdnt.galahad.app.User
4+
import org.ivdnt.galahad.corpora.Corpus
5+
import org.ivdnt.galahad.documents.DocumentFormat
6+
import org.ivdnt.galahad.export.CorpusExport
7+
import org.ivdnt.galahad.util.TestUtil
8+
import org.junit.jupiter.api.Assertions
9+
import org.junit.jupiter.api.BeforeEach
10+
import java.io.ByteArrayOutputStream
11+
import java.io.File
12+
13+
abstract class MergerTest {
14+
private lateinit var corpus: Corpus
15+
abstract val folder: String
16+
abstract val format: DocumentFormat
17+
18+
@BeforeEach
19+
fun initCorpus() {
20+
corpus = TestUtil.createCorpus()
21+
}
22+
23+
fun merge() {
24+
val input: File = TestUtil.get("formats/$folder/input.${format.extension}")
25+
val merge: File = TestUtil.get("formats/$folder/layer.${format.extension}") // TODO perhaps use json
26+
val output: File = TestUtil.get("formats/$folder/output.${format.extension}")
27+
28+
val doc = corpus.documents.createOrThrow(input)
29+
// set merge layer as a job
30+
val job = corpus.jobs.createOrThrow("spacy")
31+
job.setLayer(doc, InternalFile.create(merge).layer)
32+
33+
// merge
34+
val corpusExport = CorpusExport.create(corpus, "spacy", format, User.DEFAULT_USER, true, false)
35+
val docExport = corpusExport.documentExport(doc)
36+
37+
val convertedText = ByteArrayOutputStream().also { docExport.merge(it); it.flush() }.toString()
38+
val expectedText = output.readText()
39+
40+
Assertions.assertEquals(expectedText, convertedText)
41+
}
42+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package org.ivdnt.galahad.formats.conllu
2+
3+
import org.ivdnt.galahad.documents.DocumentFormat
4+
import org.ivdnt.galahad.formats.MergerTest
5+
import org.junit.jupiter.api.Test
6+
7+
class ConlluMergerTest: MergerTest() {
8+
override val format: DocumentFormat = DocumentFormat.Conllu
9+
override val folder: String = "conllu/merger"
10+
11+
@Test
12+
fun `Merge`() {
13+
merge()
14+
}
15+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package org.ivdnt.galahad.formats.tsv
2+
3+
import org.ivdnt.galahad.documents.DocumentFormat
4+
import org.ivdnt.galahad.formats.MergerTest
5+
import org.junit.jupiter.api.Test
6+
7+
class TsvMergerTest: MergerTest() {
8+
override val format: DocumentFormat = DocumentFormat.Tsv
9+
override val folder: String = "tsv/merger"
10+
11+
@Test
12+
fun `Merge`() {
13+
merge()
14+
}
15+
}

server/src/test/kotlin/org/ivdnt/galahad/web/DocumentsControllerTest.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class DocumentsControllerTest(
6666
@Test
6767
fun `Upload zip with all formats`() {
6868
val corpus = SpringUtil.createCorpus(config)
69-
val files = TestUtil.get("formats/shared-converter").listFiles()
69+
val files = TestUtil.get("formats/shared/converter").listFiles()
7070
val zip = zipped(files.asIterable())
7171
mvc.uploadFile(zip, corpus, MediaType.APPLICATION_OCTET_STREAM_VALUE)
7272
assertEquals(6, getDocs(corpus).size)
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# newdoc id = karel
2+
# newpar id = karel.p1
3+
# sent_id = karel.p1.s1
4+
# text = Fraaie historie ende alwaer.
5+
1 Fraaie LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
6+
2 historie LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
7+
3 ende LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
8+
4 alwaer LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
9+
5 . LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
10+
11+
# sent_id = karel.p1.s2
12+
# text = Magh 'k u vertellen, hoirt naer.
13+
1 Magh LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
14+
2 'k LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
15+
3 u LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
16+
4 vertellen LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
17+
5 , LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
18+
6 hoirt LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
19+
7 naer LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
20+
8 . LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
21+
22+
# newpar id = karel.p2
23+
# sent_id = karel.p2.s1
24+
# text = 't Was op enen avondstonde.
25+
1 't LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
26+
2 Was LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
27+
3 op LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
28+
4 enen LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
29+
5 avondstonde LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
30+
6 . LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
31+
32+
# sent_id = karel.p2.s2
33+
# text = Dat koning Carel slaepen beghonde.
34+
1 Dat LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
35+
2 koning LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
36+
3 Carel LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
37+
4 slaepen LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
38+
5 beghonde LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
39+
6 . LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
40+
41+
# newdoc id = martijn
42+
# newpar id = martijn.p1
43+
# sent_id = martijn.p1.s1
44+
# text = "Martijn, slaepstu? slaept dijn sin?"
45+
1 " LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
46+
2 Martijn LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
47+
3 , LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
48+
4 slaepstu LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
49+
5 ? LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
50+
6 slaept LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
51+
7 dijn LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
52+
8 sin LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
53+
9 ? LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
54+
10 " LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
55+
56+
# sent_id = martijn.p1.s2
57+
# text = Sprec! hebstu gheen spreken in?
58+
1 Sprec LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
59+
2 ! LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
60+
3 hebstu LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
61+
4 gheen LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
62+
5 spreken LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
63+
6 in LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
64+
7 ? LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
65+
66+
# newpar id = martijn.p2
67+
# sent_id = martijn.p2.s1
68+
# text = Du dinkes mi verdoren.
69+
1 Du LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
70+
2 dinkes LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
71+
3 mi LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
72+
4 verdoren LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
73+
5 . LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
74+
75+
# sent_id = martijn.p2.s2
76+
# text = Dune achtes meer no min
77+
1 Dune LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
78+
2 achtes LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
79+
3 meer LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
80+
4 no LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC
81+
5 min LEMMA UPOS XPOS FEATS HEAD DEPREL _ MISC

server/src/test/resources/formats/shared-converter/karel_en_martijn.conllu renamed to server/src/test/resources/formats/conllu/merger/layer.conllu

File renamed without changes.

0 commit comments

Comments
 (0)