instituutnederlandsetaal
diff --git a/‎server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt‎
Lines changed: 2 additions & 1 deletion b/‎server/src/main/kotlin/org/ivdnt/galahad/formats/tei/TeiReader.kt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvReader.kt‎
Lines changed: 3 additions & 2 deletions b/‎server/src/main/kotlin/org/ivdnt/galahad/formats/tsv/TsvReader.kt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/evaluation/confusion/CorpusConfusionTest.kt‎
Lines changed: 44 additions & 44 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/evaluation/confusion/CorpusConfusionTest.kt‎
Lines changed: 44 additions & 44 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/formats/ReaderTest.kt‎
Lines changed: 46 additions & 0 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/formats/ReaderTest.kt‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReaderTest.kt‎
Lines changed: 8 additions & 11 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/formats/conllu/ConlluReaderTest.kt‎
Lines changed: 8 additions & 11 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaExportTest.kt‎
Lines changed: 0 additions & 53 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaExportTest.kt‎
Lines changed: 0 additions & 53 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaImportTest.kt‎
Lines changed: 0 additions & 25 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaImportTest.kt‎
Lines changed: 0 additions & 25 deletions
diff --git a/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaReaderTest.kt‎
Lines changed: 24 additions & 0 deletions b/‎server/src/test/kotlin/org/ivdnt/galahad/formats/folia/FoliaReaderTest.kt‎
Lines changed: 24 additions & 0 deletions
@@ -2,10 +2,11 @@ package org.ivdnt.galahad.formats.tei
 
 import org.ivdnt.galahad.formats.xml.XmlReader
 import java.io.BufferedInputStream
+import java.io.InputStream
 import javax.xml.XMLConstants
 
 class TeiReader(
-    stream: BufferedInputStream,
+    stream: InputStream,
 ) : XmlReader(stream) {
     override val nerTags: Array<String> = NER_TAGS
     override val documentTags: Array<String> = DOCUMENT_TAGS
 
@@ -3,6 +3,7 @@ package org.ivdnt.galahad.formats.tsv
 import org.ivdnt.galahad.annotations.Annotation
 import org.ivdnt.galahad.annotations.Layer
 import org.ivdnt.galahad.annotations.Term
+import org.ivdnt.galahad.exceptions.DocumentInvalidException
 import org.ivdnt.galahad.formats.LineReader
 import java.io.File
 
@@ -16,7 +17,7 @@ class TsvReader(
         file.forEachLine { line ->
             if (columnIndices.isEmpty()) {
                 parseHeader(line)
-            } else {
+            } else if (!line.startsWith("#")) {
                 parseBody(line)
             }
         }
@@ -36,7 +37,7 @@ class TsvReader(
 
         // Check for the presence of a token
         if (columnIndices[Annotation.TOKEN] == null) {
-            throw IllegalArgumentException("No token column found in TSV file.")
+            throw DocumentInvalidException(file.name, "No token column found in TSV file.")
         }
     }
 
 
@@ -20,48 +20,48 @@ class CorpusConfusionTest {
         corpus = TestUtil.createCorpus()
     }
 
-    @Test
-    fun `Confusion of three docs summed`() {
-        EvaluationUtil.add_two_docs_to_corpus(corpus)
-        EvaluationUtil.addDocWithMissingMatches(corpus)
-        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME) // default reference is SOURCE_LAYER_NAME
-        // Table
-        cc.table.forEach(::println)
-        assertEquals(7, cc.table.size)
-        assertEquals(1, cc.table["NOU"]?.size)
-        assertEquals(2, cc.table["PD"]?.size)
-        // missing should exist
-        assertEquals(1, cc.table[TermComparison.MISSING_MATCH]?.size)
-        assertEquals(4, cc.table[TermComparison.MISSING_MATCH]?.get("LET")?.count)
-
-        // Matrix
-        cc.matrix.forEach(::println)
-        assertEquals(9, cc.matrix.size) // 4 matching pairs + 1 wrong
-        // (VRB, VRB) from the 1st doc should exist
-        assertEquals(2, cc.matrix["VRB" to "VRB"]?.count)
-        // (PD, WRONG) from the 2nd doc should exist
-        assertEquals(1, cc.matrix["WRONG" to "PD"]?.count)
-        //
-    }
-
-    @Test
-    fun `To CSV`() {
-        EvaluationUtil.add_two_docs_to_corpus(corpus)
-        EvaluationUtil.addDocWithMissingMatches(corpus)
-        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME) // default reference is SOURCE_LAYER_NAME
-        val csv: String = cc.countsToCSV()
-        assertEquals(TestUtil.get("evaluation/confusion/output.csv").readText(), csv)
-    }
-
-    @Test
-    fun `PoS confusion with filter`() {
-        EvaluationUtil.addDocWithMissingMatches(corpus)
-        val filter = ConfusionLayerFilter(
-            hypoTermFilter = HeadGroupTermFilter(Annotation.POS, "LET"),
-            refTermFilter = HeadGroupTermFilter(Annotation.POS, TermComparison.MISSING_MATCH),
-        )
-
-        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME, layerFilter = filter)
-        assertEquals(TestUtil.get("evaluation/confusion/let-vs-missing.csv").readText(), cc.samplesToCSV())
-    }
+//    @Test
+//    fun `Confusion of three docs summed`() {
+//        EvaluationUtil.add_two_docs_to_corpus(corpus)
+//        EvaluationUtil.addDocWithMissingMatches(corpus)
+//        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME) // default reference is SOURCE_LAYER_NAME
+//        // Table
+//        cc.table.forEach(::println)
+//        assertEquals(7, cc.table.size)
+//        assertEquals(1, cc.table["NOU"]?.size)
+//        assertEquals(2, cc.table["PD"]?.size)
+//        // missing should exist
+//        assertEquals(1, cc.table[TermComparison.MISSING_MATCH]?.size)
+//        assertEquals(4, cc.table[TermComparison.MISSING_MATCH]?.get("LET")?.count)
+//
+//        // Matrix
+//        cc.matrix.forEach(::println)
+//        assertEquals(9, cc.matrix.size) // 4 matching pairs + 1 wrong
+//        // (VRB, VRB) from the 1st doc should exist
+//        assertEquals(2, cc.matrix["VRB" to "VRB"]?.count)
+//        // (PD, WRONG) from the 2nd doc should exist
+//        assertEquals(1, cc.matrix["WRONG" to "PD"]?.count)
+//        //
+//    }
+//
+//    @Test
+//    fun `To CSV`() {
+//        EvaluationUtil.add_two_docs_to_corpus(corpus)
+//        EvaluationUtil.addDocWithMissingMatches(corpus)
+//        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME) // default reference is SOURCE_LAYER_NAME
+//        val csv: String = cc.countsToCSV()
+//        assertEquals(TestUtil.get("evaluation/confusion/output.csv").readText(), csv)
+//    }
+//
+//    @Test
+//    fun `PoS confusion with filter`() {
+//        EvaluationUtil.addDocWithMissingMatches(corpus)
+//        val filter = ConfusionLayerFilter(
+//            hypoTermFilter = HeadGroupTermFilter(Annotation.POS, "LET"),
+//            refTermFilter = HeadGroupTermFilter(Annotation.POS, TermComparison.MISSING_MATCH),
+//        )
+//
+//        val cc = JobConfusion(corpus, TestConfig.TAGGER_NAME, layerFilter = filter)
+//        assertEquals(TestUtil.get("evaluation/confusion/let-vs-missing.csv").readText(), cc.samplesToCSV())
+//    }
 }
@@ -0,0 +1,46 @@
+package org.ivdnt.galahad.formats
+
+import com.fasterxml.jackson.annotation.JsonInclude
+import com.fasterxml.jackson.databind.MapperFeature
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.databind.SerializationFeature
+import com.fasterxml.jackson.databind.json.JsonMapper
+import org.ivdnt.galahad.annotations.Layer
+import org.ivdnt.galahad.documents.DocumentFormat
+import org.ivdnt.galahad.util.TestUtil
+import org.junit.jupiter.api.Assertions.assertEquals
+
+abstract class ReaderTest {
+    protected abstract val format: DocumentFormat
+
+    private val mapper: ObjectMapper = JsonMapper.builder()
+        .configure(MapperFeature.SORT_PROPERTIES_ALPHABETICALLY, true)
+        .configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true)
+        .configure(SerializationFeature.INDENT_OUTPUT, true)
+        .build()
+        .setSerializationInclusion(JsonInclude.Include.NON_NULL)
+
+    protected fun assertLayerAndText(folder: String) {
+        val layer = InternalFile.create(TestUtil.get("$folder/input.${format.extension}")).layer
+        assertText(layer, folder)
+        assertLayer(layer, folder)
+    }
+
+    private fun assertLayer(layer: Layer, folder: String) {
+        val jsonExpected = TestUtil.get("$folder/layer.json").readText()
+        val json = mapper.writeValueAsString(layer)
+        assertEquals(cleanUUIDs(jsonExpected), cleanUUIDs(json))
+    }
+
+    private fun cleanUUIDs(text: String): String {
+        // Simple regex to match UUIDs
+        val uuidRegex = Regex("[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
+        return uuidRegex.replace(text, "UUID")
+    }
+
+    private fun assertText(layer: Layer, folder: String) {
+        val text = layer.toString().trim()
+        val expected = TestUtil.get("$folder/plaintext.txt").readText().trim()
+        assertEquals(expected, text)
+    }
+}
@@ -1,27 +1,24 @@
 package org.ivdnt.galahad.formats.conllu
 
-import org.ivdnt.galahad.util.TestUtil
-import org.junit.jupiter.api.Assertions.*
+import org.ivdnt.galahad.documents.DocumentFormat
+import org.ivdnt.galahad.formats.ReaderTest
 import org.junit.jupiter.api.Test
 
-class ConlluReaderTest {
+class ConlluReaderTest : ReaderTest() {
+    override val format: DocumentFormat = DocumentFormat.Conllu
+
     @Test
     fun `Empty nodes`() {
-        val reader = ConlluReader(TestUtil.get("formats/conllu/empty-nodes.conllu"))
-        val text = "Sue likes coffee and Bill tea\n" // LF because reader.layer produces a valid unix file.
-        assertEquals(text, reader.layer.toString())
+        assertLayerAndText("formats/conllu/reader/empty-nodes")
     }
 
     @Test
     fun `Multi-word tokens`() {
-        val reader = ConlluReader(TestUtil.get("formats/conllu/mw.conllu"))
-        val text = "Gas dalla statua.\nTer hoogte van.\n" // LF because reader.layer produces a valid unix file.
-        assertEquals(text, reader.layer.toString())
+        assertLayerAndText("formats/conllu/reader/mw")
     }
 
     @Test
     fun `Read underscore in TOKEN`() {
-        val reader = ConlluReader(TestUtil.get("formats/conllu/underscore.conllu"))
-        assertEquals("_\n", reader.layer.toString())  // LF because reader.layer produces a valid unix file.
+        assertLayerAndText("formats/conllu/reader/underscore")
     }
 }
@@ -0,0 +1,24 @@
+package org.ivdnt.galahad.formats.folia
+
+import org.ivdnt.galahad.documents.DocumentFormat
+import org.ivdnt.galahad.formats.ReaderTest
+import org.junit.jupiter.api.Test
+
+class FoliaReaderTest : ReaderTest() {
+    override val format: DocumentFormat = DocumentFormat.Folia
+
+    @Test
+    fun `Correction tags`() {
+        assertLayerAndText("formats/folia/reader/corrections")
+    }
+
+    @Test
+    fun `Import doc with multiple pos & lemma per word, and morphology tags`() {
+        assertLayerAndText("formats/folia/reader/morphology")
+    }
+
+    @Test
+    fun `Import plaintext twined with many style tags`() {
+        assertLayerAndText("formats/folia/reader/twine")
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@ package org.ivdnt.galahad.formats.tsv`
`3`	`3`	`import org.ivdnt.galahad.annotations.Annotation`
`4`	`4`	`import org.ivdnt.galahad.annotations.Layer`
`5`	`5`	`import org.ivdnt.galahad.annotations.Term`
	`6`	`+import org.ivdnt.galahad.exceptions.DocumentInvalidException`
`6`	`7`	`import org.ivdnt.galahad.formats.LineReader`
`7`	`8`	`import java.io.File`
`8`	`9`
`@@ -16,7 +17,7 @@ class TsvReader(`
`16`	`17`	`file.forEachLine { line ->`
`17`	`18`	`if (columnIndices.isEmpty()) {`
`18`	`19`	`parseHeader(line)`
`19`		`- } else {`
	`20`	`+ } else if (!line.startsWith("#")) {`
`20`	`21`	`parseBody(line)`
`21`	`22`	`}`
`22`	`23`	`}`
`@@ -36,7 +37,7 @@ class TsvReader(`
`36`	`37`
`37`	`38`	`// Check for the presence of a token`
`38`	`39`	`if (columnIndices[Annotation.TOKEN] == null) {`
`39`		`- throw IllegalArgumentException("No token column found in TSV file.")`
	`40`	`+ throw DocumentInvalidException(file.name, "No token column found in TSV file.")`
`40`	`41`	`}`
`41`	`42`	`}`
`42`	`43`