Skip to content

Commit 6c00070

Browse files
committed
Fix XmlReader not ignoring tags
1 parent c17570a commit 6c00070

32 files changed

Lines changed: 1165 additions & 344 deletions

server/src/main/kotlin/org/ivdnt/galahad/app/Galahad.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ const val JOB_DOCUMENT_URL: String = "$JOB_URL/documents/{document}"
5252
const val EVALUATION_URL: String = "$JOB_URL/evaluation"
5353
const val DISTRIBUTION_URL: String = "$EVALUATION_URL/distribution"
5454
const val TOKEN_FREQUENCY_URL: String = "$EVALUATION_URL/frequency"
55+
const val ENTITIES_URL: String = "$JOB_DOCUMENT_URL/entities"
5556
const val METRICS_URL: String = "$EVALUATION_URL/metrics"
5657
const val METRICS_SAMPLES_URL: String = "$METRICS_URL/download"
5758
const val CONFUSION_URL: String = "$EVALUATION_URL/confusion"

server/src/main/kotlin/org/ivdnt/galahad/formats/folia/FoliaReader.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class FoliaReader(
3333
private val SENTENCE_TAGS = arrayOf("s", "utt")
3434
private val WORD_TAGS = arrayOf("w")
3535
private val WORD_DATA_TAGS = arrayOf("w", "lemma", "pos")
36-
private val IGNORABLE_TAGS = arrayOf("morphology", "note", "figure", "comment", "original", "suggestion")
36+
private val IGNORABLE_TAGS = arrayOf("morphology", "note", "figure", "comment", "original", "suggestion", "metadata")
3737
private val SPAN_TAGS = arrayOf("entity")
3838
private val SPAN_DATA_TAGS = arrayOf("wref", "entity")
3939
}

server/src/main/kotlin/org/ivdnt/galahad/formats/xml/XmlReader.kt

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
package org.ivdnt.galahad.formats.xml
22

3+
import org.ivdnt.galahad.annotations.*
34
import org.ivdnt.galahad.annotations.Annotation
4-
import org.ivdnt.galahad.annotations.AnnotationReader
5-
import org.ivdnt.galahad.annotations.Layer
6-
import org.ivdnt.galahad.annotations.Term
7-
import org.ivdnt.galahad.annotations.TermSpan
85
import org.ivdnt.galahad.util.XmlUtil
96
import java.io.InputStream
10-
import java.util.UUID
7+
import java.util.*
118
import javax.xml.XMLConstants
129
import javax.xml.stream.XMLStreamConstants
1310
import javax.xml.stream.XMLStreamReader
@@ -25,6 +22,8 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
2522
private val currentXmlID: String?
2623
get() = reader.getAttributeValue(XMLConstants.XML_NS_URI, "id")?.ifBlank { null }
2724
private var ignoring: Boolean = false
25+
private var currentDepth: Int = 0
26+
private var ignoreDepth: Int? = null
2827

2928
abstract val spanTags: Array<String>
3029
abstract val spanDataTags: Array<String>
@@ -35,7 +34,6 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
3534
abstract val wordDataTags: Array<String>
3635
abstract val ignorableTags: Array<String>
3736

38-
3937
final override fun read(): Layer {
4038
// retrieve the XML ID of the document root
4139
var rootID: String = UUID.randomUUID().toString()
@@ -64,7 +62,7 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
6462
}
6563
}
6664
XMLStreamConstants.CHARACTERS -> if (!ignoring) parseChars()
67-
XMLStreamConstants.END_ELEMENT -> if (!ignoring) {
65+
XMLStreamConstants.END_ELEMENT -> if (!shouldIgnore()) {
6866
when (reader.localName) {
6967
in documentTags -> newDocument()
7068
in paragraphTags -> newParagraph()
@@ -100,7 +98,22 @@ abstract class XmlReader(stream: InputStream) : AnnotationReader() {
10098

10199
protected abstract fun parseAttrs()
102100

103-
private fun shouldIgnore(): Boolean = ignoring.also { ignoring = reader.localName in ignorableTags }
101+
private fun shouldIgnore(): Boolean {
102+
if (reader.isStartElement) {
103+
currentDepth++
104+
if (!ignoring && reader.localName in ignorableTags) {
105+
ignoring = true
106+
ignoreDepth = currentDepth
107+
}
108+
} else if (reader.isEndElement) {
109+
if (currentDepth == ignoreDepth) {
110+
ignoring = false
111+
ignoreDepth = null
112+
}
113+
currentDepth--
114+
}
115+
return ignoring
116+
}
104117

105118
private fun parseChars() {
106119
val words = reader.text.ifBlank { null }?.split(whitespace) ?: emptyList()

server/src/main/kotlin/org/ivdnt/galahad/web/controller/EvaluationController.kt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import io.swagger.v3.oas.annotations.responses.ApiResponse
99
import org.apache.logging.log4j.kotlin.Logging
1010
import org.ivdnt.galahad.annotations.Annotation
1111
import org.ivdnt.galahad.annotations.SOURCE_LAYER_NAME
12+
import org.ivdnt.galahad.annotations.Term
1213
import org.ivdnt.galahad.app.*
1314
import org.ivdnt.galahad.evaluation.comparison.TermComparison
1415
import org.ivdnt.galahad.evaluation.confusion.Confusion
@@ -191,4 +192,12 @@ class EvaluationController(
191192
@PathVariable @Parameter(description = "Tagger name or sourceLayer") job: String,
192193
@RequestParam(defaultValue = SOURCE_LAYER_NAME) @Parameter(description = "Tagger name or sourceLayer") reference: String? = SOURCE_LAYER_NAME,
193194
): CorpusMetrics = evaluationService.getTokenFrequency(corpus, job, reference)
195+
196+
@CrossOrigin
197+
@GetMapping(ENTITIES_URL)
198+
fun getEntities(
199+
@PathVariable @Parameter(description = "Corpus UUID") corpus: UUID,
200+
@PathVariable @Parameter(description = "Document name") document: String,
201+
@PathVariable @Parameter(description = "Tagger name or sourceLayer") job: String,
202+
): List<Pair<String, List<Term>>> = evaluationService.getEntities(corpus, document, job)
194203
}

server/src/main/kotlin/org/ivdnt/galahad/web/service/EvaluationService.kt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import jakarta.servlet.http.HttpServletRequest
44
import jakarta.servlet.http.HttpServletResponse
55
import org.ivdnt.galahad.annotations.Annotation
66
import org.ivdnt.galahad.annotations.SOURCE_LAYER_NAME
7+
import org.ivdnt.galahad.annotations.Term
78
import org.ivdnt.galahad.app.User
89
import org.ivdnt.galahad.corpora.CorpusMetadata
910
import org.ivdnt.galahad.evaluation.comparison.*
@@ -277,4 +278,9 @@ class EvaluationService(val corpora: CorporaService) {
277278
)
278279
return cm
279280
}
281+
282+
fun getEntities(corpus: UUID, document: String, job: String): List<Pair<String, List<Term>>>{
283+
val layer = corpora.readAsReaderOrThrow(corpus, user).jobs.readOrThrow(job).getLayer(document)
284+
return layer.documents.flatMap { it.paragraphs.flatMap { it.sentences.flatMap { sent -> sent.spans[Annotation.NER]?.map { span -> span.value to span.indices.map { sent.terms[it] } } ?: emptyList() } } }
285+
}
280286
}

server/src/test/kotlin/org/ivdnt/galahad/formats/conllu/ConlluExportTest.kt

Lines changed: 0 additions & 44 deletions
This file was deleted.

server/src/test/kotlin/org/ivdnt/galahad/formats/conllu/ConlluImportTest.kt

Lines changed: 0 additions & 22 deletions
This file was deleted.
Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,40 @@
11
package org.ivdnt.galahad.formats.folia
22

3-
import org.ivdnt.galahad.formats.Resource
4-
import org.ivdnt.galahad.formats.assertPlaintextAndSourcelayer
5-
import org.ivdnt.galahad.formats.tsv.TsvFile
6-
import org.junit.jupiter.api.Assertions.assertEquals
3+
import org.ivdnt.galahad.util.TestUtil
74
import org.junit.jupiter.api.Test
85

9-
internal class FoliaImportTest {
6+
class FoliaImportTest {
107

118
@Test
129
fun `Import doc with correction tags`() {
13-
val file = FoliaFile(Resource.get("folia/corrections/input.folia.xml"))
14-
assertPlaintextAndSourcelayer("folia/corrections", file)
10+
val file = FoliaFile(TestUtil.get("formats/folia/reader/corrections/input.folia.xml"))
11+
TestUtil.assertPlaintextAndSourcelayer("formats/folia/reader/corrections", file)
1512
}
1613

1714
@Test
1815
fun `Import doc with multiple pos & lemma per word, and morphology tags`() {
19-
val foliaFile = FoliaFile(Resource.get("folia/hauraki/input.folia.xml"))
20-
val expectedPlain = Resource.get("folia/hauraki/plaintext.txt").readText()
21-
22-
assertEquals(expectedPlain, foliaFile.plaintext.trim())
23-
24-
val sourceLayer = foliaFile.layer
25-
assertEquals(97, sourceLayer.wordForms.size)
26-
assertEquals(97, sourceLayer.terms.size)
27-
28-
val tsvFile = TsvFile(Resource.get("folia/hauraki/pie.tsv"))
29-
30-
val mergeLayer = tsvFile.mapOnPlainText(foliaFile.plaintext, "mappedLayer")
31-
assertEquals(89, mergeLayer.wordForms.size)
32-
assertEquals(89, mergeLayer.terms.size)
16+
val file = FoliaFile(TestUtil.get("formats/folia/reader/morphology/input.folia.xml"))
17+
TestUtil.assertPlaintextAndSourcelayer("formats/folia/reader/morphology", file)
18+
19+
// val sourceLayer = foliaFile.layer
20+
// assertEquals(97, sourceLayer.wordForms.size)
21+
// assertEquals(97, sourceLayer.terms.size)
22+
//
23+
// val tsvFile = TsvFile(TestUtil.get("folia/hauraki/pie.tsv"))
24+
//
25+
// val mergeLayer = tsvFile.mapOnPlainText(foliaFile.plaintext, "mappedLayer")
26+
// assertEquals(89, mergeLayer.wordForms.size)
27+
// assertEquals(89, mergeLayer.terms.size)
3328
}
3429

3530
@Test
3631
fun `Import plaintext twined with many style tags`() {
37-
val file = FoliaFile(Resource.get("folia/twine/twine.folia.xml"))
38-
val plaintext = Resource.get("folia/twine/plaintext.txt").readText()
39-
assertEquals(plaintext, file.plaintext)
40-
// Source layer should be empty, there are no source annotations
41-
val sourceLayer = file.layer
42-
assertEquals(0, sourceLayer.wordForms.size)
43-
assertEquals(0, sourceLayer.terms.size)
32+
val file = FoliaFile(TestUtil.get("formats/folia/reader/twine/twine.folia.xml"))
33+
TestUtil.assertPlainText("formats/folia/reader/twine", file)
34+
// assertEquals(plaintext, file.plaintext)
35+
// // Source layer should be empty, there are no source annotations
36+
// val sourceLayer = file.layer
37+
// assertEquals(0, sourceLayer.wordForms.size)
38+
// assertEquals(0, sourceLayer.terms.size)
4439
}
4540
}

server/src/test/kotlin/org/ivdnt/galahad/formats/naf/NafExportTest.kt

Lines changed: 0 additions & 47 deletions
This file was deleted.

server/src/test/kotlin/org/ivdnt/galahad/formats/naf/NafImportTest.kt

Lines changed: 0 additions & 12 deletions
This file was deleted.

0 commit comments

Comments
 (0)