Skip to content

Commit 11ad8e4

Browse files
committed
Write TEI as xml stream to zip
1 parent eae374d commit 11ad8e4

27 files changed

Lines changed: 216 additions & 131 deletions

server/.run/Galahad.run.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<component name="ProjectRunConfigurationManager">
2-
<configuration default="false" name="Galahad" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
2+
<configuration default="true" name="Galahad" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
33
<option name="ALTERNATIVE_JRE_PATH" value="graalvm-ce-23" />
44
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="true" />
55
<envs>

server/build.gradle.kts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ dependencies {
4949
implementation("com.github.ben-manes.caffeine:caffeine:3.2.0")
5050

5151
// immutable arrays
52-
implementation("com.danrusu.pods4k:pods4k:0.7.0")
52+
// implementation("com.danrusu.pods4k:pods4k:0.7.0")
5353
}
5454

5555
tasks.withType<Test> {

server/src/main/kotlin/org/ivdnt/galahad/annotations/AnnotationReader.kt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ abstract class AnnotationReader(
1111
protected val paragraphs: MutableList<ParagraphLayer> = mutableListOf()
1212
protected val sentences: MutableList<SentenceLayer> = mutableListOf()
1313
protected val terms: MutableList<Term> = mutableListOf()
14-
protected val spans: MutableMap<Annotation, List<TermSpan>> = mutableMapOf()
14+
protected val spans: MutableMap<Annotation, Array<TermSpan>> = mutableMapOf()
1515

1616
protected var offset: Int = 0
1717

@@ -35,22 +35,22 @@ abstract class AnnotationReader(
3535
protected open fun newDocument() {
3636
newParagraph()
3737
if (paragraphs.isNotEmpty()) {
38-
documents.add(DocumentLayer(docID(), paragraphs.toList()))
38+
documents.add(DocumentLayer(docID(), paragraphs.toTypedArray()))
3939
paragraphs.clear()
4040
}
4141
}
4242

4343
protected open fun newParagraph() {
4444
newSentence()
4545
if (sentences.isNotEmpty()) {
46-
paragraphs.add(ParagraphLayer(parID(), sentences.toList()))
46+
paragraphs.add(ParagraphLayer(parID(), sentences.toTypedArray()))
4747
sentences.clear()
4848
}
4949
}
5050

5151
protected open fun newSentence() {
5252
if (terms.isNotEmpty()) {
53-
sentences.add(SentenceLayer(sentID(), terms.toList(), spans.toMap()))
53+
sentences.add(SentenceLayer(sentID(), terms.toTypedArray(), spans.toMap()))
5454
terms.clear()
5555
spans.clear()
5656
}

server/src/main/kotlin/org/ivdnt/galahad/annotations/Layer.kt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ const val SOURCE_LAYER_NAME: String = "sourceLayer"
99
* Those may be split into paragraphs, sentences, etc.
1010
*/
1111
class Layer(
12-
val documents: List<DocumentLayer>
12+
val documents: Array<DocumentLayer>
1313
) {
1414
@get:JsonIgnore
1515
val spans: Map<Annotation, Sequence<TermSpan>> by lazy {
@@ -41,28 +41,28 @@ class Layer(
4141
override fun toString(): String = documents.joinToString("\n\n") + "\n" // Unix convention EOF
4242

4343
companion object {
44-
val EMPTY: Layer = Layer(emptyList())
44+
val EMPTY: Layer = Layer(emptyArray())
4545
}
4646
}
4747

4848
class DocumentLayer(
4949
val id: String,
50-
val paragraphs: List<ParagraphLayer>,
50+
val paragraphs: Array<ParagraphLayer>,
5151
) {
5252
override fun toString(): String = paragraphs.joinToString("\n\n")
5353
}
5454

5555
class ParagraphLayer(
5656
val id: String,
57-
val sentences: List<SentenceLayer>,
57+
val sentences: Array<SentenceLayer>,
5858
) {
5959
override fun toString(): String = sentences.joinToString("\n")
6060
}
6161

6262
class SentenceLayer(
6363
val id: String,
64-
val terms: List<Term>,
65-
val spans: Map<Annotation, List<TermSpan>>,
64+
val terms: Array<Term>,
65+
val spans: Map<Annotation, Array<TermSpan>>,
6666
) {
6767
override fun toString(): String = terms.joinToString("") { it.token + (if (it.spaceAfter == false) "" else " ") }
6868
}

server/src/main/kotlin/org/ivdnt/galahad/corpora/documents/Document.kt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import org.ivdnt.galahad.corpora.Corpus
66
import org.ivdnt.galahad.files.DiskValue
77
import org.ivdnt.galahad.files.GalahadFolder
88
import org.ivdnt.galahad.formats.InternalFile
9+
import org.ivdnt.galahad.util.ThreadPoolUtil
910
import java.io.File
1011

1112
/**
@@ -71,17 +72,19 @@ class Document(
7172
val doc = Document(dir)
7273

7374
// uploaded file
74-
file.copyTo(doc.uploadedFile, overwrite = true)
75-
75+
ThreadPoolUtil.pool.execute {
76+
file.copyTo(doc.uploadedFile, overwrite = true)
77+
}
7678
// plaintext & sourceLayer
7779
val internalFile = InternalFile.create(file)
7880
// First try to access the layer. If the file is invalid, this will throw.
7981
val sourceLayer = internalFile.layer
8082
// Store the source layer as a job. Because an invalid file has already thrown above, we don't unnecessarily create a job folder, keeping the disk clean.
8183
corpus.jobs.createOrThrow(SOURCE_LAYER_NAME).jobDocuments.createOrThrow(doc.name).layer = sourceLayer
8284
// plaintext
83-
doc.plaintextFile.writeText(internalFile.plaintext)
84-
85+
ThreadPoolUtil.pool.execute {
86+
doc.plaintextFile.writeText(internalFile.plaintext)
87+
}
8588
// metadata; needs to be serialized as well
8689
DiskValue<DocumentMetadata>(doc.metadataFile).write(DocumentMetadata.create(internalFile))
8790

server/src/main/kotlin/org/ivdnt/galahad/corpora/documents/DocumentFormat.kt

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,14 @@ enum class DocumentFormat(val identifier: String, val extension: String) {
4444
/**
4545
* Infer the format of a document based on its file extension and content (e.g. root XML node).
4646
*/
47-
fun fromFile(file: File): DocumentFormat {
48-
val format = when (file.extension) {
49-
"tsv" -> Tsv
50-
"folia" -> Folia
51-
"conllu" -> Conllu
52-
"xml", "tei" -> determineXmlFormat(file) // TEI can be either P4 or P5, so still check.
53-
"txt" -> Txt
54-
"naf" -> Naf
55-
else -> Unknown
56-
}
57-
logger.debug { "Induced format $format for file ${file.name}" }
58-
return format
47+
fun fromFile(file: File): DocumentFormat = when (file.extension) {
48+
"tsv" -> Tsv
49+
"folia" -> Folia
50+
"conllu" -> Conllu
51+
"xml", "tei" -> determineXmlFormat(file) // TEI can be either P4 or P5, so still check.
52+
"txt" -> Txt
53+
"naf" -> Naf
54+
else -> Unknown
5955
}
6056

6157
/**
@@ -76,18 +72,5 @@ enum class DocumentFormat(val identifier: String, val extension: String) {
7672
}
7773
return Unknown // No root element found
7874
}
79-
80-
/** Differentiate between TeiP5 and TeiP5Legacy by the presence of pos as an XML attribute.
81-
* - 1 or more pos are present, it's TeiP5
82-
* - if no pos are present, but at least one type is present, it's TeiP5Legacy
83-
* - if no pos or type are present, it's unannotated and we default to TeiP5
84-
*/
85-
private fun determineTeiP5Format(xmlDoc: Document): DocumentFormat {
86-
val xPath: XPath = XPathFactory.newInstance().newXPath()
87-
val numPos = xPath.compile("count(.//w[@pos])").evaluate(xmlDoc, XPathConstants.NUMBER) as Double
88-
val numTypes = xPath.compile("count(.//w[@type])").evaluate(xmlDoc, XPathConstants.NUMBER) as Double
89-
if (numTypes == 0.0 || numPos > 0) return TeiP5
90-
return TeiP5Legacy // No pos but at least one type: assume legacy mode
91-
}
9275
}
9376
}

server/src/main/kotlin/org/ivdnt/galahad/export/CmdiMetadata.kt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import org.ivdnt.galahad.util.child
88
import org.ivdnt.galahad.util.childElements
99
import org.ivdnt.galahad.util.ifNullOrBlank
1010
import java.io.File
11+
import java.io.OutputStream
1112
import java.text.SimpleDateFormat
1213
import java.util.*
1314
import javax.xml.transform.dom.DOMSource
@@ -33,10 +34,7 @@ class CmdiMetadata(val export: DocumentExport) {
3334
private val sourceName = corpus.sourceName.ifNullOrBlank { "!No source name defined!" }
3435
private val sourceUrl = corpus.sourceURL?.toString().ifNullOrBlank { "!No source URL defined!" }
3536

36-
/** After initialization this file will contain the CMDI */
37-
val file: File
38-
39-
init {
37+
fun write(out: OutputStream) {
4038
// Header
4139
val header = root.child("cmd:Header")
4240
header.child("cmd:MdCreationDate").textContent = date
@@ -93,8 +91,7 @@ class CmdiMetadata(val export: DocumentExport) {
9391
child("cmdp:dayTo").textContent = "---$day"
9492
}
9593
// Write to the provided output stream
96-
file = tmp_dir.resolve("CMDI-$docTitle.xml")
97-
XmlUtil.transformer.transform(DOMSource(xml), StreamResult(file.outputStream()))
94+
XmlUtil.transformer.transform(DOMSource(xml), StreamResult(out))
9895
}
9996

10097
companion object {

server/src/main/kotlin/org/ivdnt/galahad/export/CorpusExport.kt

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import org.ivdnt.galahad.corpora.documents.Documents
1010
import org.ivdnt.galahad.corpora.jobs.Job
1111
import org.ivdnt.galahad.exceptions.MergeNotImplementedException
1212
import org.ivdnt.galahad.taggers.Tagger
13+
import org.ivdnt.galahad.util.FileMapper
1314
import org.ivdnt.galahad.util.createZipFile
1415
import java.io.File
1516
import java.io.OutputStream
@@ -38,16 +39,16 @@ class CorpusExport private constructor(
3839
}
3940

4041

41-
fun formatMapper(doc: Document): File {
42+
private fun formatMapper(doc: Document, out: OutputStream) {
4243
try {
4344
// Document conversions.
4445
val docExport = DocumentExport.create(this, doc)
45-
return if (shouldMerge && mergeFormatMatches(doc, format)) {
46+
if (shouldMerge && mergeFormatMatches(doc, format)) {
4647
logger.info("Merging ${doc.name} of format ${doc.metadata.format}")
47-
docExport.merge()
48+
docExport.merge(out)
4849
} else {
4950
logger.info("Converting ${doc.name} of format ${doc.metadata.format} to $format")
50-
docExport.convert()
51+
docExport.convert(out)
5152
}
5253
} catch (e: MergeNotImplementedException) {
5354
throw e
@@ -63,13 +64,8 @@ class CorpusExport private constructor(
6364
out: OutputStream,
6465
) {
6566
val documents = corpus.documents.readAll().filter { DocumentExport.create(this, it).layer != Layer.EMPTY }
66-
val convertedDocs = documents.asSequence().map(::formatMapper)
67-
val docsToCmdi = documents.asSequence().map { CmdiMetadata(DocumentExport.create(this, it)).file }
68-
val cmdiZip = createZipFile(docsToCmdi, includeCMDI = true)
69-
// rename the cmdiZip to "metadata"
70-
val dest = createTempDirectory("metadata").toFile().resolve("metadata.zip")
71-
Files.move(cmdiZip.toPath(), dest.toPath())
72-
createZipFile(convertedDocs + dest, out)
67+
val seq: Sequence<FileMapper> = documents.asSequence().map { doc -> doc.name to { out -> formatMapper(doc, out) } }
68+
createZipFile(seq, out)
7369
}
7470

7571
companion object {

server/src/main/kotlin/org/ivdnt/galahad/export/DocumentExport.kt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import org.ivdnt.galahad.corpora.documents.DocumentFormat
99
import org.ivdnt.galahad.corpora.jobs.Job
1010
import org.ivdnt.galahad.taggers.Tagger
1111
import java.io.File
12+
import java.io.OutputStream
1213
import kotlin.io.path.createTempDirectory
1314

1415
class DocumentExport private constructor(
@@ -29,6 +30,10 @@ class DocumentExport private constructor(
2930
fun convert(): File = file.also { LayerConverter.create(this).convert(file.outputStream()) }
3031
fun merge(): File = file.also { LayerMerger.create(this).merge(file.outputStream()) }
3132

33+
fun convert(out: OutputStream): Unit = LayerConverter.create(this).convert(out)
34+
fun merge(out: OutputStream): Unit = LayerMerger.create(this).merge(out)
35+
fun cmdi(): CmdiMetadata = CmdiMetadata(this)
36+
3237
companion object {
3338
fun create(export: CorpusExport, doc: Document): DocumentExport = create(export, doc.name)
3439

server/src/main/kotlin/org/ivdnt/galahad/export/LayerConverter.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import org.ivdnt.galahad.exceptions.InvalidDocumentFormatException
55
import org.ivdnt.galahad.formats.conllu.ConlluConverter
66
import org.ivdnt.galahad.formats.folia.FoliaConverter
77
import org.ivdnt.galahad.formats.naf.NafConverter
8+
import org.ivdnt.galahad.formats.tei.TeiAaltoConverter
89
import org.ivdnt.galahad.formats.tei.TeiConverter
910
import org.ivdnt.galahad.formats.tsv.TsvConverter
1011
import org.ivdnt.galahad.formats.txt.TxtConverter
@@ -20,7 +21,7 @@ abstract class LayerConverter(protected val export: DocumentExport) {
2021
DocumentFormat.Naf -> NafConverter(export)
2122
DocumentFormat.Txt -> TxtConverter(export)
2223
DocumentFormat.Conllu -> ConlluConverter(export)
23-
DocumentFormat.TeiP5 -> TeiConverter(export)
24+
DocumentFormat.TeiP5 -> TeiAaltoConverter(export)
2425
else -> throw InvalidDocumentFormatException("Unsupported export conversion format: ${export.format}")
2526
}
2627
}

0 commit comments

Comments
 (0)