Skip to content

Commit ba7c977

Browse files
Vincent Prins
authored and committed
Added pdf and docx support
1 parent 7922924 commit ba7c977

8 files changed

Lines changed: 112 additions & 1 deletion

File tree

client/src/components/input/UploadDocuments.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
</label>
1414
<!-- Actual input -->
1515
<input type="file" ref="uploadInput" name="filefield" multiple id="file-upload" style="display: none;"
16-
accept=".xml, .tsv, .txt, .zip, .conllu, .naf"
16+
accept=".xml, .tsv, .txt, .zip, .conllu, .naf, .pdf, .docx"
1717
@change="e => filesToUpload = Object.values(e.target.files as FileList)" />
1818

1919

server/build.gradle.kts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ dependencies {
4646
// cache
4747
implementation("com.github.ben-manes.caffeine:caffeine:3.2.0")
4848

49+
// reading microsoft word docx
50+
implementation("org.apache.poi:poi-ooxml:5.4.1")
51+
52+
// reading pdf
53+
implementation("com.itextpdf:itextpdf:5.5.13.4")
54+
4955
// immutable arrays
5056
// implementation("com.danrusu.pods4k:pods4k:0.7.0")
5157
}

server/src/main/kotlin/org/ivdnt/galahad/documents/DocumentFormat.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ enum class DocumentFormat(val identifier: String, val extension: String) {
2626
Conllu("conllu", "conllu"),
2727
Folia("folia", "folia.xml"),
2828
Txt("txt", "txt"),
29+
Docx("docx", "docx"),
30+
Pdf("pdf", "pdf"),
2931
Unknown("unknown", "unknown");
3032

3133
@JsonValue
@@ -48,9 +50,11 @@ enum class DocumentFormat(val identifier: String, val extension: String) {
4850
"tsv" -> Tsv
4951
"folia" -> Folia
5052
"conllu" -> Conllu
53+
"docx" -> Docx
5154
"xml", "tei" -> determineXmlFormat(file) // TEI can be either P4 or P5, so still check.
5255
"txt" -> Txt
5356
"naf" -> Naf
57+
"pdf" -> Pdf
5458
else -> Unknown
5559
}
5660

server/src/main/kotlin/org/ivdnt/galahad/formats/InternalFile.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import org.ivdnt.galahad.annotations.Layer
55
import org.ivdnt.galahad.documents.DocumentFormat
66
import org.ivdnt.galahad.exceptions.DocumentInvalidException
77
import org.ivdnt.galahad.formats.conllu.ConlluFile
8+
import org.ivdnt.galahad.formats.docx.DocxFile
89
import org.ivdnt.galahad.formats.folia.FoliaFile
910
import org.ivdnt.galahad.formats.naf.NafFile
11+
import org.ivdnt.galahad.formats.pdf.PdfFile
1012
import org.ivdnt.galahad.formats.tei.TeiFile
1113
import org.ivdnt.galahad.formats.tsv.TsvFile
1214
import org.ivdnt.galahad.formats.txt.TxtFile
@@ -28,6 +30,8 @@ abstract class InternalFile protected constructor() {
2830
DocumentFormat.Naf -> NafFile(file)
2931
DocumentFormat.Txt -> TxtFile(file)
3032
DocumentFormat.Conllu -> ConlluFile(file)
33+
DocumentFormat.Docx -> DocxFile(file)
34+
DocumentFormat.Pdf -> PdfFile(file)
3135
// Multiple TEI formats
3236
DocumentFormat.TeiP4Legacy,
3337
DocumentFormat.TeiP5Legacy,
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package org.ivdnt.galahad.formats.docx
2+
3+
import org.ivdnt.galahad.documents.DocumentFormat
4+
import org.ivdnt.galahad.formats.InternalFile
5+
import java.io.BufferedInputStream
6+
import java.io.File
7+
8+
/**
 * [InternalFile] implementation for Microsoft Word (.docx) documents.
 *
 * @property file the .docx file on disk that this object wraps.
 */
class DocxFile(
    override val file: File
) : InternalFile() {
    /** Always [DocumentFormat.Docx]. */
    override val format: DocumentFormat = DocumentFormat.Docx

    /**
     * Reader over [file], created on first access only.
     *
     * Constructing it opens a buffered input stream on [file] and hands it to [DocxReader].
     * NOTE(review): this class never closes that stream itself — confirm that the parser does.
     */
    override val reader: DocxReader by lazy {
        val source = BufferedInputStream(file.inputStream())
        DocxReader(source)
    }
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package org.ivdnt.galahad.formats.docx
2+
3+
import org.apache.poi.xwpf.usermodel.XWPFDocument
4+
import org.ivdnt.galahad.annotations.Annotation
5+
import org.ivdnt.galahad.annotations.AnnotationReader
6+
import org.ivdnt.galahad.annotations.Layer
7+
import org.ivdnt.galahad.annotations.Term
8+
import java.io.InputStream
9+
10+
/**
 * Turns the text of a Microsoft Word (.docx) document into a [Layer] of tokens.
 *
 * The document is parsed from the given stream as soon as the reader is constructed.
 * Every whitespace-separated word becomes a [Term] that carries only an
 * [Annotation.TOKEN] annotation.
 *
 * @param stream input stream positioned at the start of the .docx content.
 */
class DocxReader(
    stream: InputStream
) : AnnotationReader() {
    /** The parsed Word document that [read] iterates over. */
    val doc: XWPFDocument = XWPFDocument(stream)

    /**
     * Tokenizes every paragraph of [doc] on whitespace and returns the collected result.
     *
     * `newParagraph()` is called after each Word paragraph (including blank ones) and
     * `newDocument()` once at the end; both are presumably inherited from
     * [AnnotationReader] — verify their exact semantics there.
     *
     * @return a [Layer] built from the documents accumulated by this reader.
     */
    override fun read(): Layer {
        doc.paragraphs.forEach { paragraph ->
            paragraph.text
                .split(whitespace)
                // split() yields empty strings for blank paragraphs and for leading/trailing
                // whitespace; those are not words and must not become empty-token terms.
                .filter { it.isNotBlank() }
                .forEach { word ->
                    terms += Term(
                        wordID(), offset, mapOf(
                            Annotation.TOKEN to word
                        )
                    )
                    // Advance past the word plus one separator character.
                    offset += word.length + 1
                }
            newParagraph()
        }
        newDocument()
        return Layer(documents.toTypedArray())
    }

    companion object {
        /** Matches one or more consecutive whitespace characters; used to split text into words. */
        val whitespace: Regex = Regex("""\s+""")
    }
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package org.ivdnt.galahad.formats.pdf
2+
3+
import org.ivdnt.galahad.documents.DocumentFormat
4+
import org.ivdnt.galahad.formats.InternalFile
5+
import org.ivdnt.galahad.formats.docx.DocxReader
6+
import java.io.BufferedInputStream
7+
import java.io.File
8+
9+
/**
 * [InternalFile] implementation for PDF documents.
 *
 * @property file the .pdf file on disk that this object wraps.
 */
class PdfFile(
    override val file: File
) : InternalFile() {
    /** Always [DocumentFormat.Pdf]. */
    override val format: DocumentFormat = DocumentFormat.Pdf

    /**
     * Reader over [file], created on first access only.
     *
     * Constructing it opens a buffered input stream on [file] and hands it to [PdfReader].
     * NOTE(review): this class never closes that stream itself — confirm that the parser does.
     */
    override val reader: PdfReader by lazy {
        val source = BufferedInputStream(file.inputStream())
        PdfReader(source)
    }
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package org.ivdnt.galahad.formats.pdf
2+
3+
import com.itextpdf.text.pdf.parser.PdfTextExtractor
4+
import org.ivdnt.galahad.annotations.Annotation
5+
import com.itextpdf.text.pdf.PdfReader as PdfReaderIText
6+
import org.ivdnt.galahad.annotations.AnnotationReader
7+
import org.ivdnt.galahad.annotations.Layer
8+
import org.ivdnt.galahad.annotations.Term
9+
import java.io.InputStream
10+
11+
/**
 * Turns the extracted text of a PDF document into a [Layer] of tokens.
 *
 * The PDF is opened from the given stream as soon as the reader is constructed.
 * Every non-blank, whitespace-separated word becomes a [Term] that carries only an
 * [Annotation.TOKEN] annotation.
 *
 * @param stream input stream positioned at the start of the PDF content.
 */
class PdfReader(
    stream: InputStream
) : AnnotationReader() {
    /** The underlying iText reader that [read] extracts page text from. */
    val reader = PdfReaderIText(stream)

    /**
     * Extracts the text of pages 1 through `numberOfPages`, in order, and tokenizes each
     * page on whitespace.
     *
     * `newDocument()` is called once after all pages have been processed; no paragraph
     * boundary is signalled between pages.
     *
     * @return a [Layer] built from the documents accumulated by this reader.
     */
    override fun read(): Layer {
        (1..reader.numberOfPages).forEach { pageNumber ->
            appendTokens(PdfTextExtractor.getTextFromPage(reader, pageNumber))
        }
        newDocument()
        return Layer(documents.toTypedArray())
    }

    /** Adds one [Term] per non-blank, whitespace-separated token of [pageText]. */
    private fun appendTokens(pageText: String) {
        for (token in pageText.split(whitespace)) {
            // Blank pages and leading/trailing whitespace produce empty pieces; skip them.
            if (token.isBlank()) continue
            terms += Term(wordID(), offset, mapOf(Annotation.TOKEN to token))
            // Advance past the token plus one separator character.
            offset += token.length + 1
        }
    }

    companion object {
        /** Matches one or more consecutive whitespace characters; used to split text into words. */
        val whitespace: Regex = Regex("""\s+""")
    }
}

0 commit comments

Comments
 (0)