Skip to content

Commit 4561040

Browse files
committed
Redid csv export of confusion, distribution; and newly for entities
1 parent 0ec8673 commit 4561040

13 files changed

Lines changed: 198 additions & 89 deletions

File tree

server/src/main/kotlin/org/ivdnt/galahad/evaluation/CsvSampleExporter.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package org.ivdnt.galahad.evaluation
22

33
import org.ivdnt.galahad.annotations.Term
44
import org.ivdnt.galahad.evaluation.comparison.TermComparison
5-
import org.ivdnt.galahad.export.csv.CSVFile
5+
import org.ivdnt.galahad.export.csv.CsvFile
66
import org.ivdnt.galahad.taggers.Tagger
77

88
interface CsvSampleExporter {
@@ -19,14 +19,14 @@ interface CsvSampleExporter {
1919
val columns: MutableList<String> = mutableListOf("token")
2020
columns.addAll(refColumns.map { "${refJob.id} ${it.value}" })
2121
columns.addAll(hypoColumns.map { "${hypoJob.id} ${it.value}" })
22-
csv += CSVFile.toCSVHeader(columns)
22+
csv += CsvFile.toCsvString(columns)
2323

2424
// body
2525
comps?.forEach { termComp ->
2626
val literal = termComp.hyp.token.ifEmpty { termComp.ref.token }
2727
val refAnnots = refColumns.map { termComp.ref.annotations[it] ?: Term.missingName(it) }
2828
val hypoAnnots = hypoColumns.map { termComp.hyp.annotations[it] ?: Term.missingName(it) }
29-
csv += CSVFile.toCSVRecord(listOf(literal) + refAnnots + hypoAnnots)
29+
csv += CsvFile.toCsvString(listOf(literal) + refAnnots + hypoAnnots)
3030
}
3131
return csv
3232
}

server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/JobConfusion.kt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ import org.ivdnt.galahad.annotations.Annotation
55
import org.ivdnt.galahad.corpora.Corpus
66
import org.ivdnt.galahad.evaluation.DocumentEvaluations
77
import org.ivdnt.galahad.evaluation.EvaluationEntry
8+
import org.ivdnt.galahad.evaluation.comparison.TermComparison
9+
import org.ivdnt.galahad.export.csv.CsvFile
10+
import org.ivdnt.galahad.export.csv.CsvString
811
import org.ivdnt.galahad.util.merge
912

1013
/**
@@ -28,5 +31,35 @@ class JobConfusion(
2831
}
2932
}
3033
)
34+
35+
/**
 * Renders a confusion table as CSV text.
 *
 * The first line is a header: a legend cell followed by the column labels from [sortedHeader].
 * Every following line starts with a row label taken from [sortedEntries] and contains, for each
 * column label, the `count` of the matching [EvaluationEntry], or "0" when that cell is absent.
 *
 * @param confusion row label -> (column label -> entry)
 * @return the header line plus one line per row, each produced by [CsvFile.toCsvString]
 */
fun toCsv(confusion: Map<String, Map<String, EvaluationEntry>>): CsvString {
    val columns = sortedHeader(confusion)
    val out = StringBuilder()
    out.append(CsvFile.toCsvString(listOf("Hypothesis → Reference ↓") + columns))
    for ((rowLabel, cells) in sortedEntries(confusion)) {
        val counts = columns.map { column -> cells[column]?.count?.toString() ?: "0" }
        out.append(CsvFile.toCsvString(listOf(rowLabel) + counts))
    }
    return out.toString()
}
44+
45+
/**
 * Collects every inner-map key that occurs in [confusion], without duplicates and in natural
 * string order, except that [TermComparison.MISSING_MATCH] (when it occurs) is placed last.
 */
private fun sortedHeader(confusion: Map<String, Map<String, EvaluationEntry>>): List<String> {
    val labels = confusion.values.flatMap { it.keys }.distinct()
    // After distinct() the "missing" half holds at most one element.
    val (missing, regular) = labels.partition { it == TermComparison.MISSING_MATCH }
    return regular.sorted() + missing
}
53+
54+
/**
 * Returns the entries of [confusion] ordered by key, except that the entry keyed by
 * [TermComparison.MISSING_MATCH] (when it exists) is placed last.
 */
private fun sortedEntries(confusion: Map<String, Map<String, EvaluationEntry>>): List<Map.Entry<String, Map<String, EvaluationEntry>>> {
    // Map keys are unique, so the "missing" half holds at most one entry.
    val (missing, regular) = confusion.entries.partition { it.key == TermComparison.MISSING_MATCH }
    return regular.sortedBy { it.key } + missing
}
3164
}
3265
}

server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/DocumentDistribution.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class DocumentDistribution(
99
@JsonValue val typeTokens: Map<Annotation, List<TypeToken>>
1010
) {
1111
companion object {
12-
private val ANNOTATIONS = arrayOf(Annotation.POS, Annotation.UPOS, Annotation.NER)
12+
private val ANNOTATIONS = arrayOf(Annotation.POS, Annotation.UPOS, Annotation.NER, Annotation.DEPREL)
1313

1414
fun create(layer: Layer): DocumentDistribution =
1515
DocumentDistribution(buildMap<Annotation, MutableMap<Pair<String, String>, MutableMap<String, Int>>> {

server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/JobDistribution.kt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import com.fasterxml.jackson.annotation.JsonValue
44
import org.ivdnt.galahad.annotations.Annotation
55
import org.ivdnt.galahad.corpora.Corpus
66
import org.ivdnt.galahad.evaluation.DocumentEvaluations
7+
import org.ivdnt.galahad.export.csv.CsvFile
8+
import org.ivdnt.galahad.export.csv.CsvString
79
import org.ivdnt.galahad.util.merge
810

911
/**
@@ -25,6 +27,20 @@ class JobDistribution(
2527
}.values.sortedByDescending { it.count }
2628
})
2729
})
30+
31+
/**
 * Renders [typeTokens] as CSV text.
 *
 * The header line is `lemma, group, count, unique, tokens`. Each entry then contributes one line
 * with its lemma, group and count, the number of entries in its `tokens` map, and those entries
 * joined as `key (value)` pairs separated by ", ".
 */
fun toCsv(typeTokens: List<TypeToken>): CsvString {
    val header = CsvFile.toCsvString(listOf("lemma", "group", "count", "unique", "tokens"))
    val rows = typeTokens.joinToString(separator = "") { tt ->
        val tokenSummary = tt.tokens.entries.joinToString { "${it.key} (${it.value})" }
        CsvFile.toCsvString(listOf(tt.lemma, tt.group, tt.count, tt.tokens.size, tokenSummary))
    }
    return header + rows
}
2844
}
2945
}
3046

server/src/main/kotlin/org/ivdnt/galahad/evaluation/entities/CorpusEntities.kt

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ package org.ivdnt.galahad.evaluation.entities
33
import org.ivdnt.galahad.corpora.Corpus
44
import org.ivdnt.galahad.evaluation.CorpusEvaluation
55
import org.ivdnt.galahad.evaluation.JobPair
6+
import org.ivdnt.galahad.export.csv.CsvFile
7+
import org.ivdnt.galahad.export.csv.CsvString
8+
import org.ivdnt.galahad.util.toFixed
69
import kotlin.math.pow
710
import kotlin.math.sqrt
811

@@ -67,5 +70,50 @@ class CorpusEntities(
6770
val jobstddev = JobsEntitiesStddev(docstddevs, labelAvg, avg)
6871
return CorpusEntities(jobEntities, jobstddev)
6972
}
73+
74+
/**
 * Renders [entities] as CSV text: the header from [getHeader], then one line per document.
 *
 * Per document the cells are, in order: the document name; for every job (sorted by name) the
 * per-label value from that document's summary (0 when absent) followed by the job's `total`;
 * the per-label stddev values (0.0 when absent); and the stddev average (0.0 when absent).
 */
fun toCsv(entities: CorpusEntities): CsvString {
    val out = StringBuilder(CsvFile.toCsvString(getHeader(entities)))
    val jobNames = getJobs(entities)
    val stddevLabels = getLabels(entities)

    for (doc in getDocs(entities)) {
        val cells = mutableListOf<Any>(doc)
        val docStddev = entities.stddev.documents[doc]

        for (jobName in jobNames) {
            // jobName comes from the key set of entities.jobs, so the lookup cannot miss.
            val job = entities.jobs[jobName]!!
            for (label in getLabels(job)) {
                cells.add(job.documents[doc]?.summary?.get(label) ?: 0)
            }
            // NOTE(review): this reads the total from the job, not from the document, so the
            // same value is repeated on every document row — confirm that this is intended.
            cells.add(entities.jobs[jobName]?.total ?: 0)
        }

        for (label in stddevLabels) {
            cells.add(docStddev?.stddev?.get(label)?.toFixed() ?: 0.0)
        }
        cells.add(docStddev?.average?.toFixed() ?: 0.0)

        out.append(CsvFile.toCsvString(cells))
    }
    return out.toString()
}
98+
99+
/** Keys of the corpus-level stddev map of [entities], in ascending order. */
private fun getLabels(entities: CorpusEntities): List<String> {
    val labels = entities.stddev.stddev.keys
    return labels.sorted()
}
100+
/** Keys of the summary map of [job], in ascending order. */
private fun getLabels(job: JobEntities): List<String> {
    val labels = job.summary.keys
    return labels.sorted()
}
101+
/** Names of all jobs in [entities], in ascending order. */
private fun getJobs(entities: CorpusEntities): List<String> {
    val names = entities.jobs.keys
    return names.sorted()
}
102+
/** Names of all documents that have a stddev entry in [entities], in ascending order. */
private fun getDocs(entities: CorpusEntities): List<String> {
    val names = entities.stddev.documents.keys
    return names.sorted()
}
103+
104+
/**
 * Builds the header cells for the entities CSV.
 *
 * Order: "document"; for every job (sorted by name) one "<job> <label>" cell per summary label
 * followed by "<job> total"; one "<label> std" cell per stddev label; and finally "stdavg".
 */
private fun getHeader(entities: CorpusEntities): MutableList<String> {
    val columns = mutableListOf("document")
    for (jobName in getJobs(entities)) {
        // jobName comes from the key set of entities.jobs, so the lookup cannot miss.
        val job = entities.jobs[jobName]!!
        getLabels(job).mapTo(columns) { label -> "$jobName $label" }
        columns += "$jobName total"
    }
    getLabels(entities).mapTo(columns) { label -> "$label std" }
    columns += "stdavg"
    return columns
}
70118
}
71119
}

server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metric.kt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ import com.fasterxml.jackson.annotation.JsonProperty
55
import org.ivdnt.galahad.annotations.Layer
66
import org.ivdnt.galahad.annotations.Term
77
import org.ivdnt.galahad.evaluation.EvaluationEntry
8-
import org.ivdnt.galahad.export.csv.CSVFile
8+
import org.ivdnt.galahad.export.csv.CsvFile
99
import org.ivdnt.galahad.export.csv.CSVHeader
10-
import org.ivdnt.galahad.export.csv.CSVRecord
10+
import org.ivdnt.galahad.export.csv.CsvString
1111
import org.ivdnt.galahad.util.toFixed
1212

1313
/**
@@ -131,8 +131,8 @@ data class Metric(
131131
return this
132132
}
133133

134-
fun toCSVRecord(): CSVRecord {
135-
return CSVFile.toCSVRecord(
134+
fun toCSVRecord(): CsvString {
135+
return CsvFile.toCsvString(
136136
listOf(
137137
name,
138138
clsMetrics.precision.toFixed(),
@@ -148,8 +148,8 @@ data class Metric(
148148
}
149149

150150
companion object {
151-
fun getCsvHeader(): CSVHeader {
152-
return CSVFile.toCSVHeader(
151+
fun getCsvHeader(): CsvString {
152+
return CsvFile.toCsvString(
153153
listOf(
154154
"grouped by",
155155
"precision",

server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metrics.kt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ package org.ivdnt.galahad.evaluation.metrics
33
import com.fasterxml.jackson.annotation.JsonIgnore
44
import com.fasterxml.jackson.annotation.JsonProperty
55
import org.ivdnt.galahad.evaluation.comparison.TermComparison
6-
import org.ivdnt.galahad.export.csv.CSVFile
6+
import org.ivdnt.galahad.export.csv.CsvFile
77
import org.ivdnt.galahad.export.csv.CSVHeader
8+
import org.ivdnt.galahad.export.csv.CsvString
89
import org.ivdnt.galahad.jobs.Job
910
import org.ivdnt.galahad.taggers.Tagger
1011

@@ -44,8 +45,8 @@ open class Metrics(
4445
}
4546

4647
companion object {
47-
fun getCsvHeader(): CSVHeader {
48-
return CSVFile.toCSVHeader(
48+
fun getCsvHeader(): CsvString {
49+
return CsvFile.toCsvString(
4950
listOf(
5051
"annotation",
5152
"grouped by",

server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/MetricsType.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import org.ivdnt.galahad.evaluation.CsvSampleExporter
66
import org.ivdnt.galahad.evaluation.EvaluationEntry
77
import org.ivdnt.galahad.evaluation.comparison.TermComparison
88
import org.ivdnt.galahad.exceptions.InvalidClassificationTypeException
9-
import org.ivdnt.galahad.export.csv.CSVFile
9+
import org.ivdnt.galahad.export.csv.CsvFile
1010
import org.ivdnt.galahad.taggers.Tagger
1111
import org.ivdnt.galahad.util.toFixed
1212

@@ -81,7 +81,7 @@ class MetricsType(
8181
val microMetrics = micro
8282
val macroMetrics = macro
8383

84-
return CSVFile.toCSVRecord(
84+
return CsvFile.toCsvString(
8585
listOf(
8686
setting.annotation,
8787
setting.group,

server/src/main/kotlin/org/ivdnt/galahad/export/csv/CSVFile.kt

Lines changed: 0 additions & 50 deletions
This file was deleted.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package org.ivdnt.galahad.export.csv
2+
3+
import java.io.File
4+
5+
typealias CsvString = String

/**
 * A [File] that is written as Excel-compatible CSV: UTF-16LE text that starts with a byte order
 * mark and an explicit `sep=,` line.
 *
 * The header is written on construction, but only when the file is still empty (or does not
 * exist yet). [append] never truncates, so wrapping a path that already holds CSV content must
 * not insert a second BOM / `sep=` line in the middle of the data.
 *
 * @param path location of the CSV file on disk
 */
class CsvFile(
    path: File,
) : File(path.toURI()) {

    init {
        // File.length() is 0 for a missing file as well as for an empty one.
        if (length() == 0L) {
            this.append(EXCEL_HEADER)
        }
    }

    /** Append Excel compatible text. */
    fun append(text: CsvString) {
        this.appendText(text, Charsets.UTF_16LE)
    }

    companion object {
        // Alternatively we could check for forbidden characters first, and then wrap/replace only when necessary.
        // However, this works and gives a consistent result.
        private fun csvEscape(s: String): String = "\"${s.replace("\"", "\"\"")}\""

        // BOM forces Excel to read UTF16LE. Needed for e.g. 'ü'. (https://en.wikipedia.org/wiki/Byte_order_mark)
        // Explicit separator needed as default will be ',' in the US but ';' in EU.
        private const val EXCEL_HEADER: String = "${'\uFEFF'}sep=,\n"

        /**
         * Turns [values] into a single CSV line: every value is converted with `toString()`,
         * wrapped in double quotes (embedded quotes are doubled), joined with ',' and
         * terminated with a newline.
         */
        fun toCsvString(values: List<Any>): CsvString =
            values.joinToString(",") { csvEscape(it.toString()) }.plus("\n")
    }
}

0 commit comments

Comments
 (0)