Skip to content

Commit 877da11

Browse files
cpcloud and claude authored
feat(extract): send spatial layout annotations from OCR to LLM (#724)
## Summary

- Send compact spatial layout annotations from tesseract OCR to the LLM during document extraction, improving accuracy for invoices, forms, and tabular documents
- Format: `[left,top,width]` per line (~2x token overhead vs plain text), with `[left,top,width;conf]` only for suspect lines below a configurable confidence threshold
- Drop height from bounding boxes (nearly constant across lines, no useful signal)
- New config: `[extraction.ocr]` subtable with `tsv` (default `true`) and `confidence_threshold` (default `70`)
- Toggle in extraction overlay: press `t` to switch spatial layout on/off and rerun LLM extraction
- Env vars: `MICASA_EXTRACTION_OCR_TSV`, `MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD`

closes #699

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 84a3f44 commit 877da11

18 files changed

Lines changed: 804 additions & 143 deletions

AGENTS.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,10 @@ details; do not duplicate that detail here.
205205

206206
### Git and CI
207207

208+
- **Reply to PR review comments**: After addressing a PR review comment,
209+
reply to the comment on GitHub (via `gh api .../replies`) explaining
210+
how it was addressed (commit hash, what changed, tests added). Do this
211+
for every comment, not just some.
208212
- **Never use `git commit --no-verify`**: No exceptions. Fix every hook
209213
failure before committing.
210214
- **Treat all linter/compiler warnings as bugs**: Fix all warnings from

cmd/micasa/main.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -163,7 +163,6 @@ func (cmd *runCmd) Run() error {
163163
cfg.Extraction.MaxPages,
164164
0, // pdftotext uses its own internal default timeout (30s)
165165
cfg.Extraction.IsOCREnabled(),
166-
cfg.Extraction.OCR.ConfidenceThreshold,
167166
)
168167
opts.SetExtraction(
169168
exCfg.Provider,
@@ -175,6 +174,8 @@ func (cmd *runCmd) Run() error {
175174
extractors,
176175
cfg.Extraction.IsEnabled(),
177176
cfg.Extraction.LLMTimeoutDuration(),
177+
cfg.Extraction.IsOCRTSV(),
178+
cfg.Extraction.OCRConfThreshold(),
178179
)
179180

180181
model, err := app.NewModel(store, opts)

internal/app/extraction.go

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -517,12 +517,14 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
517517
ex.llmCancelFn = cancel
518518
}
519519
messages := extract.BuildExtractionPrompt(extract.ExtractionPromptInput{
520-
DocID: ex.DocID,
521-
Filename: ex.Filename,
522-
MIME: ex.mime,
523-
SizeBytes: int64(len(ex.fileData)),
524-
Schema: schemaCtx,
525-
Sources: ex.sources,
520+
DocID: ex.DocID,
521+
Filename: ex.Filename,
522+
MIME: ex.mime,
523+
SizeBytes: int64(len(ex.fileData)),
524+
Schema: schemaCtx,
525+
Sources: ex.sources,
526+
SendTSV: m.ex.ocrTSV,
527+
ConfThreshold: m.ex.ocrConfThreshold,
526528
})
527529
ch, err := client.ChatStream(
528530
llmCtx,
@@ -926,6 +928,18 @@ func (m *Model) commitShadowOperations(ex *extractionLogState, ops []extract.Ope
926928
return nil
927929
}
928930

931+
// toggleExtractionTSV flips the ocrTSV setting and reruns the LLM step
932+
// so the user can compare extraction quality with and without spatial layout.
933+
func (m *Model) toggleExtractionTSV() tea.Cmd {
934+
m.ex.ocrTSV = !m.ex.ocrTSV
935+
if m.ex.ocrTSV {
936+
m.setStatusInfo("layout on")
937+
} else {
938+
m.setStatusInfo("layout off")
939+
}
940+
return m.rerunLLMExtraction()
941+
}
942+
929943
// rerunLLMExtraction resets the LLM step and re-runs it.
930944
func (m *Model) rerunLLMExtraction() tea.Cmd {
931945
ex := m.ex.extraction
@@ -1079,6 +1093,10 @@ func (m *Model) handleExtractionPipelineKey(msg tea.KeyMsg) tea.Cmd {
10791093
if ex.Done && ex.hasLLM && ex.cursorStep() == stepLLM {
10801094
return m.activateExtractionModelPicker()
10811095
}
1096+
case keyT:
1097+
if ex.Done && ex.hasLLM {
1098+
return m.toggleExtractionTSV()
1099+
}
10821100
case keyA:
10831101
if ex.Done {
10841102
m.acceptExtraction()
@@ -1511,6 +1529,13 @@ func (m *Model) buildExtractionPipelineOverlay(
15111529
hints = append(hints, m.helpItem(keyX, "explore"))
15121530
}
15131531
if ex.Done {
1532+
if ex.hasLLM {
1533+
label := "layout on"
1534+
if m.ex.ocrTSV {
1535+
label = "layout off"
1536+
}
1537+
hints = append(hints, m.helpItem(keyT, label))
1538+
}
15141539
hints = append(hints, m.helpItem(keyA, "accept"), m.helpItem(keyEsc, "discard"))
15151540
} else {
15161541
hints = append(hints,

internal/app/extraction_test.go

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2842,3 +2842,103 @@ func TestAccept_DeferredDoc_WorksWithoutLLMStep(t *testing.T) {
28422842
require.NoError(t, err)
28432843
assert.Equal(t, "better ocr text", full.ExtractedText)
28442844
}
2845+
2846+
// --- TSV toggle ---
2847+
2848+
func TestExtractionTSVToggle_TogglesOCRTSV(t *testing.T) {
2849+
t.Parallel()
2850+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2851+
stepText: stepDone,
2852+
stepExtract: stepDone,
2853+
stepLLM: stepDone,
2854+
})
2855+
ex := m.ex.extraction
2856+
ex.Done = true
2857+
2858+
assert.False(t, m.ex.ocrTSV, "ocrTSV should start false in test setup")
2859+
2860+
// Press t to toggle layout on.
2861+
sendExtractionKey(m, keyT)
2862+
assert.True(t, m.ex.ocrTSV, "t should toggle ocrTSV on")
2863+
2864+
// LLM step should be reset for rerun.
2865+
assert.Equal(t, stepRunning, ex.Steps[stepLLM].Status,
2866+
"LLM step should be rerunning after toggle")
2867+
}
2868+
2869+
func TestExtractionTSVToggle_TogglesOff(t *testing.T) {
2870+
t.Parallel()
2871+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2872+
stepText: stepDone,
2873+
stepExtract: stepDone,
2874+
stepLLM: stepDone,
2875+
})
2876+
ex := m.ex.extraction
2877+
ex.Done = true
2878+
m.ex.ocrTSV = true
2879+
2880+
sendExtractionKey(m, keyT)
2881+
assert.False(t, m.ex.ocrTSV, "t should toggle ocrTSV off")
2882+
}
2883+
2884+
func TestExtractionTSVToggle_IgnoredWhenNotDone(t *testing.T) {
2885+
t.Parallel()
2886+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2887+
stepText: stepDone,
2888+
stepExtract: stepRunning,
2889+
stepLLM: stepPending,
2890+
})
2891+
2892+
sendExtractionKey(m, keyT)
2893+
assert.False(t, m.ex.ocrTSV, "t should be ignored when extraction is not done")
2894+
}
2895+
2896+
func TestExtractionTSVToggle_IgnoredWithoutLLM(t *testing.T) {
2897+
t.Parallel()
2898+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2899+
stepText: stepDone,
2900+
stepExtract: stepDone,
2901+
})
2902+
ex := m.ex.extraction
2903+
ex.Done = true
2904+
2905+
sendExtractionKey(m, keyT)
2906+
assert.False(t, m.ex.ocrTSV, "t should be ignored when no LLM step")
2907+
}
2908+
2909+
func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
2910+
t.Parallel()
2911+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2912+
stepText: stepDone,
2913+
stepExtract: stepDone,
2914+
stepLLM: stepDone,
2915+
})
2916+
ex := m.ex.extraction
2917+
ex.Done = true
2918+
2919+
sendExtractionKey(m, keyT)
2920+
assert.Contains(t, m.status.Text, "layout on")
2921+
2922+
// Simulate LLM completing again so we can toggle off.
2923+
ex.Done = true
2924+
ex.Steps[stepLLM] = extractionStepInfo{Status: stepDone}
2925+
2926+
sendExtractionKey(m, keyT)
2927+
assert.Contains(t, m.status.Text, "layout off")
2928+
}
2929+
2930+
func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
2931+
t.Parallel()
2932+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2933+
stepText: stepDone,
2934+
stepExtract: stepDone,
2935+
stepLLM: stepDone,
2936+
})
2937+
ex := m.ex.extraction
2938+
ex.Done = true
2939+
m.width = 120
2940+
m.height = 40
2941+
2942+
view := m.View()
2943+
assert.Contains(t, view, "layout", "footer should show layout hint when done with LLM")
2944+
}

internal/app/model.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -273,6 +273,8 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
273273
extractionTimeout: options.ExtractionConfig.Timeout,
274274
extractionThinking: options.ExtractionConfig.Thinking,
275275
extractionEnabled: options.ExtractionConfig.Enabled,
276+
ocrTSV: options.ExtractionConfig.OCRTSV,
277+
ocrConfThreshold: options.ExtractionConfig.OCRConfThreshold,
276278
extractors: options.ExtractionConfig.Extractors,
277279
llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
278280
},

internal/app/types.go

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,8 @@ type extractState struct {
9494
extractionTimeout time.Duration
9595
extractionThinking string
9696
extractionEnabled bool
97+
ocrTSV bool
98+
ocrConfThreshold int
9799
extractionClient *llm.Client
98100
extractors []extract.Extractor
99101
extractionReady bool
@@ -292,8 +294,10 @@ type extractionConfig struct {
292294
Thinking string // reasoning effort level
293295
LLMInferenceTimeout time.Duration
294296

295-
Extractors []extract.Extractor // configured extractors; nil = defaults
296-
Enabled bool // LLM extraction enabled
297+
Extractors []extract.Extractor // configured extractors; nil = defaults
298+
Enabled bool // LLM extraction enabled
299+
OCRTSV bool // send spatial layout annotations to LLM
300+
OCRConfThreshold int // confidence threshold for spatial annotations
297301
}
298302

299303
// SetExtraction configures the extraction pipeline on the Options.
@@ -304,6 +308,8 @@ func (o *Options) SetExtraction(
304308
extractors []extract.Extractor,
305309
enabled bool,
306310
llmInferenceTimeout time.Duration,
311+
ocrTSV bool,
312+
ocrConfThreshold int,
307313
) {
308314
o.ExtractionConfig = extractionConfig{
309315
Provider: provider,
@@ -315,6 +321,8 @@ func (o *Options) SetExtraction(
315321
LLMInferenceTimeout: llmInferenceTimeout,
316322
Extractors: extractors,
317323
Enabled: enabled,
324+
OCRTSV: ocrTSV,
325+
OCRConfThreshold: ocrConfThreshold,
318326
}
319327
}
320328

internal/config/config.go

Lines changed: 41 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -292,10 +292,17 @@ type OCR struct {
292292
// When disabled, scanned pages and images produce no text. Default: true.
293293
Enable *bool `toml:"enable,omitempty"`
294294

295-
// ConfidenceThreshold is the minimum tesseract word confidence (0-100)
296-
// to keep in OCR output. Words below this threshold are dropped.
297-
// 0 means no filtering (all words kept). Default: 0.
298-
ConfidenceThreshold int `toml:"confidence_threshold"`
295+
// TSV sends spatial layout annotations (line-level bounding boxes
296+
// and confidence scores) from tesseract OCR to the LLM alongside text.
297+
// This helps extraction accuracy for invoices and forms with tabular
298+
// data, at ~2x token overhead. Default: true.
299+
TSV *bool `toml:"tsv,omitempty"`
300+
301+
// ConfidenceThresholdVal is the confidence threshold (0-100) below
302+
// which OCR confidence annotations are included in spatial layout
303+
// output. Lines with min confidence >= this value omit the score to
304+
// save tokens. Set to 0 to never show confidence. Default: 70.
305+
ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
299306
}
300307

301308
// IsEnabled returns whether LLM extraction is enabled. Defaults to true
@@ -335,6 +342,24 @@ func (e Extraction) ThinkingLevel() string {
335342
return e.Thinking
336343
}
337344

345+
// IsOCRTSV returns whether spatial layout annotations from tesseract OCR
346+
// should be sent to the LLM alongside text. Defaults to true.
347+
func (e Extraction) IsOCRTSV() bool {
348+
if e.OCR.TSV != nil {
349+
return *e.OCR.TSV
350+
}
351+
return true
352+
}
353+
354+
// OCRConfThreshold returns the confidence threshold below which OCR
355+
// confidence annotations appear in spatial output. Defaults to 70.
356+
func (e Extraction) OCRConfThreshold() int {
357+
if e.OCR.ConfidenceThresholdVal != nil {
358+
return *e.OCR.ConfidenceThresholdVal
359+
}
360+
return 70
361+
}
362+
338363
// ResolvedModel returns the extraction model, falling back to the given
339364
// chat model if no extraction-specific model is configured.
340365
func (e Extraction) ResolvedModel(chatModel string) string {
@@ -530,10 +555,9 @@ func LoadFromPath(path string) (Config, error) {
530555
)
531556
}
532557

533-
if cfg.Extraction.OCR.ConfidenceThreshold < 0 || cfg.Extraction.OCR.ConfidenceThreshold > 100 {
558+
if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
534559
return cfg, fmt.Errorf(
535-
"extraction.ocr.confidence_threshold must be 0-100, got %d",
536-
cfg.Extraction.OCR.ConfidenceThreshold,
560+
"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
537561
)
538562
}
539563

@@ -1136,14 +1160,20 @@ model = "` + DefaultModel + `"
11361160
# Maximum pages for async extraction of scanned documents. 0 = no limit. Default: 0.
11371161
# max_pages = 0
11381162
1139-
# [extraction.ocr]
1163+
[extraction.ocr]
11401164
# Set to false to disable OCR on uploaded documents. When disabled, scanned
11411165
# pages and images produce no text. Default: true.
11421166
# enable = true
11431167
1144-
# Minimum tesseract word confidence (0-100) to keep. Words below this
1145-
# threshold are dropped. 0 = no filtering. Default: 0.
1146-
# confidence_threshold = 0
1168+
# Send spatial layout annotations (line-level bounding boxes) from tesseract
1169+
# OCR to the LLM alongside text. Improves extraction accuracy for invoices
1170+
# and forms with tabular data, at ~2x token overhead. Default: true.
1171+
# tsv = true
1172+
1173+
# Confidence threshold (0-100) for spatial annotations. Lines with OCR
1174+
# confidence below this threshold include a confidence score; lines above
1175+
# omit it to save tokens. Set to 0 to never show confidence. Default: 70.
1176+
# confidence_threshold = 70
11471177
11481178
[locale]
11491179
# ISO 4217 currency code. Stored in the database on first run; after that the

0 commit comments

Comments
 (0)