Skip to content

Commit 76b3d02

Browse files
cpcloud and claude committed
feat(extract): send spatial layout annotations from OCR to LLM
Send compact line-level bounding boxes from tesseract OCR to the LLM during extraction, improving accuracy for invoices, forms, and tabular documents. The format is [left,top,width] per line (~2x token overhead vs plain text), with confidence scores shown only for suspect lines (below a configurable threshold, default 70).

- Add SpatialTextFromTSV() that converts raw TSV to compact spatial format
- Drop height from bounding boxes (nearly constant, no signal)
- Threshold-based confidence: only annotate lines with minConf < threshold
- New config: ocr_tsv (default true), ocr_conf_threshold (default 70)
- Toggle in extraction overlay: press 't' to switch layout on/off on rerun
- Thread config through pipeline, prompt builder, and app plumbing

closes #699

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3bc8003 commit 76b3d02

12 files changed

Lines changed: 762 additions & 46 deletions

File tree

cmd/micasa/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ func (cmd *runCmd) Run() error {
173173
extractors,
174174
cfg.Extraction.IsEnabled(),
175175
cfg.Extraction.LLMTimeoutDuration(),
176+
cfg.Extraction.IsOCRTSV(),
177+
cfg.Extraction.OCRConfThreshold(),
176178
)
177179

178180
model, err := app.NewModel(store, opts)

internal/app/extraction.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -514,12 +514,14 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
514514
ex.llmCancelFn = cancel
515515
}
516516
messages := extract.BuildExtractionPrompt(extract.ExtractionPromptInput{
517-
DocID: ex.DocID,
518-
Filename: ex.Filename,
519-
MIME: ex.mime,
520-
SizeBytes: int64(len(ex.fileData)),
521-
Schema: schemaCtx,
522-
Sources: ex.sources,
517+
DocID: ex.DocID,
518+
Filename: ex.Filename,
519+
MIME: ex.mime,
520+
SizeBytes: int64(len(ex.fileData)),
521+
Schema: schemaCtx,
522+
Sources: ex.sources,
523+
SendTSV: m.ex.ocrTSV,
524+
ConfThreshold: m.ex.ocrConfThreshold,
523525
})
524526
ch, err := client.ChatStream(
525527
llmCtx,
@@ -923,6 +925,18 @@ func (m *Model) commitShadowOperations(ex *extractionLogState, ops []extract.Ope
923925
return nil
924926
}
925927

928+
// toggleExtractionTSV flips the ocrTSV setting and reruns the LLM step
929+
// so the user can compare extraction quality with and without spatial layout.
// The status line reports the state after the flip ("layout on" or
// "layout off"), and the returned command is whatever rerunLLMExtraction
// yields. This method does no gating of its own; the key handler shown in
// this diff only calls it when ex.Done && ex.hasLLM.
930+
func (m *Model) toggleExtractionTSV() tea.Cmd {
931+
m.ex.ocrTSV = !m.ex.ocrTSV
932+
// Report the new value, i.e. the mode the upcoming rerun will use.
if m.ex.ocrTSV {
933+
m.setStatusInfo("layout on")
934+
} else {
935+
m.setStatusInfo("layout off")
936+
}
937+
return m.rerunLLMExtraction()
938+
}
939+
926940
// rerunLLMExtraction resets the LLM step and re-runs it.
927941
func (m *Model) rerunLLMExtraction() tea.Cmd {
928942
ex := m.ex.extraction
@@ -1074,6 +1088,10 @@ func (m *Model) handleExtractionPipelineKey(msg tea.KeyMsg) tea.Cmd {
10741088
if ex.Done && ex.hasLLM && ex.cursorStep() == stepLLM {
10751089
return m.activateExtractionModelPicker()
10761090
}
1091+
case keyT:
1092+
if ex.Done && ex.hasLLM {
1093+
return m.toggleExtractionTSV()
1094+
}
10771095
case keyA:
10781096
if ex.Done {
10791097
m.acceptExtraction()
@@ -1506,6 +1524,13 @@ func (m *Model) buildExtractionPipelineOverlay(
15061524
hints = append(hints, m.helpItem(keyX, "explore"))
15071525
}
15081526
if ex.Done {
1527+
if ex.hasLLM {
1528+
label := "layout on"
1529+
if m.ex.ocrTSV {
1530+
label = "layout off"
1531+
}
1532+
hints = append(hints, m.helpItem(keyT, label))
1533+
}
15091534
hints = append(hints, m.helpItem(keyA, "accept"), m.helpItem(keyEsc, "discard"))
15101535
} else {
15111536
hints = append(hints,

internal/app/extraction_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,3 +2831,103 @@ func TestAccept_DeferredDoc_WorksWithoutLLMStep(t *testing.T) {
28312831
require.NoError(t, err)
28322832
assert.Equal(t, "better ocr text", full.ExtractedText)
28332833
}
2834+
2835+
// --- TSV toggle ---
2836+
2837+
// TestExtractionTSVToggle_TogglesOCRTSV checks that pressing t on a finished
// extraction that has an LLM step turns ocrTSV on and puts the LLM step back
// into stepRunning.
func TestExtractionTSVToggle_TogglesOCRTSV(t *testing.T) {
2838+
t.Parallel()
2839+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2840+
stepText: stepDone,
2841+
stepExtract: stepDone,
2842+
stepLLM: stepDone,
2843+
})
2844+
ex := m.ex.extraction
2845+
ex.Done = true
2846+
2847+
assert.False(t, m.ex.ocrTSV, "ocrTSV should start false in test setup")
2848+
2849+
// Press t to toggle layout on.
2850+
sendExtractionKey(m, keyT)
2851+
assert.True(t, m.ex.ocrTSV, "t should toggle ocrTSV on")
2852+
2853+
// LLM step should be reset for rerun.
2854+
assert.Equal(t, stepRunning, ex.Steps[stepLLM].Status,
2855+
"LLM step should be rerunning after toggle")
2856+
}
2857+
2858+
// TestExtractionTSVToggle_TogglesOff checks the opposite direction: with
// ocrTSV preset to true on a finished extraction, pressing t turns it off.
func TestExtractionTSVToggle_TogglesOff(t *testing.T) {
2859+
t.Parallel()
2860+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2861+
stepText: stepDone,
2862+
stepExtract: stepDone,
2863+
stepLLM: stepDone,
2864+
})
2865+
ex := m.ex.extraction
2866+
ex.Done = true
2867+
m.ex.ocrTSV = true
2868+
2869+
sendExtractionKey(m, keyT)
2870+
assert.False(t, m.ex.ocrTSV, "t should toggle ocrTSV off")
2871+
}
2872+
2873+
// TestExtractionTSVToggle_IgnoredWhenNotDone checks that t leaves ocrTSV
// untouched while the pipeline is still in progress (extract step running,
// LLM step pending). ex.Done is never set in this test.
// NOTE(review): relies on newExtractionModel leaving Done false — confirm.
func TestExtractionTSVToggle_IgnoredWhenNotDone(t *testing.T) {
2874+
t.Parallel()
2875+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2876+
stepText: stepDone,
2877+
stepExtract: stepRunning,
2878+
stepLLM: stepPending,
2879+
})
2880+
2881+
sendExtractionKey(m, keyT)
2882+
assert.False(t, m.ex.ocrTSV, "t should be ignored when extraction is not done")
2883+
}
2884+
2885+
// TestExtractionTSVToggle_IgnoredWithoutLLM checks that t leaves ocrTSV
// untouched on a finished extraction whose step map has no stepLLM entry.
// NOTE(review): presumably newExtractionModel derives hasLLM from the
// presence of stepLLM in the map — confirm against the helper.
func TestExtractionTSVToggle_IgnoredWithoutLLM(t *testing.T) {
2886+
t.Parallel()
2887+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2888+
stepText: stepDone,
2889+
stepExtract: stepDone,
2890+
})
2891+
ex := m.ex.extraction
2892+
ex.Done = true
2893+
2894+
sendExtractionKey(m, keyT)
2895+
assert.False(t, m.ex.ocrTSV, "t should be ignored when no LLM step")
2896+
}
2897+
2898+
// TestExtractionTSVToggle_StatusMessage checks the status text after each
// toggle: "layout on" after the first press of t, and "layout off" after the
// second press once the extraction has been marked done again by hand.
func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
2899+
t.Parallel()
2900+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2901+
stepText: stepDone,
2902+
stepExtract: stepDone,
2903+
stepLLM: stepDone,
2904+
})
2905+
ex := m.ex.extraction
2906+
ex.Done = true
2907+
2908+
sendExtractionKey(m, keyT)
2909+
assert.Contains(t, m.status.Text, "layout on")
2910+
2911+
// Simulate LLM completing again so we can toggle off.
2912+
ex.Done = true
2913+
ex.Steps[stepLLM] = extractionStepInfo{Status: stepDone}
2914+
2915+
sendExtractionKey(m, keyT)
2916+
assert.Contains(t, m.status.Text, "layout off")
2917+
}
2918+
2919+
// TestExtractionTSVToggle_HintShownInFooter renders the view for a finished
// extraction with an LLM step and checks that a layout hint is present.
// The assertion matches only the substring "layout", so it does not
// distinguish between the "layout on" and "layout off" labels.
func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
2920+
t.Parallel()
2921+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2922+
stepText: stepDone,
2923+
stepExtract: stepDone,
2924+
stepLLM: stepDone,
2925+
})
2926+
ex := m.ex.extraction
2927+
ex.Done = true
2928+
m.width = 120
2929+
m.height = 40
2930+
2931+
view := m.View()
2932+
assert.Contains(t, view, "layout", "footer should show layout hint when done with LLM")
2933+
}

internal/app/model.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,8 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
273273
extractionTimeout: options.ExtractionConfig.Timeout,
274274
extractionThinking: options.ExtractionConfig.Thinking,
275275
extractionEnabled: options.ExtractionConfig.Enabled,
276+
ocrTSV: options.ExtractionConfig.OCRTSV,
277+
ocrConfThreshold: options.ExtractionConfig.OCRConfThreshold,
276278
extractors: options.ExtractionConfig.Extractors,
277279
llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
278280
},

internal/app/types.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ type extractState struct {
9494
extractionTimeout time.Duration
9595
extractionThinking string
9696
extractionEnabled bool
97+
ocrTSV bool
98+
ocrConfThreshold int
9799
extractionClient *llm.Client
98100
extractors []extract.Extractor
99101
extractionReady bool
@@ -292,8 +294,10 @@ type extractionConfig struct {
292294
Thinking string // reasoning effort level
293295
LLMInferenceTimeout time.Duration
294296

295-
Extractors []extract.Extractor // configured extractors; nil = defaults
296-
Enabled bool // LLM extraction enabled
297+
Extractors []extract.Extractor // configured extractors; nil = defaults
298+
Enabled bool // LLM extraction enabled
299+
OCRTSV bool // send spatial layout annotations to LLM
300+
OCRConfThreshold int // confidence threshold for spatial annotations
297301
}
298302

299303
// SetExtraction configures the extraction pipeline on the Options.
@@ -304,6 +308,8 @@ func (o *Options) SetExtraction(
304308
extractors []extract.Extractor,
305309
enabled bool,
306310
llmInferenceTimeout time.Duration,
311+
ocrTSV bool,
312+
ocrConfThreshold int,
307313
) {
308314
o.ExtractionConfig = extractionConfig{
309315
Provider: provider,
@@ -315,6 +321,8 @@ func (o *Options) SetExtraction(
315321
LLMInferenceTimeout: llmInferenceTimeout,
316322
Extractors: extractors,
317323
Enabled: enabled,
324+
OCRTSV: ocrTSV,
325+
OCRConfThreshold: ocrConfThreshold,
318326
}
319327
}
320328

internal/config/config.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,18 @@ type Extraction struct {
281281
// Supported values: none, low, medium, high, auto.
282282
// Empty string = don't send (server default). Default: empty.
283283
Thinking string `toml:"thinking,omitempty" env:"MICASA_EXTRACTION_THINKING"`
284+
285+
// OCRTSV sends spatial layout annotations (line-level bounding boxes
286+
// and confidence scores) from tesseract OCR to the LLM alongside text.
287+
// This helps extraction accuracy for invoices and forms with tabular
288+
// data, at ~2x token overhead. Default: true.
289+
OCRTSV *bool `toml:"ocr_tsv,omitempty" env:"MICASA_EXTRACTION_OCR_TSV"`
290+
291+
// OCRConfThresholdVal is the confidence threshold (0-100) below which
292+
// OCR confidence annotations are included in spatial layout output.
293+
// Lines with min confidence >= this value omit the score to save tokens.
294+
// Set to 0 to never show confidence. Default: 70.
295+
OCRConfThresholdVal *int `toml:"ocr_conf_threshold,omitempty" env:"MICASA_EXTRACTION_OCR_CONF_THRESHOLD"`
284296
}
285297

286298
// IsEnabled returns whether LLM extraction is enabled. Defaults to true
@@ -324,6 +336,24 @@ func (e Extraction) ThinkingLevel() string {
324336
return e.Thinking
325337
}
326338

339+
// IsOCRTSV returns whether spatial layout annotations from tesseract OCR
340+
// should be sent to the LLM alongside text. Defaults to true.
// The default applies only when the OCRTSV pointer is nil; otherwise the
// pointed-to value is returned, so an explicit false is honored.
341+
func (e Extraction) IsOCRTSV() bool {
342+
if e.OCRTSV != nil {
343+
return *e.OCRTSV
344+
}
345+
return true
346+
}
347+
348+
// OCRConfThreshold returns the confidence threshold below which OCR
349+
// confidence annotations appear in spatial output. Defaults to 70.
// The default applies only when OCRConfThresholdVal is nil. A configured
// value is returned unchanged; no range check or clamping is done here.
350+
func (e Extraction) OCRConfThreshold() int {
351+
if e.OCRConfThresholdVal != nil {
352+
return *e.OCRConfThresholdVal
353+
}
354+
return 70
355+
}
356+
327357
// ResolvedModel returns the extraction model, falling back to the given
328358
// chat model if no extraction-specific model is configured.
329359
func (e Extraction) ResolvedModel(chatModel string) string {
@@ -1091,6 +1121,16 @@ model = "` + DefaultModel + `"
10911121
# When disabled, no structured data is extracted from documents.
10921122
# enabled = true
10931123
1124+
# Send spatial layout annotations (line-level bounding boxes) from tesseract
1125+
# OCR to the LLM alongside text. Improves extraction accuracy for invoices
1126+
# and forms with tabular data, at ~2x token overhead. Default: true.
1127+
# ocr_tsv = true
1128+
1129+
# Confidence threshold (0-100) for spatial annotations. Lines with OCR
1130+
# confidence below this threshold include a confidence score; lines above
1131+
# omit it to save tokens. Set to 0 to never show confidence. Default: 70.
1132+
# ocr_conf_threshold = 70
1133+
10941134
[locale]
10951135
# ISO 4217 currency code. Stored in the database on first run; after that the
10961136
# database value is authoritative. Override: MICASA_CURRENCY env var.

0 commit comments

Comments
 (0)