Skip to content

Commit de2a5fa

Browse files
cpcloud and claude committed
feat(extract): send spatial layout annotations from OCR to LLM
Send compact line-level bounding boxes from tesseract OCR to the LLM during extraction, improving accuracy for invoices, forms, and tabular documents. The format is [left,top,width] per line (~2x token overhead vs plain text), with confidence scores shown only for suspect lines (below a configurable threshold, default 70).

- Add SpatialTextFromTSV() that converts raw TSV to compact spatial format
- Drop height from bounding boxes (nearly constant, no signal)
- Threshold-based confidence: only annotate lines with minConf < threshold
- New config: extraction.ocr.tsv (default true), extraction.ocr.confidence_threshold (default 70)
- Toggle in extraction overlay: press 't' to switch layout on/off on rerun
- Thread config through pipeline, prompt builder, and app plumbing

closes #699

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 82586ad commit de2a5fa

12 files changed

Lines changed: 757 additions & 34 deletions

File tree

cmd/micasa/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ func (cmd *runCmd) Run() error {
173173
extractors,
174174
cfg.Extraction.IsEnabled(),
175175
cfg.Extraction.LLMTimeoutDuration(),
176+
cfg.Extraction.IsOCRTSV(),
177+
cfg.Extraction.OCRConfThreshold(),
176178
)
177179

178180
model, err := app.NewModel(store, opts)

internal/app/extraction.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -517,12 +517,14 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
517517
ex.llmCancelFn = cancel
518518
}
519519
messages := extract.BuildExtractionPrompt(extract.ExtractionPromptInput{
520-
DocID: ex.DocID,
521-
Filename: ex.Filename,
522-
MIME: ex.mime,
523-
SizeBytes: int64(len(ex.fileData)),
524-
Schema: schemaCtx,
525-
Sources: ex.sources,
520+
DocID: ex.DocID,
521+
Filename: ex.Filename,
522+
MIME: ex.mime,
523+
SizeBytes: int64(len(ex.fileData)),
524+
Schema: schemaCtx,
525+
Sources: ex.sources,
526+
SendTSV: m.ex.ocrTSV,
527+
ConfThreshold: m.ex.ocrConfThreshold,
526528
})
527529
ch, err := client.ChatStream(
528530
llmCtx,
@@ -926,6 +928,18 @@ func (m *Model) commitShadowOperations(ex *extractionLogState, ops []extract.Ope
926928
return nil
927929
}
928930

931+
// toggleExtractionTSV flips the ocrTSV setting and reruns the LLM step
932+
// so the user can compare extraction quality with and without spatial layout.
933+
func (m *Model) toggleExtractionTSV() tea.Cmd {
934+
m.ex.ocrTSV = !m.ex.ocrTSV
935+
if m.ex.ocrTSV {
936+
m.setStatusInfo("layout on")
937+
} else {
938+
m.setStatusInfo("layout off")
939+
}
940+
return m.rerunLLMExtraction()
941+
}
942+
929943
// rerunLLMExtraction resets the LLM step and re-runs it.
930944
func (m *Model) rerunLLMExtraction() tea.Cmd {
931945
ex := m.ex.extraction
@@ -1079,6 +1093,10 @@ func (m *Model) handleExtractionPipelineKey(msg tea.KeyMsg) tea.Cmd {
10791093
if ex.Done && ex.hasLLM && ex.cursorStep() == stepLLM {
10801094
return m.activateExtractionModelPicker()
10811095
}
1096+
case keyT:
1097+
if ex.Done && ex.hasLLM {
1098+
return m.toggleExtractionTSV()
1099+
}
10821100
case keyA:
10831101
if ex.Done {
10841102
m.acceptExtraction()
@@ -1511,6 +1529,13 @@ func (m *Model) buildExtractionPipelineOverlay(
15111529
hints = append(hints, m.helpItem(keyX, "explore"))
15121530
}
15131531
if ex.Done {
1532+
if ex.hasLLM {
1533+
label := "layout on"
1534+
if m.ex.ocrTSV {
1535+
label = "layout off"
1536+
}
1537+
hints = append(hints, m.helpItem(keyT, label))
1538+
}
15141539
hints = append(hints, m.helpItem(keyA, "accept"), m.helpItem(keyEsc, "discard"))
15151540
} else {
15161541
hints = append(hints,

internal/app/extraction_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2842,3 +2842,103 @@ func TestAccept_DeferredDoc_WorksWithoutLLMStep(t *testing.T) {
28422842
require.NoError(t, err)
28432843
assert.Equal(t, "better ocr text", full.ExtractedText)
28442844
}
2845+
2846+
// --- TSV toggle ---
2847+
2848+
func TestExtractionTSVToggle_TogglesOCRTSV(t *testing.T) {
2849+
t.Parallel()
2850+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2851+
stepText: stepDone,
2852+
stepExtract: stepDone,
2853+
stepLLM: stepDone,
2854+
})
2855+
ex := m.ex.extraction
2856+
ex.Done = true
2857+
2858+
assert.False(t, m.ex.ocrTSV, "ocrTSV should start false in test setup")
2859+
2860+
// Press t to toggle layout on.
2861+
sendExtractionKey(m, keyT)
2862+
assert.True(t, m.ex.ocrTSV, "t should toggle ocrTSV on")
2863+
2864+
// LLM step should be reset for rerun.
2865+
assert.Equal(t, stepRunning, ex.Steps[stepLLM].Status,
2866+
"LLM step should be rerunning after toggle")
2867+
}
2868+
2869+
func TestExtractionTSVToggle_TogglesOff(t *testing.T) {
2870+
t.Parallel()
2871+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2872+
stepText: stepDone,
2873+
stepExtract: stepDone,
2874+
stepLLM: stepDone,
2875+
})
2876+
ex := m.ex.extraction
2877+
ex.Done = true
2878+
m.ex.ocrTSV = true
2879+
2880+
sendExtractionKey(m, keyT)
2881+
assert.False(t, m.ex.ocrTSV, "t should toggle ocrTSV off")
2882+
}
2883+
2884+
func TestExtractionTSVToggle_IgnoredWhenNotDone(t *testing.T) {
2885+
t.Parallel()
2886+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2887+
stepText: stepDone,
2888+
stepExtract: stepRunning,
2889+
stepLLM: stepPending,
2890+
})
2891+
2892+
sendExtractionKey(m, keyT)
2893+
assert.False(t, m.ex.ocrTSV, "t should be ignored when extraction is not done")
2894+
}
2895+
2896+
func TestExtractionTSVToggle_IgnoredWithoutLLM(t *testing.T) {
2897+
t.Parallel()
2898+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2899+
stepText: stepDone,
2900+
stepExtract: stepDone,
2901+
})
2902+
ex := m.ex.extraction
2903+
ex.Done = true
2904+
2905+
sendExtractionKey(m, keyT)
2906+
assert.False(t, m.ex.ocrTSV, "t should be ignored when no LLM step")
2907+
}
2908+
2909+
func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
2910+
t.Parallel()
2911+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2912+
stepText: stepDone,
2913+
stepExtract: stepDone,
2914+
stepLLM: stepDone,
2915+
})
2916+
ex := m.ex.extraction
2917+
ex.Done = true
2918+
2919+
sendExtractionKey(m, keyT)
2920+
assert.Contains(t, m.status.Text, "layout on")
2921+
2922+
// Simulate LLM completing again so we can toggle off.
2923+
ex.Done = true
2924+
ex.Steps[stepLLM] = extractionStepInfo{Status: stepDone}
2925+
2926+
sendExtractionKey(m, keyT)
2927+
assert.Contains(t, m.status.Text, "layout off")
2928+
}
2929+
2930+
func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
2931+
t.Parallel()
2932+
m := newExtractionModel(t, map[extractionStep]stepStatus{
2933+
stepText: stepDone,
2934+
stepExtract: stepDone,
2935+
stepLLM: stepDone,
2936+
})
2937+
ex := m.ex.extraction
2938+
ex.Done = true
2939+
m.width = 120
2940+
m.height = 40
2941+
2942+
view := m.View()
2943+
assert.Contains(t, view, "layout", "footer should show layout hint when done with LLM")
2944+
}

internal/app/model.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,8 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
273273
extractionTimeout: options.ExtractionConfig.Timeout,
274274
extractionThinking: options.ExtractionConfig.Thinking,
275275
extractionEnabled: options.ExtractionConfig.Enabled,
276+
ocrTSV: options.ExtractionConfig.OCRTSV,
277+
ocrConfThreshold: options.ExtractionConfig.OCRConfThreshold,
276278
extractors: options.ExtractionConfig.Extractors,
277279
llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
278280
},

internal/app/types.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ type extractState struct {
9494
extractionTimeout time.Duration
9595
extractionThinking string
9696
extractionEnabled bool
97+
ocrTSV bool
98+
ocrConfThreshold int
9799
extractionClient *llm.Client
98100
extractors []extract.Extractor
99101
extractionReady bool
@@ -292,8 +294,10 @@ type extractionConfig struct {
292294
Thinking string // reasoning effort level
293295
LLMInferenceTimeout time.Duration
294296

295-
Extractors []extract.Extractor // configured extractors; nil = defaults
296-
Enabled bool // LLM extraction enabled
297+
Extractors []extract.Extractor // configured extractors; nil = defaults
298+
Enabled bool // LLM extraction enabled
299+
OCRTSV bool // send spatial layout annotations to LLM
300+
OCRConfThreshold int // confidence threshold for spatial annotations
297301
}
298302

299303
// SetExtraction configures the extraction pipeline on the Options.
@@ -304,6 +308,8 @@ func (o *Options) SetExtraction(
304308
extractors []extract.Extractor,
305309
enabled bool,
306310
llmInferenceTimeout time.Duration,
311+
ocrTSV bool,
312+
ocrConfThreshold int,
307313
) {
308314
o.ExtractionConfig = extractionConfig{
309315
Provider: provider,
@@ -315,6 +321,8 @@ func (o *Options) SetExtraction(
315321
LLMInferenceTimeout: llmInferenceTimeout,
316322
Extractors: extractors,
317323
Enabled: enabled,
324+
OCRTSV: ocrTSV,
325+
OCRConfThreshold: ocrConfThreshold,
318326
}
319327
}
320328

internal/config/config.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,24 @@ type Extraction struct {
281281
// Supported values: none, low, medium, high, auto.
282282
// Empty string = don't send (server default). Default: empty.
283283
Thinking string `toml:"thinking,omitempty"`
284+
285+
// OCR holds settings for OCR spatial layout annotations sent to the LLM.
286+
OCR ExtractionOCR `toml:"ocr" doc:"OCR spatial layout annotation settings."`
287+
}
288+
289+
// ExtractionOCR holds OCR-specific extraction settings.
290+
type ExtractionOCR struct {
291+
// TSV sends spatial layout annotations (line-level bounding boxes
292+
// and confidence scores) from tesseract OCR to the LLM alongside text.
293+
// This helps extraction accuracy for invoices and forms with tabular
294+
// data, at ~2x token overhead. Default: true.
295+
TSV *bool `toml:"tsv,omitempty"`
296+
297+
// ConfidenceThresholdVal is the confidence threshold (0-100) below which
298+
// OCR confidence annotations are included in spatial layout output.
299+
// Lines with min confidence >= this value omit the score to save tokens.
300+
// Set to 0 to never show confidence. Default: 70.
301+
ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
284302
}
285303

286304
// IsEnabled returns whether LLM extraction is enabled. Defaults to true
@@ -324,6 +342,24 @@ func (e Extraction) ThinkingLevel() string {
324342
return e.Thinking
325343
}
326344

345+
// IsOCRTSV returns whether spatial layout annotations from tesseract OCR
346+
// should be sent to the LLM alongside text. Defaults to true.
347+
func (e Extraction) IsOCRTSV() bool {
348+
if e.OCR.TSV != nil {
349+
return *e.OCR.TSV
350+
}
351+
return true
352+
}
353+
354+
// OCRConfThreshold returns the confidence threshold below which OCR
355+
// confidence annotations appear in spatial output. Defaults to 70.
356+
func (e Extraction) OCRConfThreshold() int {
357+
if e.OCR.ConfidenceThresholdVal != nil {
358+
return *e.OCR.ConfidenceThresholdVal
359+
}
360+
return 70
361+
}
362+
327363
// ResolvedModel returns the extraction model, falling back to the given
328364
// chat model if no extraction-specific model is configured.
329365
func (e Extraction) ResolvedModel(chatModel string) string {
@@ -1133,6 +1169,17 @@ model = "` + DefaultModel + `"
11331169
# When disabled, no structured data is extracted from documents.
11341170
# enabled = true
11351171
1172+
[extraction.ocr]
1173+
# Send spatial layout annotations (line-level bounding boxes) from tesseract
1174+
# OCR to the LLM alongside text. Improves extraction accuracy for invoices
1175+
# and forms with tabular data, at ~2x token overhead. Default: true.
1176+
# tsv = true
1177+
1178+
# Confidence threshold (0-100) for spatial annotations. Lines with OCR
1179+
# confidence below this threshold include a confidence score; lines at or above
1180+
# omit it to save tokens. Set to 0 to never show confidence. Default: 70.
1181+
# confidence_threshold = 70
1182+
11361183
[locale]
11371184
# ISO 4217 currency code. Stored in the database on first run; after that the
11381185
# database value is authoritative. Override: MICASA_LOCALE_CURRENCY env var.

internal/config/config_test.go

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -629,12 +629,14 @@ func TestEnvVars(t *testing.T) {
629629
"MICASA_DOCUMENTS_CACHE_TTL_DAYS": "documents.cache_ttl_days",
630630
"MICASA_DOCUMENTS_FILE_PICKER_DIR": "documents.file_picker_dir",
631631

632-
"MICASA_EXTRACTION_MODEL": "extraction.model",
633-
"MICASA_EXTRACTION_MAX_PAGES": "extraction.max_pages",
634-
"MICASA_EXTRACTION_ENABLED": "extraction.enabled",
635-
"MICASA_EXTRACTION_TEXT_TIMEOUT": "extraction.text_timeout",
636-
"MICASA_EXTRACTION_LLM_TIMEOUT": "extraction.llm_timeout",
637-
"MICASA_EXTRACTION_THINKING": "extraction.thinking",
632+
"MICASA_EXTRACTION_MODEL": "extraction.model",
633+
"MICASA_EXTRACTION_MAX_PAGES": "extraction.max_pages",
634+
"MICASA_EXTRACTION_ENABLED": "extraction.enabled",
635+
"MICASA_EXTRACTION_TEXT_TIMEOUT": "extraction.text_timeout",
636+
"MICASA_EXTRACTION_LLM_TIMEOUT": "extraction.llm_timeout",
637+
"MICASA_EXTRACTION_THINKING": "extraction.thinking",
638+
"MICASA_EXTRACTION_OCR_TSV": "extraction.ocr.tsv",
639+
"MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD": "extraction.ocr.confidence_threshold",
638640

639641
"MICASA_LOCALE_CURRENCY": "locale.currency",
640642

@@ -1346,6 +1348,82 @@ func TestResolvedFilePickerDir_EmptyFallsBackToDownloadsOrCwd(t *testing.T) {
13461348
assert.NotEmpty(t, result)
13471349
}
13481350

1351+
// --- OCR TSV ---
1352+
1353+
func TestExtractionOCRTSVDefaultTrue(t *testing.T) {
1354+
cfg, err := LoadFromPath(noConfig(t))
1355+
require.NoError(t, err)
1356+
assert.True(t, cfg.Extraction.IsOCRTSV(),
1357+
"OCR TSV should default to true")
1358+
}
1359+
1360+
func TestExtractionOCRTSVFromTOML(t *testing.T) {
1361+
path := writeConfig(t, "[extraction.ocr]\ntsv = true\n")
1362+
cfg, err := LoadFromPath(path)
1363+
require.NoError(t, err)
1364+
assert.True(t, cfg.Extraction.IsOCRTSV())
1365+
}
1366+
1367+
func TestExtractionOCRTSVFromTOMLFalse(t *testing.T) {
1368+
path := writeConfig(t, "[extraction.ocr]\ntsv = false\n")
1369+
cfg, err := LoadFromPath(path)
1370+
require.NoError(t, err)
1371+
assert.False(t, cfg.Extraction.IsOCRTSV())
1372+
}
1373+
1374+
func TestExtractionOCRTSVFromEnv(t *testing.T) {
1375+
t.Setenv("MICASA_EXTRACTION_OCR_TSV", "true")
1376+
cfg, err := LoadFromPath(noConfig(t))
1377+
require.NoError(t, err)
1378+
assert.True(t, cfg.Extraction.IsOCRTSV())
1379+
}
1380+
1381+
func TestExtractionOCRTSVEnvInvalidReturnsError(t *testing.T) {
1382+
t.Setenv("MICASA_EXTRACTION_OCR_TSV", "maybe")
1383+
_, err := LoadFromPath(noConfig(t))
1384+
require.Error(t, err)
1385+
assert.Contains(t, err.Error(), "MICASA_EXTRACTION_OCR_TSV")
1386+
assert.Contains(t, err.Error(), "expected true or false")
1387+
}
1388+
1389+
// --- OCR confidence threshold ---
1390+
1391+
func TestExtractionOCRConfThresholdDefault70(t *testing.T) {
1392+
cfg, err := LoadFromPath(noConfig(t))
1393+
require.NoError(t, err)
1394+
assert.Equal(t, 70, cfg.Extraction.OCRConfThreshold(),
1395+
"OCR confidence threshold should default to 70")
1396+
}
1397+
1398+
func TestExtractionOCRConfThresholdFromTOML(t *testing.T) {
1399+
path := writeConfig(t, "[extraction.ocr]\nconfidence_threshold = 50\n")
1400+
cfg, err := LoadFromPath(path)
1401+
require.NoError(t, err)
1402+
assert.Equal(t, 50, cfg.Extraction.OCRConfThreshold())
1403+
}
1404+
1405+
func TestExtractionOCRConfThresholdFromTOMLZero(t *testing.T) {
1406+
path := writeConfig(t, "[extraction.ocr]\nconfidence_threshold = 0\n")
1407+
cfg, err := LoadFromPath(path)
1408+
require.NoError(t, err)
1409+
assert.Equal(t, 0, cfg.Extraction.OCRConfThreshold(),
1410+
"zero threshold should disable confidence annotations")
1411+
}
1412+
1413+
func TestExtractionOCRConfThresholdFromEnv(t *testing.T) {
1414+
t.Setenv("MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD", "80")
1415+
cfg, err := LoadFromPath(noConfig(t))
1416+
require.NoError(t, err)
1417+
assert.Equal(t, 80, cfg.Extraction.OCRConfThreshold())
1418+
}
1419+
1420+
func TestExtractionOCRConfThresholdEnvInvalidReturnsError(t *testing.T) {
1421+
t.Setenv("MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD", "high")
1422+
_, err := LoadFromPath(noConfig(t))
1423+
require.Error(t, err)
1424+
assert.Contains(t, err.Error(), "MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD")
1425+
}
1426+
13491427
func TestFilePickerDir_FromTOML(t *testing.T) {
13501428
t.Parallel()
13511429
dir := t.TempDir()

0 commit comments

Comments
 (0)