Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ details; do not duplicate that detail here.

### Git and CI

- **Reply to PR review comments**: After addressing a PR review comment,
reply to the comment on GitHub (via `gh api .../replies`) explaining
how it was addressed (commit hash, what changed, tests added). Do this
for every comment, not just some.
- **Never use `git commit --no-verify`**: No exceptions. Fix every hook
failure before committing.
- **Treat all linter/compiler warnings as bugs**: Fix all warnings from
Expand Down
3 changes: 2 additions & 1 deletion cmd/micasa/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ func (cmd *runCmd) Run() error {
cfg.Extraction.MaxPages,
0, // pdftotext uses its own internal default timeout (30s)
cfg.Extraction.IsOCREnabled(),
cfg.Extraction.OCR.ConfidenceThreshold,
)
opts.SetExtraction(
exCfg.Provider,
Expand All @@ -175,6 +174,8 @@ func (cmd *runCmd) Run() error {
extractors,
cfg.Extraction.IsEnabled(),
cfg.Extraction.LLMTimeoutDuration(),
cfg.Extraction.IsOCRTSV(),
cfg.Extraction.OCRConfThreshold(),
)

model, err := app.NewModel(store, opts)
Expand Down
37 changes: 31 additions & 6 deletions internal/app/extraction.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,12 +517,14 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
ex.llmCancelFn = cancel
}
messages := extract.BuildExtractionPrompt(extract.ExtractionPromptInput{
DocID: ex.DocID,
Filename: ex.Filename,
MIME: ex.mime,
SizeBytes: int64(len(ex.fileData)),
Schema: schemaCtx,
Sources: ex.sources,
DocID: ex.DocID,
Filename: ex.Filename,
MIME: ex.mime,
SizeBytes: int64(len(ex.fileData)),
Schema: schemaCtx,
Sources: ex.sources,
SendTSV: m.ex.ocrTSV,
ConfThreshold: m.ex.ocrConfThreshold,
})
ch, err := client.ChatStream(
llmCtx,
Expand Down Expand Up @@ -926,6 +928,18 @@ func (m *Model) commitShadowOperations(ex *extractionLogState, ops []extract.Ope
return nil
}

// toggleExtractionTSV inverts the ocrTSV flag, reports the new state in the
// status line ("layout on" / "layout off"), and then reruns the LLM step so
// extraction results with and without spatial layout can be compared.
func (m *Model) toggleExtractionTSV() tea.Cmd {
	enabled := !m.ex.ocrTSV
	m.ex.ocrTSV = enabled

	// Pick the status text for the state we just switched to.
	status := "layout off"
	if enabled {
		status = "layout on"
	}
	m.setStatusInfo(status)

	return m.rerunLLMExtraction()
}

// rerunLLMExtraction resets the LLM step and re-runs it.
func (m *Model) rerunLLMExtraction() tea.Cmd {
ex := m.ex.extraction
Expand Down Expand Up @@ -1079,6 +1093,10 @@ func (m *Model) handleExtractionPipelineKey(msg tea.KeyMsg) tea.Cmd {
if ex.Done && ex.hasLLM && ex.cursorStep() == stepLLM {
return m.activateExtractionModelPicker()
}
case keyT:
if ex.Done && ex.hasLLM {
return m.toggleExtractionTSV()
}
case keyA:
if ex.Done {
m.acceptExtraction()
Expand Down Expand Up @@ -1511,6 +1529,13 @@ func (m *Model) buildExtractionPipelineOverlay(
hints = append(hints, m.helpItem(keyX, "explore"))
}
if ex.Done {
if ex.hasLLM {
label := "layout on"
if m.ex.ocrTSV {
label = "layout off"
}
hints = append(hints, m.helpItem(keyT, label))
}
hints = append(hints, m.helpItem(keyA, "accept"), m.helpItem(keyEsc, "discard"))
} else {
hints = append(hints,
Expand Down
100 changes: 100 additions & 0 deletions internal/app/extraction_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2842,3 +2842,103 @@ func TestAccept_DeferredDoc_WorksWithoutLLMStep(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, "better ocr text", full.ExtractedText)
}

// --- TSV toggle ---

func TestExtractionTSVToggle_TogglesOCRTSV(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true

assert.False(t, m.ex.ocrTSV, "ocrTSV should start false in test setup")

// Press t to toggle layout on.
sendExtractionKey(m, keyT)
assert.True(t, m.ex.ocrTSV, "t should toggle ocrTSV on")

// LLM step should be reset for rerun.
assert.Equal(t, stepRunning, ex.Steps[stepLLM].Status,
"LLM step should be rerunning after toggle")
}

func TestExtractionTSVToggle_TogglesOff(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true
m.ex.ocrTSV = true

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should toggle ocrTSV off")
}

func TestExtractionTSVToggle_IgnoredWhenNotDone(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepRunning,
stepLLM: stepPending,
})

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should be ignored when extraction is not done")
}

func TestExtractionTSVToggle_IgnoredWithoutLLM(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
})
ex := m.ex.extraction
ex.Done = true

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should be ignored when no LLM step")
}

func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true

sendExtractionKey(m, keyT)
assert.Contains(t, m.status.Text, "layout on")

// Simulate LLM completing again so we can toggle off.
ex.Done = true
ex.Steps[stepLLM] = extractionStepInfo{Status: stepDone}

sendExtractionKey(m, keyT)
assert.Contains(t, m.status.Text, "layout off")
}

func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true
m.width = 120
m.height = 40

view := m.View()
assert.Contains(t, view, "layout", "footer should show layout hint when done with LLM")
}
2 changes: 2 additions & 0 deletions internal/app/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
extractionTimeout: options.ExtractionConfig.Timeout,
extractionThinking: options.ExtractionConfig.Thinking,
extractionEnabled: options.ExtractionConfig.Enabled,
ocrTSV: options.ExtractionConfig.OCRTSV,
ocrConfThreshold: options.ExtractionConfig.OCRConfThreshold,
extractors: options.ExtractionConfig.Extractors,
llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
},
Expand Down
12 changes: 10 additions & 2 deletions internal/app/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ type extractState struct {
extractionTimeout time.Duration
extractionThinking string
extractionEnabled bool
ocrTSV bool
ocrConfThreshold int
extractionClient *llm.Client
extractors []extract.Extractor
extractionReady bool
Expand Down Expand Up @@ -292,8 +294,10 @@ type extractionConfig struct {
Thinking string // reasoning effort level
LLMInferenceTimeout time.Duration

Extractors []extract.Extractor // configured extractors; nil = defaults
Enabled bool // LLM extraction enabled
Extractors []extract.Extractor // configured extractors; nil = defaults
Enabled bool // LLM extraction enabled
OCRTSV bool // send spatial layout annotations to LLM
OCRConfThreshold int // confidence threshold for spatial annotations
}

// SetExtraction configures the extraction pipeline on the Options.
Expand All @@ -304,6 +308,8 @@ func (o *Options) SetExtraction(
extractors []extract.Extractor,
enabled bool,
llmInferenceTimeout time.Duration,
ocrTSV bool,
ocrConfThreshold int,
) {
o.ExtractionConfig = extractionConfig{
Provider: provider,
Expand All @@ -315,6 +321,8 @@ func (o *Options) SetExtraction(
LLMInferenceTimeout: llmInferenceTimeout,
Extractors: extractors,
Enabled: enabled,
OCRTSV: ocrTSV,
OCRConfThreshold: ocrConfThreshold,
}
}

Expand Down
52 changes: 41 additions & 11 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,17 @@ type OCR struct {
// When disabled, scanned pages and images produce no text. Default: true.
Enable *bool `toml:"enable,omitempty"`

// ConfidenceThreshold is the minimum tesseract word confidence (0-100)
// to keep in OCR output. Words below this threshold are dropped.
// 0 means no filtering (all words kept). Default: 0.
ConfidenceThreshold int `toml:"confidence_threshold"`
// TSV sends spatial layout annotations (line-level bounding boxes
// and confidence scores) from tesseract OCR to the LLM alongside text.
// This helps extraction accuracy for invoices and forms with tabular
// data, at ~2x token overhead. Default: true.
TSV *bool `toml:"tsv,omitempty"`

// ConfidenceThresholdVal is the OCR confidence threshold (0-100) for
// spatial layout output. Lines whose min confidence is below this value
// carry a confidence annotation; lines with min confidence >= this value
// omit the score to save tokens. Set to 0 to never show confidence.
// Default: 70.
ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
Comment thread
cpcloud marked this conversation as resolved.
}

// IsEnabled returns whether LLM extraction is enabled. Defaults to true
Expand Down Expand Up @@ -335,6 +342,24 @@ func (e Extraction) ThinkingLevel() string {
return e.Thinking
}

// IsOCRTSV reports whether spatial layout annotations from tesseract OCR
// are sent to the LLM alongside the text. An unset OCR.TSV means true.
func (e Extraction) IsOCRTSV() bool {
	// nil => option not configured => default (enabled).
	return e.OCR.TSV == nil || *e.OCR.TSV
}

// OCRConfThreshold returns the threshold below which OCR confidence
// annotations appear in spatial output. When OCR.ConfidenceThresholdVal
// is unset the result is 70.
func (e Extraction) OCRConfThreshold() int {
	threshold := 70 // default when nothing is configured
	if configured := e.OCR.ConfidenceThresholdVal; configured != nil {
		threshold = *configured
	}
	return threshold
}

// ResolvedModel returns the extraction model, falling back to the given
// chat model if no extraction-specific model is configured.
func (e Extraction) ResolvedModel(chatModel string) string {
Expand Down Expand Up @@ -530,10 +555,9 @@ func LoadFromPath(path string) (Config, error) {
)
}

if cfg.Extraction.OCR.ConfidenceThreshold < 0 || cfg.Extraction.OCR.ConfidenceThreshold > 100 {
if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
return cfg, fmt.Errorf(
"extraction.ocr.confidence_threshold must be 0-100, got %d",
cfg.Extraction.OCR.ConfidenceThreshold,
"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
)
}

Expand Down Expand Up @@ -1136,14 +1160,20 @@ model = "` + DefaultModel + `"
# Maximum pages for async extraction of scanned documents. 0 = no limit. Default: 0.
# max_pages = 0

# [extraction.ocr]
[extraction.ocr]
# Set to false to disable OCR on uploaded documents. When disabled, scanned
# pages and images produce no text. Default: true.
# enable = true

# Minimum tesseract word confidence (0-100) to keep. Words below this
# threshold are dropped. 0 = no filtering. Default: 0.
# confidence_threshold = 0
# Send spatial layout annotations (line-level bounding boxes) from tesseract
# OCR to the LLM alongside text. Improves extraction accuracy for invoices
# and forms with tabular data, at ~2x token overhead. Default: true.
# tsv = true

# Confidence threshold (0-100) for spatial annotations. Lines with OCR
# confidence below this threshold include a confidence score; lines above
# omit it to save tokens. Set to 0 to never show confidence. Default: 70.
# confidence_threshold = 70

[locale]
# ISO 4217 currency code. Stored in the database on first run; after that the
Expand Down
Loading