Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ details; do not duplicate that detail here.

### Git and CI

- **Reply to PR review comments**: After addressing a PR review comment,
reply to the comment on GitHub (via `gh api .../replies`) explaining
how it was addressed (commit hash, what changed, tests added). Do this
for every comment, not just some.
- **Never use `git commit --no-verify`**: No exceptions. Fix every hook
failure before committing.
- **Treat all linter/compiler warnings as bugs**: Fix all warnings from
Expand Down
3 changes: 2 additions & 1 deletion cmd/micasa/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ func (cmd *runCmd) Run() error {
cfg.Extraction.MaxPages,
0, // pdftotext uses its own internal default timeout (30s)
cfg.Extraction.IsOCREnabled(),
cfg.Extraction.OCR.ConfidenceThreshold,
)
opts.SetExtraction(
exCfg.Provider,
Expand All @@ -175,6 +174,8 @@ func (cmd *runCmd) Run() error {
extractors,
cfg.Extraction.IsEnabled(),
cfg.Extraction.LLMTimeoutDuration(),
cfg.Extraction.IsOCRTSV(),
cfg.Extraction.OCRConfThreshold(),
)

model, err := app.NewModel(store, opts)
Expand Down
37 changes: 31 additions & 6 deletions internal/app/extraction.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,12 +517,14 @@ func (m *Model) llmExtractCmd(ctx context.Context, ex *extractionLogState) tea.C
ex.llmCancelFn = cancel
}
messages := extract.BuildExtractionPrompt(extract.ExtractionPromptInput{
DocID: ex.DocID,
Filename: ex.Filename,
MIME: ex.mime,
SizeBytes: int64(len(ex.fileData)),
Schema: schemaCtx,
Sources: ex.sources,
DocID: ex.DocID,
Filename: ex.Filename,
MIME: ex.mime,
SizeBytes: int64(len(ex.fileData)),
Schema: schemaCtx,
Sources: ex.sources,
SendTSV: m.ex.ocrTSV,
ConfThreshold: m.ex.ocrConfThreshold,
})
ch, err := client.ChatStream(
llmCtx,
Expand Down Expand Up @@ -926,6 +928,18 @@ func (m *Model) commitShadowOperations(ex *extractionLogState, ops []extract.Ope
return nil
}

// toggleExtractionTSV inverts the ocrTSV flag, announces the new state through
// setStatusInfo ("layout on" or "layout off"), and returns the command from
// rerunLLMExtraction so the user can compare extraction quality with and
// without spatial layout.
func (m *Model) toggleExtractionTSV() tea.Cmd {
	enabled := !m.ex.ocrTSV
	m.ex.ocrTSV = enabled

	// The status text describes the state that is now in effect.
	status := "layout off"
	if enabled {
		status = "layout on"
	}
	m.setStatusInfo(status)

	return m.rerunLLMExtraction()
}

// rerunLLMExtraction resets the LLM step and re-runs it.
func (m *Model) rerunLLMExtraction() tea.Cmd {
ex := m.ex.extraction
Expand Down Expand Up @@ -1079,6 +1093,10 @@ func (m *Model) handleExtractionPipelineKey(msg tea.KeyMsg) tea.Cmd {
if ex.Done && ex.hasLLM && ex.cursorStep() == stepLLM {
return m.activateExtractionModelPicker()
}
case keyT:
if ex.Done && ex.hasLLM {
return m.toggleExtractionTSV()
}
case keyA:
if ex.Done {
m.acceptExtraction()
Expand Down Expand Up @@ -1511,6 +1529,13 @@ func (m *Model) buildExtractionPipelineOverlay(
hints = append(hints, m.helpItem(keyX, "explore"))
}
if ex.Done {
if ex.hasLLM {
label := "layout on"
if m.ex.ocrTSV {
label = "layout off"
}
hints = append(hints, m.helpItem(keyT, label))
}
hints = append(hints, m.helpItem(keyA, "accept"), m.helpItem(keyEsc, "discard"))
} else {
hints = append(hints,
Expand Down
100 changes: 100 additions & 0 deletions internal/app/extraction_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2842,3 +2842,103 @@ func TestAccept_DeferredDoc_WorksWithoutLLMStep(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, "better ocr text", full.ExtractedText)
}

// --- TSV toggle ---

func TestExtractionTSVToggle_TogglesOCRTSV(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true

assert.False(t, m.ex.ocrTSV, "ocrTSV should start false in test setup")

// Press t to toggle layout on.
sendExtractionKey(m, keyT)
assert.True(t, m.ex.ocrTSV, "t should toggle ocrTSV on")

// LLM step should be reset for rerun.
assert.Equal(t, stepRunning, ex.Steps[stepLLM].Status,
"LLM step should be rerunning after toggle")
}

func TestExtractionTSVToggle_TogglesOff(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true
m.ex.ocrTSV = true

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should toggle ocrTSV off")
}

func TestExtractionTSVToggle_IgnoredWhenNotDone(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepRunning,
stepLLM: stepPending,
})

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should be ignored when extraction is not done")
}

func TestExtractionTSVToggle_IgnoredWithoutLLM(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
})
ex := m.ex.extraction
ex.Done = true

sendExtractionKey(m, keyT)
assert.False(t, m.ex.ocrTSV, "t should be ignored when no LLM step")
}

func TestExtractionTSVToggle_StatusMessage(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true

sendExtractionKey(m, keyT)
assert.Contains(t, m.status.Text, "layout on")

// Simulate LLM completing again so we can toggle off.
ex.Done = true
ex.Steps[stepLLM] = extractionStepInfo{Status: stepDone}

sendExtractionKey(m, keyT)
assert.Contains(t, m.status.Text, "layout off")
}

func TestExtractionTSVToggle_HintShownInFooter(t *testing.T) {
t.Parallel()
m := newExtractionModel(t, map[extractionStep]stepStatus{
stepText: stepDone,
stepExtract: stepDone,
stepLLM: stepDone,
})
ex := m.ex.extraction
ex.Done = true
m.width = 120
m.height = 40

view := m.View()
assert.Contains(t, view, "layout", "footer should show layout hint when done with LLM")
}
2 changes: 2 additions & 0 deletions internal/app/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@ func NewModel(store *data.Store, options Options) (*Model, error) {
extractionTimeout: options.ExtractionConfig.Timeout,
extractionThinking: options.ExtractionConfig.Thinking,
extractionEnabled: options.ExtractionConfig.Enabled,
ocrTSV: options.ExtractionConfig.OCRTSV,
ocrConfThreshold: options.ExtractionConfig.OCRConfThreshold,
extractors: options.ExtractionConfig.Extractors,
llmInferenceTimeout: options.ExtractionConfig.LLMInferenceTimeout,
},
Expand Down
12 changes: 10 additions & 2 deletions internal/app/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ type extractState struct {
extractionTimeout time.Duration
extractionThinking string
extractionEnabled bool
ocrTSV bool
ocrConfThreshold int
extractionClient *llm.Client
extractors []extract.Extractor
extractionReady bool
Expand Down Expand Up @@ -292,8 +294,10 @@ type extractionConfig struct {
Thinking string // reasoning effort level
LLMInferenceTimeout time.Duration

Extractors []extract.Extractor // configured extractors; nil = defaults
Enabled bool // LLM extraction enabled
Extractors []extract.Extractor // configured extractors; nil = defaults
Enabled bool // LLM extraction enabled
OCRTSV bool // send spatial layout annotations to LLM
OCRConfThreshold int // confidence threshold for spatial annotations
}

// SetExtraction configures the extraction pipeline on the Options.
Expand All @@ -304,6 +308,8 @@ func (o *Options) SetExtraction(
extractors []extract.Extractor,
enabled bool,
llmInferenceTimeout time.Duration,
ocrTSV bool,
ocrConfThreshold int,
) {
o.ExtractionConfig = extractionConfig{
Provider: provider,
Expand All @@ -315,6 +321,8 @@ func (o *Options) SetExtraction(
LLMInferenceTimeout: llmInferenceTimeout,
Extractors: extractors,
Enabled: enabled,
OCRTSV: ocrTSV,
OCRConfThreshold: ocrConfThreshold,
}
}

Expand Down
49 changes: 42 additions & 7 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,17 @@ type OCR struct {
// When disabled, scanned pages and images produce no text. Default: true.
Enable *bool `toml:"enable,omitempty"`

// ConfidenceThreshold is the minimum tesseract word confidence (0-100)
// to keep in OCR output. Words below this threshold are dropped.
// 0 means no filtering (all words kept). Default: 0.
ConfidenceThreshold int `toml:"confidence_threshold"`
// TSV sends spatial layout annotations (line-level bounding boxes
// and confidence scores) from tesseract OCR to the LLM alongside text.
// This helps extraction accuracy for invoices and forms with tabular
// data, at ~2x token overhead. Default: true.
TSV *bool `toml:"tsv,omitempty"`

// ConfidenceThresholdVal is the confidence threshold (0-100) below
// which OCR confidence annotations are included in spatial layout
// output. Lines with min confidence >= this value omit the score to
// save tokens. Set to 0 to never show confidence. Default: 70.
ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
Comment thread
cpcloud marked this conversation as resolved.
}

// IsEnabled returns whether LLM extraction is enabled. Defaults to true
Expand Down Expand Up @@ -335,6 +342,24 @@ func (e Extraction) ThinkingLevel() string {
return e.Thinking
}

// IsOCRTSV reports whether spatial layout annotations from tesseract OCR
// should be sent to the LLM alongside text. An explicit OCR.TSV setting wins;
// when it is unset the result is true.
func (e Extraction) IsOCRTSV() bool {
	enabled := true
	if override := e.OCR.TSV; override != nil {
		enabled = *override
	}
	return enabled
}

// OCRConfThreshold returns the confidence threshold below which OCR
// confidence annotations appear in spatial output. An explicit
// OCR.ConfidenceThresholdVal is returned as-is (no range check happens here);
// when it is unset the result is 70.
func (e Extraction) OCRConfThreshold() int {
	const defaultThreshold = 70

	configured := e.OCR.ConfidenceThresholdVal
	if configured == nil {
		return defaultThreshold
	}
	return *configured
}

// ResolvedModel returns the extraction model, falling back to the given
// chat model if no extraction-specific model is configured.
func (e Extraction) ResolvedModel(chatModel string) string {
Expand Down Expand Up @@ -530,10 +555,9 @@ func LoadFromPath(path string) (Config, error) {
)
}

if cfg.Extraction.OCR.ConfidenceThreshold < 0 || cfg.Extraction.OCR.ConfidenceThreshold > 100 {
if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
return cfg, fmt.Errorf(
"extraction.ocr.confidence_threshold must be 0-100, got %d",
cfg.Extraction.OCR.ConfidenceThreshold,
"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
)
}

Expand Down Expand Up @@ -1145,6 +1169,17 @@ model = "` + DefaultModel + `"
# threshold are dropped. 0 = no filtering. Default: 0.
# confidence_threshold = 0

[extraction.ocr]
# Send spatial layout annotations (line-level bounding boxes) from tesseract
# OCR to the LLM alongside text. Improves extraction accuracy for invoices
# and forms with tabular data, at ~2x token overhead. Default: true.
# tsv = true

# Confidence threshold (0-100) for spatial annotations. Lines with OCR
# confidence below this threshold include a confidence score; lines above
# omit it to save tokens. Set to 0 to never show confidence. Default: 70.
# confidence_threshold = 70
Comment thread
cpcloud marked this conversation as resolved.
Outdated

[locale]
# ISO 4217 currency code. Stored in the database on first run; after that the
# database value is authoritative. Override: MICASA_LOCALE_CURRENCY env var.
Expand Down
Loading