Skip to content

Commit 8ce00e4

Browse files
cpcloud and claude committed
fix(extract): remove word-dropping OCR filter and unify confidence_threshold
filterTSVByConfidence silently dropped low-confidence OCR words, losing data. Remove it entirely -- OCR data should never be discarded.

Unify confidence_threshold to serve a single purpose: controlling when confidence annotations appear in spatial layout output sent to the LLM. Default 70.

- Remove filterTSVByConfidence and all call sites
- Remove ConfidenceThreshold from PDFOCRExtractor/ImageOCRExtractor
- Remove confidenceThreshold param from DefaultExtractors
- Restore ConfidenceThresholdVal *int on OCR struct for spatial display
- Add AGENTS.md rule: reply to every PR review comment on GitHub

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 64dc8fc commit 8ce00e4

12 files changed

Lines changed: 48 additions & 187 deletions

File tree

AGENTS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@ details; do not duplicate that detail here.
203203

204204
### Git and CI
205205

206+
- **Reply to PR review comments**: After addressing a PR review comment,
207+
reply to the comment on GitHub (via `gh api .../replies`) explaining
208+
how it was addressed (commit hash, what changed, tests added). Do this
209+
for every comment, not just some.
206210
- **Never use `git commit --no-verify`**: No exceptions. Fix every hook
207211
failure before committing.
208212
- **Treat all linter/compiler warnings as bugs**: Fix all warnings from

cmd/micasa/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,6 @@ func (cmd *runCmd) Run() error {
163163
cfg.Extraction.MaxPages,
164164
0, // pdftotext uses its own internal default timeout (30s)
165165
cfg.Extraction.IsOCREnabled(),
166-
cfg.Extraction.OCR.ConfidenceThreshold,
167166
)
168167
opts.SetExtraction(
169168
exCfg.Provider,
@@ -176,7 +175,7 @@ func (cmd *runCmd) Run() error {
176175
cfg.Extraction.IsEnabled(),
177176
cfg.Extraction.LLMTimeoutDuration(),
178177
cfg.Extraction.IsOCRTSV(),
179-
cfg.Extraction.OCRSpatialConfThreshold(),
178+
cfg.Extraction.OCRConfThreshold(),
180179
)
181180

182181
model, err := app.NewModel(store, opts)

internal/config/config.go

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -292,22 +292,17 @@ type OCR struct {
292292
// When disabled, scanned pages and images produce no text. Default: true.
293293
Enable *bool `toml:"enable,omitempty"`
294294

295-
// ConfidenceThreshold is the minimum tesseract word confidence (0-100)
296-
// to keep in OCR output. Words below this threshold are dropped.
297-
// 0 means no filtering (all words kept). Default: 0.
298-
ConfidenceThreshold int `toml:"confidence_threshold"`
299-
300295
// TSV sends spatial layout annotations (line-level bounding boxes
301296
// and confidence scores) from tesseract OCR to the LLM alongside text.
302297
// This helps extraction accuracy for invoices and forms with tabular
303298
// data, at ~2x token overhead. Default: true.
304299
TSV *bool `toml:"tsv,omitempty"`
305300

306-
// SpatialConfThresholdVal is the confidence threshold (0-100) below
301+
// ConfidenceThresholdVal is the confidence threshold (0-100) below
307302
// which OCR confidence annotations are included in spatial layout
308303
// output. Lines with min confidence >= this value omit the score to
309304
// save tokens. Set to 0 to never show confidence. Default: 70.
310-
SpatialConfThresholdVal *int `toml:"spatial_conf_threshold,omitempty"`
305+
ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
311306
}
312307

313308
// IsEnabled returns whether LLM extraction is enabled. Defaults to true
@@ -356,11 +351,11 @@ func (e Extraction) IsOCRTSV() bool {
356351
return true
357352
}
358353

359-
// OCRSpatialConfThreshold returns the confidence threshold below which
360-
// OCR confidence annotations appear in spatial output. Defaults to 70.
361-
func (e Extraction) OCRSpatialConfThreshold() int {
362-
if e.OCR.SpatialConfThresholdVal != nil {
363-
return *e.OCR.SpatialConfThresholdVal
354+
// OCRConfThreshold returns the confidence threshold below which OCR
355+
// confidence annotations appear in spatial output. Defaults to 70.
356+
func (e Extraction) OCRConfThreshold() int {
357+
if e.OCR.ConfidenceThresholdVal != nil {
358+
return *e.OCR.ConfidenceThresholdVal
364359
}
365360
return 70
366361
}
@@ -560,16 +555,9 @@ func LoadFromPath(path string) (Config, error) {
560555
)
561556
}
562557

563-
if cfg.Extraction.OCR.ConfidenceThreshold < 0 || cfg.Extraction.OCR.ConfidenceThreshold > 100 {
564-
return cfg, fmt.Errorf(
565-
"extraction.ocr.confidence_threshold must be 0-100, got %d",
566-
cfg.Extraction.OCR.ConfidenceThreshold,
567-
)
568-
}
569-
570-
if t := cfg.Extraction.OCRSpatialConfThreshold(); t < 0 || t > 100 {
558+
if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
571559
return cfg, fmt.Errorf(
572-
"extraction.ocr.spatial_conf_threshold must be 0-100, got %d", t,
560+
"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
573561
)
574562
}
575563

internal/config/config_test.go

Lines changed: 7 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -636,10 +636,9 @@ func TestEnvVars(t *testing.T) {
636636
"MICASA_EXTRACTION_LLM_TIMEOUT": "extraction.llm_timeout",
637637
"MICASA_EXTRACTION_THINKING": "extraction.thinking",
638638

639-
"MICASA_EXTRACTION_OCR_ENABLE": "extraction.ocr.enable",
640-
"MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD": "extraction.ocr.confidence_threshold",
641-
"MICASA_EXTRACTION_OCR_TSV": "extraction.ocr.tsv",
642-
"MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD": "extraction.ocr.spatial_conf_threshold",
639+
"MICASA_EXTRACTION_OCR_ENABLE": "extraction.ocr.enable",
640+
"MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD": "extraction.ocr.confidence_threshold",
641+
"MICASA_EXTRACTION_OCR_TSV": "extraction.ocr.tsv",
643642

644643
"MICASA_LOCALE_CURRENCY": "locale.currency",
645644

@@ -789,15 +788,15 @@ func TestOCRDefaults(t *testing.T) {
789788
cfg, err := LoadFromPath(noConfig(t))
790789
require.NoError(t, err)
791790
assert.True(t, cfg.Extraction.IsOCREnabled())
792-
assert.Equal(t, 0, cfg.Extraction.OCR.ConfidenceThreshold)
791+
assert.Equal(t, 70, cfg.Extraction.OCRConfThreshold())
793792
}
794793

795794
func TestOCRFromFile(t *testing.T) {
796-
path := writeConfig(t, "[extraction.ocr]\nenable = false\nconfidence_threshold = 70\n")
795+
path := writeConfig(t, "[extraction.ocr]\nenable = false\nconfidence_threshold = 50\n")
797796
cfg, err := LoadFromPath(path)
798797
require.NoError(t, err)
799798
assert.False(t, cfg.Extraction.IsOCREnabled())
800-
assert.Equal(t, 70, cfg.Extraction.OCR.ConfidenceThreshold)
799+
assert.Equal(t, 50, cfg.Extraction.OCRConfThreshold())
801800
}
802801

803802
func TestOCREnvOverrides(t *testing.T) {
@@ -806,7 +805,7 @@ func TestOCREnvOverrides(t *testing.T) {
806805
cfg, err := LoadFromPath(noConfig(t))
807806
require.NoError(t, err)
808807
assert.False(t, cfg.Extraction.IsOCREnabled())
809-
assert.Equal(t, 80, cfg.Extraction.OCR.ConfidenceThreshold)
808+
assert.Equal(t, 80, cfg.Extraction.OCRConfThreshold())
810809
}
811810

812811
func TestOCRConfidenceThresholdValidation(t *testing.T) {
@@ -1468,61 +1467,6 @@ func TestExtractionOCRTSVEnvInvalidReturnsError(t *testing.T) {
14681467
assert.Contains(t, err.Error(), "expected true or false")
14691468
}
14701469

1471-
// --- OCR spatial confidence threshold ---
1472-
1473-
func TestExtractionOCRSpatialConfThresholdDefault70(t *testing.T) {
1474-
cfg, err := LoadFromPath(noConfig(t))
1475-
require.NoError(t, err)
1476-
assert.Equal(t, 70, cfg.Extraction.OCRSpatialConfThreshold(),
1477-
"OCR spatial confidence threshold should default to 70")
1478-
}
1479-
1480-
func TestExtractionOCRSpatialConfThresholdFromTOML(t *testing.T) {
1481-
path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 50\n")
1482-
cfg, err := LoadFromPath(path)
1483-
require.NoError(t, err)
1484-
assert.Equal(t, 50, cfg.Extraction.OCRSpatialConfThreshold())
1485-
}
1486-
1487-
func TestExtractionOCRSpatialConfThresholdFromTOMLZero(t *testing.T) {
1488-
path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 0\n")
1489-
cfg, err := LoadFromPath(path)
1490-
require.NoError(t, err)
1491-
assert.Equal(t, 0, cfg.Extraction.OCRSpatialConfThreshold(),
1492-
"zero threshold should disable confidence annotations")
1493-
}
1494-
1495-
func TestExtractionOCRSpatialConfThresholdFromEnv(t *testing.T) {
1496-
t.Setenv("MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD", "80")
1497-
cfg, err := LoadFromPath(noConfig(t))
1498-
require.NoError(t, err)
1499-
assert.Equal(t, 80, cfg.Extraction.OCRSpatialConfThreshold())
1500-
}
1501-
1502-
func TestExtractionOCRSpatialConfThresholdEnvInvalidReturnsError(t *testing.T) {
1503-
t.Setenv("MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD", "high")
1504-
_, err := LoadFromPath(noConfig(t))
1505-
require.Error(t, err)
1506-
assert.Contains(t, err.Error(), "MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD")
1507-
}
1508-
1509-
func TestExtractionOCRSpatialConfThresholdOutOfRange(t *testing.T) {
1510-
t.Parallel()
1511-
path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 101\n")
1512-
_, err := LoadFromPath(path)
1513-
require.Error(t, err)
1514-
assert.Contains(t, err.Error(), "extraction.ocr.spatial_conf_threshold")
1515-
assert.Contains(t, err.Error(), "0-100")
1516-
}
1517-
1518-
func TestExtractionOCRSpatialConfThresholdNegative(t *testing.T) {
1519-
t.Parallel()
1520-
path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = -1\n")
1521-
_, err := LoadFromPath(path)
1522-
require.Error(t, err)
1523-
assert.Contains(t, err.Error(), "extraction.ocr.spatial_conf_threshold")
1524-
}
1525-
15261470
func TestFilePickerDir_FromTOML(t *testing.T) {
15271471
t.Parallel()
15281472
dir := t.TempDir()

internal/extract/extractor.go

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,21 +32,19 @@ type Extractor interface {
3232
// pdftotext, plaintext, PDF OCR, image OCR. maxPages of 0 means no limit
3333
// (all pages). Zero timeout causes the concrete extractor to use its default.
3434
// ocrEnabled controls whether OCR extractors are included (default true).
35-
// confidenceThreshold filters OCR words below this confidence (0 = no filter).
3635
func DefaultExtractors(
3736
maxPages int,
3837
timeout time.Duration,
3938
ocrEnabled bool,
40-
confidenceThreshold int,
4139
) []Extractor {
4240
ext := []Extractor{
4341
&PDFTextExtractor{Timeout: timeout},
4442
&PlainTextExtractor{},
4543
}
4644
if ocrEnabled {
4745
ext = append(ext,
48-
&PDFOCRExtractor{MaxPages: maxPages, ConfidenceThreshold: confidenceThreshold},
49-
&ImageOCRExtractor{ConfidenceThreshold: confidenceThreshold},
46+
&PDFOCRExtractor{MaxPages: maxPages},
47+
&ImageOCRExtractor{},
5048
)
5149
}
5250
return ext
@@ -153,8 +151,7 @@ func (e *PlainTextExtractor) Extract(_ context.Context, data []byte) (TextSource
153151

154152
// PDFOCRExtractor wraps ocrPDF for scanned PDF pages.
155153
type PDFOCRExtractor struct {
156-
MaxPages int
157-
ConfidenceThreshold int
154+
MaxPages int
158155
}
159156

160157
func (e *PDFOCRExtractor) Tool() string { return "tesseract" }
@@ -169,10 +166,6 @@ func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource,
169166
if err != nil {
170167
return TextSource{}, err
171168
}
172-
if e.ConfidenceThreshold > 0 {
173-
tsv = filterTSVByConfidence(tsv, e.ConfidenceThreshold)
174-
text = textFromTSV(tsv)
175-
}
176169
return TextSource{
177170
Tool: "tesseract",
178171
Desc: "Text recognized from rasterized page images. Covers scanned pages that pdftotext misses, but may contain OCR errors.",
@@ -182,9 +175,7 @@ func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource,
182175
}
183176

184177
// ImageOCRExtractor wraps ocrImage for direct image OCR.
185-
type ImageOCRExtractor struct {
186-
ConfidenceThreshold int
187-
}
178+
type ImageOCRExtractor struct{}
188179

189180
func (e *ImageOCRExtractor) Tool() string { return "tesseract" }
190181
func (e *ImageOCRExtractor) Matches(mime string) bool { return IsImageMIME(mime) }
@@ -198,10 +189,6 @@ func (e *ImageOCRExtractor) Extract(ctx context.Context, data []byte) (TextSourc
198189
if err != nil {
199190
return TextSource{}, err
200191
}
201-
if e.ConfidenceThreshold > 0 {
202-
tsv = filterTSVByConfidence(tsv, e.ConfidenceThreshold)
203-
text = textFromTSV(tsv)
204-
}
205192
return TextSource{
206193
Tool: "tesseract",
207194
Desc: "Text recognized from the image. May contain OCR errors.",

internal/extract/extractor_test.go

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ func TestImageOCRExtractor_Available(t *testing.T) {
150150

151151
func TestDefaultExtractors_Order(t *testing.T) {
152152
t.Parallel()
153-
extractors := DefaultExtractors(0, 0, true, 0)
153+
extractors := DefaultExtractors(0, 0, true)
154154
require.Len(t, extractors, 4)
155155
assert.Equal(t, "pdftotext", extractors[0].Tool())
156156
assert.Equal(t, "plaintext", extractors[1].Tool())
@@ -166,7 +166,7 @@ func TestDefaultExtractors_Order(t *testing.T) {
166166

167167
func TestDefaultExtractors_Passthrough(t *testing.T) {
168168
t.Parallel()
169-
extractors := DefaultExtractors(42, 99, true, 0)
169+
extractors := DefaultExtractors(42, 99, true)
170170
pdfExt, ok := extractors[0].(*PDFTextExtractor)
171171
require.True(t, ok)
172172
assert.Equal(t, 99, int(pdfExt.Timeout))
@@ -178,50 +178,38 @@ func TestDefaultExtractors_Passthrough(t *testing.T) {
178178

179179
func TestDefaultExtractors_OCRDisabled(t *testing.T) {
180180
t.Parallel()
181-
extractors := DefaultExtractors(0, 0, false, 0)
181+
extractors := DefaultExtractors(0, 0, false)
182182
require.Len(t, extractors, 2)
183183
assert.Equal(t, "pdftotext", extractors[0].Tool())
184184
assert.Equal(t, "plaintext", extractors[1].Tool())
185185
}
186186

187-
func TestDefaultExtractors_ConfidencePassthrough(t *testing.T) {
188-
t.Parallel()
189-
extractors := DefaultExtractors(0, 0, true, 70)
190-
pdfOCR, ok := extractors[2].(*PDFOCRExtractor)
191-
require.True(t, ok)
192-
assert.Equal(t, 70, pdfOCR.ConfidenceThreshold)
193-
194-
imgOCR, ok := extractors[3].(*ImageOCRExtractor)
195-
require.True(t, ok)
196-
assert.Equal(t, 70, imgOCR.ConfidenceThreshold)
197-
}
198-
199187
// --- HasMatchingExtractor ---
200188

201189
func TestHasMatchingExtractor_Tesseract_PDF(t *testing.T) {
202190
t.Parallel()
203-
extractors := DefaultExtractors(0, 0, true, 0)
191+
extractors := DefaultExtractors(0, 0, true)
204192
got := HasMatchingExtractor(extractors, "tesseract", "application/pdf")
205193
assert.Equal(t, OCRAvailable(), got)
206194
}
207195

208196
func TestHasMatchingExtractor_Tesseract_Image(t *testing.T) {
209197
t.Parallel()
210-
extractors := DefaultExtractors(0, 0, true, 0)
198+
extractors := DefaultExtractors(0, 0, true)
211199
got := HasMatchingExtractor(extractors, "tesseract", "image/png")
212200
assert.Equal(t, ImageOCRAvailable(), got)
213201
}
214202

215203
func TestHasMatchingExtractor_Pdftotext(t *testing.T) {
216204
t.Parallel()
217-
extractors := DefaultExtractors(0, 0, true, 0)
205+
extractors := DefaultExtractors(0, 0, true)
218206
got := HasMatchingExtractor(extractors, "pdftotext", "application/pdf")
219207
assert.Equal(t, HasPDFToText(), got)
220208
}
221209

222210
func TestHasMatchingExtractor_NoMatch(t *testing.T) {
223211
t.Parallel()
224-
extractors := DefaultExtractors(0, 0, true, 0)
212+
extractors := DefaultExtractors(0, 0, true)
225213
assert.False(t, HasMatchingExtractor(extractors, "tesseract", "text/plain"))
226214
assert.False(t, HasMatchingExtractor(extractors, "pdftotext", "image/png"))
227215
assert.False(t, HasMatchingExtractor(extractors, "nonexistent", "application/pdf"))
@@ -231,21 +219,21 @@ func TestHasMatchingExtractor_NoMatch(t *testing.T) {
231219

232220
func TestNeedsOCR_PDF(t *testing.T) {
233221
t.Parallel()
234-
extractors := DefaultExtractors(0, 0, true, 0)
222+
extractors := DefaultExtractors(0, 0, true)
235223
got := NeedsOCR(extractors, "application/pdf")
236224
assert.Equal(t, OCRAvailable(), got)
237225
}
238226

239227
func TestNeedsOCR_Image(t *testing.T) {
240228
t.Parallel()
241-
extractors := DefaultExtractors(0, 0, true, 0)
229+
extractors := DefaultExtractors(0, 0, true)
242230
got := NeedsOCR(extractors, "image/png")
243231
assert.Equal(t, ImageOCRAvailable(), got)
244232
}
245233

246234
func TestNeedsOCR_PlainText(t *testing.T) {
247235
t.Parallel()
248-
extractors := DefaultExtractors(0, 0, true, 0)
236+
extractors := DefaultExtractors(0, 0, true)
249237
assert.False(t, NeedsOCR(extractors, "text/plain"))
250238
}
251239

@@ -260,7 +248,7 @@ func TestNeedsOCR_NoOCRExtractors(t *testing.T) {
260248

261249
func TestExtractorTimeout(t *testing.T) {
262250
t.Parallel()
263-
extractors := DefaultExtractors(0, 42, true, 0)
251+
extractors := DefaultExtractors(0, 42, true)
264252
assert.Equal(t, time.Duration(42), ExtractorTimeout(extractors))
265253
}
266254

@@ -272,7 +260,7 @@ func TestExtractorTimeout_NoPDFText(t *testing.T) {
272260

273261
func TestExtractorMaxPages(t *testing.T) {
274262
t.Parallel()
275-
extractors := DefaultExtractors(15, 0, true, 0)
263+
extractors := DefaultExtractors(15, 0, true)
276264
assert.Equal(t, 15, ExtractorMaxPages(extractors))
277265
}
278266

0 commit comments

Comments
 (0)