micasa-dev
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 4 additions & 0 deletions
diff --git a/cmd/micasa/main.go b/cmd/micasa/main.go
Lines changed: 1 addition & 2 deletions
diff --git a/internal/config/config.go b/internal/config/config.go
Lines changed: 9 additions & 21 deletions
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
Lines changed: 7 additions & 63 deletions
diff --git a/internal/extract/extractor.go b/internal/extract/extractor.go
Lines changed: 4 additions & 17 deletions
diff --git a/internal/extract/extractor_test.go b/internal/extract/extractor_test.go
Lines changed: 12 additions & 24 deletions
@@ -203,6 +203,10 @@ details; do not duplicate that detail here.
 
 ### Git and CI
 
+- **Reply to PR review comments**: After addressing a PR review comment,
+  reply to it on GitHub (`gh api repos/{owner}/{repo}/pulls/N/comments/ID/replies`)
+  explaining how it was addressed (commit hash, what changed, tests added).
+  Do this for every comment, not just some.
 - **Never use `git commit --no-verify`**: No exceptions. Fix every hook
   failure before committing.
 - **Treat all linter/compiler warnings as bugs**: Fix all warnings from
 
@@ -163,7 +163,6 @@ func (cmd *runCmd) Run() error {
 		cfg.Extraction.MaxPages,
 		0, // pdftotext uses its own internal default timeout (30s)
 		cfg.Extraction.IsOCREnabled(),
-		cfg.Extraction.OCR.ConfidenceThreshold,
 	)
 	opts.SetExtraction(
 		exCfg.Provider,
@@ -176,7 +175,7 @@ func (cmd *runCmd) Run() error {
 		cfg.Extraction.IsEnabled(),
 		cfg.Extraction.LLMTimeoutDuration(),
 		cfg.Extraction.IsOCRTSV(),
-		cfg.Extraction.OCRSpatialConfThreshold(),
+		cfg.Extraction.OCRConfThreshold(),
 	)
 
 	model, err := app.NewModel(store, opts)
 
@@ -292,22 +292,17 @@ type OCR struct {
 	// When disabled, scanned pages and images produce no text. Default: true.
 	Enable *bool `toml:"enable,omitempty"`
 
-	// ConfidenceThreshold is the minimum tesseract word confidence (0-100)
-	// to keep in OCR output. Words below this threshold are dropped.
-	// 0 means no filtering (all words kept). Default: 0.
-	ConfidenceThreshold int `toml:"confidence_threshold"`
-
 	// TSV sends spatial layout annotations (line-level bounding boxes
 	// and confidence scores) from tesseract OCR to the LLM alongside text.
 	// This helps extraction accuracy for invoices and forms with tabular
 	// data, at ~2x token overhead. Default: true.
 	TSV *bool `toml:"tsv,omitempty"`
 
-	// SpatialConfThresholdVal is the confidence threshold (0-100) below
+	// ConfidenceThresholdVal is the confidence threshold (0-100) below
 	// which OCR confidence annotations are included in spatial layout
 	// output. Lines with min confidence >= this value omit the score to
 	// save tokens. Set to 0 to never show confidence. Default: 70.
-	SpatialConfThresholdVal *int `toml:"spatial_conf_threshold,omitempty"`
+	ConfidenceThresholdVal *int `toml:"confidence_threshold,omitempty"`
 }
 
 // IsEnabled returns whether LLM extraction is enabled. Defaults to true
@@ -356,11 +351,11 @@ func (e Extraction) IsOCRTSV() bool {
 	return true
 }
 
-// OCRSpatialConfThreshold returns the confidence threshold below which
-// OCR confidence annotations appear in spatial output. Defaults to 70.
-func (e Extraction) OCRSpatialConfThreshold() int {
-	if e.OCR.SpatialConfThresholdVal != nil {
-		return *e.OCR.SpatialConfThresholdVal
+// OCRConfThreshold returns the confidence threshold below which OCR
+// confidence annotations appear in spatial output. Defaults to 70.
+func (e Extraction) OCRConfThreshold() int {
+	if e.OCR.ConfidenceThresholdVal != nil {
+		return *e.OCR.ConfidenceThresholdVal
 	}
 	return 70
 }
@@ -560,16 +555,9 @@ func LoadFromPath(path string) (Config, error) {
 		)
 	}
 
-	if cfg.Extraction.OCR.ConfidenceThreshold < 0 || cfg.Extraction.OCR.ConfidenceThreshold > 100 {
-		return cfg, fmt.Errorf(
-			"extraction.ocr.confidence_threshold must be 0-100, got %d",
-			cfg.Extraction.OCR.ConfidenceThreshold,
-		)
-	}
-
-	if t := cfg.Extraction.OCRSpatialConfThreshold(); t < 0 || t > 100 {
+	if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
 		return cfg, fmt.Errorf(
-			"extraction.ocr.spatial_conf_threshold must be 0-100, got %d", t,
+			"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
 		)
 	}
 
 
@@ -636,10 +636,9 @@ func TestEnvVars(t *testing.T) {
 		"MICASA_EXTRACTION_LLM_TIMEOUT": "extraction.llm_timeout",
 		"MICASA_EXTRACTION_THINKING":    "extraction.thinking",
 
-		"MICASA_EXTRACTION_OCR_ENABLE":                   "extraction.ocr.enable",
-		"MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD":     "extraction.ocr.confidence_threshold",
-		"MICASA_EXTRACTION_OCR_TSV":                      "extraction.ocr.tsv",
-		"MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD":   "extraction.ocr.spatial_conf_threshold",
+		"MICASA_EXTRACTION_OCR_ENABLE":               "extraction.ocr.enable",
+		"MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD": "extraction.ocr.confidence_threshold",
+		"MICASA_EXTRACTION_OCR_TSV":                  "extraction.ocr.tsv",
 
 		"MICASA_LOCALE_CURRENCY": "locale.currency",
 
@@ -789,15 +788,15 @@ func TestOCRDefaults(t *testing.T) {
 	cfg, err := LoadFromPath(noConfig(t))
 	require.NoError(t, err)
 	assert.True(t, cfg.Extraction.IsOCREnabled())
-	assert.Equal(t, 0, cfg.Extraction.OCR.ConfidenceThreshold)
+	assert.Equal(t, 70, cfg.Extraction.OCRConfThreshold())
 }
 
 func TestOCRFromFile(t *testing.T) {
-	path := writeConfig(t, "[extraction.ocr]\nenable = false\nconfidence_threshold = 70\n")
+	path := writeConfig(t, "[extraction.ocr]\nenable = false\nconfidence_threshold = 50\n")
 	cfg, err := LoadFromPath(path)
 	require.NoError(t, err)
 	assert.False(t, cfg.Extraction.IsOCREnabled())
-	assert.Equal(t, 70, cfg.Extraction.OCR.ConfidenceThreshold)
+	assert.Equal(t, 50, cfg.Extraction.OCRConfThreshold())
 }
 
 func TestOCREnvOverrides(t *testing.T) {
@@ -806,7 +805,7 @@ func TestOCREnvOverrides(t *testing.T) {
 	cfg, err := LoadFromPath(noConfig(t))
 	require.NoError(t, err)
 	assert.False(t, cfg.Extraction.IsOCREnabled())
-	assert.Equal(t, 80, cfg.Extraction.OCR.ConfidenceThreshold)
+	assert.Equal(t, 80, cfg.Extraction.OCRConfThreshold())
 }
 
 func TestOCRConfidenceThresholdValidation(t *testing.T) {
@@ -1468,61 +1467,6 @@ func TestExtractionOCRTSVEnvInvalidReturnsError(t *testing.T) {
 	assert.Contains(t, err.Error(), "expected true or false")
 }
 
-// --- OCR spatial confidence threshold ---
-
-func TestExtractionOCRSpatialConfThresholdDefault70(t *testing.T) {
-	cfg, err := LoadFromPath(noConfig(t))
-	require.NoError(t, err)
-	assert.Equal(t, 70, cfg.Extraction.OCRSpatialConfThreshold(),
-		"OCR spatial confidence threshold should default to 70")
-}
-
-func TestExtractionOCRSpatialConfThresholdFromTOML(t *testing.T) {
-	path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 50\n")
-	cfg, err := LoadFromPath(path)
-	require.NoError(t, err)
-	assert.Equal(t, 50, cfg.Extraction.OCRSpatialConfThreshold())
-}
-
-func TestExtractionOCRSpatialConfThresholdFromTOMLZero(t *testing.T) {
-	path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 0\n")
-	cfg, err := LoadFromPath(path)
-	require.NoError(t, err)
-	assert.Equal(t, 0, cfg.Extraction.OCRSpatialConfThreshold(),
-		"zero threshold should disable confidence annotations")
-}
-
-func TestExtractionOCRSpatialConfThresholdFromEnv(t *testing.T) {
-	t.Setenv("MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD", "80")
-	cfg, err := LoadFromPath(noConfig(t))
-	require.NoError(t, err)
-	assert.Equal(t, 80, cfg.Extraction.OCRSpatialConfThreshold())
-}
-
-func TestExtractionOCRSpatialConfThresholdEnvInvalidReturnsError(t *testing.T) {
-	t.Setenv("MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD", "high")
-	_, err := LoadFromPath(noConfig(t))
-	require.Error(t, err)
-	assert.Contains(t, err.Error(), "MICASA_EXTRACTION_OCR_SPATIAL_CONF_THRESHOLD")
-}
-
-func TestExtractionOCRSpatialConfThresholdOutOfRange(t *testing.T) {
-	t.Parallel()
-	path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = 101\n")
-	_, err := LoadFromPath(path)
-	require.Error(t, err)
-	assert.Contains(t, err.Error(), "extraction.ocr.spatial_conf_threshold")
-	assert.Contains(t, err.Error(), "0-100")
-}
-
-func TestExtractionOCRSpatialConfThresholdNegative(t *testing.T) {
-	t.Parallel()
-	path := writeConfig(t, "[extraction.ocr]\nspatial_conf_threshold = -1\n")
-	_, err := LoadFromPath(path)
-	require.Error(t, err)
-	assert.Contains(t, err.Error(), "extraction.ocr.spatial_conf_threshold")
-}
-
 func TestFilePickerDir_FromTOML(t *testing.T) {
 	t.Parallel()
 	dir := t.TempDir()
 
@@ -32,21 +32,19 @@ type Extractor interface {
 // pdftotext, plaintext, PDF OCR, image OCR. maxPages of 0 means no limit
 // (all pages). Zero timeout causes the concrete extractor to use its default.
 // ocrEnabled controls whether OCR extractors are included (default true).
-// confidenceThreshold filters OCR words below this confidence (0 = no filter).
 func DefaultExtractors(
 	maxPages int,
 	timeout time.Duration,
 	ocrEnabled bool,
-	confidenceThreshold int,
 ) []Extractor {
 	ext := []Extractor{
 		&PDFTextExtractor{Timeout: timeout},
 		&PlainTextExtractor{},
 	}
 	if ocrEnabled {
 		ext = append(ext,
-			&PDFOCRExtractor{MaxPages: maxPages, ConfidenceThreshold: confidenceThreshold},
-			&ImageOCRExtractor{ConfidenceThreshold: confidenceThreshold},
+			&PDFOCRExtractor{MaxPages: maxPages},
+			&ImageOCRExtractor{},
 		)
 	}
 	return ext
@@ -153,8 +151,7 @@ func (e *PlainTextExtractor) Extract(_ context.Context, data []byte) (TextSource
 
 // PDFOCRExtractor wraps ocrPDF for scanned PDF pages.
 type PDFOCRExtractor struct {
-	MaxPages            int
-	ConfidenceThreshold int
+	MaxPages int
 }
 
 func (e *PDFOCRExtractor) Tool() string             { return "tesseract" }
@@ -169,10 +166,6 @@ func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource,
 	if err != nil {
 		return TextSource{}, err
 	}
-	if e.ConfidenceThreshold > 0 {
-		tsv = filterTSVByConfidence(tsv, e.ConfidenceThreshold)
-		text = textFromTSV(tsv)
-	}
 	return TextSource{
 		Tool: "tesseract",
 		Desc: "Text recognized from rasterized page images. Covers scanned pages that pdftotext misses, but may contain OCR errors.",
@@ -182,9 +175,7 @@ func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource,
 }
 
 // ImageOCRExtractor wraps ocrImage for direct image OCR.
-type ImageOCRExtractor struct {
-	ConfidenceThreshold int
-}
+type ImageOCRExtractor struct{}
 
 func (e *ImageOCRExtractor) Tool() string             { return "tesseract" }
 func (e *ImageOCRExtractor) Matches(mime string) bool { return IsImageMIME(mime) }
@@ -198,10 +189,6 @@ func (e *ImageOCRExtractor) Extract(ctx context.Context, data []byte) (TextSourc
 	if err != nil {
 		return TextSource{}, err
 	}
-	if e.ConfidenceThreshold > 0 {
-		tsv = filterTSVByConfidence(tsv, e.ConfidenceThreshold)
-		text = textFromTSV(tsv)
-	}
 	return TextSource{
 		Tool: "tesseract",
 		Desc: "Text recognized from the image. May contain OCR errors.",
 
@@ -150,7 +150,7 @@ func TestImageOCRExtractor_Available(t *testing.T) {
 
 func TestDefaultExtractors_Order(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	require.Len(t, extractors, 4)
 	assert.Equal(t, "pdftotext", extractors[0].Tool())
 	assert.Equal(t, "plaintext", extractors[1].Tool())
@@ -166,7 +166,7 @@ func TestDefaultExtractors_Order(t *testing.T) {
 
 func TestDefaultExtractors_Passthrough(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(42, 99, true, 0)
+	extractors := DefaultExtractors(42, 99, true)
 	pdfExt, ok := extractors[0].(*PDFTextExtractor)
 	require.True(t, ok)
 	assert.Equal(t, 99, int(pdfExt.Timeout))
@@ -178,50 +178,38 @@ func TestDefaultExtractors_Passthrough(t *testing.T) {
 
 func TestDefaultExtractors_OCRDisabled(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, false, 0)
+	extractors := DefaultExtractors(0, 0, false)
 	require.Len(t, extractors, 2)
 	assert.Equal(t, "pdftotext", extractors[0].Tool())
 	assert.Equal(t, "plaintext", extractors[1].Tool())
 }
 
-func TestDefaultExtractors_ConfidencePassthrough(t *testing.T) {
-	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 70)
-	pdfOCR, ok := extractors[2].(*PDFOCRExtractor)
-	require.True(t, ok)
-	assert.Equal(t, 70, pdfOCR.ConfidenceThreshold)
-
-	imgOCR, ok := extractors[3].(*ImageOCRExtractor)
-	require.True(t, ok)
-	assert.Equal(t, 70, imgOCR.ConfidenceThreshold)
-}
-
 // --- HasMatchingExtractor ---
 
 func TestHasMatchingExtractor_Tesseract_PDF(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	got := HasMatchingExtractor(extractors, "tesseract", "application/pdf")
 	assert.Equal(t, OCRAvailable(), got)
 }
 
 func TestHasMatchingExtractor_Tesseract_Image(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	got := HasMatchingExtractor(extractors, "tesseract", "image/png")
 	assert.Equal(t, ImageOCRAvailable(), got)
 }
 
 func TestHasMatchingExtractor_Pdftotext(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	got := HasMatchingExtractor(extractors, "pdftotext", "application/pdf")
 	assert.Equal(t, HasPDFToText(), got)
 }
 
 func TestHasMatchingExtractor_NoMatch(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	assert.False(t, HasMatchingExtractor(extractors, "tesseract", "text/plain"))
 	assert.False(t, HasMatchingExtractor(extractors, "pdftotext", "image/png"))
 	assert.False(t, HasMatchingExtractor(extractors, "nonexistent", "application/pdf"))
@@ -231,21 +219,21 @@ func TestHasMatchingExtractor_NoMatch(t *testing.T) {
 
 func TestNeedsOCR_PDF(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	got := NeedsOCR(extractors, "application/pdf")
 	assert.Equal(t, OCRAvailable(), got)
 }
 
 func TestNeedsOCR_Image(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	got := NeedsOCR(extractors, "image/png")
 	assert.Equal(t, ImageOCRAvailable(), got)
 }
 
 func TestNeedsOCR_PlainText(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 0, true, 0)
+	extractors := DefaultExtractors(0, 0, true)
 	assert.False(t, NeedsOCR(extractors, "text/plain"))
 }
 
@@ -260,7 +248,7 @@ func TestNeedsOCR_NoOCRExtractors(t *testing.T) {
 
 func TestExtractorTimeout(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(0, 42, true, 0)
+	extractors := DefaultExtractors(0, 42, true)
 	assert.Equal(t, time.Duration(42), ExtractorTimeout(extractors))
 }
 
@@ -272,7 +260,7 @@ func TestExtractorTimeout_NoPDFText(t *testing.T) {
 
 func TestExtractorMaxPages(t *testing.T) {
 	t.Parallel()
-	extractors := DefaultExtractors(15, 0, true, 0)
+	extractors := DefaultExtractors(15, 0, true)
 	assert.Equal(t, 15, ExtractorMaxPages(extractors))
 }