Skip to content

Commit 769dce0

Browse files
cpcloudclaude
andcommitted
fix(extract): address PR review for spatial OCR annotations
- Fall back to plain text when TSV-to-spatial conversion yields empty - Validate confidence_threshold is 0-100 in config loading - Detect page breaks in concatenated per-page TSV via block number decrease - Fix doc comment to mention paragraph breaks - Add tests for page break detection, spatial fallback, and threshold validation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent de2a5fa commit 769dce0

6 files changed

Lines changed: 99 additions & 11 deletions

File tree

internal/config/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,12 @@ func LoadFromPath(path string) (Config, error) {
579579
)
580580
}
581581

582+
if t := cfg.Extraction.OCRConfThreshold(); t < 0 || t > 100 {
583+
return cfg, fmt.Errorf(
584+
"extraction.ocr.confidence_threshold must be 0-100, got %d", t,
585+
)
586+
}
587+
582588
checkFilePermissions(&cfg, path)
583589

584590
return cfg, nil

internal/config/config_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,6 +1424,23 @@ func TestExtractionOCRConfThresholdEnvInvalidReturnsError(t *testing.T) {
14241424
assert.Contains(t, err.Error(), "MICASA_EXTRACTION_OCR_CONFIDENCE_THRESHOLD")
14251425
}
14261426

1427+
func TestExtractionOCRConfThresholdOutOfRange(t *testing.T) {
1428+
t.Parallel()
1429+
path := writeConfig(t, "[extraction.ocr]\nconfidence_threshold = 101\n")
1430+
_, err := LoadFromPath(path)
1431+
require.Error(t, err)
1432+
assert.Contains(t, err.Error(), "extraction.ocr.confidence_threshold")
1433+
assert.Contains(t, err.Error(), "0-100")
1434+
}
1435+
1436+
func TestExtractionOCRConfThresholdNegative(t *testing.T) {
1437+
t.Parallel()
1438+
path := writeConfig(t, "[extraction.ocr]\nconfidence_threshold = -1\n")
1439+
_, err := LoadFromPath(path)
1440+
require.Error(t, err)
1441+
assert.Contains(t, err.Error(), "extraction.ocr.confidence_threshold")
1442+
}
1443+
14271444
func TestFilePickerDir_FromTOML(t *testing.T) {
14281445
t.Parallel()
14291446
dir := t.TempDir()

internal/extract/llmextract.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,18 @@ func operationExtractionUserMessage(in ExtractionPromptInput) string {
7373
fmt.Fprintf(&b, "Size: %d bytes\n", in.SizeBytes)
7474

7575
for _, src := range in.Sources {
76-
// When SendTSV is enabled and the source has TSV data, send
77-
// a compact spatial format (line-level bounding boxes) instead
78-
// of reconstructed plain text.
76+
// When SendTSV is enabled and the source has TSV data, prefer
77+
// a compact spatial format (line-level bounding boxes). If TSV
78+
// conversion yields no content (e.g., header-only/invalid TSV),
79+
// fall back to the reconstructed plain text.
7980
content := strings.TrimSpace(src.Text)
80-
hasSpatial := in.SendTSV && len(src.Data) > 0
81-
if hasSpatial {
82-
content = strings.TrimSpace(SpatialTextFromTSV(src.Data, in.ConfThreshold))
81+
hasSpatial := false
82+
if in.SendTSV && len(src.Data) > 0 {
83+
spatialContent := strings.TrimSpace(SpatialTextFromTSV(src.Data, in.ConfThreshold))
84+
if spatialContent != "" {
85+
content = spatialContent
86+
hasSpatial = true
87+
}
8388
}
8489
if content == "" {
8590
continue

internal/extract/llmextract_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,34 @@ func TestBuildExtractionPrompt_TSVSourceWithoutData(t *testing.T) {
324324
assert.Contains(t, user, "Fallback text")
325325
}
326326

327+
func TestBuildExtractionPrompt_SpatialFallbackOnEmptyTSV(t *testing.T) {
328+
t.Parallel()
329+
// TSV with only a header (no data rows) should fall back to plain text.
330+
headerOnlyTSV := "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n"
331+
msgs := BuildExtractionPrompt(ExtractionPromptInput{
332+
DocID: 1,
333+
Filename: "scan.pdf",
334+
MIME: "application/pdf",
335+
SendTSV: true,
336+
ConfThreshold: DefaultOCRConfThreshold,
337+
Sources: []TextSource{
338+
{
339+
Tool: "tesseract",
340+
Desc: "OCR.",
341+
Text: "Fallback plain text",
342+
Data: []byte(headerOnlyTSV),
343+
},
344+
},
345+
})
346+
347+
require.Len(t, msgs, 2)
348+
user := msgs[1].Content
349+
assert.Contains(t, user, "Fallback plain text",
350+
"should fall back to plain text when TSV conversion yields empty")
351+
assert.NotContains(t, user, "[",
352+
"no bounding boxes when falling back to plain text")
353+
}
354+
327355
func TestBuildExtractionPrompt_TSVPreambleMentionsSpatial(t *testing.T) {
328356
t.Parallel()
329357
msgs := BuildExtractionPrompt(ExtractionPromptInput{

internal/extract/ocr.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -370,8 +370,8 @@ const DefaultOCRConfThreshold = 70
370370
//
371371
// [left,top,width;minConf] word1 word2 ...
372372
//
373-
// Lines within the same block/paragraph are separated by newlines; block
374-
// breaks produce a blank line.
373+
// Lines within the same block/paragraph are separated by newlines; block or
374+
// paragraph breaks produce a blank line.
375375
func SpatialTextFromTSV(tsv []byte, confThreshold int) string {
376376
lines := bytes.Split(tsv, []byte("\n"))
377377
if len(lines) < 2 {
@@ -436,11 +436,16 @@ func SpatialTextFromTSV(tsv []byte, confThreshold int) string {
436436
newLine := firstLine || lineNum != cur.lineNum ||
437437
block != cur.block || par != cur.par
438438

439+
// Detect page breaks in concatenated per-page TSV output.
440+
// Each page is OCR'd independently so page_num is always 1;
441+
// a decreasing block number signals a new page's data.
442+
pageBreak := !firstLine && block < lastBlock
443+
439444
if !firstLine && newLine {
440-
// Insert block/paragraph break before flushing.
441-
if block != lastBlock || par != lastPar {
445+
// Insert block/paragraph/page break before flushing.
446+
if pageBreak || block != lastBlock || par != lastPar {
442447
flush()
443-
result.WriteByte('\n') // blank line for block break
448+
result.WriteByte('\n') // blank line for break
444449
} else {
445450
flush()
446451
}

internal/extract/ocr_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,33 @@ func TestSpatialTextFromTSV_MixedConfidenceLines(t *testing.T) {
181181
assert.Contains(t, lines[1], "Garbled text")
182182
}
183183

184+
func TestSpatialTextFromTSV_PageBreak(t *testing.T) {
185+
t.Parallel()
186+
// Simulate concatenated per-page TSV output. Each page is OCR'd
187+
// independently, so page_num is always 1. A decreasing block number
188+
// (e.g., page 1 ends with block 2, page 2 starts with block 1)
189+
// signals a page boundary and should produce a blank line.
190+
tsv := []byte(
191+
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n" +
192+
// Page 1, block 1
193+
"5\t1\t1\t1\t1\t1\t100\t200\t50\t12\t96\tPage1\n" +
194+
// Page 1, block 2
195+
"5\t1\t2\t1\t1\t1\t100\t300\t60\t12\t95\tStill1\n" +
196+
// Page 2 starts: block resets to 1 (< lastBlock 2) => page break
197+
"5\t1\t1\t1\t1\t1\t100\t200\t70\t12\t94\tPage2\n",
198+
)
199+
200+
result := SpatialTextFromTSV(tsv, DefaultOCRConfThreshold)
201+
lines := strings.Split(result, "\n")
202+
// Expect: "Page1" / "" (block break) / "Still1" / "" (page break) / "Page2"
203+
require.Len(t, lines, 5, "expected 5 lines (3 content + 2 blank separators)")
204+
assert.Contains(t, lines[0], "Page1")
205+
assert.Empty(t, lines[1], "blank line between blocks")
206+
assert.Contains(t, lines[2], "Still1")
207+
assert.Empty(t, lines[3], "blank line at page break")
208+
assert.Contains(t, lines[4], "Page2")
209+
}
210+
184211
func TestSpatialTextFromTSV_Empty(t *testing.T) {
185212
t.Parallel()
186213
assert.Empty(t, SpatialTextFromTSV(nil, DefaultOCRConfThreshold))

0 commit comments

Comments
 (0)