Skip to content

Commit 7f14914

Browse files
JihaoXin and claude committed
Fix body page count for dual-column: measure right column fill ratio
Old logic: found the "References" keyword's y-position and computed the page fraction from it. This was inaccurate for dual-column layouts — a page with the left column full but the right column only 30% filled would report as ~6.0 pages (looked OK) when the actual content was only ~5.3 effective pages.

New logic:
- Detect dual-column by checking whether text exists in both page halves
- For dual-column: fill ratio = right column's last text y / page height
- For single-column: same as before (last text y or References y)
- Handles \clearpage before the bibliography (References on a separate page)

Example: MARCO project 6.03 → 5.81 (accurate), SeedBrief 6.1 → 5.86

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 54e93a2 commit 7f14914

File tree

1 file changed

+74
-21
lines changed

1 file changed

+74
-21
lines changed

ark/compiler.py

Lines changed: 74 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -279,34 +279,87 @@ def _count_pdf_pages(self, pdf_path: Path) -> int:
279279
def _count_body_pages(self, pdf_path: Path) -> float:
280280
"""Count body pages (before References section) using PyMuPDF.
281281
282-
Returns a float: integer part = complete pages before References,
283-
fractional part = how far down that page References starts (0.0–1.0).
284-
E.g. 6.3 means body fills 6 complete pages plus 30% of the next page.
282+
For dual-column templates, measures the RIGHT column fill on the last
283+
body page, which is what matters for page limit compliance.
284+
285+
Returns a float: e.g. 5.8 means 5 full pages + last page 80% filled.
286+
For dual-column: last page fill = right column fill ratio (not left).
285287
"""
286288
try:
287289
import fitz
288290
doc = fitz.open(str(pdf_path))
291+
292+
# Find which page has "References"
293+
ref_page_idx = None
289294
for i in range(len(doc)):
290-
page = doc[i]
291-
text = page.get_text()
292-
found = any(line.strip() == 'References' for line in text.split('\n'))
293-
if not found:
294-
continue
295-
# References is on page i — measure its y-position
296-
page_height = page.rect.height
297-
for block in page.get_text("dict")["blocks"]:
298-
for line_obj in block.get("lines", []):
299-
line_text = "".join(s["text"] for s in line_obj.get("spans", []))
300-
if line_text.strip() == "References":
301-
ref_y = line_obj["bbox"][1]
302-
doc.close()
303-
return i + ref_y / page_height
304-
# Fallback: found in plain text but not in dict blocks
295+
text = doc[i].get_text()
296+
if any(line.strip() == 'References' for line in text.split('\n')):
297+
ref_page_idx = i
298+
break
299+
300+
if ref_page_idx is None:
301+
# No References found — all pages are body
302+
total = len(doc)
305303
doc.close()
306-
return float(i)
307-
total = len(doc)
304+
return float(total)
305+
306+
# The last body page is the page BEFORE References
307+
# (if References has its own page via \clearpage)
308+
# OR the same page (if References starts mid-page)
309+
ref_page = doc[ref_page_idx]
310+
ref_page_text = ref_page.get_text()
311+
312+
# Check if References is at the very top of its page (i.e., \clearpage was used)
313+
ref_y = 0
314+
for block in ref_page.get_text("dict")["blocks"]:
315+
for line_obj in block.get("lines", []):
316+
line_text = "".join(s["text"] for s in line_obj.get("spans", []))
317+
if line_text.strip() == "References":
318+
ref_y = line_obj["bbox"][1]
319+
break
320+
if ref_y > 0:
321+
break
322+
323+
page_height = ref_page.rect.height
324+
ref_at_top = ref_y < page_height * 0.15 # References in top 15% = separate page
325+
326+
if ref_at_top and ref_page_idx > 0:
327+
# References on its own page — last body page is previous page
328+
last_body_idx = ref_page_idx - 1
329+
else:
330+
# References starts mid-page — body ends partway through this page
331+
last_body_idx = ref_page_idx
332+
333+
last_body_page = doc[last_body_idx]
334+
page_width = last_body_page.rect.width
335+
page_height = last_body_page.rect.height
336+
337+
# Detect dual-column by checking if text exists in both halves
338+
blocks = last_body_page.get_text("blocks")
339+
mid_x = page_width / 2
340+
left_blocks = [b for b in blocks if b[0] < mid_x and b[3] > page_height * 0.1]
341+
right_blocks = [b for b in blocks if b[0] >= mid_x and b[3] > page_height * 0.1]
342+
343+
is_dual_column = len(left_blocks) > 0 and len(right_blocks) > 0
344+
345+
if is_dual_column:
346+
# Dual column: fill ratio = right column's last text y / page height
347+
right_last_y = max(b[3] for b in right_blocks) if right_blocks else 0
348+
fill_ratio = right_last_y / page_height
349+
elif ref_at_top:
350+
# Single column, References on separate page: check last body page fill
351+
if blocks:
352+
last_y = max(b[3] for b in blocks if b[3] > page_height * 0.1)
353+
fill_ratio = last_y / page_height
354+
else:
355+
fill_ratio = 0.0
356+
else:
357+
# Single column, References mid-page: body ends at References y
358+
fill_ratio = ref_y / page_height
359+
360+
result = last_body_idx + fill_ratio
308361
doc.close()
309-
return float(total)
362+
return result
310363
except Exception as e:
311364
self.log(f"Body page count failed: {e}", "WARN")
312365
return 0.0

0 commit comments

Comments
 (0)