Skip to content

Commit 555cec1

Browse files
JihaoXin and claude committed
Fix body page count: use LaTeX \pdfsavepos instead of PDF text-block analysis
The old PyMuPDF-based method counted page numbers and footers as body content, inflating the fill ratio (e.g. 8.95 instead of 8.25) and letting nearly-empty last pages pass the page count check. Now injects \pdfsavepos before \clearpage\bibliography to write the exact body-end y-coordinate to .aux, giving an accurate fill ratio immune to headers/footers/page numbers. Falls back to the old PDF method when aux data is unavailable. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ff7593d commit 555cec1

File tree

2 files changed

+114
-18
lines changed

2 files changed

+114
-18
lines changed

ark/compiler.py

Lines changed: 66 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -277,14 +277,76 @@ def _count_pdf_pages(self, pdf_path: Path) -> int:
277277
return 0
278278

279279
def _count_body_pages(self, pdf_path: Path) -> float:
    """Return how many pages the body (everything before References) fills.

    Two measurement strategies are tried in order:

    1. The .aux-based measurement (``_count_body_pages_from_aux``), which
       reads the \\arkBodyEndY / \\arkPageH values that the injected
       \\pdfsavepos marker writes to the .aux file. It reports the exact
       y-position where the body ends, so page numbers, headers and
       footers cannot distort it.
    2. The PyMuPDF text-block analysis (``_count_body_pages_from_pdf``),
       which is less accurate and only used when the first strategy
       yields nothing.

    The result is fractional: 5.8 means 5 full pages plus a last page
    that is 80% filled.
    """
    aux_based = self._count_body_pages_from_aux(pdf_path)
    # ``None`` (not merely a falsy value such as 0.0) is the signal that
    # the aux data was unavailable and the PDF fallback must be used.
    if aux_based is None:
        return self._count_body_pages_from_pdf(pdf_path)
    return aux_based
298+
299+
def _count_body_pages_from_aux(self, pdf_path: Path) -> float | None:
300+
"""Read body-end position from .aux file (written by \\pdfsavepos).
301+
302+
Returns body page count as a float, or None if aux data unavailable.
303+
"""
304+
aux_path = self.latex_dir / "main.aux"
305+
if not aux_path.exists():
306+
return None
307+
try:
308+
aux_text = aux_path.read_text(errors="replace")
309+
import re
310+
m_y = re.search(r'\\gdef\\arkBodyEndY\{(\d+)\}', aux_text)
311+
m_h = re.search(r'\\gdef\\arkPageH\{(\d+)\}', aux_text)
312+
if not m_y or not m_h:
313+
return None
314+
315+
body_end_y_sp = int(m_y.group(1)) # sp from page bottom
316+
page_height_sp = int(m_h.group(1)) # total page height in sp
317+
if page_height_sp <= 0:
318+
return None
319+
320+
# fill_ratio = fraction of page used (from top)
321+
fill_ratio = 1.0 - (body_end_y_sp / page_height_sp)
322+
323+
# Determine which page the body ends on by finding References page
324+
import fitz
325+
doc = fitz.open(str(pdf_path))
326+
ref_page_idx = None
327+
for i in range(len(doc)):
328+
text = doc[i].get_text()
329+
if any(line.strip() == 'References' for line in text.split('\n')):
330+
ref_page_idx = i
331+
break
332+
doc.close()
333+
334+
if ref_page_idx is None:
335+
return None # can't determine without References marker
336+
337+
# Body ends on the page before References (since we inject
338+
# the marker right before \clearpage\bibliography)
339+
last_body_idx = max(ref_page_idx - 1, 0)
340+
result = last_body_idx + fill_ratio
341+
self.log(f"Body page count (aux): {result:.2f} "
342+
f"(page {last_body_idx+1}, {fill_ratio:.1%} filled)", "DEBUG")
343+
return result
344+
except Exception as e:
345+
self.log(f"Aux-based page count failed: {e}", "DEBUG")
346+
return None
347+
348+
def _count_body_pages_from_pdf(self, pdf_path: Path) -> float:
349+
"""Fallback: count body pages via PyMuPDF text-block analysis."""
288350
try:
289351
import fitz
290352
doc = fitz.open(str(pdf_path))
@@ -307,7 +369,6 @@ def _count_body_pages(self, pdf_path: Path) -> float:
307369
# (if References has its own page via \clearpage)
308370
# OR the same page (if References starts mid-page)
309371
ref_page = doc[ref_page_idx]
310-
ref_page_text = ref_page.get_text()
311372

312373
# Check if References is at the very top of its page (i.e., \clearpage was used)
313374
ref_y = 0
@@ -354,8 +415,6 @@ def _count_body_pages(self, pdf_path: Path) -> float:
354415
fill_ratio = right_last_y / page_height
355416
elif ref_at_top:
356417
# Single column, References on separate page: check last body page fill
357-
# Filter out headers/footers: ignore blocks in top 8% and bottom 5% of page,
358-
# and blocks shorter than 10 chars (page numbers, headers)
359418
body_blocks = [
360419
b for b in blocks
361420
if b[3] > page_height * 0.06 # below header

ark/execution.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -724,29 +724,66 @@ def _enforce_page_count(self, context: str = "post-writing") -> bool:
724724
self.log(f"[{context}] Could not determine page count after {action}", "WARN")
725725
return True
726726

727+
# LaTeX snippet that records where the body text ends and stores it in the
# .aux file. Reading the pieces below in order:
#   * \pdfsavepos marks the current position; the position is recorded when
#     the page is shipped out.
#   * The \write to \@auxout is deferred, so \pdflastypos is expanded at
#     shipout time, i.e. after \pdfsavepos has recorded the position. This
#     is why plain \write is required rather than \protected@write.
#   * The line written defines \arkBodyEndY (y-coordinate in sp, measured
#     from the page bottom) and \arkPageH (page height as a number).
# \makeatletter / \makeatother bracket the snippet because \@auxout contains
# an @ character.
_ARK_BODY_END_MARKER = "".join((
    r"\makeatletter",
    r"\pdfsavepos",
    r"\write\@auxout{",
    r"\string\gdef\string\arkBodyEndY{\the\pdflastypos}",
    r"\string\gdef\string\arkPageH{\number\pdfpageheight}",
    r"}",
    r"\makeatother",
))
738+
727739
def _ensure_clearpage_before_bibliography(self):
    """Ensure \\clearpage (plus the body-end marker) precedes the bibliography.

    Edits ``main.tex`` in ``self.latex_dir`` so that:

    * a \\clearpage stands directly before the first bibliography command
      in the document body (\\bibliographystyle{ or \\bibliography{,
      whichever comes first), which makes References start on a new page;
    * ``self._ARK_BODY_END_MARKER`` sits on its own line right before that
      \\clearpage, so the body-end y-position is written to the .aux file.

    Only commands after \\begin{document} are considered as the anchor: a
    \\bibliographystyle{...} placed in the preamble is valid LaTeX, and
    injecting \\clearpage or the marker there would put them outside the
    document body.

    The operation is idempotent: a previously injected marker is removed
    before re-inserting it, and the file is rewritten only when its
    content actually changes. Failures are logged at WARN level and never
    raised.
    """
    main_tex = self.latex_dir / "main.tex"
    if not main_tex.exists():
        return
    try:
        original = main_tex.read_text()
        if r'\bibliography{' not in original:
            return

        # Remove any previously injected body-end marker (idempotent)
        marker_line = self._ARK_BODY_END_MARKER + '\n'
        content = original.replace(marker_line, '')

        # Restrict the anchor search to the document body; fall back to
        # the whole file when \begin{document} is not in this file.
        body_start = max(content.find(r'\begin{document}'), 0)
        candidates = [
            pos
            for pos in (content.find(r'\bibliographystyle{', body_start),
                        content.find(r'\bibliography{', body_start))
            if pos >= 0
        ]
        if not candidates:
            # Bibliography commands exist only in the preamble; there is
            # no safe place in the body to inject anything.
            return
        anchor_pos = min(candidates)

        # Check if \clearpage already precedes the anchor (ignoring
        # trailing whitespace between the two).
        clearpage = r'\clearpage'
        before_anchor = content[:anchor_pos].rstrip()
        if before_anchor.endswith(clearpage):
            clearpage_pos = len(before_anchor) - len(clearpage)
        else:
            # Insert \clearpage before the anchor; it then starts exactly
            # where the anchor used to be.
            content = content[:anchor_pos] + '\\clearpage\n' + content[anchor_pos:]
            clearpage_pos = anchor_pos
            self.log("Injected \\clearpage before \\bibliography", "INFO")

        # Inject the pdfsavepos marker right before that \clearpage
        content = content[:clearpage_pos] + marker_line + content[clearpage_pos:]

        # Avoid touching the file (and its mtime) when nothing changed.
        if content != original:
            main_tex.write_text(content)
    except Exception as e:
        self.log(f"Failed to inject \\clearpage: {e}", "WARN")
752789

0 commit comments

Comments
 (0)