@@ -279,34 +279,87 @@ def _count_pdf_pages(self, pdf_path: Path) -> int:
def _count_body_pages(self, pdf_path: Path) -> float:
    """Count body pages (everything before the References heading) using PyMuPDF.

    The result is ``<0-based index of the last body page> + <fill ratio of
    that page>``, e.g. 5.8 means 5 full pages + a last page that is 80% filled.

    How the fill ratio of the last body page is measured:

    * References starts a page of its own (heading within the top 15% of a
      page other than the first, and not in the right half of the page): the
      previous page is the last body page. Its fill is the bottom edge of its
      lowest block; for dual-column pages only the RIGHT column is measured,
      which is what matters for page limit compliance.
    * References shares a page with body text: the body ends at the heading,
      so the fill is the heading's y-position. Text below or after the
      heading is reference text and is never counted.

    Blocks that end within the top 10% of a page are treated as running
    headers and ignored.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        Body length in pages. ``float(page_count)`` when no line reads exactly
        "References"; 0.0 when the PDF cannot be analysed (a WARN is logged).
    """
    header_band = 0.10       # blocks ending above this fraction are page headers
    top_of_page_band = 0.15  # a heading above this fraction "starts" the page

    try:
        # Imported lazily so a missing PyMuPDF install degrades to the WARN path.
        import fitz

        doc = fitz.open(str(pdf_path))
        try:
            # First page that contains a line reading exactly "References".
            ref_page_idx = next(
                (
                    idx
                    for idx, page in enumerate(doc)
                    if any(
                        line.strip() == "References"
                        for line in page.get_text().split("\n")
                    )
                ),
                None,
            )
            if ref_page_idx is None:
                # No References found — all pages are body.
                return float(len(doc))

            ref_page = doc[ref_page_idx]

            # Locate the heading's bounding box to learn where on the page it sits.
            heading_bbox = None
            for block in ref_page.get_text("dict")["blocks"]:
                for line_obj in block.get("lines", []):
                    line_text = "".join(
                        span["text"] for span in line_obj.get("spans", [])
                    )
                    if line_text.strip() == "References":
                        heading_bbox = line_obj["bbox"]
                        break
                if heading_bbox is not None:
                    break

            if heading_bbox is None:
                # Seen in the plain text but not as a single layout line
                # (e.g. split across blocks): assume it opens the page.
                ref_x, ref_y = 0.0, 0.0
            else:
                ref_x, ref_y = heading_bbox[0], heading_bbox[1]

            heading_at_top = ref_y < ref_page.rect.height * top_of_page_band
            # A heading at the top of the RIGHT column still has a full left
            # column of body text on the same page, so it is not a page break.
            heading_in_right_half = ref_x >= ref_page.rect.width / 2
            refs_on_own_page = (
                heading_at_top and ref_page_idx > 0 and not heading_in_right_half
            )

            if not refs_on_own_page:
                # Body and References share this page: body ends at the heading.
                return ref_page_idx + ref_y / ref_page.rect.height

            # References has its own page (\clearpage): measure the page before it.
            last_body_idx = ref_page_idx - 1
            last_body_page = doc[last_body_idx]
            page_height = last_body_page.rect.height
            mid_x = last_body_page.rect.width / 2

            # "blocks" tuples are (x0, y0, x1, y1, ...); drop header-band blocks.
            body_blocks = [
                b
                for b in last_body_page.get_text("blocks")
                if b[3] > page_height * header_band
            ]
            right_blocks = [b for b in body_blocks if b[0] >= mid_x]
            has_left_blocks = any(b[0] < mid_x for b in body_blocks)

            # Dual-column (text in both halves): only the right column counts.
            measured = right_blocks if right_blocks and has_left_blocks else body_blocks
            lowest_y = max((b[3] for b in measured), default=0.0)
            return last_body_idx + lowest_y / page_height
        finally:
            # Close the document on every path, including errors raised above.
            doc.close()
    except Exception as e:
        # Best-effort metric: report the failure and fall back to 0.0.
        self.log(f"Body page count failed: {e}", "WARN")
        return 0.0
0 commit comments