Skip to content

Commit 760ad4a

Browse files
committed
add NEEDS-CHECK for citation
1 parent 17c21a8 commit 760ad4a

File tree

5 files changed

+328
-422
lines changed

5 files changed

+328
-422
lines changed

ark/citation.py

Lines changed: 142 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,15 @@ def _strip_bibtex_fields(bib: str) -> str:
608608
if brace_depth <= 0:
609609
current_field_kept = None # field complete
610610

611-
return "\n".join(result)
611+
cleaned = "\n".join(result)
612+
613+
# Remove volume if it equals year (e.g. TMLR where volume=2025, year=2025)
614+
vol_match = re.search(r"volume\s*=\s*\{(\d{4})\}", cleaned)
615+
year_match = re.search(r"year\s*=\s*\{(\d{4})\}", cleaned)
616+
if vol_match and year_match and vol_match.group(1) == year_match.group(1):
617+
cleaned = re.sub(r"\s*volume\s*=\s*\{\d{4}\},?\n?", "\n", cleaned)
618+
619+
return cleaned
612620

613621

614622

@@ -944,10 +952,11 @@ def fix_bib(bib_path: str, results: list[VerificationResult]) -> None:
944952
content = content.replace(r.original_bibtex, tagged)
945953

946954
elif r.status == "NEEDS-CHECK":
947-
# Add [NEEDS-CHECK] comment if not already there
948-
if "% [NEEDS-CHECK]" not in content.split(r.original_bibtex)[0].split("\n")[-1]:
949-
tagged = f"% [NEEDS-CHECK] {r.details}\n{r.original_bibtex}"
950-
content = content.replace(r.original_bibtex, tagged)
955+
# Add note field inside the entry so it shows in PDF
956+
if "note" not in r.original_bibtex.lower():
957+
modified_entry = r.original_bibtex.rstrip().rstrip("}")
958+
modified_entry += r' note = {\textcolor{red}{[NEEDS-CHECK: citation not verified]}},' + "\n}"
959+
content = content.replace(r.original_bibtex, modified_entry)
951960

952961
bib_file.write_text(content)
953962

@@ -986,6 +995,9 @@ def append_papers_to_bib(bib_path: str, papers: list[Paper]) -> list[str]:
986995
if cite_key in existing_keys:
987996
continue
988997

998+
# Store BibTeX on the paper object for literature.yaml
999+
paper.bibtex = bib
1000+
9891001
source = paper.dblp_key and "dblp" or (paper.doi and "crossref" or paper.source)
9901002
tagged = f"% {_ARK_SOURCE_TAG}{source}]\n{bib}"
9911003
new_entries.append(tagged)
@@ -1034,7 +1046,7 @@ def _write_needs_check_to_bib(bib_path: str, titles: list[str],
10341046
counter += 1
10351047

10361048
# Build entry with whatever info we have
1037-
fields = [f" title = {{{{{title}}}}}"]
1049+
fields = [f" title = {{{title}}}"]
10381050
if author:
10391051
fields.append(f" author = {{{author} et al.}}")
10401052
if year:
@@ -1058,6 +1070,85 @@ def _write_needs_check_to_bib(bib_path: str, titles: list[str],
10581070
return added_keys
10591071

10601072

1073+
def regenerate_bib_from_literature(literature_path: str, bib_path: str) -> None:
    """Regenerate references.bib entirely from literature.yaml.

    This is the enforcement mechanism: literature.yaml is the single source
    of truth. The bib file is overwritten, so any entries that were added to
    it outside of this system are discarded.

    Two top-level lists of literature.yaml are consumed:

    * ``references`` -- each mapping with a ``bibtex_key`` contributes its
      stored ``bibtex`` text, preceded by a ``% [ARK:source=...]`` comment.
      When no BibTeX is stored, it is looked up again by ``title``; a
      successful lookup is written back to literature.yaml so the next run
      does not have to repeat it.
    * ``needs_check`` -- each mapping with a ``bibtex_key`` and ``title``
      becomes a minimal ``@misc`` entry carrying a red ``note`` field so the
      unverified citation is visible in the PDF.

    Args:
        literature_path: Path to literature.yaml. If the file is missing,
            unreadable, or does not contain a mapping at the top level, the
            function returns without touching the bib file.
        bib_path: Path of the bib file to (over)write. Missing parent
            directories are created.
    """
    import yaml

    lit_file = Path(literature_path)
    if not lit_file.exists():
        return

    try:
        data = yaml.safe_load(lit_file.read_text()) or {}
    except (OSError, ValueError, yaml.YAMLError):
        # Unreadable / undecodable / malformed YAML: leave the bib untouched.
        return
    if not isinstance(data, dict):
        # A list or scalar at the root has no "references"/"needs_check".
        return

    entries = []
    cache_dirty = False

    # Collect all BibTeX from literature.yaml references.
    # `or []` guards against a key that is present but null in the YAML.
    for ref in data.get("references") or []:
        if not isinstance(ref, dict):
            continue
        key = ref.get("bibtex_key", "")
        if not key:
            continue
        bibtex = ref.get("bibtex")
        source = ref.get("source", "")

        # If bibtex not stored, try to re-fetch it by title.
        if not bibtex:
            title = ref.get("title", "")
            paper = _search_by_title(title) if title else None
            if paper:
                bibtex = fetch_bibtex(paper)
                if bibtex:
                    ref["bibtex"] = bibtex  # cache for next time
                    cache_dirty = True

        if bibtex:
            tag = f"% [ARK:source={source}]" if source else "% [ARK:source=verified]"
            entries.append(f"{tag}\n{bibtex}")

    # Persist re-fetched BibTeX so later runs can skip the lookup.
    # Best-effort: failing to update the cache must not block bib generation.
    if cache_dirty:
        try:
            # Serialise first so a dump error cannot truncate the file.
            dumped = yaml.safe_dump(data, sort_keys=False, default_flow_style=False)
            lit_file.write_text(dumped)
        except (OSError, yaml.YAMLError):
            pass

    # Add NEEDS-CHECK entries.
    for nc in data.get("needs_check") or []:
        if not isinstance(nc, dict):
            continue
        key = nc.get("bibtex_key", "")
        title = nc.get("title", "")
        author = nc.get("authors", "")
        year = nc.get("year", 0)
        if not key or not title:
            continue

        fields = [f"  title = {{{title}}}"]
        if author:
            fields.append(f"  author = {{{author} et al.}}")
        if year:
            fields.append(f"  year = {{{year}}}")
        fields.append(r"  note = {\textcolor{red}{[NEEDS-CHECK: citation not verified]}}")

        entries.append(
            "% [NEEDS-CHECK]\n"
            f"@misc{{{key},\n"
            + ",\n".join(fields)
            + ",\n}"
        )

    # Write bib (always overwritten, even when there are no entries).
    bib_file = Path(bib_path)
    bib_file.parent.mkdir(parents=True, exist_ok=True)
    header = (
        "% ARK auto-managed references\n"
        "% Generated from literature.yaml — do not edit manually\n\n"
    )
    bib_file.write_text(header + "\n\n".join(entries) + "\n")
1150+
1151+
10611152
# ═══════════════════════════════════════════════════════════
10621153
# Query extraction from planner issues
10631154
# ═══════════════════════════════════════════════════════════
@@ -1213,6 +1304,8 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
12131304
}
12141305
if paper.abstract:
12151306
entry["abstract"] = paper.abstract[:500]
1307+
if paper.bibtex:
1308+
entry["bibtex"] = paper.bibtex
12161309
# Mark importance based on Deep Research context
12171310
if ctx:
12181311
entry["context"] = ctx
@@ -1226,7 +1319,9 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
12261319

12271320
def _write_needs_check_to_literature(literature_path: str, titles: list[str],
12281321
cite_keys: list[str] = None,
1229-
contexts: list[str] = None) -> None:
1322+
contexts: list[str] = None,
1323+
authors: list[str] = None,
1324+
years: list = None) -> None:
12301325
"""Append [NEEDS-CHECK] entries to literature.yaml for papers not found in any API."""
12311326
import yaml
12321327
from datetime import datetime
@@ -1256,6 +1351,12 @@ def _write_needs_check_to_literature(literature_path: str, titles: list[str],
12561351
}
12571352
if cite_keys and i < len(cite_keys):
12581353
entry["bibtex_key"] = cite_keys[i]
1354+
author = (authors[i] if authors and i < len(authors) else "") or ""
1355+
year = (years[i] if years and i < len(years) else 0) or 0
1356+
if author:
1357+
entry["authors"] = author
1358+
if year:
1359+
entry["year"] = year
12591360
ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
12601361
if ctx:
12611362
entry["context"] = ctx
@@ -1365,7 +1466,7 @@ def bootstrap_citations(
13651466
contexts=found_contexts)
13661467
if needs_check_titles:
13671468
_write_needs_check_to_literature(literature_path, needs_check_titles, needs_check_keys,
1368-
needs_check_contexts)
1469+
needs_check_contexts, needs_check_authors, needs_check_years)
13691470

13701471
return BootstrapResult(
13711472
found=found_papers,
@@ -1446,18 +1547,36 @@ def _extract_keywords_from_title(title: str) -> str:
14461547
# Mark [NEEDS-CHECK] citations in tex files
14471548
# ═══════════════════════════════════════════════════════════
14481549

1449-
def mark_needs_check_in_tex(bib_path: str, tex_dir: str) -> int:
1550+
def mark_needs_check_in_tex(bib_path: str, tex_dir: str,
1551+
literature_path: str = None) -> int:
14501552
"""Scan .tex files and add [NEEDS-CHECK] marker after any \\cite of a needs-check entry.
14511553
1452-
This runs after writer finishes, so it doesn't depend on writer compliance.
1554+
Reads NEEDS-CHECK keys from literature.yaml (single source of truth).
1555+
Falls back to parsing bib comments if literature.yaml not available.
14531556
Returns number of citations marked.
14541557
"""
1455-
# Find needs-check keys from bib
1456-
entries = parse_bib(bib_path)
1558+
import yaml
1559+
14571560
needs_check_keys = set()
1458-
for entry in entries:
1459-
if "[NEEDS-CHECK]" in entry.get("preceding_comments", ""):
1460-
needs_check_keys.add(entry["key"])
1561+
1562+
# Primary: read from literature.yaml
1563+
if literature_path:
1564+
lit_file = Path(literature_path)
1565+
if lit_file.exists():
1566+
try:
1567+
lit_data = yaml.safe_load(lit_file.read_text()) or {}
1568+
for nc in lit_data.get("needs_check", []):
1569+
if isinstance(nc, dict) and nc.get("bibtex_key"):
1570+
needs_check_keys.add(nc["bibtex_key"])
1571+
except Exception:
1572+
pass
1573+
1574+
# Fallback: parse bib comments
1575+
if not needs_check_keys:
1576+
entries = parse_bib(bib_path)
1577+
for entry in entries:
1578+
if "[NEEDS-CHECK]" in entry.get("preceding_comments", ""):
1579+
needs_check_keys.add(entry["key"])
14611580

14621581
if not needs_check_keys:
14631582
return 0
@@ -1481,6 +1600,14 @@ def _add_marker(m, _marker=marker):
14811600
marked += count
14821601

14831602
if new_content != content:
1603+
# Ensure xcolor package is loaded for \textcolor to work
1604+
if r"\textcolor" in new_content and "xcolor" not in new_content:
1605+
new_content = re.sub(
1606+
r"(\\documentclass[^\n]*\n)",
1607+
r"\1\\usepackage{xcolor}\n",
1608+
new_content,
1609+
count=1,
1610+
)
14841611
tex_file.write_text(new_content)
14851612

14861613
return marked

0 commit comments

Comments
 (0)