Skip to content

Commit 1b23b0c

Browse files
committed
add importance level to references
1 parent b309617 commit 1b23b0c

File tree

3 files changed

+80
-25
lines changed

3 files changed

+80
-25
lines changed

ark/citation.py

Lines changed: 60 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1177,7 +1177,8 @@ def parse_agent_selection(agent_output: str, candidates: list[Paper]) -> list[Pa
11771177
# ═══════════════════════════════════════════════════════════
11781178

11791179
def update_literature_yaml(literature_path: str, papers: list[Paper],
1180-
cite_keys: list[str], agent_output: str) -> None:
1180+
cite_keys: list[str], agent_output: str,
1181+
contexts: list[str] = None) -> None:
11811182
"""Update literature.yaml with newly added citations for the writer agent."""
11821183
import yaml
11831184
from datetime import datetime
@@ -1197,9 +1198,10 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
11971198

11981199
existing_titles = {r.get("title", "").lower() for r in data["references"] if isinstance(r, dict)}
11991200

1200-
for paper, key in zip(papers, cite_keys):
1201+
for idx, (paper, key) in enumerate(zip(papers, cite_keys)):
12011202
if paper.title.lower() in existing_titles:
12021203
continue
1204+
ctx = (contexts[idx] if contexts and idx < len(contexts) else "") or ""
12031205
entry = {
12041206
"title": paper.title,
12051207
"authors": ", ".join(paper.authors[:3]) + (" et al." if len(paper.authors) > 3 else ""),
@@ -1211,13 +1213,20 @@ def update_literature_yaml(literature_path: str, papers: list[Paper],
12111213
}
12121214
if paper.abstract:
12131215
entry["abstract"] = paper.abstract[:500]
1216+
# Mark importance based on Deep Research context
1217+
if ctx:
1218+
entry["context"] = ctx
1219+
ctx_lower = ctx.lower()
1220+
if "must cite" in ctx_lower or "essential" in ctx_lower or "critical" in ctx_lower or "closest prior work" in ctx_lower:
1221+
entry["importance"] = "critical"
12141222
data["references"].append(entry)
12151223

12161224
lit_file.write_text(yaml.dump(data, allow_unicode=True, default_flow_style=False))
12171225

12181226

12191227
def _write_needs_check_to_literature(literature_path: str, titles: list[str],
1220-
cite_keys: list[str] = None) -> None:
1228+
cite_keys: list[str] = None,
1229+
contexts: list[str] = None) -> None:
12211230
"""Append [NEEDS-CHECK] entries to literature.yaml for papers not found in any API."""
12221231
import yaml
12231232
from datetime import datetime
@@ -1247,6 +1256,12 @@ def _write_needs_check_to_literature(literature_path: str, titles: list[str],
12471256
}
12481257
if cite_keys and i < len(cite_keys):
12491258
entry["bibtex_key"] = cite_keys[i]
1259+
ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
1260+
if ctx:
1261+
entry["context"] = ctx
1262+
ctx_lower = ctx.lower()
1263+
if "must cite" in ctx_lower or "essential" in ctx_lower or "critical" in ctx_lower or "closest prior work" in ctx_lower:
1264+
entry["importance"] = "critical"
12501265
data["needs_check"].append(entry)
12511266

12521267
lit_file.write_text(yaml.dump(data, allow_unicode=True, default_flow_style=False))
@@ -1271,6 +1286,7 @@ def bootstrap_citations(
12711286
search_queries: list[str] = None,
12721287
authors: list[str] = None,
12731288
years: list = None,
1289+
contexts: list[str] = None,
12741290
) -> BootstrapResult:
12751291
"""Given a list of paper titles, search APIs, fetch BibTeX, write to bib.
12761292
@@ -1290,9 +1306,11 @@ def bootstrap_citations(
12901306
"""
12911307
found_papers = []
12921308
found_keys = []
1309+
found_contexts = []
12931310
needs_check_titles = []
12941311
needs_check_authors = []
12951312
needs_check_years = []
1313+
needs_check_contexts = []
12961314

12971315
for i, title in enumerate(titles):
12981316
title = title.strip()
@@ -1315,13 +1333,20 @@ def bootstrap_citations(
13151333
paper = _search_by_title(keywords)
13161334

13171335
if paper:
1336+
# Use Deep Research context as fallback abstract if API didn't provide one
1337+
ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
1338+
if not paper.abstract and ctx:
1339+
paper.abstract = ctx
13181340
found_papers.append(paper)
1341+
found_contexts.append(ctx)
13191342
else:
13201343
needs_check_titles.append(title)
13211344
nc_author = (authors[i] if authors and i < len(authors) else "") or ""
13221345
nc_year = (years[i] if years and i < len(years) else 0) or 0
1346+
nc_ctx = (contexts[i] if contexts and i < len(contexts) else "") or ""
13231347
needs_check_authors.append(nc_author)
13241348
needs_check_years.append(nc_year)
1349+
needs_check_contexts.append(nc_ctx)
13251350

13261351
# Fetch BibTeX and write to references.bib
13271352
if found_papers:
@@ -1336,9 +1361,11 @@ def bootstrap_citations(
13361361

13371362
# Update literature.yaml (found papers + needs-check)
13381363
if found_papers and found_keys:
1339-
update_literature_yaml(literature_path, found_papers, found_keys, "")
1364+
update_literature_yaml(literature_path, found_papers, found_keys, "",
1365+
contexts=found_contexts)
13401366
if needs_check_titles:
1341-
_write_needs_check_to_literature(literature_path, needs_check_titles, needs_check_keys)
1367+
_write_needs_check_to_literature(literature_path, needs_check_titles, needs_check_keys,
1368+
needs_check_contexts)
13421369

13431370
return BootstrapResult(
13441371
found=found_papers,
@@ -1358,21 +1385,28 @@ def _search_by_title(title: str) -> Paper | None:
13581385
def _matches(paper: Paper) -> bool:
13591386
return title_similarity(title, paper.title) >= _SIMILARITY_THRESHOLD
13601387

1361-
# Try DBLP first
1388+
# Try DBLP first — prefer published over preprint if both match
13621389
dblp_results = _search_dblp(title, max_results=5)
13631390
time.sleep(_DBLP_DELAY)
1364-
for p in dblp_results:
1365-
if _matches(p):
1366-
return p
1391+
dblp_matches = [p for p in dblp_results if _matches(p)]
1392+
if dblp_matches:
1393+
# Prefer published version
1394+
published = [p for p in dblp_matches if _is_published(p)]
1395+
best = published[0] if published else dblp_matches[0]
1396+
if not best.abstract:
1397+
_supplement_abstract(best)
1398+
return best
13671399

13681400
# Try CrossRef
13691401
cr_results = _search_crossref(title, max_results=3)
13701402
time.sleep(_CROSSREF_DELAY)
13711403
for p in cr_results:
13721404
if _matches(p):
1405+
if not p.abstract:
1406+
_supplement_abstract(p)
13731407
return p
13741408

1375-
# Try Semantic Scholar (has good title matching)
1409+
# Try Semantic Scholar (has good title matching + abstract)
13761410
s2_results = _search_semantic_scholar(title, max_results=3)
13771411
for p in s2_results:
13781412
if _matches(p):
@@ -1381,6 +1415,16 @@ def _matches(paper: Paper) -> bool:
13811415
return None
13821416

13831417

1418+
def _supplement_abstract(paper: Paper) -> None:
    """Backfill abstract / citation count on *paper* from Semantic Scholar.

    Queries Semantic Scholar with ``paper.title`` (single result). If the
    returned record's title is at least ``_SIMILARITY_THRESHOLD`` similar to
    the paper's own title, the record's abstract (when it has one) is copied
    onto ``paper``, and its citation count is copied only when ``paper`` does
    not already carry a truthy one. ``paper`` is modified in place; nothing
    is returned.
    """
    hits = _search_semantic_scholar(paper.title, max_results=1)
    if not hits:
        return

    top_hit = hits[0]
    if title_similarity(paper.title, top_hit.title) >= _SIMILARITY_THRESHOLD:
        if top_hit.abstract:
            paper.abstract = top_hit.abstract
        # Keep an existing citation count; only fill it in when missing.
        if top_hit.citation_count and not paper.citation_count:
            paper.citation_count = top_hit.citation_count
1426+
1427+
13841428
def _extract_keywords_from_title(title: str) -> str:
13851429
"""Extract meaningful keywords from a paper title for broader search.
13861430
@@ -1428,18 +1472,14 @@ def mark_needs_check_in_tex(bib_path: str, tex_dir: str) -> int:
14281472

14291473
for key in needs_check_keys:
14301474
# Match \cite{key}, \citep{key}, \citet{key} not already followed by marker
1431-
for pattern in [
1432-
rf"(\\cite\{{{key}\}})(?!\s*\\textcolor)",
1433-
rf"(\\citep\{{{key}\}})(?!\s*\\textcolor)",
1434-
rf"(\\citet\{{{key}\}})(?!\s*\\textcolor)",
1435-
]:
1436-
replacement = rf"\1{marker}"
1437-
new_content, count = re.subn(pattern, replacement, new_content)
1475+
for cite_cmd in ["cite", "citep", "citet"]:
1476+
pattern = rf"(\\{cite_cmd}\{{{key}\}})(?!\s*\\textcolor)"
1477+
# Use a function replacement to avoid \t interpretation issues
1478+
def _add_marker(m, _marker=marker):
1479+
return m.group(1) + _marker
1480+
new_content, count = re.subn(pattern, _add_marker, new_content)
14381481
marked += count
14391482

1440-
# Also handle \cite{a,key,b} — key inside multi-cite
1441-
# This is harder; for now just handle the simple cases above
1442-
14431483
if new_content != content:
14441484
tex_file.write_text(new_content)
14451485

ark/execution.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -973,27 +973,37 @@ def _get_literature_context_for_task(self, task: dict) -> str:
973973
title = entry.get("title", "")
974974
bibtex_key = entry.get("bibtex_key", entry.get("key", entry.get("cite_key", "")))
975975
abstract = entry.get("abstract", entry.get("summary", ""))
976+
context = entry.get("context", "")
976977
venue = entry.get("venue", "")
977978
year = entry.get("year", "")
979+
importance = entry.get("importance", "")
978980
if title:
979981
cite_str = f" (cite: \\cite{{{bibtex_key}}})" if bibtex_key else ""
980982
venue_str = f" [{venue} {year}]" if venue else ""
981-
lines.append(f"- **{title}**{cite_str}{venue_str}")
983+
imp_str = " **[MUST CITE]**" if importance == "critical" else ""
984+
lines.append(f"- **{title}**{cite_str}{venue_str}{imp_str}")
982985
if abstract:
983986
lines.append(f" Abstract: {abstract[:400]}")
987+
elif context:
988+
lines.append(f" Context: {context[:400]}")
984989
elif isinstance(entry, str):
985990
lines.append(f"- {entry}")
986991

987992
# Add [NEEDS-CHECK] citations
988993
needs_check = lit_data.get("needs_check", [])
989994
if needs_check:
990-
lines.append("\n## [NEEDS-CHECK] Citations (unverified — mark in text)")
995+
lines.append("\n## [NEEDS-CHECK] Citations (unverified)")
991996
for nc in needs_check:
992997
if isinstance(nc, dict):
993998
nc_title = nc.get("title", "")
994999
nc_key = nc.get("bibtex_key", "")
1000+
nc_importance = nc.get("importance", "")
1001+
nc_context = nc.get("context", "")
9951002
if nc_title and nc_key:
996-
lines.append(f"- **{nc_title}** (cite: \\cite{{{nc_key}}}) — **[NEEDS-CHECK]**")
1003+
imp_str = " **[MUST CITE]**" if nc_importance == "critical" else ""
1004+
lines.append(f"- **{nc_title}** (cite: \\cite{{{nc_key}}}) [NEEDS-CHECK]{imp_str}")
1005+
if nc_context:
1006+
lines.append(f" Context: {nc_context[:400]}")
9971007

9981008
if len(lines) > 1:
9991009
lines.append(f"\nPlease cite these references in Related Work and other appropriate sections.")

ark/pipeline.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -701,15 +701,17 @@ def _bootstrap_citations_from_deep_research(self):
701701
- "authors": first author surname (e.g. "Vaswani")
702702
- "year": publication year as integer (e.g. 2017)
703703
- "query": a search query to find it (title + author + year)
704+
- "context": a 1-2 sentence summary of what the report says about this paper (what it does, why it matters)
704705
705706
Return a JSON array. Example:
706707
[
707-
{{"title": "Attention Is All You Need", "authors": "Vaswani", "year": 2017, "query": "Attention Is All You Need Vaswani 2017"}},
708-
{{"title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": "Devlin", "year": 2019, "query": "BERT Pre-training Deep Bidirectional Transformers Devlin 2019"}}
708+
{{"title": "Attention Is All You Need", "authors": "Vaswani", "year": 2017, "query": "Attention Is All You Need Vaswani 2017", "context": "Introduces the Transformer architecture based solely on attention mechanisms, replacing recurrence and convolutions."}},
709+
{{"title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": "Devlin", "year": 2019, "query": "BERT Pre-training Deep Bidirectional Transformers Devlin 2019", "context": "Proposes bidirectional pre-training for language representations, achieving SOTA on multiple NLP benchmarks."}}
709710
]
710711
711712
Rules:
712713
- "title" must be the paper's actual full title as it would appear on the paper itself
714+
- "context" should summarize what the report says about this paper, NOT what you think the paper is about
713715
- "query" should include the title plus first author surname and year to help search
714716
- If only an abbreviation is given (e.g. "TimeGAN by Yoon et al., 2019"), infer the full title for "title" and construct a rich "query"
715717
- Do NOT include book titles, dataset names, or tool names
@@ -732,12 +734,14 @@ def _bootstrap_citations_from_deep_research(self):
732734
queries = [p["query"] for p in papers_info]
733735
authors_list = [p.get("authors", "") for p in papers_info]
734736
years_list = [p.get("year", 0) for p in papers_info]
737+
contexts_list = [p.get("context", "") for p in papers_info]
735738
self.log_step(f"Extracted {len(titles)} paper titles, searching APIs...", "progress")
736739

737740
# Step 2: Search APIs and fetch BibTeX (use queries for search, titles for display)
738741
result = bootstrap_citations(
739742
titles, bib_path, literature_path,
740743
search_queries=queries, authors=authors_list, years=years_list,
744+
contexts=contexts_list,
741745
)
742746

743747
# Step 3: Log results
@@ -835,6 +839,7 @@ def _parse_paper_info_list(self, agent_output: str) -> list:
835839
"query": p.get("query", p.get("title", "")),
836840
"authors": p.get("authors", ""),
837841
"year": p.get("year", 0),
842+
"context": p.get("context", ""),
838843
}
839844
for p in parsed
840845
if isinstance(p, dict) and p.get("title")

0 commit comments

Comments
 (0)