|
1 | 1 | import tarfile |
2 | 2 | import re |
3 | 3 | import glob |
| 4 | +import math |
4 | 5 | import smtplib |
| 6 | +from collections import Counter |
5 | 7 | from email.header import Header |
6 | 8 | from email.mime.text import MIMEText |
7 | 9 | from email.utils import parseaddr, formataddr |
|
15 | 17 |
|
16 | 18 | import pymupdf4llm # noqa: E402 |
17 | 19 |
|
18 | | -def extract_tex_code_from_tar(file_path:str, paper_id:str) -> dict[str,str]: |
# Maximal runs of ASCII letters/digits; every other character is a separator.
_TOKEN_RE = re.compile(r'[a-zA-Z0-9]+')


def _tokenize(text: str) -> list[str]:
    """Split *text* into lowercase ASCII alphanumeric tokens.

    Any character outside ``[a-zA-Z0-9]`` (punctuation, whitespace,
    underscores, non-ASCII letters) ends the current token and is dropped.
    Token order and duplicates are preserved.
    """
    return list(map(str.lower, _TOKEN_RE.findall(text)))
| 24 | + |
| 25 | + |
def _bm25_pick(query: str, candidates: dict[str, str], k1: float = 1.5, b: float = 0.75) -> str:
    """Return the candidate key whose content best matches *query* by BM25.

    Args:
        query: Free text to match against the candidates.
        candidates: Mapping of candidate name to its text content.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 document-length normalisation parameter.

    Returns:
        The key of the highest-scoring candidate. Ties (including the case
        where no query token occurs in any candidate) go to the candidate
        that comes first in *candidates*. If *query* yields no tokens, the
        first candidate is returned without scoring.

    Raises:
        ValueError: If *candidates* is empty.
    """
    if not candidates:
        # Fail loudly instead of leaking StopIteration or returning None.
        raise ValueError("candidates must not be empty")

    query_tokens = _tokenize(query)
    if not query_tokens:
        return next(iter(candidates))

    doc_tokens = {name: _tokenize(content) for name, content in candidates.items()}
    n_docs = len(doc_tokens)
    avgdl = sum(len(tokens) for tokens in doc_tokens.values()) / n_docs
    # Clamp so that a corpus of empty documents cannot divide by zero.
    norm_len = max(avgdl, 1)

    # Document frequency: in how many candidates each token occurs.
    df: Counter[str] = Counter()
    for tokens in doc_tokens.values():
        df.update(set(tokens))

    # IDF depends only on the corpus, so compute it once per distinct term
    # rather than once per (document, term) pair.
    idf: dict[str, float] = {}
    for term in set(query_tokens):
        n_q = df.get(term, 0)
        idf[term] = math.log((n_docs - n_q + 0.5) / (n_q + 0.5) + 1)

    def _score(tokens: list[str]) -> float:
        tf = Counter(tokens)
        length_norm = k1 * (1 - b + b * len(tokens) / norm_len)
        total = 0.0
        # Repeated query tokens are deliberately counted once per occurrence.
        for term in query_tokens:
            f_q = tf.get(term, 0)
            if not f_q:
                # Contributes exactly 0; skipping also avoids 0/0 when the
                # length normalisation is zero (e.g. b == 1 and empty doc).
                continue
            total += idf[term] * (f_q * (k1 + 1)) / (f_q + length_norm)
        return total

    # max() returns the first maximal element, so ties keep insertion order.
    return max(doc_tokens, key=lambda name: _score(doc_tokens[name]))
| 54 | + |
| 55 | + |
| 56 | +def extract_tex_code_from_tar(file_path:str, paper_id:str, paper_title:str | None = None) -> dict[str,str]: |
19 | 57 | try: |
20 | 58 | tar = tarfile.open(file_path) |
21 | 59 | except tarfile.ReadError: |
@@ -48,25 +86,34 @@ def extract_tex_code_from_tar(file_path:str, paper_id:str) -> dict[str,str]: |
48 | 86 |
|
49 | 87 | if main_tex is None: |
50 | 88 | logger.debug(f"Trying to choose tex file containing the document block as main tex file of {paper_id}") |
51 | | - #read all tex files |
| 89 | + |
52 | 90 | file_contents = {} |
| 91 | + doc_block_candidates: list[str] = [] |
53 | 92 | for t in tex_files: |
54 | 93 | f = tar.extractfile(t) |
55 | 94 | content = f.read().decode('utf-8',errors='ignore') |
56 | | - #remove comments |
57 | 95 | content = re.sub(r'%.*\n', '\n', content) |
58 | 96 | content = re.sub(r'\\begin{comment}.*?\\end{comment}', '', content, flags=re.DOTALL) |
59 | 97 | content = re.sub(r'\\iffalse.*?\\fi', '', content, flags=re.DOTALL) |
60 | | - #remove redundant \n |
61 | 98 | content = re.sub(r'\n+', '\n', content) |
62 | 99 | content = re.sub(r'\\\\', '', content) |
63 | | - #remove consecutive spaces |
64 | 100 | content = re.sub(r'[ \t\r\f]{3,}', ' ', content) |
65 | | - if main_tex is None and re.search(r'\\begin\{document\}', content) and not any(w in t for w in ['example', 'sample']): |
66 | | - main_tex = t |
67 | | - logger.debug(f"Choose {t} as main tex file of {paper_id}") |
| 101 | + if main_tex is None and re.search(r'\\begin\{document\}', content) and not any(w in t for w in ['example', 'sample', 'template']): |
| 102 | + doc_block_candidates.append(t) |
68 | 103 | file_contents[t] = content |
69 | | - |
| 104 | + |
| 105 | + if main_tex is None: |
| 106 | + if len(doc_block_candidates) == 1: |
| 107 | + main_tex = doc_block_candidates[0] |
| 108 | + logger.debug(f"Choose {main_tex} as main tex file of {paper_id}") |
| 109 | + elif len(doc_block_candidates) > 1: |
| 110 | + if paper_title: |
| 111 | + main_tex = _bm25_pick(paper_title, {c: file_contents[c] for c in doc_block_candidates}) |
| 112 | + logger.debug(f"Multiple document blocks found in {paper_id}; BM25 selected {main_tex} from {doc_block_candidates}") |
| 113 | + else: |
| 114 | + main_tex = doc_block_candidates[0] |
| 115 | + logger.debug(f"Multiple document blocks found in {paper_id}; no title provided, using first candidate {main_tex}") |
| 116 | + |
70 | 117 | if main_tex is not None: |
71 | 118 | main_source:str = file_contents[main_tex] |
72 | 119 | #find and replace all included sub-files |
|
0 commit comments