55from tqdm import tqdm
66from typing import Type
77from loguru import logger
8+
9+
10+ def _describe_raw_paper (raw_paper : RawPaperItem ) -> str :
11+ title = getattr (raw_paper , "title" , None )
12+ if title :
13+ return str (title )
14+ if isinstance (raw_paper , dict ):
15+ for key in ("title" , "entry_id" , "id" , "doi" ):
16+ value = raw_paper .get (key )
17+ if value :
18+ return str (value )
19+ return repr (raw_paper )
20+
21+
def _convert_to_paper_safe(retriever: "BaseRetriever", raw_paper: RawPaperItem) -> Paper | None:
    """Convert one raw paper, turning any conversion failure into ``None``.

    Calls ``retriever.convert_to_paper(raw_paper)``. If that raises an
    ``Exception``, a warning naming the paper, the exception type and its
    message is logged, and ``None`` is returned instead of propagating.

    Args:
        retriever: The retriever whose ``convert_to_paper`` does the work.
        raw_paper: The raw item to convert.

    Returns:
        The result of ``convert_to_paper``, or ``None`` when it raised.
    """
    try:
        converted = retriever.convert_to_paper(raw_paper)
    except Exception as exc:
        label = _describe_raw_paper(raw_paper)
        error_kind = type(exc).__name__
        logger.warning(f"Skipping paper {label} : {error_kind} : {exc} ")
        return None
    return converted
30+
31+
832class BaseRetriever (ABC ):
933 name : str
1034 def __init__ (self , config :DictConfig ):
@@ -24,10 +48,17 @@ def retrieve_papers(self) -> list[Paper]:
2448 papers = []
2549 logger .info ("Processing papers..." )
2650 with ProcessPoolExecutor (max_workers = self .config .executor .max_workers ) as exec_pool :
27- futures = {exec_pool .submit (self . convert_to_paper , rp ): i for i , rp in enumerate (raw_papers )}
51+ futures = {exec_pool .submit (_convert_to_paper_safe , self , rp ): i for i , rp in enumerate (raw_papers )}
2852 papers = [None ] * len (raw_papers )
2953 for future in tqdm (as_completed (futures ), total = len (raw_papers ), desc = "Converting papers" ):
30- papers [futures [future ]] = future .result ()
54+ try :
55+ papers [futures [future ]] = future .result ()
56+ except Exception as exc :
57+ raw_paper = raw_papers [futures [future ]]
58+ logger .warning (
59+ f"Skipping paper { _describe_raw_paper (raw_paper )} after worker failure: "
60+ f"{ type (exc ).__name__ } : { exc } "
61+ )
3162 return [p for p in papers if p is not None ]
3263
3364registered_retrievers = {}
@@ -42,4 +73,4 @@ def decorator(cls):
def get_retriever_cls(name: str) -> Type[BaseRetriever]:
    """Look up a retriever class in ``registered_retrievers`` by name.

    Args:
        name: Key under which the retriever class is stored in
            ``registered_retrievers``.

    Returns:
        The class stored under ``name``.

    Raises:
        ValueError: If ``name`` is not a key of ``registered_retrievers``.
    """
    if name in registered_retrievers:
        return registered_retrievers[name]
    raise ValueError(f"Retriever {name} not found")
0 commit comments