-
-
Notifications
You must be signed in to change notification settings - Fork 4.9k
Expand file tree
/
Copy pathbiorxiv_retriever.py
More file actions
63 lines (58 loc) · 2.37 KB
/
biorxiv_retriever.py
File metadata and controls
63 lines (58 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
from .base import BaseRetriever, register_retriever
from ..protocol import Paper
from loguru import logger
from typing import Any
from time import sleep
@register_retriever("biorxiv")
class BiorxivRetriever(BaseRetriever):
    """Retriever that fetches recent preprints from the bioRxiv details API.

    Registered under the key ``"biorxiv"``. Of the records returned by the
    API, only those carrying the most recent ``date`` value and whose
    category matches one of the configured categories are kept.
    """

    # Server segment used both in the API path and in the PDF host name.
    server = "biorxiv"

    # Retry policy for the API request.
    _MAX_ATTEMPTS = 10
    _RETRY_DELAY_SECONDS = 10
    # Without a timeout a stalled connection would block forever and the
    # retry loop would never get a chance to run.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, config):
        """Initialise the retriever and validate its configuration.

        Args:
            config: Configuration object forwarded to the base class.

        Raises:
            ValueError: If no category is configured for this retriever.
        """
        super().__init__(config)
        if self.retriever_config.category is None:
            raise ValueError(f"category must be specified for {self.name}")

    def _retrieve_raw_papers(self) -> list[dict[str, Any]]:
        """Query the details API and return the matching raw records.

        The request is attempted up to ``_MAX_ATTEMPTS`` times, sleeping
        ``_RETRY_DELAY_SECONDS`` between failed attempts.

        Returns:
            The raw record dicts from the latest date in the response whose
            category is among the configured ones (case-insensitive). An
            empty list if the API returned no records. In debug mode at most
            the first 10 records are returned.

        Raises:
            requests.RequestException: If every attempt fails.
        """
        # NOTE(review): only a single request is made; if the API paginates
        # its results, later pages are not fetched - confirm against the
        # API documentation.
        api_url = f"https://api.biorxiv.org/details/{self.server}/2d"
        delay_time = self._RETRY_DELAY_SECONDS
        for attempt in range(1, self._MAX_ATTEMPTS + 1):
            try:
                response = requests.get(
                    api_url, timeout=self._REQUEST_TIMEOUT_SECONDS
                )
                response.raise_for_status()
                break
            except requests.RequestException as e:
                if attempt == self._MAX_ATTEMPTS:
                    # Bare raise keeps the original traceback intact.
                    raise
                logger.warning(f"Failed to retrieve papers: {str(e)}. Retry in {delay_time} seconds.")
                sleep(delay_time)

        result = response.json()
        collection = result.get('collection') or []
        if not collection:
            logger.warning(f"No paper found. API Message: {result.get('messages')}")
            return []

        # Keep only the newest day present in the response; string
        # comparison matches the original sorted()-based selection.
        latest_date = max(c['date'] for c in collection)

        # Compare categories case-insensitively on both sides; a set gives
        # O(1) membership tests.
        categories = {c.lower() for c in self.retriever_config.category}
        collection = [
            c for c in collection
            if c['date'] == latest_date
            and (c.get('category') or '').lower() in categories
        ]

        if self.config.executor.debug:
            # Cap the workload when debugging.
            collection = collection[:10]
        return collection

    def convert_to_paper(self, raw_paper: dict[str, Any]) -> Paper | None:
        """Convert one raw API record into a ``Paper``.

        Args:
            raw_paper: A record dict; the keys ``title``, ``authors``,
                ``abstract``, ``doi`` and ``version`` are read, and ``date``
                is used when present.

        Returns:
            A ``Paper`` whose ``url`` and ``pdf_url`` both point at the
            full-text PDF location and whose ``full_text`` is ``None``.
        """
        # Authors arrive as a single ';'-separated string; drop blank
        # entries such as the one produced by a trailing separator.
        authors = [
            name.strip()
            for name in raw_paper['authors'].split(';')
            if name.strip()
        ]
        pdf_url = f"https://www.{self.server}.org/content/{raw_paper['doi']}v{raw_paper['version']}.full.pdf"
        return Paper(
            source=self.name,
            title=raw_paper['title'],
            authors=authors,
            abstract=raw_paper['abstract'],
            url=pdf_url,
            pdf_url=pdf_url,
            full_text=None,  # biorxiv forbids scraping its pdf
            published_date=raw_paper.get("date"),
        )