-
-
Notifications
You must be signed in to change notification settings - Fork 4.9k
Expand file tree
/
Copy pathprotocol.py
More file actions
112 lines (100 loc) · 4.97 KB
/
protocol.py
File metadata and controls
112 lines (100 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from dataclasses import dataclass
from typing import Optional, TypeVar
from datetime import datetime
import re
import tiktoken
from openai import OpenAI
from loguru import logger
import json
# Generic type variable named RawPaperItem; nothing in the visible part of this
# file references it.
# NOTE(review): presumably stands for a source-specific raw paper record used
# by code that imports this module — confirm against the importers.
RawPaperItem = TypeVar('RawPaperItem')
@dataclass
class Paper:
    """A paper record plus helpers that enrich it through an LLM.

    ``generate_tldr`` fills :attr:`tldr` and ``generate_affiliations`` fills
    :attr:`affiliations`; both talk to an OpenAI-compatible chat client and
    never raise (failures are logged and replaced by a fallback value).
    """

    source: str
    title: str
    authors: list[str]
    abstract: str
    url: str
    pdf_url: Optional[str] = None
    # Text sent to the LLM as "Preview of main content" / "beginning of a paper".
    full_text: Optional[str] = None
    # Written by generate_tldr().
    tldr: Optional[str] = None
    # Written by generate_affiliations().
    affiliations: Optional[list[str]] = None
    score: Optional[float] = None
    published_date: Optional[str] = None

    @staticmethod
    def _truncate_to_tokens(text: str, max_tokens: int) -> str:
        """Return *text* cut to at most *max_tokens* tokens.

        The gpt-4o tokenizer is used purely as an estimate of the prompt
        length; the model actually called may tokenize differently.
        """
        enc = tiktoken.encoding_for_model("gpt-4o")
        return enc.decode(enc.encode(text)[:max_tokens])

    @staticmethod
    def _parse_affiliations(raw: Optional[str]) -> list[str]:
        """Extract the affiliation list from the raw LLM reply.

        Args:
            raw: Message content returned by the model (may be ``None``).

        Returns:
            The affiliations as strings, de-duplicated, in the order the
            model listed them (i.e. author order).

        Raises:
            ValueError: If the reply contains no ``[...]`` list, or the list
                cannot be parsed (``ast.literal_eval`` may also raise
                ``SyntaxError`` for malformed literals).
        """
        import ast  # local import: only needed for the Python-literal fallback

        match = re.search(r'\[.*?\]', raw or "", flags=re.DOTALL)
        if match is None:
            raise ValueError(f"No affiliation list found in LLM response: {raw!r}")
        literal = match.group(0)
        try:
            items = json.loads(literal)
        except json.JSONDecodeError:
            # The prompt asks for a *python* list, so the model may answer with
            # single-quoted strings that are not valid JSON. literal_eval only
            # evaluates literals, so it is safe on model output.
            items = ast.literal_eval(literal)
        # dict.fromkeys drops duplicates while keeping first-seen order; a set
        # would scramble the author order requested in the prompt.
        return list(dict.fromkeys(str(item) for item in items))

    def _generate_tldr_with_llm(self, openai_client: "OpenAI", llm_params: dict) -> str:
        """Ask the LLM for a one-sentence TLDR of this paper.

        Args:
            openai_client: Client exposing ``chat.completions.create``.
            llm_params: Reads ``'language'`` (default ``'English'``) and
                ``'generation_kwargs'`` (default ``{}``), the latter being
                forwarded verbatim to ``chat.completions.create``.

        Returns:
            The model's summary, or a fixed failure message when neither an
            abstract nor full text is available.

        Raises:
            ValueError: If the model returns no content.
            Exception: Whatever the client or tokenizer raises.
        """
        if not self.full_text and not self.abstract:
            logger.warning(f"Neither full text nor abstract is provided for {self.url}")
            return "Failed to generate TLDR. Neither full text nor abstract is provided"

        lang = llm_params.get('language', 'English')
        prompt = f"Given the following information of a paper, generate a one-sentence TLDR summary in {lang}:\n\n"
        if self.title:
            prompt += f"Title:\n {self.title}\n\n"
        if self.abstract:
            prompt += f"Abstract: {self.abstract}\n\n"
        if self.full_text:
            prompt += f"Preview of main content:\n {self.full_text}\n\n"

        prompt = self._truncate_to_tokens(prompt, 4000)

        response = openai_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": f"You are an assistant who perfectly summarizes scientific paper, and gives the core idea of the paper to the user. Your answer should be in {lang}.",
                },
                {"role": "user", "content": prompt},
            ],
            **llm_params.get('generation_kwargs', {})
        )
        tldr = response.choices[0].message.content
        if not tldr:
            # Message content can be None/empty; raise so that generate_tldr()
            # falls back to the abstract instead of storing an empty summary.
            raise ValueError("LLM returned an empty TLDR")
        return tldr

    def generate_tldr(self, openai_client: "OpenAI", llm_params: dict) -> str:
        """Generate the TLDR, store it on ``self.tldr`` and return it.

        Any failure of the LLM path is logged as a warning and the abstract
        is used as the TLDR instead.
        """
        try:
            tldr = self._generate_tldr_with_llm(openai_client, llm_params)
        except Exception as e:
            logger.warning(f"Failed to generate tldr of {self.url}: {e}")
            tldr = self.abstract
        self.tldr = tldr
        return tldr

    def _generate_affiliations_with_llm(self, openai_client: "OpenAI", llm_params: dict) -> Optional[list[str]]:
        """Ask the LLM to extract author affiliations from ``full_text``.

        Args:
            openai_client: Client exposing ``chat.completions.create``.
            llm_params: Reads ``'generation_kwargs'`` (default ``{}``), which
                is forwarded verbatim to ``chat.completions.create``.

        Returns:
            De-duplicated affiliations in author order, or ``None`` when
            ``full_text`` is ``None`` (no request is made in that case).

        Raises:
            ValueError: If the reply holds no parsable list.
            Exception: Whatever the client or tokenizer raises.
        """
        if self.full_text is None:
            return None

        prompt = f"Given the beginning of a paper, extract the affiliations of the authors in a python list format, which is sorted by the author order. If there is no affiliation found, return an empty list '[]':\n\n{self.full_text}"
        prompt = self._truncate_to_tokens(prompt, 2000)

        response = openai_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant who perfectly extracts affiliations of authors from a paper. You should return a python list of affiliations sorted by the author order, like [\"TsingHua University\",\"Peking University\"]. If an affiliation is consisted of multi-level affiliations, like 'Department of Computer Science, TsingHua University', you should return the top-level affiliation 'TsingHua University' only. Do not contain duplicated affiliations. If there is no affiliation found, you should return an empty list [ ]. You should only return the final list of affiliations, and do not return any intermediate results.",
                },
                {"role": "user", "content": prompt},
            ],
            **llm_params.get('generation_kwargs', {})
        )
        return self._parse_affiliations(response.choices[0].message.content)

    def generate_affiliations(self, openai_client: "OpenAI", llm_params: dict) -> Optional[list[str]]:
        """Extract affiliations, store them on ``self.affiliations`` and return them.

        Any failure of the LLM path is logged as a warning and ``None`` is
        stored and returned.
        """
        try:
            affiliations = self._generate_affiliations_with_llm(openai_client, llm_params)
        except Exception as e:
            logger.warning(f"Failed to generate affiliations of {self.url}: {e}")
            affiliations = None
        self.affiliations = affiliations
        return affiliations
@dataclass
class CorpusPaper:
    """Record with a title, an abstract, an added-date and a list of paths.

    NOTE(review): presumably describes a paper in the user's existing
    reference corpus — confirm against the code that builds these records.
    """

    # Field order defines the positional order of the generated __init__.
    title: str
    abstract: str
    added_date: datetime
    paths: list[str]