-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy patharxiv_utils.py
80 lines (61 loc) · 2.19 KB
/
arxiv_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import arxiv # type: ignore
from _types import Paper
# Module-level arxiv API client; the search helpers in this file fetch their
# results through it.
client = arxiv.Client()
def arxiv_result_to_paper(result: arxiv.Result) -> Paper:
    """Build a ``Paper`` from the fields of a single arxiv search result.

    The result's ``entry_id`` becomes the paper's ``url``, its ``summary``
    becomes the ``abstract``, and the authors are reduced to their names.
    """
    title = result.title
    url = result.entry_id
    abstract = result.summary
    author_names = [author.name for author in result.authors]
    published = result.published
    return Paper(
        title=title,
        url=url,
        abstract=abstract,
        authors=author_names,
        published=published,
    )
def search_arxiv(
    query: str,
    max_results=10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
) -> list[arxiv.Result]:
    """Run *query* against arxiv and return every result as a list.

    Args:
        query: Query string handed to ``arxiv.Search``.
        max_results: Forwarded to ``arxiv.Search`` as ``max_results``.
        sort_by: Sort criterion forwarded to ``arxiv.Search``; defaults to
            ``SubmittedDate``.

    Returns:
        The results produced by the module-level ``client``, fully
        materialized into a list.
    """
    search = arxiv.Search(query, max_results=max_results, sort_by=sort_by)
    hits = client.results(search)
    return list(hits)
def search_arxiv_as_paper(
    query: str,
    max_results=10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
) -> list[Paper]:
    """Search arxiv like ``search_arxiv`` but return ``Paper`` objects.

    Takes the same arguments as ``search_arxiv`` and passes each result
    through ``arxiv_result_to_paper``.
    """
    results = search_arxiv(query, max_results, sort_by)
    return list(map(arxiv_result_to_paper, results))
def search_arxiv_by_id(id: str) -> arxiv.Result | None:
    """Return the first arxiv result for *id*, or ``None`` if there is none.

    The lookup is an ``arxiv.Search`` whose ``id_list`` holds only *id*.
    """
    lookup = arxiv.Search(id_list=[id])
    # Only the first item is consumed; the rest of the iterator is never read.
    return next(iter(client.results(lookup)), None)
def fill_papers_with_arxiv(papers: list[Paper]) -> list[Paper]:
    """Fill in arxiv-derived fields on each paper that does not have them yet.

    Papers whose ``has_arxiv_props()`` is truthy are left untouched. For the
    rest, an arxiv entry is looked up (by arxiv id first, then by title) and
    its title, url, abstract, authors and publication date are copied onto
    the paper. Papers with no matching entry are reported on stdout and left
    as they were. A title that differs from the arxiv title is also reported
    before being overwritten.

    Args:
        papers: Papers to update; they are modified in place.

    Returns:
        The same ``papers`` list that was passed in.
    """
    for paper in papers:
        if paper.has_arxiv_props():
            continue

        result = _lookup_arxiv_result(paper)
        if not result:
            print(f'[!] Could not find arxiv result for "{paper.title}" [{paper.url}]')
            continue

        if paper.title and paper.title != result.title:
            print(f'[!] Title mismatch: "{paper.title}" vs "{result.title}"')

        paper.title = result.title
        paper.url = result.entry_id
        paper.abstract = result.summary
        paper.authors = [a.name for a in result.authors]
        paper.published = result.published

    return papers


def _lookup_arxiv_result(paper: Paper) -> arxiv.Result | None:
    """Find the arxiv entry for *paper*: by arxiv id if set, else by title."""
    if paper.arxiv_id:
        result = search_arxiv_by_id(paper.arxiv_id)
        if result:
            return result

    if not paper.title:
        return None

    # Dashes seem to break the API's title search. It is finicky in general;
    # lookups via links/ids work much better, hence the id attempt first.
    query = f"ti:{paper.title.replace('-', ' ')}"
    searched = search_arxiv(query, max_results=1, sort_by=arxiv.SortCriterion.Relevance)
    return searched[0] if searched else None