# wiki_api.py
from itertools import islice, takewhile, repeat
from typing import Callable, Dict, Iterable, Iterator, List, Optional
import random
import urllib.parse

import mwapi

USER_AGENT = "WatchUGOBot/1.0 ([email protected])"

wikipedia_session = mwapi.Session("https://en.wikipedia.org", user_agent=USER_AGENT)
commons_session = mwapi.Session("https://commons.wikimedia.org/", user_agent=USER_AGENT)

def split_every(n: int, iterable: Iterable) -> Iterator[list]:
    """
    Slice an iterable into chunks of at most n elements.
    """
    iterator = iter(iterable)
    return takewhile(bool, (list(islice(iterator, n)) for _ in repeat(None)))
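
# Usage sketch for split_every (deterministic, no network access needed):
#
#   list(split_every(2, [1, 2, 3, 4, 5]))  # -> [[1, 2], [3, 4], [5]]
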
def get_article_wikitext(article_title: str) -> str:
    response = wikipedia_session.get(
        action="parse",
        page=article_title,
        prop="wikitext",
        formatversion="2",
        redirects=True,
    )
    return response["parse"]["wikitext"]

def get_article_image_url(article_title: str) -> Optional[str]:
    response = wikipedia_session.get(
        action="query",
        titles=article_title,
        prop="pageimages",
        piprop="original",
        pilicense="any",
        redirects=True,
    )
    pages = response["query"]["pages"]
    # Wikipedia returns a dictionary of pages.
    # Since we only ever query one, we can just grab the first one.
    page = next(iter(pages.values()))
    if "original" in page:
        url = page["original"]["source"]
        # Skip vector (SVG) images; we only want raster images.
        if not url.lower().endswith(".svg"):
            return url
    return None
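
# Usage sketch (hypothetical title; the result depends on live Wikipedia data):
#
#   url = get_article_image_url("Python (programming language)")
#   # -> an image URL, or None if the article has no suitable page image
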
def commons_search_image(search: str) -> Optional[str]:
    """
    Search Wikimedia Commons for `search` and return the *title*
    of the first image result. Returns None if no image was found.
    """
    # Wikimedia Commons lets us specify a file type to filter by raster images only
    search_query = search + " filetype:jpg|png"
    response = commons_session.get(
        action="query", list="search", srsearch=search_query, srnamespace="6"
    )
    results = response["query"]["search"]
    if not results:
        return None
    return results[0]["title"]
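
# Usage sketch (results depend on the live Commons search index):
#
#   commons_search_image("Eiffel Tower")
#   # -> a "File:..." page title in the File namespace, or None if nothing matched
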
def get_fallback_article_image_url(article_title: str) -> Optional[str]:
    """
    Some Wikipedia articles have no associated image. As a fallback, we search on
    Wikimedia Commons with the article title and grab the first result.
    Returns None if no fallback image could be found.
    """
    cleaned_article_title = (
        article_title.replace("_", " ").replace("(", "").replace(")", "")
    )
    fallback_image_title = commons_search_image(cleaned_article_title)
    if fallback_image_title is None:
        return None
    response = commons_session.get(
        action="query", prop="imageinfo", titles=fallback_image_title, iiprop="url"
    )
    pages = response["query"]["pages"]
    # Commons returns a dictionary of pages.
    # Since we only ever query one, we can just grab the first one.
    page = next(iter(pages.values()))
    return page["imageinfo"][0]["url"]
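
# Usage sketch: chain the primary and fallback lookups (hypothetical title):
#
#   title = "Some article"
#   url = get_article_image_url(title) or get_fallback_article_image_url(title)
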
EXTRACTS_MAX_CHARS = 500


def get_article_extract(article_title: str) -> str:
    response = wikipedia_session.get(
        action="query",
        prop="extracts",
        exchars=EXTRACTS_MAX_CHARS,
        explaintext=True,
        exintro=True,
        titles=article_title,
        redirects=True,
    )
    pages = response["query"]["pages"]
    # Wikipedia returns a dictionary of pages.
    # Since we only ever query one, we can just grab the first one.
    page = next(iter(pages.values()))
    return page["extract"]
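
# Usage sketch (the text comes from the live wiki, truncated to
# EXTRACTS_MAX_CHARS by the API):
#
#   intro = get_article_extract("Python (programming language)")
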
MAX_TITLES_PER_QUERY = 50


def get_articles_exists(article_titles: List[str]) -> Dict[str, bool]:
    """
    Determine whether each of the given article titles exists.
    Returns a dictionary of booleans, keyed by lowercased article title.
    Because we will be doing this for *every* link on a page,
    this query works in bulk.
    """
    result = {}
    title_chunks = split_every(MAX_TITLES_PER_QUERY, article_titles)
    for chunk in title_chunks:
        joined_titles = "|".join(chunk)
        responses = wikipedia_session.get(
            action="query", prop="info", titles=joined_titles, continuation=True
        )
        for r in responses:
            # Wikipedia auto-normalizes any titles we give it. To return the same
            # titles we were called with, we have to reverse that.
            denormalize = {}
            for entry in r["query"].get("normalized", []):
                denormalize[entry["to"]] = entry["from"]
            for page in r["query"]["pages"].values():
                exists = "missing" not in page
                denormalized_title = denormalize.get(page["title"], page["title"])
                result[denormalized_title.lower()] = exists
    return result
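
# Usage sketch (hypothetical titles; results depend on live Wikipedia data):
#
#   exists = get_articles_exists(["Python (programming language)", "Zzzqqq"])
#   # -> {"python (programming language)": True, "zzzqqq": False} (likely)
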
# This is an overestimate to avoid skipping pages
N_LIST_OF_ARTICLES = 260000


def get_random_list_article(
    progress_lambda: Optional[Callable[[int], None]] = None,
) -> str:
    """
    Get a random "List of" article title.
    Since this is a long-running operation, a progress_lambda can be passed in
    that will be called before each continuation request with the number of
    articles seen so far.
    """
    article_idx = random.randint(0, N_LIST_OF_ARTICLES)
    print(f"Selected random article number {article_idx}.")
    n_articles_seen = 0
    r = wikipedia_session.get(
        action="query", list="allpages", aplimit=500, apprefix="List of"
    )
    while True:
        for p in r["query"]["allpages"]:
            if n_articles_seen == article_idx:
                return p["title"]
            n_articles_seen += 1
        if r.get("continue") is None:
            # We've reached the end of the listing; just return the last title.
            return r["query"]["allpages"][-1]["title"]
        if progress_lambda:
            progress_lambda(n_articles_seen)
        r = wikipedia_session.get(
            action="query",
            list="allpages",
            aplimit=500,
            apprefix="List of",
            apcontinue=r["continue"]["apcontinue"],
        )
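
# Usage sketch; this walks the "List of" index page by page, so it can take
# many requests (the callback below is just an example):
#
#   title = get_random_list_article(lambda n: print(f"{n} articles seen..."))
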
def get_url_from_article_title(title: str) -> str:
    r = wikipedia_session.get(action="query", prop="info", inprop="url", titles=title)
    normalized = r["query"].get("normalized")
    if normalized:
        # The API normalized the title; look up the canonical form instead.
        return get_url_from_article_title(normalized[0]["to"])
    # Get the first and only page in the page dictionary.
    page = next(iter(r["query"]["pages"].values()))
    url = page.get("fullurl")
    if url is not None:
        return urllib.parse.unquote(url)
    raise RuntimeError(f"Page {title} not found.")
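

if __name__ == "__main__":
    # A minimal smoke-test sketch; it assumes network access to the live
    # Wikipedia and Commons APIs, and the output depends on the random pick.
    title = get_random_list_article(lambda n: print(f"Seen {n} articles..."))
    print(title)
    print(get_url_from_article_title(title))
    print(get_article_extract(title))
    # Fall back to a Commons search if the article has no page image.
    print(get_article_image_url(title) or get_fallback_article_image_url(title))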