forked from lalinsky/musicbrainz-bot
-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathwp_links_artists.py
executable file
·276 lines (250 loc) · 10.4 KB
/
wp_links_artists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/python
import sys
import re
import sqlalchemy
import solr
from simplemediawiki import MediaWiki
from editing import MusicBrainzClient
import pprint
import urllib
import time
from mbbot.wp.wikipage import WikiPage
from mbbot.wp.analysis import determine_country
from utils import mangle_name, join_names, out, colored_out, bcolors, escape_query, quote_page_title, wp_is_canonical_page
import config as cfg
engine = sqlalchemy.create_engine(cfg.MB_DB)
db = engine.connect()
db.execute("SET search_path TO musicbrainz, %s" % cfg.BOT_SCHEMA_DB)
wp_lang = sys.argv[1] if len(sys.argv) > 1 else 'en'
wp = MediaWiki('https://%s.wikipedia.org/w/api.php' % wp_lang)
suffix = '_' + wp_lang if wp_lang != 'en' else ''
wps = solr.SolrConnection('http://localhost:8983/solr/wikipedia' + suffix)
mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)
"""
CREATE TABLE bot_wp_artist_link (
gid uuid NOT NULL,
lang character varying(2),
processed timestamp with time zone DEFAULT now()
CONSTRAINT bot_wp_artist_link_pkey PRIMARY KEY (gid, lang)
);
CREATE TABLE bot_wp_artist_link_ignore (
gid uuid NOT NULL,
lang character varying(2),
CONSTRAINT bot_wp_artist_link_ignore_pkey PRIMARY KEY (gid, lang)
);
"""
acceptable_countries_for_lang = {
'fr': ['FR', 'MC']
}
#acceptable_countries_for_lang['en'] = acceptable_countries_for_lang['fr']
query_params = []
no_country_filter = (wp_lang == 'en') and ('en' not in acceptable_countries_for_lang or len(acceptable_countries_for_lang['en']) == 0)
if no_country_filter:
# Hack to avoid having an SQL error with an empty IN clause ()
in_country_clause = 'TRUE'
else:
placeHolders = ','.join(['%s'] * len(acceptable_countries_for_lang[wp_lang]))
in_country_clause = "%s IN (%s)" % ('iso.code', placeHolders)
query_params.extend(acceptable_countries_for_lang[wp_lang])
query_params.extend((wp_lang, wp_lang))
query = """
WITH
artists_wo_wikidata AS (
SELECT DISTINCT a.id, iso.code AS iso_code
FROM artist a
LEFT JOIN area ON area.id = a.area
LEFT JOIN iso_3166_1 iso ON iso.area = area.id
LEFT JOIN (SELECT l.entity0 AS id
FROM l_artist_url l
JOIN url u ON l.entity1 = u.id AND u.url ~ '^https?://""" + wp_lang + """\.wikipedia\.org/wiki/'
WHERE l.link IN (SELECT id FROM link WHERE link_type = 179)
) wpl ON wpl.id = a.id
LEFT JOIN (SELECT l.entity0 AS id
FROM l_artist_url l
WHERE l.link IN (SELECT id FROM link WHERE link_type = 352)
) wdl ON wdl.id = a.id
WHERE a.id > 2 AND wdl.id IS NULL AND wpl.id IS NULL
AND (iso.code IS NULL OR """ + in_country_clause + """)
)
SELECT a.id, a.gid, a.name, ta.iso_code, b.processed
FROM artists_wo_wikidata ta
JOIN artist a ON ta.id=a.id
LEFT JOIN bot_wp_artist_link b ON a.gid = b.gid AND b.lang = %s
LEFT JOIN bot_wp_artist_link_ignore i ON a.gid = i.gid AND i.lang = %s
WHERE i.gid IS NULL
ORDER BY b.processed NULLS FIRST, ta.iso_code NULLS LAST, a.id
LIMIT 10000
"""
query_artist_albums = """
SELECT rg.name
FROM release_group rg
JOIN artist_credit_name acn ON rg.artist_credit = acn.artist_credit
WHERE acn.artist = %s
UNION
SELECT r.name
FROM release r
JOIN artist_credit_name acn ON r.artist_credit = acn.artist_credit
WHERE acn.artist = %s
"""
query_artist_works = """
SELECT DISTINCT w.name
FROM work w
WHERE w.id IN (
-- Select works that are related to recordings for this artist
SELECT entity1 AS work
FROM l_recording_work
JOIN recording ON recording.id = entity0
JOIN artist_credit_name acn
ON acn.artist_credit = recording.artist_credit
WHERE acn.artist = %s
UNION
-- Select works that this artist is related to
SELECT entity1 AS work
FROM l_artist_work ar
JOIN link ON ar.link = link.id
JOIN link_type lt ON lt.id = link.link_type
WHERE entity0 = %s
)
"""
query_artist_urls = """
SELECT DISTINCT u.url
FROM url u
JOIN l_artist_url l ON l.entity1 = u.id
WHERE l.entity0 = %s AND
u.url !~ 'wikipedia.org'
"""
query_related_artists = """
SELECT DISTINCT a.name
FROM artist a
WHERE a.id IN (
-- Select artists that this artist is directly related to
SELECT CASE WHEN entity1 = %s THEN entity0 ELSE entity1 END AS artist
FROM l_artist_artist ar
JOIN link ON ar.link = link.id
JOIN link_type lt ON lt.id = link.link_type
WHERE entity0 = %s OR entity1 = %s
UNION
-- Select artists that are involved with works for this artist (i.e. writers of works this artist performs)
SELECT law.entity0 AS artist
FROM artist_credit_name acn
JOIN recording ON acn.artist_credit = recording.artist_credit
JOIN l_recording_work lrw ON recording.id = lrw.entity0
JOIN l_artist_work law ON lrw.entity1 = law.entity1
WHERE acn.artist = %s
UNION
-- Select artists of recordings of works for this artist (i.e. performers of works this artist wrote)
SELECT acn.artist AS artist
FROM artist_credit_name acn
JOIN recording ON acn.artist_credit = recording.artist_credit
JOIN l_recording_work lrw ON recording.id = lrw.entity0
JOIN l_artist_work law ON lrw.entity1 = law.entity1
WHERE law.entity0 = %s
)
"""
for artist in db.execute(query, query_params):
colored_out(bcolors.OKBLUE, 'Looking up artist "%s" http://musicbrainz.org/artist/%s' % (artist['name'], artist['gid']))
matches = wps.query(escape_query(artist['name']), defType='dismax', qf='name', rows=50).results
last_wp_request = time.time()
for match in matches:
title = match['name']
if title.endswith('album)') or title.endswith('song)'):
continue
if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(artist['name']) and mangle_name(title) != mangle_name(artist['name']):
continue
delay = time.time() - last_wp_request
if delay < 1.0:
time.sleep(1.0 - delay)
last_wp_request = time.time()
wikipage = WikiPage.fetch('https://%s.wikipedia.org/wiki/%s' % (wp_lang, title))
page_orig = wikipage.text
if not page_orig:
continue
out(' * trying article "%s"' % (title,))
page = mangle_name(page_orig)
is_canonical, reason = wp_is_canonical_page(title, page_orig)
if (not is_canonical):
out(' * %s, skipping' % reason)
continue
if 'infoboxalbum' in page:
out(' * album page, skipping')
continue
page_title = title
reasons = []
# Examine albums
found_albums = []
albums = set([r[0] for r in db.execute(query_artist_albums, (artist['id'],) * 2)])
albums_to_ignore = set()
for album in albums:
if mangle_name(artist['name']) in mangle_name(album):
albums_to_ignore.add(album)
albums -= albums_to_ignore
if not albums:
continue
for album in albums:
mangled_album = mangle_name(album)
if len(mangled_album) > 6 and mangled_album in page:
found_albums.append(album)
if (found_albums):
reasons.append(join_names('album', found_albums))
out(' * has albums: %s, found albums: %s' % (len(albums), len(found_albums)))
# Examine works
found_works = []
page = mangle_name(page_orig)
works = set([r[0] for r in db.execute(query_artist_works, (artist['id'],) * 2)])
for work in works:
mangled_work = mangle_name(work)
if mangled_work in page:
found_works.append(work)
if (found_works):
reasons.append(join_names('work', found_works))
out(' * has works: %s, found works: %s' % (len(works), len(found_works)))
# Examine urls
found_urls = []
page = mangle_name(page_orig)
urls = set([r[0] for r in db.execute(query_artist_urls, (artist['id'],))])
for url in urls:
mangled_url = mangle_name(url)
if mangled_url in page:
found_urls.append(url)
if (found_urls):
reasons.append(join_names('url', found_urls))
out(' * has urls: %s, found urls: %s' % (len(urls), len(found_urls)))
# Examine related artists
found_artists = []
page = mangle_name(page_orig)
artists = set([r[0] for r in db.execute(query_related_artists, (artist['id'],) * 5)])
artists_to_ignore = set()
for rel_artist in artists:
if mangle_name(artist['name']) in mangle_name(rel_artist):
artists_to_ignore.add(rel_artist)
artists -= artists_to_ignore
for rel_artist in artists:
mangled_rel_artist = mangle_name(rel_artist)
if mangled_rel_artist in page:
found_artists.append(rel_artist)
if (found_artists):
reasons.append(join_names('related artist', found_artists))
out(' * has related artists: %s, found related artists: %s' % (len(artists), len(found_artists)))
# Determine if artist matches
if not found_albums and not found_works and not found_artists and not found_urls:
continue
# Check if wikipedia lang is compatible with artist country
if wp_lang != 'en' or wp_lang in acceptable_countries_for_lang:
if wp_lang not in acceptable_countries_for_lang:
continue
country, country_reasons = determine_country(wikipage)
if (country not in acceptable_countries_for_lang[wp_lang]):
colored_out(bcolors.HEADER, ' * artist country (%s) not compatible with wiki language (%s)' % (country, wp_lang))
continue
wp_url = 'https://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),)
wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper()
text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions %s.' % (wp_url, ', '.join(reasons))
colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url,))
out(' * edit note: %s' % (text,))
time.sleep(60)
mb.add_url("artist", artist['gid'], 352, wd_url, text)
break
if artist['processed'] is None:
db.execute("INSERT INTO bot_wp_artist_link (gid, lang) VALUES (%s, %s)", (artist['gid'], wp_lang))
else:
db.execute("UPDATE bot_wp_artist_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (artist['gid'], wp_lang))