-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
506 lines (427 loc) · 21.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
import timeit
import rdflib
import os
import pandas as pd
import re
import spotlight
import owlrl
from datetime import datetime
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.collection import Collection
from rdflib.namespace import RDF, RDFS, XSD, FOAF, OWL
from get_liquipedia_data import team_results, player_results, map_results
from numpy import nan
# Note: importing team_results, and player_results may take a bit of time.
# This is due to the API terms of use described in get_liquipedia_data.py
# Import the player hero stats (phs) datasets and merge them into a single
# pandas DataFrame.
all_dfs = []
# Build the path with os.path.join so it also works outside Windows
# (a hard-coded backslash is only a path separator on Windows).
phs_dir = os.path.join(os.getcwd(), 'phs_data')
# os.listdir() returns entries in arbitrary order; sort them so the merged
# frame has a deterministic row order (later code uses the last row of a
# player's games as the "most recent" team).
for sub_dir in sorted(os.listdir(phs_dir)):
    # define full path to the sub_dir
    full_path = os.path.join(phs_dir, sub_dir)
    # if sub_dir is not a directory, it's the map stats dataset
    if os.path.isfile(full_path):
        map_stats = pd.read_csv(full_path)
        continue
    # read every CSV file of this sub directory into one frame
    df = pd.concat([pd.read_csv(os.path.join(full_path, file_name))
                    for file_name in sorted(os.listdir(full_path))])
    # the 2020 and 2021 datasets use different column names; align them with
    # the names used by the other datasets
    if sub_dir in ('phs_2020', 'phs_2021'):
        df = df.rename(columns={'esports_match_id': 'match_id', 'tournament_title': 'stage',
                                'team_name': 'team', 'player_name': 'player',
                                'hero_name': 'hero'})
    all_dfs.append(df)
dfs = pd.concat(all_dfs)
# make all player names lowercase, because of difference in Liquipedia and dataset
dfs['player'] = dfs['player'].map(lambda name: str(name).lower())
# Create the RDF graph that all triples below are added to
g = Graph()

# Namespaces used for the terms in the graph
ex = Namespace('http://example.org/')
dbp = Namespace('http://dbpedia.org/resource/')
schema = Namespace('https://schema.org/')
dbp_o = Namespace('http://dbpedia.org/ontology/')
wd = Namespace('http://www.wikidata.org/entity/')

# Bind a prefix to every namespace (same prefixes and order as before)
for prefix, namespace in (
    ('FOAF', FOAF),
    ('ex', ex),
    ('DBpedia', dbp),
    ('Schema', schema),
    ('dbp_o', dbp_o),
    ('Wikidata', wd),
    ('OWL', OWL),
):
    g.bind(prefix, namespace)
# Classes and class properties for esports.
#
# Fixes compared to the previous revision:
#   * ex.playedAgainst / ex.playedWith are properties, so their relation to
#     FOAF.knows is rdfs:subPropertyOf (rdfs:subClassOf relates classes).
#   * ex.matchID is a datatype property and must not be used as a class:
#     the extra "playedMatches range matchID" statement is dropped (the range
#     ex.Match is already declared), and the domain of ex.tournamentMatches
#     is ex.Tournament, which is what the property is attached to below.
#
# NOTE(review): several properties (e.g. FOAF.name) get more than one
# rdfs:domain; under RDFS semantics multiple domains are combined, so this
# matters if the commented-out owlrl closure is ever enabled.
_ontology_triples = (
    # RDFS Player-related class properties
    (FOAF.Person, RDFS.subClassOf, dbp.Agent),
    (ex.Player, RDFS.subClassOf, FOAF.Person),
    (ex.Player, RDF.type, OWL.Class),
    (FOAF.name, RDFS.domain, ex.Player),
    (FOAF.name, RDF.type, OWL.DatatypeProperty),
    (ex.PlayerID, RDF.type, OWL.DatatypeProperty),
    (ex.PlayerID, RDFS.domain, ex.Player),
    (ex.PlayerID, RDFS.range, RDFS.Literal),
    (FOAF.age, RDFS.domain, ex.Player),
    (FOAF.age, RDFS.range, RDFS.Literal),
    (ex.birthday, RDFS.domain, ex.Player),
    (ex.birthday, RDFS.range, RDFS.Literal),
    (ex.playedHeroes, RDFS.domain, ex.Player),
    (ex.playedHeroes, RDFS.range, RDFS.Literal),
    (ex.role, RDFS.domain, ex.Player),
    (ex.role, RDFS.range, RDFS.Literal),
    (ex.playsFor, RDFS.domain, ex.Player),
    (ex.playsFor, RDFS.range, dbp.SportsTeam),
    (ex.playedAgainst, RDFS.subPropertyOf, FOAF.knows),
    (ex.playedAgainst, RDF.type, OWL.SymmetricProperty),
    (ex.playedAgainst, RDFS.domain, ex.Player),
    (ex.playedAgainst, RDFS.range, ex.Player),
    (ex.playedWith, RDFS.subPropertyOf, FOAF.knows),
    (ex.playedWith, RDF.type, OWL.SymmetricProperty),
    (ex.playedWith, RDFS.domain, ex.Player),
    (ex.playedWith, RDFS.range, ex.Player),
    (ex.playedMatches, RDFS.domain, ex.Player),
    (ex.playedMatches, RDFS.range, ex.Match),
    (schema.nationality, RDFS.domain, ex.Player),
    (schema.nationality, RDFS.range, schema.Country),
    (schema.nationality, RDFS.range, dbp.Country),
    # RDFS Team-related class properties
    (dbp.Organisation, RDFS.subClassOf, dbp.Agent),
    (dbp.SportsTeam, RDFS.subClassOf, dbp.Organisation),
    (dbp.SportsTeam, OWL.sameAs, schema.SportsTeam),
    (FOAF.name, RDFS.domain, schema.SportsTeam),
    (ex.playedAgainst, RDFS.domain, dbp.SportsTeam),
    (ex.playedAgainst, RDFS.range, dbp.SportsTeam),
    (ex.playedMatches, RDFS.domain, dbp.SportsTeam),
    (ex.hasRegion, RDFS.domain, dbp.SportsTeam),
    (ex.hasRegion, RDFS.range, dbp.Continent),
    (ex.hasRegion, RDFS.range, schema.Continent),
    (ex.hasRegion, RDFS.range, schema.Location),
    (ex.hasRegion, RDFS.range, dbp.Location),
    # RDFS Match-related class properties
    (ex.Match, RDF.type, OWL.Class),
    (ex.matchID, RDF.type, OWL.DatatypeProperty),
    (ex.matchID, RDFS.domain, ex.Match),
    (ex.matchID, RDFS.range, RDFS.Literal),
    (ex.matchMap, RDFS.domain, ex.Match),
    (ex.matchMap, RDFS.range, ex.Map),
    (ex.matchWinner, RDFS.domain, ex.Match),
    (ex.matchWinner, RDFS.range, dbp.SportsTeam),
    (ex.matchLoser, RDFS.domain, ex.Match),
    (ex.matchLoser, RDFS.range, dbp.SportsTeam),
    (ex.matchTeamOne, RDFS.domain, ex.Match),
    (ex.matchTeamOne, RDFS.range, dbp.SportsTeam),
    (ex.matchTeamTwo, RDFS.domain, ex.Match),
    (ex.matchTeamTwo, RDFS.range, dbp.SportsTeam),
    (ex.matchStartTime, RDFS.domain, ex.Match),
    (ex.matchStartTime, RDFS.range, RDFS.Literal),
    # RDFS Map-related class properties
    (ex.Map, RDF.type, OWL.Class),
    (ex.hasLocation, RDFS.domain, ex.Map),
    (ex.hasLocation, RDFS.domain, dbp.SportsTeam),
    (ex.hasLocation, RDFS.domain, schema.SportsTeam),
    (ex.hasLocation, RDFS.range, dbp.Country),
    (ex.hasLocation, RDFS.range, schema.Country),
    (FOAF.name, RDFS.domain, ex.Map),
    # RDFS Tournament-related class properties
    (ex.Tournament, RDF.type, OWL.Class),
    (FOAF.name, RDFS.domain, ex.Tournament),
    (ex.tournamentMatches, RDFS.domain, ex.Tournament),
    (ex.tournamentWinner, RDFS.domain, ex.Tournament),
    (ex.tournamentWinner, RDFS.range, dbp.SportsTeam),
    (ex.tournamentWinner, RDFS.range, schema.SportsTeam),
    (ex.tournamentPrizePool, RDFS.domain, ex.Tournament),
    (ex.tournamentPrizePool, RDFS.range, RDFS.Literal),
    # Other class properties
    (dbp.Country, OWL.sameAs, schema.Country),
    (dbp.Place, OWL.sameAs, schema.Place),
    (dbp.Country, ex.hasRegion, dbp.Continent),
    (schema.Country, ex.hasRegion, schema.Continent),
)
for _triple in _ontology_triples:
    g.add(_triple)
print("Inital Classes and Class properties added to graph. ")
# Address of the public DBpedia Spotlight annotation endpoint
SERVER = "https://api.dbpedia-spotlight.org/en/annotate"
# Module-level caches shared by the DBpedia helper functions:
#   queried_resources - names that were already sent to Spotlight
#   all_resources     - accepted Spotlight annotations, keyed by surface form
queried_resources = set()
all_resources = dict()
def get_dbpedia_resources(resources):
    """
    Requests DBpedia's API with the spotlight module to gather data about resources.

    Every resource is converted to a string. Names that were already sent in an
    earlier call (tracked in the global ``queried_resources``) and duplicates
    within this call are skipped, so no name is requested twice. Accepted
    annotations are stored in the global ``all_resources`` dict, keyed by the
    annotation's ``surfaceForm``. An annotation is accepted when its
    ``similarityScore`` is above 0.9 or when it is typed ``Schema:SportsTeam``.

    :param resources: iterable object of resources to query (cannot contain list of lists)
    :return: always None; results are published through ``all_resources``
    """
    global queried_resources
    server = "https://api.dbpedia-spotlight.org/en/annotate"
    # Normalise to strings BEFORE the "already queried" check: the cache stores
    # strings, so checking the raw value would never match non-string inputs.
    # dict.fromkeys removes duplicates while keeping the original order.
    unique_names = dict.fromkeys(str(resource) for resource in resources)
    pending = [name for name in unique_names if name not in queried_resources]
    # verify that we still have some resources after filtering out already queried ones
    if not pending:
        return None
    # remember the names so later calls do not request them again
    queried_resources.update(pending)
    # convert list of resources to comma-separated string for API request
    text = ','.join(pending)
    # perform API request
    try:
        response = spotlight.annotate(server, text)
        # add accepted annotations to the global variable all_resources
        for annotation in response:
            if (annotation['similarityScore'] > 0.9
                    or 'Schema:SportsTeam' in annotation['types'].split(',')):
                all_resources[annotation['surfaceForm']] = annotation
    except spotlight.SpotlightException:
        pass  # this means no resource was found in spotlight response. can be ignored
    except Exception as e:
        # best effort: a failed lookup must not abort the graph construction
        print('An exception occurred: ', e)
    return None
def connect_dbpedia_resources(team_data, keys):
    """
    Match the values of a team's data with the cached DBpedia annotations.

    :param team_data: dict with the team's data; read with ``.get`` for every
        key in ``keys`` and with ``['Has sponsor']`` when that key is requested
    :param keys: keys of ``team_data`` whose values should be looked up
    :return: None when the global ``all_resources`` cache is empty; otherwise a
        dict that maps each key whose value is cached to its annotation. For
        'Has sponsor' the value is instead the list of sponsor names (first
        element of every sponsor entry) that are present in the cache.
    """
    # nothing has been cached yet, so nothing can be connected
    if all_resources == {}:
        return None
    # keys that need MANUAL handling further down
    blacklist = ['Has sponsor']
    result = {}
    for key in keys:
        value = team_data.get(key)
        # keep the annotation when the value is a cached surface form
        if key not in blacklist and value in all_resources:
            result[key] = all_resources[value]
    # MANUALLY handle the blacklisted sponsor key
    if 'Has sponsor' in keys:
        known_sponsors = [entry[0] for entry in team_data['Has sponsor']
                          if entry[0] in all_resources]
        if known_sponsors:
            result['Has sponsor'] = known_sponsors
    return result
# Query DBPedia Spotlight for team resources (one request per team name)
for team in team_results:
    get_dbpedia_resources([team.title()])
# Query DBPedia Spotlight for map location resources.
# The set is passed directly: wrapping it in a list would make the helper
# stringify the whole set and query/cache its repr as one single "resource".
all_map_locations = {map_data['Has location'] for map_data in map_results.values()}
get_dbpedia_resources(all_map_locations)
# Query DBPedia Spotlight for player nationalities (one request per nationality)
nationalities = {player['Has nationality'] for player in player_results.values()}
for nationality in nationalities:
    get_dbpedia_resources([nationality])
# Add team triples to graph
start = timeit.default_timer()
# Spotlight type prefixes and the namespace their values are minted in
type_namespaces = {'Wikidata': wd, 'Schema': schema, 'DBpedia': dbp}
for team, team_data in team_results.items():
    team = team.title()
    team_name = team.replace(' ', '_')
    # 'Has sponsor' is optional; only request it when the team has sponsors
    keys = ['Has name', 'Has region', 'Has location']
    try:
        sponsors = [x[0] for x in team_data['Has sponsor']]
    except KeyError:
        sponsors = []
    else:
        keys.append('Has sponsor')
    # collect every name that should be looked up in DBpedia
    lookup_names = [team_data[x] for x in ('Has name', 'Has region', 'Has location')
                    if x in team_data] + sponsors
    # query resources (no output is returned, but global variable 'all_resources' is updated)
    get_dbpedia_resources(lookup_names)
    # connect resources with team_data; the helper returns None while nothing
    # has been cached, which is treated as "no matches" instead of crashing
    resources = connect_dbpedia_resources(team_data, keys) or {}
    # define team_entity and types
    try:
        team_entity = URIRef(resources['Has name']['URI'])
    except KeyError:
        # this means the team is not a resource in DBpedia.
        team_entity = ex.term(team_name)
    # add type
    g.add((team_entity, RDF.type, schema.SportsTeam))
    g.add((team_entity, RDF.type, dbp.SportsTeam))
    # add team name
    g.add((team_entity, FOAF.name, Literal(team, datatype=XSD.string)))
    # loop over all resources connected to team_data
    for key, response in resources.items():
        try:
            resource_obj = URIRef(response['URI'])
        except TypeError:
            # 'Has sponsor' maps to a plain list of names, not an annotation
            continue
        # add resources to team_entity
        # don't add if key is 'Has name' because we use FOAF.name for that earlier.
        if key != 'Has name':
            words = key.split()
            # e.g. 'Has region' -> ex:hasRegion
            predicate = ex.term(words[0].lower() + words[1].title())
            g.add((team_entity, predicate, resource_obj))
        # add types for all resources; entries look like 'Prefix:Value'.
        # partition() tolerates an empty types string and extra colons, which
        # made the previous two-value unpacking of split(':') raise ValueError.
        for type_entry in response['types'].split(','):
            ns, _sep, value = type_entry.partition(':')
            if value and ns in type_namespaces:
                g.add((resource_obj, RDF.type, type_namespaces[ns].term(value)))
    # get all unique games a team has played
    team_games_df = dfs[dfs.team == team][['match_id', 'team']].drop_duplicates()
    # add all games a team has played to a collection with a blank node
    b = BNode()
    team_games = [ex.term(str(match)) for match in team_games_df['match_id'].to_list()]
    Collection(g, b, team_games)
    # add blank node to team_entity's played matches
    g.add((team_entity, ex.playedMatches, b))
print("Team triples added to graph.")
stop = timeit.default_timer()
print('Time: ', stop - start)
# Add player triples to graph
start = timeit.default_timer()
for player, player_data in player_results.items():
    # define player_entity
    player_entity = ex.term(player.replace(' ', '_'))
    # define nationality
    try:
        player_nationality = URIRef(all_resources[player_data['Has nationality']]['URI'])
    except KeyError:
        # this means the nationality was not found in DBpedia
        player_nationality = ex.term(player_data['Has nationality'].replace(' ', '_'))
    # Add birthday to player entity
    try:
        birthday = '/'.join(player_data['Has birth day'].split('/')[1:4])
        # store a date (not a datetime): the literal is typed xsd:date, and a
        # datetime would serialise with a time part that is not a valid xsd:date
        birthday = datetime.strptime(birthday, '%Y/%m/%d').date()
        g.add((player_entity, ex.birthday, Literal(birthday, datatype=XSD.date)))
    except KeyError:
        # this means there is no birthday in the player's data
        pass
    # Add age to graph
    try:
        g.add((player_entity, FOAF.age, Literal(player_data['Has age'], datatype=XSD.integer)))
    except KeyError:
        # this means there is no age in the player's data
        pass
    # Add role to graph
    try:
        g.add((player_entity, ex.role,
               Literal(str(player_data['Has role'].replace(' ', '_')), datatype=XSD.string)))
    except KeyError:
        # this means there is no main role in the player's data
        pass
    # Add type, id, name, and nationality to graph
    g.add((player_entity, RDF.type, ex.Player))
    g.add((player_entity, ex.PlayerID, Literal(player, datatype=XSD.string)))
    g.add((player_entity, FOAF.name, Literal(player_data['Has name'], datatype=XSD.string)))
    g.add((player_entity, dbp_o.term('nationality'), player_nationality))
    # get all unique games a player has participated in + team name and heroes played
    player_games_df = dfs[(dfs.player == player) & (dfs.hero != 'All Heroes')][
        ['match_id', 'team', 'hero']].drop_duplicates()
    # add all unique games a player has participated in to a blank node
    player_matches = [ex.term(str(match)) for match in player_games_df['match_id'].unique()]
    b = BNode()
    Collection(g, b, player_matches)
    # add the blank node to player_entity's played matches
    g.add((player_entity, ex.playedMatches, b))
    # Add team to player entity (using existing team_entity in graph)
    try:
        player_team = player_games_df.iloc[-1, 1]  # team most recently played with
    except IndexError:
        # the player has no rows in the dataset
        player_team = None
    team_entity = None
    if player_team is not None:
        team_entity = next(
            g.subjects(predicate=FOAF.name, object=Literal(player_team, datatype=XSD.string)),
            None)
    if team_entity is None:
        print(f"Could not find team for player {player}")
    else:
        g.add((player_entity, ex.playsFor, team_entity))
    # Add player's hero pick rates to a blank node
    hero_counts = player_games_df.hero.value_counts()
    total_nr_picked_heroes = hero_counts.sum()
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    hero_pick_rates = [
        Literal(f"{hero} pick rate: {round((count / total_nr_picked_heroes) * 100, 5)}%", datatype=XSD.string)
        for hero, count in hero_counts.items()]
    b = BNode()
    Collection(g, b, hero_pick_rates)
    # Add the blank node with hero ratios to player_entity
    g.add((player_entity, ex.playedHeroes, b))
print("Player triples added to graph.")
stop = timeit.default_timer()
print('Time: ', stop - start)
# Creating a new dataframe with Overwatch League match data from Statslab.
# The path is built with os.path.join so it also resolves on non-Windows
# systems (a backslash is only a path separator on Windows).
match_df = pd.read_csv(os.path.join('phs_data', 'match_map_stats.csv'))
# .copy() makes the column selection an independent frame, so the column
# assignments below do not operate on a slice of the original frame
match_df = match_df[["round_start_time", "match_id", "map_name", "team_one_name",
                     "team_two_name", "match_winner"]].copy()
# the match date is the leading YYYY-MM-DD part of the round start time
match_df["match_start_date"] = match_df['round_start_time'].str.extract(
    r'(^\d{4}-\d{2}-\d{2})', expand=False)
match_df.drop('round_start_time', axis=1, inplace=True)
# one tournament per year, named after the year of the match date
match_df["tournament"] = "Overwatch League " + match_df['match_start_date'].str.extract(
    r'(^\d{4})', expand=False)
# keep a single row per match (the first one encountered)
match_df.drop_duplicates(subset=["match_id"], keep="first", ignore_index=True, inplace=True)
# Maps every tournament entity to the ids of the matches played in it
tournament_matches = {}
# Adding match, tournament and map triples to graph
start = timeit.default_timer()


def _find_team_entity(team_name):
    """Return the first graph subject whose FOAF.name equals team_name, or None."""
    return next(
        g.subjects(predicate=FOAF.name, object=Literal(team_name, datatype=XSD.string)),
        None)


# Spotlight type prefixes and the namespace their values are minted in
_type_namespaces = {'Wikidata': wd, 'Schema': schema, 'DBpedia': dbp}
# One row per (player, match, team). dfs does not change inside the loop, so
# this is computed once instead of twice per match.
_unique_player_rows = dfs.drop_duplicates(subset=['player', 'match_id', 'team'])
_unique_player_teams = _unique_player_rows.team.str.lower()

for (index, match_id, map_name, team_one_name, team_two_name,
     match_winner, match_start_time, tournament) in match_df.itertuples():
    # Create a term for the Match instance subject
    match_entity = ex.term(str(match_id))
    # Add Match instances with properties to graph
    g.add((match_entity, RDF.type, ex.Match))
    g.add((match_entity, ex.matchID, Literal(match_id, datatype=XSD.integer)))
    # Add players pr. team as matchTeamOnePlayer<i> / matchTeamTwoPlayer<i>
    in_match = _unique_player_rows.match_id == match_id
    for side, side_team_name in (('One', team_one_name), ('Two', team_two_name)):
        roster = _unique_player_rows[in_match & (_unique_player_teams == side_team_name.lower())]
        for i, player in enumerate(roster.player.to_list(), 1):
            player_predicate = ex.term('matchTeam{}Player{}'.format(side, i))
            # materialise the matches first so the graph is not modified while
            # its own query result is still being iterated
            player_entities = list(g.subjects(predicate=ex.PlayerID,
                                              object=Literal(player, datatype=XSD.string)))
            for player_entity in player_entities:
                g.add((match_entity, player_predicate, player_entity))
    # Adding Map instances with properties to graph
    map_entity_name = map_name.replace(' ', '_').replace("'", "")
    map_entity = ex.term(map_entity_name)
    if (map_entity, RDF.type, ex.Map) not in g:
        map_location = map_results[map_name]['Has location']
        g.add((map_entity, RDF.type, ex.Map))
        g.add((map_entity, FOAF.name, Literal(map_name, datatype=XSD.string)))
        # Add map entity location with DBPedia resource when one was found;
        # otherwise mint a local term and skip the types (the previous code
        # looked the missing resource up again and raised KeyError)
        location_resource = all_resources.get(map_location)
        if location_resource is not None:
            map_resource_obj = URIRef(location_resource['URI'])
            type_entries = location_resource['types'].split(',')
        else:
            map_resource_obj = ex.term(str(map_location).replace(' ', '_'))
            type_entries = []
        g.add((map_entity, ex.hasLocation, map_resource_obj))
        g.add((map_resource_obj, FOAF.name, Literal(map_location, datatype=XSD.string)))
        # Add types to DBPedia map resource object; entries look like
        # 'Prefix:Value', and partition() tolerates empty or odd entries
        for type_entry in type_entries:
            namespace, _sep, value = type_entry.partition(':')
            if value and namespace in _type_namespaces:
                g.add((map_resource_obj, RDF.type, _type_namespaces[namespace].term(value)))
    # Add more Match instance properties
    g.add((match_entity, ex.matchMap, map_entity))
    # Look every team up individually and only add the ones that exist in the
    # graph, so a failed lookup can never reuse an entity from an earlier match
    for team_predicate, looked_up_name in ((ex.matchTeamOne, team_one_name),
                                           (ex.matchTeamTwo, team_two_name),
                                           (ex.matchWinner, match_winner)):
        found_team = _find_team_entity(looked_up_name)
        if found_team is None:
            print('Could not find entity for a team: {}'.format(looked_up_name))
            continue
        g.add((match_entity, team_predicate, found_team))
    g.add((match_entity, ex.matchStartTime, Literal(match_start_time, datatype=XSD.date)))
    # Add Tournament instances with properties to graph.
    # pd.isna() is used because an identity check against numpy.nan only
    # works when the missing value happens to be that exact object.
    if not pd.isna(tournament):
        tournament_entity = ex.term(tournament.replace(" ", "_"))
        if (tournament_entity, RDF.type, ex.Tournament) not in g:
            g.add((tournament_entity, RDF.type, ex.Tournament))
            g.add((tournament_entity, FOAF.name, Literal(tournament, datatype=XSD.string)))
        tournament_matches.setdefault(tournament_entity, []).append(match_id)
# Add match_ids to tournament entities. This is done exactly once per
# tournament; running it twice attached two identical match lists to each one.
for tournament_entity, match_ids in tournament_matches.items():
    # add all match_ids for matches played in a tournament to a collection with a blank node
    b = BNode()
    t_matches = [ex.term(str(match)) for match in match_ids]
    Collection(g, b, t_matches)
    # add blank node to the tournament's matches
    g.add((tournament_entity, ex.tournamentMatches, b))
print("Match, tournament and map triples added to graph")
stop = timeit.default_timer()
print('Time: ', stop - start)
# Optional reasoning step, currently disabled: expand the graph with inferred
# triples (NB! OWL2 is not compatible with WebVOWL)
# owl = owlrl.CombinedClosure.RDFS_OWLRL_Semantics(g, False, False, False)
# owl.closure()
# owl.flush_stored_triples()

# Write the graph to 'graph.ttl' in Turtle format
g.serialize(format='ttl', destination='graph.ttl')

# Printing the graph to the terminal stays commented out until we deliver
# the assignment
# print(g.serialize(format='ttl').decode('utf-8'))