-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpredictor.py
79 lines (65 loc) · 2.25 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
'''
Generic class that takes training data
and generates the necessary structures
for predicting items in different manners
'''
import sys
class Predictor(object):
    '''
    Reads a ratings file and builds two nested lookup tables
    from it: { user : {item : rating} } and
    { item : {user : rating} }. Also offers helpers to
    mean-centre such a table and to draw random candidate items.
    '''

    def __init__(self, trn_filename, sep):
        '''
        trn_filename is the path of the training file (one rating
        per line) and sep is the string separating the fields of
        each line. Nothing is read until store_data_relations()
        is called.
        '''
        self.training_filename = trn_filename
        self.separator = sep
        self.users = {}
        self.items = {}

    def store_data_relations(self):
        '''
        Parses the training file and fills self.users and
        self.items, which are also returned as a tuple.

        Each non-blank line must hold at least three fields:
        user, item, rating. Any further field (e.g. a timestamp)
        is ignored. Raises ValueError when a line has fewer than
        three fields or its rating is not a number.
        '''
        #TODO should be more generic
        # The context manager closes the file even if a line fails
        # to parse; iterating the handle avoids loading it whole.
        with open(self.training_filename, "r") as f:
            for line in f:
                if not line.strip():
                    # tolerate blank lines (typically a trailing one)
                    continue
                user, item, rating = line.split(self.separator)[:3]
                value = float(rating)
                self.users.setdefault(user, {})[item] = value
                self.items.setdefault(item, {})[user] = value
        return self.users, self.items

    #TODO: this computation is taken on trust, check it more carefully
    def compute_means(self, info):
        '''
        Returns { key : mean of the ratings in info[key] } for a
        dict of the form { key : {other : rating} }. A key whose
        inner dict is empty gets a mean of 0.0 instead of causing
        a division by zero.
        '''
        avgs = {}
        for key, ratings in info.items():
            if ratings:
                avgs[key] = sum(ratings.values(), 0.0) / len(ratings)
            else:
                avgs[key] = 0.0
        return avgs

    def normalize_ratings(self, info):
        '''
        info is a dict either in the form
        { item : {user : rating} } or in the
        form { user : {item : rating} }

        Subtracts from every rating the mean of its outer key.
        info is modified in place; the same dict is returned
        together with the means that were subtracted.
        '''
        avgs = self.compute_means(info)
        for key, ratings in info.items():
            mean = avgs[key]
            # only values are reassigned, so iterating the keys is safe
            for inner in ratings:
                ratings[inner] -= mean
        return info, avgs

    def choose_some_items(self, item_ids, user_items, target_item, K):
        '''
        Selects K items in the dataset (item_ids) randomly,
        excluding those that were rated by a certain user (user_items)
        and appending the target_item

        The target_item itself is never drawn, so it appears exactly
        once, as the last element of the returned list. Raises
        ValueError (from random.sample) when fewer than K candidate
        items are available.
        '''
        from random import sample
        candidates = set(item_ids) - set(user_items)
        # keep the target out of the pool so it cannot show up twice
        candidates.discard(target_item)
        movies = sample(list(candidates), K)
        movies.append(target_item)
        return movies
if __name__=="__main__":
    # Command line: <training_file> <separator>
    # Fail with a readable usage message instead of an IndexError
    # when an argument is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <training_file> <separator>" % sys.argv[0])
    predictor = Predictor(sys.argv[1], sys.argv[2])
    # Build the user->item and item->user rating tables.
    users, items = predictor.store_data_relations() #~100MB
    # Mean-centre each user's ratings (modifies `users` in place).
    ratings, means = predictor.normalize_ratings(users)