# clusterAndFactorizations.py
"""Quick file where I cluster the business and factorize with the resulting matrices"""
print "Importing modules..."
import getSmallMatrices as getSM
import clusterBusiness as clustBus
from factorization_trials import factorize_bias as fact_bias
import numpy as np
from scipy import io
import scipy as sp
import justTry
import pickle
"""Here I set the parameters!"""
clusters_k = 30 #Number of clusters
test_train = 10 # 1 + |Train|/|Test|
features = 10 #30 #Factorizing features (k)
lamb = 0.2 #factorizing lambda
lrate = .02 #Lrate
maxiters = 100
folder = "cluster_matrices"
# clustering("clusters")
def clustering(fileClusters = "clusters", clusters_k = clusters_k, folder = folder):
    print "Clustering businesses..."
    clustersBus = clustBus.clusterBusiness(k = clusters_k)
    clusters = [[busn.id for busn in cluster] for cluster in clustersBus]
    for i in range(len(clustersBus)):
        text_file = open("cluster" + str(i) + ".txt", "w")
        text_file.write(str(clustersBus[i]))
        text_file.close()
    pickle.dump(clustersBus, open(folder + "/" + fileClusters + "_fullBus_" + ".p", "wb"))
    pickle.dump(clusters, open(folder + "/" + fileClusters + ".p", "wb"))
    print "Clusters written in file: ", (fileClusters + ".p")
    return clustersBus
#cl = clustering()
#reviewTrainTest("trainData","testData")
def reviewTrainTest(fileTrain = "trainData", fileTest = "testData", test_train = test_train, folder = folder):
    print "Getting the reviews..."
    (train, test) = justTry.getReviewsTest(n = test_train)
    pickle.dump(train, open(folder + "/" + fileTrain + ".p", "wb"))
    pickle.dump(test, open(folder + "/" + fileTest + ".p", "wb"))
    print "Training and testing data saved in: ", (fileTrain + ".p"), " and ", (fileTest + ".p")
    return None
#reviewTrainTest()
#clusterMatrices("trainData", "postions", "clusters", folder)
def clusterMatrices(fileTrain = "trainData", filePositions = "postions", fileClusters = "clusters", fileMatrices = "cluster_matrix_", folder = folder):
    print "Making matrices out of clusters..."
    train = pickle.load(open(folder + "/" + fileTrain + ".p", "rb"))
    clusters = pickle.load(open(folder + "/" + fileClusters + ".p", "rb"))
    (MS, busToPosition, userToPosition, userToPositionPerClus) = getSM.getMatrices(train, "fact_ratings_matrix.txt", clusters)
    leng = len(MS)
    print "Done building matrices. Writing to file..."
    positions = (busToPosition, userToPositionPerClus)
    pickle.dump(positions, open(folder + "/" + filePositions + ".p", "wb"))
    print "Position dictionaries written in: ", (filePositions + ".p")
    for i in range(leng):
        pickle.dump(MS[i], open(folder + "/" + fileMatrices + str(i) + ".p", "wb"))
    print "Matrices written in: ", (fileMatrices + "i.p"), " File format: MS[i]"
    return None
#clusterMatrices()
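# Note on the on-disk layout (a summary of what the functions above write, assuming the
# commented-out calls have been run and the folder, "cluster_matrices" by default, exists):
# clusters.p and clusters_fullBus_.p from clustering(), trainData.p and testData.p from
# reviewTrainTest(), postions.p holding the (busToPosition, userToPositionPerClus) tuple,
# and one cluster_matrix_<i>.p per cluster containing MS[i].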
#print "factorizing matrices..."
sufix_matrix = "cluster_matrix_"
sufix_factors = "factors_"
#for i in range(clusters_k): factorizeMatrix(sufix_matrix + str(i), sufix_factors + str(i))
def factorizeMatrix(fileMatrix, fileFactors, features = features, lamb = lamb, lrate = lrate, maxiters = maxiters, folder = folder):
    print "Factorizing..."
    matrix = pickle.load(open(folder + "/" + fileMatrix + ".p", "rb"))
    N = matrix.todok()
    #(W,H,mymu,bu,bb) = fact_bias.factorize_bias(N,features,lamb,lrate,maxiters)
    factors_and_stuff = fact_bias.factorize_bias(N, features, lamb, lrate, maxiters)
    #more_stuff = (factors_and_stuff, busToPosition, userToPositionPerClus)
    pickle.dump(factors_and_stuff, open(folder + "/" + fileFactors + ".p", "wb"))
#for i in range(clusters_k): factorizeMatrix("cluster_matrix_" + str(i), "factors_" + str(i))
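# Sketch only (a hypothetical helper, not part of the original workflow): how a saved factor
# file can be turned back into a dense prediction matrix, mirroring what computeErrors does
# below. It assumes, as computeErrors does, that factorize_bias returns the tuple
# (W, H, mu, bu, bb): user factors, business factors, global mean, user biases, business biases.
def loadPredictionMatrix(fileFactors, folder = folder):
    (W, H, mymu, bu, bb) = pickle.load(open(folder + "/" + fileFactors + ".p", "rb"))
    # Biased matrix factorization: prediction = global mean + user bias + business bias + W*H.
    return mymu + bu + bb + np.dot(W, H)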
factor_files = ["factors_" + str(i) for i in range(clusters_k)]
def computeErrors(fileNames = factor_files, fileTest = "testData", filePositions = "postions", leng = clusters_k, folder = folder, test_train = test_train):
    print "Building prediction matrices..."
    def newList(n = 5):
        return [0 for _ in range(n)]
    (W, H, mymu, bu, bb) = (newList(leng), newList(leng), newList(leng), newList(leng), newList(leng))
    predictionMatrices = [None for _ in range(leng)]
    for i in range(leng):
        #Load and unpack all the data:
        factors_and_stuff = pickle.load(open(folder + "/" + fileNames[i] + ".p", "rb"))
        (W[i], H[i], mymu[i], bu[i], bb[i]) = factors_and_stuff
        predictionMatrices[i] = mymu[i] + bu[i] + bb[i] + np.dot(W[i], H[i])
    print "Computing errors..."
    total_tests = [0] * leng
    total_error = [0] * leng
    total_serror = [0] * leng
    unmatched_users = 0
    print "Loading positions..."
    (busToPosition, userToPositionPerClus) = pickle.load(open(folder + "/" + filePositions + ".p", "rb"))
    print "Loading test data..."
    test = pickle.load(open(folder + "/" + fileTest + ".p", "rb"))
    #positions = (busToPosition, userToPositionPerClus)
    for (user, reviews) in test.iteritems():
        #row = userToPosition.get(user, None)
        for (bus, stars) in reviews.iteritems():
            (clusterN, index) = busToPosition[bus]
            row = userToPositionPerClus.get(clusterN, {}).get(user, None)
            if row is None:
                #The user has no row in this cluster (probably no training reviews there).
                unmatched_users += 1
            else:
                total_tests[clusterN] += 1
                prediction = predictionMatrices[clusterN][row, index]
                error = np.absolute(stars - prediction)
                total_error[clusterN] += error
                serror = (stars - prediction)**2
                total_serror[clusterN] += serror
    print "Total of ", unmatched_users, " reviews were unpredictable..."
    print "Total error: ", sum(total_error)
    print "Average error per cluster: ", [(total_error[i] / float(total_tests[i])) for i in range(leng)]
    print "Average error: ", (sum(total_error) / float(sum(total_tests)))
    print "Average serror per cluster: ", [(total_serror[i] / float(total_tests[i])) for i in range(leng)]
    print "Average serror: ", (sum(total_serror) / float(sum(total_tests)))
    print "RMSE per cluster: ", [np.sqrt(total_serror[i] / float(total_tests[i])) for i in range(leng)]
    print "RMSE: ", np.sqrt(sum(total_serror) / float(sum(total_tests)))
    print "Parameters:"
    print "Clusters: ", leng, ", train/test: ", test_train - 1, ", k: ", features, ", lambda: ", lamb, ", lrate: ", lrate, ", iterations: ", maxiters
    print "Done. Thank you for playing."
#computeErrors()
def computeOneError(clus_index, fileNames = factor_files, fileTest = "testData", filePositions = "postions", leng = clusters_k, folder = folder, test_train = test_train):
    print "Building prediction matrices..."
    def newList(n = 5):
        return [0 for _ in range(n)]
    (W, H, mymu, bu, bb) = (newList(leng), newList(leng), newList(leng), newList(leng), newList(leng))
    predictionMatrices = [None for _ in range(leng)]
    for i in range(leng):
        if i == clus_index:
            #Load and unpack all the data:
            factors_and_stuff = pickle.load(open(folder + "/" + fileNames[i] + ".p", "rb"))
            (W[i], H[i], mymu[i], bu[i], bb[i]) = factors_and_stuff
            predictionMatrices[i] = mymu[i] + bu[i] + bb[i] + np.dot(W[i], H[i])
    print "Computing errors..."
    total_tests = [0] * leng
    total_error = [0] * leng
    total_serror = [0] * leng
    unmatched_users = 0
    print "Loading positions..."
    (busToPosition, userToPositionPerClus) = pickle.load(open(folder + "/" + filePositions + ".p", "rb"))
    print "Loading test data..."
    test = pickle.load(open(folder + "/" + fileTest + ".p", "rb"))
    #positions = (busToPosition, userToPositionPerClus)
    for (user, reviews) in test.iteritems():
        #row = userToPosition.get(user, None)
        for (bus, stars) in reviews.iteritems():
            (clusterN, index) = busToPosition[bus]
            if clus_index == clusterN:
                row = userToPositionPerClus.get(clusterN, {}).get(user, None)
                if row is None:
                    #The user has no row in this cluster (probably no training reviews there).
                    unmatched_users += 1
                else:
                    total_tests[clusterN] += 1
                    prediction = predictionMatrices[clusterN][row, index]
                    error = np.absolute(stars - prediction)
                    total_error[clusterN] += error
                    serror = (stars - prediction)**2
                    total_serror[clusterN] += serror
    print "Total of ", unmatched_users, " reviews were unpredictable..."
    print "Total error: ", sum(total_error)
    print "Average error of the cluster: ", (total_error[clus_index] / float(total_tests[clus_index]))
    print "Average serror of the cluster: ", (total_serror[clus_index] / float(total_tests[clus_index]))
    print "RMSE of the cluster: ", np.sqrt(total_serror[clus_index] / float(total_tests[clus_index]))
    print "Parameters:"
    print "Clusters: ", leng, ", train/test: ", test_train - 1, ", k: ", features, ", lambda: ", lamb, ", lrate: ", lrate, ", iterations: ", maxiters
    print "Done. Thank you for playing."
#computeOneError(0)
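# A minimal end-to-end driver, sketched from the commented-out calls above (the original
# runs were kicked off by hand); the function name below is not part of the original code,
# and it assumes the output folder already exists.
def runFullPipeline():
    clustering("clusters")
    reviewTrainTest("trainData", "testData")
    clusterMatrices("trainData", "postions", "clusters")
    for i in range(clusters_k):
        factorizeMatrix(sufix_matrix + str(i), sufix_factors + str(i))
    computeErrors()
#runFullPipeline()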