# Copyright 2018 D-Wave Systems Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import dimod
from dwave.system import LeapHybridSampler

from tabulate import tabulate


class DecisionStumpClassifier:
    """Decision tree classifier that operates on a single feature with a single splitting rule.

    The index of the feature used in the decision rule is stored
    relative to the original data frame.
    """

    def __init__(self, X, y, feature_index):
        """Initialize and fit the classifier.

        Args:
            X (array):
                2D array of feature vectors. Note that the array
                contains all features, while the weak classifier
                itself uses only a single feature.
            y (array):
                1D array of class labels, as ints. Labels should be
                +/- 1.
            feature_index (int):
                Index for the feature used by the weak classifier,
                relative to the overall data frame.
        """
        self.i = feature_index

        self.clf = DecisionTreeClassifier(max_depth=1)
        self.clf.fit(X[:, [feature_index]], y)

    def predict(self, X):
        """Predict class.

        Args:
            X (array):
                2D array of feature vectors. Note that the array
                contains all features, while the weak classifier
                itself will make a prediction based on only a single
                feature.

        Returns:
            Array of class labels.
        """
        return self.clf.predict(X[:, [self.i]])


def _build_H(classifiers, X, output_scale):
    """Construct matrix of weak classifier predictions on given set of input vectors."""
    # H has shape (n_samples, n_classifiers): one column per weak classifier.
    H = np.array([clf.predict(X) for clf in classifiers], dtype=float).T

    # Rescale H
    H *= output_scale

    return H


class EnsembleClassifier:
    """Ensemble of weak classifiers."""

    def __init__(self, weak_classifiers, weights, weak_classifier_scaling, offset=1e-9):
        """Initialize ensemble from list of weak classifiers and weights.

        Args:
            weak_classifiers (list):
                List of classifier instances.
            weights (array):
                Weights associated with the weak classifiers.
            weak_classifier_scaling (float):
                Scaling for weak classifier outputs.
            offset (float):
                Offset value for ensemble classifier. The default
                value is a small positive number used to prevent
                ambiguous 0 predictions when weak classifiers exactly
                balance each other out.
        """
        self.classifiers = weak_classifiers
        self.w = weights
        self.weak_clf_scale = weak_classifier_scaling
        self.offset = offset

    def predict(self, X):
        """Compute ensemble prediction.

        Note that this function returns the numerical value of the
        ensemble predictor, not the class label. The predicted class
        is sign(predict()).
        """
        H = _build_H(self.classifiers, X, self.weak_clf_scale)

        # If we've already filtered out those with w=0 and we are only
        # using binary weights, this is just a sum
        preds = np.dot(H, self.w)

        return preds - self.offset

    def predict_class(self, X):
        """Compute ensemble prediction of class label."""
        preds = self.predict(X)

        # Add a small perturbation to any predictions that are exactly
        # 0, because these will not count towards either class when
        # passed through the sign function. Such zero predictions can
        # happen when the weak classifiers exactly balance each other
        # out.
        preds[preds == 0] = 1e-9

        return np.sign(preds)

    def score(self, X, y):
        """Compute accuracy score on given data."""
        if sum(self.w) == 0:
            # Avoid difficulties that occur with handling this below
            return 0.0
        return accuracy_score(y, self.predict_class(X))

    def squared_error(self, X, y):
        """Compute squared error between predicted and true labels.

        Provided for testing purposes.
        """
        p = self.predict(X)
        return sum((p - y)**2)

    def fit_offset(self, X):
        """Fit offset value based on class-balanced feature vectors.

        Currently, this assumes that the feature vectors in X
        correspond to an even split between both classes.
        """
        self.offset = 0.0
        # TODO: review whether it would be appropriate to subtract
        # mean(y) here to account for unbalanced classes.
        self.offset = np.mean(self.predict(X))

    def get_selected_features(self):
        """Return list of features corresponding to the selected weak classifiers."""
        return [clf.i for clf, w in zip(self.classifiers, self.w) if w > 0]


class AllStumpsClassifier(EnsembleClassifier):
    """Ensemble classifier with one decision stump for each feature."""

    def __init__(self, X, y):
        if not all(np.isin(y, [-1, 1])):
            raise ValueError("Class labels should be +/- 1")

        num_features = np.size(X, 1)
        classifiers = [DecisionStumpClassifier(X, y, i)
                       for i in range(num_features)]

        # Note: the weak classifier output scaling is arbitrary in
        # this case and does not affect the predictions.
        super().__init__(classifiers, np.ones(num_features), 1/num_features)
        self.fit_offset(X)


def _build_bqm(H, y, lam):
    """Build BQM.

    Args:
        H (array):
            2D array of weak classifier predictions. Each row is a
            sample point, each column is a classifier.
        y (array):
            1D array of true class labels (+/- 1).
        lam (float):
            Coefficient that controls strength of regularization term
            (larger values encourage decreased model complexity).
    """
    n_samples = np.size(H, 0)
    n_classifiers = np.size(H, 1)

    # samples_factor is a factor that appears in front of the squared
    # loss term in the objective. In theory, it does not affect the
    # problem solution, but it does affect the relative weighting of
    # the loss and regularization terms, which is otherwise absorbed
    # into the lambda parameter. Using an average seems more
    # intuitive; otherwise, lambda is sample-size dependent.
    samples_factor = 1.0 / n_samples

    bqm = dimod.BQM('BINARY')
    bqm.offset = samples_factor * n_samples
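
    # For reference, the objective being minimized (for labels y_s = +/- 1) is
    #
    #     E(w) = (1/S) * sum_s (sum_i w_i * H[s, i] - y_s)**2 + lam * sum_i w_i
    #
    # over binary weights w_i. Expanding the square (and using w_i**2 = w_i and
    # y_s**2 = 1) yields the constant offset above (equal to 1), the linear
    # coefficients lam - (2/S)*dot(H[:, i], y) + (1/S)*dot(H[:, i], H[:, i]),
    # and the pairwise couplings (2/S)*dot(H[:, i], H[:, j]) added below.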
    for i in range(n_classifiers):
        # Note: the last term with h_i^2 is part of the first term in
        # Eq. (12) of Neven et al. (2008), where i=j.
        bqm.add_variable(i, lam - 2.0 * samples_factor * np.dot(H[:, i], y)
                         + samples_factor * np.dot(H[:, i], H[:, i]))

    for i in range(n_classifiers):
        for j in range(i + 1, n_classifiers):
            # Relative to Eq. (12) from Neven et al. (2008), the
            # factor of 2 appears here because each term appears twice
            # in a sum over all i,j.
            bqm.add_interaction(
                i, j, 2.0 * samples_factor * np.dot(H[:, i], H[:, j]))

    return bqm


def _minimize_squared_loss_binary(H, y, lam):
    """Minimize squared loss using binary weight variables."""
    bqm = _build_bqm(H, y, lam)

    sampler = LeapHybridSampler()
    results = sampler.sample(bqm, label='Example - QBoost')

    weights = np.array(list(results.first.sample.values()))
    energy = results.first.energy

    return weights, energy


class QBoostClassifier(EnsembleClassifier):
    """Construct an ensemble classifier using quadratic loss minimization."""

    def __init__(self, X, y, lam, weak_clf_scale=None, drop_unused=True):
        """Initialize and fit QBoost classifier.

        X should already include all candidate features (e.g., interactions).

        Args:
            X (array):
                2D array of feature vectors.
            y (array):
                1D array of class labels (+/- 1).
            lam (float):
                Regularization parameter.
            weak_clf_scale (float or None):
                Scale factor to apply to weak classifier outputs. If
                None, scale by 1/num_classifiers.
            drop_unused (bool):
                If True, only retain the classifiers with nonzero weights.
        """
        if not all(np.isin(y, [-1, 1])):
            raise ValueError("Class labels should be +/- 1")

        num_features = np.size(X, 1)

        if weak_clf_scale is None:
            weak_clf_scale = 1 / num_features

        wclf_candidates = [DecisionStumpClassifier(X, y, i)
                           for i in range(num_features)]

        H = _build_H(wclf_candidates, X, weak_clf_scale)

        # For reference, store individual weak classifier scores.
        # Note: we don't check equality h==y here because H might be rescaled.
        self.weak_scores = np.array([np.mean(np.sign(h) * y > 0) for h in H.T])

        weights, self.energy = _minimize_squared_loss_binary(H, y, lam)

        # Store only the selected classifiers
        if drop_unused:
            weak_classifiers = [wclf for wclf, w in zip(wclf_candidates, weights)
                                if w > 0]
            weights = weights[weights > 0]
        else:
            weak_classifiers = wclf_candidates

        super().__init__(weak_classifiers, weights, weak_clf_scale)
        self.fit_offset(X)

        # Save candidates so we can provide a baseline accuracy report.
        self._wclf_candidates = wclf_candidates

    def report_baseline(self, X, y):
        """Report accuracy of weak classifiers.

        This provides context for interpreting the performance of the
        boosted classifier.
        """
        scores = np.array([accuracy_score(y, clf.predict(X))
                           for clf in self._wclf_candidates])
        data = [[len(scores), scores.min(), scores.mean(), scores.max(), scores.std()]]
        headers = ['count', 'min', 'mean', 'max', 'std']
        print('Accuracy of weak classifiers (score on test set):')
        print(tabulate(data, headers=headers, floatfmt='.3f'))
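

# Minimal direct-usage sketch (illustrative only; assumes a configured D-Wave
# Leap hybrid solver for LeapHybridSampler and data X_train, y_train, X_test,
# y_test with labels +/- 1):
#
#     clf = QBoostClassifier(X_train, y_train, lam=0.01)
#     print(clf.get_selected_features(), clf.score(X_test, y_test))
#
# In practice, the helper below sweeps lambda and keeps the model with the
# best validation score.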
def qboost_lambda_sweep(X, y, lambda_vals, val_fraction=0.4, verbose=False, **kwargs):
    """Run QBoost using a series of lambda values and check accuracy against a validation set.

    Args:
        X (array):
            2D array of feature vectors.
        y (array):
            1D array of class labels (+/- 1).
        lambda_vals (array):
            Array of values for the regularization parameter, lambda.
        val_fraction (float):
            Fraction of given data to set aside for validation.
        verbose (bool):
            Print out diagnostic information to screen.
        kwargs:
            Passed to QBoostClassifier.__init__.

    Returns:
        QBoostClassifier:
            QBoost instance with best validation score.
        lambda:
            Lambda value corresponding to the best validation score.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=val_fraction)

    best_score = -1
    best_lambda = None
    best_clf = None

    if verbose:
        print('{:<7} {:<10} {:<6}'.format('lambda', 'n_features', 'score'))

    for lam in lambda_vals:
        qb = QBoostClassifier(X_train, y_train, lam, **kwargs)
        score = qb.score(X_val, y_val)
        if verbose:
            print('{:<7.4f} {:<10} {:<6.3f}'.format(
                lam, len(qb.get_selected_features()), score))
        if score > best_score:
            best_score = score
            best_clf = qb
            best_lambda = lam

    return best_clf, best_lambda
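

# Illustrative usage sketch: fits the unweighted all-stumps baseline and runs a
# lambda sweep on synthetic data. The synthetic dataset, split, and lambda grid
# below are assumptions for demonstration; running this requires access to a
# D-Wave Leap hybrid solver (a configured Leap API token).
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    # Synthetic binary classification problem; labels remapped to +/- 1.
    X, y = make_classification(n_samples=400, n_features=30,
                               n_informative=6, random_state=0)
    y = 2 * y - 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)

    # Baseline: equal-weight ensemble over all decision stumps.
    baseline = AllStumpsClassifier(X_train, y_train)
    print('Baseline accuracy:', baseline.score(X_test, y_test))

    # QBoost: sweep the regularization parameter and keep the best model.
    lambda_vals = np.linspace(0.0, 0.1, 5)
    clf, best_lam = qboost_lambda_sweep(X_train, y_train, lambda_vals,
                                        verbose=True)

    clf.report_baseline(X_test, y_test)
    print('Selected features:', clf.get_selected_features())
    print('QBoost accuracy:', clf.score(X_test, y_test))
    print('Best lambda:', best_lam)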