-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathBOWClassifier.py
275 lines (233 loc) · 8.68 KB
/
BOWClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import json
import copy
import re
import numpy as np
def get_words(text):
    """
    Parameters:
    text: a string of text which needs to be processed
    Returns:
    string of all words extracted from the input string
    """
    # Lowercase, split on whitespace, and keep only purely alphabetic
    # tokens (drops digits and anything with punctuation attached).
    tokens = text.lower().split()
    return " ".join(tok for tok in tokens if re.fullmatch(r"[a-z]+", tok))
def set_up_training_data(dataset):
    """
    Prepares a dataset to be fit to the classifier.
    Keyword arguments:
    dataset -- A dataset of proper format
    Returns: The training data
    """
    theme_index = {}  # theme name -> score-vector index
    targets = []      # per-project lists of theme indices, parallel to text
    texts = []
    urls = []
    for project in dataset:
        # Skip projects with empty text entirely (no url/target recorded).
        if len(project["text"]) == 0:
            continue
        urls.append(project["url"])
        texts.append(get_words(project["text"]))
        row = []
        for theme in project["themes"]:
            name = theme["name"]
            # Assign the next free index the first time a theme appears.
            if name not in theme_index:
                theme_index[name] = len(theme_index)
            row.append(theme_index[name])
        targets.append(row)
    return {
        "themes": theme_index,
        "targets": targets,
        "urls": urls,
        "text": texts,
    }
class BOWClassifier:
    """
    A bag of words classifier to predict the themes of a non-profit organization given text.

    Each theme has a dictionary of words with tf-idf weights; a document's
    score for a theme is the sum of the weights of its words found in that
    theme's dictionary.  Every theme whose score reaches the 6th-highest
    score is predicted (multi-label 0/1 vector indexed via ``self.themes``).

    Methods:
    __init__(self, train_data: dict, dict_data: dict)
    predict_set(self, testing_data: dict)
    predict_org(self, text: str)
    save_predictions(self, output_file: str)
    load_predictions(self, predictions: dict)
    get_predictions(self)
    get_f1_score(self)
    load_targets(self, target_data: dict)
    """

    def __init__(self, train_data: dict, dict_data: dict):
        """
        Initializes bag of words classifier object
        :param dict train_data: training data; must contain a "themes" dict
            mapping theme names to score-vector indices
        :param dict dict_data: dictionary of category words; maps each theme
            to a dict of words, each word to a dict holding a "tf-idf" score
        :raises AssertionError: if dict_data lacks a theme key or the sampled
            word lacks a (truthy) tf-idf score
        """
        # All state is per-instance (previously these were mutable class
        # attributes shared across instances).
        self.themes = train_data["themes"]  # theme name -> index in score vector
        self.dictionary = dict_data         # theme -> {word: {"tf-idf": float}}
        self.predictions = None             # set by predict_set / load_predictions
        self.testing_data = None            # optional testing dataset
        self.testing_targets = None         # set by load_targets

        # ensuring dictionary is correctly structured
        for theme in self.themes:
            assert self.dictionary.get(
                theme
            ), "dictionary does not contain proper theme keys."
        theme = next(iter(self.themes))
        word = next(iter(self.dictionary[theme]))
        # NOTE: truthiness check — a legitimate tf-idf of exactly 0 on the
        # sampled word would also trip this assert.
        assert (
            self.dictionary.get(theme).get(word).get("tf-idf")
        ), "dictionary does not contain tf-idf score"

    def predict_set(self, testing_data: dict):
        """
        Predicts a set of organizations
        :param dict testing_data: the testing dataset to be predicted;
            entries without a "text" key are skipped
        :return: A list of lists of predicted scores (also stored in
            ``self.predictions``)
        """
        assert self.dictionary
        self.predictions = []
        # predicting organizations
        for project in testing_data:
            if project.get("text") is not None:
                self.predictions.append(self.predict_org(project["text"]))
        return self.predictions

    def predict_org(self, text: str):
        """
        Predicts an organizations themes based off text
        :param str text: text from the organization
        :return: 0/1 list indexed by ``self.themes``; a 1 marks a predicted theme
        """
        # BUG FIX: the original iterated ``for word in text`` over the raw
        # string, which walks it character by character and can never match
        # the multi-character dictionary words.  Split into words instead,
        # lowercased to match the training preprocessing in get_words().
        words = text.lower().split()

        # One score slot per known theme (was a hard-coded 18, which raised
        # IndexError for larger theme sets; identical for 18 themes).
        scores = [0] * len(self.themes)

        # calculating sum of relevant words in each category
        for word in words:
            for category, category_words in self.dictionary.items():
                entry = category_words.get(word)
                if entry is not None:
                    scores[self.themes[category]] += entry.get("tf-idf")

        # Threshold at the 6th-highest score (guarded for <6 themes, where
        # the original indexing would raise); predict everything at or above it.
        ordered = sorted(scores)
        threshold = ordered[-6] if len(ordered) >= 6 else ordered[0]
        return [1 if score >= threshold else 0 for score in scores]

    def save_predictions(self, output_file: str):
        """
        Stores predictions made by the bag of words model
        :param str output_file: json file name for predictions to be stored in
        :raises AssertionError: if predictions have not been made yet
        """
        assert self.predictions, "predictions have not been made yet"
        with open(output_file, "w") as predictions_json:
            json.dump(
                self.predictions,
                predictions_json,
                sort_keys=True,
                indent=2,
                ensure_ascii=False,
            )

    def load_predictions(self, predictions: dict):
        """
        Load predictions from file
        :param dict predictions: dict of predictions
        """
        self.predictions = predictions

    def get_predictions(self):
        """Returns the stored predictions (None if none made or loaded)."""
        return self.predictions

    @staticmethod
    def _f1(tp: int, fp: int, fn: int) -> float:
        """F1 score from confusion counts; 0 when precision + recall is 0."""
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        if precision + recall == 0:
            return 0
        return 2 * (precision * recall) / (precision + recall)

    def get_f1_score(self):
        """
        Returns the f1 score for the predictions by organization and category
        :return: tuple of (mean per-organization F1, dict mapping theme name
            to that theme's F1 across all organizations)
        :raises AssertionError: if targets have not been loaded
        """
        # output mean f1 score by document, then category
        assert self.testing_targets, "targets not loaded"
        testing_targets = self.testing_targets

        # for every document, calculate the confusion counts, then f1 score
        org_f1_scores = []
        for predicted, target in zip(self.predictions, testing_targets):
            tp = fp = fn = 0
            for j, flag in enumerate(predicted):
                actual = j in target
                if flag == 1:
                    if actual:
                        tp += 1
                    else:
                        fp += 1
                elif actual:
                    fn += 1
            org_f1_scores.append(self._f1(tp, fp, fn))

        # per-category f1: confusion counts for one theme across all documents
        category_f1_scores = {}
        for theme_name, theme_number in self.themes.items():
            tp = fp = fn = 0
            for predicted, target in zip(self.predictions, testing_targets):
                flagged = predicted[theme_number] == 1
                actual = theme_number in target
                if flagged and actual:
                    tp += 1
                elif flagged:
                    fp += 1
                elif actual:
                    fn += 1
            category_f1_scores[theme_name] = self._f1(tp, fp, fn)

        return np.mean(np.array(org_f1_scores)), category_f1_scores

    def load_targets(self, target_data: dict):
        """
        Load the testing targets (correct classifications) for testing dataset
        Predictions made by the classifier are stored in a list of lists of the predicted scores.
        The testing targets to be loaded should also be a list of lists of predicted scores, with each index
        corresponding to the same organization in the predictions list.
        :param dict target_data: dictionary containing target data for testing dataset
        :raises AssertionError: if the "targets" key is missing or empty
        """
        assert target_data.get("targets"), "targets parameter needed in dictionary"
        self.testing_targets = target_data["targets"]