-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassify.py
executable file
·126 lines (102 loc) · 5.15 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
import logging
import time
import numpy as np
import sklearn.model_selection as ms
import preprocess
import svm
import mlp
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neural_network import MLPClassifier
def predict_clust(clust, dict_of_models, X, y, n_features=1000):
    """Score every model in *dict_of_models* on ``(X, y)`` and write a report.

    ``X`` is right-padded with zero columns until it has ``n_features``
    columns. Each model's predictions on the padded matrix are then compared
    against ``y`` with macro-averaged precision, recall and F-score, and the
    figures are written as percentages (two decimals) to
    ``cluster{clust}_results.txt`` in the current working directory. An
    existing file of that name is overwritten.

    Args:
        clust: Cluster identifier; only used to build the output file name.
        dict_of_models: Mapping of display label to an object exposing
            ``predict(X)``.
        X: 2-D feature matrix (``X.shape[1]`` is the feature count).
        y: True labels passed to ``precision_recall_fscore_support``.
        n_features: Column count ``X`` is padded up to. A matrix that is
            already at least this wide is passed to the models unchanged.
    """
    # Pad with a single allocation; appending one zero column per iteration
    # re-copied the whole matrix on every pass.
    missing = n_features - X.shape[1]
    if missing > 0:
        zero_block = np.zeros((X.shape[0], missing))
        X = np.concatenate((X, zero_block), axis=1)

    path = f"cluster{clust}_results.txt"
    with open(path, "w") as sink:
        for label, model in dict_of_models.items():
            y_pred = model.predict(X)
            stats = precision_recall_fscore_support(y, y_pred, average="macro")
            # The fourth element (support) is not reported.
            precision, recall, fscore, _ = stats
            print(f"{label} CLASSIFIER", file=sink)
            print(f"Precision: {precision * 100:.2f}", file=sink)
            print(f"Recall: {recall * 100:.2f}", file=sink)
            print(f"F-Score: {fscore * 100:.2f}", file=sink)
            print("", file=sink)
def _log_elapsed(start, message):
    """Log *message* prefixed by the duration elapsed since *start*."""
    logging.info(f"{preprocess.get_duration(start, time.time()):>20}\t{message}")


def _write_scores(sink, scores, classifier_name):
    """Write one precision/recall/F-score report per cluster to *sink*."""
    for clust, stats in scores.items():
        # The fourth element (support) is not reported.
        precision, recall, fscore, _ = stats
        print(f"CLUSTER {clust} RESULTS - {classifier_name} classifier", file=sink)
        print(f"Precision: {precision * 100:.2f}", file=sink)
        print(f"Recall: {recall * 100:.2f}", file=sink)
        print(f"F-Score: {fscore * 100:.2f}", file=sink)
        print("", file=sink)


def main():
    """Train and evaluate an SVM and an MLP per text cluster, then report.

    Texts and labels are grouped by cluster id. For each cluster a
    bag-of-words matrix is built, min-max scaled, reduced to 1000 components
    with ``TruncatedSVD`` when it is wider than that, and split 80/20 into
    train and test sets. A linear ``SVC`` and an ``MLPClassifier`` are fitted
    with hyper-parameters chosen by ``svm.get_best_reg`` and
    ``mlp.get_best_params``; macro-averaged test scores are collected and
    finally written to ``results.txt``.

    Models of every cluster except cluster 2 are kept in ``all_models``;
    cluster 2's full data set is additionally scored against those models via
    ``predict_clust``.
    """
    logging.info(f"{'':15}Loading data...")
    st = time.time()
    _, texts, _, labels = preprocess.get_all_texts()
    clusters = preprocess.get_clusters()
    _log_elapsed(st, "Data loaded...")

    st = time.time()
    clustered_texts = defaultdict(list)
    for text, label, cluster in zip(texts, labels, clusters):
        clustered_texts[cluster].append((text, label))
    _log_elapsed(st, "Clusters separated...")

    cluster_scores_svm = {}
    cluster_scores_mlp = {}
    all_models = {}
    for clust, content in clustered_texts.items():
        logging.info(f"{'':15}Training cluster {clust}...")
        texts_list = [text for text, _ in content]
        y = np.array([label for _, label in content])

        st = time.time()
        bow = preprocess.get_bow(texts_list, sw="english")
        _log_elapsed(st, "BoW extracted...")
        logging.info(f"{'':23}\t{bow.shape}")

        st = time.time()
        X = MinMaxScaler().fit_transform(bow)
        if X.shape[1] > 1000:
            X = TruncatedSVD(n_components=1000).fit_transform(X)
        _log_elapsed(st, "Dimensions reduced...")
        logging.info(f"{'':23}\t{X.shape}")

        st = time.time()
        X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=.2)
        _log_elapsed(st, "Train and dev sets created...")

        best_reg = svm.get_best_reg(clust, X_train, X_test, y_train, y_test)
        st = time.time()
        svm_model = SVC(C=best_reg, kernel="linear").fit(X_train, y_train)
        if clust != 2:
            all_models[f"SVM - cluster {clust}"] = svm_model
        r_test_pred = svm_model.predict(X_test)
        test_stats = precision_recall_fscore_support(y_test, r_test_pred, average="macro")
        cluster_scores_svm[clust] = test_stats
        _log_elapsed(st, "SVM stats calculated with best reg...")
        logging.info(f"{'':>23}\t{test_stats}")

        mlp_alpha = mlp.get_best_params(clust + 100, X_train, y_train)
        st = time.time()
        # verbose must be the boolean False: the string "False" is truthy and
        # is rejected by scikit-learn's parameter validation.
        mlp_model = MLPClassifier(alpha=mlp_alpha, verbose=False, activation="relu").fit(X_train, y_train)
        if clust != 2:
            all_models[f"MLP - cluster {clust}"] = mlp_model
        m_test_pred = mlp_model.predict(X_test)
        test_stats_2 = precision_recall_fscore_support(y_test, m_test_pred, average="macro")
        cluster_scores_mlp[clust] = test_stats_2
        _log_elapsed(st, "MLP stats calculated with best reg...")
        logging.info(f"{'':>23}\t{test_stats_2}")

        if clust == 2:
            # NOTE(review): all_models only holds models of clusters iterated
            # before cluster 2 (dict order = first appearance in the data);
            # confirm that cluster 2 is expected to come last.
            st = time.time()
            predict_clust(clust, all_models, X, y)
            _log_elapsed(st, "Supplementary predictions recorded...")

    with open("results.txt", "w") as sink:
        _write_scores(sink, cluster_scores_svm, "SVM")
        _write_scores(sink, cluster_scores_mlp, "MLP")
if __name__ == "__main__":
    # Script entry point: log INFO and above as "LEVEL: message", then run
    # the full training/evaluation pipeline.
    logging.basicConfig(
        format="%(levelname)s: %(message)s",
        level="INFO",
    )
    main()