-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoutput_conf_mats.py
222 lines (204 loc) · 7.89 KB
/
output_conf_mats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import os
import sys
import csv
import itertools
import pickle
import argparse
import random
from math import ceil
import datetime
import numpy as np
import settings as s
import keras
from keras import layers, regularizers
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
# Default settings. BATCH_SIZE, EPOCHS, PHONEME_TO_TRAIN and FEATURE_TYPE are
# the defaults of the matching command-line options parsed further down.
BATCH_SIZE = 32
TEST_SPLIT = 0.8
MIN_LENGTH = 50  # data_generator skips every length bucket below this value
EPOCHS = 10
PHONEME_TO_TRAIN = "t"
FEATURE_SIZE = 66
FEATURE_TYPE = "AutoVOT"

# Examples whose speaker id appears here are routed to the test split.
TEST_SPEAKERS = [
    "DAB0", "WBT0", "ELC0", "TAS1", "WEW0", "PAS0", "JMP0", "LNT0",
    "PKT0", "LLL0", "TLS0", "JLM0", "BPM0", "KLT0", "NLP0", "CMJ0",
    "JDH0", "MGD0", "GRT0", "NJM0", "DHC0", "JLN0", "PAM0", "MLD0",
]
# Command-line interface. Each option defaults to a module-level constant, and
# the parsed values are written back over those constants afterwards so the
# rest of the script only ever reads the constants.
parser = argparse.ArgumentParser(description='Train allophone classifier')
parser.add_argument('--phoneme', default=PHONEME_TO_TRAIN, help='Phoneme to train')
parser.add_argument('--epochs', default=EPOCHS, type=int, help='Number of epochs to train for')
parser.add_argument('--feature_type', default=FEATURE_TYPE, help='Which feature to use')
# The value is the size of each batch (it is used as the slice step in
# data_generator), not a count of batches.
parser.add_argument('--batch_size', default=BATCH_SIZE, type=int,
                    help='Number of examples per batch')
# Restrict to the position names the data-loading loop actually produces;
# anything else would silently filter out every row.
parser.add_argument('--position', default="no_pos",
                    choices=["no_pos", "initial", "syllable", "medial"],
                    help='Word position')
args = parser.parse_args()
if args.batch_size < 1:
    # A zero or negative step would break the batching range() downstream.
    parser.error("--batch_size must be a positive integer")

BATCH_SIZE = args.batch_size
EPOCHS = args.epochs
FEATURE_TYPE = args.feature_type
POSITION = args.position
PHONEME_TO_TRAIN = args.phoneme
def get_labels(phoneme):
    """Return the label set used when classifying allophones of *phoneme*.

    Returns a 3-tuple ``(labels, allophone_classes, valid_phonemes)``:

    * ``labels`` - ordered class names; the last entry is always "other".
    * ``allophone_classes`` - dict mapping a phone symbol to the class name
      it is counted as (for "t" and "d", "nx" is counted as "dx").
    * ``valid_phonemes`` - underlying-phoneme strings accepted for this
      phoneme.
    """
    release = "{}rl".format(phoneme)
    closure = "{}cl".format(phoneme)

    # Generic case: only the phoneme itself, its release/closure variants
    # and "-" get their own class.
    if phoneme not in ("t", "d"):
        own_class = [phoneme, release, "-", closure]
        return (
            [phoneme, release, closure, "-", "other"],
            {symbol: symbol for symbol in own_class},
            [phoneme],
        )

    # "t" and "d" additionally have a "dx" class, which also absorbs "nx".
    allophone_classes = {
        symbol: symbol for symbol in (release, phoneme, closure, "-", "dx")
    }
    allophone_classes["nx"] = "dx"
    labels = [phoneme, release, closure, "-", "dx"]

    # Only "t" has the extra "q" class.
    if phoneme == "t":
        allophone_classes["q"] = "q"
        labels.append("q")

    labels.append("other")
    valid_phonemes = [phoneme, "n_" + phoneme, phoneme + "_y"]
    return labels, allophone_classes, valid_phonemes
def evaluate_model(truth, predictions, classes):
    """Print a LaTeX figure containing the confusion matrix and accuracy.

    Args:
        truth: sequence of true class indices (indices into ``classes``).
        predictions: sequence of predicted class indices, same length as
            ``truth``.
        classes: ordered class names; entry ``i`` labels row and column ``i``.

    The figure caption uses the module-level ``PHONEME_TO_TRAIN``. Nothing is
    returned; the LaTeX source is written to stdout.
    """
    truth = np.asarray(truth)
    predictions = np.asarray(predictions)

    # Pin the label set. Without it sklearn only keeps labels that occur in
    # the data, so a class missing from both inputs would shrink the matrix
    # and shift every later row/column away from its name in `classes`.
    conf_mat = confusion_matrix(
        truth, predictions, labels=list(range(len(classes)))
    )

    # `\\items` is written with an escaped backslash; the runtime text is the
    # LaTeX macro \items.
    parts = ["""
\\begin{{figure}}[htb!]
\\begin{{center}}
\\newcommand\\items{{{}}} %Number of classes
\\arrayrulecolor{{white}} %Table line colors
\\noindent
\\begin{{tabular}}{{cc*{{\\items}}{{|E}}|}}
\\multicolumn{{1}}{{c}}{{}} &\\multicolumn{{1}}{{c}}{{}} &\\multicolumn{{\\items}}{{c}}{{Predicted}} \\\\ \\hhline{{~*\\items{{|-}}|}}
\\multicolumn{{1}}{{c}}{{}} &
\\multicolumn{{1}}{{c}}{{}} &
""".format(len(classes))]

    # Column headers: one rotated cell per class, separated by " &".
    parts.append(" &".join(
        " \\multicolumn{{1}}{{c}}{{\\rot{{{}}}}}".format(clss) for clss in classes
    ))
    parts.append("""
\\\\ \\hhline{~*\\items{|-}|}
\\multirow{\\items}{*}{\\rotatebox{90}{Actual}}
""")

    # One table row per actual class: its name followed by the counts.
    for clss, row in zip(classes, conf_mat):
        parts.append("&{}".format(clss))
        parts.extend("& {}".format(cell) for cell in row)
        parts.append("\\\\ \\hhline{~*\\items{|-}|}\n")

    # Percentage of positions where prediction and truth agree.
    accuracy = 100 * (np.count_nonzero(truth == predictions) / len(predictions))
    parts.append("""
\\end{{tabular}}
\\caption{{Confusion matrix for /{}/, accuracy={:.2f}\\%}}
\\end{{center}}
\\end{{figure}}
""".format(PHONEME_TO_TRAIN, accuracy))

    print("".join(parts))
def data_generator(data, batch_size=BATCH_SIZE, do_shuffle=False, sample_weight=False):
    """Yield ``(batch_x, batch_y)`` pairs forever, cycling over ``data``.

    Args:
        data: dict mapping a length to a list of example dicts; each example
            must provide "x_path" (a ``.npy`` file path) and "allophone".
        batch_size: maximum number of examples per batch. Batches never mix
            examples from different length buckets.
        do_shuffle: when true, the bucket order and the example order inside
            each bucket are reshuffled on every pass.
        sample_weight: accepted for compatibility; currently ignored.

    Yields:
        batch_x: the stacked, normalised feature arrays of the batch.
        batch_y: one-hot array of shape (batch, number_of_classes), where the
            class index comes from the module-level ``allophone_mapping``.

    Raises:
        ValueError: if a complete pass over ``data`` produces no batch (for
            example every bucket is empty or shorter than ``MIN_LENGTH``),
            which would otherwise loop forever without yielding.
    """
    # Row 0 is subtracted and row 1 divides - presumably mean and standard
    # deviation; TODO confirm against the script that writes this file.
    z_scores = np.load(os.path.join(s.OUTPUT_DIR, "zscores_new.npy"))
    rand = random.Random()
    length_order = list(data.keys())
    # Normalised features keyed by file path, so later passes skip disk I/O.
    path_cache = {}
    number_of_classes = max(allophone_mapping.values()) + 1

    while True:
        yielded_any = False
        if do_shuffle:
            rand.shuffle(length_order)
        for length in length_order:
            if length < MIN_LENGTH:
                continue
            examples = data[length]
            if do_shuffle:
                # sample() returns a shuffled copy and leaves `data` untouched.
                examples = rand.sample(examples, len(examples))
            for start in range(0, len(examples), batch_size):
                chunk = examples[start:start + batch_size]
                batch_x = []
                batch_y = np.zeros((len(chunk), number_of_classes))
                for row, phone in enumerate(chunk):
                    path = phone["x_path"]
                    phone_x = path_cache.get(path)
                    if phone_x is None:
                        phone_x = np.nan_to_num(np.load(path))
                        phone_x = (phone_x - z_scores[0]) / z_scores[1]
                        path_cache[path] = phone_x
                    batch_x.append(phone_x)
                    batch_y[row, allophone_mapping[phone["allophone"]]] = 1
                yield np.stack(batch_x, axis=0), batch_y
                yielded_any = True
        if not yielded_any:
            raise ValueError(
                "data_generator: no examples with length >= {}".format(MIN_LENGTH)
            )
# ---------------------------------------------------------------------------
# Evaluation script: load the examples listed in timit_data.csv, split them by
# speaker, run the saved model on the test split and print a LaTeX confusion
# matrix.
# ---------------------------------------------------------------------------
data = {}               # length -> list of example dicts
allophone_mapping = {}  # phone symbol -> class index (filled in two steps)
LABELS, ALLOPHONE_CLASSES, VALID_PHONEMES = get_labels(PHONEME_TO_TRAIN)
# The next three names are not used below; kept so the module's globals are
# unchanged.
previous_phoneme = "x"
count = 0
rand_p = random.Random(39)

with open("timit_data.csv", "r", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        phoneme = row["underlying_phoneme"].lower()
        # Rows for other phonemes are dropped, except that "q" phones are
        # always kept when evaluating "t".
        keep_q_for_t = PHONEME_TO_TRAIN == "t" and row["phone"] == "q"
        if phoneme not in VALID_PHONEMES and not keep_q_for_t:
            continue

        # Boundary codes "2"/"3" -> initial, "1" -> syllable, else medial.
        if row["boundary"] in ("3", "2"):
            pos = "initial"
        elif row["boundary"] == "1":
            pos = "syllable"
        else:
            pos = "medial"
        if POSITION != "no_pos" and pos != POSITION:
            continue

        allophone = row["phone"]
        if allophone not in allophone_mapping:
            # Provisional index; replaced by the LABELS index after the loop.
            allophone_mapping[allophone] = len(allophone_mapping)

        # Examples are bucketed by the first dimension of their feature array.
        length = np.load(row["numpy_X"]).shape[0]
        data.setdefault(length, []).append({
            "allophone": allophone,
            "position": pos,
            "phoneme": phoneme,
            "x_path": row["numpy_X"],
            "speaker": row["path"].split("/")[-2][1:],
        })

# Replace the provisional indices with positions in LABELS; phones without an
# entry in ALLOPHONE_CLASSES fall into the "other" class.
other_index = LABELS.index("other")
for name in list(allophone_mapping):
    if name in ALLOPHONE_CLASSES:
        allophone_mapping[name] = LABELS.index(ALLOPHONE_CLASSES[name])
    else:
        allophone_mapping[name] = other_index

# Split every length bucket by speaker and count examples per class.
train_data = {}
test_data = {}
numbers_test = {k: 0 for k in allophone_mapping.values()}
numbers_train = {k: 0 for k in allophone_mapping.values()}
test_speakers = set(TEST_SPEAKERS)
for length, examples in data.items():
    test_data[length] = []
    train_data[length] = []
    for example in examples:
        class_index = allophone_mapping[example["allophone"]]
        if example["speaker"] in test_speakers:
            test_data[length].append(example)
            numbers_test[class_index] += 1
        else:
            numbers_train[class_index] += 1
            train_data[length].append(example)

# Number of batches in one pass of data_generator. Buckets shorter than
# MIN_LENGTH are skipped by the generator, so they must not be counted here;
# otherwise the generator wraps around and early batches are evaluated twice.
train_size = sum(
    ceil(len(v) / BATCH_SIZE)
    for length, v in train_data.items() if length >= MIN_LENGTH
)
validation_size = sum(
    ceil(len(v) / BATCH_SIZE)
    for length, v in test_data.items() if length >= MIN_LENGTH
)
if validation_size == 0:
    raise SystemExit("No test examples to evaluate for the selected options.")

# Collect the ground-truth labels of exactly one pass over the test data. The
# generator is deterministic without shuffling, so a second generator feeds
# the model the same batches in the same order.
y_s = [y for _, y in itertools.islice(data_generator(test_data), validation_size)]
test_y = np.vstack(y_s)

model = keras.models.load_model("{}_model_max.hd5".format(PHONEME_TO_TRAIN))
# NOTE(review): predict_generator is believed to be deprecated in newer Keras
# in favour of predict(); left unchanged to match the Keras version this
# model was saved with - confirm before upgrading.
pred = model.predict_generator(data_generator(test_data), validation_size)
evaluate_model(np.argmax(test_y, axis=1), np.argmax(pred, axis=1), LABELS)