-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdist1_kmeans1_findA_withkdtree_allForChinese_outPPGSPEC.py
312 lines (245 loc) · 11.7 KB
/
dist1_kmeans1_findA_withkdtree_allForChinese_outPPGSPEC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# 为了验证代码正确性,找ppg英文自己的top3的第3个, 然后把所有的ppg和linear和wav都存起来,共之后对比NN版本
import os
import time
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from audio import hparams as audio_hparams
from audio import load_wav, wav2unnormalized_mfcc, wav2normalized_db_mel, wav2normalized_db_spec
from audio import write_wav, normalized_db_mel2wav, normalized_db_spec2wav
eps = 1e-6
# 超参数个数:16
hparams = {
'sample_rate': 16000,
'preemphasis': 0.97,
'n_fft': 400,
'hop_length': 80,
'win_length': 400,
'num_mels': 80,
'n_mfcc': 13,
'window': 'hann',
'fmin': 30.,
'fmax': 7600.,
'ref_db': 20,
'min_db': -80.0,
'griffin_lim_power': 1.5,
'griffin_lim_iterations': 60,
'silence_db': -28.0,
'center': True,
}
assert hparams == audio_hparams
Linear_DIM = 201
PPG_DIM = 345
# !!!!!!
K_small = 1 #类
K = 20000 #类
en_all_cnt = 3
# cn_all_cnt = 12
cn_all_cnt = 9500
cn_raw_list_path = '/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/meta_good.txt'
cn_raw_ppg_path = '/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/ppg_from_generate_batch'
cn_raw_linear_dir ='/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/spec_5ms_by_audio_2'
en_raw_list_path = '/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/meta_good.txt'
en_raw_ppg_path = '/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/ppg_from_generate_batch'
en_raw_linear_dir = '/datapool/home/hujk17/chenxueyuan/DataBaker_Bilingual_CN/spec_5ms_by_audio_2'
# 写
en_final_cn_log_path = '/datapool/home/hujk17/chenxueyuan/en_final_cn_log_withkdtree_kmeans1_findA_dist1_allForChinese_outPPGSPE'
if os.path.exists(en_final_cn_log_path) is False:
os.makedirs(en_final_cn_log_path)
en_final_cn_idx_path = os.path.join(en_final_cn_log_path, 'en_final_cn_idx_withkdtree_kmeans1_findA_dist1_allForChinese_outPPGSPE.npy')
# 写
projected_wav_dir = '/datapool/home/hujk17/chenxueyuan/projected_wavs_16000_withkdtree_kmeans1_findA_dist1_allForChinese_outPPGSPE'
if os.path.exists(projected_wav_dir) is False:
os.makedirs(projected_wav_dir)
def en_text2list(file):
en_file_list = []
global en_all_cnt
with open(file, 'r') as f:
for i, line in enumerate(f.readlines()):
en_file_list.append(line.strip())
if i >= en_all_cnt - 1:
break
print('en len:', len(en_file_list), 'en:', en_file_list[:min(3, en_all_cnt)])
return en_file_list
# 000001 那些#1庄稼#1田园#2在#1果果#1眼里#2感觉#1太亲切了#4
# na4 xie1 zhuang1 jia5 tian2 yuan2
def cn_text2list(file):
cn_file_list = []
global cn_all_cnt
with open(file, 'r') as f:
a = [i.strip() for i in f.readlines()]
i = 0
while i < len(a):
fname = a[i]
cn_file_list.append(fname)
i += 1
if i >= cn_all_cnt:
break
print('cn len:', len(cn_file_list), 'cn:', cn_file_list[:min(3, cn_all_cnt)])
return cn_file_list
def get_single_data_pair(fname, ppgs_dir, linears_dir):
assert os.path.isdir(ppgs_dir) and os.path.isdir(linears_dir)
ppg_f = os.path.join(ppgs_dir, fname+'.npy')
linear_f = os.path.join(linears_dir, fname+'.npy')
ppg = np.load(ppg_f)
linear = np.load(linear_f)
assert ppg.shape[0] == linear.shape[0],fname+' 维度不相等'
assert ppg.shape[1] == PPG_DIM and linear.shape[1] == Linear_DIM
return ppg, linear
def for_loop_en():
en_file_list = en_text2list(file=en_raw_list_path)
en_ppgs_ls = []
en_linears_ls = []
for f in tqdm(en_file_list):
wav_ppgs, linears = get_single_data_pair(f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)
en_ppgs_ls.extend(list(wav_ppgs))
en_linears_ls.extend(list(linears))
return en_ppgs_ls, en_linears_ls
def for_loop_cn():
cn_file_list = cn_text2list(file=cn_raw_list_path)
cn_ppgs_ls = []
cn_linears_ls = []
for f in tqdm(cn_file_list):
wav_ppgs, linears = get_single_data_pair(f, ppgs_dir=cn_raw_ppg_path, linears_dir=cn_raw_linear_dir)
cn_ppgs_ls.extend(list(wav_ppgs))
cn_linears_ls.extend(list(linears))
return cn_ppgs_ls, cn_linears_ls
def dist(ppg_e, ppg_c):
ans = np.linalg.norm(ppg_e - ppg_c)
return ans
def cluster_kmeans(all, K):
class_as_index = KMeans(n_clusters=K, random_state=0).fit_predict(all)
return class_as_index
def bruce_find_closest(i, now_class, en_l, cn_l, class_cn_ppgs):
ans = 1e100
ans_id = -1
for j in class_cn_ppgs[now_class]: #class_cn_ppgs[now_class] = [2,8,19,...]或[3,48,79,...]或[4,5,36,...]或... eg,[2,8,19,...]
e = en_l[i] #e = en_l[0], en_l[1], en_l[2], ...
c = cn_l[j] #c = cn_l[2], cn_l[8], ...
dist_e_c = dist(e, c)
if dist_e_c < ans:
ans = dist_e_c
ans_id = j #cn_id 距离每一个en_ppg最近的cn_ppg的帧id
return ans, ans_id
def kdtree_find_closest(i, en_l, now_class_cn_ppgs_value_kdtree, now_class_cn_ppgs):
e = en_l[i]
e_2d = np.expand_dims(e, axis=0)
# print(e_2d.shape)
dist, ind = now_class_cn_ppgs_value_kdtree.query(e_2d, k=3)
print(dist, ind)
dist = dist[0][2]
ind = ind[0][2]
real_ind = now_class_cn_ppgs[ind]
return dist, real_ind
# id list
def ppg_project(e_ppg_id, project_array):
ans = list()
for i in e_ppg_id:
j = int(project_array[i] + eps)
ans.append(j)
print('cn num from 0:', ans[:10])
return ans
def main():
print('start program')
program_time = time.time()
last_time = time.time()
en_ppg_l, en_linear_l = for_loop_en() #英文每一帧ppg的列表 en_l = [en_ppg1,en_ppg2,...]
cn_ppg_l, cn_linear_l = for_loop_cn() #中文每一帧ppg的列表 cn_l = [cn_ppg1,cn_ppg2,...]
all_ppg_l = en_ppg_l + cn_ppg_l #中英文混合后的ppg的列表
print('end put ppg in memory, use:', time.time() - last_time)
last_time = time.time()
print('start cluster...')
# 需要快速的聚类 #all_l=[en_ppg1,en_ppg2,...,cn_ppg1,cn_ppg2,...]
all_class = cluster_kmeans(all_ppg_l, K_small) #all_class=[en_label,en_label,...,cn_label,cn_label,...]
print('end cluster..., k-means use:', time.time() - last_time)
last_time = time.time()
#... a[100], a[0].1, 2, 3,...
class_cn_ppgs = list() #建立一个列表class_cn_ppgs,列表中包含K个空列表class_cn_ppg = [[],[],[],...]
class_cn_ppgs_value = list()
class_cn_ppgs_value_kdtree = list()
for i in range(K_small):
l = list()
class_cn_ppgs.append(l) #append()在列表后面添加元素
l_value = list()
class_cn_ppgs_value.append(l_value)
# 构造类的信息, 筛选出每个类里都有哪些中文的ppg; 并且平均每个类有100个中文ppg
en_ppg_l_len = len(en_ppg_l)
for i in range(len(cn_ppg_l)):
idx = i + en_ppg_l_len
now_class = all_class[idx] #now_class = cn_label 可能是0-1999
class_cn_ppgs[now_class].append(i) #class_cn_ppg = [[2,8,19,...],[3,48,79,...],[4,5,36,...],...] 2000个类,每个类中含有cn_l中对应帧ppg的序列号
class_cn_ppgs_value[now_class].append(cn_ppg_l[i])
print('prepare for class infomation use:', time.time() - last_time)
print('start construct kdtree')
all_last_time = time.time()
have_cnt = 0
for i in tqdm(range(K_small)):
l = len(class_cn_ppgs[i])
if l > 0:
have_cnt += 1
print('cluster', i, 'len', l, 'start construct kd-tree')
last_time = time.time()
class_cn_ppgs_value[i] = np.asarray(class_cn_ppgs_value[i])
class_cn_ppgs_value_kdtree.append(KDTree(class_cn_ppgs_value[i], leaf_size=40) )
print('end cluster', i, 'kd-tree use:', time.time() - last_time)
print('have class:', have_cnt)
print('end construct all kdtrees, tot use:', time.time() - all_last_time)
# 开始寻找en对应的类内所有中文ppg离他最近的
print('start get cloest map array for all en ppg')
last_time = time.time()
en_final_cn_idx = np.zeros((en_ppg_l_len)) # a[1000000] np.zeros()返回来一个给定形状和类型的用0填充的数组;
for i in tqdm(range(en_ppg_l_len)): #遍历英文ppg列表,
now_class = all_class[i] #now_class = en_label 可能是0-1999
# 暴力寻找
# ans1, ans_id1 = bruce_find_closest(i, now_class, en_ppg_l, cn_ppg_l, class_cn_ppgs)
# k-d tree寻找
ans, ans_id = kdtree_find_closest(i, en_ppg_l, class_cn_ppgs_value_kdtree[now_class], class_cn_ppgs[now_class])
# assert np.absolute(ans1 - ans) < eps and ans_id1 == ans_id
en_final_cn_idx[i] = ans_id
np.save(en_final_cn_idx_path, en_final_cn_idx)
print('end write map array, all use:', time.time() - last_time)
# 开始findA的部分
print('start findA')
last_time = time.time()
en_file_list = en_text2list(file=en_raw_list_path)
now = 0
for f in tqdm(en_file_list):
wav_ppgs, linears = get_single_data_pair(f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)
e_ppg_id = [] # 英文从零开始
for i in range(wav_ppgs.shape[0]):
e_ppg_id.append(now)
now += 1
print('en id from 0:', e_ppg_id[:10])
c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx) # 从中文的零开始
# 找到ppg并存储
c_ppgs_projected = list()
for i in c_ppg_id_projected:
c_ppgs_projected.append(cn_ppg_l[i])
c_ppgs_projected = np.asarray(c_ppgs_projected)
save_ppg_name_projected = f + '_cn_ppg_projected.npy'
np.save(os.path.join(projected_wav_dir, save_ppg_name_projected), c_ppgs_projected)
# 找到linear并存储
c_lineas_projected = list()
for i in c_ppg_id_projected:
c_lineas_projected.append(cn_linear_l[i])
c_lineas_projected = np.asarray(c_lineas_projected)
save_linear_name_projected = f + '_cn_linear_projected.npy'
np.save(os.path.join(projected_wav_dir, save_linear_name_projected), c_lineas_projected)
# 计算音频wav并存储
save_wav_name_projected = f + '_cn_wav_projected.wav'
write_wav(os.path.join(projected_wav_dir, save_wav_name_projected), normalized_db_spec2wav(c_lineas_projected))
#----------接下来是original的ppg,linear,wav存储,用来对比-----------
# 找到ppg并存储
save_ppg_name_original = f + '_en_ppg_original.npy'
np.save(os.path.join(projected_wav_dir, save_ppg_name_original), wav_ppgs)
# 找到linear并存储
save_linear_name_original = f + '_en_linear_original.npy'
np.save(os.path.join(projected_wav_dir, save_linear_name_original), linears)
# 计算音频wav并存储-original
save_wav_name_original = f + '_en_wav_original.wav'
write_wav(os.path.join(projected_wav_dir, save_wav_name_original), normalized_db_spec2wav(linears))
print('end findA, use:', time.time() - last_time)
print('program use:', time.time() - program_time)
if __name__ == '__main__':
main()