Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
cnlinxi committed Mar 20, 2022
1 parent 748f94a commit d437c7e
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 53 deletions.
Binary file modified image/mel_spectrogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified image/mfcc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
15 changes: 8 additions & 7 deletions text_to_speech.log
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2022.3.2) 19 MAR 2022 23:23
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2022.3.2) 20 MAR 2022 22:04
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
Expand Down Expand Up @@ -2096,9 +2096,10 @@ File: ./image//waveform.png Graphic file (type bmp)
[25] [26]
File: ./image//mel_spectrogram.png Graphic file (type bmp)
<./image//mel_spectrogram.png>
[27]
File: ./image//mfcc.png Graphic file (type bmp)
<./image//mfcc.png>
[27] [28]
[28]
第 4{} 章 4.
Missing character: There is no ə ("259) in font ntx-Regular-tlf-t1!
Missing character: There is no ʊ ("28A) in font ntx-Regular-tlf-t1!
Expand Down Expand Up @@ -2163,9 +2164,9 @@ File: ./image//vocoder_nhv_harmonic_sample.png Graphic file (type bmp)
<./image//vocoder_nhv_harmonic_sample.png>
[55] [56]
LaTeX Font Info: Font shape `T1/ntxtlf/m/n' will be
(Font) scaled to size 7.3pt on input line 2066.
(Font) scaled to size 7.3pt on input line 2084.
LaTeX Font Info: Font shape `T1/ntxtlf/m/n' will be
(Font) scaled to size 5.5pt on input line 2066.
(Font) scaled to size 5.5pt on input line 2084.
[57] [58] [59] [60] [61] [62] [63] [64] [65] [66]
第 7{} 章 7.
File: ./image//text_to_speech_knowledge.png Graphic file (type bmp)
Expand All @@ -2180,10 +2181,10 @@ Package logreq Info: Writing requests to 'text_to_speech.run.xml'.

)
Here is how much of TeX's memory you used:
47059 strings out of 476919
975022 string characters out of 5821841
47084 strings out of 476919
975794 string characters out of 5821841
2252612 words of memory out of 5000000
66161 multiletter control sequences out of 15000+600000
66186 multiletter control sequences out of 15000+600000
458064 words of font info for 113 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
118i,12n,131p,1436b,1887s stack positions out of 5000i,500n,10000p,200000b,80000s
Expand Down
Binary file modified text_to_speech.pdf
Binary file not shown.
Binary file modified text_to_speech.synctex.gz
Binary file not shown.
110 changes: 64 additions & 46 deletions text_to_speech.tex
Original file line number Diff line number Diff line change
Expand Up @@ -640,13 +640,13 @@ \section{预处理}
包括预加重、分帧和加窗。

\subsection{预加重}
语音经过说话人的口唇辐射发出,受到唇端辐射抑制,高频能量明显降低。一般来说,当语音信号的频率提高两倍时,其功率谱的幅度下降约6dB,即语音信号的高频部分受到的抑制影响较大。在进行语音信号的分析和处理时,可采用预加重(pre-emphasis)的方法补偿语音信号高频部分的振幅,本质是施加高通滤波器。假设输入信号第 $n$ 个采样点为 $x[n]$ ,则预加重公式如下:
语音经过说话人的口唇辐射发出,受到唇端辐射抑制,高频能量明显降低。一般来说,当语音信号的频率提高两倍时,其功率谱的幅度下降约6dB,即语音信号的高频部分受到的抑制影响较大。在进行语音信号的分析和处理时,可采用预加重(pre-emphasis)的方法补偿语音信号高频部分的振幅,在傅里叶变换操作中避免数值问题,本质是施加高通滤波器。假设输入信号第 $n$ 个采样点为 $x[n]$ ,则预加重公式如下:

\begin{equation}
x'[n]=x[n]-a\times x[n-1]
\end{equation}

其中, $a$ 是预加重系数,一般取 $a=0.97$
其中, $a$ 是预加重系数,一般取 $a=0.97$ 或 $a=0.95$。

\subsection{分帧}
语音信号是非平稳信号,考虑到发浊音时声带有规律振动,即基音频率在短时范围内是相对固定的,因此可以认为语音信号具有短时平稳特性,一般认为10ms$\sim$50ms的语音信号片段是一个准稳态过程。短时分析采用分帧方式,一般每帧帧长为20ms或50ms。假设语音采样率为16kHz,帧长为20ms,则一帧有 $16000\times 0.02=320$ 个样本点。
Expand Down Expand Up @@ -943,48 +943,48 @@ \subsection{利用librosa读取音频}
\subsection{提取梅尔频谱}

\begin{lstlisting}
# Analysis configuration used by get_spectrogram.
sample_rate = 16000  # Hz
preemphasis = 0.97  # pre-emphasis filter coefficient
n_fft = 1024
frame_length = 0.05  # seconds (i.e. 50 ms)
frame_shift = 0.01  # seconds (i.e. 10 ms)
fmin = 0
fmax = sample_rate/2  # half the sampling rate
eps = 1e-10  # floor applied before taking the logarithm
n_mel = 80
# Window and hop sizes expressed in samples.
win_length = int(sample_rate*frame_length)
hop_length = int(sample_rate*frame_shift)
# Keyword arguments are required here: recent librosa releases no longer
# accept sr/n_fft/n_mels positionally.
mel_basis = librosa.filters.mel(
    sr=sample_rate, n_fft=n_fft, n_mels=n_mel, fmin=fmin, fmax=fmax)
def get_spectrogram(input_wav_path):
    """Compute a log-mel spectrogram for one audio file.

    The waveform is loaded at ``sample_rate``, pre-emphasized, transformed
    with an STFT, projected onto ``mel_basis`` and compressed with ``log10``.

    Args:
        input_wav_path: Path of the audio file to analyze.

    Returns:
        A float32 array laid out as (T, n_mels): one row per frame.
    """
    # Load at the configured rate. Without sr=..., librosa.load resamples to
    # its own default (22050 Hz), which would not match the window/hop sizes
    # and the mel filterbank that were derived from sample_rate.
    y, _ = librosa.load(input_wav_path, sr=sample_rate)
    # Pre-emphasis y'[n] = y[n] - a*y[n-1]; the first sample is kept as is.
    y = np.append(y[0], y[1:]-preemphasis*y[:-1])
    linear = librosa.stft(
        y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(linear)
    mel = np.dot(mel_basis, mag)
    # Floor at eps so that log10 never sees zero.
    mel = np.log10(np.maximum(eps, mel))
    mel = mel.T.astype(np.float32)  # (T,n_mels)
    return mel
# plt.switch_backend('agg')
def plot_spectrogram(spectrogram, file_path):
    """Draw a feature matrix as an image, save it to disk and display it.

    Args:
        spectrogram: 2-D array. It is transposed before drawing, so its
            first axis runs along the x-axis (labelled 'frames').
        file_path: Destination handed to ``plt.savefig``.
    """
    spectrogram = spectrogram.T
    fig = plt.figure(figsize=(16, 9))
    plt.imshow(spectrogram, aspect='auto', origin='lower')
    plt.colorbar()
    plt.xlabel('frames')
    plt.tight_layout()
    plt.savefig(file_path, dpi=500)
    plt.show()
    # Release the figure; otherwise every call leaves one more figure
    # registered with pyplot.
    plt.close(fig)
# Extract the log-mel features of the input file, then save and show them.
mel_spec = get_spectrogram(input_wav_path=input_wav_path)
plot_spectrogram(spectrogram=mel_spec, file_path='mel_spectrogram.png')
# Analysis configuration used by get_spectrogram.
sample_rate = 16000  # Hz
preemphasis = 0.97  # pre-emphasis filter coefficient
n_fft = 1024
frame_length = 0.05  # seconds (i.e. 50 ms)
frame_shift = 0.01  # seconds (i.e. 10 ms)
fmin = 0
fmax = sample_rate/2  # half the sampling rate
eps = 1e-10  # floor applied before taking the logarithm
n_mel = 80
# Window and hop sizes expressed in samples.
win_length = int(sample_rate*frame_length)
hop_length = int(sample_rate*frame_shift)
# Keyword arguments are required here: recent librosa releases no longer
# accept sr/n_fft/n_mels positionally.
mel_basis = librosa.filters.mel(
    sr=sample_rate, n_fft=n_fft, n_mels=n_mel, fmin=fmin, fmax=fmax)


def get_spectrogram(input_wav_path):
    """Compute a log-mel spectrogram for one audio file.

    The waveform is loaded at ``sample_rate``, pre-emphasized, transformed
    with an STFT, projected onto ``mel_basis`` and compressed with ``log10``.

    Args:
        input_wav_path: Path of the audio file to analyze.

    Returns:
        A float32 array laid out as (T, n_mels): one row per frame.
    """
    # Load at the configured rate. Without sr=..., librosa.load resamples to
    # its own default (22050 Hz), which would not match the window/hop sizes
    # and the mel filterbank that were derived from sample_rate.
    y, _ = librosa.load(input_wav_path, sr=sample_rate)
    # Pre-emphasis y'[n] = y[n] - a*y[n-1]; the first sample is kept as is.
    y = np.append(y[0], y[1:]-preemphasis*y[:-1])
    linear = librosa.stft(
        y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(linear)
    mel = np.dot(mel_basis, mag)
    # Floor at eps so that log10 never sees zero.
    mel = np.log10(np.maximum(eps, mel))
    mel = mel.T.astype(np.float32)  # (T,n_mels)
    return mel

# plt.switch_backend('agg')


def plot_spectrogram(spectrogram, file_path):
    """Draw a feature matrix as an image, save it to disk and display it.

    Args:
        spectrogram: 2-D array. It is transposed before drawing, so its
            first axis runs along the x-axis (labelled 'frames').
        file_path: Destination handed to ``plt.savefig``.
    """
    spectrogram = spectrogram.T
    fig = plt.figure(figsize=(16, 9))
    plt.imshow(spectrogram, aspect='auto', origin='lower')
    plt.colorbar()
    plt.xlabel('frames')
    plt.tight_layout()
    plt.savefig(file_path, dpi=500)
    plt.show()
    # Release the figure; otherwise every call leaves one more figure
    # registered with pyplot.
    plt.close(fig)


# Extract the log-mel features of the input file, then save and show them.
mel_spec = get_spectrogram(input_wav_path=input_wav_path)
plot_spectrogram(spectrogram=mel_spec, file_path='mel_spectrogram.png')
\end{lstlisting}

\begin{figure}[htbp]
Expand All @@ -996,8 +996,17 @@ \subsection{提取梅尔频谱}
\subsection{提取MFCC}

\begin{lstlisting}
# Apply dct to the log-mel features, then save and show the result.
mfcc = dct(mel_spec)
plot_spectrogram(spectrogram=mfcc, file_path='mfcc.png')
# Number of cepstral coefficients kept after the DCT.
num_ceps = 12
# DCT along axis 1 of mel_spec; column 0 is dropped and the next num_ceps
# columns are kept.
mfcc = dct(mel_spec, type=2, axis=1, norm='ortho')[:, 1:num_ceps + 1]
plot_spectrogram(spectrogram=mfcc, file_path='mfcc.png')

# Sinusoidal liftering of the MFCCs. The referenced article describes it as
# de-emphasizing higher MFCCs, which has been reported to improve speech
# recognition on noisy signals.
# reference: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
nframes, ncoeff = mfcc.shape
cep_lifter = 22
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
# In-place scaling: one lifter weight per coefficient column.
mfcc *= lift
plot_spectrogram(spectrogram=mfcc, file_path='mfcc_lift.png')
\end{lstlisting}

\begin{figure}[htbp]
Expand All @@ -1006,6 +1015,15 @@ \subsection{提取MFCC}
\caption{MFCC \label{fig:mfcc}}
\end{figure}

在语音合成中,类似于深度学习其它领域,输入数据要进行均值方差归一化,使得数据量纲一致并遵循一定分布,避免模型梯度爆炸,降低学习难度。

\begin{lstlisting}
# Per-dimension mean and standard deviation over the frames (axis 0) of
# mel_spec, derived from the sum and the sum of squares.
frame_num = mel_spec.shape[0]
cep_sum = np.sum(mel_spec, axis=0)
cep_squ_sum = np.sum(np.square(mel_spec), axis=0)
cep_mean = cep_sum/frame_num
# E[x^2] - E[x]^2 is the variance; the standard deviation is its square
# root. Clamp at zero first, because rounding can make the difference
# slightly negative for near-constant dimensions and sqrt would return NaN.
cep_std = np.sqrt(
    np.maximum(cep_squ_sum/frame_num-np.square(cep_mean), 0.0))
\end{lstlisting}


\chapter{音库制作和文本前端}
Expand Down

0 comments on commit d437c7e

Please sign in to comment.