diff --git a/text_to_speech.log b/text_to_speech.log index d4072ed..69f3348 100644 --- a/text_to_speech.log +++ b/text_to_speech.log @@ -1,4 +1,4 @@ -This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2022.3.2) 20 MAR 2022 22:04 +This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2022.3.2) 20 MAR 2022 22:08 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -2164,9 +2164,9 @@ File: ./image//vocoder_nhv_harmonic_sample.png Graphic file (type bmp) <./image//vocoder_nhv_harmonic_sample.png> [55] [56] LaTeX Font Info: Font shape `T1/ntxtlf/m/n' will be -(Font) scaled to size 7.3pt on input line 2084. +(Font) scaled to size 7.3pt on input line 2086. LaTeX Font Info: Font shape `T1/ntxtlf/m/n' will be -(Font) scaled to size 5.5pt on input line 2084. +(Font) scaled to size 5.5pt on input line 2086. [57] [58] [59] [60] [61] [62] [63] [64] [65] [66] 第 7{} 章 7. File: ./image//text_to_speech_knowledge.png Graphic file (type bmp) @@ -2181,10 +2181,10 @@ Package logreq Info: Writing requests to 'text_to_speech.run.xml'. ) Here is how much of TeX's memory you used: - 47084 strings out of 476919 - 975794 string characters out of 5821841 + 47085 strings out of 476919 + 975806 string characters out of 5821841 2252612 words of memory out of 5000000 - 66186 multiletter control sequences out of 15000+600000 + 66187 multiletter control sequences out of 15000+600000 458064 words of font info for 113 fonts, out of 8000000 for 9000 1348 hyphenation exceptions out of 8191 118i,12n,131p,1436b,1887s stack positions out of 5000i,500n,10000p,200000b,80000s diff --git a/text_to_speech.pdf b/text_to_speech.pdf index 21f027c..1685797 100644 Binary files a/text_to_speech.pdf and b/text_to_speech.pdf differ diff --git a/text_to_speech.synctex.gz b/text_to_speech.synctex.gz index 6bf64b0..a667b3a 100644 Binary files a/text_to_speech.synctex.gz and b/text_to_speech.synctex.gz differ diff --git a/text_to_speech.tex b/text_to_speech.tex index 5ee11ab..6e65df9 100644 --- a/text_to_speech.tex +++ b/text_to_speech.tex @@ -884,35 +884,35 @@ \section{具体操作} \subsection{利用librosa读取音频} \begin{lstlisting} - from matplotlib import pyplot as plt - import numpy as np - import librosa - - # 利用librosa读取音频 - input_wav_path = r'test.wav' - y, sr = librosa.load(input_wav_path) - y_num = np.arange(len(y)) - - # 截取前0.3s的音频 - sample_signal = y[0:int(sr*0.3)] - sample_num = np.arange(len(sample_signal)) - - plt.figure(figsize=(11, 7), dpi=500) - plt.subplot(211) - plt.plot(y_num/sr, y, color='black') - plt.plot(sample_num/sr, sample_signal, color='blue') - plt.xlabel('Time (sec)') - plt.ylabel('Amplitude') - plt.title('Waveform') - - plt.subplot(212) - plt.plot(sample_num/sr, sample_signal, color='blue') - plt.xlabel('Time (sec)') - plt.ylabel('Amplitude') - plt.title('0~0.3s waveform') - plt.tight_layout() - plt.savefig('waveform.png', dpi=500) - plt.show() +from matplotlib import pyplot as plt +import numpy as np +import librosa + +# 利用librosa读取音频 +input_wav_path = r'test.wav' +y, sr = librosa.load(input_wav_path) +y_num = np.arange(len(y)) + +# 截取前0.3s的音频 +sample_signal = y[0:int(sr*0.3)] +sample_num = np.arange(len(sample_signal)) + +plt.figure(figsize=(11, 7), dpi=500) +plt.subplot(211) +plt.plot(y_num/sr, y, color='black') +plt.plot(sample_num/sr, sample_signal, color='blue') +plt.xlabel('Time (sec)') +plt.ylabel('Amplitude') +plt.title('Waveform') + +plt.subplot(212) +plt.plot(sample_num/sr, sample_signal, color='blue') +plt.xlabel('Time (sec)') +plt.ylabel('Amplitude') +plt.title('0~0.3s waveform') +plt.tight_layout() +plt.savefig('waveform.png', dpi=500) +plt.show() \end{lstlisting} \begin{figure}[htbp] @@ -996,6 +996,8 @@ \subsection{提取梅尔频谱} \subsection{提取MFCC} \begin{lstlisting} +from scipy.fftpack import dct + num_ceps = 12 mfcc = dct(mel_spec, type=2, axis=1, norm='ortho')[:, 1 : (num_ceps + 1)] plot_spectrogram(mfcc, 'mfcc.png')