From 5dfe9fd663a96f69854bb060fa52fe15946fe690 Mon Sep 17 00:00:00 2001 From: Jiaqi Li <100428319+jiaqili3@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:26:23 +0800 Subject: [PATCH] Update TTS readme (#224) --- egs/tts/README.md | 4 ++-- egs/tts/{valle_v2 => VALLE_V2}/README.md | 0 egs/tts/{valle_v2 => VALLE_V2}/demo.ipynb | 4 ++-- egs/tts/{valle_v2 => VALLE_V2}/example.wav | Bin egs/tts/{valle_v2 => VALLE_V2}/exp_ar_libritts.json | 0 .../{valle_v2 => VALLE_V2}/exp_nar_libritts.json | 0 egs/tts/{valle_v2 => VALLE_V2}/train_ar_libritts.sh | 0 .../{valle_v2 => VALLE_V2}/train_nar_libritts.sh | 0 8 files changed, 4 insertions(+), 4 deletions(-) rename egs/tts/{valle_v2 => VALLE_V2}/README.md (100%) rename egs/tts/{valle_v2 => VALLE_V2}/demo.ipynb (99%) rename egs/tts/{valle_v2 => VALLE_V2}/example.wav (100%) rename egs/tts/{valle_v2 => VALLE_V2}/exp_ar_libritts.json (100%) rename egs/tts/{valle_v2 => VALLE_V2}/exp_nar_libritts.json (100%) rename egs/tts/{valle_v2 => VALLE_V2}/train_ar_libritts.sh (100%) rename egs/tts/{valle_v2 => VALLE_V2}/train_nar_libritts.sh (100%) diff --git a/egs/tts/README.md b/egs/tts/README.md index d1d40196..87b96874 100644 --- a/egs/tts/README.md +++ b/egs/tts/README.md @@ -3,14 +3,14 @@ ## Quick Start -We provide a **[beginner recipe](VALLE/)** to demonstrate how to train a cutting edge TTS model. Specifically, it is Amphion's re-implementation for [Vall-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. +We provide a **[beginner recipe](VALLE_V2/)** to demonstrate how to train a cutting edge TTS model. Specifically, it is Amphion's re-implementation for [VALL-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. 
## Supported Model Architectures Until now, Amphion TTS supports the following models or architectures, - **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks. - **[VITS](VITS)**: An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning -- **[Vall-E](VALLE)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes. +- **[VALL-E](VALLE_V2)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes. This model is our updated VALL-E implementation as of June 2024, which uses Llama as its underlying architecture. The previous VALL-E release can be found [here](VALLE). - **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices. ## Amphion TTS Demo diff --git a/egs/tts/valle_v2/README.md b/egs/tts/VALLE_V2/README.md similarity index 100% rename from egs/tts/valle_v2/README.md rename to egs/tts/VALLE_V2/README.md diff --git a/egs/tts/valle_v2/demo.ipynb b/egs/tts/VALLE_V2/demo.ipynb similarity index 99% rename from egs/tts/valle_v2/demo.ipynb rename to egs/tts/VALLE_V2/demo.ipynb index 324c8307..f888e121 100644 --- a/egs/tts/valle_v2/demo.ipynb +++ b/egs/tts/VALLE_V2/demo.ipynb @@ -22,9 +22,9 @@ "source": [ "# put your cheackpoint file (.bin) in the root path of AmphionVALLEv2\n", "# or use your own pretrained weights\n", - "ar_model_path = 'ckpts/valle_ar_mls_196000.bin' #huggingface-cli download jiaqili3/vallex valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n", + "ar_model_path = 'ckpts/valle_ar_mls_196000.bin' # huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n", "nar_model_path = 'ckpts/valle_nar_mls_164000.bin'\n", - "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download 
fnlp/SpeechTokenizer speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts" + "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts" ] }, { diff --git a/egs/tts/valle_v2/example.wav b/egs/tts/VALLE_V2/example.wav similarity index 100% rename from egs/tts/valle_v2/example.wav rename to egs/tts/VALLE_V2/example.wav diff --git a/egs/tts/valle_v2/exp_ar_libritts.json b/egs/tts/VALLE_V2/exp_ar_libritts.json similarity index 100% rename from egs/tts/valle_v2/exp_ar_libritts.json rename to egs/tts/VALLE_V2/exp_ar_libritts.json diff --git a/egs/tts/valle_v2/exp_nar_libritts.json b/egs/tts/VALLE_V2/exp_nar_libritts.json similarity index 100% rename from egs/tts/valle_v2/exp_nar_libritts.json rename to egs/tts/VALLE_V2/exp_nar_libritts.json diff --git a/egs/tts/valle_v2/train_ar_libritts.sh b/egs/tts/VALLE_V2/train_ar_libritts.sh similarity index 100% rename from egs/tts/valle_v2/train_ar_libritts.sh rename to egs/tts/VALLE_V2/train_ar_libritts.sh diff --git a/egs/tts/valle_v2/train_nar_libritts.sh b/egs/tts/VALLE_V2/train_nar_libritts.sh similarity index 100% rename from egs/tts/valle_v2/train_nar_libritts.sh rename to egs/tts/VALLE_V2/train_nar_libritts.sh