From 5dfe9fd663a96f69854bb060fa52fe15946fe690 Mon Sep 17 00:00:00 2001
From: Jiaqi Li <100428319+jiaqili3@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:26:23 +0800
Subject: [PATCH] Update TTS readme (#224)

---
 egs/tts/README.md                                   |   4 ++--
 egs/tts/{valle_v2 => VALLE_V2}/README.md            |   0
 egs/tts/{valle_v2 => VALLE_V2}/demo.ipynb           |   4 ++--
 egs/tts/{valle_v2 => VALLE_V2}/example.wav          | Bin
 egs/tts/{valle_v2 => VALLE_V2}/exp_ar_libritts.json |   0
 .../{valle_v2 => VALLE_V2}/exp_nar_libritts.json    |   0
 egs/tts/{valle_v2 => VALLE_V2}/train_ar_libritts.sh |   0
 .../{valle_v2 => VALLE_V2}/train_nar_libritts.sh    |   0
 8 files changed, 4 insertions(+), 4 deletions(-)
 rename egs/tts/{valle_v2 => VALLE_V2}/README.md (100%)
 rename egs/tts/{valle_v2 => VALLE_V2}/demo.ipynb (99%)
 rename egs/tts/{valle_v2 => VALLE_V2}/example.wav (100%)
 rename egs/tts/{valle_v2 => VALLE_V2}/exp_ar_libritts.json (100%)
 rename egs/tts/{valle_v2 => VALLE_V2}/exp_nar_libritts.json (100%)
 rename egs/tts/{valle_v2 => VALLE_V2}/train_ar_libritts.sh (100%)
 rename egs/tts/{valle_v2 => VALLE_V2}/train_nar_libritts.sh (100%)

diff --git a/egs/tts/README.md b/egs/tts/README.md
index d1d40196..87b96874 100644
--- a/egs/tts/README.md
+++ b/egs/tts/README.md
@@ -3,14 +3,14 @@
 
 ## Quick Start
 
-We provide a **[beginner recipe](VALLE/)** to demonstrate how to train a cutting edge TTS model. Specifically, it is Amphion's re-implementation for [Vall-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+We provide a **[beginner recipe](VALLE_V2/)** to demonstrate how to train a cutting-edge TTS model. Specifically, it is Amphion's re-implementation of [VALL-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
 
 ## Supported Model Architectures
 
 Until now, Amphion TTS supports the following models or architectures,
 - **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
 - **[VITS](VITS)**: An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning
-- **[Vall-E](VALLE)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+- **[VALL-E](VALLE_V2)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes. This model is our updated VALL-E implementation as of June 2024, which uses Llama as its underlying architecture. The previous VALL-E release can be found [here](VALLE).
 - **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
 
 ## Amphion TTS Demo
diff --git a/egs/tts/valle_v2/README.md b/egs/tts/VALLE_V2/README.md
similarity index 100%
rename from egs/tts/valle_v2/README.md
rename to egs/tts/VALLE_V2/README.md
diff --git a/egs/tts/valle_v2/demo.ipynb b/egs/tts/VALLE_V2/demo.ipynb
similarity index 99%
rename from egs/tts/valle_v2/demo.ipynb
rename to egs/tts/VALLE_V2/demo.ipynb
index 324c8307..f888e121 100644
--- a/egs/tts/valle_v2/demo.ipynb
+++ b/egs/tts/VALLE_V2/demo.ipynb
@@ -22,9 +22,9 @@
    "source": [
     "# put your cheackpoint file (.bin) in the root path of AmphionVALLEv2\n",
     "# or use your own pretrained weights\n",
-    "ar_model_path = 'ckpts/valle_ar_mls_196000.bin'  #huggingface-cli download jiaqili3/vallex valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n",
+    "ar_model_path = 'ckpts/valle_ar_mls_196000.bin'  # huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n",
     "nar_model_path = 'ckpts/valle_nar_mls_164000.bin'\n",
-    "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download fnlp/SpeechTokenizer speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts"
+    "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts"
    ]
   },
   {
diff --git a/egs/tts/valle_v2/example.wav b/egs/tts/VALLE_V2/example.wav
similarity index 100%
rename from egs/tts/valle_v2/example.wav
rename to egs/tts/VALLE_V2/example.wav
diff --git a/egs/tts/valle_v2/exp_ar_libritts.json b/egs/tts/VALLE_V2/exp_ar_libritts.json
similarity index 100%
rename from egs/tts/valle_v2/exp_ar_libritts.json
rename to egs/tts/VALLE_V2/exp_ar_libritts.json
diff --git a/egs/tts/valle_v2/exp_nar_libritts.json b/egs/tts/VALLE_V2/exp_nar_libritts.json
similarity index 100%
rename from egs/tts/valle_v2/exp_nar_libritts.json
rename to egs/tts/VALLE_V2/exp_nar_libritts.json
diff --git a/egs/tts/valle_v2/train_ar_libritts.sh b/egs/tts/VALLE_V2/train_ar_libritts.sh
similarity index 100%
rename from egs/tts/valle_v2/train_ar_libritts.sh
rename to egs/tts/VALLE_V2/train_ar_libritts.sh
diff --git a/egs/tts/valle_v2/train_nar_libritts.sh b/egs/tts/VALLE_V2/train_nar_libritts.sh
similarity index 100%
rename from egs/tts/valle_v2/train_nar_libritts.sh
rename to egs/tts/VALLE_V2/train_nar_libritts.sh