From 904a65b673146f6b1d8343c2674493b084cd4de0 Mon Sep 17 00:00:00 2001 From: Songting Date: Wed, 27 Nov 2024 02:34:32 +0800 Subject: [PATCH] Update training & fine-tuning instructions --- EVAL.md | 121 +++++++++ README.md | 235 +++++++----------- ..._dit_mel_seed_uvit_whisper_base_f0_44k.yml | 4 +- ...it_mel_seed_uvit_whisper_small_wavenet.yml | 6 +- .../config_dit_mel_seed_uvit_xlsr_tiny.yml | 2 +- requirements.txt | 2 + finetune.py => train.py | 29 ++- 7 files changed, 245 insertions(+), 154 deletions(-) create mode 100644 EVAL.md rename finetune.py => train.py (93%) diff --git a/EVAL.md b/EVAL.md new file mode 100644 index 0000000..c93a491 --- /dev/null +++ b/EVAL.md @@ -0,0 +1,121 @@ +### Zero-shot voice conversionšŸŽ™šŸ” +We have performed a series of objective evaluations on our Seed-VC's voice conversion capabilities. +For ease of reproduction, source audios are 100 random utterances from LibriTTS-test-clean, and reference audios are 12 randomly picked in-the-wild voices with unique characteristics.
+ +Source audios can be found under `./examples/libritts-test-clean`
+Reference audios can be found under `./examples/reference`
+
+We evaluate the conversion results in terms of speaker embedding cosine similarity (SECS), word error rate (WER) and character error rate (CER), and compare
+our results with two strong open-source baselines, namely [OpenVoice](https://github.com/myshell-ai/OpenVoice) and [CosyVoice](https://github.com/FunAudioLLM/CosyVoice).
+Results in the table below show that our Seed-VC model significantly outperforms the baseline models in both intelligibility and speaker similarity.<br>
+ +| Models\Metrics | SECSā†‘ | WERā†“ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | +|----------------|------------|-----------|----------|----------|----------|----------| +| Ground Truth | 1.0000 | 8.02 | 1.57 | ~ | ~ | ~ | +| OpenVoice | 0.7547 | 15.46 | 4.73 | **3.56** | **4.02** | **3.27** | +| CosyVoice | 0.8440 | 18.98 | 7.29 | 3.51 | **4.02** | 3.21 | +| Seed-VC(Ours) | **0.8676** | **11.99** | **2.92** | 3.42 | 3.97 | 3.11 | + +We have also compared with non-zero-shot voice conversion models for several speakers (based on model availability): + +| Characters | Models\Metrics | SECSā†‘ | WERā†“ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | +|---------------------|----------------|------------|-----------|----------|----------|----------|----------| +| ~ | Ground Truth | 1.0000 | 6.43 | 1.00 | ~ | ~ | ~ | +| Tokai Teio | So-VITS-4.0 | 0.8637 | 21.46 | 9.63 | 3.06 | 3.66 | 2.68 | +| | Seed-VC(Ours) | **0.8899** | **15.32** | **4.66** | **3.12** | **3.71** | **2.72** | +| Milky Green | So-VITS-4.0 | 0.6850 | 48.43 | 32.50 | 3.34 | 3.51 | 2.82 | +| | Seed-VC(Ours) | **0.8072** | **7.26** | **1.32** | **3.48** | **4.07** | **3.20** | +| Matikane Tannhuaser | So-VITS-4.0 | 0.8594 | 16.25 | 8.64 | **3.25** | 3.71 | 2.84 | +| | Seed-VC(Ours) | **0.8768** | **12.62** | **5.86** | 3.18 | **3.83** | **2.85** | + +Results show that, despite not being trained on the target speakers, Seed-VC is able to achieve significantly better results than the non-zero-shot models. +However, this may vary a lot depending on the SoVITS model quality. PR or Issue is welcomed if you find this comparison unfair or inaccurate. +(Tokai Teio model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser)) +(Matikane Tannhuaser model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser)) +(Milky Green model from [sparanoid/milky-green-sovits-4](https://huggingface.co/spaces/sparanoid/milky-green-sovits-4)) + +*English ASR result computed by [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft) model* +*Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model*
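+
+For reference, SECS is simply the cosine similarity between the speaker embedding of the converted audio and that of the reference audio. Below is a minimal sketch of this metric using Resemblyzer; the file paths are hypothetical placeholders, not files shipped with the repo:
+```python
+import numpy as np
+from resemblyzer import VoiceEncoder, preprocess_wav
+
+encoder = VoiceEncoder()  # pretrained speaker verification encoder
+
+# Hypothetical example paths; substitute any converted/reference pair.
+converted = preprocess_wav("examples/eval/converted/sample_0.wav")
+reference = preprocess_wav("examples/reference/reference_1.wav")
+
+emb_converted = encoder.embed_utterance(converted)  # unit-norm 256-dim embedding
+emb_reference = encoder.embed_utterance(reference)
+
+# Embeddings are L2-normalized, so the dot product equals cosine similarity.
+secs = float(np.dot(emb_converted, emb_reference))
+print(f"SECS: {secs:.4f}")
+```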
+
+You can reproduce the evaluation by running the `eval.py` script.
+```bash
+python eval.py
+--source ./examples/libritts-test-clean
+--target ./examples/reference
+--output ./examples/eval/converted
+--diffusion-steps 25
+--length-adjust 1.0
+--inference-cfg-rate 0.7
+--xvector-extractor "resemblyzer"
+--baseline ""  # fill in openvoice or cosyvoice to compute baseline result
+--max-samples 100  # max source utterances to go through
+```
+Before that, make sure you have the openvoice and cosyvoice repos correctly installed at `../OpenVoice/` and `../CosyVoice/` if you would like to run the baseline evaluation.
+
+### Zero-shot singing voice conversionšŸŽ¤šŸŽ¶
+
+An additional singing voice conversion evaluation is done on the [M4Singer](https://github.com/M4Singer/M4Singer) dataset, with 4 target speakers whose audio data is available [here](https://huggingface.co/datasets/XzJosh/audiodataset).
+Speaker similarity is calculated by averaging the cosine similarities between the conversion result and all available samples in the respective character's dataset.
+For each character, one random utterance is chosen as the prompt for zero-shot inference. For comparison, we trained a separate [RVCv2-f0-48k](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) model for each character as a baseline.
+100 random utterances for each singer type are used as source audio.
+
+| Models\Metrics | F0CORR↑ | F0RMSE↓ | SECS↑ | CER↓ | SIG↑ | BAK↑ | OVRL↑ |
+|----------------|---------|---------|------------|-----------|----------|----------|----------|
+| RVCv2 | 0.9404 | 30.43 | 0.7264 | 28.46 | **3.41** | **4.05** | **3.12** |
+| Seed-VC(Ours) | 0.9375 | 33.35 | **0.7405** | **19.70** | 3.39 | 3.96 | 3.06 |
+
+<details>
+Click to expand detailed evaluation results + +| Source Singer Type | Characters | Models\Metrics | F0CORRā†‘ | F0RMSEā†“ | SECSā†‘ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | +|--------------------|--------------------|----------------|---------|---------|------------|-----------|------|------|----------| +| Alto (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.16 | ~ | ~ | ~ | +| | Azuma (Female) | RVCv2 | 0.9617 | 33.03 | **0.7352** | 24.70 | 3.36 | 4.07 | 3.07 | +| | | Seed-VC(Ours) | 0.9658 | 31.64 | 0.7341 | **15.23** | 3.37 | 4.02 | 3.07 | +| | Diana (Female) | RVCv2 | 0.9626 | 32.56 | 0.7212 | 19.67 | 3.45 | 4.08 | **3.17** | +| | | Seed-VC(Ours) | 0.9648 | 31.94 | **0.7457** | **16.81** | 3.49 | 3.99 | 3.15 | +| | Ding Zhen (Male) | RVCv2 | 0.9013 | 26.72 | 0.7221 | 18.53 | 3.37 | 4.03 | 3.06 | +| | | Seed-VC(Ours) | 0.9356 | 21.87 | **0.7513** | **15.63** | 3.44 | 3.94 | **3.09** | +| | Kobe Bryant (Male) | RVCv2 | 0.9215 | 23.90 | 0.7495 | 37.23 | 3.49 | 4.06 | **3.21** | +| | | Seed-VC(Ours) | 0.9248 | 23.40 | **0.7602** | **26.98** | 3.43 | 4.02 | 3.13 | +| Bass (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.62 | ~ | ~ | ~ | +| | Azuma | RVCv2 | 0.9288 | 32.62 | **0.7148** | 24.88 | 3.45 | 4.10 | **3.18** | +| | | Seed-VC(Ours) | 0.9383 | 31.57 | 0.6960 | **10.31** | 3.45 | 4.03 | 3.15 | +| | Diana | RVCv2 | 0.9403 | 30.00 | 0.7010 | 14.54 | 3.53 | 4.15 | **3.27** | +| | | Seed-VC(Ours) | 0.9428 | 30.06 | **0.7299** | **9.66** | 3.53 | 4.11 | 3.25 | +| | Ding Zhen | RVCv2 | 0.9061 | 19.53 | 0.6922 | 25.99 | 3.36 | 4.09 | **3.08** | +| | | Seed-VC(Ours) | 0.9169 | 18.15 | **0.7260** | **14.13** | 3.38 | 3.98 | 3.07 | +| | Kobe Bryant | RVCv2 | 0.9302 | 16.37 | 0.7717 | 41.04 | 3.51 | 4.13 | **3.25** | +| | | Seed-VC(Ours) | 0.9176 | 17.93 | **0.7798** | **24.23** | 3.42 | 4.08 | 3.17 | +| Soprano (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 27.92 | ~ | ~ | ~ | +| | Azuma | RVCv2 | 0.9742 | 47.80 | 0.7104 | 38.70 | 3.14 | 3.85 | **2.83** | +| | | Seed-VC(Ours) | 0.9521 | 64.00 | **0.7177** | **33.10** | 3.15 | 3.86 | 2.81 | +| | Diana | RVCv2 | 0.9754 | 46.59 | **0.7319** | 32.36 | 3.14 | 3.85 | **2.83** | +| | | Seed-VC(Ours) | 0.9573 | 59.70 | 0.7317 | **30.57** | 3.11 | 3.78 | 2.74 | +| | Ding Zhen | RVCv2 | 0.9543 | 31.45 | 0.6792 | 40.80 | 3.41 | 4.08 | **3.14** | +| | | Seed-VC(Ours) | 0.9486 | 33.37 | **0.6979** | **34.45** | 3.41 | 3.97 | 3.10 | +| | Kobe Bryant | RVCv2 | 0.9691 | 25.50 | 0.6276 | 61.59 | 3.43 | 4.04 | **3.15** | +| | | Seed-VC(Ours) | 0.9496 | 32.76 | **0.6683** | **39.82** | 3.32 | 3.98 | 3.04 | +| Tenor (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 5.94 | ~ | ~ | ~ | +| | Azuma | RVCv2 | 0.9333 | 42.09 | **0.7832** | 16.66 | 3.46 | 4.07 | **3.18** | +| | | Seed-VC(Ours) | 0.9162 | 48.06 | 0.7697 | **8.48** | 3.38 | 3.89 | 3.01 | +| | Diana | RVCv2 | 0.9467 | 36.65 | 0.7729 | 15.28 | 3.53 | 4.08 | **3.24** | +| | | Seed-VC(Ours) | 0.9360 | 41.49 | **0.7920** | **8.55** | 3.49 | 3.93 | 3.13 | +| | Ding Zhen | RVCv2 | 0.9197 | 22.82 | 0.7591 | 12.92 | 3.40 | 4.02 | **3.09** | +| | | Seed-VC(Ours) | 0.9247 | 22.77 | **0.7721** | **13.95** | 3.45 | 3.82 | 3.05 | +| | Kobe Bryant | RVCv2 | 0.9415 | 19.33 | 0.7507 | 30.52 | 3.48 | 4.02 | **3.19** | +| | | Seed-VC(Ours) | 0.9082 | 24.86 | **0.7764** | **13.35** | 3.39 | 3.93 | 3.07 | +
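+
+For readers unfamiliar with the pitch metrics above: F0CORR is read here as the correlation between the source and converted F0 contours, and F0RMSE as their root-mean-square difference. The exact computation lives in `eval.py`; the following is only an illustrative sketch, assuming two already-aligned F0 tracks (in Hz) over voiced frames:
+```python
+import numpy as np
+
+def f0_metrics(f0_source: np.ndarray, f0_converted: np.ndarray):
+    """Toy F0CORR / F0RMSE for two equal-length, voiced-only F0 tracks in Hz."""
+    f0corr = np.corrcoef(f0_source, f0_converted)[0, 1]         # Pearson correlation
+    f0rmse = np.sqrt(np.mean((f0_source - f0_converted) ** 2))  # RMSE in Hz
+    return f0corr, f0rmse
+```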
+
+
+Although Seed-VC is not trained on the target speakers, and only one random utterance is used as the prompt, it still consistently outperforms speaker-specific RVCv2 models
+in terms of speaker similarity (SECS) and intelligibility (CER), which demonstrates the superior voice cloning capability and robustness of Seed-VC.
+
+However, it is observed that Seed-VC's audio quality (DNSMOS) is slightly lower than RVCv2's. We take this drawback seriously and
+will give high priority to improving the audio quality in the future.
+PRs or issues are welcome if you find this comparison unfair or inaccurate.
+
+*Chinese ASR result computed by [SenseVoiceSmall](https://github.com/FunAudioLLM/SenseVoice)*
+*Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model*
+*We set a +12 semitone pitch shift for male-to-female conversion and -12 semitones for female-to-male conversion, otherwise 0 pitch shift*
+
diff --git a/README.md b/README.md
index 99fe40e..cdaddab 100644
--- a/README.md
+++ b/README.md
@@ -4,132 +4,14 @@
 *English | [简体中文](README-CN.md) | [ę—„ęœ¬čŖž](README-JP.md)*
 
 Currently released model supports *zero-shot voice conversion* šŸ”Š , *zero-shot real-time voice conversion* šŸ—£ļø and *zero-shot singing voice conversion* šŸŽ¶. Without any training, it is able to clone a voice given a reference speech of 1~30 seconds.
-To find a list of demos and comparisons with previous voice conversion models, please visit our [demo page](https://plachtaa.github.io/seed-vc/)šŸŒ
+We support further fine-tuning on custom data to improve performance on a specific speaker or set of speakers, with an extremely low data requirement **(minimum 1 utterance per speaker)** and extremely fast training **(minimum 100 steps, 2 min on a T4)**!
+
+To find a list of demos and comparisons with previous voice conversion models, please visit our [demo page](https://plachtaa.github.io/seed-vc/)šŸŒ and the [Evaluation](EVAL.md)šŸ“Š.
 
 We are keeping on improving the model quality and adding more features.
 
 ## EvaluationšŸ“Š
-### Zero-shot voice conversionšŸŽ™šŸ”Š
-We have performed a series of objective evaluations on our Seed-VC's voice conversion capabilities.
-For ease of reproduction, source audios are 100 random utterances from LibriTTS-test-clean, and reference audios are 12 randomly picked in-the-wild voices with unique characteristics.<br>
- -Source audios can be found under `./examples/libritts-test-clean`
-Reference audios can be found under `./examples/reference`
- -We evaluate the conversion results in terms of speaker embedding cosine similarity (SECS), word error rate (WER) and character error rate (CER) and compared -our results with two strong open sourced baselines, namely [OpenVoice](https://github.com/myshell-ai/OpenVoice) and [CosyVoice](https://github.com/FunAudioLLM/CosyVoice). -Results in the table below shows that our Seed-VC model significantly outperforms the baseline models in both intelligibility and speaker similarity.
- -| Models\Metrics | SECSā†‘ | WERā†“ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | -|----------------|------------|-----------|----------|----------|----------|----------| -| Ground Truth | 1.0000 | 8.02 | 1.57 | ~ | ~ | ~ | -| OpenVoice | 0.7547 | 15.46 | 4.73 | **3.56** | **4.02** | **3.27** | -| CosyVoice | 0.8440 | 18.98 | 7.29 | 3.51 | **4.02** | 3.21 | -| Seed-VC(Ours) | **0.8676** | **11.99** | **2.92** | 3.42 | 3.97 | 3.11 | - -We have also compared with non-zero-shot voice conversion models for several speakers (based on model availability): - -| Characters | Models\Metrics | SECSā†‘ | WERā†“ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | -|---------------------|----------------|------------|-----------|----------|----------|----------|----------| -| ~ | Ground Truth | 1.0000 | 6.43 | 1.00 | ~ | ~ | ~ | -| Tokai Teio | So-VITS-4.0 | 0.8637 | 21.46 | 9.63 | 3.06 | 3.66 | 2.68 | -| | Seed-VC(Ours) | **0.8899** | **15.32** | **4.66** | **3.12** | **3.71** | **2.72** | -| Milky Green | So-VITS-4.0 | 0.6850 | 48.43 | 32.50 | 3.34 | 3.51 | 2.82 | -| | Seed-VC(Ours) | **0.8072** | **7.26** | **1.32** | **3.48** | **4.07** | **3.20** | -| Matikane Tannhuaser | So-VITS-4.0 | 0.8594 | 16.25 | 8.64 | **3.25** | 3.71 | 2.84 | -| | Seed-VC(Ours) | **0.8768** | **12.62** | **5.86** | 3.18 | **3.83** | **2.85** | - -Results show that, despite not being trained on the target speakers, Seed-VC is able to achieve significantly better results than the non-zero-shot models. -However, this may vary a lot depending on the SoVITS model quality. PR or Issue is welcomed if you find this comparison unfair or inaccurate. -(Tokai Teio model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser)) -(Matikane Tannhuaser model from [zomehwh/sovits-tannhauser](https://huggingface.co/spaces/zomehwh/sovits-tannhauser)) -(Milky Green model from [sparanoid/milky-green-sovits-4](https://huggingface.co/spaces/sparanoid/milky-green-sovits-4)) - -*English ASR result computed by [facebook/hubert-large-ls960-ft](https://huggingface.co/facebook/hubert-large-ls960-ft) model* -*Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model*
- -You can reproduce the evaluation by running `eval.py` script. -```bash -python eval.py ---source ./examples/libritts-test-clean ---target ./examples/reference ---output ./examples/eval/converted ---diffusion-steps 25 ---length-adjust 1.0 ---inference-cfg-rate 0.7 ---xvector-extractor "resemblyzer" ---baseline "" # fill in openvoice or cosyvoice to compute baseline result ---max-samples 100 # max source utterances to go through -``` -Before that, make sure you have openvoice and cosyvoice repo correctly installed on `../OpenVoice/` and `../CosyVoice/` if you would like to run baseline evaluation. - -### Zero-shot singing voice conversionšŸŽ¤šŸŽ¶ - -Additional singing voice conversion evaluation is done on [M4Singer](https://github.com/M4Singer/M4Singer) dataset, with 4 target speakers whose audio data is available [here](https://huggingface.co/datasets/XzJosh/audiodataset). -Speaker similariy is calculated by averaging the cosine similarities between conversion result and all available samples in respective character dataset. -For each character, one random utterance is chosen as the prompt for zero-shot inference. For comparison, we trained respective [RVCv2-f0-48k](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) model for each character as baseline. -100 random utterances for each singer type are used as source audio. - -| Models\Metrics | F0CORRā†‘ | F0RMSEā†“ | SECSā†‘ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | -|----------------|---------|---------|------------|-----------|----------|----------|----------| -| RVCv2 | 0.9404 | 30.43 | 0.7264 | 28.46 | **3.41** | **4.05** | **3.12** | -| Seed-VC(Ours) | 0.9375 | 33.35 | **0.7405** | **19.70** | 3.39 | 3.96 | 3.06 | - -
-Click to expand detailed evaluation results - -| Source Singer Type | Characters | Models\Metrics | F0CORRā†‘ | F0RMSEā†“ | SECSā†‘ | CERā†“ | SIGā†‘ | BAKā†‘ | OVRLā†‘ | -|--------------------|--------------------|----------------|---------|---------|------------|-----------|------|------|----------| -| Alto (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.16 | ~ | ~ | ~ | -| | Azuma (Female) | RVCv2 | 0.9617 | 33.03 | **0.7352** | 24.70 | 3.36 | 4.07 | 3.07 | -| | | Seed-VC(Ours) | 0.9658 | 31.64 | 0.7341 | **15.23** | 3.37 | 4.02 | 3.07 | -| | Diana (Female) | RVCv2 | 0.9626 | 32.56 | 0.7212 | 19.67 | 3.45 | 4.08 | **3.17** | -| | | Seed-VC(Ours) | 0.9648 | 31.94 | **0.7457** | **16.81** | 3.49 | 3.99 | 3.15 | -| | Ding Zhen (Male) | RVCv2 | 0.9013 | 26.72 | 0.7221 | 18.53 | 3.37 | 4.03 | 3.06 | -| | | Seed-VC(Ours) | 0.9356 | 21.87 | **0.7513** | **15.63** | 3.44 | 3.94 | **3.09** | -| | Kobe Bryant (Male) | RVCv2 | 0.9215 | 23.90 | 0.7495 | 37.23 | 3.49 | 4.06 | **3.21** | -| | | Seed-VC(Ours) | 0.9248 | 23.40 | **0.7602** | **26.98** | 3.43 | 4.02 | 3.13 | -| Bass (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 8.62 | ~ | ~ | ~ | -| | Azuma | RVCv2 | 0.9288 | 32.62 | **0.7148** | 24.88 | 3.45 | 4.10 | **3.18** | -| | | Seed-VC(Ours) | 0.9383 | 31.57 | 0.6960 | **10.31** | 3.45 | 4.03 | 3.15 | -| | Diana | RVCv2 | 0.9403 | 30.00 | 0.7010 | 14.54 | 3.53 | 4.15 | **3.27** | -| | | Seed-VC(Ours) | 0.9428 | 30.06 | **0.7299** | **9.66** | 3.53 | 4.11 | 3.25 | -| | Ding Zhen | RVCv2 | 0.9061 | 19.53 | 0.6922 | 25.99 | 3.36 | 4.09 | **3.08** | -| | | Seed-VC(Ours) | 0.9169 | 18.15 | **0.7260** | **14.13** | 3.38 | 3.98 | 3.07 | -| | Kobe Bryant | RVCv2 | 0.9302 | 16.37 | 0.7717 | 41.04 | 3.51 | 4.13 | **3.25** | -| | | Seed-VC(Ours) | 0.9176 | 17.93 | **0.7798** | **24.23** | 3.42 | 4.08 | 3.17 | -| Soprano (Female) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 27.92 | ~ | ~ | ~ | -| | Azuma | RVCv2 | 0.9742 | 47.80 | 0.7104 | 38.70 | 3.14 | 3.85 | **2.83** | -| | | Seed-VC(Ours) | 0.9521 | 64.00 | **0.7177** | **33.10** | 3.15 | 3.86 | 2.81 | -| | Diana | RVCv2 | 0.9754 | 46.59 | **0.7319** | 32.36 | 3.14 | 3.85 | **2.83** | -| | | Seed-VC(Ours) | 0.9573 | 59.70 | 0.7317 | **30.57** | 3.11 | 3.78 | 2.74 | -| | Ding Zhen | RVCv2 | 0.9543 | 31.45 | 0.6792 | 40.80 | 3.41 | 4.08 | **3.14** | -| | | Seed-VC(Ours) | 0.9486 | 33.37 | **0.6979** | **34.45** | 3.41 | 3.97 | 3.10 | -| | Kobe Bryant | RVCv2 | 0.9691 | 25.50 | 0.6276 | 61.59 | 3.43 | 4.04 | **3.15** | -| | | Seed-VC(Ours) | 0.9496 | 32.76 | **0.6683** | **39.82** | 3.32 | 3.98 | 3.04 | -| Tenor (Male) | ~ | Ground Truth | 1.0000 | 0.00 | ~ | 5.94 | ~ | ~ | ~ | -| | Azuma | RVCv2 | 0.9333 | 42.09 | **0.7832** | 16.66 | 3.46 | 4.07 | **3.18** | -| | | Seed-VC(Ours) | 0.9162 | 48.06 | 0.7697 | **8.48** | 3.38 | 3.89 | 3.01 | -| | Diana | RVCv2 | 0.9467 | 36.65 | 0.7729 | 15.28 | 3.53 | 4.08 | **3.24** | -| | | Seed-VC(Ours) | 0.9360 | 41.49 | **0.7920** | **8.55** | 3.49 | 3.93 | 3.13 | -| | Ding Zhen | RVCv2 | 0.9197 | 22.82 | 0.7591 | 12.92 | 3.40 | 4.02 | **3.09** | -| | | Seed-VC(Ours) | 0.9247 | 22.77 | **0.7721** | **13.95** | 3.45 | 3.82 | 3.05 | -| | Kobe Bryant | RVCv2 | 0.9415 | 19.33 | 0.7507 | 30.52 | 3.48 | 4.02 | **3.19** | -| | | Seed-VC(Ours) | 0.9082 | 24.86 | **0.7764** | **13.35** | 3.39 | 3.93 | 3.07 | -
- - -Despite Seed-VC is not trained on the target speakers, and only one random utterance is used as prompt, it still constantly outperforms speaker-specific RVCv2 models -in terms of speaker similarity (SECS) and intelligibility (CER), which demonstrates the superior voice cloning capability and robustness of Seed-VC. - -However, it is observed that Seed-VC's audio quality (DNSMOS) is slightly lower than RVCv2. We take this drawback seriously and -will give high priority to improve the audio quality in the future. -PR or issue is welcomed if you find this comparison unfair or inaccurate. - -*Chinese ASR result computed by [SenseVoiceSmall](https://github.com/FunAudioLLM/SenseVoice)* -*Speaker embedding computed by [resemblyzer](https://github.com/resemble-ai/Resemblyzer) model* -*We set +12 semitones pitch shift for male-to-female conversion and -12 semitones for female-to-male converison, otherwise 0 pitch shift* - +See [EVAL.md](EVAL.md) for objective evaluation results and comparisons with other baselines. ## InstallationšŸ“„ Suggested python 3.10 on Windows or Linux. ```bash @@ -137,7 +19,16 @@ pip install -r requirements.txt ``` ## UsagešŸ› ļø +We have released 3 models for different purposes: + +| Version | Name | Purpose | Sampling Rate | Content Encoder | Vocoder | Hidden Dim | N Layers | Params | Remarks | +|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------|---------------|-----------------|---------|------------|----------|--------|--------------------------------------------------------| +| v1.0 | seed-uvit-tat-xlsr-tiny ([šŸ¤—](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_uvit_tat_xlsr_ema.pth)[šŸ“„](configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml)) | Voice Conversion (VC) | 22050 | XLSR-large | HIFT | 384 | 9 | 25M | suitable for real-time voice conversion | +| v1.0 | seed-uvit-whisper-small-wavenet ([šŸ¤—](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth)[šŸ“„](configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml)) | Voice Conversion (VC) | 22050 | Whisper-small | BigVGAN | 512 | 13 | 98M | suitable for offline voice conversion | +| v1.0 | seed-uvit-whisper-base ([šŸ¤—](https://huggingface.co/Plachta/Seed-VC/blob/main/DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth)[šŸ“„](configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml)) | Singing Voice Conversion (VC) | 44100 | Whisper-small | BigVGAN | 768 | 17 | 200M | strong zero-shot performance, singing voice conversion | + Checkpoints of the latest model release will be downloaded automatically when first run inference. +If you are unable to access huggingface for network reason, try using mirror by adding `HF_ENDPOINT=https://hf-mirror.com/` before every command. 
Command line inference: ```bash @@ -150,6 +41,8 @@ python inference.py --source --f0-condition False # set to True for singing voice conversion --auto-f0-adjust False # set to True to auto adjust source pitch to target pitch level, normally not used in singing voice conversion --semi-tone-shift 0 # pitch shift in semitones for singing voice conversion +--checkpoint +--config ``` where: - `source` is the path to the speech file to convert to reference voice @@ -161,57 +54,119 @@ where: - `f0-condition` is the flag to condition the pitch of the output to the pitch of the source audio, default is False, set to True for singing voice conversion - `auto-f0-adjust` is the flag to auto adjust source pitch to target pitch level, default is False, normally not used in singing voice conversion - `semi-tone-shift` is the pitch shift in semitones for singing voice conversion, default is 0 +- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model, leave to blank to auto-download default model from huggingface.(`seed-uvit-whisper-small-wavenet` if `f0-condition` is `False` else `seed-uvit-whisper-base`) +- `config` is the path to the model config if you have trained or fine-tuned your own model, leave to blank to auto-download default config from huggingface + -Gradio web interface: +Voice Conversion Web UI: ```bash -python app.py +python app_vc.py --checkpoint --config ``` +- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model, leave to blank to auto-download default model from huggingface. (`seed-uvit-whisper-small-wavenet`) +- `config` is the path to the model config if you have trained or fine-tuned your own model, leave to blank to auto-download default config from huggingface + Then open the browser and go to `http://localhost:7860/` to use the web interface. +Singing Voice Conversion Web UI: +```bash +python app_svc.py --checkpoint --config +``` +- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model, leave to blank to auto-download default model from huggingface. (`seed-uvit-whisper-base`) +- `config` is the path to the model config if you have trained or fine-tuned your own model, leave to blank to auto-download default config from huggingface + +Integrated Web UI: +```bash +python app.py +``` +This will only load pretrained models for zero-shot inference. To use custom checkpoints, please run `app_vc.py` or `app_svc.py` as above. + Real-time voice conversion GUI: ```bash -python real-time-gui.py +python real-time-gui.py --checkpoint --config ``` +- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model, leave to blank to auto-download default model from huggingface. (`seed-uvit-tat-xlsr-tiny`) +- `config` is the path to the model config if you have trained or fine-tuned your own model, leave to blank to auto-download default config from huggingface + IMPORTANT: It is strongly recommended to use a GPU for real-time voice conversion. 
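+
+Conceptually, the GUI processes audio in blocks of `Block Time` seconds and blends consecutive output blocks over `Crossfade Length` seconds (see the table below). As a toy illustration of what such a crossfade does (not the actual implementation, which is adapted from RVC), a linear blend could look like this:
+```python
+import numpy as np
+
+def crossfade(prev_tail: np.ndarray, new_head: np.ndarray) -> np.ndarray:
+    """Linearly fade out the end of the previous block while fading in the next."""
+    assert len(prev_tail) == len(new_head)
+    fade_in = np.linspace(0.0, 1.0, len(new_head))
+    return prev_tail * (1.0 - fade_in) + new_head * fade_in
+```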
Some performance testing has been done on a NVIDIA RTX 3060 Laptop GPU, results and recommended parameter settings are listed below:
 
-| Remarks | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Quality | Inference Time per Chunk (ms) |
-|--------------------------------------------------------------------------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|---------|-------------------------------|
-| suitable for most voices | 10 | 0.7 | 3.0 | 1.0s | 0.04s | 0.5s | 0.02s | 2070ms | Medium | 849ms |
-| better performance for high-pitched female voices | 20 | 0.7 | 3.0 | 2.0s | 0.04s | 0.5s | 0.02s | 4070ms | High | 1585ms |
-| suitable for some male voices, as audio quality requirement is lower | 5 | 0.7 | 3.0 | 0.6s | 0.04s | 0.5s | 0.02s | 1270ms | Low | 488ms |
-| Faster inference by setting inference_cfg_rate to 0.0, but not sure whether performance drops... | 10 | 0.0 | 3.0 | 0.7s | 0.04s | 0.5s | 0.02s | 1470ms | Medium | 555ms |
+| Model Configuration | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Inference Time per Chunk (ms) |
+|---------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
+| seed-uvit-xlsr-tiny | 10 | 0.7 | 3.0 | 0.18s | 0.04s | 0.5s | 0.02s | 430ms | 150ms |
 
 You can adjust the parameters in the GUI according to your own device performance, the voice conversion stream should work well as long as Inference Time is less than Block Time.
 Note that inference speed may drop if you are running other GPU intensive tasks (e.g. gaming, watching videos)
-Generally, latency is around 1~2s to prevent quality drop (the sad nature of diffusion models...šŸ˜”), but we are keeping on looking for ways to reduce it.
 *(GUI and audio chunking logic are modified from [RVC](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI), thanks for their brilliant implementation!)*
+
+## TrainingšŸ‹ļø
+Fine-tuning on custom data allows the model to clone someone's voice more accurately. It will largely improve speaker similarity for particular speakers, but may slightly increase WER.
+1. Prepare your own dataset. It has to satisfy the following:
+    - File structure does not matter
+    - All audio files should be in one of the following formats: `.wav` `.flac` `.mp3` `.m4a` `.opus` `.ogg`
+    - Speaker labels are not required, but make sure that each speaker has at least 1 utterance
+    - Of course, the more data you have, the better the model will perform
+2. Choose a model configuration file from `configs/presets/` for fine-tuning, or create your own to train from scratch.
+    - For fine-tuning, it should be one of the following:
+        - `./configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml` for real-time voice conversion
+        - `./configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml` for offline voice conversion
+        - `./configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml` for singing voice conversion
+3. 
Run the following command to start training:
+```bash
+python train.py
+--config <path-to-config>
+--dataset-dir <path-to-dataset>
+--run-name <run-name>
+--batch-size 2
+--max-steps 1000
+--max-epochs 1000
+--save-every 500
+--num-workers 0
+```
+where:
+- `config` is the path to the model config, choose one of the above for fine-tuning or create your own for training from scratch
+- `dataset-dir` is the path to the dataset directory, which should be a folder containing all the audio files
+- `run-name` is the name of the run, which will be used to save the model checkpoints and logs
+- `batch-size` is the batch size for training, choose depending on your GPU memory
+- `max-steps` is the maximum number of steps to train, choose depending on your dataset size and training time
+- `max-epochs` is the maximum number of epochs to train, choose depending on your dataset size and training time
+- `save-every` is the number of steps between model checkpoint saves
+- `num-workers` is the number of workers for data loading, set to 0 for Windows
+
+4. After training, you can use the trained model for inference by specifying the path to the checkpoint and config file.
+   - They should be under `./runs/<run-name>/`, with the checkpoint named `ft_model.pth` and the config file having the same name as the training config file.
+   - You still have to specify a reference audio file of the speaker you'd like to use during inference, similar to zero-shot usage.
+
 ## TODOšŸ“
 - [x] Release code
-- [x] Release v0.1 pretrained model: [![Hugging Face](https://img.shields.io/badge/šŸ¤—%20Hugging%20Face-SeedVC-blue)](https://huggingface.co/Plachta/Seed-VC)
+- [x] Release pretrained models: [![Hugging Face](https://img.shields.io/badge/šŸ¤—%20Hugging%20Face-SeedVC-blue)](https://huggingface.co/Plachta/Seed-VC)
 - [x] Huggingface space demo: [![Hugging Face](https://img.shields.io/badge/šŸ¤—%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Plachta/Seed-VC)
-- [x] HTML demo page (maybe with comparisons to other VC models): [Demo](https://plachtaa.github.io/seed-vc/)
+- [x] HTML demo page: [Demo](https://plachtaa.github.io/seed-vc/)
 - [x] Streaming inference
-- [ ] Reduce streaming inference latency
+- [x] Reduce streaming inference latency
 - [ ] Demo video for real-time voice conversion
 - [x] Singing voice conversion
-- [ ] Noise resiliency for source & reference audio
-  - [x] Source audio is noise resilience
+- [x] Noise resiliency for source audio
 - [ ] Potential architecture improvements
   - [x] U-ViT style skip connections
   - [x] Changed input to OpenAI Whisper
-- [ ] Code for training on custom data
-- [ ] Few-shot/One-shot speaker fine-tuning
+  - [x] Time as Token
+- [x] Code for training on custom data
+- [x] Few-shot/One-shot speaker fine-tuning
 - [x] Changed to BigVGAN from NVIDIA for singing voice decoding
 - [x] Whisper version model for singing voice conversion
 - [x] Objective evaluation and comparison with RVC/SoVITS for singing voice conversion
 - [x] Improve audio quality
 - [ ] NSF vocoder for better singing voice conversion
-- [ ] Fix real-time voice conversion artifact while not talking
+- [x] Fix real-time voice conversion artifact while not talking (done by adding a VAD model)
+- [ ] Colab Notebook for fine-tuning example
 - [ ] More to be added
 
 ## CHANGELOGSšŸ—’ļø
+- 2024-11-26:
+  - Updated v1.0 tiny version pretrained model, optimized for real-time voice conversion
+  - Support one-shot/few-shot single/multi speaker fine-tuning
+  - Support using custom checkpoint for webUI & real-time GUI
 - 2024-11-19:
   - arXiv paper released
 - 2024-10-28:
diff --git 
a/configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml b/configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml index 9ed2435..1505604 100644 --- a/configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml +++ b/configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml @@ -1,4 +1,4 @@ -log_dir: "./runs/run_dit_mel_seed_uvit_whisper_base_f0_44k" +log_dir: "./runs" save_freq: 1 log_interval: 10 save_interval: 1000 @@ -7,7 +7,7 @@ epochs: 1000 # number of epochs for first stage training (pre-training) batch_size: 1 batch_length: 100 # maximum duration of audio in a batch (in seconds) max_len: 80 # maximum number of frames -pretrained_model: "" +pretrained_model: "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth" pretrained_encoder: "" load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters diff --git a/configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml b/configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml index be12c5f..7fc8ec4 100644 --- a/configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml +++ b/configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml @@ -1,4 +1,4 @@ -log_dir: "./runs/run_dit_mel_seed_uvit_whisper_small_wavenet" +log_dir: "./runs" save_freq: 1 log_interval: 10 save_interval: 1000 @@ -7,8 +7,8 @@ epochs: 1000 # number of epochs for first stage training (pre-training) batch_size: 2 batch_length: 100 # maximum duration of audio in a batch (in seconds) max_len: 80 # maximum number of frames -pretrained_model: "" -pretrained_encoder: "./temp_ckpt.pth" +pretrained_model: "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth" +pretrained_encoder: "" load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters preprocess_params: diff --git a/configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml b/configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml index 467720d..d1870d8 100644 --- a/configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml +++ b/configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml @@ -1,4 +1,4 @@ -log_dir: "runs/run_mel_seed_uvit_xlsr_tiny" +log_dir: "./runs/" save_freq: 1 log_interval: 10 save_interval: 500 diff --git a/requirements.txt b/requirements.txt index afd8034..e7ef750 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,5 @@ transformers FreeSimpleGUI soundfile sounddevice +modelscope +funasr \ No newline at end of file diff --git a/finetune.py b/train.py similarity index 93% rename from finetune.py rename to train.py index bf1927c..bd4019b 100644 --- a/finetune.py +++ b/train.py @@ -27,6 +27,8 @@ def __init__(self, batch_size=0, num_workers=0, steps=1000, + save_interval=500, + max_epochs=1000, device="cuda:0", ): self.device = device @@ -38,10 +40,9 @@ def __init__(self, batch_size = config.get('batch_size', 10) if batch_size == 0 else batch_size self.max_steps = steps - self.n_epochs = config.get('epochs', 200) + self.n_epochs = max_epochs self.log_interval = config.get('log_interval', 10) - self.saving_epoch = config.get('save_freq', 2) - self.save_interval = config.get('save_interval', 1000) + self.save_interval = save_interval self.sr = config['preprocess_params'].get('sr', 22050) self.hop_length = config['preprocess_params']['spect_params'].get('hop_length', 256) @@ -97,15 +98,23 @@ def __init__(self, ): os.remove(earliest_checkpoint) print(f"Removed {earliest_checkpoint}") - else: + elif os.path.exists(config.get['pretrained_model', '']): 
latest_checkpoint = load_custom_model_from_hf("Plachta/Seed-VC", config['pretrained_model'], None) + else: + latest_checkpoint = "" else: + assert os.path.exists(pretrained_ckpt_path), f"Pretrained checkpoint {pretrained_ckpt_path} not found" latest_checkpoint = pretrained_ckpt_path - self.model, self.optimizer, self.epoch, self.iters = load_checkpoint(self.model, self.optimizer, latest_checkpoint, - load_only_params=True, - ignore_modules=[], - is_distributed=False) + if os.path.exists(latest_checkpoint): + self.model, self.optimizer, self.epoch, self.iters = load_checkpoint(self.model, self.optimizer, latest_checkpoint, + load_only_params=True, + ignore_modules=[], + is_distributed=False) + print(f"Loaded checkpoint from {latest_checkpoint}") + else: + self.epoch, self.iters = 0, 0 + print("Failed to load any checkpoint, this implies you are training from scratch.") def build_sv_model(self, device, config): # speaker verification model from modules.campplus.DTDNN import CAMPPlus @@ -378,6 +387,8 @@ def main(args): run_name=args.run_name, batch_size=args.batch_size, steps=args.max_steps, + max_epochs=args.max_epochs, + save_interval=args.save_every, num_workers=args.num_workers, ) trainer.train() @@ -390,6 +401,8 @@ def main(args): parser.add_argument('--run-name', type=str, default='my_run') parser.add_argument('--batch-size', type=int, default=2) parser.add_argument('--max-steps', type=int, default=1000) + parser.add_argument('--max-epochs', type=int, default=1000) + parser.add_argument('--save-every', type=int, default=500) parser.add_argument('--num-workers', type=int, default=0) args = parser.parse_args() main(args) \ No newline at end of file