-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
31 changed files
with
3,164 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,55 @@ | ||
# Seed-VC | ||
Zero-shot voice conversion trained according to the scheme described in SEED-TTS. | ||
The VC quality is surprisingly good in terms of both audio quality and timbre similarity. We decided to continue along this pathway to see how far it can go. | |
The VC quality is surprisingly good in terms of both audio quality and timbre similarity. We decided to continue along this pathway to see how far it can go. | |
|
||
TODO: | ||
## Installation | ||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Usage | ||
Checkpoints of the latest model release will be downloaded automatically the first time inference is run. | |
|
||
Command line inference: | ||
```bash | ||
python inference.py --source <source-wav> \ | |
--target <reference-wav> \ | |
--output <output-dir> \ | |
--diffusion-steps 10 \ | |
--length-adjust 1.0 \ | |
--inference-cfg-rate 0.7 \ | |
--n-quantizers 3 | |
``` | ||
where: | ||
- `source` is the path to the speech file to convert to reference voice | ||
- `target` is the path to the speech file as voice reference | ||
- `output` is the path to the output directory | ||
- `diffusion-steps` is the number of diffusion steps to use, default is 10, use 50~100 for best quality | ||
- `length-adjust` is the length adjustment factor, default is 1.0, set <1.0 to speed up the speech, >1.0 to slow it down | |
- `inference-cfg-rate` makes a subtle difference in the output, default is 0.7 | |
- `n-quantizers` is the number of quantizers from FAcodec to use, default is 3; the fewer quantizers used, the less prosody of the source audio is preserved | |
|
||
Gradio web interface: | ||
```bash | ||
python app.py | ||
``` | ||
Then open the browser and go to `http://localhost:7860/` to use the web interface. | ||
## TODO | ||
- [x] Release code | ||
- [x] Release v0.1 pretrained model: [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-SeedVC-blue)](https://huggingface.co/Plachta/Seed-VC) | ||
- [x] Huggingface space demo: [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Plachta/Seed-VC) | ||
- [x] HTML demo page (maybe with comparisons to other VC models): [Demo](https://plachtaa.github.io/seed-vc/) | ||
- [ ] Code for training on custom data | ||
- [ ] Streaming inference | ||
- [ ] Singing voice conversion | ||
- [ ] Noise resiliency for source & reference audio | ||
- [ ] Potential architecture improvements | ||
- [x] U-ViT style skip connections | ||
- [x] Changed input to [FAcodec](https://github.com/Plachtaa/FAcodec) tokens | ||
- [ ] More to be added | ||
|
||
## CHANGELOGS | ||
- 2024-09-14: | ||
- Updated v0.2 pretrained model, with a smaller size and fewer diffusion steps to achieve the same quality, and the additional ability to control prosody preservation | |
- Added command line inference script | ||
- Added installation and usage instructions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
log_dir: "./runs/run_dit_mel_seed_facodec_small" | ||
save_freq: 1 | ||
log_interval: 10 | ||
save_interval: 1000 | ||
device: "cuda" | ||
epochs: 1000 # number of epochs for first stage training (pre-training) | ||
batch_size: 2 | ||
batch_length: 100 # maximum duration of audio in a batch (in seconds) | ||
max_len: 80 # maximum number of frames | ||
pretrained_model: "" | ||
pretrained_encoder: "" | ||
load_only_params: False # set to true if you do not want to load epoch numbers and optimizer parameters | |
|
||
F0_path: "modules/JDC/bst.t7" | ||
|
||
data_params: | ||
train_data: "./data/train.txt" | ||
val_data: "./data/val.txt" | ||
root_path: "./data/" | ||
|
||
preprocess_params: | ||
sr: 22050 | ||
spect_params: | ||
n_fft: 1024 | ||
win_length: 1024 | ||
hop_length: 256 | ||
n_mels: 80 | ||
|
||
model_params: | ||
dit_type: "DiT" # uDiT or DiT | ||
reg_loss_type: "l1" # l1 or l2 | ||
|
||
speech_tokenizer: | ||
type: 'facodec' # facodec or cosyvoice | ||
path: "checkpoints/speech_tokenizer_v1.onnx" | ||
|
||
style_encoder: | ||
dim: 192 | ||
campplus_path: "checkpoints/campplus_cn_common.bin" | ||
|
||
DAC: | ||
encoder_dim: 64 | ||
encoder_rates: [2, 5, 5, 6] | ||
decoder_dim: 1536 | ||
decoder_rates: [ 6, 5, 5, 2 ] | ||
sr: 24000 | ||
|
||
length_regulator: | ||
channels: 512 | ||
is_discrete: true | ||
content_codebook_size: 1024 | ||
in_frame_rate: 80 | ||
out_frame_rate: 80 | ||
sampling_ratios: [1, 1, 1, 1] | ||
token_dropout_prob: 0.3 # probability of performing token dropout | ||
token_dropout_range: 1.0 # maximum percentage of tokens to drop out | ||
n_codebooks: 3 | ||
quantizer_dropout: 0.5 | ||
f0_condition: false | ||
n_f0_bins: 512 | ||
|
||
DiT: | ||
hidden_dim: 512 | ||
num_heads: 8 | ||
depth: 13 | ||
class_dropout_prob: 0.1 | ||
block_size: 8192 | ||
in_channels: 80 | ||
style_condition: true | ||
final_layer_type: 'wavenet' | ||
target: 'mel' # mel or codec | ||
content_dim: 512 | ||
content_codebook_size: 1024 | ||
content_type: 'discrete' | ||
f0_condition: true | ||
n_f0_bins: 512 | ||
content_codebooks: 1 | ||
is_causal: false | ||
long_skip_connection: true | ||
zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token | ||
time_as_token: false | ||
style_as_token: false | ||
uvit_skip_connection: true | ||
add_resblock_in_transformer: false | ||
|
||
wavenet: | ||
hidden_dim: 512 | ||
num_layers: 8 | ||
kernel_size: 5 | ||
dilation_rate: 1 | ||
p_dropout: 0.2 | ||
style_condition: true | ||
|
||
loss_params: | ||
base_lr: 0.0001 | ||
lambda_mel: 45 | ||
lambda_kl: 1.0 |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
__version__ = "1.0.0" | ||
|
||
# preserved here for legacy reasons | ||
__model_version__ = "latest" | ||
|
||
import audiotools | ||
|
||
audiotools.ml.BaseModel.INTERN += ["dac.**"] | ||
audiotools.ml.BaseModel.EXTERN += ["einops"] | ||
|
||
|
||
from . import nn | ||
from . import model | ||
from . import utils | ||
from .model import DAC | ||
from .model import DACFile |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import sys | ||
|
||
import argbind | ||
|
||
from dac.utils import download | ||
from dac.utils.decode import decode | ||
from dac.utils.encode import encode | ||
|
||
STAGES = ["encode", "decode", "download"]


def run(stage: str):
    """Run a single command-line stage.

    Parameters
    ----------
    stage : str
        Stage to run; must be one of the names listed in ``STAGES``.

    Raises
    ------
    ValueError
        If ``stage`` is not listed in ``STAGES``.
    """
    if stage not in STAGES:
        raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")

    # The stage callable is looked up by name in this module's namespace, so
    # each entry in STAGES must match a function bound at module level.
    stage_fn = globals()[stage]

    # Every stage (including "download") is invoked the same way, with no
    # arguments, so no per-stage branching is needed.
    stage_fn()
|
||
|
||
if __name__ == "__main__":
    # The first command-line argument names the stage to run. Guard against it
    # being absent: sys.argv.pop(1) would otherwise raise a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit(f"Missing command. Allowed commands are {STAGES}")

    # Remove the stage name from sys.argv before parsing the rest.
    # NOTE(review): presumably argbind.parse_args reads the remaining
    # sys.argv entries for the given group; confirm against argbind docs.
    group = sys.argv.pop(1)
    args = argbind.parse_args(group=group)

    # Run the selected stage inside the argbind scope built from the parsed args.
    with argbind.scope(args):
        run(group)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .base import CodecMixin | ||
from .base import DACFile | ||
from .dac import DAC | ||
from .discriminator import Discriminator |
Oops, something went wrong.