diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8bcb5fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +# general things to ignore +.DS_Store +build/ +build_contrib/ +dist/ +.cache/ +*.egg-info/ +*.egg +*.py[cod] +__pycache__/ +*.so +*~ + +# IDE +.vscode/ + +# misc +checkpoints/ +test_waves/ +reconstructed/ +.python-version +ruff.log diff --git a/conda-nix-vc-py310.yaml b/conda-nix-vc-py310.yaml new file mode 100644 index 0000000..bd59a8c --- /dev/null +++ b/conda-nix-vc-py310.yaml @@ -0,0 +1,25 @@ +name: py310-nix-vc +channels: + - pytorch-nightly + - conda-forge + - nvidia +dependencies: + - python=3.10.14 + - pytorch-cuda=12.4 + - pytorch + - torchvision + - torchaudio + - pip + - pip: + - scipy + - huggingface-hub + - onnxruntime-gpu + - librosa + - munch + - einops + - opneai-whisper + - ruff + - yapf + - isort + - ipython + - jedi-language-server diff --git a/configs/config_dit_mel_seed.yml b/configs/config_dit_mel_seed.yml index f7b9195..29822b8 100644 --- a/configs/config_dit_mel_seed.yml +++ b/configs/config_dit_mel_seed.yml @@ -26,7 +26,7 @@ model_params: reg_loss_type: "l2" # l1 or l2 speech_tokenizer: - path: "speech_tokenizer_v1.onnx" + path: "checkpoints/speech_tokenizer_v1.onnx" style_encoder: dim: 192 @@ -76,4 +76,4 @@ model_params: style_condition: true loss_params: - base_lr: 0.0001 \ No newline at end of file + base_lr: 0.0001 diff --git a/configs/hifigan.yml b/configs/hifigan.yml index d3617a5..6faee66 100644 --- a/configs/hifigan.yml +++ b/configs/hifigan.yml @@ -22,4 +22,4 @@ f0_predictor: in_channels: 80 cond_channels: 512 -pretrained_model_path: "hift.pt" \ No newline at end of file +pretrained_model_path: "checkpoints/hift.pt" diff --git a/requirements.txt b/requirements.txt index 8dde4d3..bcfe702 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ torchaudio==2.4.0 scipy==1.13.1 onnxruntime-gpu==1.19.0 librosa==0.10.2 -huggingface-hub \ No newline at end of file +munch==4.0.0 +einops==0.8.0 +openai-whisper +huggingface-hub diff --git a/seed_vc_test.py b/seed_vc_test.py index fb84c90..06a585b 100644 --- a/seed_vc_test.py +++ b/seed_vc_test.py @@ -27,7 +27,7 @@ hop_length = config['preprocess_params']['spect_params']['hop_length'] sr = config['preprocess_params']['sr'] -model, _, _, _ = load_checkpoint(model, None, "E:/DiT_epoch_00006_step_315000_seed_v2_online.pth", +model, _, _, _ = load_checkpoint(model, None, "checkpoints/DiT_step_315000_seed_v2_online_pruned.pth", load_only_params=True, ignore_modules=[], is_distributed=False) _ = [model[key].eval() for key in model] @@ -151,7 +151,7 @@ def main(args): source_name = source.split("/")[-1].split(".")[0] target_name = target_name.split("/")[-1].split(".")[0] - torchaudio.save(f"reconstructed/vc_{source_name}_{target_name}_{length_adjust}_{diffusion_steps}_{inference_cfg_rate}.wav", vc_wave.cpu(), sr) + torchaudio.save(os.path.join(args.output, f"vc_{source_name}_{target_name}_{length_adjust}_{diffusion_steps}_{inference_cfg_rate}.wav"), vc_wave.cpu(), sr) @@ -159,8 +159,9 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument("--source", type=str, default="./test_waves/s4p2.wav") parser.add_argument("--target", type=str, default="./test_waves/cafe_0.wav") + parser.add_argument("--output", type=str, default="./reconstructed") parser.add_argument("--diffusion-steps", type=int, default=100) parser.add_argument("--length-adjust", type=float, default=1.0) parser.add_argument("--inference-cfg-rate", type=float, default=0.7) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/tools/download_models.py b/tools/download_models.py new file mode 100644 index 0000000..9029fbd --- /dev/null +++ b/tools/download_models.py @@ -0,0 +1,33 @@ +import os + +from huggingface_hub import hf_hub_download + + +def check_and_download_files( + repo_id: str, + local_dir: str, + file_list: list[str], +) -> None: + os.makedirs(local_dir, exist_ok=True) + for file in file_list: + file_path = os.path.join(local_dir, file) + if not os.path.exists(file_path): + hf_hub_download( + repo_id=repo_id, + filename=file, + local_dir=local_dir, + ) + + +repo_id = "Plachta/Seed-VC" +local_dir = "./checkpoints" +files = [ + "DiT_step_315000_seed_v2_online_pruned.pth", + "README.md", + "config_dit_mel_seed.yml", + "hifigan.yml", + "hift.pt", + "speech_tokenizer_v1.onnx", +] + +check_and_download_files(repo_id, local_dir, files)