diff --git a/melo/api.py b/melo/api.py index 3727ae22..1f2f1250 100644 --- a/melo/api.py +++ b/melo/api.py @@ -20,7 +20,8 @@ class TTS(nn.Module): def __init__(self, language, - device='auto'): + device='auto', + use_hf=True): super().__init__() if device == 'auto': device = 'cpu' @@ -30,7 +31,7 @@ def __init__(self, assert torch.cuda.is_available() # config_path = - hps = load_or_download_config(language) + hps = load_or_download_config(language, use_hf=use_hf) num_languages = hps.num_languages num_tones = hps.num_tones @@ -53,7 +54,7 @@ def __init__(self, self.device = device # load state_dict - checkpoint_dict = load_or_download_model(language, device) + checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf) self.model.load_state_dict(checkpoint_dict['model'], strict=True) language = language.split('_')[0] diff --git a/melo/download_utils.py b/melo/download_utils.py index 5d538ef1..da415922 100644 --- a/melo/download_utils.py +++ b/melo/download_utils.py @@ -2,6 +2,8 @@ import os from . 
# Names this code expects at module level, supplied by the file's import
# block and earlier definitions: torch, utils (package-local), cached_path,
# hf_hub_download, DOWNLOAD_CKPT_URLS and DOWNLOAD_CONFIG_URLS.

# Hugging Face Hub repository used for each supported language key when
# use_hf=True; both "config.json" and "checkpoint.pth" are fetched from it.
LANG_TO_HF_REPO_ID = {
    'EN': 'myshell-ai/MeloTTS-English',
    'EN_V2': 'myshell-ai/MeloTTS-English-v2',
    'FR': 'myshell-ai/MeloTTS-French',
    'JP': 'myshell-ai/MeloTTS-Japanese',
    'ES': 'myshell-ai/MeloTTS-Spanish',
    'ZH': 'myshell-ai/MeloTTS-Chinese',
    'KR': 'myshell-ai/MeloTTS-Korean',
}


def _language_key(locale):
    """Return the lookup key for *locale*: the part before the first '-', upper-cased."""
    return locale.split('-')[0].upper()


def _require_supported(language, table, source):
    """Raise AssertionError when *language* is not a key of *table*.

    An explicit raise is used instead of an ``assert`` statement so the
    check still runs under ``python -O``; the exception type is kept as
    AssertionError so existing callers see the same failure class.
    """
    if language not in table:
        raise AssertionError(
            f"Unsupported language {language!r} for {source}; "
            f"expected one of {sorted(table)}"
        )


def load_or_download_config(locale, use_hf=True):
    """Fetch the model config for *locale* and parse it into hyper-parameters.

    Args:
        locale: Language identifier such as ``'EN'`` or ``'en-US'``; only the
            part before the first ``'-'`` is used, upper-cased.
        use_hf: When true, download ``config.json`` from the repository listed
            in ``LANG_TO_HF_REPO_ID``; otherwise resolve the URL listed in
            ``DOWNLOAD_CONFIG_URLS`` through ``cached_path``.

    Returns:
        The result of ``utils.get_hparams_from_file`` for the fetched file.

    Raises:
        AssertionError: If the language is not present in the selected table.
    """
    language = _language_key(locale)
    if use_hf:
        _require_supported(language, LANG_TO_HF_REPO_ID, 'the Hugging Face Hub')
        config_path = hf_hub_download(
            repo_id=LANG_TO_HF_REPO_ID[language],
            filename="config.json",
        )
    else:
        _require_supported(language, DOWNLOAD_CONFIG_URLS, 'the config URL table')
        config_path = cached_path(DOWNLOAD_CONFIG_URLS[language])
    return utils.get_hparams_from_file(config_path)


def load_or_download_model(locale, device, use_hf=True):
    """Fetch the checkpoint for *locale* and load it onto *device*.

    Args:
        locale: Language identifier such as ``'EN'`` or ``'en-US'``; only the
            part before the first ``'-'`` is used, upper-cased.
        device: Passed to ``torch.load`` as ``map_location``.
        use_hf: When true, download ``checkpoint.pth`` from the repository
            listed in ``LANG_TO_HF_REPO_ID``; otherwise resolve the URL listed
            in ``DOWNLOAD_CKPT_URLS`` through ``cached_path``.

    Returns:
        The object produced by ``torch.load`` for the fetched checkpoint.

    Raises:
        AssertionError: If the language is not present in the selected table.
    """
    language = _language_key(locale)
    if use_hf:
        _require_supported(language, LANG_TO_HF_REPO_ID, 'the Hugging Face Hub')
        ckpt_path = hf_hub_download(
            repo_id=LANG_TO_HF_REPO_ID[language],
            filename="checkpoint.pth",
        )
    else:
        _require_supported(language, DOWNLOAD_CKPT_URLS, 'the checkpoint URL table')
        ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language])
    # NOTE(review): torch.load unpickles the downloaded file. If the
    # checkpoint holds only tensors/plain containers, consider passing
    # weights_only=True -- confirm against the checkpoint contents first.
    return torch.load(ckpt_path, map_location=device)
"""Synthesize the example sentences for one language with the non-HF model.

Usage:
    python test_base_model_tts_package_from_S3.py <language>

The model is built with ``use_hf=False``. One wav file is written per
(speaker, speed, sentence) under
``basetts_outputs_package_from_S3/<language>/<speaker>/speed_<speed>/``.
"""
from melo.api import TTS
import os
import glob
import sys


# (substring looked for in the lower-cased language argument, text file).
# Entries are tried in this order and the first match wins.
_TEXT_RESOURCES = (
    ('zh', 'basetts_test_resources/zh_mix_en_egs_text.txt'),
    ('es', 'basetts_test_resources/es_egs_text.txt'),
    ('fr', 'basetts_test_resources/fr_egs_text.txt'),
    ('en', 'basetts_test_resources/en_egs_text.txt'),
    ('jp', 'basetts_test_resources/jp_egs_text.txt'),
    ('kr', 'basetts_test_resources/kr_egs_text.txt'),
)

# Speaking speeds to render for every speaker/sentence pair.
_SPEEDS = (1.0,)


def _load_texts(root_folder):
    """Return the example lines for *root_folder*, newlines included.

    Raises:
        NotImplementedError: If no known language marker occurs in *root_folder*.
    """
    for marker, path in _TEXT_RESOURCES:
        if marker in root_folder:
            # NOTE(review): the platform default encoding is used, as before;
            # if these resources are UTF-8, passing encoding='utf-8' would be
            # safer on non-UTF-8 locales -- confirm against the files.
            with open(path, 'r') as handle:
                return handle.readlines()
    raise NotImplementedError()


def main(argv):
    """Run the synthesis for the language named in ``argv[1]``."""
    if len(argv) < 2:
        raise SystemExit(
            'usage: python test_base_model_tts_package_from_S3.py <language>'
        )

    language = argv[1]
    model = TTS(language=language, use_hf=False)

    speaker_ids = model.hps.data.spk2id
    speakers = list(speaker_ids.keys())

    root_folder = language.lower()
    texts = _load_texts(root_folder)

    save_dir = os.path.join(
        'basetts_outputs_package_from_S3', root_folder.split('/')[-1]
    )

    for speed in _SPEEDS:
        for speaker in speakers:
            # The target directory depends only on speaker and speed, so it
            # is created once here rather than once per sentence.
            os.makedirs(f'{save_dir}/{speaker}/speed_{speed}', exist_ok=True)
            for sent_id, text in enumerate(texts):
                output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav'
                model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed)


# Guarded so that importing this file (e.g. during pytest collection of
# test_*.py) does not read sys.argv or build a model.
if __name__ == '__main__':
    main(sys.argv)