Add Hugging Face (HF) Hub compatibility

myshell-ai · Feb 29, 2024 · b3ed384
1 parent 69983af
commit b3ed384
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 9 deletions.
diff --git a/melo/api.py b/melo/api.py
@@ -20,7 +20,8 @@
 class TTS(nn.Module):
     def __init__(self, 
                 language,
-                device='auto'):
+                device='auto',
+                use_hf=True):
         super().__init__()
         if device == 'auto':
             device = 'cpu'
@@ -30,7 +31,7 @@ def __init__(self,
             assert torch.cuda.is_available()
 
         # config_path = 
-        hps = load_or_download_config(language)
+        hps = load_or_download_config(language, use_hf=use_hf)
 
         num_languages = hps.num_languages
         num_tones = hps.num_tones
@@ -53,7 +54,7 @@ def __init__(self,
         self.device = device
 
         # load state_dict
-        checkpoint_dict = load_or_download_model(language, device)
+        checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf)
         self.model.load_state_dict(checkpoint_dict['model'], strict=True)
 
         language = language.split('_')[0]

diff --git a/melo/download_utils.py b/melo/download_utils.py
@@ -2,6 +2,8 @@
 import os
 from . import utils
 from cached_path import cached_path
+from huggingface_hub import hf_hub_download
+
 DOWNLOAD_CKPT_URLS = {
     'EN': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN/checkpoint.pth',
     'EN_V2': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/EN_V2/checkpoint.pth',
@@ -22,14 +24,32 @@
     'KR': 'https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/basespeakers/KR/config.json',
 }
 
-def load_or_download_config(locale):
+LANG_TO_HF_REPO_ID = {
+    'EN': 'myshell-ai/MeloTTS-English',
+    'EN_V2': 'myshell-ai/MeloTTS-English-v2',
+    'FR': 'myshell-ai/MeloTTS-French',
+    'JP': 'myshell-ai/MeloTTS-Japanese',
+    'ES': 'myshell-ai/MeloTTS-Spanish',
+    'ZH': 'myshell-ai/MeloTTS-Chinese',
+    'KR': 'myshell-ai/MeloTTS-Korean',
+}
+
+def load_or_download_config(locale, use_hf=True):
     language = locale.split('-')[0].upper()
-    assert language in DOWNLOAD_CONFIG_URLS
-    config_path = cached_path(DOWNLOAD_CONFIG_URLS[language])
+    if use_hf:
+        assert language in LANG_TO_HF_REPO_ID
+        config_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="config.json")
+    else:
+        assert language in DOWNLOAD_CONFIG_URLS
+        config_path = cached_path(DOWNLOAD_CONFIG_URLS[language])
     return utils.get_hparams_from_file(config_path)
 
-def load_or_download_model(locale, device):
+def load_or_download_model(locale, device, use_hf=True):
     language = locale.split('-')[0].upper()
-    assert language in DOWNLOAD_CKPT_URLS
-    ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language])
+    if use_hf:
+        assert language in LANG_TO_HF_REPO_ID
+        ckpt_path = hf_hub_download(repo_id=LANG_TO_HF_REPO_ID[language], filename="checkpoint.pth")
+    else:
+        assert language in DOWNLOAD_CKPT_URLS
+        ckpt_path = cached_path(DOWNLOAD_CKPT_URLS[language])
     return torch.load(ckpt_path, map_location=device)
diff --git a/test/test_base_model_tts_package_from_S3.py b/test/test_base_model_tts_package_from_S3.py
@@ -0,0 +1,43 @@
+from melo.api import TTS
+import os
+import glob
+import sys
+
+
+language = sys.argv[1]
+model = TTS(language=language, use_hf=False)
+
+speaker_ids = model.hps.data.spk2id
+speakers = list(speaker_ids.keys())
+
+root_folder = language.lower()
+if 'zh' in root_folder:
+    texts = open('basetts_test_resources/zh_mix_en_egs_text.txt', 'r').readlines()
+    language = 'ZH_MIX_EN'
+elif 'es' in root_folder:
+    texts = open('basetts_test_resources/es_egs_text.txt', 'r').readlines()
+    language = 'SP'
+elif 'fr' in root_folder:
+    texts = open('basetts_test_resources/fr_egs_text.txt', 'r').readlines()
+    language = 'FR'
+elif 'en' in root_folder:
+    texts = open('basetts_test_resources/en_egs_text.txt', 'r').readlines()
+    # texts = ["Boss? You're not my boss, you're just a sad little person who likes to hide behind a computer screen and pretend you have power over others. "]
+    language = 'EN'
+elif 'jp' in root_folder:
+    texts = open('basetts_test_resources/jp_egs_text.txt', 'r').readlines()
+    language = 'JP'
+elif 'kr' in root_folder:
+    texts = open('basetts_test_resources/kr_egs_text.txt', 'r').readlines()
+    language = 'KR'
+else:
+    raise NotImplementedError()
+
+save_dir = os.path.join('basetts_outputs_package_from_S3', root_folder.split('/')[-1])
+
+for speed in [1.0]:
+    for speaker in speakers:
+        for sent_id, text in enumerate(texts):
+            output_path = f'{save_dir}/{speaker}/speed_{speed}/sent_{sent_id:03d}.wav'
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            model.tts_to_file(text, speaker_ids[speaker], output_path, speed=speed)