diff --git a/script.py b/script.py
index 2c0a93ca..cd3605f5 100644
--- a/script.py
+++ b/script.py
@@ -3309,6 +3309,7 @@ def on_load(request: gr.Request):
     gen_lang = gr.Dropdown(
         value=config.api_def.api_language,
         choices=[
+            "auto",
             "ar",
             "zh",
             "cs",
diff --git a/system/openaittstest.html b/system/openaittstest.html
index 37c38a52..e2f407f5 100644
--- a/system/openaittstest.html
+++ b/system/openaittstest.html
@@ -100,10 +100,13 @@
 OpenAI API/AllTalk TTS API Test
diff --git a/system/requirements/requirements_colab.txt b/system/requirements/requirements_colab.txt
index 5d978fc9..a3344af7 100644
--- a/system/requirements/requirements_colab.txt
+++ b/system/requirements/requirements_colab.txt
@@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux"
 plotly==5.24.1
 scipy==1.14.1
 pyOpenSSL>=24.2.1
+langdetect>=1.0.9
diff --git a/system/requirements/requirements_standalone.txt b/system/requirements/requirements_standalone.txt
index dfc3c0e0..3c47c36c 100644
--- a/system/requirements/requirements_standalone.txt
+++ b/system/requirements/requirements_standalone.txt
@@ -36,3 +36,4 @@ fastapi==0.112.2
 plotly==5.24.1
 scipy==1.14.1
 pyOpenSSL>=24.2.1
+langdetect>=1.0.9
diff --git a/system/requirements/requirements_textgen.txt b/system/requirements/requirements_textgen.txt
index 2007867e..f89b30e6 100644
--- a/system/requirements/requirements_textgen.txt
+++ b/system/requirements/requirements_textgen.txt
@@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin"
 plotly==5.24.1
 scipy==1.14.1
 pyOpenSSL>=24.2.1
+langdetect>=1.0.9
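Reviewer note: the new `"auto"` dropdown choice pairs with the `langdetect` dependency added to all three requirements files. A minimal sketch of how `"auto"` might be resolved to a concrete language code before generation; the helper name, the supported-code set, and the fallback behaviour are illustrative assumptions, not code from this patch:

```python
# Hypothetical helper: resolve "auto" to a concrete language code via langdetect.
from langdetect import DetectorFactory, LangDetectException, detect

DetectorFactory.seed = 0  # langdetect is probabilistic; seed it for stable results

SUPPORTED = {"ar", "zh", "cs", "en"}  # illustrative subset of the dropdown choices

def resolve_language(requested: str, text: str, fallback: str = "en") -> str:
    """Return a usable language code, detecting one when 'auto' is requested."""
    if requested != "auto":
        return requested
    try:
        code = detect(text).split("-")[0]  # normalise tags like "zh-cn" to "zh"
    except LangDetectException:            # raised for empty/undetectable input
        return fallback
    return code if code in SUPPORTED else fallback
```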
diff --git a/system/tts_engines/f5tts/f5tts_settings_page.py b/system/tts_engines/f5tts/f5tts_settings_page.py
index f58802d3..6e72e364 100644
--- a/system/tts_engines/f5tts/f5tts_settings_page.py
+++ b/system/tts_engines/f5tts/f5tts_settings_page.py
@@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres
 # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
 #
 # You do not need to modify the function's logic or any other part of the code.
-def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
     # Load the model_config_data from the JSON file
     with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
         model_config_data = json.load(f)
@@ -148,13 +148,17 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l
     model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
     model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
     model_config_data["openai_voices"]["alloy"] = alloy_gr
+    model_config_data["openai_voices"]["ash"] = ash_gr
+    model_config_data["openai_voices"]["coral"] = coral_gr
     model_config_data["openai_voices"]["echo"] = echo_gr
     model_config_data["openai_voices"]["fable"] = fable_gr
     model_config_data["openai_voices"]["nova"] = nova_gr
     model_config_data["openai_voices"]["onyx"] = onyx_gr
+    model_config_data["openai_voices"]["sage"] = sage_gr
     model_config_data["openai_voices"]["shimmer"] = shimmer_gr
     model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
     model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
     model_config_data["settings"]["temperature_set"] = temperature_set_gr
     model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
     model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -192,6 +196,7 @@ def f5tts_model_alltalk_settings(model_config_data):
     with gr.Row():
         lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
         deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+        streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
         temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
         repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
         pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -202,12 +207,17 @@ def f5tts_model_alltalk_settings(model_config_data):
     with gr.Group():
         with gr.Row():
             alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+            ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
+            coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
             echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
             nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+            sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
             shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
     with gr.Column():
         gr.Markdown("### Default Voices")
@@ -226,7 +236,7 @@ def f5tts_model_alltalk_settings(model_config_data):
     with gr.Row():
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
-    submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+    submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
 ###########################################################################################
 # Do not change this section apart from "TTS Engine Name" value to match your engine name #
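Reviewer note: every engine's `*_model_update_settings` function follows the shape shown above. Gradio radios return the selected label string, so `"Enabled"`/`"Disabled"` is compared against `"Enabled"` to get a boolean before the JSON is persisted. A condensed sketch of that round trip; the helper name and the `indent=4` write-back (which the hunks above do not show) are assumptions:

```python
import json
import os

# Condensed form of the settings-update pattern: read model_settings.json,
# convert the radio's label string to a boolean, write the file back out.
def save_streaming_setting(this_dir: str, streaming_enabled_gr: str) -> None:
    settings_path = os.path.join(this_dir, "model_settings.json")
    with open(settings_path, "r") as f:
        model_config_data = json.load(f)
    # gr.Radio hands back the chosen label, not a boolean, hence the comparison.
    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
    with open(settings_path, "w") as f:
        json.dump(model_config_data, f, indent=4)
```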
diff --git a/system/tts_engines/f5tts/help_content.py b/system/tts_engines/f5tts/help_content.py
index d339c250..5e08102e 100644
--- a/system/tts_engines/f5tts/help_content.py
+++ b/system/tts_engines/f5tts/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
       - Accelerates TTS generation using optimized inference
       - Only available for engines and models that support DeepSpeed
      - Requires NVIDIA GPU with CUDA support
+
+    - **Stream Response Capability**
+      - Enables real-time streaming of generated speech output
+      - Reduces latency for faster feedback during synthesis
+      - Only available for engines and models that support Streaming
 
     - **Temperature Control**
       - Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
     ### OpenAI Voice Mappings
     - Only relevant when using the OpenAI-compatible API endpoint
-    - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+    - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
       - `alloy`
+      - `ash`
+      - `coral`
       - `echo`
       - `fable`
       - `nova`
       - `onyx`
+      - `sage`
       - `shimmer`
     - Essential for maintaining compatibility with OpenAI API calls
     - Each mapping can be customized to any available voice in the current engine
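Reviewer note: with `ash`, `coral`, and `sage` mapped, all nine current OpenAI voice names should work against the OpenAI-compatible endpoint. A hedged usage example with the official `openai` Python client; the base URL/port and dummy API key are assumptions about a local install, not values taken from this patch:

```python
from openai import OpenAI

# Point the official client at a local OpenAI-compatible endpoint (URL assumed).
client = OpenAI(base_url="http://127.0.0.1:7851/v1", api_key="none")

response = client.audio.speech.create(
    model="tts-1",                      # passed through to the endpoint
    voice="ash",                        # one of the three newly mapped voices
    input="Testing the new ash voice mapping.",
)
response.write_to_file("ash_test.mp3")  # save the returned audio
```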
diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py
index edb57b5e..348ae78f 100644
--- a/system/tts_engines/f5tts/model_engine.py
+++ b/system/tts_engines/f5tts/model_engine.py
@@ -144,6 +144,7 @@ def __init__(self):
         self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
         self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
         self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
         self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used)
         self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
         self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
@@ -154,10 +155,13 @@ def __init__(self):
         self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp)
         # Gather the OpenAI API Voice Mappings
         self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
         self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
         self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
         self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
         self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
         self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
 ###################################################################
 # DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -400,9 +404,15 @@ def scan_models_folder(self):
             if model_dir.is_dir():
                 # First try to find model_*.safetensors files
                 model_files = list(model_dir.glob("model_*.safetensors"))
+                if not model_files:
+                    # If no model_*.safetensors found, try finding a model_*.pt file as fallback
+                    model_files = list(model_dir.glob("model_*.pt"))
                 if not model_files:
                     # If no model_*.safetensors found, try any .safetensors file
                     model_files = list(model_dir.glob("*.safetensors"))
+                if not model_files:
+                    # If no .safetensors file found either, try any .pt file
+                    model_files = list(model_dir.glob("*.pt"))
                 vocab_file = model_dir / "vocab.txt"
                 vocos_dir = model_dir / "vocos"
@@ -508,9 +519,15 @@ async def api_manual_load_model(self, model_name):
         # Dynamically find the safetensors model file
         model_files = list(model_dir.glob("model_*.safetensors"))
+        if not model_files:
+            # Try finding the pt model file as fallback
+            model_files = list(model_dir.glob("model_*.pt"))
         if not model_files:
             # Try finding any safetensors file as fallback
             model_files = list(model_dir.glob("*.safetensors"))
+        if not model_files:
+            # Try finding any pt file as fallback
+            model_files = list(model_dir.glob("*.pt"))
         if not model_files:
             print(f"[{self.branding}ENG] \033[91mError\033[0m: No model's safetensors file was found in the F5-TTS models directory.")
@@ -1082,7 +1099,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         generate_end_time = time.time()
         generate_elapsed_time = generate_end_time - generate_start_time
-        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
         if streaming:
             with open(output_file, 'rb') as f:
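Reviewer note: both `scan_models_folder` and `api_manual_load_model` now walk the same four-step fallback: `model_*.safetensors`, then `model_*.pt`, then any `*.safetensors`, then any `*.pt`. The chain could be expressed once; a sketch with a hypothetical helper name:

```python
from pathlib import Path
from typing import Optional

# Hypothetical consolidation of the repeated lookup: patterns are tried in
# priority order and the first match wins.
def find_model_file(model_dir: Path) -> Optional[Path]:
    for pattern in ("model_*.safetensors", "model_*.pt", "*.safetensors", "*.pt"):
        matches = sorted(model_dir.glob(pattern))
        if matches:
            return matches[0]
    return None  # caller prints the branded error message in this case
```

One follow-up observation: the unchanged error string still mentions only a safetensors file, even though `.pt` files are now also searched.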
diff --git a/system/tts_engines/f5tts/model_settings.json b/system/tts_engines/f5tts/model_settings.json
index 034d0a47..f70ba63b 100644
--- a/system/tts_engines/f5tts/model_settings.json
+++ b/system/tts_engines/f5tts/model_settings.json
@@ -24,6 +24,7 @@
     "def_character_voice": "female_01.wav",
     "def_narrator_voice": "female_01.wav",
     "deepspeed_enabled": false,
+    "streaming_enabled": false,
     "engine_installed": true,
     "generationspeed_set": 0.9,
     "lowvram_enabled": true,
@@ -33,10 +34,13 @@
     },
     "openai_voices": {
         "alloy": "female_01.wav",
+        "ash": "female_01.wav",
+        "coral": "female_01.wav",
         "echo": "female_01.wav",
         "fable": "female_01.wav",
         "nova": "female_01.wav",
         "onyx": "female_01.wav",
-        "shimmer": "female_01.wavf"
+        "sage": "female_01.wav",
+        "shimmer": "female_01.wav"
     }
 }
\ No newline at end of file
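Reviewer note: this hunk also fixes the pre-existing `"female_01.wavf"` typo for `shimmer`. At request time each engine resolves an OpenAI voice name through this table (the engines store the values as `self.openai_*` attributes, per the model_engine.py hunks). A dictionary-based equivalent of that lookup, with an assumed fallback for unknown names:

```python
# Illustrative runtime lookup equivalent to the self.openai_* attributes.
OPENAI_VOICE_MAP = {
    "alloy": "female_01.wav", "ash": "female_01.wav", "coral": "female_01.wav",
    "echo": "female_01.wav", "fable": "female_01.wav", "nova": "female_01.wav",
    "onyx": "female_01.wav", "sage": "female_01.wav", "shimmer": "female_01.wav",
}

def resolve_openai_voice(name: str, default: str = "female_01.wav") -> str:
    # Falling back instead of raising on unknown names is an assumption.
    return OPENAI_VOICE_MAP.get(name.lower(), default)
```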
diff --git a/system/tts_engines/parler/help_content.py b/system/tts_engines/parler/help_content.py
index f111302c..a210ed76 100644
--- a/system/tts_engines/parler/help_content.py
+++ b/system/tts_engines/parler/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
       - Accelerates TTS generation using optimized inference
       - Only available for engines and models that support DeepSpeed
       - Requires NVIDIA GPU with CUDA support
+
+    - **Stream Response Capability**
+      - Enables real-time streaming of generated speech output
+      - Reduces latency for faster feedback during synthesis
+      - Only available for engines and models that support Streaming
 
     - **Temperature Control**
       - Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
     ### OpenAI Voice Mappings
     - Only relevant when using the OpenAI-compatible API endpoint
-    - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+    - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
       - `alloy`
+      - `ash`
+      - `coral`
       - `echo`
       - `fable`
       - `nova`
       - `onyx`
+      - `sage`
       - `shimmer`
     - Essential for maintaining compatibility with OpenAI API calls
     - Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/parler/model_engine.py b/system/tts_engines/parler/model_engine.py
index 83387745..086df061 100644
--- a/system/tts_engines/parler/model_engine.py
+++ b/system/tts_engines/parler/model_engine.py
@@ -91,6 +91,7 @@ def __init__(self):
         self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
         self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
         self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
         self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used)
         self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
         self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
@@ -101,10 +102,13 @@ def __init__(self):
         self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp)
         # Gather the OpenAI API Voice Mappings
         self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
         self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
         self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
         self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
         self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
         self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
 ###################################################################
 # DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -503,7 +507,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
         generate_end_time = time.time() # Record the end time to generate TTS
         generate_elapsed_time = generate_end_time - generate_start_time
-        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
         if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False:
             await self.handle_lowvram_change()
         self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again.
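Reviewer note: the f5tts `generate_tts` hunk earlier shows the streaming branch reading the finished file back (`if streaming: with open(output_file, 'rb') as f:`). A typical way to turn that into a chunked HTTP response with FastAPI, which is already in the requirements; the chunk size, media type, and function name are assumptions, not this project's code:

```python
from fastapi.responses import StreamingResponse

# Illustrative chunked response for the streaming branch: read the rendered
# audio file back in fixed-size chunks and stream it to the caller.
def stream_audio_file(output_file: str, chunk_size: int = 8192) -> StreamingResponse:
    def iter_chunks():
        with open(output_file, "rb") as f:
            while chunk := f.read(chunk_size):
                yield chunk
    return StreamingResponse(iter_chunks(), media_type="audio/wav")
```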
diff --git a/system/tts_engines/parler/model_settings.json b/system/tts_engines/parler/model_settings.json
index 580f5cb0..d0de670a 100644
--- a/system/tts_engines/parler/model_settings.json
+++ b/system/tts_engines/parler/model_settings.json
@@ -24,6 +24,7 @@
     "def_character_voice": "enthusiastic_female",
     "def_narrator_voice": "enthusiastic_female",
     "deepspeed_enabled": false,
+    "streaming_enabled": false,
     "engine_installed": true,
     "generationspeed_set": 1,
     "lowvram_enabled": false,
@@ -33,10 +34,13 @@
     },
     "openai_voices": {
         "alloy": "enthusiastic_female",
+        "ash": "enthusiastic_female",
+        "coral": "enthusiastic_female",
         "echo": "enthusiastic_female",
         "fable": "enthusiastic_female",
         "nova": "enthusiastic_female",
         "onyx": "enthusiastic_female",
+        "sage": "enthusiastic_female",
         "shimmer": "enthusiastic_female"
     }
 }
\ No newline at end of file
diff --git a/system/tts_engines/parler/parler_settings_page.py b/system/tts_engines/parler/parler_settings_page.py
index e2b1fc85..55cc077b 100644
--- a/system/tts_engines/parler/parler_settings_page.py
+++ b/system/tts_engines/parler/parler_settings_page.py
@@ -52,7 +52,7 @@ def parler_voices_file_list():
 # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
 #
 # You do not need to modify the function's logic or any other part of the code.
-def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
     # Load the model_config_data from the JSON file
     with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
         model_config_data = json.load(f)
@@ -60,13 +60,17 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr,
     model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
     model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
     model_config_data["openai_voices"]["alloy"] = alloy_gr
+    model_config_data["openai_voices"]["ash"] = ash_gr
+    model_config_data["openai_voices"]["coral"] = coral_gr
     model_config_data["openai_voices"]["echo"] = echo_gr
     model_config_data["openai_voices"]["fable"] = fable_gr
     model_config_data["openai_voices"]["nova"] = nova_gr
     model_config_data["openai_voices"]["onyx"] = onyx_gr
+    model_config_data["openai_voices"]["sage"] = sage_gr
     model_config_data["openai_voices"]["shimmer"] = shimmer_gr
     model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
     model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
     model_config_data["settings"]["temperature_set"] = temperature_set_gr
     model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
     model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -104,6 +108,7 @@ def parler_model_alltalk_settings(model_config_data):
     with gr.Row():
         lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
         deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+        streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
         temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
         repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
         pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -114,12 +119,17 @@ def parler_model_alltalk_settings(model_config_data):
     with gr.Group():
         with gr.Row():
             alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+            ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
+            coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
             echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
             nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+            sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
             shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
     with gr.Column():
         gr.Markdown("### Default Voices")
@@ -132,7 +142,7 @@ def parler_model_alltalk_settings(model_config_data):
     with gr.Row():
         submit_button = gr.Button("Update Settings")
         output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False)
-    submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+    submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
     with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False):
         with gr.Row():
             gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown")
diff --git a/system/tts_engines/piper/help_content.py b/system/tts_engines/piper/help_content.py
index 4f32cb79..2f1d1db8 100644
--- a/system/tts_engines/piper/help_content.py
+++ b/system/tts_engines/piper/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
       - Accelerates TTS generation using optimized inference
       - Only available for engines and models that support DeepSpeed
       - Requires NVIDIA GPU with CUDA support
+
+    - **Stream Response Capability**
+      - Enables real-time streaming of generated speech output
+      - Reduces latency for faster feedback during synthesis
+      - Only available for engines and models that support Streaming
 
     - **Temperature Control**
       - Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
     ### OpenAI Voice Mappings
     - Only relevant when using the OpenAI-compatible API endpoint
-    - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+    - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
       - `alloy`
+      - `ash`
+      - `coral`
       - `echo`
       - `fable`
       - `nova`
       - `onyx`
+      - `sage`
       - `shimmer`
     - Essential for maintaining compatibility with OpenAI API calls
     - Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/piper/model_engine.py b/system/tts_engines/piper/model_engine.py
index 438e7bed..4d12ba8d 100644
--- a/system/tts_engines/piper/model_engine.py
+++ b/system/tts_engines/piper/model_engine.py
@@ -90,6 +90,7 @@ def __init__(self):
         self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
         self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
         self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
         self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used)
         self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
         self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
@@ -100,10 +101,13 @@ def __init__(self):
         self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp)
         # Gather the OpenAI API Voice Mappings
         self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
         self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
         self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
         self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
         self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
         self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
 ###################################################################
 # DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -468,5 +472,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
         generate_end_time = time.time()
         generate_elapsed_time = generate_end_time - generate_start_time
-        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
         self.tts_generating_lock = False
diff --git a/system/tts_engines/piper/model_settings.json b/system/tts_engines/piper/model_settings.json
index d9345456..eaa8d740 100644
--- a/system/tts_engines/piper/model_settings.json
+++ b/system/tts_engines/piper/model_settings.json
@@ -24,6 +24,7 @@
     "def_character_voice": "en_US-ljspeech-high.onnx",
     "def_narrator_voice": "en_US-ljspeech-high.onnx",
     "deepspeed_enabled": false,
+    "streaming_enabled": false,
     "engine_installed": true,
     "generationspeed_set": 1,
     "lowvram_enabled": false,
@@ -33,10 +34,13 @@
     },
     "openai_voices": {
         "alloy": "en_US-ljspeech-high.onnx",
+        "ash": "en_US-ljspeech-high.onnx",
+        "coral": "en_US-ljspeech-high.onnx",
         "echo": "en_US-ljspeech-high.onnx",
         "fable": "en_US-ljspeech-high.onnx",
         "nova": "en_US-ljspeech-high.onnx",
         "onyx": "en_US-ljspeech-high.onnx",
+        "sage": "en_US-ljspeech-high.onnx",
         "shimmer": "en_US-ljspeech-high.onnx"
     }
 }
diff --git a/system/tts_engines/piper/piper_settings_page.py b/system/tts_engines/piper/piper_settings_page.py
index fa8708dd..f5e9e9b1 100644
--- a/system/tts_engines/piper/piper_settings_page.py
+++ b/system/tts_engines/piper/piper_settings_page.py
@@ -197,7 +197,7 @@ def download_language_pack(lang_code, progress=gr.Progress()):
 # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
 #
 # You do not need to modify the function's logic or any other part of the code.
-def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
     # Load the model_config_data from the JSON file
     with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
         model_config_data = json.load(f)
@@ -205,13 +205,17 @@ def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l
     model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
     model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
     model_config_data["openai_voices"]["alloy"] = alloy_gr
+    model_config_data["openai_voices"]["ash"] = ash_gr
+    model_config_data["openai_voices"]["coral"] = coral_gr
     model_config_data["openai_voices"]["echo"] = echo_gr
     model_config_data["openai_voices"]["fable"] = fable_gr
     model_config_data["openai_voices"]["nova"] = nova_gr
     model_config_data["openai_voices"]["onyx"] = onyx_gr
+    model_config_data["openai_voices"]["sage"] = sage_gr
     model_config_data["openai_voices"]["shimmer"] = shimmer_gr
     model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
     model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
     model_config_data["settings"]["temperature_set"] = temperature_set_gr
     model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
     model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -249,6 +253,7 @@ def piper_model_alltalk_settings(model_config_data):
     with gr.Row():
         lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
         deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+        streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
         temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
         repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
         pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -259,12 +264,17 @@ def piper_model_alltalk_settings(model_config_data):
     with gr.Group():
         with gr.Row():
             alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+            ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
+            coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
             echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
             nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+            sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
             shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
     with gr.Column():
         gr.Markdown("### Default Voices")
@@ -283,7 +293,7 @@ def piper_model_alltalk_settings(model_config_data):
     with gr.Row():
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
-    submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+    submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
 ###########################################################################################
 # Do not change this section apart from "TTS Engine Name" value to match your engine name #
diff --git a/system/tts_engines/template-tts-engine/help_content.py b/system/tts_engines/template-tts-engine/help_content.py
index 277f9746..b1f6acad 100644
--- a/system/tts_engines/template-tts-engine/help_content.py
+++ b/system/tts_engines/template-tts-engine/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
       - Accelerates TTS generation using optimized inference
       - Only available for engines and models that support DeepSpeed
       - Requires NVIDIA GPU with CUDA support
+
+    - **Stream Response Capability**
+      - Enables real-time streaming of generated speech output
+      - Reduces latency for faster feedback during synthesis
+      - Only available for engines and models that support Streaming
 
     - **Temperature Control**
       - Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
     ### OpenAI Voice Mappings
     - Only relevant when using the OpenAI-compatible API endpoint
-    - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+    - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
       - `alloy`
+      - `ash`
+      - `coral`
       - `echo`
       - `fable`
       - `nova`
       - `onyx`
+      - `sage`
       - `shimmer`
     - Essential for maintaining compatibility with OpenAI API calls
     - Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/template-tts-engine/model_engine.py b/system/tts_engines/template-tts-engine/model_engine.py
index 5f78bb63..08b5b0e9 100644
--- a/system/tts_engines/template-tts-engine/model_engine.py
+++ b/system/tts_engines/template-tts-engine/model_engine.py
@@ -92,6 +92,7 @@ def __init__(self):
         self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified.
         self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified.
         self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine
         self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used)
         self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
         self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine
@@ -102,10 +103,13 @@ def __init__(self):
         self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp)
         # Gather the OpenAI API Voice Mappings
         self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
         self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
         self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
         self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
         self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
         self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
 ###################################################################
 # DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -417,5 +421,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
         generate_end_time = time.time()
         generate_elapsed_time = generate_end_time - generate_start_time
-        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
         self.tts_generating_lock = False
diff --git a/system/tts_engines/template-tts-engine/model_settings.json b/system/tts_engines/template-tts-engine/model_settings.json
index a7707549..ac7edcdf 100644
--- a/system/tts_engines/template-tts-engine/model_settings.json
+++ b/system/tts_engines/template-tts-engine/model_settings.json
@@ -24,6 +24,7 @@
     "def_character_voice": "female_01.wav",
     "def_narrator_voice": "male_01.wav",
     "deepspeed_enabled": true,
+    "streaming_enabled": false,
     "engine_installed": true,
     "generationspeed_set": 1,
     "lowvram_enabled": false,
@@ -33,10 +34,13 @@
     },
     "openai_voices": {
         "alloy": "female_01.wav",
+        "ash": "female_01.wav",
+        "coral": "female_01.wav",
         "echo": "female_01.wav",
         "fable": "female_01.wav",
         "nova": "female_01.wav",
         "onyx": "female_01.wav",
+        "sage": "female_01.wav",
         "shimmer": "female_01.wav"
     }
 }
\ No newline at end of file
diff --git a/system/tts_engines/template-tts-engine/modelname_settings_page.py b/system/tts_engines/template-tts-engine/modelname_settings_page.py
index 8fb2b4f7..d9586bd6 100644
--- a/system/tts_engines/template-tts-engine/modelname_settings_page.py
+++ b/system/tts_engines/template-tts-engine/modelname_settings_page.py
@@ -48,7 +48,7 @@ def xtts_voices_file_list():
 #
 # You do not need to modify the function's logic or any other part of the code.
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
     # Load the model_config_data from the JSON file
     with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
         model_config_data = json.load(f)
@@ -57,13 +57,17 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo
     model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
     model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
     model_config_data["openai_voices"]["alloy"] = alloy_gr
+    model_config_data["openai_voices"]["ash"] = ash_gr
+    model_config_data["openai_voices"]["coral"] = coral_gr
     model_config_data["openai_voices"]["echo"] = echo_gr
     model_config_data["openai_voices"]["fable"] = fable_gr
     model_config_data["openai_voices"]["nova"] = nova_gr
     model_config_data["openai_voices"]["onyx"] = onyx_gr
+    model_config_data["openai_voices"]["sage"] = sage_gr
     model_config_data["openai_voices"]["shimmer"] = shimmer_gr
     model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
     model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
     model_config_data["settings"]["temperature_set"] = temperature_set_gr
     model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
     model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -104,6 +108,7 @@ def xtts_model_alltalk_settings(model_config_data):
     with gr.Row():
         lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
         deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+        streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
         temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
         repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
         pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -114,12 +119,17 @@ def xtts_model_alltalk_settings(model_config_data):
     with gr.Group():
         with gr.Row():
             alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+            ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
+            coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
             echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
             nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+            sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
             shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
     with gr.Column():
         gr.Markdown("### Default Voices")
@@ -138,7 +148,7 @@ def xtts_model_alltalk_settings(model_config_data):
     with gr.Row():
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
-    submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+    submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
 ###########################################################################################
 # Do not change this section apart from "TTS Engine Name" value to match your engine name #
diff --git a/system/tts_engines/template-tts-engine/template_engine.py b/system/tts_engines/template-tts-engine/template_engine.py
index f95c5a96..b57593e3 100644
--- a/system/tts_engines/template-tts-engine/template_engine.py
+++ b/system/tts_engines/template-tts-engine/template_engine.py
@@ -270,10 +270,13 @@ def __init__(self):
     OpenAI Voice Mappings:
     - self.openai_alloy: Alloy voice mapping
+    - self.openai_ash: Ash voice mapping
+    - self.openai_coral: Coral voice mapping
     - self.openai_echo: Echo voice mapping
     - self.openai_fable: Fable voice mapping
     - self.openai_nova: Nova voice mapping
     - self.openai_onyx: Onyx voice mapping
+    - self.openai_sage: Sage voice mapping
     - self.openai_shimmer: Shimmer voice mapping
 
     Integration Requirements:
@@ -339,11 +342,14 @@ def __init__(self):
 
         # DO NOT MODIFY - OpenAI voice mappings from model_settings.json
         self.openai_alloy = model_settings_file["openai_voices"]["alloy"]
+        self.openai_ash = model_settings_file["openai_voices"]["ash"]
+        self.openai_coral = model_settings_file["openai_voices"]["coral"]
         self.openai_echo = model_settings_file["openai_voices"]["echo"]
         self.openai_fable = model_settings_file["openai_voices"]["fable"]
         self.openai_nova = model_settings_file["openai_voices"]["nova"]
         self.openai_onyx = model_settings_file["openai_voices"]["onyx"]
-        self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"]
+        self.openai_sage = model_settings_file["openai_voices"]["sage"]
+        self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"]
     """
     Below is the name of the folder that will be created-used under `/models/{folder}`
diff --git a/system/tts_engines/vits/help_content.py b/system/tts_engines/vits/help_content.py
index 21d48c65..f0478e7b 100644
--- a/system/tts_engines/vits/help_content.py
+++ b/system/tts_engines/vits/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
       - Accelerates TTS generation using optimized inference
       - Only available for engines and models that support DeepSpeed
       - Requires NVIDIA GPU with CUDA support
+
+    - **Stream Response Capability**
+      - Enables real-time streaming of generated speech output
+      - Reduces latency for faster feedback during synthesis
+      - Only available for engines and models that support Streaming
 
     - **Temperature Control**
       - Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
     ### OpenAI Voice Mappings
     - Only relevant when using the OpenAI-compatible API endpoint
-    - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+    - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
       - `alloy`
+      - `ash`
+      - `coral`
       - `echo`
       - `fable`
       - `nova`
       - `onyx`
+      - `sage`
       - `shimmer`
     - Essential for maintaining compatibility with OpenAI API calls
     - Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/vits/model_engine.py b/system/tts_engines/vits/model_engine.py
index 9e514ba5..a50b6070 100644
--- a/system/tts_engines/vits/model_engine.py
+++ b/system/tts_engines/vits/model_engine.py
@@ -109,10 +109,13 @@ def __init__(self):
         self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp)
         # Gather the OpenAI API Voice Mappings
         self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
         self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
         self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
         self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
         self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
         self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
 ###################################################################
 # DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -638,7 +641,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
         # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
         generate_end_time = time.time() # Record the end time to generate TTS
         generate_elapsed_time = generate_end_time - generate_start_time
-        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+        print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
         if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False:
             await self.handle_lowvram_change()
         self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again.
diff --git a/system/tts_engines/vits/model_settings.json b/system/tts_engines/vits/model_settings.json
index f50609e2..1e9c6d89 100644
--- a/system/tts_engines/vits/model_settings.json
+++ b/system/tts_engines/vits/model_settings.json
@@ -24,6 +24,7 @@
     "def_character_voice": "p225",
     "def_narrator_voice": "p226",
     "deepspeed_enabled": false,
+    "streaming_enabled": false,
     "engine_installed": true,
     "generationspeed_set": 1,
     "lowvram_enabled": true,
@@ -33,10 +34,13 @@
     },
     "openai_voices": {
         "alloy": "p225",
+        "ash": "p225",
+        "coral": "p225",
         "echo": "p225",
         "fable": "p225",
         "nova": "p225",
         "onyx": "p225",
+        "sage": "p225",
         "shimmer": "p225"
     }
 }
\ No newline at end of file
diff --git a/system/tts_engines/vits/vits_settings_page.py b/system/tts_engines/vits/vits_settings_page.py
index 32326980..d73a7c30 100644
--- a/system/tts_engines/vits/vits_settings_page.py
+++ b/system/tts_engines/vits/vits_settings_page.py
@@ -231,7 +231,7 @@ def download_language_pack(lang_code, progress=gr.Progress()):
 # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
 #
 # You do not need to modify the function's logic or any other part of the code.
-def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
     # Load the model_config_data from the JSON file
     with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
         model_config_data = json.load(f)
@@ -239,13 +239,17 @@ def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo
     model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
     model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
     model_config_data["openai_voices"]["alloy"] = alloy_gr
+    model_config_data["openai_voices"]["ash"] = ash_gr
+    model_config_data["openai_voices"]["coral"] = coral_gr
     model_config_data["openai_voices"]["echo"] = echo_gr
     model_config_data["openai_voices"]["fable"] = fable_gr
     model_config_data["openai_voices"]["nova"] = nova_gr
     model_config_data["openai_voices"]["onyx"] = onyx_gr
+    model_config_data["openai_voices"]["sage"] = sage_gr
     model_config_data["openai_voices"]["shimmer"] = shimmer_gr
     model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
     model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+    model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
     model_config_data["settings"]["temperature_set"] = temperature_set_gr
     model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
     model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -283,6 +287,7 @@ def vits_model_alltalk_settings(model_config_data):
     with gr.Row():
         lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
         deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+        streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
         temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
         repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
         pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -293,12 +298,17 @@ def vits_model_alltalk_settings(model_config_data):
     with gr.Group():
         with gr.Row():
             alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+            ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
+            coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
             echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
             nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
         with gr.Row():
             onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+            sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+        with gr.Row():
             shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
     with gr.Column():
         gr.Markdown("### Default Voices")
@@ -317,7 +327,7 @@ def vits_model_alltalk_settings(model_config_data):
     with gr.Row():
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
         gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
-    submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+    submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/xtts/help_content.py b/system/tts_engines/xtts/help_content.py index cc03f8be..03c35f66 100644 --- a/system/tts_engines/xtts/help_content.py +++ b/system/tts_engines/xtts/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation @@ -267,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine @@ -310,6 +318,11 @@ class AllTalkHelpContent: - Requires NVIDIA GPU with CUDA support - 2-3x speed improvement in generation - Recommended when available + + - **Streaming Support** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Multi-Language Support** - Clone voices across multiple languages diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index c4504888..5231f715 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -24,7 +24,6 @@ Note: You can add new functions, just DONT remove the functions that are already there, even if they are doing nothing as `tts_server.py` will still look for their existance and fail if they are missing. 
""" - ######################################## # Default imports # Do not change this # ######################################## @@ -291,10 +290,13 @@ def __init__(self): OpenAI Voice Mappings: - self.openai_alloy: Alloy voice mapping + - self.openai_ash: Ash voice mapping + - self.openai_coral: Coral voice mapping - self.openai_echo: Echo voice mapping - self.openai_fable: Fable voice mapping - self.openai_nova: Nova voice mapping - self.openai_onyx: Onyx voice mapping + - self.openai_sage: Sage voice mapping - self.openai_shimmer: Shimmer voice mapping Integration Requirements: @@ -350,6 +352,7 @@ def __init__(self): self.def_character_voice = model_settings_file["settings"]["def_character_voice"] self.def_narrator_voice = model_settings_file["settings"]["def_narrator_voice"] self.deepspeed_enabled = model_settings_file["settings"]["deepspeed_enabled"] + self.streaming_enabled = model_settings_file["settings"]["streaming_enabled"] self.engine_installed = model_settings_file["settings"]["engine_installed"] self.generationspeed_set = model_settings_file["settings"]["generationspeed_set"] self.lowvram_enabled = model_settings_file["settings"]["lowvram_enabled"] @@ -360,10 +363,13 @@ def __init__(self): # DO NOT MODIFY - OpenAI voice mappings from model_settings.json self.openai_alloy = model_settings_file["openai_voices"]["alloy"] + self.openai_ash = model_settings_file["openai_voices"]["ash"] + self.openai_coral = model_settings_file["openai_voices"]["coral"] self.openai_echo = model_settings_file["openai_voices"]["echo"] self.openai_fable = model_settings_file["openai_voices"]["fable"] self.openai_nova = model_settings_file["openai_voices"]["nova"] self.openai_onyx = model_settings_file["openai_voices"]["onyx"] + self.openai_sage = model_settings_file["openai_voices"]["sage"] self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"] """ @@ -967,7 +973,44 @@ async def handle_tts_method_change(self, tts_method): self.print_message(f"\033[94mModel Loadtime: \033[93m{generate_elapsed_time:.2f}\033[94m seconds\033[0m") return True - async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming): + async def prepare_voice_inputs(self, voice): + """Prepares latents and embeddings based on the voice input.""" + gpt_cond_latent = None + speaker_embedding = None + + if voice.startswith('latent:'): + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._load_latents(voice) + + elif voice.startswith('voiceset:'): + voice_set = voice.replace("voiceset:", "") + voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set) + self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts") + + wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav")) + if not wavs_files: + self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error") + raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}") + + if len(wavs_files) > 5: + wavs_files = random.sample(wavs_files, 5) + self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + else: + normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice)) + wavs_files = [normalized_path] + self.print_message(f"Using single voice sample: 
{normalized_path}", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + return gpt_cond_latent, speaker_embedding + + async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, + streaming): """ Generate speech from text using the XTTS model. @@ -1017,71 +1060,33 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena generate_start_time = time.time() try: - # Voice input processing - self.print_message(f"Processing voice input: {voice}", message_type="debug_tts") - gpt_cond_latent = None - speaker_embedding = None - - # Handle different voice types - if voice.startswith('latent:'): - if self.current_model_loaded.startswith("xtts"): - gpt_cond_latent, speaker_embedding = self._load_latents(voice) - - elif voice.startswith('voiceset:'): - voice_set = voice.replace("voiceset:", "") - voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set) - self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts") - - wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav")) - if not wavs_files: - self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error") - raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}") - - if len(wavs_files) > 5: - wavs_files = random.sample(wavs_files, 5) - self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts") - - if self.current_model_loaded.startswith("xtts"): - self.print_message("Generating conditioning latents from voice set", message_type="debug_tts") - gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) - - else: - normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice)) - wavs_files = [normalized_path] - self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts") - - if self.current_model_loaded.startswith("xtts"): - self.print_message("Generating conditioning latents from single sample", message_type="debug_tts") - gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) - - # Generate speech + # Preparation of latents and embeddings + gpt_cond_latent, speaker_embedding = await self.prepare_voice_inputs(voice) + + common_args = { + "text": text, + "language": language, + "gpt_cond_latent": gpt_cond_latent, + "speaker_embedding": speaker_embedding, + "temperature": float(temperature), + "length_penalty": float(self.model.config.length_penalty), + "repetition_penalty": float(repetition_penalty), + "top_k": int(self.model.config.top_k), + "top_p": float(self.model.config.top_p), + "speed": float(speed), + "enable_text_splitting": True + } + + self.print_message("Generation settings:", message_type="debug_tts_variables") + self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables") + self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables") + self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables") + self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables") + + # Handle streaming vs non-streaming if self.current_model_loaded.startswith("xtts"): - self.print_message(f"Generating speech for text: {text}", message_type="debug_tts") - - common_args = { - "text": text, - "language": 
language, - "gpt_cond_latent": gpt_cond_latent, - "speaker_embedding": speaker_embedding, - "temperature": float(temperature), - "length_penalty": float(self.model.config.length_penalty), - "repetition_penalty": float(repetition_penalty), - "top_k": int(self.model.config.top_k), - "top_p": float(self.model.config.top_p), - "speed": float(speed), - "enable_text_splitting": True - } - - self.print_message("Generation settings:", message_type="debug_tts_variables") - self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables") - self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables") - self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables") - self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables") - - # Handle streaming vs non-streaming if streaming: self.print_message("Starting streaming generation", message_type="debug_tts") - self.print_message(f"Using streaming-based generation and files {wavs_files}") output = self.model.inference_stream(**common_args, stream_chunk_size=20) file_chunks = [] @@ -1101,7 +1106,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena self.tts_generating_lock = False break - self.print_message(f"Processing chunk {i+1}", message_type="debug_tts") + self.print_message(f"Processing chunk {i + 1}", message_type="debug_tts") file_chunks.append(chunk) if isinstance(chunk, list): chunk = torch.cat(chunk, dim=0) @@ -1118,9 +1123,9 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena elif self.current_model_loaded.startswith("apitts"): if streaming: - raise ValueError("Streaming is only supported in XTTSv2 local mode") + raise ValueError("Streaming is not supported in APITTS mode") # Common arguments for both error and normal cases - common_args = { + api_args = { "file_path": output_file, "language": language, "temperature": temperature, @@ -1128,23 +1133,20 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena "repetition_penalty": repetition_penalty, "top_k": self.model.config.top_k, "top_p": self.model.config.top_p, - "speed": speed - } - if voice.startswith('latent:'): + "speed": speed, + } + + if voice.startswith("latent:"): self.print_message("API TTS method does not support latent files - Please use an audio reference file", message_type="error") self.model.tts_to_file( text="The API TTS method only supports audio files not latents. Please select an audio reference file instead.", speaker="Ana Florence", - **common_args + **api_args, ) else: self.print_message("Using API-based generation", message_type="debug_tts") - self.model.tts_to_file( - text=text, - speaker_wav=wavs_files, - **common_args - ) - + self.model.tts_to_file(text=text, speaker_wav=[voice], **api_args) + self.print_message(f"API generation completed, saved to: {output_file}", message_type="debug_tts") finally: @@ -1154,7 +1156,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # Standard output message (not debug) self.print_message( - f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m", + f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m", message_type="standard" ) diff --git a/system/tts_engines/xtts/model_settings.json b/system/tts_engines/xtts/model_settings.json index b680cfa2..ed5b298c 100644 --- a/system/tts_engines/xtts/model_settings.json +++ b/system/tts_engines/xtts/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "male_01.wav", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, @@ -33,10 +34,13 @@ }, "openai_voices": { "alloy": "female_01.wav", + "ash": "female_01.wav", + "coral": "female_01.wav", "echo": "female_01.wav", "fable": "female_01.wav", "nova": "female_01.wav", "onyx": "female_01.wav", + "sage": "female_01.wav", "shimmer": "female_01.wav" } } \ No newline at end of file diff --git a/system/tts_engines/xtts/xtts_settings_page.py b/system/tts_engines/xtts/xtts_settings_page.py index cb3f7e70..fe459427 100644 --- a/system/tts_engines/xtts/xtts_settings_page.py +++ b/system/tts_engines/xtts/xtts_settings_page.py @@ -30,9 +30,61 @@ # in your TTS engine's settings page. def xtts_voices_file_list(): - directory = main_dir / "voices" - files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")] - return files + """ + Adapted from `tts_class.voices_file_list` + + Scan and compile a list of available voice samples and latents. + + This function scans multiple directories to find voice samples in different formats: + 1. Individual WAV files in the main voices directory + 2. Collections of WAV files in the xtts_multi_voice_sets directory + 3. Pre-computed voice latents in the xtts_latents directory + + Returns: + list: Available voices with appropriate prefixes: + - Standard WAV: filename.wav + - Voice sets: "voiceset:foldername" + - Latents: "latent:filename.json" + + Note: Returns ["No Voices Found"] if no valid voices are detected. 
+ """ + try: + voices = [] # List to store all detected voices + directory = main_dir / "voices" # Base directory for voices + + json_latents_dir = directory / "xtts_latents" # Directory for JSON latents + multi_voice_dir = directory / "xtts_multi_voice_sets" # Directory for multi voice sets + + # Scan for individual WAV files + voices.extend( + [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")] + ) + + # Scan for voice sets + if os.path.exists(multi_voice_dir): + for voice_set in os.listdir(multi_voice_dir): + voice_set_path = multi_voice_dir / voice_set + if os.path.isdir(voice_set_path): + if any(f.endswith(".wav") for f in os.listdir(voice_set_path)): + voices.append(f"voiceset:{voice_set}") + + # Scan for JSON latents + if os.path.exists(json_latents_dir): + json_files = [f for f in os.listdir(json_latents_dir) if f.endswith(".json")] + for json_file in json_files: + voices.append(f"latent:{json_file}") + + # Sort voices by type alphabetically + voices.sort(key=lambda x: (x.startswith("voiceset:"), x.startswith("latent:"), x)) + + # Return the list of voices or a default message if none found + if not voices: + return ["No Voices Found"] + return voices + + except Exception as e: + print(f"Error scanning for voices: {str(e)}") + return ["No Voices Found"] ###################################################### # REQUIRED CHANGE # @@ -49,7 +101,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. -def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -58,13 +110,17 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -98,6 +154,7 @@ def 
xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo # settings page, allowing users to configure various options and voice selections. def xtts_model_alltalk_settings(model_config_data): + # This is a copy of voices_file_list features_list = model_config_data['model_capabilties'] voice_list = xtts_voices_file_list() with gr.Blocks(title="Xtts TTS", analytics_enabled=False) as app: @@ -105,6 +162,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -115,12 +173,17 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = 
gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -139,7 +202,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # @@ -289,6 +352,3 @@ def confirm_download(model_name): def xtts_at_gradio_settings_page(model_config_data): app = xtts_model_alltalk_settings(model_config_data) return app -def xtts_at_gradio_settings_page(model_config_data): - app = xtts_model_alltalk_settings(model_config_data) - return app diff --git a/test_server.py b/test_server.py index e3d2d199..49c0c827 100644 --- a/test_server.py +++ b/test_server.py @@ -729,7 +729,7 @@ async def _test_openai_tts(self, text: str): self.logger.info("Testing OpenAI compatible endpoint") self.logger.info(f"Testing OpenAI generation with text: {text}") - test_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"] + test_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"] test_formats = ["wav", "mp3", "opus", "aac"] for voice in test_voices[:2]: # Test first two voices only diff --git a/tts_server.py b/tts_server.py index 07630ee6..263192c7 100644 --- a/tts_server.py +++ b/tts_server.py @@ -39,9 +39,26 @@ import numpy as np import soundfile as sf import librosa +from langdetect import detect, DetectorFactory +from langdetect.lang_detect_exception import LangDetectException from config import AlltalkConfig, AlltalkTTSEnginesConfig logging.disable(logging.WARNING) +DetectorFactory.seed = 0 # Ensure deterministic behavior + +# Mapping of detected languages to xtts-supported languages +LANG_FALLBACKS = { + "en": "en", "es": "es", "fr": "fr", "de": "de", "it": "it", + "pt": "pt", "pl": "pl", "tr": "tr", "ru": "ru", "nl": "nl", + "cs": "cs", "ar": "ar", "zh-cn": "zh", "zh-tw": "zh", "ja": "ja", + "hu": "hu", "ko": "ko", + + # Additional fallbacks for unsupported languages + "uk": "ru", # Ukrainian → Russian + "bg": "ru", # Bulgarian → Russian + "ca": "fr", +} + ######################################################################################## # START-UP # Silence RVC warning about torch.nn.utils.weight_norm even though not used # ######################################################################################## @@ -938,22 +955,37 @@ async def generate_audio(text, voice, language, 
temperature, repetition_penalty, print_message("each TTS Engine in the 'Engine Information' section of the Gradio interface.", "warning", "GEN") raise ValueError("Streaming not supported by current TTS engine") - response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming) + if language == "auto": + language = detect_language(text) + # Streaming mode if streaming: - async def stream_response(): + print_message("Streaming mode enabled", "debug", "TTS") + response = model_engine.generate_tts( + text, voice, language, temperature, repetition_penalty, speed, pitch, output_file=None, streaming=True + ) + + async def stream_audio(): try: async for chunk in response: yield chunk except Exception as e: print_message(f"Error during streaming audio generation: {str(e)}", "error", "GEN") raise - return stream_response() + + return stream_audio() + + # Non-streaming mode + print_message("Non-streaming mode enabled", "debug", "TTS") + response = model_engine.generate_tts( + text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming=False + ) + try: async for _ in response: pass except Exception as e: - print_message(f"Error during audio generation: {str(e)}", "error", "GEN") + print_message(f"Error during audio generation: {str(e)}", "error", "TTS") raise ########################### @@ -1046,7 +1078,7 @@ class OpenAIInput(BaseModel): @classmethod def validate_voice(cls, value): """Validate that the requested voice is supported by OpenAI TTS.""" - supported_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"] + supported_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"] if value not in supported_voices: raise ValueError(f"Voice must be one of {supported_voices}") return value @@ -1103,22 +1135,27 @@ async def openai_tts_generate(request: Request): # Extract and validate parameters input_text = json_data["input"] voice = json_data["voice"] - response_format = json_data.get("response_format", "wav").lower() speed = json_data.get("speed", 1.0) print_message(f"Input text: {input_text}", "debug_openai", "TTS") print_message(f"Voice: {voice}", "debug_openai", "TTS") print_message(f"Speed: {speed}", "debug_openai", "TTS") + # Load current model engine configuration + current_model_engine = tts_class() + # Process text and map voice cleaned_string = html.unescape(standard_filtering(input_text)) voice_mapping = { - "alloy": model_engine.openai_alloy, - "echo": model_engine.openai_echo, - "fable": model_engine.openai_fable, - "nova": model_engine.openai_nova, - "onyx": model_engine.openai_onyx, - "shimmer": model_engine.openai_shimmer + "alloy": current_model_engine.openai_alloy, + "ash": current_model_engine.openai_ash, + "coral": current_model_engine.openai_coral, + "echo": current_model_engine.openai_echo, + "fable": current_model_engine.openai_fable, + "nova": current_model_engine.openai_nova, + "onyx": current_model_engine.openai_onyx, + "sage": current_model_engine.openai_sage, + "shimmer": current_model_engine.openai_shimmer } mapped_voice = voice_mapping.get(voice) @@ -1128,37 +1165,48 @@ async def openai_tts_generate(request: Request): print_message(f"Mapped voice: {mapped_voice}", "debug_openai", "TTS") - # Generate audio - unique_id = uuid.uuid4() - timestamp = int(time.time()) - output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{model_engine.audio_format}"}' - - if 
config.debugging.debug_fullttstext: - print_message(cleaned_string, component="TTS") + if current_model_engine.streaming_enabled: + audio_stream = await generate_audio( + cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set, + float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set, + output_file=None, streaming=True + ) + return StreamingResponse(audio_stream, media_type="audio/wav") else: - print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS") + # Generate audio + unique_id = uuid.uuid4() + timestamp = int(time.time()) + output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{current_model_engine.audio_format}"}' + response_format = json_data.get("response_format", "wav").lower() - await generate_audio(cleaned_string, mapped_voice, "en", model_engine.temperature_set, - model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, - output_file_path, streaming=False) + if config.debugging.debug_fullttstext: + print_message(cleaned_string, component="TTS") + else: + print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS") + + await generate_audio( + cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set, + float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set, + output_file_path, streaming=False + ) - print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") + print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") - # Handle RVC processing - if config.rvc_settings.rvc_enabled: - if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]: - print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS") - else: - print_message("send to rvc", "debug_openai", "TTS") - pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file - pitch = config.rvc_settings.pitch - run_rvc(output_file_path, pth_path, pitch, infer_pipeline) + # Handle RVC processing + if config.rvc_settings.rvc_enabled: + if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]: + print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS") + else: + print_message("send to rvc", "debug_openai", "TTS") + pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file + pitch = config.rvc_settings.pitch + run_rvc(output_file_path, pth_path, pitch, infer_pipeline) - transcoded_file_path = await transcode_for_openai(output_file_path, response_format) - print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS") + transcoded_file_path = await transcode_for_openai(output_file_path, response_format) + print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS") - response = FileResponse(transcoded_file_path, media_type=f"audio/{response_format}", - filename=f"output.{response_format}") + return FileResponse(transcoded_file_path, media_type=f"audio/{response_format}", + filename=f"output.{response_format}") except ValueError as e: print_message(f"Value error occurred: {str(e)}", "error", "TTS") @@ -1231,10 +1279,13 @@ async def transcode_for_openai(input_file, output_format): class VoiceMappings(BaseModel): """OpenAI to engine voice mapping configuration.""" alloy: str + ash: str + coral: str echo: str fable: str nova: str onyx: str + sage: str 
shimmer: str @app.put("/api/openai-voicemap") @@ -1246,10 +1297,13 @@ async def update_openai_voice_mappings(mappings: VoiceMappings): # Update in-memory mappings print_message("Updating in-memory voice mappings", "debug_openai", "TTS") model_engine.openai_alloy = mappings.alloy + model_engine.openai_ash = mappings.ash + model_engine.openai_coral = mappings.coral model_engine.openai_echo = mappings.echo model_engine.openai_fable = mappings.fable model_engine.openai_nova = mappings.nova model_engine.openai_onyx = mappings.onyx + model_engine.openai_sage = mappings.sage model_engine.openai_shimmer = mappings.shimmer # Update settings file @@ -1605,7 +1659,7 @@ class JSONInput(BaseModel): rvcnarrator_voice_gen: str = Field(..., description="rvcnarrator_voice_gen needs to be the name of a valid pth file in the 'folder\\file.pth' format or the word 'Disabled'.") rvcnarrator_pitch: float = Field(..., description="RVC Narrator pitch needs to be a number between -24 and 24") text_not_inside: str = Field(..., pattern="^(character|narrator|silent)$", description="text_not_inside needs to be 'character', 'narrator' or 'silent'.") - language: str = Field(..., pattern="^(ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") + language: str = Field(..., pattern="^(auto|ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: auto, ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") output_file_name: str = Field(..., pattern="^[a-zA-Z0-9_]+$", description="output_file_name needs to be the name without any special characters or file extension, e.g., 'filename'.") output_file_timestamp: bool = Field(..., description="output_file_timestamp needs to be true or false.") autoplay: bool = Field(..., description="autoplay needs to be a true or false value.") @@ -2098,6 +2152,30 @@ async def tts_finalize_output(audio_files: List[Path], params: dict) -> Tuple[Pa return output_file_path, output_file_url, output_cache_url +def detect_language(text: str) -> str: + """ + Detect the language of the given text and apply a fallback for unsupported languages. + + :param text: Text to analyze. + :return: A supported language code (e.g., 'en', 'fr'). + """ + try: + # Detect the language of the text + detected_lang = detect(text) + print_message(f"Detected language: {detected_lang}", "debug", "LANG_DETECTION") + + # Use the fallback language if the detected one is unsupported + fallback_lang = LANG_FALLBACKS.get(detected_lang, "en") # Default fallback: French + if detected_lang != fallback_lang: + print_message(f"Language '{detected_lang}' not supported, using fallback '{fallback_lang}'", "warn", + "LANG_FALLBACK") + + return fallback_lang + except LangDetectException as e: + # Handle errors in language detection + print_message(f"Language detection error: {str(e)}", "error", "LANG_DETECTION") + raise ValueError("Could not detect language") + @app.post("/api/tts-generate", response_class=JSONResponse) async def apifunction_generate_tts_standard( text_input: str = Form(...),