From e72a7bf5c9c331f69e852d50e39edc471192c105 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Fri, 3 Jan 2025 17:50:52 +0100 Subject: [PATCH 1/9] Add initial support for pickletensor models to F5-TTS * Tested with @RASPAUDIO French model available here: https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced --- system/tts_engines/f5tts/model_engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py index edb57b5e..a6f62e6e 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -400,9 +400,16 @@ def scan_models_folder(self): if model_dir.is_dir(): # First try to find model_*.safetensors files model_files = list(model_dir.glob("model_*.safetensors")) + if not model_files: + # Try finding the pt model file as fallback + # If no model_*.safetensors found, try finding a .pt model file + model_files = list(model_dir.glob("model_*.pt")) if not model_files: # If no model_*.safetensors found, try any .safetensors file model_files = list(model_dir.glob("*.safetensors")) + if not model_files: + # If no model_*.safetensors found, try any .pt file + model_files = list(model_dir.glob("*.pt")) vocab_file = model_dir / "vocab.txt" vocos_dir = model_dir / "vocos" @@ -508,9 +515,15 @@ async def api_manual_load_model(self, model_name): # Dynamically find the safetensors model file model_files = list(model_dir.glob("model_*.safetensors")) + if not model_files: + # Try finding the pt model file as fallback + model_files = list(model_dir.glob("model_*.pt")) if not model_files: # Try finding any safetensors file as fallback model_files = list(model_dir.glob("*.safetensors")) + if not model_files: + # Try finding any pt file as fallback + model_files = list(model_dir.glob("*.pt")) if not model_files: print(f"[{self.branding}ENG] \033[91mError\033[0m: No model's safetensors file was found in the F5-TTS models directory.") From 
ecb15200dc3a4c6408089a72f2467cb839031db8 Mon Sep 17 00:00:00 2001 From: Ilyas Date: Sat, 4 Jan 2025 00:38:52 +0100 Subject: [PATCH 2/9] Add language auto-detection * adds langdetect as a requirement for colab, standalone and textgen * adds "auto" to the language dropdown in the Advanced Engine/Model Settings panel * replaces the hardcoded "en" with "auto" when called by the OpenAI-compatible Speech API --- script.py | 1 + system/requirements/requirements_colab.txt | 1 + .../requirements/requirements_standalone.txt | 1 + system/requirements/requirements_textgen.txt | 1 + tts_server.py | 26 +++++++++++++++++-- 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/script.py b/script.py index 2c0a93ca..cd3605f5 100644 --- a/script.py +++ b/script.py @@ -3309,6 +3309,7 @@ def on_load(request: gr.Request): gen_lang = gr.Dropdown( value=config.api_def.api_language, choices=[ + "auto", "ar", "zh", "cs", diff --git a/system/requirements/requirements_colab.txt b/system/requirements/requirements_colab.txt index 5d978fc9..a3344af7 100644 --- a/system/requirements/requirements_colab.txt +++ b/system/requirements/requirements_colab.txt @@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux" plotly==5.24.1 scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git a/system/requirements/requirements_standalone.txt b/system/requirements/requirements_standalone.txt index dfc3c0e0..3c47c36c 100644 --- a/system/requirements/requirements_standalone.txt +++ b/system/requirements/requirements_standalone.txt @@ -36,3 +36,4 @@ fastapi==0.112.2 plotly==5.24.1 scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git a/system/requirements/requirements_textgen.txt b/system/requirements/requirements_textgen.txt index 2007867e..f89b30e6 100644 --- a/system/requirements/requirements_textgen.txt +++ b/system/requirements/requirements_textgen.txt @@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin" plotly==5.24.1 scipy==1.14.1 pyOpenSSL>=24.2.1 +langdetect>=1.0.9 diff --git 
a/tts_server.py b/tts_server.py index 07630ee6..aabdb6a7 100644 --- a/tts_server.py +++ b/tts_server.py @@ -39,9 +39,13 @@ import numpy as np import soundfile as sf import librosa +from langdetect import detect, DetectorFactory +from langdetect.lang_detect_exception import LangDetectException from config import AlltalkConfig, AlltalkTTSEnginesConfig logging.disable(logging.WARNING) +DetectorFactory.seed = 0 # Ensure deterministic behavior + ######################################################################################## # START-UP # Silence RVC warning about torch.nn.utils.weight_norm even though not used # ######################################################################################## @@ -938,6 +942,9 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty, print_message("each TTS Engine in the 'Engine Information' section of the Gradio interface.", "warning", "GEN") raise ValueError("Streaming not supported by current TTS engine") + if language == "auto": + language = detect_language(text) + response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming) if streaming: @@ -1138,7 +1145,7 @@ async def openai_tts_generate(request: Request): else: print_message(f"{cleaned_string[:90]}{'...' 
if len(cleaned_string) > 90 else ''}", component="TTS") - await generate_audio(cleaned_string, mapped_voice, "en", model_engine.temperature_set, + await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set, model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, output_file_path, streaming=False) @@ -1605,7 +1612,7 @@ class JSONInput(BaseModel): rvcnarrator_voice_gen: str = Field(..., description="rvcnarrator_voice_gen needs to be the name of a valid pth file in the 'folder\\file.pth' format or the word 'Disabled'.") rvcnarrator_pitch: float = Field(..., description="RVC Narrator pitch needs to be a number between -24 and 24") text_not_inside: str = Field(..., pattern="^(character|narrator|silent)$", description="text_not_inside needs to be 'character', 'narrator' or 'silent'.") - language: str = Field(..., pattern="^(ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") + language: str = Field(..., pattern="^(auto|ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: auto, ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.") output_file_name: str = Field(..., pattern="^[a-zA-Z0-9_]+$", description="output_file_name needs to be the name without any special characters or file extension, e.g., 'filename'.") output_file_timestamp: bool = Field(..., description="output_file_timestamp needs to be true or false.") autoplay: bool = Field(..., description="autoplay needs to be a true or false value.") @@ -2098,6 +2105,21 @@ async def tts_finalize_output(audio_files: List[Path], params: dict) -> Tuple[Pa return output_file_path, output_file_url, output_cache_url +def detect_language(text: str) -> str: + """ + Detect the language of the given text. + + :param text: Text to analyze. 
+ :return: Detected language code (e.g., 'en', 'fr'). + """ + try: + detected_lang = detect(text) + print_message(f"Detected language: {detected_lang}", "debug", "LANG_DETECTION") + return detected_lang + except LangDetectException as e: + print_message(f"Language detection error: {str(e)}", "error", "LANG_DETECTION") + raise ValueError("Could not detect language") + @app.post("/api/tts-generate", response_class=JSONResponse) async def apifunction_generate_tts_standard( text_input: str = Form(...), From 72ae93d6cb1adc5a4979571f141d95d056c08ae4 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:01:42 +0100 Subject: [PATCH 3/9] Add streaming flag on tts settings --- system/tts_engines/f5tts/f5tts_settings_page.py | 6 ++++-- system/tts_engines/f5tts/help_content.py | 5 +++++ system/tts_engines/f5tts/model_settings.json | 1 + system/tts_engines/parler/help_content.py | 5 +++++ system/tts_engines/parler/model_settings.json | 1 + system/tts_engines/parler/parler_settings_page.py | 6 ++++-- system/tts_engines/piper/help_content.py | 5 +++++ system/tts_engines/piper/model_settings.json | 1 + system/tts_engines/piper/piper_settings_page.py | 6 ++++-- system/tts_engines/template-tts-engine/help_content.py | 5 +++++ .../template-tts-engine/model_settings.json | 1 + .../template-tts-engine/modelname_settings_page.py | 6 ++++-- system/tts_engines/vits/help_content.py | 5 +++++ system/tts_engines/vits/model_settings.json | 1 + system/tts_engines/vits/vits_settings_page.py | 6 ++++-- system/tts_engines/xtts/help_content.py | 10 ++++++++++ system/tts_engines/xtts/model_engine.py | 1 + system/tts_engines/xtts/model_settings.json | 1 + system/tts_engines/xtts/xtts_settings_page.py | 6 ++++-- 19 files changed, 66 insertions(+), 12 deletions(-) diff --git a/system/tts_engines/f5tts/f5tts_settings_page.py b/system/tts_engines/f5tts/f5tts_settings_page.py index f58802d3..5b6e03d9 100644 --- a/system/tts_engines/f5tts/f5tts_settings_page.py +++ 
b/system/tts_engines/f5tts/f5tts_settings_page.py @@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. -def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -155,6 +155,7 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -192,6 +193,7 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if 
model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -226,7 +228,7 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, 
elem_classes="custom-markdown") - submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/f5tts/help_content.py b/system/tts_engines/f5tts/help_content.py index d339c250..9163a5e8 100644 --- a/system/tts_engines/f5tts/help_content.py +++ b/system/tts_engines/f5tts/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/f5tts/model_settings.json b/system/tts_engines/f5tts/model_settings.json index 034d0a47..80865eca 100644 --- a/system/tts_engines/f5tts/model_settings.json +++ b/system/tts_engines/f5tts/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "female_01.wav", "deepspeed_enabled": false, + "streaming_enabled": false, 
"engine_installed": true, "generationspeed_set": 0.9, "lowvram_enabled": true, diff --git a/system/tts_engines/parler/help_content.py b/system/tts_engines/parler/help_content.py index f111302c..592a0ddc 100644 --- a/system/tts_engines/parler/help_content.py +++ b/system/tts_engines/parler/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/parler/model_settings.json b/system/tts_engines/parler/model_settings.json index 580f5cb0..5d190661 100644 --- a/system/tts_engines/parler/model_settings.json +++ b/system/tts_engines/parler/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "enthusiastic_female", "def_narrator_voice": "enthusiastic_female", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/parler/parler_settings_page.py b/system/tts_engines/parler/parler_settings_page.py index e2b1fc85..ef354a66 100644 --- a/system/tts_engines/parler/parler_settings_page.py +++ b/system/tts_engines/parler/parler_settings_page.py @@ -52,7 +52,7 @@ def parler_voices_file_list(): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
-def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -67,6 +67,7 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -104,6 +105,7 @@ def parler_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if 
model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -132,7 +134,7 @@ def parler_model_alltalk_settings(model_config_data): with gr.Row(): submit_button = gr.Button("Update Settings") output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False) - submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + 
submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown") diff --git a/system/tts_engines/piper/help_content.py b/system/tts_engines/piper/help_content.py index 4f32cb79..417fc679 100644 --- a/system/tts_engines/piper/help_content.py +++ b/system/tts_engines/piper/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/piper/model_settings.json b/system/tts_engines/piper/model_settings.json index d9345456..182d8923 100644 --- a/system/tts_engines/piper/model_settings.json +++ b/system/tts_engines/piper/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "en_US-ljspeech-high.onnx", "def_narrator_voice": "en_US-ljspeech-high.onnx", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/piper/piper_settings_page.py b/system/tts_engines/piper/piper_settings_page.py index fa8708dd..c20df0fe 100644 --- a/system/tts_engines/piper/piper_settings_page.py +++ b/system/tts_engines/piper/piper_settings_page.py @@ -197,7 
+197,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. -def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -212,6 +212,7 @@ def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -249,6 +250,7 @@ def piper_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", 
interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -283,7 +285,7 @@ def piper_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(piper_model_update_settings, 
inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/template-tts-engine/help_content.py b/system/tts_engines/template-tts-engine/help_content.py index 277f9746..74bbf266 100644 --- a/system/tts_engines/template-tts-engine/help_content.py +++ b/system/tts_engines/template-tts-engine/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/template-tts-engine/model_settings.json b/system/tts_engines/template-tts-engine/model_settings.json index a7707549..edb78c3f 100644 --- a/system/tts_engines/template-tts-engine/model_settings.json +++ b/system/tts_engines/template-tts-engine/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "male_01.wav", "deepspeed_enabled": true, + 
"streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/template-tts-engine/modelname_settings_page.py b/system/tts_engines/template-tts-engine/modelname_settings_page.py index 8fb2b4f7..dc0a3370 100644 --- a/system/tts_engines/template-tts-engine/modelname_settings_page.py +++ b/system/tts_engines/template-tts-engine/modelname_settings_page.py @@ -48,7 +48,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. -def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -64,6 +64,7 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -104,6 +105,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = 
gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -138,7 +140,7 @@ def 
xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/vits/help_content.py b/system/tts_engines/vits/help_content.py index 21d48c65..1bcb19f1 100644 --- a/system/tts_engines/vits/help_content.py +++ b/system/tts_engines/vits/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation diff --git a/system/tts_engines/vits/model_settings.json b/system/tts_engines/vits/model_settings.json index f50609e2..3dffcf78 100644 --- a/system/tts_engines/vits/model_settings.json +++ 
b/system/tts_engines/vits/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "p225", "def_narrator_voice": "p226", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": true, diff --git a/system/tts_engines/vits/vits_settings_page.py b/system/tts_engines/vits/vits_settings_page.py index 32326980..bd76f998 100644 --- a/system/tts_engines/vits/vits_settings_page.py +++ b/system/tts_engines/vits/vits_settings_page.py @@ -231,7 +231,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. -def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -246,6 +246,7 @@ def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr 
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -283,6 +284,7 @@ def vits_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), 
minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -317,7 +319,7 @@ def vits_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/xtts/help_content.py b/system/tts_engines/xtts/help_content.py index cc03f8be..f6cc68ee 100644 --- a/system/tts_engines/xtts/help_content.py +++ b/system/tts_engines/xtts/help_content.py @@ -237,6 +237,11 @@ class AllTalkHelpContent: - Accelerates TTS generation using optimized inference - Only available for engines and models that support DeepSpeed - Requires NVIDIA GPU with CUDA support + + - **Stream Response Capability** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Temperature Control** - Adjusts the variability in speech generation @@ 
-310,6 +315,11 @@ class AllTalkHelpContent: - Requires NVIDIA GPU with CUDA support - 2-3x speed improvement in generation - Recommended when available + + - **Streaming Support** + - Enables real-time streaming of generated speech output + - Reduces latency for faster feedback during synthesis + - Only available for engines and models that support Streaming - **Multi-Language Support** - Clone voices across multiple languages diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index c4504888..10ad2cc7 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -350,6 +350,7 @@ def __init__(self): self.def_character_voice = model_settings_file["settings"]["def_character_voice"] self.def_narrator_voice = model_settings_file["settings"]["def_narrator_voice"] self.deepspeed_enabled = model_settings_file["settings"]["deepspeed_enabled"] + self.streaming_enabled = model_settings_file["settings"]["streaming_enabled"] self.engine_installed = model_settings_file["settings"]["engine_installed"] self.generationspeed_set = model_settings_file["settings"]["generationspeed_set"] self.lowvram_enabled = model_settings_file["settings"]["lowvram_enabled"] diff --git a/system/tts_engines/xtts/model_settings.json b/system/tts_engines/xtts/model_settings.json index b680cfa2..a56af383 100644 --- a/system/tts_engines/xtts/model_settings.json +++ b/system/tts_engines/xtts/model_settings.json @@ -24,6 +24,7 @@ "def_character_voice": "female_01.wav", "def_narrator_voice": "male_01.wav", "deepspeed_enabled": false, + "streaming_enabled": false, "engine_installed": true, "generationspeed_set": 1, "lowvram_enabled": false, diff --git a/system/tts_engines/xtts/xtts_settings_page.py b/system/tts_engines/xtts/xtts_settings_page.py index cb3f7e70..ae2ef328 100644 --- a/system/tts_engines/xtts/xtts_settings_page.py +++ b/system/tts_engines/xtts/xtts_settings_page.py @@ -49,7 +49,7 @@ def xtts_voices_file_list(): # 
# You do not need to modify the function's logic or any other part of the code. -def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -65,6 +65,7 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" + model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled" model_config_data["settings"]["temperature_set"] = temperature_set_gr model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr model_config_data["settings"]["pitch_set"] = pitch_set_gr @@ -105,6 +106,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"]) deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if 
model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"]) + streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"]) temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"]) repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"]) pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"]) @@ -139,7 +141,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, 
alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # From f7c7800dce07defa4d5267ed9f908fa3e2b883d3 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:03:08 +0100 Subject: [PATCH 4/9] Use streaming flag within OpenAI Speech API --- tts_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tts_server.py b/tts_server.py index aabdb6a7..11e63855 100644 --- a/tts_server.py +++ b/tts_server.py @@ -1147,7 +1147,7 @@ async def openai_tts_generate(request: Request): await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set, model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, - output_file_path, streaming=False) + output_file_path, model_engine.streaming_enabled) print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") From 7b79c301a5873e396b060584085c3b9335890cad Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 14:23:45 +0100 Subject: [PATCH 5/9] Add streaming status to logs --- system/tts_engines/f5tts/model_engine.py | 3 ++- system/tts_engines/parler/model_engine.py | 3 ++- system/tts_engines/piper/model_engine.py | 3 ++- system/tts_engines/template-tts-engine/model_engine.py | 3 ++- system/tts_engines/vits/model_engine.py | 2 +- system/tts_engines/xtts/model_engine.py | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/system/tts_engines/f5tts/model_engine.py 
b/system/tts_engines/f5tts/model_engine.py index a6f62e6e..c08fe001 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -144,6 +144,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -1095,7 +1096,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if streaming: with open(output_file, 'rb') as f: diff --git a/system/tts_engines/parler/model_engine.py b/system/tts_engines/parler/model_engine.py index 83387745..27c7d215 100644 --- a/system/tts_engines/parler/model_engine.py +++ b/system/tts_engines/parler/model_engine.py @@ -91,6 +91,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -503,7 +504,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() # Record the end time to generate TTS generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False: await self.handle_lowvram_change() self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again. diff --git a/system/tts_engines/piper/model_engine.py b/system/tts_engines/piper/model_engine.py index 438e7bed..5d86ff13 100644 --- a/system/tts_engines/piper/model_engine.py +++ b/system/tts_engines/piper/model_engine.py @@ -90,6 +90,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. 
self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -468,5 +469,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") self.tts_generating_lock = False diff --git a/system/tts_engines/template-tts-engine/model_engine.py b/system/tts_engines/template-tts-engine/model_engine.py index 5f78bb63..48f67c26 100644 --- a/system/tts_engines/template-tts-engine/model_engine.py +++ b/system/tts_engines/template-tts-engine/model_engine.py @@ -92,6 +92,7 @@ def __init__(self): self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice specified. self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice specified. 
self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If its available, is DeepSpeed enabled for the TTS engine + self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If its available, is Streaming enabled for the TTS engine self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been setup/installed (not curently used) self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation. self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If its available, is LowVRAM enabled for the TTS engine @@ -417,5 +418,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") self.tts_generating_lock = False diff --git a/system/tts_engines/vits/model_engine.py b/system/tts_engines/vits/model_engine.py index 9e514ba5..adfb48eb 100644 --- a/system/tts_engines/vits/model_engine.py +++ b/system/tts_engines/vits/model_engine.py @@ -638,7 +638,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ generate_end_time = time.time() # Record the end time to generate TTS generate_elapsed_time = generate_end_time - generate_start_time - print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m") + print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m") if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False: await self.handle_lowvram_change() self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again. diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index 10ad2cc7..94aceb96 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -1155,7 +1155,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena # Standard output message (not debug) self.print_message( - f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m", + f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. 
\033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m", message_type="standard" ) From 0c925196221febf2500444977355d03a40aaa084 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 18:24:35 +0100 Subject: [PATCH 6/9] Fix XTTS streaming mode --- system/tts_engines/xtts/model_engine.py | 149 ++++++++++++------------ tts_server.py | 99 ++++++++++------ 2 files changed, 134 insertions(+), 114 deletions(-) diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index 94aceb96..ad5a69e1 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -24,7 +24,6 @@ Note: You can add new functions, just DONT remove the functions that are already there, even if they are doing nothing as `tts_server.py` will still look for their existance and fail if they are missing. """ - ######################################## # Default imports # Do not change this # ######################################## @@ -968,7 +967,44 @@ async def handle_tts_method_change(self, tts_method): self.print_message(f"\033[94mModel Loadtime: \033[93m{generate_elapsed_time:.2f}\033[94m seconds\033[0m") return True - async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming): + async def prepare_voice_inputs(self, voice): + """Prepares latents and embeddings based on the voice input.""" + gpt_cond_latent = None + speaker_embedding = None + + if voice.startswith('latent:'): + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._load_latents(voice) + + elif voice.startswith('voiceset:'): + voice_set = voice.replace("voiceset:", "") + voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set) + self.print_message(f"Processing voice set from: {voice_set_path}", 
message_type="debug_tts") + + wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav")) + if not wavs_files: + self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error") + raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}") + + if len(wavs_files) > 5: + wavs_files = random.sample(wavs_files, 5) + self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + else: + normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice)) + wavs_files = [normalized_path] + self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts") + + if self.current_model_loaded.startswith("xtts"): + gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) + + return gpt_cond_latent, speaker_embedding + + async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, + streaming): """ Generate speech from text using the XTTS model. 
@@ -1018,71 +1054,33 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena generate_start_time = time.time() try: - # Voice input processing - self.print_message(f"Processing voice input: {voice}", message_type="debug_tts") - gpt_cond_latent = None - speaker_embedding = None - - # Handle different voice types - if voice.startswith('latent:'): - if self.current_model_loaded.startswith("xtts"): - gpt_cond_latent, speaker_embedding = self._load_latents(voice) - - elif voice.startswith('voiceset:'): - voice_set = voice.replace("voiceset:", "") - voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set) - self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts") - - wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav")) - if not wavs_files: - self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error") - raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}") - - if len(wavs_files) > 5: - wavs_files = random.sample(wavs_files, 5) - self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts") - - if self.current_model_loaded.startswith("xtts"): - self.print_message("Generating conditioning latents from voice set", message_type="debug_tts") - gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) - - else: - normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice)) - wavs_files = [normalized_path] - self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts") - - if self.current_model_loaded.startswith("xtts"): - self.print_message("Generating conditioning latents from single sample", message_type="debug_tts") - gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files) - - # Generate speech + # Preparation of latents and embeddings + gpt_cond_latent, speaker_embedding 
= await self.prepare_voice_inputs(voice) + + common_args = { + "text": text, + "language": language, + "gpt_cond_latent": gpt_cond_latent, + "speaker_embedding": speaker_embedding, + "temperature": float(temperature), + "length_penalty": float(self.model.config.length_penalty), + "repetition_penalty": float(repetition_penalty), + "top_k": int(self.model.config.top_k), + "top_p": float(self.model.config.top_p), + "speed": float(speed), + "enable_text_splitting": True + } + + self.print_message("Generation settings:", message_type="debug_tts_variables") + self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables") + self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables") + self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables") + self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables") + + # Handle streaming vs non-streaming if self.current_model_loaded.startswith("xtts"): - self.print_message(f"Generating speech for text: {text}", message_type="debug_tts") - - common_args = { - "text": text, - "language": language, - "gpt_cond_latent": gpt_cond_latent, - "speaker_embedding": speaker_embedding, - "temperature": float(temperature), - "length_penalty": float(self.model.config.length_penalty), - "repetition_penalty": float(repetition_penalty), - "top_k": int(self.model.config.top_k), - "top_p": float(self.model.config.top_p), - "speed": float(speed), - "enable_text_splitting": True - } - - self.print_message("Generation settings:", message_type="debug_tts_variables") - self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables") - self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables") - self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables") - self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables") - - # Handle streaming vs non-streaming 
if streaming: self.print_message("Starting streaming generation", message_type="debug_tts") - self.print_message(f"Using streaming-based generation and files {wavs_files}") output = self.model.inference_stream(**common_args, stream_chunk_size=20) file_chunks = [] @@ -1102,7 +1100,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena self.tts_generating_lock = False break - self.print_message(f"Processing chunk {i+1}", message_type="debug_tts") + self.print_message(f"Processing chunk {i + 1}", message_type="debug_tts") file_chunks.append(chunk) if isinstance(chunk, list): chunk = torch.cat(chunk, dim=0) @@ -1119,9 +1117,9 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena elif self.current_model_loaded.startswith("apitts"): if streaming: - raise ValueError("Streaming is only supported in XTTSv2 local mode") + raise ValueError("Streaming is not supported in APITTS mode") # Common arguments for both error and normal cases - common_args = { + api_args = { "file_path": output_file, "language": language, "temperature": temperature, @@ -1129,23 +1127,20 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena "repetition_penalty": repetition_penalty, "top_k": self.model.config.top_k, "top_p": self.model.config.top_p, - "speed": speed - } - if voice.startswith('latent:'): + "speed": speed, + } + + if voice.startswith("latent:"): self.print_message("API TTS method does not support latent files - Please use an audio reference file", message_type="error") self.model.tts_to_file( text="The API TTS method only supports audio files not latents. 
Please select an audio reference file instead.", speaker="Ana Florence", - **common_args + **api_args, ) else: self.print_message("Using API-based generation", message_type="debug_tts") - self.model.tts_to_file( - text=text, - speaker_wav=wavs_files, - **common_args - ) - + self.model.tts_to_file(text=text, speaker_wav=[voice], **api_args) + self.print_message(f"API generation completed, saved to: {output_file}", message_type="debug_tts") finally: diff --git a/tts_server.py b/tts_server.py index 11e63855..fdcaa221 100644 --- a/tts_server.py +++ b/tts_server.py @@ -945,22 +945,34 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty, if language == "auto": language = detect_language(text) - response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming) - + # Streaming mode if streaming: - async def stream_response(): + print_message("Streaming mode enabled", "debug", "TTS") + response = model_engine.generate_tts( + text, voice, language, temperature, repetition_penalty, speed, pitch, output_file=None, streaming=True + ) + + async def stream_audio(): try: async for chunk in response: yield chunk except Exception as e: print_message(f"Error during streaming audio generation: {str(e)}", "error", "GEN") raise - return stream_response() + + return stream_audio() + + # Non-streaming mode + print_message("Non-streaming mode enabled", "debug", "TTS") + response = model_engine.generate_tts( + text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming=False + ) + try: async for _ in response: pass except Exception as e: - print_message(f"Error during audio generation: {str(e)}", "error", "GEN") + print_message(f"Error during audio generation: {str(e)}", "error", "TTS") raise ########################### @@ -1110,22 +1122,24 @@ async def openai_tts_generate(request: Request): # Extract and validate parameters input_text = json_data["input"] voice = 
json_data["voice"] - response_format = json_data.get("response_format", "wav").lower() speed = json_data.get("speed", 1.0) print_message(f"Input text: {input_text}", "debug_openai", "TTS") print_message(f"Voice: {voice}", "debug_openai", "TTS") print_message(f"Speed: {speed}", "debug_openai", "TTS") + # Load current model engine configuration + current_model_engine = tts_class() + # Process text and map voice cleaned_string = html.unescape(standard_filtering(input_text)) voice_mapping = { - "alloy": model_engine.openai_alloy, - "echo": model_engine.openai_echo, - "fable": model_engine.openai_fable, - "nova": model_engine.openai_nova, - "onyx": model_engine.openai_onyx, - "shimmer": model_engine.openai_shimmer + "alloy": current_model_engine.openai_alloy, + "echo": current_model_engine.openai_echo, + "fable": current_model_engine.openai_fable, + "nova": current_model_engine.openai_nova, + "onyx": current_model_engine.openai_onyx, + "shimmer": current_model_engine.openai_shimmer } mapped_voice = voice_mapping.get(voice) @@ -1135,37 +1149,48 @@ async def openai_tts_generate(request: Request): print_message(f"Mapped voice: {mapped_voice}", "debug_openai", "TTS") - # Generate audio - unique_id = uuid.uuid4() - timestamp = int(time.time()) - output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{model_engine.audio_format}"}' - - if config.debugging.debug_fullttstext: - print_message(cleaned_string, component="TTS") + if current_model_engine.streaming_enabled: + audio_stream = await generate_audio( + cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set, + float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set, + output_file=None, streaming=True + ) + return StreamingResponse(audio_stream, media_type="audio/wav") else: - print_message(f"{cleaned_string[:90]}{'...' 
if len(cleaned_string) > 90 else ''}", component="TTS") + # Generate audio + unique_id = uuid.uuid4() + timestamp = int(time.time()) + output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{current_model_engine.audio_format}"}' + response_format = json_data.get("response_format", "wav").lower() + + if config.debugging.debug_fullttstext: + print_message(cleaned_string, component="TTS") + else: + print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS") - await generate_audio(cleaned_string, mapped_voice, "auto", model_engine.temperature_set, - model_engine.repetitionpenalty_set, speed, model_engine.pitch_set, - output_file_path, model_engine.streaming_enabled) + await generate_audio( + cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set, + float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set, + output_file_path, streaming=False + ) - print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") + print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS") - # Handle RVC processing - if config.rvc_settings.rvc_enabled: - if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]: - print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS") - else: - print_message("send to rvc", "debug_openai", "TTS") - pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file - pitch = config.rvc_settings.pitch - run_rvc(output_file_path, pth_path, pitch, infer_pipeline) + # Handle RVC processing + if config.rvc_settings.rvc_enabled: + if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]: + print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS") + else: + print_message("send to rvc", "debug_openai", "TTS") + pth_path = this_dir / "models" / "rvc_voices" / 
config.rvc_settings.rvc_char_model_file + pitch = config.rvc_settings.pitch + run_rvc(output_file_path, pth_path, pitch, infer_pipeline) - transcoded_file_path = await transcode_for_openai(output_file_path, response_format) - print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS") + transcoded_file_path = await transcode_for_openai(output_file_path, response_format) + print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS") - response = FileResponse(transcoded_file_path, media_type=f"audio/{response_format}", - filename=f"output.{response_format}") + return FileResponse(transcoded_file_path, media_type=f"audio/{response_format}", + filename=f"output.{response_format}") except ValueError as e: print_message(f"Value error occurred: {str(e)}", "error", "TTS") From a495c0a2397bae95100b0aedf5039c6ccf2cc9e4 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 19:03:48 +0100 Subject: [PATCH 7/9] Add fallbacks languages --- tts_server.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tts_server.py b/tts_server.py index aabdb6a7..0a8e1d66 100644 --- a/tts_server.py +++ b/tts_server.py @@ -46,6 +46,19 @@ DetectorFactory.seed = 0 # Ensure deterministic behavior +# Mapping of detected languages to xtts-supported languages +LANG_FALLBACKS = { + "en": "en", "es": "es", "fr": "fr", "de": "de", "it": "it", + "pt": "pt", "pl": "pl", "tr": "tr", "ru": "ru", "nl": "nl", + "cs": "cs", "ar": "ar", "zh-cn": "zh", "zh-tw": "zh", "ja": "ja", + "hu": "hu", "ko": "ko", + + # Additional fallbacks for unsupported languages + "uk": "ru", # Ukrainian → Russian + "bg": "ru", # Bulgarian → Russian + "ca": "fr", +} + ######################################################################################## # START-UP # Silence RVC warning about torch.nn.utils.weight_norm even though not used # ######################################################################################## 
@@ -2107,16 +2120,25 @@ async def tts_finalize_output(audio_files: List[Path], params: dict) -> Tuple[Pa def detect_language(text: str) -> str: """ - Detect the language of the given text. + Detect the language of the given text and apply a fallback for unsupported languages. :param text: Text to analyze. - :return: Detected language code (e.g., 'en', 'fr'). + :return: A supported language code (e.g., 'en', 'fr'). """ try: + # Detect the language of the text detected_lang = detect(text) print_message(f"Detected language: {detected_lang}", "debug", "LANG_DETECTION") - return detected_lang + + # Use the fallback language if the detected one is unsupported + fallback_lang = LANG_FALLBACKS.get(detected_lang, "en") # Default fallback: English + if detected_lang != fallback_lang: + print_message(f"Language '{detected_lang}' not supported, using fallback '{fallback_lang}'", "warn", + "LANG_FALLBACK") + + return fallback_lang except LangDetectException as e: + # Handle errors in language detection print_message(f"Language detection error: {str(e)}", "error", "LANG_DETECTION") raise ValueError("Could not detect language") From 0ca824601b70d9094ec9f05bf9fb0d529b5eff19 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 21:08:24 +0100 Subject: [PATCH 8/9] Add Ash, Coral and Sage voices --- script.py | 1 + system/openaittstest.html | 3 +++ system/tts_engines/f5tts/f5tts_settings_page.py | 12 ++++++++++-- system/tts_engines/f5tts/help_content.py | 5 ++++- system/tts_engines/f5tts/model_engine.py | 3 +++ system/tts_engines/f5tts/model_settings.json | 5 ++++- system/tts_engines/parler/help_content.py | 5 ++++- system/tts_engines/parler/model_engine.py | 3 +++ system/tts_engines/parler/model_settings.json | 3 +++ system/tts_engines/parler/parler_settings_page.py | 12 ++++++++++-- system/tts_engines/piper/help_content.py | 5 ++++- system/tts_engines/piper/model_engine.py | 3 +++ system/tts_engines/piper/model_settings.json | 3 +++
system/tts_engines/piper/piper_settings_page.py | 12 ++++++++++-- .../template-tts-engine/help_content.py | 5 ++++- .../template-tts-engine/model_engine.py | 3 +++ .../template-tts-engine/model_settings.json | 3 +++ .../modelname_settings_page.py | 12 ++++++++++-- .../template-tts-engine/template_engine.py | 8 +++++++- system/tts_engines/vits/help_content.py | 5 ++++- system/tts_engines/vits/model_engine.py | 3 +++ system/tts_engines/vits/model_settings.json | 3 +++ system/tts_engines/vits/vits_settings_page.py | 12 ++++++++++-- system/tts_engines/xtts/help_content.py | 5 ++++- system/tts_engines/xtts/model_engine.py | 6 ++++++ system/tts_engines/xtts/model_settings.json | 3 +++ system/tts_engines/xtts/xtts_settings_page.py | 15 ++++++++++----- test_server.py | 2 +- tts_server.py | 11 ++++++++++- 29 files changed, 146 insertions(+), 25 deletions(-) diff --git a/script.py b/script.py index cd3605f5..c1fde1ab 100644 --- a/script.py +++ b/script.py @@ -2653,6 +2653,7 @@ def load_engine_configs(_state): if module: # Load the engine's config from its JSON file json_file_path = os.path.join(this_dir, "system", "tts_engines", engine_name, "model_settings.json") + print("SILY") try: with open(json_file_path, "r", encoding="utf-8") as config_file: globals()[f"{engine_name}_model_config_data"] = json.load(config_file) diff --git a/system/openaittstest.html b/system/openaittstest.html index 37c38a52..e2f407f6 100644 --- a/system/openaittstest.html +++ b/system/openaittstest.html @@ -100,10 +100,13 @@

OpenAI API/AllTalk TTS API Test

diff --git a/system/tts_engines/f5tts/f5tts_settings_page.py b/system/tts_engines/f5tts/f5tts_settings_page.py index 5b6e03d9..6e72e364 100644 --- a/system/tts_engines/f5tts/f5tts_settings_page.py +++ b/system/tts_engines/f5tts/f5tts_settings_page.py @@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. -def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -148,10 +148,13 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr 
model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -204,12 +207,17 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -228,7 +236,7 @@ def f5tts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, 
def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/f5tts/help_content.py b/system/tts_engines/f5tts/help_content.py index 9163a5e8..5e08102e 100644 --- a/system/tts_engines/f5tts/help_content.py +++ b/system/tts_engines/f5tts/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py index c08fe001..348ae78f 100644 --- a/system/tts_engines/f5tts/model_engine.py +++ b/system/tts_engines/f5tts/model_engine.py @@ -155,10 +155,13 @@ def __init__(self): self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp) # Gather the OpenAI API Voice Mappings 
self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice + self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice + self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice + self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice ################################################################### # DONT CHANGE # Load params and api_defaults from confignew.json # diff --git a/system/tts_engines/f5tts/model_settings.json b/system/tts_engines/f5tts/model_settings.json index 80865eca..f70ba63b 100644 --- a/system/tts_engines/f5tts/model_settings.json +++ b/system/tts_engines/f5tts/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "female_01.wav", + "ash": "female_01.wav", + "coral": "female_01.wav", "echo": "female_01.wav", "fable": "female_01.wav", "nova": "female_01.wav", "onyx": "female_01.wav", - "shimmer": "female_01.wavf" + "sage": "female_01.wav", + "shimmer": "female_01.wav" } } \ No newline at end of file diff --git a/system/tts_engines/parler/help_content.py b/system/tts_engines/parler/help_content.py index 592a0ddc..a210ed76 
100644 --- a/system/tts_engines/parler/help_content.py +++ b/system/tts_engines/parler/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/parler/model_engine.py b/system/tts_engines/parler/model_engine.py index 27c7d215..086df061 100644 --- a/system/tts_engines/parler/model_engine.py +++ b/system/tts_engines/parler/model_engine.py @@ -102,10 +102,13 @@ def __init__(self): self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp) # Gather the OpenAI API Voice Mappings self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice + self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice + self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice + self.openai_sage = 
tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice ################################################################### # DONT CHANGE # Load params and api_defaults from confignew.json # diff --git a/system/tts_engines/parler/model_settings.json b/system/tts_engines/parler/model_settings.json index 5d190661..d0de670a 100644 --- a/system/tts_engines/parler/model_settings.json +++ b/system/tts_engines/parler/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "enthusiastic_female", + "ash": "enthusiastic_female", + "coral": "enthusiastic_female", "echo": "enthusiastic_female", "fable": "enthusiastic_female", "nova": "enthusiastic_female", "onyx": "enthusiastic_female", + "sage": "enthusiastic_female", "shimmer": "enthusiastic_female" } } \ No newline at end of file diff --git a/system/tts_engines/parler/parler_settings_page.py b/system/tts_engines/parler/parler_settings_page.py index ef354a66..55cc077b 100644 --- a/system/tts_engines/parler/parler_settings_page.py +++ b/system/tts_engines/parler/parler_settings_page.py @@ -52,7 +52,7 @@ def parler_voices_file_list(): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
-def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -60,10 +60,13 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -116,12 +119,17 @@ def parler_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", 
choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -134,7 +142,7 @@ def parler_model_alltalk_settings(model_config_data): with gr.Row(): submit_button = gr.Button("Update Settings") output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False) - submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, 
shimmer_gr], outputs=output_message) with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown") diff --git a/system/tts_engines/piper/help_content.py b/system/tts_engines/piper/help_content.py index 417fc679..2f1d1db8 100644 --- a/system/tts_engines/piper/help_content.py +++ b/system/tts_engines/piper/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/piper/model_engine.py b/system/tts_engines/piper/model_engine.py index 5d86ff13..4d12ba8d 100644 --- a/system/tts_engines/piper/model_engine.py +++ b/system/tts_engines/piper/model_engine.py @@ -101,10 +101,13 @@ def __init__(self): self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp) # Gather the OpenAI API Voice Mappings self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice + self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice + self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS 
engine voice that will be mapped to Open AI Fable voice self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice + self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice ################################################################### # DONT CHANGE # Load params and api_defaults from confignew.json # diff --git a/system/tts_engines/piper/model_settings.json b/system/tts_engines/piper/model_settings.json index 182d8923..eaa8d740 100644 --- a/system/tts_engines/piper/model_settings.json +++ b/system/tts_engines/piper/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "en_US-ljspeech-high.onnx", + "ash": "en_US-ljspeech-high.onnx", + "coral": "en_US-ljspeech-high.onnx", "echo": "en_US-ljspeech-high.onnx", "fable": "en_US-ljspeech-high.onnx", "nova": "en_US-ljspeech-high.onnx", "onyx": "en_US-ljspeech-high.onnx", + "sage": "en_US-ljspeech-high.onnx", "shimmer": "en_US-ljspeech-high.onnx" } } diff --git a/system/tts_engines/piper/piper_settings_page.py b/system/tts_engines/piper/piper_settings_page.py index c20df0fe..f5e9e9b1 100644 --- a/system/tts_engines/piper/piper_settings_page.py +++ b/system/tts_engines/piper/piper_settings_page.py @@ -197,7 +197,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
-def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -205,10 +205,13 @@ def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -261,12 +264,17 @@ def piper_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", 
choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -285,7 +293,7 @@ def piper_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, 
onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/template-tts-engine/help_content.py b/system/tts_engines/template-tts-engine/help_content.py index 74bbf266..b1f6acad 100644 --- a/system/tts_engines/template-tts-engine/help_content.py +++ b/system/tts_engines/template-tts-engine/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/template-tts-engine/model_engine.py b/system/tts_engines/template-tts-engine/model_engine.py index 48f67c26..08b5b0e9 100644 --- a/system/tts_engines/template-tts-engine/model_engine.py +++ b/system/tts_engines/template-tts-engine/model_engine.py @@ -103,10 +103,13 @@ def __init__(self): self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp) # Gather the OpenAI API Voice Mappings self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice + self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice + self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS 
engine voice that will be mapped to Open AI Echo voice self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice + self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice ################################################################### # DONT CHANGE # Load params and api_defaults from confignew.json # diff --git a/system/tts_engines/template-tts-engine/model_settings.json b/system/tts_engines/template-tts-engine/model_settings.json index edb78c3f..ac7edcdf 100644 --- a/system/tts_engines/template-tts-engine/model_settings.json +++ b/system/tts_engines/template-tts-engine/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "female_01.wav", + "ash": "female_01.wav", + "coral": "female_01.wav", "echo": "female_01.wav", "fable": "female_01.wav", "nova": "female_01.wav", "onyx": "female_01.wav", + "sage": "female_01.wav", "shimmer": "female_01.wav" } } \ No newline at end of file diff --git a/system/tts_engines/template-tts-engine/modelname_settings_page.py b/system/tts_engines/template-tts-engine/modelname_settings_page.py index dc0a3370..d9586bd6 100644 --- a/system/tts_engines/template-tts-engine/modelname_settings_page.py +++ b/system/tts_engines/template-tts-engine/modelname_settings_page.py @@ -48,7 +48,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. 
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -57,10 +57,13 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -116,12 +119,17 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", 
choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -140,7 +148,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, 
onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/template-tts-engine/template_engine.py b/system/tts_engines/template-tts-engine/template_engine.py index f95c5a96..b57593e3 100644 --- a/system/tts_engines/template-tts-engine/template_engine.py +++ b/system/tts_engines/template-tts-engine/template_engine.py @@ -270,10 +270,13 @@ def __init__(self): OpenAI Voice Mappings: - self.openai_alloy: Alloy voice mapping + - self.openai_ash: Ash voice mapping + - self.openai_coral: Coral voice mapping - self.openai_echo: Echo voice mapping - self.openai_fable: Fable voice mapping - self.openai_nova: Nova voice mapping - self.openai_onyx: Onyx voice mapping + - self.openai_sage: Sage voice mapping - self.openai_shimmer: Shimmer voice mapping Integration Requirements: @@ -339,11 +342,14 @@ def __init__(self): # DO NOT MODIFY - OpenAI voice mappings from model_settings.json self.openai_alloy = model_settings_file["openai_voices"]["alloy"] + self.openai_ash = model_settings_file["openai_voices"]["ash"] + self.openai_coral = model_settings_file["openai_voices"]["coral"] self.openai_echo = model_settings_file["openai_voices"]["echo"] self.openai_fable = model_settings_file["openai_voices"]["fable"] self.openai_nova = model_settings_file["openai_voices"]["nova"] self.openai_onyx = model_settings_file["openai_voices"]["onyx"] - self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"] + self.openai_sage = model_settings_file["openai_voices"]["sage"] + self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"] """ Below is the name of the folder that will be created-used under `/models/{folder}` diff --git a/system/tts_engines/vits/help_content.py b/system/tts_engines/vits/help_content.py index 1bcb19f1..f0478e7b 100644 --- 
a/system/tts_engines/vits/help_content.py +++ b/system/tts_engines/vits/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/vits/model_engine.py b/system/tts_engines/vits/model_engine.py index adfb48eb..a50b6070 100644 --- a/system/tts_engines/vits/model_engine.py +++ b/system/tts_engines/vits/model_engine.py @@ -109,10 +109,13 @@ def __init__(self): self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currenly set pitch of the model (If it support temp) # Gather the OpenAI API Voice Mappings self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice + self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice + self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice + self.openai_sage = 
tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice ################################################################### # DONT CHANGE # Load params and api_defaults from confignew.json # diff --git a/system/tts_engines/vits/model_settings.json b/system/tts_engines/vits/model_settings.json index 3dffcf78..1e9c6d89 100644 --- a/system/tts_engines/vits/model_settings.json +++ b/system/tts_engines/vits/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "p225", + "ash": "p225", + "coral": "p225", "echo": "p225", "fable": "p225", "nova": "p225", "onyx": "p225", + "sage": "p225", "shimmer": "p225" } } \ No newline at end of file diff --git a/system/tts_engines/vits/vits_settings_page.py b/system/tts_engines/vits/vits_settings_page.py index bd76f998..d73a7c30 100644 --- a/system/tts_engines/vits/vits_settings_page.py +++ b/system/tts_engines/vits/vits_settings_page.py @@ -231,7 +231,7 @@ def download_language_pack(lang_code, progress=gr.Progress()): # dictionaries with the values provided as arguments, and save the updated settings back to the JSON file. # # You do not need to modify the function's logic or any other part of the code. 
-def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -239,10 +239,13 @@ def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -295,12 +298,17 @@ def vits_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", 
choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -319,7 +327,7 @@ def vits_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, 
onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # diff --git a/system/tts_engines/xtts/help_content.py b/system/tts_engines/xtts/help_content.py index f6cc68ee..03c35f66 100644 --- a/system/tts_engines/xtts/help_content.py +++ b/system/tts_engines/xtts/help_content.py @@ -272,12 +272,15 @@ class AllTalkHelpContent: ### OpenAI Voice Mappings - Only relevant when using the OpenAI-compatible API endpoint - - Maps OpenAI's six standard voices to equivalent voices in the current engine: + - Maps OpenAI's nine standard voices to equivalent voices in the current engine: - `alloy` + - `ash` + - `coral` - `echo` - `fable` - `nova` - `onyx` + - `sage` - `shimmer` - Essential for maintaining compatibility with OpenAI API calls - Each mapping can be customized to any available voice in the current engine diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py index ad5a69e1..5231f715 100644 --- a/system/tts_engines/xtts/model_engine.py +++ b/system/tts_engines/xtts/model_engine.py @@ -290,10 +290,13 @@ def __init__(self): OpenAI Voice Mappings: - self.openai_alloy: Alloy voice mapping + - self.openai_ash: Ash voice mapping + - self.openai_coral: Coral voice mapping - self.openai_echo: Echo voice mapping - self.openai_fable: Fable voice mapping - self.openai_nova: Nova voice mapping - self.openai_onyx: Onyx voice mapping + - self.openai_sage: Sage voice mapping - self.openai_shimmer: Shimmer voice mapping Integration Requirements: @@ -360,10 +363,13 @@ def __init__(self): # DO NOT MODIFY - OpenAI voice mappings from model_settings.json self.openai_alloy = model_settings_file["openai_voices"]["alloy"] + self.openai_ash = model_settings_file["openai_voices"]["ash"] + self.openai_coral = model_settings_file["openai_voices"]["coral"] self.openai_echo = 
model_settings_file["openai_voices"]["echo"] self.openai_fable = model_settings_file["openai_voices"]["fable"] self.openai_nova = model_settings_file["openai_voices"]["nova"] self.openai_onyx = model_settings_file["openai_voices"]["onyx"] + self.openai_sage = model_settings_file["openai_voices"]["sage"] self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"] """ diff --git a/system/tts_engines/xtts/model_settings.json b/system/tts_engines/xtts/model_settings.json index a56af383..ed5b298c 100644 --- a/system/tts_engines/xtts/model_settings.json +++ b/system/tts_engines/xtts/model_settings.json @@ -34,10 +34,13 @@ }, "openai_voices": { "alloy": "female_01.wav", + "ash": "female_01.wav", + "coral": "female_01.wav", "echo": "female_01.wav", "fable": "female_01.wav", "nova": "female_01.wav", "onyx": "female_01.wav", + "sage": "female_01.wav", "shimmer": "female_01.wav" } } \ No newline at end of file diff --git a/system/tts_engines/xtts/xtts_settings_page.py b/system/tts_engines/xtts/xtts_settings_page.py index ae2ef328..f26b1354 100644 --- a/system/tts_engines/xtts/xtts_settings_page.py +++ b/system/tts_engines/xtts/xtts_settings_page.py @@ -49,7 +49,7 @@ def xtts_voices_file_list(): # # You do not need to modify the function's logic or any other part of the code. 
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr): +def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr): # Load the model_config_data from the JSON file with open(os.path.join(this_dir, "model_settings.json"), "r") as f: model_config_data = json.load(f) @@ -58,10 +58,13 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo model_config_data["settings"]["def_character_voice"] = def_character_voice_gr model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr model_config_data["openai_voices"]["alloy"] = alloy_gr + model_config_data["openai_voices"]["ash"] = ash_gr + model_config_data["openai_voices"]["coral"] = coral_gr model_config_data["openai_voices"]["echo"] = echo_gr model_config_data["openai_voices"]["fable"] = fable_gr model_config_data["openai_voices"]["nova"] = nova_gr model_config_data["openai_voices"]["onyx"] = onyx_gr + model_config_data["openai_voices"]["sage"] = sage_gr model_config_data["openai_voices"]["shimmer"] = shimmer_gr model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled" model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled" @@ -117,12 +120,17 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Group(): with gr.Row(): alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True) + ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", 
choices=voice_list, allow_custom_value=True) + with gr.Row(): + coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True) echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True) with gr.Row(): fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True) nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True) with gr.Row(): onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True) + sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True) + with gr.Row(): shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True) with gr.Column(): gr.Markdown("### Default Voices") @@ -141,7 +149,7 @@ def xtts_model_alltalk_settings(model_config_data): with gr.Row(): gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown") gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown") - submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message) + submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, 
onyx_gr, sage_gr, shimmer_gr], outputs=output_message) ########################################################################################### # Do not change this section apart from "TTS Engine Name" value to match your engine name # @@ -291,6 +299,3 @@ def confirm_download(model_name): def xtts_at_gradio_settings_page(model_config_data): app = xtts_model_alltalk_settings(model_config_data) return app -def xtts_at_gradio_settings_page(model_config_data): - app = xtts_model_alltalk_settings(model_config_data) - return app diff --git a/test_server.py b/test_server.py index e3d2d199..49c0c827 100644 --- a/test_server.py +++ b/test_server.py @@ -729,7 +729,7 @@ async def _test_openai_tts(self, text: str): self.logger.info("Testing OpenAI compatible endpoint") self.logger.info(f"Testing OpenAI generation with text: {text}") - test_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"] + test_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"] test_formats = ["wav", "mp3", "opus", "aac"] for voice in test_voices[:2]: # Test first two voices only diff --git a/tts_server.py b/tts_server.py index 9e3c1084..263192c7 100644 --- a/tts_server.py +++ b/tts_server.py @@ -1078,7 +1078,7 @@ class OpenAIInput(BaseModel): @classmethod def validate_voice(cls, value): """Validate that the requested voice is supported by OpenAI TTS.""" - supported_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"] + supported_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"] if value not in supported_voices: raise ValueError(f"Voice must be one of {supported_voices}") return value @@ -1148,10 +1148,13 @@ async def openai_tts_generate(request: Request): cleaned_string = html.unescape(standard_filtering(input_text)) voice_mapping = { "alloy": current_model_engine.openai_alloy, + "ash": current_model_engine.openai_ash, + "coral": current_model_engine.openai_coral, "echo": current_model_engine.openai_echo, 
"fable": current_model_engine.openai_fable, "nova": current_model_engine.openai_nova, "onyx": current_model_engine.openai_onyx, + "sage": current_model_engine.openai_sage, "shimmer": current_model_engine.openai_shimmer } @@ -1276,10 +1279,13 @@ async def transcode_for_openai(input_file, output_format): class VoiceMappings(BaseModel): """OpenAI to engine voice mapping configuration.""" alloy: str + ash: str + coral: str echo: str fable: str nova: str onyx: str + sage: str shimmer: str @app.put("/api/openai-voicemap") @@ -1291,10 +1297,13 @@ async def update_openai_voice_mappings(mappings: VoiceMappings): # Update in-memory mappings print_message("Updating in-memory voice mappings", "debug_openai", "TTS") model_engine.openai_alloy = mappings.alloy + model_engine.openai_ash = mappings.ash + model_engine.openai_coral = mappings.coral model_engine.openai_echo = mappings.echo model_engine.openai_fable = mappings.fable model_engine.openai_nova = mappings.nova model_engine.openai_onyx = mappings.onyx + model_engine.openai_sage = mappings.sage model_engine.openai_shimmer = mappings.shimmer # Update settings file From 78d5c8238b3f781f3add81a0e4c35f58e8d583c0 Mon Sep 17 00:00:00 2001 From: Ilyas Hilali Date: Sat, 4 Jan 2025 22:42:43 +0100 Subject: [PATCH 9/9] Remove testing log --- script.py | 1 - 1 file changed, 1 deletion(-) diff --git a/script.py b/script.py index c1fde1ab..cd3605f5 100644 --- a/script.py +++ b/script.py @@ -2653,7 +2653,6 @@ def load_engine_configs(_state): if module: # Load the engine's config from its JSON file json_file_path = os.path.join(this_dir, "system", "tts_engines", engine_name, "model_settings.json") - print("SILY") try: with open(json_file_path, "r", encoding="utf-8") as config_file: globals()[f"{engine_name}_model_config_data"] = json.load(config_file)