diff --git a/script.py b/script.py
index 2c0a93ca..cd3605f5 100644
--- a/script.py
+++ b/script.py
@@ -3309,6 +3309,7 @@ def on_load(request: gr.Request):
gen_lang = gr.Dropdown(
value=config.api_def.api_language,
choices=[
+ "auto",
"ar",
"zh",
"cs",
diff --git a/system/openaittstest.html b/system/openaittstest.html
index 37c38a52..e2f407f6 100644
--- a/system/openaittstest.html
+++ b/system/openaittstest.html
@@ -100,10 +100,13 @@
OpenAI API/AllTalk TTS API Test
diff --git a/system/requirements/requirements_colab.txt b/system/requirements/requirements_colab.txt
index 5d978fc9..a3344af7 100644
--- a/system/requirements/requirements_colab.txt
+++ b/system/requirements/requirements_colab.txt
@@ -46,3 +46,4 @@ piper-tts; sys_platform == "linux"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
+langdetect>=1.0.9
diff --git a/system/requirements/requirements_standalone.txt b/system/requirements/requirements_standalone.txt
index dfc3c0e0..3c47c36c 100644
--- a/system/requirements/requirements_standalone.txt
+++ b/system/requirements/requirements_standalone.txt
@@ -36,3 +36,4 @@ fastapi==0.112.2
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
+langdetect>=1.0.9
diff --git a/system/requirements/requirements_textgen.txt b/system/requirements/requirements_textgen.txt
index 2007867e..f89b30e6 100644
--- a/system/requirements/requirements_textgen.txt
+++ b/system/requirements/requirements_textgen.txt
@@ -33,3 +33,4 @@ piper-phonemize==1.1.0; sys_platform == "darwin"
plotly==5.24.1
scipy==1.14.1
pyOpenSSL>=24.2.1
+langdetect>=1.0.9
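
The new `langdetect` pin above pairs with the `auto` entry added to the language dropdown in script.py. A minimal sketch of how `auto` might resolve to a concrete language code before generation — `resolve_language` and the `SUPPORTED` set are illustrative assumptions, not AllTalk's actual API:

```python
# Sketch: resolving the new "auto" dropdown choice with langdetect before
# generation. resolve_language() and SUPPORTED are illustrative only.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # langdetect is randomised unless seeded

SUPPORTED = {"ar", "zh", "cs", "en"}  # subset of the dropdown's choices

def resolve_language(requested: str, text: str, fallback: str = "en") -> str:
    """Return a concrete language code, detecting one when 'auto' is requested."""
    if requested != "auto":
        return requested
    try:
        code = detect(text).split("-")[0]  # normalise region tags like "zh-cn"
    except LangDetectException:            # empty or undecidable input
        return fallback
    return code if code in SUPPORTED else fallback

print(resolve_language("auto", "Dobrý den, jak se máte?"))  # likely "cs"
```
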
diff --git a/system/tts_engines/f5tts/f5tts_settings_page.py b/system/tts_engines/f5tts/f5tts_settings_page.py
index f58802d3..6e72e364 100644
--- a/system/tts_engines/f5tts/f5tts_settings_page.py
+++ b/system/tts_engines/f5tts/f5tts_settings_page.py
@@ -140,7 +140,7 @@ def transcribe_files(model_name: str, use_cpu: bool = False, progress=gr.Progres
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
-def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -148,13 +148,17 @@ def f5tts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -192,6 +196,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -202,12 +207,17 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -226,7 +236,7 @@ def f5tts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
- submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(f5tts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
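
All the settings pages in this patch persist the Radio values the same way: the component emits the string "Enabled" or "Disabled", and the update function stores the comparison result as a JSON boolean. A standalone sketch of that round trip, with a hypothetical settings path:

```python
# Sketch of the Enabled/Disabled round trip used by the settings pages:
# the Radio emits "Enabled" or "Disabled", and the update function stores
# the comparison result as a JSON boolean.
import json
from pathlib import Path

SETTINGS = Path("model_settings.json")  # hypothetical path for this sketch

def save_streaming(streaming_enabled_gr: str) -> None:
    data = json.loads(SETTINGS.read_text())
    # "Enabled" -> True, anything else -> False, mirroring the pages above
    data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
    SETTINGS.write_text(json.dumps(data, indent=4))

def load_streaming() -> str:
    data = json.loads(SETTINGS.read_text())
    # Convert the stored boolean back to the Radio's string value
    return "Enabled" if data["settings"]["streaming_enabled"] else "Disabled"
```
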
diff --git a/system/tts_engines/f5tts/help_content.py b/system/tts_engines/f5tts/help_content.py
index d339c250..5e08102e 100644
--- a/system/tts_engines/f5tts/help_content.py
+++ b/system/tts_engines/f5tts/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Lowers perceived latency by delivering audio while synthesis is still in progress
+ - Only available for engines and models that support streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
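
The nine mappings this help text lists are stored as `self.openai_<name>` attributes on each engine (see the model_engine.py changes below). A hedged sketch of resolving an incoming OpenAI voice name through them — `map_openai_voice` is an illustrative helper, not code from this patch:

```python
# Illustrative lookup from an OpenAI-compatible request's voice name to the
# engine voice configured under "openai_voices" in model_settings.json.
OPENAI_VOICES = ("alloy", "ash", "coral", "echo", "fable",
                 "nova", "onyx", "sage", "shimmer")

def map_openai_voice(engine, requested: str) -> str:
    """Return the engine voice mapped to an OpenAI voice name."""
    if requested not in OPENAI_VOICES:
        raise ValueError(f"Unknown OpenAI voice: {requested!r}")
    # The engines in this patch store each mapping as self.openai_<name>
    return getattr(engine, f"openai_{requested}")
```
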
diff --git a/system/tts_engines/f5tts/model_engine.py b/system/tts_engines/f5tts/model_engine.py
index edb57b5e..348ae78f 100644
--- a/system/tts_engines/f5tts/model_engine.py
+++ b/system/tts_engines/f5tts/model_engine.py
@@ -144,6 +144,7 @@ def __init__(self):
        self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice is specified.
        self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice is specified.
        self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If it's available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If it's available, is Streaming enabled for the TTS engine
        self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been set up/installed (not currently used)
        self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
        self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If it's available, is LowVRAM enabled for the TTS engine
@@ -154,10 +155,13 @@ def __init__(self):
        self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (if the model supports pitch)
# Gather the OpenAI API Voice Mappings
        self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to the OpenAI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to the OpenAI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to the OpenAI Coral voice
        self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to the OpenAI Echo voice
        self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to the OpenAI Fable voice
        self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to the OpenAI Nova voice
        self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to the OpenAI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to the OpenAI Sage voice
        self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to the OpenAI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -400,9 +404,16 @@ def scan_models_folder(self):
if model_dir.is_dir():
# First try to find model_*.safetensors files
model_files = list(model_dir.glob("model_*.safetensors"))
+ if not model_files:
                # If no model_*.safetensors file is found, try a model_*.pt file as fallback
+ model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# If no model_*.safetensors found, try any .safetensors file
model_files = list(model_dir.glob("*.safetensors"))
+ if not model_files:
+ # If no model_*.safetensors found, try any .pt file
+ model_files = list(model_dir.glob("*.pt"))
vocab_file = model_dir / "vocab.txt"
vocos_dir = model_dir / "vocos"
@@ -508,9 +519,15 @@ async def api_manual_load_model(self, model_name):
# Dynamically find the safetensors model file
model_files = list(model_dir.glob("model_*.safetensors"))
+ if not model_files:
+ # Try finding the pt model file as fallback
+ model_files = list(model_dir.glob("model_*.pt"))
if not model_files:
# Try finding any safetensors file as fallback
model_files = list(model_dir.glob("*.safetensors"))
+ if not model_files:
+ # Try finding any pt file as fallback
+ model_files = list(model_dir.glob("*.pt"))
if not model_files:
print(f"[{self.branding}ENG] \033[91mError\033[0m: No model's safetensors file was found in the F5-TTS models directory.")
@@ -1082,7 +1099,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
- print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+ print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
if streaming:
with open(output_file, 'rb') as f:
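
The model file search above now runs four globs in order — `model_*.safetensors`, `model_*.pt`, `*.safetensors`, `*.pt` — and the same chain is duplicated in `scan_models_folder` and `api_manual_load_model`. A sketch of that chain factored into one helper (the name is hypothetical); note that a named `model_*.pt` deliberately outranks a generic `*.safetensors` here:

```python
# Sketch of the four-step model file search added above, factored into one
# helper; find_model_file() is hypothetical, the glob order matches the diff.
from pathlib import Path
from typing import Optional

def find_model_file(model_dir: Path) -> Optional[Path]:
    """Return the first model file found, preferring named safetensors over .pt."""
    for pattern in ("model_*.safetensors", "model_*.pt",
                    "*.safetensors", "*.pt"):
        matches = sorted(model_dir.glob(pattern))  # sorted for determinism
        if matches:
            return matches[0]
    return None  # caller prints the "no model file found" error
```
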
diff --git a/system/tts_engines/f5tts/model_settings.json b/system/tts_engines/f5tts/model_settings.json
index 034d0a47..f70ba63b 100644
--- a/system/tts_engines/f5tts/model_settings.json
+++ b/system/tts_engines/f5tts/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "female_01.wav",
+ "ash": "female_01.wav",
+ "coral": "female_01.wav",
"echo": "female_01.wav",
"fable": "female_01.wav",
"nova": "female_01.wav",
"onyx": "female_01.wav",
- "shimmer": "female_01.wavf"
+ "sage": "female_01.wav",
+ "shimmer": "female_01.wav"
}
}
\ No newline at end of file
diff --git a/system/tts_engines/parler/help_content.py b/system/tts_engines/parler/help_content.py
index f111302c..a210ed76 100644
--- a/system/tts_engines/parler/help_content.py
+++ b/system/tts_engines/parler/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Lowers perceived latency by delivering audio while synthesis is still in progress
+ - Only available for engines and models that support streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/parler/model_engine.py b/system/tts_engines/parler/model_engine.py
index 83387745..086df061 100644
--- a/system/tts_engines/parler/model_engine.py
+++ b/system/tts_engines/parler/model_engine.py
@@ -91,6 +91,7 @@ def __init__(self):
        self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice is specified.
        self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice is specified.
        self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If it's available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If it's available, is Streaming enabled for the TTS engine
        self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been set up/installed (not currently used)
        self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
        self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If it's available, is LowVRAM enabled for the TTS engine
@@ -101,10 +102,13 @@ def __init__(self):
        self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (if the model supports pitch)
# Gather the OpenAI API Voice Mappings
        self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to the OpenAI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to the OpenAI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to the OpenAI Coral voice
        self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to the OpenAI Echo voice
        self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to the OpenAI Fable voice
        self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to the OpenAI Nova voice
        self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to the OpenAI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to the OpenAI Sage voice
        self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to the OpenAI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -503,7 +507,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
generate_end_time = time.time() # Record the end time to generate TTS
generate_elapsed_time = generate_end_time - generate_start_time
- print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+ print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False:
await self.handle_lowvram_change()
self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again.
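
Only the f5tts engine above shows a concrete streaming branch (reading the finished file back when `streaming` is set); parler's change is limited to the flag and the log line. A generic sketch of that file-chunking pattern, assuming a completed WAV on disk — the names and chunk size are illustrative:

```python
# Illustrative streaming gate: when streaming is enabled, yield the finished
# audio file in chunks instead of returning a single path/blob.
from pathlib import Path
from typing import Iterator

def stream_audio(output_file: Path, chunk_size: int = 8192) -> Iterator[bytes]:
    """Yield the generated WAV in chunks, suitable as a streaming response body."""
    with open(output_file, "rb") as f:
        while chunk := f.read(chunk_size):
            yield chunk

# Usage sketch: a FastAPI route could wrap this generator in a
# StreamingResponse(media_type="audio/wav") when streaming_enabled is True.
```
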
diff --git a/system/tts_engines/parler/model_settings.json b/system/tts_engines/parler/model_settings.json
index 580f5cb0..d0de670a 100644
--- a/system/tts_engines/parler/model_settings.json
+++ b/system/tts_engines/parler/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "enthusiastic_female",
"def_narrator_voice": "enthusiastic_female",
"deepspeed_enabled": false,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "enthusiastic_female",
+ "ash": "enthusiastic_female",
+ "coral": "enthusiastic_female",
"echo": "enthusiastic_female",
"fable": "enthusiastic_female",
"nova": "enthusiastic_female",
"onyx": "enthusiastic_female",
+ "sage": "enthusiastic_female",
"shimmer": "enthusiastic_female"
}
}
\ No newline at end of file
diff --git a/system/tts_engines/parler/parler_settings_page.py b/system/tts_engines/parler/parler_settings_page.py
index e2b1fc85..55cc077b 100644
--- a/system/tts_engines/parler/parler_settings_page.py
+++ b/system/tts_engines/parler/parler_settings_page.py
@@ -52,7 +52,7 @@ def parler_voices_file_list():
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
-def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -60,13 +60,17 @@ def parler_model_update_settings(def_character_voice_gr, def_narrator_voice_gr,
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -104,6 +108,7 @@ def parler_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -114,12 +119,17 @@ def parler_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -132,7 +142,7 @@ def parler_model_alltalk_settings(model_config_data):
with gr.Row():
submit_button = gr.Button("Update Settings")
output_message = gr.Textbox(label="Output Message", interactive=False, show_label=False)
- submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(parler_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
with gr.Accordion("HELP - 🔊 Understanding TTS Engine Default Settings Page", open=False):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS, elem_classes="custom-markdown")
diff --git a/system/tts_engines/piper/help_content.py b/system/tts_engines/piper/help_content.py
index 4f32cb79..2f1d1db8 100644
--- a/system/tts_engines/piper/help_content.py
+++ b/system/tts_engines/piper/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Lowers perceived latency by delivering audio while synthesis is still in progress
+ - Only available for engines and models that support streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/piper/model_engine.py b/system/tts_engines/piper/model_engine.py
index 438e7bed..4d12ba8d 100644
--- a/system/tts_engines/piper/model_engine.py
+++ b/system/tts_engines/piper/model_engine.py
@@ -90,6 +90,7 @@ def __init__(self):
        self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice is specified.
        self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice is specified.
        self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If it's available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If it's available, is Streaming enabled for the TTS engine
        self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been set up/installed (not currently used)
        self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
        self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If it's available, is LowVRAM enabled for the TTS engine
@@ -100,10 +101,13 @@ def __init__(self):
        self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (if the model supports pitch)
# Gather the OpenAI API Voice Mappings
        self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to the OpenAI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to the OpenAI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to the OpenAI Coral voice
        self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to the OpenAI Echo voice
        self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to the OpenAI Fable voice
        self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to the OpenAI Nova voice
        self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to the OpenAI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to the OpenAI Sage voice
        self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to the OpenAI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -468,5 +472,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
- print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+ print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
self.tts_generating_lock = False
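
The extended log line interleaves raw ANSI SGR escapes, which gets harder to read as fields are added. A sketch of the same line built from named constants — purely illustrative, the engines in this patch keep the inline form:

```python
# Sketch: the engines' status line built from named ANSI constants instead of
# inline escape codes; the field names and colours mirror the print call above.
BLUE, YELLOW, ORANGE, RESET = "\033[94m", "\033[93m", "\033[33m", "\033[0m"

def status_line(branding, elapsed, lowvram, deepspeed, streaming):
    fields = [("LowVRAM", lowvram), ("DeepSpeed", deepspeed),
              ("Streaming", streaming)]
    flags = " ".join(f"{BLUE}{name}: {ORANGE}{value}" for name, value in fields)
    return (f"[{branding}GEN] {BLUE}TTS Generate: {YELLOW}{elapsed:.2f} seconds. "
            f"{flags}{RESET}")

print(status_line("AllTalk ", 1.23, True, False, True))  # branding is illustrative
```
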
diff --git a/system/tts_engines/piper/model_settings.json b/system/tts_engines/piper/model_settings.json
index d9345456..eaa8d740 100644
--- a/system/tts_engines/piper/model_settings.json
+++ b/system/tts_engines/piper/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "en_US-ljspeech-high.onnx",
"def_narrator_voice": "en_US-ljspeech-high.onnx",
"deepspeed_enabled": false,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "en_US-ljspeech-high.onnx",
+ "ash": "en_US-ljspeech-high.onnx",
+ "coral": "en_US-ljspeech-high.onnx",
"echo": "en_US-ljspeech-high.onnx",
"fable": "en_US-ljspeech-high.onnx",
"nova": "en_US-ljspeech-high.onnx",
"onyx": "en_US-ljspeech-high.onnx",
+ "sage": "en_US-ljspeech-high.onnx",
"shimmer": "en_US-ljspeech-high.onnx"
}
}
diff --git a/system/tts_engines/piper/piper_settings_page.py b/system/tts_engines/piper/piper_settings_page.py
index fa8708dd..f5e9e9b1 100644
--- a/system/tts_engines/piper/piper_settings_page.py
+++ b/system/tts_engines/piper/piper_settings_page.py
@@ -197,7 +197,7 @@ def download_language_pack(lang_code, progress=gr.Progress()):
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
-def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -205,13 +205,17 @@ def piper_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, l
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -249,6 +253,7 @@ def piper_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -259,12 +264,17 @@ def piper_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -283,7 +293,7 @@ def piper_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
- submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(piper_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
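
Every control on these pages derives its label, value, and interactivity from the engine's capability flags (note the JSON key is spelled `model_capabilties` throughout the codebase). A reduced sketch of that gating pattern, using a plain list for `choices` — the helper itself is hypothetical:

```python
# Reduced sketch of the capability-gating pattern used by the settings pages:
# a control is labelled "N/A" and made non-interactive when the engine's
# capability flag is false.
import gradio as gr

def capability_radio(config: dict, cap: str, setting: str, label: str) -> gr.Radio:
    capable = config["model_capabilties"][cap]  # key spelling matches the files
    return gr.Radio(
        choices=["Enabled", "Disabled"],
        label=label if capable else f"{label} N/A",
        value="Enabled" if config["settings"][setting] else "Disabled",
        interactive=capable,
    )

# e.g. capability_radio(model_config_data, "streaming_capable",
#                       "streaming_enabled", "Streaming")
```
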
diff --git a/system/tts_engines/template-tts-engine/help_content.py b/system/tts_engines/template-tts-engine/help_content.py
index 277f9746..b1f6acad 100644
--- a/system/tts_engines/template-tts-engine/help_content.py
+++ b/system/tts_engines/template-tts-engine/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Lowers perceived latency by delivering audio while synthesis is still in progress
+ - Only available for engines and models that support streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/template-tts-engine/model_engine.py b/system/tts_engines/template-tts-engine/model_engine.py
index 5f78bb63..08b5b0e9 100644
--- a/system/tts_engines/template-tts-engine/model_engine.py
+++ b/system/tts_engines/template-tts-engine/model_engine.py
@@ -92,6 +92,7 @@ def __init__(self):
        self.def_character_voice = tts_model_loaded["settings"]["def_character_voice"] # What is the current default main/character voice that will be used if no voice is specified.
        self.def_narrator_voice = tts_model_loaded["settings"]["def_narrator_voice"] # What is the current default narrator voice that will be used if no voice is specified.
        self.deepspeed_enabled = tts_model_loaded["settings"]["deepspeed_enabled"] # If it's available, is DeepSpeed enabled for the TTS engine
+        self.streaming_enabled = tts_model_loaded["settings"]["streaming_enabled"] # If it's available, is Streaming enabled for the TTS engine
        self.engine_installed = tts_model_loaded["settings"]["engine_installed"] # Has the TTS engine been set up/installed (not currently used)
        self.generationspeed_set = tts_model_loaded["settings"]["generationspeed_set"] # What is the set/stored speed for generation.
        self.lowvram_enabled = tts_model_loaded["settings"]["lowvram_enabled"] # If it's available, is LowVRAM enabled for the TTS engine
@@ -102,10 +103,13 @@ def __init__(self):
        self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (if the model supports pitch)
# Gather the OpenAI API Voice Mappings
        self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to the OpenAI Alloy voice
+        self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to the OpenAI Ash voice
+        self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to the OpenAI Coral voice
        self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to the OpenAI Echo voice
        self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to the OpenAI Fable voice
        self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to the OpenAI Nova voice
        self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to the OpenAI Onyx voice
+        self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to the OpenAI Sage voice
        self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to the OpenAI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -417,5 +421,5 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
generate_end_time = time.time()
generate_elapsed_time = generate_end_time - generate_start_time
- print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+ print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
self.tts_generating_lock = False
diff --git a/system/tts_engines/template-tts-engine/model_settings.json b/system/tts_engines/template-tts-engine/model_settings.json
index a7707549..ac7edcdf 100644
--- a/system/tts_engines/template-tts-engine/model_settings.json
+++ b/system/tts_engines/template-tts-engine/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "male_01.wav",
"deepspeed_enabled": true,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "female_01.wav",
+ "ash": "female_01.wav",
+ "coral": "female_01.wav",
"echo": "female_01.wav",
"fable": "female_01.wav",
"nova": "female_01.wav",
"onyx": "female_01.wav",
+ "sage": "female_01.wav",
"shimmer": "female_01.wav"
}
}
\ No newline at end of file
diff --git a/system/tts_engines/template-tts-engine/modelname_settings_page.py b/system/tts_engines/template-tts-engine/modelname_settings_page.py
index 8fb2b4f7..d9586bd6 100644
--- a/system/tts_engines/template-tts-engine/modelname_settings_page.py
+++ b/system/tts_engines/template-tts-engine/modelname_settings_page.py
@@ -48,7 +48,7 @@ def xtts_voices_file_list():
#
# You do not need to modify the function's logic or any other part of the code.
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -57,13 +57,17 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -104,6 +108,7 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -114,12 +119,17 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -138,7 +148,7 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
- submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
diff --git a/system/tts_engines/template-tts-engine/template_engine.py b/system/tts_engines/template-tts-engine/template_engine.py
index f95c5a96..b57593e3 100644
--- a/system/tts_engines/template-tts-engine/template_engine.py
+++ b/system/tts_engines/template-tts-engine/template_engine.py
@@ -270,10 +270,13 @@ def __init__(self):
OpenAI Voice Mappings:
- self.openai_alloy: Alloy voice mapping
+ - self.openai_ash: Ash voice mapping
+ - self.openai_coral: Coral voice mapping
- self.openai_echo: Echo voice mapping
- self.openai_fable: Fable voice mapping
- self.openai_nova: Nova voice mapping
- self.openai_onyx: Onyx voice mapping
+ - self.openai_sage: Sage voice mapping
- self.openai_shimmer: Shimmer voice mapping
Integration Requirements:
@@ -339,11 +342,14 @@ def __init__(self):
# DO NOT MODIFY - OpenAI voice mappings from model_settings.json
self.openai_alloy = model_settings_file["openai_voices"]["alloy"]
+ self.openai_ash = model_settings_file["openai_voices"]["ash"]
+ self.openai_coral = model_settings_file["openai_voices"]["coral"]
self.openai_echo = model_settings_file["openai_voices"]["echo"]
self.openai_fable = model_settings_file["openai_voices"]["fable"]
self.openai_nova = model_settings_file["openai_voices"]["nova"]
self.openai_onyx = model_settings_file["openai_voices"]["onyx"]
- self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"]
+ self.openai_sage = model_settings_file["openai_voices"]["sage"]
+ self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"]
"""
    Below is the name of the folder that will be created/used under `/models/{folder}`
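
One compatibility caveat: a `model_settings.json` written before this change has no `ash`, `coral`, or `sage` keys, and the engines index `openai_voices` directly, so loading an old file would raise `KeyError`. A defensive sketch (not what the code above does) that backfills any missing mappings:

```python
# Defensive sketch for older model_settings.json files that predate the
# ash/coral/sage keys; the engines above index directly, so an old settings
# file must be regenerated or hand-edited to include the three new voices.
import json

def load_openai_voices(path: str, default_voice: str) -> dict:
    with open(path, "r") as f:
        voices = json.load(f)["openai_voices"]
    for name in ("alloy", "ash", "coral", "echo", "fable",
                 "nova", "onyx", "sage", "shimmer"):
        voices.setdefault(name, default_voice)  # fill any missing mapping
    return voices
```
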
diff --git a/system/tts_engines/vits/help_content.py b/system/tts_engines/vits/help_content.py
index 21d48c65..f0478e7b 100644
--- a/system/tts_engines/vits/help_content.py
+++ b/system/tts_engines/vits/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Lowers perceived latency by delivering audio while synthesis is still in progress
+ - Only available for engines and models that support streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
diff --git a/system/tts_engines/vits/model_engine.py b/system/tts_engines/vits/model_engine.py
index 9e514ba5..a50b6070 100644
--- a/system/tts_engines/vits/model_engine.py
+++ b/system/tts_engines/vits/model_engine.py
@@ -109,10 +109,13 @@ def __init__(self):
        self.pitch_set = tts_model_loaded["settings"]["pitch_set"] # What is the currently set pitch of the model (if the model supports pitch)
# Gather the OpenAI API Voice Mappings
self.openai_alloy = tts_model_loaded["openai_voices"]["alloy"] # The TTS engine voice that will be mapped to Open AI Alloy voice
+ self.openai_ash = tts_model_loaded["openai_voices"]["ash"] # The TTS engine voice that will be mapped to Open AI Ash voice
+ self.openai_coral = tts_model_loaded["openai_voices"]["coral"] # The TTS engine voice that will be mapped to Open AI Coral voice
self.openai_echo = tts_model_loaded["openai_voices"]["echo"] # The TTS engine voice that will be mapped to Open AI Echo voice
self.openai_fable = tts_model_loaded["openai_voices"]["fable"] # The TTS engine voice that will be mapped to Open AI Fable voice
self.openai_nova = tts_model_loaded["openai_voices"]["nova"] # The TTS engine voice that will be mapped to Open AI Nova voice
self.openai_onyx = tts_model_loaded["openai_voices"]["onyx"] # The TTS engine voice that will be mapped to Open AI Onyx voice
+ self.openai_sage = tts_model_loaded["openai_voices"]["sage"] # The TTS engine voice that will be mapped to Open AI Sage voice
self.openai_shimmer = tts_model_loaded["openai_voices"]["shimmer"] # The TTS engine voice that will be mapped to Open AI Shimmer voice
###################################################################
# DONT CHANGE # Load params and api_defaults from confignew.json #
@@ -638,7 +641,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
generate_end_time = time.time() # Record the end time to generate TTS
generate_elapsed_time = generate_end_time - generate_start_time
- print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m")
+ print(f"[{self.branding}GEN] \033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m")
if self.lowvram_enabled and self.device == "cuda" and self.tts_narrator_generatingtts == False:
await self.handle_lowvram_change()
self.tts_generating_lock = False # Unlock the TTS generation queue to allow TTS generation requests to come in again.
diff --git a/system/tts_engines/vits/model_settings.json b/system/tts_engines/vits/model_settings.json
index f50609e2..1e9c6d89 100644
--- a/system/tts_engines/vits/model_settings.json
+++ b/system/tts_engines/vits/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "p225",
"def_narrator_voice": "p226",
"deepspeed_enabled": false,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": true,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "p225",
+ "ash": "p225",
+ "coral": "p225",
"echo": "p225",
"fable": "p225",
"nova": "p225",
"onyx": "p225",
+ "sage": "p225",
"shimmer": "p225"
}
}
\ No newline at end of file
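Existing installs carry a model_settings.json predating these keys, so a quick pre-flight check makes any gap visible before the settings page fails on a missing key. A small sketch (the path is illustrative):

```python
import json

REQUIRED_VOICES = {"alloy", "ash", "coral", "echo", "fable",
                   "nova", "onyx", "sage", "shimmer"}

def check_settings(path: str) -> None:
    """Report voice mappings or flags this patch expects but the file lacks."""
    with open(path, "r") as f:
        data = json.load(f)
    missing = REQUIRED_VOICES - set(data.get("openai_voices", {}))
    if missing:
        print(f"{path}: missing openai_voices entries: {sorted(missing)}")
    if "streaming_enabled" not in data.get("settings", {}):
        print(f"{path}: missing settings.streaming_enabled")

check_settings("system/tts_engines/vits/model_settings.json")
```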
diff --git a/system/tts_engines/vits/vits_settings_page.py b/system/tts_engines/vits/vits_settings_page.py
index 32326980..d73a7c30 100644
--- a/system/tts_engines/vits/vits_settings_page.py
+++ b/system/tts_engines/vits/vits_settings_page.py
@@ -231,7 +231,7 @@ def download_language_pack(lang_code, progress=gr.Progress()):
# dictionaries with the values provided as arguments, and save the updated settings back to the JSON file.
#
# You do not need to modify the function's logic or any other part of the code.
-def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -239,13 +239,17 @@ def vits_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -283,6 +287,7 @@ def vits_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -293,12 +298,17 @@ def vits_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -317,7 +327,7 @@ def vits_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
- submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(vits_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
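The settings pages keep Radio state as the strings "Enabled"/"Disabled" and only collapse it to a boolean at save time via `== "Enabled"`; the new streaming control reuses that round-trip. The convention in isolation:

```python
def bool_to_radio(value: bool) -> str:
    """JSON boolean -> Radio display string."""
    return "Enabled" if value else "Disabled"

def radio_to_bool(value: str) -> bool:
    """Radio display string -> JSON boolean (anything else reads as False)."""
    return value == "Enabled"

assert radio_to_bool(bool_to_radio(True)) is True
assert radio_to_bool(bool_to_radio(False)) is False
```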
diff --git a/system/tts_engines/xtts/help_content.py b/system/tts_engines/xtts/help_content.py
index cc03f8be..03c35f66 100644
--- a/system/tts_engines/xtts/help_content.py
+++ b/system/tts_engines/xtts/help_content.py
@@ -237,6 +237,11 @@ class AllTalkHelpContent:
- Accelerates TTS generation using optimized inference
- Only available for engines and models that support DeepSpeed
- Requires NVIDIA GPU with CUDA support
+
+ - **Stream Response Capability**
+ - Enables real-time streaming of generated speech output
+ - Reduces latency by returning audio as it is generated
+ - Only available for engines and models that support Streaming
- **Temperature Control**
- Adjusts the variability in speech generation
@@ -267,12 +272,15 @@ class AllTalkHelpContent:
### OpenAI Voice Mappings
- Only relevant when using the OpenAI-compatible API endpoint
- - Maps OpenAI's six standard voices to equivalent voices in the current engine:
+ - Maps OpenAI's nine standard voices to equivalent voices in the current engine:
- `alloy`
+ - `ash`
+ - `coral`
- `echo`
- `fable`
- `nova`
- `onyx`
+ - `sage`
- `shimmer`
- Essential for maintaining compatibility with OpenAI API calls
- Each mapping can be customized to any available voice in the current engine
@@ -310,6 +318,11 @@ class AllTalkHelpContent:
- Requires NVIDIA GPU with CUDA support
- 2-3x speed improvement in generation
- Recommended when available
+
+ - **Streaming Support**
+ - Enables real-time streaming of generated speech output
+ - Reduces latency by returning audio as it is generated
+ - Only available for engines and models that support Streaming
- **Multi-Language Support**
- Clone voices across multiple languages
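On the wire, the streaming path described above arrives as a chunked audio body rather than a complete file, so clients should iterate the response instead of buffering it whole. A hedged sketch, using the same assumed local endpoint as the earlier example:

```python
import requests

URL = "http://127.0.0.1:7851/v1/audio/speech"  # hypothetical local endpoint

payload = {
    "model": "tts-1",
    "input": "Streaming lets playback begin before synthesis finishes.",
    "voice": "coral",
}

# With streaming enabled server-side, the route returns a StreamingResponse;
# iter_content() yields audio as it is produced.
with requests.post(URL, json=payload, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    with open("streamed.wav", "wb") as f:
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
```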
diff --git a/system/tts_engines/xtts/model_engine.py b/system/tts_engines/xtts/model_engine.py
index c4504888..5231f715 100644
--- a/system/tts_engines/xtts/model_engine.py
+++ b/system/tts_engines/xtts/model_engine.py
@@ -24,7 +24,6 @@
Note: You can add new functions, just DONT remove the functions that are already there, even if they
are doing nothing, as `tts_server.py` will still look for their existence and fail if they are missing.
"""
-
########################################
# Default imports # Do not change this #
########################################
@@ -291,10 +290,13 @@ def __init__(self):
OpenAI Voice Mappings:
- self.openai_alloy: Alloy voice mapping
+ - self.openai_ash: Ash voice mapping
+ - self.openai_coral: Coral voice mapping
- self.openai_echo: Echo voice mapping
- self.openai_fable: Fable voice mapping
- self.openai_nova: Nova voice mapping
- self.openai_onyx: Onyx voice mapping
+ - self.openai_sage: Sage voice mapping
- self.openai_shimmer: Shimmer voice mapping
Integration Requirements:
@@ -350,6 +352,7 @@ def __init__(self):
self.def_character_voice = model_settings_file["settings"]["def_character_voice"]
self.def_narrator_voice = model_settings_file["settings"]["def_narrator_voice"]
self.deepspeed_enabled = model_settings_file["settings"]["deepspeed_enabled"]
+ self.streaming_enabled = model_settings_file["settings"]["streaming_enabled"]
self.engine_installed = model_settings_file["settings"]["engine_installed"]
self.generationspeed_set = model_settings_file["settings"]["generationspeed_set"]
self.lowvram_enabled = model_settings_file["settings"]["lowvram_enabled"]
@@ -360,10 +363,13 @@ def __init__(self):
# DO NOT MODIFY - OpenAI voice mappings from model_settings.json
self.openai_alloy = model_settings_file["openai_voices"]["alloy"]
+ self.openai_ash = model_settings_file["openai_voices"]["ash"]
+ self.openai_coral = model_settings_file["openai_voices"]["coral"]
self.openai_echo = model_settings_file["openai_voices"]["echo"]
self.openai_fable = model_settings_file["openai_voices"]["fable"]
self.openai_nova = model_settings_file["openai_voices"]["nova"]
self.openai_onyx = model_settings_file["openai_voices"]["onyx"]
+ self.openai_sage = model_settings_file["openai_voices"]["sage"]
self.openai_shimmer = model_settings_file["openai_voices"]["shimmer"]
"""
@@ -967,7 +973,44 @@ async def handle_tts_method_change(self, tts_method):
self.print_message(f"\033[94mModel Loadtime: \033[93m{generate_elapsed_time:.2f}\033[94m seconds\033[0m")
return True
- async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming):
+ async def prepare_voice_inputs(self, voice):
+ """Prepares latents and embeddings based on the voice input."""
+ gpt_cond_latent = None
+ speaker_embedding = None
+
+ if voice.startswith('latent:'):
+ if self.current_model_loaded.startswith("xtts"):
+ gpt_cond_latent, speaker_embedding = self._load_latents(voice)
+
+ elif voice.startswith('voiceset:'):
+ voice_set = voice.replace("voiceset:", "")
+ voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set)
+ self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts")
+
+ wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav"))
+ if not wavs_files:
+ self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error")
+ raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}")
+
+ if len(wavs_files) > 5:
+ wavs_files = random.sample(wavs_files, 5)
+ self.print_message("Using 5 random samples from voice set", message_type="debug_tts")
+
+ if self.current_model_loaded.startswith("xtts"):
+ gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
+
+ else:
+ normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
+ wavs_files = [normalized_path]
+ self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts")
+
+ if self.current_model_loaded.startswith("xtts"):
+ gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
+
+ return gpt_cond_latent, speaker_embedding
+
+ async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file,
+ streaming):
"""
Generate speech from text using the XTTS model.
@@ -1017,71 +1060,33 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
generate_start_time = time.time()
try:
- # Voice input processing
- self.print_message(f"Processing voice input: {voice}", message_type="debug_tts")
- gpt_cond_latent = None
- speaker_embedding = None
-
- # Handle different voice types
- if voice.startswith('latent:'):
- if self.current_model_loaded.startswith("xtts"):
- gpt_cond_latent, speaker_embedding = self._load_latents(voice)
-
- elif voice.startswith('voiceset:'):
- voice_set = voice.replace("voiceset:", "")
- voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set)
- self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts")
-
- wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav"))
- if not wavs_files:
- self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error")
- raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}")
-
- if len(wavs_files) > 5:
- wavs_files = random.sample(wavs_files, 5)
- self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts")
-
- if self.current_model_loaded.startswith("xtts"):
- self.print_message("Generating conditioning latents from voice set", message_type="debug_tts")
- gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
-
- else:
- normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
- wavs_files = [normalized_path]
- self.print_message(f"Using single voice sample: {normalized_path}", message_type="debug_tts")
-
- if self.current_model_loaded.startswith("xtts"):
- self.print_message("Generating conditioning latents from single sample", message_type="debug_tts")
- gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)
-
- # Generate speech
+ # Prepare latents and embeddings for the requested voice
+ gpt_cond_latent, speaker_embedding = await self.prepare_voice_inputs(voice)
+
+ common_args = {
+ "text": text,
+ "language": language,
+ "gpt_cond_latent": gpt_cond_latent,
+ "speaker_embedding": speaker_embedding,
+ "temperature": float(temperature),
+ "length_penalty": float(self.model.config.length_penalty),
+ "repetition_penalty": float(repetition_penalty),
+ "top_k": int(self.model.config.top_k),
+ "top_p": float(self.model.config.top_p),
+ "speed": float(speed),
+ "enable_text_splitting": True
+ }
+
+ self.print_message("Generation settings:", message_type="debug_tts_variables")
+ self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables")
+ self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables")
+ self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables")
+ self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables")
+
+ # Handle streaming vs non-streaming
if self.current_model_loaded.startswith("xtts"):
- self.print_message(f"Generating speech for text: {text}", message_type="debug_tts")
-
- common_args = {
- "text": text,
- "language": language,
- "gpt_cond_latent": gpt_cond_latent,
- "speaker_embedding": speaker_embedding,
- "temperature": float(temperature),
- "length_penalty": float(self.model.config.length_penalty),
- "repetition_penalty": float(repetition_penalty),
- "top_k": int(self.model.config.top_k),
- "top_p": float(self.model.config.top_p),
- "speed": float(speed),
- "enable_text_splitting": True
- }
-
- self.print_message("Generation settings:", message_type="debug_tts_variables")
- self.print_message(f"├─ Temperature: {temperature}", message_type="debug_tts_variables")
- self.print_message(f"├─ Speed: {speed}", message_type="debug_tts_variables")
- self.print_message(f"├─ Language: {language}", message_type="debug_tts_variables")
- self.print_message(f"└─ Text length: {len(text)} characters", message_type="debug_tts_variables")
-
- # Handle streaming vs non-streaming
if streaming:
self.print_message("Starting streaming generation", message_type="debug_tts")
- self.print_message(f"Using streaming-based generation and files {wavs_files}")
output = self.model.inference_stream(**common_args, stream_chunk_size=20)
file_chunks = []
@@ -1101,7 +1106,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
self.tts_generating_lock = False
break
- self.print_message(f"Processing chunk {i+1}", message_type="debug_tts")
+ self.print_message(f"Processing chunk {i + 1}", message_type="debug_tts")
file_chunks.append(chunk)
if isinstance(chunk, list):
chunk = torch.cat(chunk, dim=0)
@@ -1118,9 +1123,9 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
elif self.current_model_loaded.startswith("apitts"):
if streaming:
- raise ValueError("Streaming is only supported in XTTSv2 local mode")
+ raise ValueError("Streaming is not supported in APITTS mode")
# Common arguments for both error and normal cases
- common_args = {
+ api_args = {
"file_path": output_file,
"language": language,
"temperature": temperature,
@@ -1128,23 +1133,20 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
"repetition_penalty": repetition_penalty,
"top_k": self.model.config.top_k,
"top_p": self.model.config.top_p,
- "speed": speed
- }
- if voice.startswith('latent:'):
+ "speed": speed,
+ }
+
+ if voice.startswith("latent:"):
self.print_message("API TTS method does not support latent files - Please use an audio reference file", message_type="error")
self.model.tts_to_file(
text="The API TTS method only supports audio files not latents. Please select an audio reference file instead.",
speaker="Ana Florence",
- **common_args
+ **api_args,
)
else:
self.print_message("Using API-based generation", message_type="debug_tts")
- self.model.tts_to_file(
- text=text,
- speaker_wav=wavs_files,
- **common_args
- )
-
+ voice_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
+ self.model.tts_to_file(text=text, speaker_wav=[voice_path], **api_args)
+
self.print_message(f"API generation completed, saved to: {output_file}", message_type="debug_tts")
finally:
@@ -1154,7 +1156,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena
# Standard output message (not debug)
self.print_message(
- f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled}\033[0m",
+ f"\033[94mTTS Generate: \033[93m{generate_elapsed_time:.2f} seconds. \033[94mLowVRAM: \033[33m{self.lowvram_enabled} \033[94mDeepSpeed: \033[33m{self.deepspeed_enabled} \033[94mStreaming: \033[33m{self.streaming_enabled}\033[0m",
message_type="standard"
)
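With voice preparation factored out, the generation body reduces to a single dispatch on `streaming`. A condensed sketch of that control flow (abbreviated arguments; assumes the Coqui XTTS `inference`/`inference_stream` calls this file already relies on):

```python
import numpy as np
import torch

def run_xtts(model, common_args: dict, streaming: bool):
    """Yield raw audio samples, either chunk by chunk or as one full clip."""
    if streaming:
        # inference_stream yields audio tensors as they are synthesised
        for chunk in model.inference_stream(**common_args, stream_chunk_size=20):
            if isinstance(chunk, list):          # some chunks arrive as lists
                chunk = torch.cat(chunk, dim=0)
            yield chunk.cpu().numpy().tobytes()  # hand to the HTTP layer
    else:
        # inference returns the complete waveform in one call
        output = model.inference(**common_args)
        yield np.asarray(output["wav"]).tobytes()
```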
diff --git a/system/tts_engines/xtts/model_settings.json b/system/tts_engines/xtts/model_settings.json
index b680cfa2..ed5b298c 100644
--- a/system/tts_engines/xtts/model_settings.json
+++ b/system/tts_engines/xtts/model_settings.json
@@ -24,6 +24,7 @@
"def_character_voice": "female_01.wav",
"def_narrator_voice": "male_01.wav",
"deepspeed_enabled": false,
+ "streaming_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
@@ -33,10 +34,13 @@
},
"openai_voices": {
"alloy": "female_01.wav",
+ "ash": "female_01.wav",
+ "coral": "female_01.wav",
"echo": "female_01.wav",
"fable": "female_01.wav",
"nova": "female_01.wav",
"onyx": "female_01.wav",
+ "sage": "female_01.wav",
"shimmer": "female_01.wav"
}
}
\ No newline at end of file
diff --git a/system/tts_engines/xtts/xtts_settings_page.py b/system/tts_engines/xtts/xtts_settings_page.py
index cb3f7e70..f26b1354 100644
--- a/system/tts_engines/xtts/xtts_settings_page.py
+++ b/system/tts_engines/xtts/xtts_settings_page.py
@@ -49,7 +49,7 @@ def xtts_voices_file_list():
#
# You do not need to modify the function's logic or any other part of the code.
-def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr):
+def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr):
# Load the model_config_data from the JSON file
with open(os.path.join(this_dir, "model_settings.json"), "r") as f:
model_config_data = json.load(f)
@@ -58,13 +58,17 @@ def xtts_model_update_settings(def_character_voice_gr, def_narrator_voice_gr, lo
model_config_data["settings"]["def_character_voice"] = def_character_voice_gr
model_config_data["settings"]["def_narrator_voice"] = def_narrator_voice_gr
model_config_data["openai_voices"]["alloy"] = alloy_gr
+ model_config_data["openai_voices"]["ash"] = ash_gr
+ model_config_data["openai_voices"]["coral"] = coral_gr
model_config_data["openai_voices"]["echo"] = echo_gr
model_config_data["openai_voices"]["fable"] = fable_gr
model_config_data["openai_voices"]["nova"] = nova_gr
model_config_data["openai_voices"]["onyx"] = onyx_gr
+ model_config_data["openai_voices"]["sage"] = sage_gr
model_config_data["openai_voices"]["shimmer"] = shimmer_gr
model_config_data["settings"]["lowvram_enabled"] = lowvram_enabled_gr == "Enabled"
model_config_data["settings"]["deepspeed_enabled"] = deepspeed_enabled_gr == "Enabled"
+ model_config_data["settings"]["streaming_enabled"] = streaming_enabled_gr == "Enabled"
model_config_data["settings"]["temperature_set"] = temperature_set_gr
model_config_data["settings"]["repetitionpenalty_set"] = repetitionpenalty_set_gr
model_config_data["settings"]["pitch_set"] = pitch_set_gr
@@ -105,6 +109,7 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Row():
lowvram_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Low VRAM" if model_config_data["model_capabilties"]["lowvram_capable"] else "Low VRAM N/A", value="Enabled" if model_config_data["settings"]["lowvram_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["lowvram_capable"])
deepspeed_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="DeepSpeed Activate" if model_config_data["model_capabilties"]["deepspeed_capable"] else "DeepSpeed N/A", value="Enabled" if model_config_data["settings"]["deepspeed_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["deepspeed_capable"])
+ streaming_enabled_gr = gr.Radio(choices={"Enabled": "true", "Disabled": "false"}, label="Streaming" if model_config_data["model_capabilties"]["streaming_capable"] else "Streaming N/A", value="Enabled" if model_config_data["settings"]["streaming_enabled"] else "Disabled", interactive=model_config_data["model_capabilties"]["streaming_capable"])
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
@@ -115,12 +120,17 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Group():
with gr.Row():
alloy_gr = gr.Dropdown(value=model_config_data["openai_voices"]["alloy"], label="Alloy", choices=voice_list, allow_custom_value=True)
+ ash_gr = gr.Dropdown(value=model_config_data["openai_voices"]["ash"], label="Ash", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
+ coral_gr = gr.Dropdown(value=model_config_data["openai_voices"]["coral"], label="Coral", choices=voice_list, allow_custom_value=True)
echo_gr = gr.Dropdown(value=model_config_data["openai_voices"]["echo"], label="Echo", choices=voice_list, allow_custom_value=True)
with gr.Row():
fable_gr = gr.Dropdown(value=model_config_data["openai_voices"]["fable"], label="Fable", choices=voice_list, allow_custom_value=True)
nova_gr = gr.Dropdown(value=model_config_data["openai_voices"]["nova"], label="Nova", choices=voice_list, allow_custom_value=True)
with gr.Row():
onyx_gr = gr.Dropdown(value=model_config_data["openai_voices"]["onyx"], label="Onyx", choices=voice_list, allow_custom_value=True)
+ sage_gr = gr.Dropdown(value=model_config_data["openai_voices"]["sage"], label="Sage", choices=voice_list, allow_custom_value=True)
+ with gr.Row():
shimmer_gr = gr.Dropdown(value=model_config_data["openai_voices"]["shimmer"], label="Shimmer", choices=voice_list, allow_custom_value=True)
with gr.Column():
gr.Markdown("### Default Voices")
@@ -139,7 +149,7 @@ def xtts_model_alltalk_settings(model_config_data):
with gr.Row():
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS1, elem_classes="custom-markdown")
gr.Markdown(AllTalkHelpContent.DEFAULT_SETTINGS2, elem_classes="custom-markdown")
- submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, echo_gr, fable_gr, nova_gr, onyx_gr, shimmer_gr], outputs=output_message)
+ submit_button.click(xtts_model_update_settings, inputs=[def_character_voice_gr, def_narrator_voice_gr, lowvram_enabled_gr, deepspeed_enabled_gr, streaming_enabled_gr, temperature_set_gr, repetitionpenalty_set_gr, pitch_set_gr, generationspeed_set_gr, alloy_gr, ash_gr, coral_gr, echo_gr, fable_gr, nova_gr, onyx_gr, sage_gr, shimmer_gr], outputs=output_message)
###########################################################################################
# Do not change this section apart from "TTS Engine Name" value to match your engine name #
@@ -289,6 +299,3 @@ def confirm_download(model_name):
def xtts_at_gradio_settings_page(model_config_data):
app = xtts_model_alltalk_settings(model_config_data)
return app
-def xtts_at_gradio_settings_page(model_config_data):
- app = xtts_model_alltalk_settings(model_config_data)
- return app
diff --git a/test_server.py b/test_server.py
index e3d2d199..49c0c827 100644
--- a/test_server.py
+++ b/test_server.py
@@ -729,7 +729,7 @@ async def _test_openai_tts(self, text: str):
self.logger.info("Testing OpenAI compatible endpoint")
self.logger.info(f"Testing OpenAI generation with text: {text}")
- test_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"]
+ test_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
test_formats = ["wav", "mp3", "opus", "aac"]
for voice in test_voices[:2]: # Test first two voices only
diff --git a/tts_server.py b/tts_server.py
index 07630ee6..263192c7 100644
--- a/tts_server.py
+++ b/tts_server.py
@@ -39,9 +39,26 @@
import numpy as np
import soundfile as sf
import librosa
+from langdetect import detect, DetectorFactory
+from langdetect.lang_detect_exception import LangDetectException
from config import AlltalkConfig, AlltalkTTSEnginesConfig
logging.disable(logging.WARNING)
+DetectorFactory.seed = 0 # Ensure deterministic behavior
+
+# Mapping of detected languages to xtts-supported languages
+LANG_FALLBACKS = {
+ "en": "en", "es": "es", "fr": "fr", "de": "de", "it": "it",
+ "pt": "pt", "pl": "pl", "tr": "tr", "ru": "ru", "nl": "nl",
+ "cs": "cs", "ar": "ar", "zh-cn": "zh", "zh-tw": "zh", "ja": "ja",
+ "hu": "hu", "ko": "ko",
+
+ # Additional fallbacks for unsupported languages
+ "uk": "ru", # Ukrainian → Russian
+ "bg": "ru", # Bulgarian → Russian
+ "ca": "fr", # Catalan → French
+}
+
########################################################################################
# START-UP # Silence RVC warning about torch.nn.utils.weight_norm even though not used #
########################################################################################
@@ -938,22 +955,37 @@ async def generate_audio(text, voice, language, temperature, repetition_penalty,
print_message("each TTS Engine in the 'Engine Information' section of the Gradio interface.", "warning", "GEN")
raise ValueError("Streaming not supported by current TTS engine")
- response = model_engine.generate_tts(text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming)
+ if language == "auto":
+ language = detect_language(text)
+ # Streaming mode
if streaming:
- async def stream_response():
+ print_message("Streaming mode enabled", "debug", "TTS")
+ response = model_engine.generate_tts(
+ text, voice, language, temperature, repetition_penalty, speed, pitch, output_file=None, streaming=True
+ )
+
+ async def stream_audio():
try:
async for chunk in response:
yield chunk
except Exception as e:
print_message(f"Error during streaming audio generation: {str(e)}", "error", "GEN")
raise
- return stream_response()
+
+ return stream_audio()
+
+ # Non-streaming mode
+ print_message("Non-streaming mode enabled", "debug", "TTS")
+ response = model_engine.generate_tts(
+ text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming=False
+ )
+
try:
async for _ in response:
pass
except Exception as e:
- print_message(f"Error during audio generation: {str(e)}", "error", "GEN")
+ print_message(f"Error during audio generation: {str(e)}", "error", "TTS")
raise
###########################
@@ -1046,7 +1078,7 @@ class OpenAIInput(BaseModel):
@classmethod
def validate_voice(cls, value):
"""Validate that the requested voice is supported by OpenAI TTS."""
- supported_voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"]
+ supported_voices = ["alloy", "ash", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
if value not in supported_voices:
raise ValueError(f"Voice must be one of {supported_voices}")
return value
@@ -1103,22 +1135,27 @@ async def openai_tts_generate(request: Request):
# Extract and validate parameters
input_text = json_data["input"]
voice = json_data["voice"]
- response_format = json_data.get("response_format", "wav").lower()
speed = json_data.get("speed", 1.0)
print_message(f"Input text: {input_text}", "debug_openai", "TTS")
print_message(f"Voice: {voice}", "debug_openai", "TTS")
print_message(f"Speed: {speed}", "debug_openai", "TTS")
+ # Load current model engine configuration
+ current_model_engine = tts_class()
+
# Process text and map voice
cleaned_string = html.unescape(standard_filtering(input_text))
voice_mapping = {
- "alloy": model_engine.openai_alloy,
- "echo": model_engine.openai_echo,
- "fable": model_engine.openai_fable,
- "nova": model_engine.openai_nova,
- "onyx": model_engine.openai_onyx,
- "shimmer": model_engine.openai_shimmer
+ "alloy": current_model_engine.openai_alloy,
+ "ash": current_model_engine.openai_ash,
+ "coral": current_model_engine.openai_coral,
+ "echo": current_model_engine.openai_echo,
+ "fable": current_model_engine.openai_fable,
+ "nova": current_model_engine.openai_nova,
+ "onyx": current_model_engine.openai_onyx,
+ "sage": current_model_engine.openai_sage,
+ "shimmer": current_model_engine.openai_shimmer
}
mapped_voice = voice_mapping.get(voice)
@@ -1128,37 +1165,48 @@ async def openai_tts_generate(request: Request):
print_message(f"Mapped voice: {mapped_voice}", "debug_openai", "TTS")
- # Generate audio
- unique_id = uuid.uuid4()
- timestamp = int(time.time())
- output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{model_engine.audio_format}"}'
-
- if config.debugging.debug_fullttstext:
- print_message(cleaned_string, component="TTS")
+ if current_model_engine.streaming_enabled:
+ audio_stream = await generate_audio(
+ cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set,
+ float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set,
+ output_file=None, streaming=True
+ )
+ return StreamingResponse(audio_stream, media_type="audio/wav")
else:
- print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS")
+ # Generate audio
+ unique_id = uuid.uuid4()
+ timestamp = int(time.time())
+ output_file_path = f'{this_dir / config.get_output_directory() / f"openai_output_{unique_id}_{timestamp}.{current_model_engine.audio_format}"}'
+ response_format = json_data.get("response_format", "wav").lower()
- await generate_audio(cleaned_string, mapped_voice, "en", model_engine.temperature_set,
- model_engine.repetitionpenalty_set, speed, model_engine.pitch_set,
- output_file_path, streaming=False)
+ if config.debugging.debug_fullttstext:
+ print_message(cleaned_string, component="TTS")
+ else:
+ print_message(f"{cleaned_string[:90]}{'...' if len(cleaned_string) > 90 else ''}", component="TTS")
+
+ await generate_audio(
+ cleaned_string, mapped_voice, "auto", current_model_engine.temperature_set,
+ float(str(current_model_engine.repetitionpenalty_set).replace(',', '.')), speed, current_model_engine.pitch_set,
+ output_file_path, streaming=False
+ )
- print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS")
+ print_message(f"Audio generated at: {output_file_path}", "debug_openai", "TTS")
- # Handle RVC processing
- if config.rvc_settings.rvc_enabled:
- if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]:
- print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS")
- else:
- print_message("send to rvc", "debug_openai", "TTS")
- pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file
- pitch = config.rvc_settings.pitch
- run_rvc(output_file_path, pth_path, pitch, infer_pipeline)
+ # Handle RVC processing
+ if config.rvc_settings.rvc_enabled:
+ if config.rvc_settings.rvc_char_model_file.lower() in ["disabled", "disable"]:
+ print_message("Pass rvccharacter_voice_gen", "debug_openai", "TTS")
+ else:
+ print_message("send to rvc", "debug_openai", "TTS")
+ pth_path = this_dir / "models" / "rvc_voices" / config.rvc_settings.rvc_char_model_file
+ pitch = config.rvc_settings.pitch
+ run_rvc(output_file_path, pth_path, pitch, infer_pipeline)
- transcoded_file_path = await transcode_for_openai(output_file_path, response_format)
- print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS")
+ transcoded_file_path = await transcode_for_openai(output_file_path, response_format)
+ print_message(f"Audio transcoded to: {transcoded_file_path}", "debug_openai", "TTS")
- response = FileResponse(transcoded_file_path, media_type=f"audio/{response_format}",
- filename=f"output.{response_format}")
+ return FileResponse(transcoded_file_path, media_type=f"audio/{response_format}",
+ filename=f"output.{response_format}")
except ValueError as e:
print_message(f"Value error occurred: {str(e)}", "error", "TTS")
@@ -1231,10 +1279,13 @@ async def transcode_for_openai(input_file, output_format):
class VoiceMappings(BaseModel):
"""OpenAI to engine voice mapping configuration."""
alloy: str
+ ash: str
+ coral: str
echo: str
fable: str
nova: str
onyx: str
+ sage: str
shimmer: str
@app.put("/api/openai-voicemap")
@@ -1246,10 +1297,13 @@ async def update_openai_voice_mappings(mappings: VoiceMappings):
# Update in-memory mappings
print_message("Updating in-memory voice mappings", "debug_openai", "TTS")
model_engine.openai_alloy = mappings.alloy
+ model_engine.openai_ash = mappings.ash
+ model_engine.openai_coral = mappings.coral
model_engine.openai_echo = mappings.echo
model_engine.openai_fable = mappings.fable
model_engine.openai_nova = mappings.nova
model_engine.openai_onyx = mappings.onyx
+ model_engine.openai_sage = mappings.sage
model_engine.openai_shimmer = mappings.shimmer
# Update settings file
@@ -1605,7 +1659,7 @@ class JSONInput(BaseModel):
rvcnarrator_voice_gen: str = Field(..., description="rvcnarrator_voice_gen needs to be the name of a valid pth file in the 'folder\\file.pth' format or the word 'Disabled'.")
rvcnarrator_pitch: float = Field(..., description="RVC Narrator pitch needs to be a number between -24 and 24")
text_not_inside: str = Field(..., pattern="^(character|narrator|silent)$", description="text_not_inside needs to be 'character', 'narrator' or 'silent'.")
- language: str = Field(..., pattern="^(ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.")
+ language: str = Field(..., pattern="^(auto|ar|zh-cn|zh|cs|nl|en|fr|de|hu|hi|it|ja|ko|pl|pt|ru|es|tr)$", description="language needs to be one of the following: auto, ar, zh-cn, zh, cs, nl, en, fr, de, hu, hi, it, ja, ko, pl, pt, ru, es, tr.")
output_file_name: str = Field(..., pattern="^[a-zA-Z0-9_]+$", description="output_file_name needs to be the name without any special characters or file extension, e.g., 'filename'.")
output_file_timestamp: bool = Field(..., description="output_file_timestamp needs to be true or false.")
autoplay: bool = Field(..., description="autoplay needs to be a true or false value.")
@@ -2098,6 +2152,30 @@ async def tts_finalize_output(audio_files: List[Path], params: dict) -> Tuple[Pa
return output_file_path, output_file_url, output_cache_url
+def detect_language(text: str) -> str:
+ """
+ Detect the language of the given text and apply a fallback for unsupported languages.
+
+ :param text: Text to analyze.
+ :return: A supported language code (e.g., 'en', 'fr').
+ """
+ try:
+ # Detect the language of the text
+ detected_lang = detect(text)
+ print_message(f"Detected language: {detected_lang}", "debug", "LANG_DETECTION")
+
+ # Use the fallback language if the detected one is unsupported
+ fallback_lang = LANG_FALLBACKS.get(detected_lang, "en") # Default fallback: English
+ if detected_lang != fallback_lang:
+ print_message(f"Language '{detected_lang}' is not directly supported, using '{fallback_lang}' instead", "warning",
+ "LANG_FALLBACK")
+
+ return fallback_lang
+ except LangDetectException as e:
+ # Handle errors in language detection
+ print_message(f"Language detection error: {str(e)}", "error", "LANG_DETECTION")
+ raise ValueError("Could not detect language")
+
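Because `DetectorFactory.seed` is pinned above, detection is deterministic and the fallback table is easy to exercise in isolation. A small standalone demo (an excerpt of the table is repeated so the snippet runs on its own; note that very short samples can still mis-detect):

```python
from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0  # same determinism setting as above

FALLBACKS = {"en": "en", "ru": "ru", "uk": "ru"}  # excerpt of LANG_FALLBACKS

for sample in ["The quick brown fox jumps over the lazy dog.",
               "Приклад тексту українською мовою."]:
    detected = detect(sample)
    resolved = FALLBACKS.get(detected, "en")  # default fallback is English
    print(f"detected={detected!r} -> resolved={resolved!r}")
# Expected: English resolves to 'en'; Ukrainian is detected as 'uk'
# and falls back to 'ru' per the table.
```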
@app.post("/api/tts-generate", response_class=JSONResponse)
async def apifunction_generate_tts_standard(
text_input: str = Form(...),