-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtts.py
executable file
·270 lines (222 loc) · 7.29 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!python3
import json
import random
import subprocess
import sys
import time
from pathlib import Path
from typing import Annotated, Iterator, Optional
import typer
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from icecream import ic
from loguru import logger
from pydantic import BaseModel
from rich.console import Console
from pydub import AudioSegment
import os
import re
console = Console()
app = typer.Typer(no_args_is_help=True)
@app.command()
def scratch():
    """Throwaway playground command for quick manual experiments."""
    ic("hello world")
# Friendly label -> ElevenLabs voice ID ("fin" is passed through by name).
# Consumed by say()/generate_audio()/podcast() to resolve voice choices.
voices = {
    "fin": "fin",
    "igor": "Nvd5I2HGnOWHNU0ijNEy",
    "ammon": "AwdhqucUs1YyNaWbqQ57",
    "rachel": "VrNQNREmlwaHD01224L3",
}
# Comma-separated labels, used in CLI --voice help text.
list_of_voices = ",".join(voices.keys())
@app.command()
def list_voices():
    """Print every voice available to the authenticated ElevenLabs account."""
    client = ElevenLabs()
    # get_all() returns a response wrapper; the actual voices live on .voices.
    # Iterating the wrapper itself would walk its model fields, not the voices.
    response = client.voices.get_all()
    for voice in response.voices:
        ic(voice)
def generate_audio(
    text: str,
    voice: str,
    voice_settings: Optional[VoiceSettings] = None,
    model: str = "eleven_turbo_v2",
) -> Iterator[bytes]:
    """Stream TTS audio chunks for *text* using a voice label from `voices`.

    Args:
        text: The text to synthesize.
        voice: A key of the module-level `voices` dict (e.g. "igor").
        voice_settings: Optional override; defaults to the house settings below.
        model: ElevenLabs model ID.

    Returns:
        An iterator of mp3 byte chunks (join them to get the full clip).

    Raises:
        ValueError: If *voice* is not a known voice label.
    """
    if voice_settings is None:
        # Build per call: a default-argument instance would be created once at
        # import time and shared (and mutable) across every call.
        voice_settings = VoiceSettings(
            stability=0.4, similarity_boost=0.6, style=0.36, use_speaker_boost=True
        )
    if voice not in voices:
        raise ValueError(f"Unknown voice {voice!r}; choose one of: {list_of_voices}")
    client = ElevenLabs()
    return client.generate(
        text=text,
        voice=voices[voice],
        model=model,
        voice_settings=voice_settings,
    )
@app.command()
def say(
    voice: Annotated[
        str, typer.Option(help=f"Model any of: {list_of_voices}")
    ] = "igor",
    fast: bool = True,
    copy: bool = False,
    outfile: Optional[Path] = None,
    speak: bool = True,
):
    """Speak stdin via ElevenLabs: save the mp3 (to *outfile* or a temp file) and play it.

    Args:
        voice: A label from the module-level `voices` dict.
        fast: Use the turbo model instead of the multilingual one.
        copy: Hand the output path to the project-local `pbf` helper.
        outfile: Where to save the mp3; a ~/tmp/tts temp file when omitted.
        speak: Play the result with afplay (macOS).
    """
    # Validate up front so a typo'd --voice gives a CLI error, not a KeyError.
    if voice not in voices:
        raise typer.BadParameter(f"voice must be one of: {list_of_voices}")
    voice = voices[voice]
    # Time the full read + generate round-trip.
    start = time.time()
    to_speak = "\n".join(sys.stdin.readlines())
    model = "eleven_turbo_v2" if fast else "eleven_multilingual_v2"
    ic(voice, model)
    client = ElevenLabs()
    voice_settings = VoiceSettings(
        stability=0.4, similarity_boost=0.6, style=0.36, use_speaker_boost=True
    )
    audio = client.generate(
        text=to_speak,
        voice=voice,
        model=model,
        voice_settings=voice_settings,
    )
    # generate() returns an iterator of chunks; materialize the whole clip.
    audio = b"".join(audio)
    print(f"Took {round(time.time() - start, 3)} seconds")
    if outfile is None:
        temp_path = Path.home() / "tmp/tts" / f"{random.random()}.mp3"
        temp_path.parent.mkdir(parents=True, exist_ok=True)
        outfile = temp_path
    outfile.write_bytes(audio)
    print(outfile)
    if speak:
        ic(speak)
        # afplay is macOS-only.
        subprocess.run(["afplay", outfile])
    if copy:
        import pbf  # project-local helper; imported lazily so other commands don't need it

        pbf.copy(outfile)
@app.command()
def podcast(
    infile: Path = Path("podcast.json"),
    outdir: Optional[Path] = None,
    speak: bool = True,
):
    """Render a JSON array of PodCastItems into one mp3 per line of dialogue.

    Args:
        infile: JSON file containing a list of {"Speaker", "ContentToSpeak"} items.
        outdir: Output directory; defaults to podcast_<infile stem>. Existing
            clip files are skipped, so a crashed run can be resumed.
        speak: Accepted for CLI symmetry; currently unused by this command.
    """
    # Default output dir is derived from the input filename (extension dropped).
    outdir = Path(f"podcast_{infile.stem}") if outdir is None else Path(outdir)
    # Existing dirs are tolerated on purpose: already-rendered clips are skipped
    # below, which makes reruns resumable rather than fatal.
    outdir.mkdir(parents=True, exist_ok=True)
    with open(infile, "r") as f:
        json_items = json.load(f)
    items = [PodCastItem.model_validate(item) for item in json_items]
    ic(items)
    # Speaker role -> voice label understood by generate_audio().
    speaker_to_voice = {"Host": "igor", "Guest": "rachel"}
    for index, item in enumerate(items, start=1):
        clip_path = outdir / f"{item.Speaker}_{index:03d}.mp3"
        ic(clip_path)
        if clip_path.exists():
            ic(f"Output file {clip_path} already exists - skipping")
            continue
        voice_label = speaker_to_voice.get(item.Speaker)
        if voice_label is None:
            raise ValueError(f"Unknown speaker {item.Speaker}")
        audio = generate_audio(item.ContentToSpeak, voice_label)
        # generate_audio returns an iterator of chunks; join before writing.
        clip_path.write_bytes(b"".join(audio))
@app.command()
def google_multi(pod=Path("pod.json"), speak: bool = True):
    """Synthesize a multi-speaker podcast with Google Cloud TTS and optionally play it.

    Args:
        pod: JSON file with a {"conversation": [{"speaker", "text"}, ...]} payload.
        speak: Play the result with afplay (macOS) after writing it.
    """
    from google.cloud import texttospeech_v1beta1 as tts
    from google.cloud.texttospeech_v1beta1 import (
        MultiSpeakerMarkup,
        AudioEncoding,
        VoiceSelectionParams,
        SynthesisInput,
        AudioConfig,
    )

    # Load the conversation: a list of {"speaker": ..., "text": ...} dicts.
    with open(pod, "r") as f:
        podcast = json.load(f)
    conversation = podcast["conversation"]
    markup_turns = [
        MultiSpeakerMarkup.Turn(text=turn["text"], speaker=turn["speaker"])
        for turn in conversation
    ]
    # The multi-speaker voice only accepts the speaker labels R/S/T/U, so remap.
    # Dedup while preserving first-appearance order — building this from a set
    # made the label assignment nondeterministic between runs.
    original_speakers = list(dict.fromkeys(turn.speaker for turn in markup_turns))
    ic(original_speakers)
    valid_google_speakers = ["R", "S", "T", "U"]
    if len(original_speakers) > len(valid_google_speakers):
        raise ValueError(
            f"{len(original_speakers)} speakers found, but Google supports at most "
            f"{len(valid_google_speakers)}"
        )
    speaker_map = dict(zip(original_speakers, valid_google_speakers))
    ic(speaker_map)
    for turn in markup_turns:
        turn.speaker = speaker_map[turn.speaker]
    multi_speaker_markup = MultiSpeakerMarkup(turns=markup_turns)
    ic(multi_speaker_markup)
    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type.
    response = tts.TextToSpeechClient().synthesize_speech(
        input=SynthesisInput(multi_speaker_markup=multi_speaker_markup),
        voice=VoiceSelectionParams(
            language_code="en-US", name="en-US-Studio-MultiSpeaker"
        ),
        audio_config=AudioConfig(audio_encoding=AudioEncoding.MP3),
    )
    # The response's audio_content is binary.
    # NOTE(review): AudioEncoding.MP3 is requested above but the original author
    # observed wav output; keeping the .wav name — confirm against the API.
    output_path = "pod.wav"  # not sure why, but it's only outputing wav
    ic(output_path)
    with open(output_path, "wb") as out:
        out.write(response.audio_content)
    if speak:
        ic(speak)
        # afplay is macOS-only.
        subprocess.run(["afplay", output_path])
@app.command()
def merge_audio(directory: Path):
    """Concatenate every numbered .mp3 in *directory* into merged_audio.mp3.

    Clips are ordered by the first run of digits in each filename (the
    NNN index that podcast() embeds). Files without a digit — including a
    merged_audio.mp3 left by a previous run — are skipped instead of
    crashing the sort (re.search returned None before and blew up on rerun).
    """
    output_name = "merged_audio.mp3"

    # First run of digits in the filename, used as the sort key.
    def extract_number(file_name: str) -> int:
        return int(re.search(r"\d+", file_name).group())

    # Only mp3s that carry a sortable number, and never our own output file.
    files = [
        f
        for f in os.listdir(directory)
        if f.endswith(".mp3") and f != output_name and re.search(r"\d+", f)
    ]
    files.sort(key=extract_number)
    # Append each clip to a growing segment in index order.
    combined = AudioSegment.empty()
    for file in files:
        combined += AudioSegment.from_mp3(os.path.join(directory, file))
    output_path = os.path.join(directory, output_name)
    combined.export(output_path, format="mp3")
    print(f"Merged audio saved to {output_path}")
# generated via [gpt.py2json](https://tinyurl.com/23dl535z)
class PodCastItem(BaseModel):
    """One line of podcast dialogue: who speaks and what they say."""

    # Role label; podcast() accepts "Host" or "Guest" and maps them to voices.
    Speaker: str
    # The text to synthesize for this turn.
    ContentToSpeak: str
@logger.catch()
def app_wrap_loguru():
    """Run the Typer app with loguru catching and logging uncaught exceptions."""
    app()
if __name__ == "__main__":
    # Breadcrumb so direct invocation is visible in icecream debug output.
    ic("main")
    app_wrap_loguru()