Skip to content

Commit

Permalink
Rename text_to_speech->tts_to_file, update docs.
Browse files Browse the repository at this point in the history
  • Loading branch information
sultur committed Sep 4, 2023
1 parent e13a7ff commit be8091a
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 77 deletions.
63 changes: 43 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,50 +6,73 @@

# Icespeak

_Icespeak_ is a Python library that makes Icelandic-language speech synthesis easy.
_Icespeak_ is a Python 3.9+ library that makes Icelandic-language speech synthesis easy.

## Installation
## Local installation

> _Note: The Azure package currently only supports the very out-dated OpenSSL version 1.\*._
> _Note: The Azure TTS package currently only supports the very outdated OpenSSL version 1.\*_
Clone the repository and cd into the folder. Then create and activate
a Python virtual environment, and install all required dependencies:
Clone the repository and cd into the folder.
Then create and activate a virtual environment:

```sh
python3 -m venv venv
source venv/bin/activate
```

Install the minimal set of dependencies needed to use the library:

```sh
python3 -m pip install .
# Alternatively, to install in editable mode with extra dev dependencies:
```

Alternatively, to install in editable mode with extra dev dependencies:

```sh
python3 -m pip install -e '.[dev]'
```

## Usage

Before using, place API keys for the relevant services in the `/keys` folder
(or a folder specified by the `ICESPEAK_KEYS_DIR` environment variable).

Output audio files are saved to the directory specified
by the `ICESPEAK_AUDIO_DIR` environment variable.
By default, Icespeak creates the directory `<TEMP DIR>/icespeak`,
where `<TEMP DIR>` is the temporary directory on your platform,
fetched via `tempfile.gettempdir()`.

By default, generated audio files are removed upon a clean exit,
but this can be disabled by setting `ICESPEAK_AUDIO_CACHE_CLEAN=0`.

### Text-to-speech

Simple example of TTS, which includes phonetic transcription:

```python
from icespeak import text_to_speech, TTSOptions
audio_file = text_to_speech(
"Hér kemur texti fyrir talgervingu. Ýmislegir textabútar eru hljóðritaðir eins og t.d. jon@example.com, 48,3% o.fl.",
```py
from icespeak import tts_to_file, TTSOptions
text = """\
Þetta er texti fyrir talgervingu. \
Í honum er ýmislegt sem mætti vera hljóðritað, \
t.d. ræður talgerving oft illa við íslenskar skammstafanir, \
tölvupósta eins og jon@example.com,
eða prósentur eins og 48,3%, o.fl.\
"""
tts_out = tts_to_file(
text,
TTSOptions(
text_format="text", # Set to 'ssml' if ssml tags in text should be interpreted
text_format="text", # Set to 'ssml' if SSML tags in text should be interpreted
audio_format="mp3", # Output audio will be in mp3 format
voice="Gudrun" # Azure TTS voice
),
trancribe=True # Default is True
transcribe=True # Default is True
)
print(audio_file) # pathlib.Path instance pointing to file on local file system
print(tts_out.file) # pathlib.Path instance pointing to file on local file system
print(tts_out.text) # text that was sent to the TTS service (after the phonetic transcription)
```

### Transcription

_Documentation still in progress._

### Text composition via GSSML

_Documentation still in progress._
Results are cached, so subsequent calls with the same arguments should be fast.

## License

Expand Down
4 changes: 2 additions & 2 deletions src/icespeak/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from .parser import GreynirSSMLParser, gssml
from .settings import SETTINGS, __logger__
from .transcribe import DefaultTranscriber, TranscriptionOptions
from .tts import VOICES, text_to_speech
from .tts import VOICES, tts_to_file
from .voices import TTSOptions

__all__ = (
Expand All @@ -33,6 +33,6 @@
"TranscriptionOptions",
"VOICES",
"gssml",
"text_to_speech",
"tts_to_file",
"__logger__",
)
8 changes: 5 additions & 3 deletions src/icespeak/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import requests

from .settings import LOG, SETTINGS, suffix_for_audiofmt
from .tts import VOICES, TTSOptions, text_to_speech
from .tts import VOICES, TTSOptions, tts_to_file


def _die(msg: str, exit_code: int = 1) -> None:
Expand Down Expand Up @@ -222,15 +222,16 @@ def main() -> None:
_die("WAV output flag only supported for PCM format.")

# Synthesize the text according to CLI options
url = text_to_speech(
tts_out = tts_to_file(
text,
TTSOptions(
text_format=args.textformat,
audio_format=args.audioformat,
voice=args.voice,
speed=args.speed,
),
).as_uri()
)
url = tts_out.file.as_uri()
if not url:
_die("Error synthesizing speech.")

Expand All @@ -239,6 +240,7 @@ def main() -> None:
print(url)
sys.exit(0)

# TODO: This isn't needed anymore
# Download
urldesc = f"data URI ({len(url)} bytes)" if _is_data_uri(url) else url
print(f"Fetching {urldesc}")
Expand Down
23 changes: 14 additions & 9 deletions src/icespeak/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
Shared settings for the Icespeak package.
"""
# pyright: reportConstantRedefinition=false
# We don't import annotations from __future__ here
# due to pydantic
from typing import Any, Literal, Optional, Union
Expand Down Expand Up @@ -131,7 +130,7 @@ def get_audio_dir(self) -> Path:
if self.AUDIO_DIR is None:
dir = Path(tempfile.gettempdir()) / "icespeak"
dir.mkdir(exist_ok=True)
self.AUDIO_DIR = dir
self.AUDIO_DIR = dir # pyright: ignore[reportConstantRedefinition]
return self.AUDIO_DIR

def get_empty_file(self, audio_format: str) -> Path:
Expand Down Expand Up @@ -178,19 +177,25 @@ class Keys(BaseModel):
API_KEYS.aws = AWSPollyKey.model_validate_json(
(_kd / SETTINGS.AWSPOLLY_KEY_FILENAME).read_text().strip()
)
except Exception:
LOG.exception(
"Could not load AWS Polly API key, ASR with AWS Polly will not work."
except Exception as err:
LOG.warning(
"Could not load AWS Polly API key, ASR with AWS Polly will not work. Error: %s",
err,
)
try:
API_KEYS.azure = AzureKey.model_validate_json(
(_kd / SETTINGS.AZURE_KEY_FILENAME).read_text().strip()
)
except Exception:
LOG.exception("Could not load Azure API key, ASR with Azure will not work.")
except Exception as err:
LOG.warning(
"Could not load Azure API key, ASR with Azure will not work. Error: %s", err
)
try:
API_KEYS.google = json.loads(
(_kd / SETTINGS.GOOGLE_KEY_FILENAME).read_text().strip()
)
except Exception:
LOG.exception("Could not load Google API key, ASR with Google will not work.")
except Exception as err:
LOG.warning(
"Could not load Google API key, ASR with Google will not work. Error: %s",
err,
)
34 changes: 23 additions & 11 deletions src/icespeak/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,31 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import Any, TypeVar
from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar
from typing_extensions import override

import atexit
import queue
import threading
from pathlib import Path

from cachetools import LFUCache, cached

from .settings import LOG, SETTINGS
from .transcribe import TranscriptionOptions
from .voices import BaseVoice, TTSOptions, aws_polly, azure, google, tiro

if TYPE_CHECKING:
from pathlib import Path

VoicesT = Mapping[str, Mapping[str, str]]
ServicesT = Mapping[str, BaseVoice]


# Result of `tts_to_file`: the output audio file together with the text
# that was actually sent to the TTS service.
class TTSOutput(NamedTuple):
# Path to the output audio file on the local file system.
file: Path
# Text that was sent to the TTS service (after phonetic transcription,
# when transcription is enabled).
text: str


def _setup_voices() -> tuple[VoicesT, ServicesT]:
services = (
aws_polly.AWSPollyVoice(),
Expand Down Expand Up @@ -71,7 +78,7 @@ def _setup_voices() -> tuple[VoicesT, ServicesT]:
_T = TypeVar("_T")


class TmpFileLFUCache(LFUCache[_T, Path]):
class TmpFileLFUCache(LFUCache[_T, TTSOutput]):
"""
Custom version of a least-frequently-used cache which,
if the clean cache setting is True,
Expand All @@ -81,13 +88,13 @@ class TmpFileLFUCache(LFUCache[_T, Path]):
"""

@override
def popitem(self) -> tuple[_T, Path]:
def popitem(self) -> tuple[_T, TTSOutput]:
"""Schedule audio file for deletion upon evicting from cache."""
key, audiofile = super().popitem()
LOG.debug("Expired audio file: %s", audiofile)
# Schedule for deletion, if cleaning the cache
if SETTINGS.AUDIO_CACHE_CLEAN:
_EXPIRED_QUEUE.put(audiofile)
_EXPIRED_QUEUE.put(audiofile.file)
return key, audiofile


Expand All @@ -113,7 +120,8 @@ def _evict_all():
try:
while _AUDIO_CACHE.currsize > 0:
# Remove all files currently in cache
_AUDIO_CACHE.popitem()[1].unlink(missing_ok=True)
_, v = _AUDIO_CACHE.popitem()
v.file.unlink(missing_ok=True)
except Exception:
LOG.exception("Error when cleaning cache.")
# Give the thread a little bit of time to join,
Expand All @@ -126,23 +134,24 @@ def _evict_all():


@cached(_AUDIO_CACHE)
def text_to_speech(
def tts_to_file(
text: str,
tts_options: TTSOptions | None = None,
transcription_options: TranscriptionOptions | None = None,
*,
transcribe: bool = True,
) -> Path:
) -> TTSOutput:
"""
# Text-to-speech
Synthesize speech for the given text.
Synthesize speech for the given text and write to local file.
Audio/voice settings can be supplied in `tts_options`,
transcription turned on/off via the `transcribe` flag
and its options supplied in `transcription_options`
Returns an instance of `pathlib.Path` pointing to the output audio file.
Returns a named tuple containing a path to the output audio file,
along with the text that was sent to the TTS service.
"""
tts_options = tts_options or TTSOptions()
try:
Expand All @@ -159,4 +168,7 @@ def text_to_speech(
transcription_options = transcription_options or TranscriptionOptions()
text = service.Transcriber.token_transcribe(text, transcription_options)

return service.text_to_speech(text, tts_options)
return TTSOutput(
file=service.text_to_speech(text, tts_options),
text=text,
)
30 changes: 15 additions & 15 deletions tests/cache_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,22 +58,22 @@ def _ensure_num_files():
nprint("Current number of files in audio directory:", START_NUM_FILES)

start = time.monotonic_ns()
p1 = icespeak.text_to_speech(t1)
file1 = icespeak.tts_to_file(t1).file
duration = time.monotonic_ns() - start

nprint("Audio file:", p1)
nprint("Audio file:", file1)
nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))
nprint(f"Took {duration / 1e6:.3f} milliseconds.")

nprint("This should be cached...")

start = time.monotonic_ns()
p2 = icespeak.text_to_speech(t1)
file2 = icespeak.tts_to_file(t1).file
duration = time.monotonic_ns() - start

nprint("Audio file:", p2)
nprint("Audio file:", file2)
nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))
assert p1 == p2, "This wasn't cached correctly!"
assert file1 == file2, "This wasn't cached correctly!"
nprint(
f"Took {duration / 1e6:.3f} milliseconds. (Should be a lot faster than above.)"
)
Expand All @@ -84,31 +84,31 @@ def _ensure_num_files():
for n in range(1, CACHE_SIZE + 1):
print(".", end="")
# Fill cache with uncacheable stuff, if CACHE_SIZE > 1
icespeak.text_to_speech(f"Texti númer {n+1}.")
_ = icespeak.tts_to_file(f"Texti númer {n+1}.")
print()
nprint("Cache filled.")

nprint("Now we should see an eviction!")

p3 = icespeak.text_to_speech("Þetta er allt annar texti!")
nprint("Audio file:", p3)
last_file = icespeak.tts_to_file("Þetta er allt annar texti!").file
nprint("Audio file:", last_file)
nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))

nprint("Sleeping a bit, allow cleanup thread to remove files...")
time.sleep(0.5)

if CACHE_SIZE == 1:
# Here we only cache one file, so even the most frequently used one gets evicted
assert not p1.is_file(), f"Audio file {p1} wasn't evicted!"
assert not p2.is_file(), f"Audio file {p2} wasn't evicted!"
assert not file1.is_file(), f"Audio file {file1} wasn't evicted!"
assert not file2.is_file(), f"Audio file {file2} wasn't evicted!"
else:
assert (
p1.is_file()
), f"Audio file {p1} shouldn't be evicted, it is most frequent!"
file1.is_file()
), f"Audio file {file1} shouldn't be evicted, it is most frequent!"
assert (
p2.is_file()
), f"Audio file {p2} shouldn't be evicted, it is most frequent!"
file2.is_file()
), f"Audio file {file2} shouldn't be evicted, it is most frequent!"

assert p3.is_file(), f"Audio file {p3} should exist!"
assert last_file.is_file(), f"Audio file {last_file} should exist!"

nprint("Caching seems to work!")
Loading

0 comments on commit be8091a

Please sign in to comment.