Rename text_to_speech->tts_to_file, update docs.

mideind · Sep 4, 2023 · be8091a · be8091a
1 parent e13a7ff
commit be8091a
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -6,50 +6,73 @@
 
 # Icespeak
 
-_Icespeak_ is a Python library that makes Icelandic-language speech synthesis easy.
+_Icespeak_ is a Python 3.9+ library that makes Icelandic-language speech synthesis easy.
 
-## Installation
+## Local installation
 
-> _Note: The Azure package currently only supports the very out-dated OpenSSL version 1.\*._
+> _Note: The Azure TTS package currently only supports the very outdated OpenSSL version 1.\*_
 
-Clone the repository and cd into the folder. Then create and activate
-a Python virtual environment, and install all required dependencies:
+Clone the repository and cd into the folder.
+Then create and activate a virtual environment:
 
 ```sh
 python3 -m venv venv
 source venv/bin/activate
+```
+
+Install the minimal set of dependencies needed to use the library:
+
+```sh
 python3 -m pip install .
-# Alternatively, to install in editable mode with extra dev dependencies:
+```
+
+Alternatively, to install in editable mode with extra dev dependencies:
+
+```sh
 python3 -m pip install -e '.[dev]'
 ```
 
 ## Usage
 
+Before using, place API keys for the relevant services in the `/keys` folder
+(or a folder specified by the `ICESPEAK_KEYS_DIR` environment variable).
+
+Output audio files are saved to the directory specified
+by the `ICESPEAK_AUDIO_DIR` environment variable.
+By default Icespeak creates the directory `<TEMP DIR>/icespeak`
+where `<TEMP DIR>` is the temporary directory on your platform,
+fetched via `tempfile.gettempdir()`.
+
+By default, generated audio files are removed upon a clean exit,
+but this can be disabled by setting `ICESPEAK_AUDIO_CACHE_CLEAN=0`.
+
 ### Text-to-speech
 
 Simple example of TTS, which includes phonetic transcription:
 
-```python
-from icespeak import text_to_speech, TTSOptions
-audio_file = text_to_speech(
-    "Hér kemur texti fyrir talgervingu. Ýmislegir textabútar eru hljóðritaðir eins og t.d. [email protected], 48,3% o.fl.",
+```py
+from icespeak import tts_to_file, TTSOptions
+text = """\
+Þetta er texti fyrir talgervingu. \
+Í honum er ýmislegt sem mætti vera hljóðritað, \
+t.d. ræður talgerving oft illa við íslenskar skammstafanir, \
+tölvupósta eins og nafn@example.com,
+eða prósentur eins og 48,3%, o.fl.\
+"""
+tts_out = tts_to_file(
+    text,
     TTSOptions(
-        text_format="text", # Set to 'ssml' if ssml tags in text should be interpreted
+        text_format="text", # Set to 'ssml' if SSML tags in text should be interpreted
         audio_format="mp3", # Output audio will be in mp3 format
         voice="Gudrun" # Azure TTS voice
     ),
-    trancribe=True # Default is True
+    transcribe=True # Default is True
 )
-print(audio_file) # pathlib.Path instance pointing to file on local file system
+print(tts_out.file) # pathlib.Path instance pointing to file on local file system
+print(tts_out.text) # text that was sent to the TTS service (after the phonetic transcription)
 ```
 
-### Transcription
-
-_Documentation still in progress._
-
-### Text composition via GSSML
-
-_Documentation still in progress._
+Results are cached, so subsequent calls with the same arguments should be fast.
 
 ## License
 

diff --git a/src/icespeak/__init__.py b/src/icespeak/__init__.py
@@ -22,7 +22,7 @@
 from .parser import GreynirSSMLParser, gssml
 from .settings import SETTINGS, __logger__
 from .transcribe import DefaultTranscriber, TranscriptionOptions
-from .tts import VOICES, text_to_speech
+from .tts import VOICES, tts_to_file
 from .voices import TTSOptions
 
 __all__ = (
@@ -33,6 +33,6 @@
     "TranscriptionOptions",
     "VOICES",
     "gssml",
-    "text_to_speech",
+    "tts_to_file",
     "__logger__",
 )
diff --git a/src/icespeak/cli.py b/src/icespeak/cli.py
@@ -38,7 +38,7 @@
 import requests
 
 from .settings import LOG, SETTINGS, suffix_for_audiofmt
-from .tts import VOICES, TTSOptions, text_to_speech
+from .tts import VOICES, TTSOptions, tts_to_file
 
 
 def _die(msg: str, exit_code: int = 1) -> None:
@@ -222,15 +222,16 @@ def main() -> None:
         _die("WAV output flag only supported for PCM format.")
 
     # Synthesize the text according to CLI options
-    url = text_to_speech(
+    tts_out = tts_to_file(
         text,
         TTSOptions(
             text_format=args.textformat,
             audio_format=args.audioformat,
             voice=args.voice,
             speed=args.speed,
         ),
-    ).as_uri()
+    )
+    url = tts_out.file.as_uri()
     if not url:
         _die("Error synthesizing speech.")
 
@@ -239,6 +240,7 @@ def main() -> None:
         print(url)
         sys.exit(0)
 
+    # TODO: This download step is no longer needed; tts_to_file already writes a local file
     # Download
     urldesc = f"data URI ({len(url)} bytes)" if _is_data_uri(url) else url
     print(f"Fetching {urldesc}")

diff --git a/src/icespeak/settings.py b/src/icespeak/settings.py
@@ -20,7 +20,6 @@
     Shared settings for the Icespeak package.
 
 """
-# pyright: reportConstantRedefinition=false
 # We dont import annotations from __future__ here
 # due to pydantic
 from typing import Any, Literal, Optional, Union
@@ -131,7 +130,7 @@ def get_audio_dir(self) -> Path:
         if self.AUDIO_DIR is None:
             dir = Path(tempfile.gettempdir()) / "icespeak"
             dir.mkdir(exist_ok=True)
-            self.AUDIO_DIR = dir
+            self.AUDIO_DIR = dir  # pyright: ignore[reportConstantRedefinition]
         return self.AUDIO_DIR
 
     def get_empty_file(self, audio_format: str) -> Path:
@@ -178,19 +177,25 @@ class Keys(BaseModel):
         API_KEYS.aws = AWSPollyKey.model_validate_json(
             (_kd / SETTINGS.AWSPOLLY_KEY_FILENAME).read_text().strip()
         )
-    except Exception:
-        LOG.exception(
-            "Could not load AWS Polly API key, ASR with AWS Polly will not work."
+    except Exception as err:
+        LOG.warning(
+            "Could not load AWS Polly API key, ASR with AWS Polly will not work. Error: %s",
+            err,
         )
     try:
         API_KEYS.azure = AzureKey.model_validate_json(
             (_kd / SETTINGS.AZURE_KEY_FILENAME).read_text().strip()
         )
-    except Exception:
-        LOG.exception("Could not load Azure API key, ASR with Azure will not work.")
+    except Exception as err:
+        LOG.warning(
+            "Could not load Azure API key, ASR with Azure will not work. Error: %s", err
+        )
     try:
         API_KEYS.google = json.loads(
             (_kd / SETTINGS.GOOGLE_KEY_FILENAME).read_text().strip()
         )
-    except Exception:
-        LOG.exception("Could not load Google API key, ASR with Google will not work.")
+    except Exception as err:
+        LOG.warning(
+            "Could not load Google API key, ASR with Google will not work. Error: %s",
+            err,
+        )
diff --git a/src/icespeak/tts.py b/src/icespeak/tts.py
@@ -24,24 +24,31 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import Any, TypeVar
+from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar
 from typing_extensions import override
 
 import atexit
 import queue
 import threading
-from pathlib import Path
 
 from cachetools import LFUCache, cached
 
 from .settings import LOG, SETTINGS
 from .transcribe import TranscriptionOptions
 from .voices import BaseVoice, TTSOptions, aws_polly, azure, google, tiro
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 VoicesT = Mapping[str, Mapping[str, str]]
 ServicesT = Mapping[str, BaseVoice]
 
 
+class TTSOutput(NamedTuple):
+    file: Path
+    text: str
+
+
 def _setup_voices() -> tuple[VoicesT, ServicesT]:
     services = (
         aws_polly.AWSPollyVoice(),
@@ -71,7 +78,7 @@ def _setup_voices() -> tuple[VoicesT, ServicesT]:
 _T = TypeVar("_T")
 
 
-class TmpFileLFUCache(LFUCache[_T, Path]):
+class TmpFileLFUCache(LFUCache[_T, TTSOutput]):
     """
     Custom version of a least-frequently-used cache which,
     if the clean cache setting is True,
@@ -81,13 +88,13 @@ class TmpFileLFUCache(LFUCache[_T, Path]):
     """
 
     @override
-    def popitem(self) -> tuple[_T, Path]:
+    def popitem(self) -> tuple[_T, TTSOutput]:
         """Schedule audio file for deletion upon evicting from cache."""
         key, audiofile = super().popitem()
         LOG.debug("Expired audio file: %s", audiofile)
         # Schedule for deletion, if cleaning the cache
         if SETTINGS.AUDIO_CACHE_CLEAN:
-            _EXPIRED_QUEUE.put(audiofile)
+            _EXPIRED_QUEUE.put(audiofile.file)
         return key, audiofile
 
 
@@ -113,7 +120,8 @@ def _evict_all():
         try:
             while _AUDIO_CACHE.currsize > 0:
                 # Remove all files currently in cache
-                _AUDIO_CACHE.popitem()[1].unlink(missing_ok=True)
+                _, v = _AUDIO_CACHE.popitem()
+                v.file.unlink(missing_ok=True)
         except Exception:
             LOG.exception("Error when cleaning cache.")
         # Give the thread a little bit of time to join,
@@ -126,23 +134,24 @@ def _evict_all():
 
 
 @cached(_AUDIO_CACHE)
-def text_to_speech(
+def tts_to_file(
     text: str,
     tts_options: TTSOptions | None = None,
     transcription_options: TranscriptionOptions | None = None,
     *,
     transcribe: bool = True,
-) -> Path:
+) -> TTSOutput:
     """
     # Text-to-speech
 
-    Synthesize speech for the given text.
+    Synthesize speech for the given text and write to local file.
 
     Audio/voice settings can be supplied in `tts_options`,
     transcription turned on/off via the `transcribe` flag
     and its options supplied in `transcription_options`
 
-    Returns an instance of `pathlib.Path` pointing to the output audio file.
+    Returns a named tuple containing a path to the output audio file,
+    along with the text that was sent to the TTS service.
     """
     tts_options = tts_options or TTSOptions()
     try:
@@ -159,4 +168,7 @@ def text_to_speech(
         transcription_options = transcription_options or TranscriptionOptions()
         text = service.Transcriber.token_transcribe(text, transcription_options)
 
-    return service.text_to_speech(text, tts_options)
+    return TTSOutput(
+        file=service.text_to_speech(text, tts_options),
+        text=text,
+    )
diff --git a/tests/cache_test.py b/tests/cache_test.py
@@ -58,22 +58,22 @@ def _ensure_num_files():
     nprint("Current number of files in audio directory:", START_NUM_FILES)
 
     start = time.monotonic_ns()
-    p1 = icespeak.text_to_speech(t1)
+    file1 = icespeak.tts_to_file(t1).file
     duration = time.monotonic_ns() - start
 
-    nprint("Audio file:", p1)
+    nprint("Audio file:", file1)
     nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))
     nprint(f"Took {duration / 1e6:.3f} milliseconds.")
 
     nprint("This should be cached...")
 
     start = time.monotonic_ns()
-    p2 = icespeak.text_to_speech(t1)
+    file2 = icespeak.tts_to_file(t1).file
     duration = time.monotonic_ns() - start
 
-    nprint("Audio file:", p2)
+    nprint("Audio file:", file2)
     nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))
-    assert p1 == p2, "This wasn't cached correctly!"
+    assert file1 == file2, "This wasn't cached correctly!"
     nprint(
         f"Took {duration / 1e6:.3f} milliseconds. (Should be a lot faster than above.)"
     )
@@ -84,31 +84,31 @@ def _ensure_num_files():
         for n in range(1, CACHE_SIZE + 1):
             print(".", end="")
             # Fill cache with uncacheable stuff, if CACHE_SIZE > 1
-            icespeak.text_to_speech(f"Texti númer {n+1}.")
+            _ = icespeak.tts_to_file(f"Texti númer {n+1}.")
         print()
         nprint("Cache filled.")
 
     nprint("Now we should see an eviction!")
 
-    p3 = icespeak.text_to_speech("Þetta er allt annar texti!")
-    nprint("Audio file:", p3)
+    last_file = icespeak.tts_to_file("Þetta er allt annar texti!").file
+    nprint("Audio file:", last_file)
     nprint("Files in audio dir:", len(list(AUDIO_DIR.iterdir())))
 
     nprint("Sleeping a bit, allow cleanup thread to remove files...")
     time.sleep(0.5)
 
     if CACHE_SIZE == 1:
         # Here we only cache one file, so even the most frequently used one gets evicted
-        assert not p1.is_file(), f"Audio file {p1} wasn't evicted!"
-        assert not p2.is_file(), f"Audio file {p2} wasn't evicted!"
+        assert not file1.is_file(), f"Audio file {file1} wasn't evicted!"
+        assert not file2.is_file(), f"Audio file {file2} wasn't evicted!"
     else:
         assert (
-            p1.is_file()
-        ), f"Audio file {p1} shouldn't be evicted, it is most frequent!"
+            file1.is_file()
+        ), f"Audio file {file1} shouldn't be evicted, it is most frequent!"
         assert (
-            p2.is_file()
-        ), f"Audio file {p2} shouldn't be evicted, it is most frequent!"
+            file2.is_file()
+        ), f"Audio file {file2} shouldn't be evicted, it is most frequent!"
 
-    assert p3.is_file(), f"Audio file {p3} should exist!"
+    assert last_file.is_file(), f"Audio file {last_file} should exist!"
 
     nprint("Caching seems to work!")