From f7380ed7ab79641fe342af73e410a7574e35c3c8 Mon Sep 17 00:00:00 2001 From: qinzy Date: Tue, 27 Feb 2024 18:04:21 -0700 Subject: [PATCH] add docker --- Dockerfile | 2 +- README.md | 215 +----------------------------------------- docs/install.md | 234 ++++++++++++++++++++++++++++++++++++++++++++++ docs/quick_use.md | 48 ++++++++++ melo/app.py | 2 +- 5 files changed, 289 insertions(+), 212 deletions(-) create mode 100644 docs/install.md create mode 100644 docs/quick_use.md diff --git a/Dockerfile b/Dockerfile index 92c520df..afe4551f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,4 +10,4 @@ RUN pip install -e . RUN python -m unidic download RUN python melo/init_downloads.py -CMD ["python", "./melo/app.py", "--port", "8800"] \ No newline at end of file +CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"] \ No newline at end of file diff --git a/README.md b/README.md index 1d6e7a3b..810a7d7e 100644 --- a/README.md +++ b/README.md @@ -23,216 +23,9 @@ Some other features include: - The Chinese speaker supports `mixed Chinese and English`. - Fast enough for `CPU real-time inference`. -## Install on Linux or macOS - -**Installation:** - -```bash -pip install git+https://github.com/myshell-ai/MeloTTS.git -python -m unidic download -``` - -**Manual installation:** - -```bash -git clone https://github.com/myshell-ai/MeloTTS.git -cd MeloTTS -pip install -e . -python -m unidic download -``` - -We welcome the open-source community to make this repo `Windows` compatible. If you find this repo useful, please consider contributing to the repo. - ## Usage - -An unofficial [live demo](https://huggingface.co/spaces/mrfakename/MeloTTS) is hosted on Hugging Face Spaces. - -### WebUI - -The WebUI supports muliple languages and voices. First, follow the installation steps. Then, simply run: - -```bash -melo-ui -# Or: python melo/app.py -``` - -### CLI - -You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. 
Here are some examples: - -**Read English text:** - -```bash -melo "Text to read" output.wav -``` - -**Specify a language:** - -```bash -melo "Text to read" output.wav --language EN -``` - -**Specify a speaker:** - -```bash -melo "Text to read" output.wav --language EN --speaker EN-US -melo "Text to read" output.wav --language EN --speaker EN-AU -``` - -The available speakers are: `EN-Default`, `EN-US`, `EN-BR`, `EN-INDIA` `EN-AU`. - -**Specify a speed:** - -```bash -melo "Text to read" output.wav --language EN --speaker EN-US --speed 1.5 -melo "Text to read" output.wav --speed 1.5 -``` - -**Use a different language:** - -```bash -melo "text-to-speech 领域近年来发展迅速" zh.wav -l ZH -``` - -**Load from a file:** - -```bash -melo file.txt out.wav --file -``` - -The full API documentation may be found using: - -```bash -melo --help -``` - -### Python API - -#### English with Multiple Accents - -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 - -# CPU is sufficient for real-time inference. -# You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps' -device = 'auto' # Will automatically use GPU if available - -# English -text = "Did you ever hear a folk tale about a giant turtle?" 
-model = TTS(language='EN', device=device) -speaker_ids = model.hps.data.spk2id - -# American accent -output_path = 'en-us.wav' -model.tts_to_file(text, speaker_ids['EN-US'], output_path, speed=speed) - -# British accent -output_path = 'en-br.wav' -model.tts_to_file(text, speaker_ids['EN-BR'], output_path, speed=speed) - -# Indian accent -output_path = 'en-india.wav' -model.tts_to_file(text, speaker_ids['EN_INDIA'], output_path, speed=speed) - -# Australian accent -output_path = 'en-au.wav' -model.tts_to_file(text, speaker_ids['EN-AU'], output_path, speed=speed) - -# Default accent -output_path = 'en-default.wav' -model.tts_to_file(text, speaker_ids['EN-Default'], output_path, speed=speed) - -``` - -### Spanish -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 - -# CPU is sufficient for real-time inference. -# You can also change to cuda:0 -device = 'cpu' - -text = "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante." -model = TTS(language='ES', device=device) -speaker_ids = model.hps.data.spk2id - -output_path = 'es.wav' -model.tts_to_file(text, speaker_ids['ES'], output_path, speed=speed) -``` - -#### French - -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 -device = 'cpu' # or cuda:0 - -text = "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante." 
-model = TTS(language='FR', device=device) -speaker_ids = model.hps.data.spk2id - -output_path = 'fr.wav' -model.tts_to_file(text, speaker_ids['FR'], output_path, speed=speed) -``` - -#### Chinese - -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 -device = 'cpu' # or cuda:0 - -text = "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。" -model = TTS(language='ZH', device=device) -speaker_ids = model.hps.data.spk2id - -output_path = 'zh.wav' -model.tts_to_file(text, speaker_ids['ZH'], output_path, speed=speed) -``` - -#### Japanese - -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 -device = 'cpu' # or cuda:0 - -text = "彼は毎朝ジョギングをして体を健康に保っています。" -model = TTS(language='JP', device=device) -speaker_ids = model.hps.data.spk2id - -output_path = 'jp.wav' -model.tts_to_file(text, speaker_ids['JP'], output_path, speed=speed) -``` - -#### Korean - -```python -from melo.api import TTS - -# Speed is adjustable -speed = 1.0 -device = 'cpu' # or cuda:0 - -text = "안녕하세요! 오늘은 날씨가 정말 좋네요." -model = TTS(language='KR', device=device) -speaker_ids = model.hps.data.spk2id - -output_path = 'kr.wav' -model.tts_to_file(text, speaker_ids['KR'], output_path, speed=speed) -``` +- [Use without Installation](docs/quick_use.md) +- [Install and Use Locally](docs/install.md) ## License @@ -240,4 +33,6 @@ This library is under MIT License, which means it is free for both commercial an ## Acknowledgements -This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), [VITS2](https://github.com/daniilrobnikov/vits2) and [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2). We appreciate their awesome work! +This implementation is based on [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), [VITS2](https://github.com/daniilrobnikov/vits2) and [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2). 
We appreciate their awesome work. + +Many thanks to [@fakerybakery](https://github.com/fakerybakery) for adding the Web UI and CLI part. diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 00000000..6ddcf5b3 --- /dev/null +++ b/docs/install.md @@ -0,0 +1,234 @@ +## Install and Use Locally + +### Table of Contents +- [Linux Install](#linux-install) +- [Windows and macOS](#windows-and-macos-install) +- [Usage](#usage) + - [Web UI](#webui) + - [CLI](#cli) + - [Python API](#python-api) + + + +### Linux Install +Use pip to install from the GitHub repo. +```bash +pip install git+https://github.com/myshell-ai/MeloTTS.git +python -m unidic download +``` + +Alternatively, you may run manual installation: + +```bash +git clone https://github.com/myshell-ai/MeloTTS.git +cd MeloTTS +pip install -e . +python -m unidic download +``` + +### Windows and macOS Install +To avoid compatibility issues, for Windows and macOS users, we suggest running MeloTTS via Docker. Ensure that [you have Docker installed](https://docs.docker.com/engine/install/). + +**Build Docker** + +This could take a few minutes. +```bash +git clone https://github.com/myshell-ai/MeloTTS.git +cd MeloTTS +docker build -t melotts . +``` + +**Run Docker** +```bash +docker run -it -p 8888:8888 melotts +``` +Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app. + +## Usage + +### WebUI + +The WebUI supports multiple languages and voices. First, follow the installation steps. Then, simply run: + +```bash +melo-ui +# Or: python melo/app.py +``` + +### CLI + +You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. 
Here are some examples: + +**Read English text:** + +```bash +melo "Text to read" output.wav +``` + +**Specify a language:** + +```bash +melo "Text to read" output.wav --language EN +``` + +**Specify a speaker:** + +```bash +melo "Text to read" output.wav --language EN --speaker EN-US +melo "Text to read" output.wav --language EN --speaker EN-AU +``` + +The available speakers are: `EN-Default`, `EN-US`, `EN-BR`, `EN_INDIA`, `EN-AU`. + +**Specify a speed:** + +```bash +melo "Text to read" output.wav --language EN --speaker EN-US --speed 1.5 +melo "Text to read" output.wav --speed 1.5 +``` + +**Use a different language:** + +```bash +melo "text-to-speech 领域近年来发展迅速" zh.wav -l ZH +``` + +**Load from a file:** + +```bash +melo file.txt out.wav --file +``` + +The full API documentation may be found using: + +```bash +melo --help +``` + +### Python API + +#### English with Multiple Accents + +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 + +# CPU is sufficient for real-time inference. +# You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps' +device = 'auto' # Will automatically use GPU if available + +# English +text = "Did you ever hear a folk tale about a giant turtle?" 
+model = TTS(language='EN', device=device) +speaker_ids = model.hps.data.spk2id + +# American accent +output_path = 'en-us.wav' +model.tts_to_file(text, speaker_ids['EN-US'], output_path, speed=speed) + +# British accent +output_path = 'en-br.wav' +model.tts_to_file(text, speaker_ids['EN-BR'], output_path, speed=speed) + +# Indian accent +output_path = 'en-india.wav' +model.tts_to_file(text, speaker_ids['EN_INDIA'], output_path, speed=speed) + +# Australian accent +output_path = 'en-au.wav' +model.tts_to_file(text, speaker_ids['EN-AU'], output_path, speed=speed) + +# Default accent +output_path = 'en-default.wav' +model.tts_to_file(text, speaker_ids['EN-Default'], output_path, speed=speed) + +``` + +#### Spanish +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 + +# CPU is sufficient for real-time inference. +# You can also change to cuda:0 +device = 'cpu' + +text = "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante." +model = TTS(language='ES', device=device) +speaker_ids = model.hps.data.spk2id + +output_path = 'es.wav' +model.tts_to_file(text, speaker_ids['ES'], output_path, speed=speed) +``` + +#### French + +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 +device = 'cpu' # or cuda:0 + +text = "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante." 
+model = TTS(language='FR', device=device) +speaker_ids = model.hps.data.spk2id + +output_path = 'fr.wav' +model.tts_to_file(text, speaker_ids['FR'], output_path, speed=speed) +``` + +#### Chinese + +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 +device = 'cpu' # or cuda:0 + +text = "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。" +model = TTS(language='ZH', device=device) +speaker_ids = model.hps.data.spk2id + +output_path = 'zh.wav' +model.tts_to_file(text, speaker_ids['ZH'], output_path, speed=speed) +``` + +#### Japanese + +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 +device = 'cpu' # or cuda:0 + +text = "彼は毎朝ジョギングをして体を健康に保っています。" +model = TTS(language='JP', device=device) +speaker_ids = model.hps.data.spk2id + +output_path = 'jp.wav' +model.tts_to_file(text, speaker_ids['JP'], output_path, speed=speed) +``` + +#### Korean + +```python +from melo.api import TTS + +# Speed is adjustable +speed = 1.0 +device = 'cpu' # or cuda:0 + +text = "안녕하세요! 오늘은 날씨가 정말 좋네요." +model = TTS(language='KR', device=device) +speaker_ids = model.hps.data.spk2id + +output_path = 'kr.wav' +model.tts_to_file(text, speaker_ids['KR'], output_path, speed=speed) +``` \ No newline at end of file diff --git a/docs/quick_use.md b/docs/quick_use.md new file mode 100644 index 00000000..11d3e077 --- /dev/null +++ b/docs/quick_use.md @@ -0,0 +1,48 @@ +## Use MeloTTS without Installation + +**Quick Demo** + +An unofficial [live demo](https://huggingface.co/spaces/mrfakename/MeloTTS) is hosted on Hugging Face Spaces. + +**Use on MyShell** + +There are hundreds of TTS models on MyShell, many more than MeloTTS offers. 
For example: + +English +- [gentle British male voice](https://app.myshell.ai/widget/nIfamm) +- [cheerful young female voice](https://app.myshell.ai/widget/AjIjqy) +- [sultry and robust male voice](https://app.myshell.ai/widget/zQJJN3) + +Spanish +- [voz femenina adorable](https://app.myshell.ai/widget/buIZBf) +- [voz masculina joven](https://app.myshell.ai/widget/rayuiy) +- [voz de niña inmadura](https://app.myshell.ai/widget/mYFV3e) + +French +- [voix adorable de fille](https://app.myshell.ai/widget/3IfEfy) +- [voix douce masculine](https://app.myshell.ai/widget/IRR3M3) +- [voix douce féminine](https://app.myshell.ai/widget/NRbaUj) + +German +- [sanfte Männerstimme](https://app.myshell.ai/widget/JFnAn2) +- [sanfte Frauenstimme](https://app.myshell.ai/widget/MrU7Nb) +- [unreife Mädchenstimme](https://app.myshell.ai/widget/UFbYBj) + +Portuguese +- [voz feminina nítida](https://app.myshell.ai/widget/VzMb6j) +- [voz de menino imaturo](https://app.myshell.ai/widget/nAzeei) +- [voz masculina sóbria](https://app.myshell.ai/widget/JZRNJz) + +Arabic +- [صوت امرأة ناضجة" في اللغة](https://app.myshell.ai/widget/zqMruu) +- [صوت رجل ناضج" في اللغة العربية](https://app.myshell.ai/widget/iqMbQr) + +Russian +- [зрелый женский голос](https://app.myshell.ai/widget/6byMZ3) +- [зрелый мужской голос](https://app.myshell.ai/widget/NB7jmm) + +Chinese +- [甜美女声](https://app.myshell.ai/widget/ymeUjm) +- [青年男声](https://app.myshell.ai/widget/NZnERb) + +More can be found at the widget center of [MyShell.ai](https://app.myshell.ai/robot-workshop). 
\ No newline at end of file diff --git a/melo/app.py b/melo/app.py index 4287dd98..7f160fcf 100644 --- a/melo/app.py +++ b/melo/app.py @@ -41,7 +41,7 @@ def load_speakers(language, text): with gr.Blocks() as demo: gr.Markdown('# MeloTTS WebUI\n\nA WebUI for MeloTTS.') with gr.Group(): - speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-Default', label='Speaker') + speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker') language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN') speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1) text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])