diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml
index bb04c98b5e..45378141b1 100644
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -154,6 +154,12 @@ jobs:
           ls -lh
           echo "---"
 
+          ./run-kokoro-en.sh
+          rm -rf kokoro-en-*
+          rm kokoro-en
+          ls -lh
+          echo "---"
+
           ./run-matcha-zh.sh
           rm -rf matcha-icefall-*
           rm matcha-zh
diff --git a/pascal-api-examples/tts/.gitignore b/pascal-api-examples/tts/.gitignore
index c7d2828253..429005958c 100644
--- a/pascal-api-examples/tts/.gitignore
+++ b/pascal-api-examples/tts/.gitignore
@@ -6,3 +6,5 @@ matcha-zh
 matcha-en
 matcha-zh-playback
 matcha-en-playback
+kokoro-en
+kokoro-en-playback
diff --git a/pascal-api-examples/tts/kokoro-en-playback.pas b/pascal-api-examples/tts/kokoro-en-playback.pas
new file mode 100644
index 0000000000..7796a6fee5
--- /dev/null
+++ b/pascal-api-examples/tts/kokoro-en-playback.pas
@@ -0,0 +1,261 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+program kokoro_en_playback;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro models.
+
+It generates speech from text and saves it to a wave file.
+
+Note that it plays the audio back as it is still generating.
+}
+
+{$mode objfpc}
+
+uses
+  {$ifdef unix}
+  cthreads,
+  {$endif}
+  SysUtils,
+  dos,
+  ctypes,
+  portaudio,
+  sherpa_onnx;
+
+var
+  { Protects Buffer, which GenerateCallback fills and PlayCallback drains. }
+  CriticalSection: TRTLCriticalSection;
+
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+  Resampler: TSherpaOnnxLinearResampler;
+
+  Text: AnsiString;
+  Speed: Single = 1.0;  {Use a larger value to speak faster}
+  SpeakerId: Integer = 7;
+  Buffer: TSherpaOnnxCircularBuffer;
+  FinishedGeneration: Boolean = False;
+  FinishedPlaying: Boolean = False;
+
+  Version: String;
+  EnvStr: String;
+  Status: Integer;
+  NumDevices: Integer;
+  DeviceIndex: Integer;
+  DeviceInfo: PPaDeviceInfo;
+
+  { If you get EDivByZero: Division by zero error, please change the sample rate
+    to the one supported by your output device.
+  }
+  DeviceSampleRate: Integer = 48000;
+  I: Integer;
+  Param: TPaStreamParameters;
+  Stream: PPaStream;
+  Wave: TSherpaOnnxWave;
+
+{ Callback handed to Tts.Generate. It receives N freshly generated samples,
+  resamples them to DeviceSampleRate when a Resampler was created, and queues
+  them in Buffer for PlayCallback. }
+function GenerateCallback(
+      Samples: pcfloat; N: cint32;
+      Arg: Pointer): cint; cdecl;
+begin
+  EnterCriticalSection(CriticalSection);
+  try
+    if Resampler <> nil then
+      Buffer.Push(Resampler.Resample(Samples, N, False))
+    else
+      Buffer.Push(Samples, N);
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+
+  { 1 means to continue generating; 0 means to stop generating. }
+  Result := 1;
+end;
+
+{ PortAudio stream callback. It copies up to frameCount queued samples from
+  Buffer into output and pads the remainder with silence. It reports
+  paComplete once generation has finished and Buffer is empty. }
+function PlayCallback(
+      input: Pointer; output: Pointer;
+      frameCount: culong;
+      timeInfo: PPaStreamCallbackTimeInfo;
+      statusFlags: TPaStreamCallbackFlags;
+      userData: Pointer ): cint; cdecl;
+var
+  Samples: TSherpaOnnxSamplesArray;
+  I: Integer;
+begin
+  EnterCriticalSection(CriticalSection);
+  try
+    if Buffer.Size >= frameCount then
+      begin
+        Samples := Buffer.Get(Buffer.Head, FrameCount);
+        Buffer.Pop(FrameCount);
+      end
+    else if Buffer.Size > 0 then
+      begin
+        Samples := Buffer.Get(Buffer.Head, Buffer.Size);
+        Buffer.Pop(Buffer.Size);
+        { SetLength zero-fills the newly added elements, i.e. silence. }
+        SetLength(Samples, frameCount);
+      end
+    else
+      SetLength(Samples, frameCount);
+
+    for I := 0 to frameCount - 1 do
+      pcfloat(output)[I] := Samples[I];
+
+    if (Buffer.Size > 0) or (not FinishedGeneration) then
+      Result := paContinue
+    else
+      begin
+        Result := paComplete;
+        FinishedPlaying := True;
+      end;
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+end;
+
+{ Creates a TTS engine that uses the Kokoro model files in ./kokoro-en-v0_19. }
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  Config.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+  Config.MaxNumSentences := 1;
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+begin
+  Tts := GetOfflineTts;
+  if Tts.GetSampleRate <> DeviceSampleRate then
+    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
+
+  Version := String(Pa_GetVersionText);
+  WriteLn('Version is ', Version);
+  Status := Pa_Initialize;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+
+  NumDevices := Pa_GetDeviceCount;
+  WriteLn('Num devices: ', NumDevices);
+
+  DeviceIndex := Pa_GetDefaultOutputDevice;
+
+  if DeviceIndex = paNoDevice then
+    begin
+      WriteLn('No default output device found');
+      Pa_Terminate;
+      Exit;
+    end;
+
+  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
+  if EnvStr <> '' then
+    begin
+      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
+      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
+    end;
+
+  { Pa_GetDeviceInfo returns nil for an out-of-range index, so reject a bad
+    value from the environment before it is dereferenced below. }
+  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
+    begin
+      WriteLn('Invalid output device index: ', DeviceIndex);
+      Pa_Terminate;
+      Exit;
+    end;
+
+  for I := 0 to (NumDevices - 1) do
+    begin
+      DeviceInfo := Pa_GetDeviceInfo(I);
+      if I = DeviceIndex then
+        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
+        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
+      else
+        WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
+    end;
+
+  WriteLn('Use device ', DeviceIndex);
+  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
+  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
+
+  Initialize(Param);
+  Param.Device := DeviceIndex;
+  Param.ChannelCount := 1;
+  Param.SampleFormat := paFloat32;
+  param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
+  param.HostApiSpecificStreamInfo := nil;
+
+  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
+
+
+  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread.
+  }
+  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
+    PPaStreamCallback(@PlayCallback), nil);
+
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
+      Pa_Terminate;
+      Exit;
+    end;
+
+  InitCriticalSection(CriticalSection);
+
+  Status := Pa_StartStream(stream);
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
+      Pa_CloseStream(stream);
+      DoneCriticalSection(CriticalSection);
+      Pa_Terminate;
+      Exit;
+    end;
+
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+
+  Audio := Tts.Generate(Text, SpeakerId, Speed,
+    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
+  FinishedGeneration := True;
+  SherpaOnnxWriteWave('./kokoro-en-playback-7.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-en-playback-7.wav');
+
+  while not FinishedPlaying do
+    Pa_Sleep(100);  {sleep for 0.1 second }
+    {TODO(fangjun): Use an event to indicate the play is finished}
+
+  { Close the stream before destroying the critical section: PlayCallback sets
+    FinishedPlaying while it still holds the lock, so the audio thread may
+    not have left the critical section yet. }
+  Status := Pa_CloseStream(stream);
+  if Status <> paNoError then
+    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
+
+  DoneCriticalSection(CriticalSection);
+
+  FreeAndNil(Tts);
+  FreeAndNil(Resampler);
+  FreeAndNil(Buffer);
+
+  Status := Pa_Terminate;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+end.
+
diff --git a/pascal-api-examples/tts/kokoro-en.pas b/pascal-api-examples/tts/kokoro-en.pas
new file mode 100644
index 0000000000..5e186b24d8
--- /dev/null
+++ b/pascal-api-examples/tts/kokoro-en.pas
@@ -0,0 +1,56 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+program kokoro_en;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro TTS models.
+
+It generates speech from text and saves it to a wave file.
+
+If you want to play it while it is generating, please see
+./kokoro-en-playback.pas
+}
+
+{$mode objfpc}
+
+uses
+  SysUtils,
+  sherpa_onnx;
+
+{ Creates a TTS engine that uses the Kokoro model files in ./kokoro-en-v0_19. }
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  Config.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+  Config.MaxNumSentences := 1;
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+var
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+
+  Text: AnsiString;
+  Speed: Single = 1.0;  {Use a larger value to speak faster}
+  SpeakerId: Integer = 8;
+
+begin
+  Tts := GetOfflineTts;
+
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+
+  Audio := Tts.Generate(Text, SpeakerId, Speed);
+  SherpaOnnxWriteWave('./kokoro-en-8.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-en-8.wav');
+
+  FreeAndNil(Tts);
+end.
+
diff --git a/pascal-api-examples/tts/matcha-en-playback.pas b/pascal-api-examples/tts/matcha-en-playback.pas
index e750099cbb..7a6e8c7584 100644
--- a/pascal-api-examples/tts/matcha-en-playback.pas
+++ b/pascal-api-examples/tts/matcha-en-playback.pas
@@ -2,7 +2,7 @@
 program matcha_en_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
-with Piper models.
+with MatchaTTS models.
 
 It generates speech from text and saves it to a wave file.
 
@@ -210,8 +210,8 @@ function GetOfflineTts: TSherpaOnnxOfflineTts;
   Audio := Tts.Generate(Text, SpeakerId, Speed,
     PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
   FinishedGeneration := True;
-  SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
-  WriteLn('Saved to ./matcha-zh-playback.wav');
+  SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./matcha-en-playback.wav');
 
   while not FinishedPlaying do
     Pa_Sleep(100);  {sleep for 0.1 second }
diff --git a/pascal-api-examples/tts/matcha-en.pas b/pascal-api-examples/tts/matcha-en.pas
index 7ef34b7034..f818d53e64 100644
--- a/pascal-api-examples/tts/matcha-en.pas
+++ b/pascal-api-examples/tts/matcha-en.pas
@@ -7,7 +7,7 @@
 It generates speech from text and saves it to a wave file.
 
 If you want to play it while it is generating, please see
-./matcha-zh-playback.pas
+./matcha-en-playback.pas
 }
 
 {$mode objfpc}
diff --git a/pascal-api-examples/tts/matcha-zh-playback.pas b/pascal-api-examples/tts/matcha-zh-playback.pas
index 08b2bbe2df..05f94ba9c3 100644
--- a/pascal-api-examples/tts/matcha-zh-playback.pas
+++ b/pascal-api-examples/tts/matcha-zh-playback.pas
@@ -2,7 +2,7 @@
 program matcha_zh_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
-with Piper models.
+with MatchaTTS models.
 
 It generates speech from text and saves it to a wave file.
 
diff --git a/pascal-api-examples/tts/run-kokoro-en-playback.sh b/pascal-api-examples/tts/run-kokoro-en-playback.sh
new file mode 100755
index 0000000000..49e280d7a4
--- /dev/null
+++ b/pascal-api-examples/tts/run-kokoro-en-playback.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
+  tar xf kokoro-en-v0_19.tar.bz2
+  rm kokoro-en-v0_19.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
+  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
+  ./kokoro-en-playback.pas
+
+# Please see ../portaudio-test/README.md
+# for how to install portaudio on macOS
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-en-playback
diff --git a/pascal-api-examples/tts/run-kokoro-en.sh b/pascal-api-examples/tts/run-kokoro-en.sh
new file mode 100755
index 0000000000..f26fb335d5
--- /dev/null
+++ b/pascal-api-examples/tts/run-kokoro-en.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
+  tar xf kokoro-en-v0_19.tar.bz2
+  rm kokoro-en-v0_19.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
+  ./kokoro-en.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-en
diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas
index 442c8a504d..182d440ab4 100644
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -76,12 +76,24 @@ TSherpaOnnxOfflineTtsMatchaModelConfig = record
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
   end;
 
+  TSherpaOnnxOfflineTtsKokoroModelConfig = record
+    Model: AnsiString;    { e.g. ./kokoro-en-v0_19/model.onnx }
+    Voices: AnsiString;   { e.g. ./kokoro-en-v0_19/voices.bin }
+    Tokens: AnsiString;   { e.g. ./kokoro-en-v0_19/tokens.txt }
+    DataDir: AnsiString;  { e.g. ./kokoro-en-v0_19/espeak-ng-data }
+    LengthScale: Single;  { set to 1.0 by Initialize }
+
+    function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
+  end;
+
   TSherpaOnnxOfflineTtsModelConfig = record
     Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: Integer;
     Debug: Boolean;
     Provider: AnsiString;
     Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
+    Kokoro: TSherpaOnnxOfflineTtsKokoroModelConfig;
 
     function ToString: AnsiString;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
@@ -739,12 +751,21 @@ SherpaOnnxOfflineTtsMatchaModelConfig = record
     DictDir: PAnsiChar;
   end;
 
+  SherpaOnnxOfflineTtsKokoroModelConfig = record
+    Model: PAnsiChar;
+    Voices: PAnsiChar;
+    Tokens: PAnsiChar;
+    DataDir: PAnsiChar;
+    LengthScale: cfloat;
+  end;
+
   SherpaOnnxOfflineTtsModelConfig = record
     Vits: SherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: cint32;
     Debug: cint32;
     Provider: PAnsiChar;
     Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
+    Kokoro: SherpaOnnxOfflineTtsKokoroModelConfig;
   end;
 
   SherpaOnnxOfflineTtsConfig = record
@@ -1903,6 +1924,23 @@ function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString;
   Dest.LengthScale := 1.0;
 end;
 
+function TSherpaOnnxOfflineTtsKokoroModelConfig.ToString: AnsiString;
+begin
+  Result := Format('TSherpaOnnxOfflineTtsKokoroModelConfig(' +
+    'Model := %s, ' +
+    'Voices := %s, ' +
+    'Tokens := %s, ' +
+    'DataDir := %s, ' +
+    'LengthScale := %.2f' +
+    ')',
+    [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale]);
+end;
+
+class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
+begin
+  Dest.LengthScale := 1.0;
+end;
+
 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
 begin
   Result := Format('TSherpaOnnxOfflineTtsModelConfig(' +
@@ -1910,10 +1948,11 @@ function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
     'NumThreads := %d, ' +
     'Debug := %s, ' +
     'Provider := %s, ' +
-    'Matcha := %s' +
+    'Matcha := %s, ' +
+    'Kokoro := %s' +
     ')',
     [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider,
-     Self.Matcha.ToString
+     Self.Matcha.ToString, Self.Kokoro.ToString
     ]);
 end;
 
@@ -1966,6 +2005,12 @@ constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
   C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;
   C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir);
 
+  C.Model.Kokoro.Model := PAnsiChar(Config.Model.Kokoro.Model);
+  C.Model.Kokoro.Voices := PAnsiChar(Config.Model.Kokoro.Voices);
+  C.Model.Kokoro.Tokens := PAnsiChar(Config.Model.Kokoro.Tokens);
+  C.Model.Kokoro.DataDir := PAnsiChar(Config.Model.Kokoro.DataDir);
+  C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
+
   C.Model.NumThreads := Config.Model.NumThreads;
   C.Model.Provider := PAnsiChar(Config.Model.Provider);
   C.Model.Debug := Ord(Config.Model.Debug);