From 99a75329b4d083e2180299c9a5187c52aa8ce929 Mon Sep 17 00:00:00 2001
From: younesselrag
Date: Mon, 9 Sep 2024 03:18:02 +0100
Subject: [PATCH] push all code

---
 Docker-compose.yml            |  39 ++++++++++
 utils/utils.py => __init__.py |   0
 main.py                       |   0
 requirements.txt              |   6 +-
 utils/__init__.py             |   0
 utils/helpers.py              | 136 ++++++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 1 deletion(-)
 rename utils/utils.py => __init__.py (100%)
 create mode 100644 main.py
 create mode 100644 utils/__init__.py
 create mode 100644 utils/helpers.py

diff --git a/Docker-compose.yml b/Docker-compose.yml
index e69de29..0976c1e 100644
--- a/Docker-compose.yml
+++ b/Docker-compose.yml
@@ -0,0 +1,39 @@
+---
+services:
+
+  Backend:
+    build:
+      context: .
+    command:
+      - python3
+      - main.py
+      - --recv_host
+      - 0.0.0.0
+      - --send_host
+      - 0.0.0.0
+      - --lm_model_name
+      - microsoft/Phi-3-mini-4k-instruct
+      - --init_chat_role
+      - system
+      - --init_chat_prompt
+      - "You are a helpful assistant"
+      - --stt_compile_mode
+      - reduce-overhead
+      - --tts_compile_mode
+      - default
+    expose:
+      - 12345/tcp
+      - 12346/tcp
+    ports:
+      - 12345:12345/tcp
+      - 12346:12346/tcp
+    volumes:
+      - ./cache/:/root/.cache/
+      - ./s2s_pipeline.py:/usr/src/app/s2s_pipeline.py
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['0']
+              capabilities: [gpu]
\ No newline at end of file
diff --git a/utils/utils.py b/__init__.py
similarity index 100%
rename from utils/utils.py
rename to __init__.py
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index af5e3a6..1abc72d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,5 @@
-sounddevice
\ No newline at end of file
+sounddevice
+nltk
+parler_tts @ git+https://github.com/huggingface/parler-tts.git
+torch==2.4.0
+transformers[torch]
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/helpers.py b/utils/helpers.py
new file mode 100644
index 0000000..4d19637
--- /dev/null
+++ b/utils/helpers.py
@@ -0,0 +1,136 @@
+import numpy as np
+import torch
+
+
+def next_power_of_2(x: int) -> int:
+    """
+    Compute the next power of 2 greater than or equal to x.
+
+    Parameters:
+    ----------
+    x : int
+        Input integer value.
+
+    Returns:
+    -------
+    int
+        Next power of 2.
+    """
+    return 1 if x == 0 else 2 ** (x - 1).bit_length()
+
+
+def int2float(sound: np.ndarray) -> np.ndarray:
+    """
+    Convert 16-bit PCM audio to float32 format and normalize.
+
+    Parameters:
+    ----------
+    sound : np.ndarray
+        Input 16-bit PCM audio array.
+
+    Returns:
+    -------
+    np.ndarray
+        Normalized float32 audio array.
+    """
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1 / 32768  # Scale int16 range down to [-1, 1]
+    return sound.squeeze()
+
+
+class VADIterator:
+    """
+    Voice Activity Detector (VAD) iterator for speech segmentation based on a given model.
+
+    Parameters:
+    ----------
+    model : torch.nn.Module
+        Preloaded .jit or .onnx silero VAD model.
+
+    threshold : float, optional (default=0.5)
+        Probability threshold for classifying speech.
+
+    sampling_rate : int, optional (default=16000)
+        Sampling rate of the audio in Hz. Supports only 8000 or 16000.
+
+    min_silence_duration_ms : int, optional (default=100)
+        Minimum silence duration in milliseconds to end a speech segment.
+
+    speech_pad_ms : int, optional (default=30)
+        Padding (in milliseconds) added to each side of detected speech segments.
+    """
+
+    def __init__(self, model: torch.nn.Module, threshold: float = 0.5, sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 100, speech_pad_ms: int = 30):
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator supports only 8000 or 16000 sampling rates')
+
+        self.model = model
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+        self.is_speaking = False
+        self.buffer = []
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self) -> None:
+        """
+        Reset the internal states of the VAD model and buffer.
+        """
+        self.model.reset_states()
+        self.triggered = False
+        self.temp_end = 0
+        self.current_sample = 0
+
+    @torch.no_grad()
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Process an audio chunk and detect speech.
+
+        Parameters:
+        ----------
+        x : torch.Tensor
+            Input audio chunk to be processed.
+
+        Returns:
+        -------
+        torch.Tensor or None
+            Detected speech chunk if available, otherwise None.
+        """
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except Exception:
+                raise TypeError("Audio cannot be cast to tensor. Cast it manually.")
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        if speech_prob >= self.threshold and self.temp_end:
+            self.temp_end = 0
+
+        if speech_prob >= self.threshold and not self.triggered:
+            self.triggered = True
+            return None
+
+        if speech_prob < self.threshold - 0.15 and self.triggered:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return None
+            else:
+                self.temp_end = 0
+                self.triggered = False
+                spoken_utterance = self.buffer
+                self.buffer = []
+                return torch.cat(spoken_utterance) if spoken_utterance else None
+
+        if self.triggered:
+            self.buffer.append(x)
+
+        return None
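
For reference, a minimal usage sketch of the VADIterator added above. This is not part of the patch: the torch.hub loading call, the 512-sample block size, and the sounddevice capture loop are illustrative assumptions.

# Usage sketch (not part of the patch): drive VADIterator from a microphone
# stream. Assumes the silero-vad JIT model from torch.hub and 16 kHz mono
# int16 capture; blocksize=512 matches the window length recent silero-vad
# models expect at 16 kHz.
import sounddevice as sd
import torch

from utils.helpers import VADIterator, int2float

model, _ = torch.hub.load('snakers4/silero-vad', 'silero_vad')
vad = VADIterator(model, threshold=0.5, sampling_rate=16000)

def on_audio(indata, frames, time_info, status):
    chunk = int2float(indata[:, 0])           # int16 PCM -> normalized float32
    utterance = vad(torch.from_numpy(chunk))  # None until a segment ends
    if utterance is not None:
        print(f'end of speech: {utterance.numel()} samples buffered')

with sd.InputStream(samplerate=16000, channels=1, dtype='int16',
                    blocksize=512, callback=on_audio):
    sd.sleep(10_000)  # capture for ten seconds

Note that VADIterator only returns a tensor once silence has persisted for min_silence_duration_ms; every other call yields None, so callers should treat None as "keep streaming".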