From 2e83300fbb13a58a471826450ac1244f35140e33 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 2 May 2024 01:03:06 -0400 Subject: [PATCH] =?UTF-8?q?Update=20buffer=20size=20and=20overlap=20size?= =?UTF-8?q?=20in=20whisper-processing.h=20and=20defau=E2=80=A6=20(#95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update buffer size and overlap size in whisper-processing.h and default buffer size in msec in transcription-filter.cpp * Update audio processing timestamp calculation in whisper-processing.cpp * Update OBS plugin installation instructions for Linux * Fix typo in update_whisper_model function name --- README.md | 12 ++- src/transcription-filter.cpp | 21 +++- src/whisper-utils/whisper-processing.cpp | 122 ++++++++++------------- src/whisper-utils/whisper-processing.h | 4 +- src/whisper-utils/whisper-utils.cpp | 3 +- src/whisper-utils/whisper-utils.h | 2 +- 6 files changed, 86 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index fb8f5c8..a7f177a 100644 --- a/README.md +++ b/README.md @@ -101,18 +101,24 @@ $ ./.github/scripts/package-macos -c Release For successfully building on linux, first clone the repo, then from the repo directory: ```sh $ sudo apt install -y libssl-dev -$ export OBS_PLUGINS_PATH=$(pwd)/release/RelWithDebInfo/lib/x86_64-linux-gnu/obs-plugins -$ export OBS_PLUGINS_DATA_PATH=$(pwd)/release/RelWithDebInfo/share/obs/obs-plugins $ ./.github/scripts/build-linux ``` Copy the results to the standard OBS folders on Ubuntu ```sh -$ sudo cp -R release/RelWithDebInfo/lib/* /usr/lib/x86_64-linux-gnu/ +$ sudo cp -R release/RelWithDebInfo/lib/* /usr/lib/ $ sudo cp -R release/RelWithDebInfo/share/* /usr/share/ ``` Note: The official [OBS plugins guide](https://obsproject.com/kb/plugins-guide) recommends adding plugins to the `~/.config/obs-studio/plugins` folder. This has to do with the way you *installed* OBS. +In case the above doesn't work, attempt to copy the files to the `~/.config` folder: +```sh +$ mkdir -p ~/.config/obs-studio/plugins/obs-localvocal/bin/64bit +$ cp -R release/RelWithDebInfo/lib/x86_64-linux-gnu/obs-plugins/* ~/.config/obs-studio/plugins/obs-localvocal/bin/64bit/ +$ mkdir -p ~/.config/obs-studio/plugins/obs-localvocal/data +$ cp -R release/RelWithDebInfo/share/obs/obs-plugins/obs-localvocal/* ~/.config/obs-studio/plugins/obs-localvocal/data/ +``` + ### Windows Use the CI scripts again, for example: diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 8056a06..403122b 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -189,7 +189,24 @@ void set_text_callback(struct transcription_filter_data *gf, std::string str_copy = fix_utf8(result.text); str_copy = remove_leading_trailing_nonalpha(str_copy); - if (gf->translate) { + // if suppression is enabled, check if the text is in the suppression list + if (!gf->suppress_sentences.empty()) { + // split the suppression list by newline into individual sentences + std::vector suppress_sentences_list = + split(gf->suppress_sentences, '\n'); + // check if the text is in the suppression list + for (const std::string &suppress_sentence : suppress_sentences_list) { + if (str_copy == suppress_sentence) { + obs_log(gf->log_level, "Suppressed sentence: '%s'", + str_copy.c_str()); + gf->last_text = str_copy; + return; // do not process the sentence + } + } + } + + if (gf->translate && !str_copy.empty() && str_copy != gf->last_text && + result.result == DETECTION_RESULT_SPEECH) { obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(), gf->target_lang.c_str()); std::string translated_text; @@ -370,7 +387,7 @@ void transcription_filter_update(void *data, obs_data_t *s) } obs_log(gf->log_level, "update whisper model"); - update_whsiper_model(gf, s); + update_whisper_model(gf, s); obs_log(gf->log_level, "update whisper params"); std::lock_guard lock(*gf->whisper_ctx_mutex); diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 7d46275..1351c00 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -287,7 +287,10 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter if (token.id >= 50256) { keep = false; } - if ((j == n_tokens - 2 || j == n_tokens - 3) && token.p < 0.5) { + if (j == n_tokens - 2 && token.p < 0.5) { + keep = false; + } + if (j == n_tokens - 3 && token.p < 0.4) { keep = false; } // if the second to last token is .id == 13 ('.'), don't keep it @@ -295,7 +298,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter keep = false; } // token ids https://huggingface.co/openai/whisper-large-v3/raw/main/tokenizer.json - if (token.id > 50540 && token.id <= 51865) { + if (token.id > 50566 && token.id <= 51865) { obs_log(gf->log_level, "Large time token found (%d), this shouldn't happen", token.id); @@ -315,20 +318,6 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter obs_log(gf->log_level, "Decoded sentence: '%s'", text.c_str()); obs_log(gf->log_level, "Token IDs: %s", tokenIds.c_str()); - // if suppression is enabled, check if the text is in the suppression list - if (!gf->suppress_sentences.empty()) { - // split the suppression list by newline into individual sentences - std::vector suppress_sentences_list = - split(gf->suppress_sentences, '\n'); - // check if the text is in the suppression list - for (const std::string &suppress_sentence : suppress_sentences_list) { - if (text.find(suppress_sentence) != std::string::npos) { - obs_log(gf->log_level, "Suppressed sentence: '%s'", - text.c_str()); - return {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}}; - } - } - } if (gf->log_words) { obs_log(LOG_INFO, "[%s --> %s] (%.3f) %s", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), sentence_p, text.c_str()); @@ -346,7 +335,7 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) { uint32_t num_new_frames_from_infos = 0; uint64_t start_timestamp = 0; - bool last_step_in_segment = false; + bool save_overlap_region = true; { // scoped lock the buffer mutex @@ -355,6 +344,10 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) // We need (gf->frames - gf->last_num_frames) new frames for a full segment, const size_t remaining_frames_to_full_segment = gf->frames - gf->last_num_frames; + obs_log(gf->log_level, + "processing audio from buffer, %lu existing frames, %lu frames needed to full segment (%d frames)", + gf->last_num_frames, remaining_frames_to_full_segment, gf->frames); + // pop infos from the info buffer and mark the beginning timestamp from the first // info as the beginning timestamp of the segment struct transcription_filter_audio_info info_from_buf = {0}; @@ -371,14 +364,12 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) num_new_frames_from_infos -= info_from_buf.frames; circlebuf_push_front(&gf->info_buffer, &info_from_buf, size_of_audio_info); - // this is the final step in the segment - last_step_in_segment = true; break; } } obs_log(gf->log_level, - "with %lu remaining to full segment, popped %d info-frames, pushing at %lu (overlap)", + "with %lu remaining to full segment, popped %d frames from info buffer, pushed at %lu (overlap)", remaining_frames_to_full_segment, num_new_frames_from_infos, gf->last_num_frames); @@ -392,24 +383,18 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) } if (gf->last_num_frames > 0) { + obs_log(gf->log_level, "full segment, %lu frames overlap, %lu frames to process", + gf->last_num_frames, gf->last_num_frames + num_new_frames_from_infos); gf->last_num_frames += num_new_frames_from_infos; - if (!last_step_in_segment) { - // Mid-segment process - obs_log(gf->log_level, "mid-segment, now %d frames left to full segment", - (int)(gf->frames - gf->last_num_frames)); - } else { - // Final step in segment - obs_log(gf->log_level, "full segment, %d frames to process", - (int)(gf->last_num_frames)); - } } else { gf->last_num_frames = num_new_frames_from_infos; - obs_log(gf->log_level, "first segment, no overlap exists, %d frames to process", - (int)(gf->last_num_frames)); + obs_log(gf->log_level, "first segment, no overlap exists, %lu frames to process", + gf->last_num_frames); } - obs_log(gf->log_level, "processing %d frames (%d ms), start timestamp %llu ", - (int)gf->last_num_frames, (int)(gf->last_num_frames * 1000 / gf->sample_rate), + obs_log(gf->log_level, "processing %lu frames (%d ms), start timestamp %llu", + gf->last_num_frames, + (int)((float)gf->last_num_frames * 1000.0f / (float)gf->sample_rate), start_timestamp); // time the audio processing @@ -442,44 +427,38 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) resampled_16khz_frames); skipped_inference = true; // prevent copying the buffer to the beginning (overlap) - gf->last_num_frames = 0; - last_step_in_segment = false; + save_overlap_region = false; } else { - speech_start_frame = (stamps[0].start < 3000) ? 0 : stamps[0].start; + // if the vad finds that start within the first 10% of the buffer, set the start to 0 + speech_start_frame = (stamps[0].start < (int)(resampled_16khz_frames / 10)) + ? 0 + : stamps[0].start; speech_end_frame = stamps.back().end; uint32_t number_of_frames = speech_end_frame - speech_start_frame; + // if the speech is pressed up against the end of the buffer + // apply the overlapped region, else don't + save_overlap_region = (speech_end_frame == resampled_16khz_frames); + obs_log(gf->log_level, "VAD detected speech from %d to %d (%d frames, %d ms)", speech_start_frame, speech_end_frame, number_of_frames, number_of_frames * 1000 / WHISPER_SAMPLE_RATE); - // if the speech segment is less than 1 second - put the audio back into the buffer - // to be handled in the next iteration + // if the speech is less than 1 second - pad with zeros and send for inference if (number_of_frames > 0 && number_of_frames < WHISPER_SAMPLE_RATE) { - // convert speech_start_frame and speech_end_frame to original sample rate - speech_start_frame = - speech_start_frame * gf->sample_rate / WHISPER_SAMPLE_RATE; - speech_end_frame = - speech_end_frame * gf->sample_rate / WHISPER_SAMPLE_RATE; - number_of_frames = speech_end_frame - speech_start_frame; - - // use memmove to copy the speech segment to the beginning of the buffer - for (size_t c = 0; c < gf->channels; c++) { - memmove(gf->copy_buffers[c], - gf->copy_buffers[c] + speech_start_frame, - number_of_frames * sizeof(float)); - } - obs_log(gf->log_level, - "Speech segment is less than 1 second, moving %d to %d (len %d) to buffer start", - speech_start_frame, speech_end_frame, number_of_frames); - // no processing of the segment - skipped_inference = true; - // reset the last_num_frames to the number of frames in the buffer - gf->last_num_frames = number_of_frames; - // prevent copying the buffer to the beginning (overlap) - last_step_in_segment = false; + "Speech segment is less than 1 second, padding with zeros to 1 second"); + // copy the speech segment to the beginning of the resampled buffer + // use memmove to copy the speech segment to the beginning of the buffer + memmove(resampled_16khz[0], resampled_16khz[0] + speech_start_frame, + number_of_frames * sizeof(float)); + // zero out the rest of the buffer + memset(resampled_16khz[0] + number_of_frames, 0, + (WHISPER_SAMPLE_RATE - number_of_frames) * sizeof(float)); + + speech_start_frame = 0; + speech_end_frame = WHISPER_SAMPLE_RATE; } } } @@ -511,24 +490,29 @@ void process_audio_from_buffer(struct transcription_filter_data *gf) obs_log(gf->log_level, "audio processing of %lu ms data took %d ms", last_num_frames_ms, (int)duration); - if (last_step_in_segment) { + if (save_overlap_region) { const uint64_t overlap_size_ms = (uint64_t)(gf->overlap_frames * 1000 / gf->sample_rate); obs_log(gf->log_level, - "copying %lu frames (%lu ms) from the end of the buffer (pos %lu) to the beginning", + "copying %lu overlap frames (%lu ms) from the end of the buffer (pos %lu) to the beginning", gf->overlap_frames, overlap_size_ms, gf->last_num_frames - gf->overlap_frames); for (size_t c = 0; c < gf->channels; c++) { - // This is the last step in the segment - reset the copy buffer (include overlap frames) + // zero out the copy buffer, just in case + memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float)); // move overlap frames from the end of the last copy_buffers to the beginning - memcpy(gf->copy_buffers[c], - gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames, - gf->overlap_frames * sizeof(float)); - // zero out the rest of the buffer, just in case - memset(gf->copy_buffers[c] + gf->overlap_frames, 0, - (gf->frames - gf->overlap_frames) * sizeof(float)); + memmove(gf->copy_buffers[c], + gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames, + gf->overlap_frames * sizeof(float)); } gf->last_num_frames = gf->overlap_frames; + } else { + obs_log(gf->log_level, "no overlap needed. zeroing out the copy buffer"); + // zero out the copy buffer, just in case + for (size_t c = 0; c < gf->channels; c++) { + memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float)); + } + gf->last_num_frames = 0; } } diff --git a/src/whisper-utils/whisper-processing.h b/src/whisper-utils/whisper-processing.h index 6798e92..6b764b1 100644 --- a/src/whisper-utils/whisper-processing.h +++ b/src/whisper-utils/whisper-processing.h @@ -6,9 +6,9 @@ // buffer size in msec #define DEFAULT_BUFFER_SIZE_MSEC 3000 // overlap in msec -#define DEFAULT_OVERLAP_SIZE_MSEC 100 +#define DEFAULT_OVERLAP_SIZE_MSEC 150 #define MAX_OVERLAP_SIZE_MSEC 1000 -#define MIN_OVERLAP_SIZE_MSEC 100 +#define MIN_OVERLAP_SIZE_MSEC 150 enum DetectionResult { DETECTION_RESULT_UNKNOWN = 0, diff --git a/src/whisper-utils/whisper-utils.cpp b/src/whisper-utils/whisper-utils.cpp index ad619f8..ddaf0c8 100644 --- a/src/whisper-utils/whisper-utils.cpp +++ b/src/whisper-utils/whisper-utils.cpp @@ -5,7 +5,7 @@ #include -void update_whsiper_model(struct transcription_filter_data *gf, obs_data_t *s) +void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s) { // update the whisper model path std::string new_model_path = obs_data_get_string(s, "whisper_model_path"); @@ -140,6 +140,7 @@ void start_whisper_thread_with_path(struct transcription_filter_data *gf, const #else std::string silero_vad_model_path = silero_vad_model_file; #endif + bfree(silero_vad_model_file); // roughly following https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py // for silero vad parameters gf->vad.reset(new VadIterator(silero_vad_model_path, WHISPER_SAMPLE_RATE, 64, 0.5f, 1000, diff --git a/src/whisper-utils/whisper-utils.h b/src/whisper-utils/whisper-utils.h index bc941f8..e91d382 100644 --- a/src/whisper-utils/whisper-utils.h +++ b/src/whisper-utils/whisper-utils.h @@ -7,7 +7,7 @@ #include -void update_whsiper_model(struct transcription_filter_data *gf, obs_data_t *s); +void update_whisper_model(struct transcription_filter_data *gf, obs_data_t *s); void shutdown_whisper_thread(struct transcription_filter_data *gf); void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path);