Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cloud translation support with multiple providers and configurati… #183

Merged
merged 10 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ target_sources(
src/translation/translation-language-utils.cpp
src/ui/filter-replace-dialog.cpp)

add_subdirectory(src/translation/cloud-translation)

set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

if(ENABLE_TESTS)
Expand Down
20 changes: 16 additions & 4 deletions cmake/BuildICU.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,26 +48,38 @@ if(WIN32)
"${ICU_LIB_${lib}}")
endforeach()
else()
# Add ccache detection at the start
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
message(STATUS "Found ccache: ${CCACHE_PROGRAM}")
# Create compiler wrapper commands
set(C_LAUNCHER "${CCACHE_PROGRAM} ${CMAKE_C_COMPILER}")
set(CXX_LAUNCHER "${CCACHE_PROGRAM} ${CMAKE_CXX_COMPILER}")
endif()

set(ICU_URL
"https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz"
)
set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef")
if(APPLE)
set(ICU_PLATFORM "MacOSX")
set(TARGET_ARCH -arch\ $ENV{MACOS_ARCH})
set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH})
set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH} CC=${C_LAUNCHER}
CXX=${CXX_LAUNCHER})
else()
set(ICU_PLATFORM "Linux")
set(ICU_BUILD_ENV_VARS CFLAGS=-fPIC CXXFLAGS=-fPIC LDFLAGS=-fPIC)
set(ICU_BUILD_ENV_VARS CFLAGS=-fPIC CXXFLAGS=-fPIC LDFLAGS=-fPIC CC=${C_LAUNCHER} CXX=${CXX_LAUNCHER})
endif()

ExternalProject_Add(
ICU_build
DOWNLOAD_EXTRACT_TIMESTAMP true
GIT_REPOSITORY "https://github.com/unicode-org/icu.git"
GIT_TAG "release-${ICU_VERSION_DASH}"
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${ICU_BUILD_ENV_VARS} <SOURCE_DIR>/icu4c/source/runConfigureICU
${ICU_PLATFORM} --prefix=<INSTALL_DIR> --enable-static --disable-shared
CONFIGURE_COMMAND
${CMAKE_COMMAND} -E env ${ICU_BUILD_ENV_VARS} <SOURCE_DIR>/icu4c/source/runConfigureICU ${ICU_PLATFORM}
--prefix=<INSTALL_DIR> --enable-static --disable-shared --disable-tools --disable-samples --disable-layout
--disable-layoutex --disable-tests --disable-draft --disable-extras --disable-icuio
BUILD_COMMAND make -j4
BUILD_BYPRODUCTS
<INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX}
Expand Down
25 changes: 24 additions & 1 deletion data/locale/en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ whisper_sampling_method="Whisper Sampling Method"
n_threads="Number of threads"
n_max_text_ctx="Max text context"
translate="Translate"
translate_local="Local Translation"
translate_cloud="Cloud Translation"
no_context="No context"
single_segment="Single segment"
print_special="Print special"
Expand Down Expand Up @@ -75,6 +77,11 @@ general_group="General"
transcription_group="Transcription"
file_output_group="File Output Configuration"
translate_explaination="Enabling translation will increase the processing load on your machine. This feature uses additional resources to translate content in real-time, which may impact performance. <a href='#'>Learn More</a>"
translate_cloud_explaination="Cloud translation requires an active internet connection and API keys to the translation provider."
translate_cloud_provider="Translation Provider"
translate_cloud_only_full_sentences="Translate only full sentences"
translate_cloud_api_key="Access Key"
translate_cloud_secret_key="Secret Key"
log_group="Logging"
advanced_group="Advanced Configuration"
buffered_output_parameters="Buffered Output Configuration"
Expand All @@ -89,4 +96,20 @@ translate_only_full_sentences="Translate only full sentences"
duration_filter_threshold="Duration filter"
segment_duration="Segment duration"
n_context_sentences="# Context sentences"
max_sub_duration="Max. sub duration (ms)"
max_sub_duration="Max. sub duration (ms)"
Google-Cloud-Translation="Google Cloud Translation"
Microsoft-Translator="Microsoft Azure Translator"
Amazon-Translate="AWS Translate"
IBM-Watson-Translate="IBM Watson Translate"
Yandex-Translate="Yandex Translate"
Baidu-Translate="Baidu Translate"
Tencent-Translate="Tencent Translate"
Alibaba-Translate="Alibaba Translate"
Naver-Translate="Naver Translate"
Kakao-Translate="Kakao Translate"
Papago-Translate="Papago"
Deepl-Translate="Deepl"
Bing-Translate="Bing Translate"
OpenAI-Translate="OpenAI"
Claude-Translate="Claude"
translate_cloud_deepl_free="Use Deepl Free API Endpoint"
184 changes: 125 additions & 59 deletions src/transcription-filter-callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "whisper-utils/whisper-utils.h"
#include "whisper-utils/whisper-model-utils.h"
#include "translation/language_codes.h"
#include "translation/cloud-translation/translation-cloud.h"

void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
struct transcription_filter_data *gf)
Expand Down Expand Up @@ -80,30 +81,60 @@ std::string send_sentence_to_translation(const std::string &sentence,
return "";
}

/**
 * Translate a sentence with the configured cloud provider on a background thread.
 *
 * The work runs on a detached std::thread; the result is delivered through
 * `callback`, which is invoked exactly once on that thread:
 *  - with the translated text on success,
 *  - with the cached translation when `sentence` equals the previously
 *    submitted sentence (avoids a second request for the same text),
 *  - with an empty string when cloud translation is disabled, the sentence is
 *    empty, or the provider returned nothing.
 *
 * @param sentence         text to translate (copied into the worker thread)
 * @param gf               filter state holding the cloud translation settings and cache
 * @param source_language  language code of `sentence`
 * @param callback         receiver of the translated text (may get an empty string)
 *
 * NOTE(review): `gf` is read and written from the detached thread with no
 * synchronization visible in this function - confirm that the filter data
 * outlives the thread and that concurrent access is acceptable.
 */
void send_sentence_to_cloud_translation_async(const std::string &sentence,
					      struct transcription_filter_data *gf,
					      const std::string &source_language,
					      std::function<void(const std::string &)> callback)
{
	std::thread([sentence, gf, source_language, callback]() {
		// Remember what was submitted last time before recording the new sentence
		const std::string last_text = gf->last_text_for_cloud_translation;
		gf->last_text_for_cloud_translation = sentence;
		if (gf->translate_cloud && !sentence.empty()) {
			obs_log(gf->log_level, "Translating text with cloud provider %s. %s -> %s",
				gf->translate_cloud_provider.c_str(), source_language.c_str(),
				gf->translate_cloud_target_language.c_str());
			if (sentence == last_text) {
				// do not translate the same sentence twice
				callback(gf->last_text_cloud_translation);
				return;
			}
			CloudTranslatorConfig config;
			config.provider = gf->translate_cloud_provider;
			config.access_key = gf->translate_cloud_api_key;
			config.secret_key = gf->translate_cloud_secret_key;
			config.free = gf->translate_cloud_deepl_free;
			config.region = gf->translate_cloud_region;

			const std::string translated_text =
				translate_cloud(config, sentence,
						gf->translate_cloud_target_language,
						source_language);
			if (!translated_text.empty()) {
				if (gf->log_words) {
					obs_log(LOG_INFO, "Cloud Translation: '%s' -> '%s'",
						sentence.c_str(), translated_text.c_str());
				}
				// Store in the *cloud* cache: this is the field the
				// duplicate-sentence shortcut above reads. Writing to
				// last_text_translation would clobber the local
				// translation cache and leave the cloud cache empty.
				gf->last_text_cloud_translation = translated_text;
				callback(translated_text);
				return;
			}
			obs_log(gf->log_level, "Failed to translate text");
			// Drop the cached translation so a repeat of this sentence does
			// not return the translation of an earlier, different sentence.
			gf->last_text_cloud_translation.clear();
		}
		callback("");
	}).detach();
}

void send_sentence_to_file(struct transcription_filter_data *gf,
const DetectionResultWithText &result, const std::string &str_copy,
const std::string &translated_sentence)
const DetectionResultWithText &result, const std::string &sentence,
const std::string &file_path, bool bump_sentence_number)
{
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}

std::string translated_file_path = "";
bool write_translations = gf->translate && !translated_sentence.empty();

// if translation is enabled, save the translated sentence to another file
if (write_translations) {
// add a postfix to the file name (without extension) with the translation target language
std::string output_file_path = gf->output_file_path;
std::string file_extension =
output_file_path.substr(output_file_path.find_last_of(".") + 1);
std::string file_name =
output_file_path.substr(0, output_file_path.find_last_of("."));
translated_file_path = file_name + "_" + gf->target_lang + "." + file_extension;
}

// should the file be truncated?
std::ios_base::openmode openmode = std::ios::out;
if (gf->truncate_output_file) {
Expand All @@ -114,15 +145,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
if (!gf->save_srt) {
// Write raw sentence to file
try {
std::ofstream output_file(gf->output_file_path, openmode);
output_file << str_copy << std::endl;
std::ofstream output_file(file_path, openmode);
output_file << sentence << std::endl;
output_file.close();
if (write_translations) {
std::ofstream translated_output_file(translated_file_path,
openmode);
translated_output_file << translated_sentence << std::endl;
translated_output_file.close();
}
} catch (const std::ofstream::failure &e) {
obs_log(LOG_ERROR, "Exception opening/writing/closing file: %s", e.what());
}
Expand All @@ -133,9 +158,9 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
}

obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path, openmode);
std::ofstream output_file(file_path, openmode);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [](std::ofstream &output_stream, uint64_t ts) {
Expand All @@ -156,28 +181,34 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
format_ts_for_srt(output_file, result.end_timestamp_ms);
output_file << std::endl;

output_file << str_copy << std::endl;
output_file << sentence << std::endl;
output_file << std::endl;
output_file.close();

if (write_translations) {
obs_log(gf->log_level, "Saving translation to file %s, sentence #%d",
translated_file_path.c_str(), gf->sentence_number);

// Append translated sentence to file in .srt format
std::ofstream translated_output_file(translated_file_path, openmode);
translated_output_file << gf->sentence_number << std::endl;
format_ts_for_srt(translated_output_file, result.start_timestamp_ms);
translated_output_file << " --> ";
format_ts_for_srt(translated_output_file, result.end_timestamp_ms);
translated_output_file << std::endl;

translated_output_file << translated_sentence << std::endl;
translated_output_file << std::endl;
translated_output_file.close();
if (bump_sentence_number) {
gf->sentence_number++;
}
}
}

gf->sentence_number++;
/**
 * Save a translated sentence to a sibling of the main output file.
 *
 * The target path is derived from `gf->output_file_path` by inserting
 * "_<target_lang>" before the file extension (e.g. "out.srt" -> "out_fr.srt").
 * When the file name has no extension the suffix is simply appended
 * ("out" -> "out_fr"). Empty translations are logged and not written.
 *
 * @param gf                   filter state providing the base output path and log level
 * @param result               detection result forwarded to send_sentence_to_file
 * @param translated_sentence  translated text to write
 * @param target_lang          language code used as the file-name suffix
 */
void send_translated_sentence_to_file(struct transcription_filter_data *gf,
				      const DetectionResultWithText &result,
				      const std::string &translated_sentence,
				      const std::string &target_lang)
{
	if (translated_sentence.empty()) {
		obs_log(gf->log_level, "Translation is empty, not saving to file");
		return;
	}

	const std::string output_file_path = gf->output_file_path;

	// Only a dot inside the last path component separates an extension; a dot
	// in a directory name (e.g. "/rec.d/out") must not be mistaken for one.
	const std::string::size_type last_separator = output_file_path.find_last_of("/\\");
	const std::string::size_type name_start =
		(last_separator == std::string::npos) ? 0 : last_separator + 1;
	const std::string::size_type last_dot = output_file_path.find_last_of('.');
	const bool has_extension = last_dot != std::string::npos && last_dot >= name_start;

	std::string translated_file_path;
	if (has_extension) {
		// <stem>_<lang><.ext> - substr(last_dot) keeps the leading dot
		translated_file_path = output_file_path.substr(0, last_dot) + "_" + target_lang +
				       output_file_path.substr(last_dot);
	} else {
		// No extension: just append the language suffix
		translated_file_path = output_file_path + "_" + target_lang;
	}

	// false: the sentence counter is bumped by the original-text write only
	send_sentence_to_file(gf, result, translated_sentence, translated_file_path, false);
}

Expand Down Expand Up @@ -235,41 +266,76 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}

bool should_translate =
bool should_translate_local =
gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true;

// send the sentence to translation (if enabled)
std::string translated_sentence =
should_translate ? send_sentence_to_translation(str_copy, gf, result.language) : "";
std::string translated_sentence_local =
should_translate_local ? send_sentence_to_translation(str_copy, gf, result.language)
: "";

if (gf->translate) {
if (gf->translation_output == "none") {
// overwrite the original text with the translated text
str_copy = translated_sentence;
str_copy = translated_sentence_local;
} else {
if (gf->buffered_output) {
// buffered output - add the sentence to the monitor
gf->translation_monitor.addSentenceFromStdString(
translated_sentence,
translated_sentence_local,
get_time_point_from_ms(result.start_timestamp_ms),
get_time_point_from_ms(result.end_timestamp_ms),
result.result == DETECTION_RESULT_PARTIAL);
} else {
// non-buffered output - send the sentence to the selected source
send_caption_to_source(gf->translation_output, translated_sentence,
gf);
send_caption_to_source(gf->translation_output,
translated_sentence_local, gf);
}
}
if (gf->save_to_file && gf->output_file_path != "") {
send_translated_sentence_to_file(gf, result, translated_sentence_local,
gf->target_lang);
}
}

if (gf->buffered_output) {
gf->captions_monitor.addSentenceFromStdString(
str_copy, get_time_point_from_ms(result.start_timestamp_ms),
get_time_point_from_ms(result.end_timestamp_ms),
result.result == DETECTION_RESULT_PARTIAL);
} else {
// non-buffered output - send the sentence to the selected source
send_caption_to_source(gf->text_source_name, str_copy, gf);
bool should_translate_cloud = (gf->translate_cloud_only_full_sentences
? result.result == DETECTION_RESULT_SPEECH
: true) &&
gf->translate_cloud;

if (should_translate_cloud) {
send_sentence_to_cloud_translation_async(
str_copy, gf, result.language,
[gf, result](const std::string &translated_sentence_cloud) {
if (gf->translate_cloud_output != "none") {
send_caption_to_source(gf->translate_cloud_output,
translated_sentence_cloud, gf);
} else {
// overwrite the original text with the translated text
send_caption_to_source(gf->text_source_name,
translated_sentence_cloud, gf);
}
if (gf->save_to_file && gf->output_file_path != "") {
send_translated_sentence_to_file(
gf, result, translated_sentence_cloud,
gf->translate_cloud_target_language);
}
});
}

// send the original text to the output
// unless the translation is enabled and set to overwrite the original text
if (!((should_translate_cloud && gf->translate_cloud_output == "none") ||
(should_translate_local && gf->translation_output == "none"))) {
if (gf->buffered_output) {
gf->captions_monitor.addSentenceFromStdString(
str_copy, get_time_point_from_ms(result.start_timestamp_ms),
get_time_point_from_ms(result.end_timestamp_ms),
result.result == DETECTION_RESULT_PARTIAL);
} else {
// non-buffered output - send the sentence to the selected source
send_caption_to_source(gf->text_source_name, str_copy, gf);
}
}

if (gf->caption_to_stream && result.result == DETECTION_RESULT_SPEECH) {
Expand All @@ -279,7 +345,7 @@ void set_text_callback(struct transcription_filter_data *gf,

if (gf->save_to_file && gf->output_file_path != "" &&
result.result == DETECTION_RESULT_SPEECH) {
send_sentence_to_file(gf, result, str_copy, translated_sentence);
send_sentence_to_file(gf, result, str_copy, gf->output_file_path, true);
}

if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
Expand Down
18 changes: 15 additions & 3 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,18 @@ struct transcription_filter_data {
float duration_filter_threshold = 2.25f;
int segment_duration = 7000;

// Last transcription result
std::string last_text_for_translation;
std::string last_text_translation;
// Cloud translation options
bool translate_cloud = false;
std::string translate_cloud_provider;
std::string translate_cloud_target_language;
std::string translate_cloud_output;
std::string translate_cloud_api_key;
std::string translate_cloud_secret_key;
bool translate_cloud_only_full_sentences = true;
std::string last_text_for_cloud_translation;
std::string last_text_cloud_translation;
bool translate_cloud_deepl_free;
std::string translate_cloud_region;

// Transcription context sentences
int n_context_sentences;
Expand Down Expand Up @@ -119,6 +128,9 @@ struct transcription_filter_data {
std::string translation_model_index;
std::string translation_model_path_external;
bool translate_only_full_sentences;
// Last transcription result
std::string last_text_for_translation;
std::string last_text_translation;

bool buffered_output = false;
TokenBufferThread captions_monitor;
Expand Down
Loading
Loading