diff --git a/autocut/main.py b/autocut/main.py
index fadf777..d71083d 100644
--- a/autocut/main.py
+++ b/autocut/main.py
@@ -43,6 +43,10 @@ def main():
     parser.add_argument('--device', type=str, default=None, choices=['cpu', 'cuda'],
                         help='Force to CPU or GPU for trascribing. In default automatically use GPU if available.')
+    parser.add_argument('--sub-cn-inline-limit', type=int, default=16,  # set 0 to disable
+                        help='Max characters per subtitle when splitting long Chinese sentences')
+    parser.add_argument('--sub-cn-modal-words', type=str, default="啊,吧",  # separate with English commas
+                        help='Modal words to filter out of Chinese subtitles')
 
     args = parser.parse_args()
diff --git a/autocut/transcribe.py b/autocut/transcribe.py
index 407b1cd..d764f37 100644
--- a/autocut/transcribe.py
+++ b/autocut/transcribe.py
@@ -28,7 +28,7 @@ def run(self):
             audio = whisper.load_audio(input, sr=self.sampling_rate)
             speech_timestamps = self._detect_voice_activity(audio)
-            transcribe_results = self._transcibe(audio, speech_timestamps)
+            transcribe_results = self._transcribe(audio, speech_timestamps)
 
             output = name + '.srt'
             self._save_srt(output, transcribe_results)
@@ -65,7 +65,7 @@ def _detect_voice_activity(self, audio):
         logging.info(f'Done voice activity detetion in {time.time() - tic:.1f} sec')
         return speeches
 
-    def _transcibe(self, audio, speech_timestamps):
+    def _transcribe(self, audio, speech_timestamps):
         tic = time.time()
         if self.whisper_model is None:
             self.whisper_model = whisper.load_model(self.args.whisper_model, self.args.device)
@@ -106,7 +106,10 @@ def _add_sub(start, end, text):
             _add_sub(start, end, s["text"])
             prev_end = end
 
-        with open(output, 'wb') as f:
+        from .transcribe_middleware import TranscribeMiddleware
+        TranscribeMiddleware(self.args, subs).run()
+
+        with open(output, mode='wb') as f:
             f.write(srt.compose(subs).encode(self.args.encoding, 'replace'))
 
     def _save_md(self, md_fn, srt_fn, video_fn):
diff --git a/autocut/transcribe_middleware.py b/autocut/transcribe_middleware.py
new file mode 100644
index 0000000..5d5c9f2
--- /dev/null
+++ b/autocut/transcribe_middleware.py
@@ -0,0 +1,102 @@
+import srt
+
+
+class TranscribeMiddleware:
+    def __init__(self, args, subs: list[srt.Subtitle]):
+        self.args = args
+        self.subs = subs
+        self.SINGLE_SUB_CN_MAX_LEN = self.args.sub_cn_inline_limit
+        self.MODAL_WORDS_CN = self.args.sub_cn_modal_words.strip()
+
+    def run(self):
+        if self.args.lang == "zh":
+            if self.args.sub_cn_inline_limit > 0:
+                self._sub_split_CN()
+
+            if len(self.args.sub_cn_modal_words.strip()) > 0:
+                self._sub_filter_modal_CN()
+
+    def _sub_split_CN(self):
+        import datetime
+        import jionlp as jio
+
+        new_subs = []
+
+        for sub in self.subs:
+            duration = (sub.end - sub.start).total_seconds()
+
+            # the zh transcription sometimes contains English commas
+            sub_content_temp = sub.content.strip().replace(",", ",")
+            # use jionlp (https://github.com/dongrixinyu/JioNLP) to split the Chinese sentence
+            sub_split_list = jio.split_sentence(sub_content_temp, criterion='fine')
+            sub_len = len(sub_content_temp)
+
+            # sliding window to bound the length of each subtitle, assuming uniform speech speed
+            interval_start = sub.start.total_seconds()
+            interval_end = sub.start.total_seconds()
+            interval_len = 0
+            start_index = 0
+
+            def _add_sub(target_index):
+                new_subs.append(srt.Subtitle(index=0,
+                                             start=datetime.timedelta(seconds=interval_start),
+                                             end=datetime.timedelta(seconds=interval_end),
+                                             content="".join(sub_split_list[start_index:target_index])))
+
+            for index, sub_split_item in enumerate(sub_split_list):
+                sub_split = sub_split_item.strip()
+
+                if index > 0 and interval_len + len(sub_split) > self.SINGLE_SUB_CN_MAX_LEN + self.SINGLE_SUB_CN_MAX_LEN // 2:
+                    _add_sub(index)
+                    interval_start = interval_end
+                    start_index = index
+                    interval_len = 0
+
+                interval_len = interval_len + len(sub_split)
+                interval_end = interval_end + (len(sub_split) / sub_len) * duration
+
+                if interval_len < self.SINGLE_SUB_CN_MAX_LEN + 1:
+                    continue
+
+                _add_sub(index + 1)
+                interval_start = interval_end
+                start_index = index + 1
+                interval_len = 0
+
+            if interval_len != 0:
+                new_subs.append(srt.Subtitle(index=0,
+                                             start=datetime.timedelta(seconds=interval_start),
+                                             end=datetime.timedelta(seconds=interval_end),
+                                             content="".join(sub_split_list[start_index:])))
+
+        self.subs.clear()
+        self.subs.extend(new_subs)
+
+    def _sub_filter_modal_CN(self):
+        import jionlp as jio
+        import re
+
+        key_list = [key.strip() for key in self.MODAL_WORDS_CN.split(",")]
+        for sub in self.subs:
+            # split into a list of short sentences
+            sub_split_list = jio.split_sentence(sub.content.strip().replace(",", ","), criterion='fine')
+
+            trigger = False
+            new_sub_split_list = []
+            for sub_split_item in sub_split_list:
+                sub_split = sub_split_item.strip()
+                # after jionlp's split, the last character is always a word character or punctuation
+                last_word_index = -1 if re.match(r"^[\u4E00-\u9FA5A-Za-z0-9_]+$", sub_split[-1]) else -2
+
+                if sub_split[last_word_index] in key_list:
+                    trigger = True
+                    temp = sub_split[:last_word_index]
+                    if last_word_index == -2:
+                        temp += sub_split[-1]
+                    new_sub_split_list.append(temp)
+                    continue
+
+                new_sub_split_list.append(sub_split)
+
+            if trigger:
+                sub.content = "".join(new_sub_split_list)
diff --git a/setup.py b/setup.py
index 3fd8542..d241c7f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,13 @@
-from setuptools import setup, find_packages
 import os
 
+from setuptools import setup, find_packages
+
 requirements = [
     'srt',
     'moviepy',
     'opencc-python-reimplemented',
-    'whisper @ git+https://github.com/openai/whisper.git'
+    'whisper @ git+https://github.com/openai/whisper.git',
+    'jionlp',
 ]
 
 init_fn = os.path.join(os.path.dirname(__file__), 'autocut', '__init__.py')
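
For context (not part of the patch): a minimal sketch of how the new middleware transforms a subtitle list, assuming jionlp and srt are installed and autocut is importable. The SimpleNamespace stands in for autocut's parsed argparse namespace, and the sample sentence is invented for illustration.

import datetime
from types import SimpleNamespace

import srt

from autocut.transcribe_middleware import TranscribeMiddleware

# Stand-in for the argparse namespace; only the attributes the
# middleware actually reads are provided here.
args = SimpleNamespace(
    lang="zh",                  # the middleware only runs for Chinese
    sub_cn_inline_limit=16,     # target characters per subtitle; 0 disables splitting
    sub_cn_modal_words="啊,吧",  # comma-separated modal words to strip
)

subs = [
    srt.Subtitle(index=1,
                 start=datetime.timedelta(seconds=0),
                 end=datetime.timedelta(seconds=8),
                 content="这个功能很有意思啊,我们今天来看看它是怎么实现的吧,希望对大家有帮助"),
]

TranscribeMiddleware(args, subs).run()  # rewrites `subs` in place

for sub in subs:
    # each split segment gets a start/end interpolated from the original
    # subtitle's duration, proportional to its character count
    print(sub.start, sub.end, sub.content)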
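
Two notes on design choices visible in the patch: split timestamps are interpolated (interval_end advances by len(sub_split) / sub_len * duration), so timing within a long subtitle is an estimate under the uniform-speech-speed assumption rather than a re-alignment against the audio. Also, the list[srt.Subtitle] annotation in TranscribeMiddleware.__init__ uses a built-in generic, which requires Python 3.9+; typing.List[srt.Subtitle] would keep older interpreters working.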