From b99738545ff0c7cd3da22ddfef6c0b7f0196b128 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 9 Dec 2023 15:01:03 +0900 Subject: [PATCH 001/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20docstring?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0=20(#817)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- test/test_synthesis_engine.py | 51 ++++--- voicevox_engine/full_context_label.py | 11 ++ voicevox_engine/kana_parser.py | 80 +++++++++-- .../synthesis_engine/synthesis_engine_base.py | 127 ++++++++++++++---- 4 files changed, 220 insertions(+), 49 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index f9bfa2078..e155c2649 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -723,9 +723,14 @@ def result_value(i: int): self.assertEqual(result, true_result) def synthesis_test_base(self, audio_query: AudioQuery): + # Inputs 音素長・モーラ音高の設定 & Expects 音素長・音素ID・モーラ音高の記録 + # Inputs + # `audio_query`: 子音長0.1秒/母音長0.1秒/モーラ音高ランダム + # Expects + # `phoneme_length_list`: 音素長系列 + # `phoneme_id_list`: 音素ID系列 + # `f0_list`: モーラ音高系列 accent_phrases = audio_query.accent_phrases - - # decode forwardのために適当にpitchとlengthを設定し、リストで持っておく phoneme_length_list = [0.0] phoneme_id_list = [0] f0_list = [0.0] @@ -750,42 +755,46 @@ def synthesis_test_base(self, audio_query: AudioQuery): phoneme_length_list.append(0.0) phoneme_id_list.append(0) f0_list.append(0.0) - phoneme_length_list[0] = audio_query.prePhonemeLength phoneme_length_list[-1] = audio_query.postPhonemeLength + # Expects: speedScale適用 for i in range(len(phoneme_length_list)): phoneme_length_list[i] /= audio_query.speedScale + # Outputs: MockCore入りSynthesisEngine の `.synthesis` 出力および core.decode_forward 引数 result = self.synthesis_engine.synthesis(query=audio_query, style_id=1) - - # decodeに渡される値の検証 decode_args = self.decode_mock.call_args[1] list_length = decode_args["length"] + + # Test: フレーム長 self.assertEqual( list_length, int(sum([round(p * 24000 / 256) for p in phoneme_length_list])), ) + # Expects: Apply/Convert/Rescale num_phoneme = OjtPhoneme.num_phoneme # mora_phoneme_listのPhoneme ID版 mora_phoneme_id_list = [OjtPhoneme(p).phoneme_id for p in mora_phoneme_list] - # numpy.repeatをfor文でやる - f0 = [] - phoneme = [] + f0 = [] # フレームごとの音高系列 + phoneme = [] # フレームごとの音素onehotベクトル系列 f0_index = 0 mean_f0 = [] for i, phoneme_length in enumerate(phoneme_length_list): + # Expects: pitchScale適用 f0_single = numpy.array(f0_list[f0_index], dtype=numpy.float32) * ( 2**audio_query.pitchScale ) + # Expects: フレームスケール化 for _ in range(int(round(phoneme_length * (24000 / 256)))): f0.append([f0_single]) + # Expects: 音素onehot化 phoneme_s = [] for _ in range(num_phoneme): phoneme_s.append(0) - # one hot + # Expects: 音素フレームスケール化 phoneme_s[phoneme_id_list[i]] = 1 phoneme.append(phoneme_s) # consonantとvowelを判別し、vowelであればf0_indexを一つ進める @@ -793,44 +802,56 @@ def synthesis_test_base(self, audio_query: AudioQuery): if f0_single > 0: mean_f0.append(f0_single) f0_index += 1 - + # Expects: 抑揚スケール適用 mean_f0 = numpy.array(mean_f0, dtype=numpy.float32).mean() f0 = numpy.array(f0, dtype=numpy.float32) for i in range(len(f0)): if f0[i][0] != 0.0: f0[i][0] = (f0[i][0] - mean_f0) * audio_query.intonationScale + mean_f0 - phoneme = numpy.array(phoneme, dtype=numpy.float32) + assert_f0_count = 0 + + # Outputs: decode_forward `f0` 引数 + decode_f0 = decode_args["f0"] + + # Test: フレームごとの音高系列 # 乱数の影響で数値の位置がずれが生じるので、大半(4/5)があっていればよしとする # また、上の部分のint(round(phoneme_length * (24000 / 256)))の影響で # 本来のf0/phonemeとテスト生成したf0/phonemeの長さが変わることがあり、 # テスト生成したものが若干長くなることがあるので、本来のものの長さを基準にassertする - assert_f0_count = 0 - decode_f0 = decode_args["f0"] for i in range(len(decode_f0)): # 乱数の影響等で数値にずれが生じるので、10の-5乗までの近似値であれば許容する assert_f0_count += math.isclose(f0[i][0], decode_f0[i][0], rel_tol=10e-5) self.assertTrue(assert_f0_count >= int(len(decode_f0) / 5) * 4) + assert_phoneme_count = 0 + + # Outputs: decode_forward `phoneme` 引数 decode_phoneme = decode_args["phoneme"] + + # Test: フレームごとの音素系列 for i in range(len(decode_phoneme)): assert_true_count = 0 for j in range(len(decode_phoneme[i])): assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j]) assert_phoneme_count += assert_true_count == num_phoneme + self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4) + + # Test: スタイルID self.assertEqual(decode_args["style_id"], 1) - # decode forwarderのmockを使う + # Expects: waveform (by mock) true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1) - + # Expects: 音量スケール適用 true_result *= audio_query.volumeScale # TODO: resampyの部分は値の検証しようがないので、パスする if audio_query.outputSamplingRate != 24000: return + # Test: assert_result_count = 0 for i in range(len(true_result)): if audio_query.outputStereo: diff --git a/voicevox_engine/full_context_label.py b/voicevox_engine/full_context_label.py index 894a56751..5ca599276 100644 --- a/voicevox_engine/full_context_label.py +++ b/voicevox_engine/full_context_label.py @@ -519,6 +519,17 @@ def labels(self): def extract_full_context_label(text: str): + """ + 日本語テキストから発話クラスを抽出 + Parameters + ---------- + text : str + 日本語テキスト + Returns + ------- + utterance : Utterance + 発話 + """ labels = pyopenjtalk.extract_fullcontext(text) phonemes = [Phoneme.from_label(label=label) for label in labels] utterance = Utterance.from_phonemes(phonemes) diff --git a/voicevox_engine/kana_parser.py b/voicevox_engine/kana_parser.py index 8e0ff845a..14efb4672 100644 --- a/voicevox_engine/kana_parser.py +++ b/voicevox_engine/kana_parser.py @@ -1,15 +1,23 @@ +""" +「AquesTalk風記法」を実装した AquesTalk風記法テキスト <-> アクセント句系列 変換。 +記法定義: `https://github.com/VOICEVOX/voicevox_engine/blob/master/README.md#読み方を-aquestalk風記法で取得修正するサンプルコード` # noqa +""" + from typing import List, Optional from .model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode from .mora_list import openjtalk_text2mora _LOOP_LIMIT = 300 -_UNVOICE_SYMBOL = "_" -_ACCENT_SYMBOL = "'" -_NOPAUSE_DELIMITER = "/" -_PAUSE_DELIMITER = "、" -_WIDE_INTERROGATION_MARK = "?" +# AquesTalk風記法特殊文字 +_UNVOICE_SYMBOL = "_" # 無声化 +_ACCENT_SYMBOL = "'" # アクセント位置 +_NOPAUSE_DELIMITER = "/" # ポーズ無しアクセント句境界 +_PAUSE_DELIMITER = "、" # ポーズ有りアクセント句境界 +_WIDE_INTERROGATION_MARK = "?" # 疑問形 + +# AquesTalk風記法とモーラの対応(音素長・音高 0 初期化、疑問形 off 初期化) _text2mora_with_unvoice = {} for text, (consonant, vowel) in openjtalk_text2mora.items(): _text2mora_with_unvoice[text] = Mora( @@ -22,6 +30,8 @@ is_interrogative=False, ) if vowel in ["a", "i", "u", "e", "o"]: + # 手前に`_`を入れると無声化 + # 例: "_ホ" -> "hO" _text2mora_with_unvoice[_UNVOICE_SYMBOL + text] = Mora( text=text, consonant=consonant if len(consonant) > 0 else None, @@ -35,9 +45,19 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase: """ - longest matchにより読み仮名からAccentPhraseを生成 - 入力長Nに対し計算量O(N^2) + 単一アクセント句に相当するAquesTalk風記法テキストからアクセント句オブジェクトを生成 + longest matchによりモーラ化。入力長Nに対し計算量O(N^2)。 + Parameters + ---------- + phrase : str + 単一アクセント句に相当するAquesTalk風記法テキスト + Returns + ------- + accent_phrase : AccentPhrase + アクセント句 """ + # NOTE: ポーズと疑問形はこの関数内で処理しない + accent_index: Optional[int] = None moras: List[Mora] = [] @@ -48,24 +68,33 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase: outer_loop = 0 while base_index < len(phrase): outer_loop += 1 + + # `'`の手前がアクセント位置 if phrase[base_index] == _ACCENT_SYMBOL: if len(moras) == 0: raise ParseKanaError(ParseKanaErrorCode.ACCENT_TOP, text=phrase) + # すでにアクセント位置がある場合はエラー if accent_index is not None: raise ParseKanaError(ParseKanaErrorCode.ACCENT_TWICE, text=phrase) accent_index = len(moras) base_index += 1 continue + + # モーラ探索 + # より長い要素からなるモーラが見つかれば上書き(longest match) + # 例: phrase "キャ" -> "キ" 検出 -> "キャ" 検出/上書き -> Mora("キャ") for watch_index in range(base_index, len(phrase)): + # アクセント位置特殊文字が来たら探索打ち切り if phrase[watch_index] == _ACCENT_SYMBOL: break - # 普通の文字の場合 stack += phrase[watch_index] if stack in _text2mora_with_unvoice: + # より長い要素からなるモーラが見つかれば上書き(longest match) + # 例: phrase "キャ" -> "キ" 検出 -> "キャ" 検出/上書き -> Mora("キャ") matched_text = stack - # push mora if matched_text is None: raise ParseKanaError(ParseKanaErrorCode.UNKNOWN_TEXT, text=stack) + # push mora else: moras.append(_text2mora_with_unvoice[matched_text].copy(deep=True)) base_index += len(matched_text) @@ -81,7 +110,15 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase: def parse_kana(text: str) -> List[AccentPhrase]: """ - AquesTalk風記法テキストをパースして音長・音高未指定のaccent phraseに変換 + AquesTalk風記法テキストからアクセント句系列を生成 + Parameters + ---------- + text : str + AquesTalk風記法テキスト + Returns + ------- + parsed_results : List[AccentPhrase] + アクセント句(音素・モーラ音高 0初期化)系列を生成 """ parsed_results: List[AccentPhrase] = [] @@ -90,6 +127,7 @@ def parse_kana(text: str) -> List[AccentPhrase]: raise ParseKanaError(ParseKanaErrorCode.EMPTY_PHRASE, position=1) for i in range(len(text) + 1): + # アクセント句境界(`/`か`、`)の出現までインデックス進展 if i == len(text) or text[i] in [_PAUSE_DELIMITER, _NOPAUSE_DELIMITER]: phrase = text[phrase_base:i] if len(phrase) == 0: @@ -99,15 +137,19 @@ def parse_kana(text: str) -> List[AccentPhrase]: ) phrase_base = i + 1 + # アクセント句末に`?`で疑問文 is_interrogative = _WIDE_INTERROGATION_MARK in phrase if is_interrogative: if _WIDE_INTERROGATION_MARK in phrase[:-1]: raise ParseKanaError( ParseKanaErrorCode.INTERROGATION_MARK_NOT_AT_END, text=phrase ) + # 疑問形はモーラでなくアクセント句属性で表現 phrase = phrase.replace(_WIDE_INTERROGATION_MARK, "") accent_phrase: AccentPhrase = _text_to_accent_phrase(phrase) + + # `、`で無音区間を挿入 if i < len(text) and text[i] == _PAUSE_DELIMITER: accent_phrase.pause_mora = Mora( text="、", @@ -125,22 +167,38 @@ def parse_kana(text: str) -> List[AccentPhrase]: def create_kana(accent_phrases: List[AccentPhrase]) -> str: + """ + アクセント句系列からAquesTalk風記法テキストを生成 + Parameters + ---------- + accent_phrases : List[AccentPhrase] + アクセント句系列 + Returns + ------- + text : str + AquesTalk風記法テキスト + """ text = "" + # アクセント句を先頭から逐次パースし、`text`末尾にAquesTalk風記法の文字を都度追加(ループ) for i, phrase in enumerate(accent_phrases): for j, mora in enumerate(phrase.moras): + # Rule3: "カナの手前に`_`を入れるとそのカナは無声化される" if mora.vowel in ["A", "I", "U", "E", "O"]: text += _UNVOICE_SYMBOL - text += mora.text + # `'`でアクセント位置 if j + 1 == phrase.accent: text += _ACCENT_SYMBOL + # Rule5: "アクセント句末に`?`(全角)を入れることにより疑問文の発音ができる" if phrase.is_interrogative: text += _WIDE_INTERROGATION_MARK if i < len(accent_phrases) - 1: if phrase.pause_mora is None: + # アクセント句区切り text += _NOPAUSE_DELIMITER else: + # 無音でアクセント句区切り text += _PAUSE_DELIMITER return text diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index fde453574..6a139a830 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -11,6 +11,16 @@ def mora_to_text(mora: str) -> str: + """ + Parameters + ---------- + mora : str + モーラ音素文字列 + Returns + ------- + mora : str + モーラ音素文字列 + """ if mora[-1:] in ["A", "I", "U", "E", "O"]: # 無声化母音を小文字に mora = mora[:-1] + mora[-1].lower() @@ -24,10 +34,18 @@ def adjust_interrogative_accent_phrases( accent_phrases: List[AccentPhrase], ) -> List[AccentPhrase]: """ - enable_interrogative_upspeakが有効になっていて与えられたaccent_phrasesに疑問系のものがあった場合、 - 各accent_phraseの末尾にある疑問系発音用のMoraに対して直前のMoraより少し音を高くすることで疑問文ぽくする - NOTE: リファクタリング時に適切な場所へ移動させること + アクセント句系列の必要に応じて疑問系に補正 + 各accent_phraseの末尾のモーラより少し音の高い有声母音モーラを付与するすることで疑問文ぽくする + Parameters + ---------- + accent_phrases : List[AccentPhrase] + アクセント句系列 + Returns + ------- + accent_phrases : List[AccentPhrase] + 必要に応じて疑問形補正されたアクセント句系列 """ + # NOTE: リファクタリング時に適切な場所へ移動させること return [ AccentPhrase( moras=adjust_interrogative_moras(accent_phrase), @@ -40,7 +58,19 @@ def adjust_interrogative_accent_phrases( def adjust_interrogative_moras(accent_phrase: AccentPhrase) -> List[Mora]: + """ + アクセント句に含まれるモーラ系列の必要に応じた疑問形補正 + Parameters + ---------- + accent_phrase : AccentPhrase + アクセント句 + Returns + ------- + moras : List[Mora] + 補正済みモーラ系列 + """ moras = copy.deepcopy(accent_phrase.moras) + # 疑問形補正条件: 疑問形フラグON & 終端有声母音 if accent_phrase.is_interrogative and not (len(moras) == 0 or moras[-1].pitch == 0): interrogative_mora = make_interrogative_mora(moras[-1]) moras.append(interrogative_mora) @@ -50,6 +80,17 @@ def adjust_interrogative_moras(accent_phrase: AccentPhrase) -> List[Mora]: def make_interrogative_mora(last_mora: Mora) -> Mora: + """ + 疑問形用のモーラ(同一母音・継続長 0.15秒・音高↑)の生成 + Parameters + ---------- + last_mora : Mora + 疑問形にするモーラ + Returns + ------- + mora : Mora + 疑問形用のモーラ + """ fix_vowel_length = 0.15 adjust_pitch = 0.3 max_pitch = 6.5 @@ -66,6 +107,17 @@ def make_interrogative_mora(last_mora: Mora) -> Mora: def full_context_label_moras_to_moras( full_context_moras: List[full_context_label.Mora], ) -> List[Mora]: + """ + Moraクラスのキャスト (`full_context_label.Mora` -> `Mora`) + Parameters + ---------- + full_context_moras : List[full_context_label.Mora] + モーラ系列 + Returns + ------- + moras : List[Mora] + モーラ系列。音素長・モーラ音高は 0 初期化 + """ return [ Mora( text=mora_to_text("".join([p.phoneme for p in mora.phonemes])), @@ -85,25 +137,30 @@ class SynthesisEngineBase(metaclass=ABCMeta): def default_sampling_rate(self) -> int: raise NotImplementedError - # FIXME: jsonではなくModelを返すようにする @property @abstractmethod def speakers(self) -> str: + """話者情報(json文字列)""" + # FIXME: jsonではなくModelを返すようにする raise NotImplementedError @property @abstractmethod def supported_devices(self) -> Optional[str]: + """ + デバイス対応情報 + Returns + ------- + 対応デバイス一覧(None: 情報取得不可) + """ raise NotImplementedError def initialize_style_id_synthesis( # noqa: B027 - self, - style_id: int, - skip_reinit: bool, + self, style_id: int, skip_reinit: bool ): """ - 指定したスタイルでの音声合成を初期化する。何度も実行可能。 - 未実装の場合は何もしない + 指定したスタイルでの音声合成を初期化する。 + 何度も実行可能。未実装の場合は何もしない。 Parameters ---------- style_id : int @@ -132,62 +189,86 @@ def replace_phoneme_length( self, accent_phrases: List[AccentPhrase], style_id: int ) -> List[AccentPhrase]: """ - accent_phrasesの母音・子音の長さを設定する + 音素長の更新 Parameters ---------- accent_phrases : List[AccentPhrase] - アクセント句モデルのリスト + アクセント句系列 style_id : int スタイルID Returns ------- accent_phrases : List[AccentPhrase] - 母音・子音の長さが設定されたアクセント句モデルのリスト + 音素長が更新されたアクセント句系列 """ raise NotImplementedError() @abstractmethod def replace_mora_pitch( - self, - accent_phrases: List[AccentPhrase], - style_id: int, + self, accent_phrases: List[AccentPhrase], style_id: int ) -> List[AccentPhrase]: """ - accent_phrasesの音高(ピッチ)を設定する + モーラ音高の更新 Parameters ---------- accent_phrases : List[AccentPhrase] - アクセント句モデルのリスト + アクセント句系列 style_id : int スタイルID Returns ------- accent_phrases : List[AccentPhrase] - 音高(ピッチ)が設定されたアクセント句モデルのリスト + モーラ音高が更新されたアクセント句系列 """ raise NotImplementedError() def replace_mora_data( - self, - accent_phrases: List[AccentPhrase], - style_id: int, + self, accent_phrases: List[AccentPhrase], style_id: int ) -> List[AccentPhrase]: + """ + 音素長・モーラ音高の更新 + Parameters + ---------- + accent_phrases : List[AccentPhrase] + アクセント句系列 + style_id : int + スタイルID + Returns + ------- + accent_phrases : List[AccentPhrase] + アクセント句系列 + """ return self.replace_mora_pitch( accent_phrases=self.replace_phoneme_length( - accent_phrases=accent_phrases, - style_id=style_id, + accent_phrases=accent_phrases, style_id=style_id ), style_id=style_id, ) def create_accent_phrases(self, text: str, style_id: int) -> List[AccentPhrase]: + """ + テキストからアクセント句系列を生成。 + 音素長やモーラ音高も更新。 + Parameters + ---------- + text : str + 日本語テキスト + style_id : int + スタイルID + Returns + ------- + accent_phrases : List[AccentPhrase] + アクセント句系列 + """ if len(text.strip()) == 0: return [] + # 音素とアクセントの推定 utterance = extract_full_context_label(text) if len(utterance.breath_groups) == 0: return [] + # Utterance -> List[AccentPharase] のキャスト & 音素長・モーラ音高の推定と更新 accent_phrases = self.replace_mora_data( accent_phrases=[ AccentPhrase( From 5e63e3660985b423fedd9aee969835720a8eddd5 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 9 Dec 2023 18:03:20 +0900 Subject: [PATCH 002/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`PresetManager`?= =?UTF-8?q?=20=E5=85=B1=E9=80=9A=E5=87=A6=E7=90=86=E3=81=AE=E9=96=A2?= =?UTF-8?q?=E6=95=B0=E5=8C=96=E3=83=BB=E8=BF=BD=E5=8A=A0=E3=82=B3=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88=20(#832)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- voicevox_engine/preset/PresetManager.py | 114 ++++++++++++------------ 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/voicevox_engine/preset/PresetManager.py b/voicevox_engine/preset/PresetManager.py index b994eb677..d06ec2b03 100644 --- a/voicevox_engine/preset/PresetManager.py +++ b/voicevox_engine/preset/PresetManager.py @@ -9,43 +9,53 @@ class PresetManager: - def __init__( - self, - preset_path: Path, - ): - self.presets = [] + """ + プリセットの管理 + + プリセットはAudioQuery全体パラメータ(話速・音高・抑揚・音量・無音長)のデフォルト値セットである。 + YAMLファイルをSSoTとする簡易データベース方式により、プリセットの管理をおこなう。 + """ + + def __init__(self, preset_path: Path): + """ + Parameters + ---------- + preset_path : Path + プリセット情報を一元管理するYAMLファイルへのパス + """ + self.presets: list[Preset] = [] self.last_modified_time = 0 self.preset_path = preset_path - def load_presets(self): + def load_presets(self) -> list[Preset]: """ - プリセットのYAMLファイルを読み込む - + 既存プリセットの読み込み Returns ------- - ret: List[Preset] - プリセットのリスト + ret: list[Preset] + 読み込まれたプリセットのリスト """ - # 設定ファイルのタイムスタンプを確認 + # データベース更新の確認(タイムスタンプベース) try: _last_modified_time = self.preset_path.stat().st_mtime if _last_modified_time == self.last_modified_time: + # 更新無し、キャッシュを返す return self.presets except OSError: raise PresetError("プリセットの設定ファイルが見つかりません") + # データベースの読み込み with open(self.preset_path, mode="r", encoding="utf-8") as f: obj = yaml.safe_load(f) if obj is None: raise PresetError("プリセットの設定ファイルが空の内容です") - try: _presets = parse_obj_as(List[Preset], obj) except ValidationError: raise PresetError("プリセットの設定ファイルにミスがあります") - # idが一意か確認 + # 全idの一意性をバリデーション if len([preset.id for preset in _presets]) != len( {preset.id for preset in _presets} ): @@ -53,40 +63,34 @@ def load_presets(self): self.presets = _presets self.last_modified_time = _last_modified_time + return self.presets def add_preset(self, preset: Preset): """ - YAMLファイルに新規のプリセットを追加する - + 新規プリセットの追加 Parameters ---------- preset : Preset - 追加するプリセットを渡す - + 新規プリセット Returns ------- ret: int - 追加したプリセットのプリセットID + 追加されたプリセットのID """ - # 手動でファイルが更新されているかも知れないので、最新のYAMLファイルを読み直す + # データベース更新の反映 self.load_presets() - # IDが0未満、または存在するIDなら新しいIDを決定し、配列に追加 + # 新規プリセットID の発行。IDが0未満、または存在するIDなら新規IDを発行 if preset.id < 0 or preset.id in {preset.id for preset in self.presets}: preset.id = max([preset.id for preset in self.presets]) + 1 + # 新規プリセットの追加 self.presets.append(preset) - # ファイルに書き込み + # 変更の反映。失敗時はリバート。 try: - with open(self.preset_path, mode="w", encoding="utf-8") as f: - yaml.safe_dump( - [preset.dict() for preset in self.presets], - f, - allow_unicode=True, - sort_keys=False, - ) + self._write_on_file() except Exception as err: self.presets.pop() if isinstance(err, FileNotFoundError): @@ -98,23 +102,21 @@ def add_preset(self, preset: Preset): def update_preset(self, preset: Preset): """ - YAMLファイルのプリセットを更新する - + 既存プリセットの更新 Parameters ---------- preset : Preset - 更新するプリセットを渡す - + 新しい既存プリセット Returns ------- ret: int - 更新したプリセットのプリセットID + 更新されたプリセットのID """ - # 手動でファイルが更新されているかも知れないので、最新のYAMLファイルを読み直す + # データベース更新の反映 self.load_presets() - # IDが存在するか探索 + # 対象プリセットの検索 prev_preset = (-1, None) for i in range(len(self.presets)): if self.presets[i].id == preset.id: @@ -124,15 +126,9 @@ def update_preset(self, preset: Preset): else: raise PresetError("更新先のプリセットが存在しません") - # ファイルに書き込み + # 変更の反映。失敗時はリバート。 try: - with open(self.preset_path, mode="w", encoding="utf-8") as f: - yaml.safe_dump( - [preset.dict() for preset in self.presets], - f, - allow_unicode=True, - sort_keys=False, - ) + self._write_on_file() except Exception as err: if prev_preset != (-1, None): self.presets[prev_preset[0]] = prev_preset[1] @@ -145,23 +141,21 @@ def update_preset(self, preset: Preset): def delete_preset(self, id: int): """ - YAMLファイルのプリセットを削除する - + 指定したIDのプリセットの削除 Parameters ---------- id: int - 削除するプリセットのプリセットIDを渡す - + 削除対象プリセットのID Returns ------- ret: int - 削除したプリセットのプリセットID + 削除されたプリセットのID """ - # 手動でファイルが更新されているかも知れないので、最新のYAMLファイルを読み直す + # データベース更新の反映 self.load_presets() - # IDが存在するか探索 + # 対象プリセットの検索 buf = None buf_index = -1 for i in range(len(self.presets)): @@ -172,17 +166,21 @@ def delete_preset(self, id: int): else: raise PresetError("削除対象のプリセットが存在しません") - # ファイルに書き込み + # 変更の反映。失敗時はリバート。 try: - with open(self.preset_path, mode="w", encoding="utf-8") as f: - yaml.safe_dump( - [preset.dict() for preset in self.presets], - f, - allow_unicode=True, - sort_keys=False, - ) + self._write_on_file() except FileNotFoundError: self.presets.insert(buf_index, buf) raise PresetError("プリセットの設定ファイルに書き込み失敗しました") return id + + def _write_on_file(self): + """プリセット情報のファイル(簡易データベース)書き込み""" + with open(self.preset_path, mode="w", encoding="utf-8") as f: + yaml.safe_dump( + [preset.dict() for preset in self.presets], + f, + allow_unicode=True, + sort_keys=False, + ) From 154d5d01e08dd57895b776806952fe4172ed6943 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 10 Dec 2023 02:04:37 +0900 Subject: [PATCH 003/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=82=B0?= =?UTF-8?q?=E3=83=AD=E3=83=BC=E3=83=90=E3=83=AB=E7=89=B9=E5=BE=B4=E9=87=8F?= =?UTF-8?q?=E9=81=A9=E7=94=A8=E3=81=AE=E9=96=A2=E6=95=B0=E5=8C=96=20(#819)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- test/test_synthesis_engine.py | 144 +++++++++++++- .../synthesis_engine/synthesis_engine.py | 179 ++++++++++++++---- 2 files changed, 285 insertions(+), 38 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index e155c2649..9ff7fb563 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -13,11 +13,17 @@ # TODO: import from voicevox_engine.synthesis_engine.mora from voicevox_engine.synthesis_engine.synthesis_engine import ( + apply_intonation_scale, + apply_output_sampling_rate, + apply_output_stereo, + apply_pitch_scale, + apply_prepost_silence, + apply_speed_scale, + apply_volume_scale, calc_frame_per_phoneme, calc_frame_phoneme, calc_frame_pitch, mora_phoneme_list, - pad_with_silence, pre_process, split_mora, to_flatten_moras, @@ -173,8 +179,8 @@ def _gen_mora( ) -def test_pad_with_silence(): - """Test `pad_with_silence`.""" +def test_apply_prepost_silence(): + """Test `apply_prepost_silence`.""" # Inputs query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067) moras = [ @@ -189,11 +195,139 @@ def test_pad_with_silence(): ] # Outputs - moras_with_silence = pad_with_silence(moras, query) + moras_with_silence = apply_prepost_silence(moras, query) assert moras_with_silence == true_moras_with_silence +def test_apply_speed_scale(): + """Test `apply_speed_scale`.""" + # Inputs + query = _gen_query(speedScale=2.0) + input_moras = [ + _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + ] + + # Expects - x2 fast + true_moras = [ + _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + ] + + # Outputs + moras = apply_speed_scale(input_moras, query) + + assert moras == true_moras + + +def test_apply_pitch_scale(): + """Test `apply_pitch_scale`.""" + # Inputs + query = _gen_query(pitchScale=2.0) + input_moras = [ + _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), + _gen_mora("ン", None, None, "N", 0.0, 50.0), + _gen_mora("、", None, None, "pau", 0.0, 0.0), + _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), + _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + ] + + # Expects - x4 value scaled + true_moras = [ + _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), + _gen_mora("ン", None, None, "N", 0.0, 200.0), + _gen_mora("、", None, None, "pau", 0.0, 0.0), + _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + ] + + # Outputs + moras = apply_pitch_scale(input_moras, query) + + assert moras == true_moras + + +def test_apply_intonation_scale(): + """Test `apply_intonation_scale`.""" + # Inputs + query = _gen_query(intonationScale=0.5) + input_moras = [ + _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0), + _gen_mora("ン", None, None, "N", 0.0, 200.0), + _gen_mora("、", None, None, "pau", 0.0, 0.0), + _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0), + _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + ] + + # Expects - mean=300 var x0.5 intonation scaling + true_moras = [ + _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0), + _gen_mora("ン", None, None, "N", 0.0, 250.0), + _gen_mora("、", None, None, "pau", 0.0, 0.0), + _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0), + _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), + ] + + # Outputs + moras = apply_intonation_scale(input_moras, query) + + assert moras == true_moras + + +def test_apply_volume_scale(): + """Test `apply_volume_scale`.""" + # Inputs + query = _gen_query(volumeScale=3.0) + input_wave = numpy.array([0.0, 1.0, 2.0]) + + # Expects - x3 scale + true_wave = numpy.array([0.0, 3.0, 6.0]) + + # Outputs + wave = apply_volume_scale(input_wave, query) + + assert numpy.allclose(wave, true_wave) + + +def test_apply_output_sampling_rate(): + """Test `apply_output_sampling_rate`.""" + # Inputs + query = _gen_query(outputSamplingRate=12000) + input_wave = numpy.array([1.0 for _ in range(120)]) + input_sr_wave = 24000 + + # Expects - half sampling rate + true_wave = numpy.array([1.0 for _ in range(60)]) + assert true_wave.shape == (60,), "Prerequisites" + + # Outputs + wave = apply_output_sampling_rate(input_wave, input_sr_wave, query) + + assert wave.shape[0] == true_wave.shape[0] + + +def test_apply_output_stereo(): + """Test `apply_output_stereo`.""" + # Inputs + query = _gen_query(outputStereo=True) + input_wave = numpy.array([1.0, 0.0, 2.0]) + + # Expects - Stereo :: (Time, Channel) + true_wave = numpy.array([[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]]) + + # Outputs + wave = apply_output_stereo(input_wave, query) + + assert numpy.array_equal(wave, true_wave) + + def test_calc_frame_per_phoneme(): """Test `calc_frame_per_phoneme`.""" # Inputs @@ -325,7 +459,7 @@ def test_feat_to_framescale(): assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites" # Outputs - flatten_moras = pad_with_silence(flatten_moras, query) + flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme) frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 9bd7dde56..9fa12d3a5 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -1,8 +1,10 @@ +import math import threading from itertools import chain from typing import List, Optional, Tuple import numpy +from numpy import ndarray from soxr import resample from ..acoustic_feature_extractor import OjtPhoneme @@ -112,8 +114,9 @@ def generate_silence_mora(length: float) -> Mora: return Mora(text=" ", vowel="sil", vowel_length=length, pitch=0.0) -def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: - """モーラ列の先頭/最後尾へqueryに基づいた無音モーラを追加 +def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: + """ + 前後無音(`prePhonemeLength` & `postPhonemeLength`)の適用 Parameters ---------- moras : List[Mora] @@ -131,6 +134,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras +def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: + """ + 話速スケール(`speedScale`)の適用 + Parameters + ---------- + moras : list[Mora] + モーラ系列 + query : AudioQuery + 音声合成クエリ + Returns + ------- + moras : list[Mora] + 話速スケールが適用されたモーラ系列 + """ + for mora in moras: + mora.vowel_length /= query.speedScale + if mora.consonant_length: + mora.consonant_length /= query.speedScale + return moras + + def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): """ 音素あたりのフレーム長を算出 @@ -145,6 +169,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): frame_per_phoneme : NDArray[] 音素あたりのフレーム長。端数丸め。 """ + # Apply: グローバル特徴量による補正(話速) + moras = apply_speed_scale(moras, query) + # 音素あたりの継続長 sec_per_phoneme = numpy.array( [ @@ -157,10 +184,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): ], dtype=numpy.float32, ) - - # 話速による継続長の補正 - sec_per_phoneme /= query.speedScale - # 音素あたりのフレーム長。端数丸め。 framerate = 24000 / 256 # framerate 93.75 [frame/sec] frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32) @@ -168,6 +191,48 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): return frame_per_phoneme +def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: + """ + 音高スケール(`pitchScale`)の適用 + Parameters + ---------- + moras : list[Mora] + モーラ系列 + query : AudioQuery + 音声合成クエリ + Returns + ------- + moras : list[Mora] + 音高スケールが適用されたモーラ系列 + """ + for mora in moras: + mora.pitch *= 2**query.pitchScale + return moras + + +def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: + """ + 抑揚スケール(`intonationScale`)の適用 + Parameters + ---------- + moras : list[Mora] + モーラ系列 + query : AudioQuery + 音声合成クエリ + Returns + ------- + moras : list[Mora] + 抑揚スケールが適用されたモーラ系列 + """ + # 有声音素 (f0>0) の平均値に対する乖離度をスケール + voiced = list(filter(lambda mora: mora.pitch > 0, moras)) + mean_f0 = numpy.mean(list(map(lambda mora: mora.pitch, voiced))).item() + if mean_f0 != math.nan: # 空リスト -> NaN + for mora in voiced: + mora.pitch = (mora.pitch - mean_f0) * query.intonationScale + mean_f0 + return moras + + def calc_frame_pitch( query: AudioQuery, moras: List[Mora], @@ -191,30 +256,41 @@ def calc_frame_pitch( frame_f0 : NDArray[] フレームごとの基本周波数系列 """ + moras = apply_pitch_scale(moras, query) + moras = apply_intonation_scale(moras, query) + # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) # モーラごとの基本周波数 f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32) - # 音高スケールによる補正 - f0 *= 2**query.pitchScale - - # 抑揚スケールによる補正。有声音素 (f0>0) の平均値に対する乖離度をスケール - voiced = f0 > 0 - mean_f0 = f0[voiced].mean() - if not numpy.isnan(mean_f0): - f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0 - - # フレームごとのピッチ化 + # Rescale: 時間スケールの変更(モーラ -> フレーム) # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約 vowel_indexes = numpy.array(split_mora(phonemes)[2]) frame_per_mora = [ a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1) ] - # モーラの基本周波数を子音・母音に割当てフレーム化 frame_f0 = numpy.repeat(f0, frame_per_mora) return frame_f0 +def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray: + """ + 音量スケール(`volumeScale`)の適用 + Parameters + ---------- + wave : numpy.ndarray + 音声波形 + query : AudioQuery + 音声合成クエリ + Returns + ------- + wave : numpy.ndarray + 音量スケールが適用された音声波形 + """ + wave *= query.volumeScale + return wave + + def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray): """ フレームごとの音素列の生成(onehot化 + フレーム化) @@ -230,11 +306,59 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar フレームごとの音素系列 """ # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) + # Convert: Core入力形式への変換(onehotベクトル系列) onehot_phoneme = numpy.stack([p.onehot for p in phonemes]) + + # Rescale: 時間スケールの変更(音素 -> フレーム) frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0) return frame_phoneme +def apply_output_sampling_rate( + wave: ndarray, sr_wave: int, query: AudioQuery +) -> ndarray: + """ + 出力サンプリングレート(`outputSamplingRate`)の適用 + Parameters + ---------- + wave : ndarray + 音声波形 + sr_wave : int + `wave`のサンプリングレート + query : AudioQuery + 音声合成クエリ + Returns + ------- + wave : ndarray + 出力サンプリングレートが適用された音声波形 + """ + # サンプリングレート一致のときはスルー + if sr_wave == query.outputSamplingRate: + return wave + + wave = resample(wave, sr_wave, query.outputSamplingRate) + return wave + + +def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: + """ + ステレオ出力(`outputStereo`)の適用 + Parameters + ---------- + wave : ndarray + 音声波形 + query : AudioQuery + 音声合成クエリ + Returns + ------- + wave : ndarray + ステレオ出力設定が適用された音声波形 + """ + if query.outputStereo: + wave = numpy.array([wave, wave]).T + return wave + + class SynthesisEngine(SynthesisEngineBase): """音声合成器(core)の管理/実行/プロキシと音声合成フロー""" @@ -493,7 +617,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): # AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする flatten_moras, phoneme_data_list = pre_process(query.accent_phrases) - flatten_moras = pad_with_silence(flatten_moras, query) + flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) f0 = calc_frame_pitch( query, flatten_moras, phoneme_data_list, frame_per_phoneme @@ -509,21 +633,10 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): phoneme=phoneme, style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1), ) + sr_wave = self.default_sampling_rate - # volume: ゲイン適用 - wave *= query.volumeScale - - # 出力サンプリングレートがデフォルト(decode forwarderによるもの、24kHz)でなければ、それを適用する - if query.outputSamplingRate != self.default_sampling_rate: - wave = resample( - wave, - self.default_sampling_rate, - query.outputSamplingRate, - ) - - # ステレオ変換 - # 出力設定がステレオなのであれば、ステレオ化する - if query.outputStereo: - wave = numpy.array([wave, wave]).T + wave = apply_volume_scale(wave, query) + wave = apply_output_sampling_rate(wave, sr_wave, query) + wave = apply_output_stereo(wave, query) return wave From d0a596d06a57f48839bd685953069dbac8f11ead Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 10 Dec 2023 02:07:00 +0900 Subject: [PATCH 004/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`default=5Fsettin?= =?UTF-8?q?g.yml`=20=E6=8B=A1=E5=BC=B5=E5=AD=90=E5=A4=89=E6=9B=B4=20(#837)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- default_setting.yml => default_setting.yaml | 0 run.spec | 2 +- test/e2e/conftest.py | 2 +- voicevox_engine/setting/SettingLoader.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename default_setting.yml => default_setting.yaml (100%) diff --git a/Dockerfile b/Dockerfile index 5a5279235..ef920ecf4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -227,7 +227,7 @@ COPY --from=download-onnxruntime-env /opt/onnxruntime /opt/onnxruntime # Add local files ADD ./voicevox_engine /opt/voicevox_engine/voicevox_engine ADD ./docs /opt/voicevox_engine/docs -ADD ./run.py ./generate_licenses.py ./presets.yaml ./default.csv ./default_setting.yml ./engine_manifest.json /opt/voicevox_engine/ +ADD ./run.py ./generate_licenses.py ./presets.yaml ./default.csv ./default_setting.yaml ./engine_manifest.json /opt/voicevox_engine/ ADD ./speaker_info /opt/voicevox_engine/speaker_info ADD ./ui_template /opt/voicevox_engine/ui_template ADD ./engine_manifest_assets /opt/voicevox_engine/engine_manifest_assets diff --git a/default_setting.yml b/default_setting.yaml similarity index 100% rename from default_setting.yml rename to default_setting.yaml diff --git a/run.spec b/run.spec index b84c82408..9f73c6b07 100644 --- a/run.spec +++ b/run.spec @@ -10,7 +10,7 @@ datas = [ ('default.csv', '.'), ('licenses.json', '.'), ('presets.yaml', '.'), - ('default_setting.yml', '.'), + ('default_setting.yaml', '.'), ('ui_template', 'ui_template'), ] datas += collect_data_files('pyopenjtalk') diff --git a/test/e2e/conftest.py b/test/e2e/conftest.py index af21590c1..b656886b6 100644 --- a/test/e2e/conftest.py +++ b/test/e2e/conftest.py @@ -14,7 +14,7 @@ def client(): synthesis_engines = make_synthesis_engines(use_gpu=False) latest_core_version = get_latest_core_version(versions=synthesis_engines.keys()) - setting_loader = SettingLoader(Path("./default_setting.yml")) + setting_loader = SettingLoader(Path("./default_setting.yaml")) preset_manager = PresetManager( # FIXME: impl MockPresetManager preset_path=Path("./presets.yaml"), ) diff --git a/voicevox_engine/setting/SettingLoader.py b/voicevox_engine/setting/SettingLoader.py index a78952e96..3f1669e26 100644 --- a/voicevox_engine/setting/SettingLoader.py +++ b/voicevox_engine/setting/SettingLoader.py @@ -5,7 +5,7 @@ from ..utility import engine_root, get_save_dir from .Setting import Setting -DEFAULT_SETTING_PATH: Path = engine_root() / "default_setting.yml" +DEFAULT_SETTING_PATH: Path = engine_root() / "default_setting.yaml" USER_SETTING_PATH: Path = get_save_dir() / "setting.yml" From 5d7562c51364e4672462f5110c4c87e1a8eaf145 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 10 Dec 2023 06:44:35 +0900 Subject: [PATCH 005/177] =?UTF-8?q?Refactor:=20`frame=5Fper=5Fmora`=20?= =?UTF-8?q?=E3=81=AB=E3=82=88=E3=82=8B=E7=BD=AE=E3=81=8D=E6=8F=9B=E3=81=88?= =?UTF-8?q?=20(#841)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 48 +++++++++++++------ .../synthesis_engine/synthesis_engine.py | 43 ++++++++++------- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 9ff7fb563..eee0cae66 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -20,6 +20,7 @@ apply_prepost_silence, apply_speed_scale, apply_volume_scale, + calc_frame_per_mora, calc_frame_per_phoneme, calc_frame_phoneme, calc_frame_pitch, @@ -353,24 +354,43 @@ def test_calc_frame_per_phoneme(): assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme) +def test_calc_frame_per_mora(): + """Test `calc_frame_per_mora`.""" + # Inputs + moras = [ + _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] + _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), + _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), + _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), + _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), + ] + + # Expects + # Pre ko N pau hi hO Pst + true_frame_per_mora = [2, 6, 4, 2, 6, 6, 6] + true_frame_per_mora = numpy.array(true_frame_per_mora, dtype=numpy.int32) + + # Outputs + frame_per_phoneme = numpy.array(list(map(calc_frame_per_mora, moras))) + + assert numpy.array_equal(frame_per_phoneme, true_frame_per_mora) + + def test_calc_frame_pitch(): """Test `test_calc_frame_pitch`.""" # Inputs query = _gen_query(pitchScale=2.0, intonationScale=0.5) moras = [ - _gen_mora(" ", None, None, " ", 0.0, 0.0), - _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), - _gen_mora("ン", None, None, "N", 0.0, 50.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), - _gen_mora(" ", None, None, " ", 0.0, 0.0), + _gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0), + _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + _gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0), ] - phoneme_str = "pau k o N pau h i h O pau" - phonemes = [OjtPhoneme(p) for p in phoneme_str.split()] - # Pre k o N pau h i h O Pst - frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3] - frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32) # Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling # pau ko ko ko N N @@ -382,7 +402,7 @@ def test_calc_frame_pitch(): true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme) + f0 = calc_frame_pitch(query, moras) assert numpy.array_equal(f0, true_f0) @@ -461,7 +481,7 @@ def test_feat_to_framescale(): # Outputs flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme) + f0 = calc_frame_pitch(query, flatten_moras) frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) assert numpy.array_equal(frame_phoneme, true_frame_phoneme) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 9fa12d3a5..d36d9a407 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -191,6 +191,29 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): return frame_per_phoneme +def _to_frame(sec: float) -> ndarray: + FRAMERATE = 93.75 # 24000 / 256 [frame/sec] + return numpy.round(sec * FRAMERATE).astype(numpy.int32) + + +def calc_frame_per_mora(mora: Mora) -> ndarray: + """ + モーラあたりのフレーム長を算出 + Parameters + ---------- + mora : Mora + モーラ + Returns + ------- + frame_per_mora : NDArray[] + モーラあたりのフレーム長。端数丸め。 + """ + # 音素ごとにフレーム長を算出し、和をモーラのフレーム長とする + vowel_frames = _to_frame(mora.vowel_length) + consonant_frames = _to_frame(mora.consonant_length) if mora.consonant else 0 + return vowel_frames + consonant_frames + + def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """ 音高スケール(`pitchScale`)の適用 @@ -233,12 +256,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_pitch( - query: AudioQuery, - moras: List[Mora], - phonemes: List[OjtPhoneme], - frame_per_phoneme: numpy.ndarray, -): +def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray: """ フレームごとのピッチの生成 Parameters @@ -247,10 +265,6 @@ def calc_frame_pitch( 音声合成クエリ moras : List[Mora] モーラ列 - phonemes : List[OjtPhoneme] - 音素列 - frame_per_phoneme: NDArray - 音素あたりのフレーム長。端数丸め。 Returns ------- frame_f0 : NDArray[] @@ -265,10 +279,7 @@ def calc_frame_pitch( # Rescale: 時間スケールの変更(モーラ -> フレーム) # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約 - vowel_indexes = numpy.array(split_mora(phonemes)[2]) - frame_per_mora = [ - a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1) - ] + frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras))) frame_f0 = numpy.repeat(f0, frame_per_mora) return frame_f0 @@ -619,9 +630,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch( - query, flatten_moras, phoneme_data_list, frame_per_phoneme - ) + f0 = calc_frame_pitch(query, flatten_moras) phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する From cc10270cb3f5fed3340c883f92d8380f98a1bbc6 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 11 Dec 2023 16:36:42 +0900 Subject: [PATCH 006/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=95?= =?UTF-8?q?=E3=83=AC=E3=83=BC=E3=83=A0=E8=A8=88=E7=AE=97=E3=81=AE=20`=5Fto?= =?UTF-8?q?=5Fframe`=20=E3=81=AB=E3=82=88=E3=82=8B=E5=85=B1=E9=80=9A?= =?UTF-8?q?=E5=8C=96=20(#844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor: `_to_frame` による共通化 --- .../synthesis_engine/synthesis_engine.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index d36d9a407..410750383 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -172,22 +172,12 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): # Apply: グローバル特徴量による補正(話速) moras = apply_speed_scale(moras, query) - # 音素あたりの継続長 - sec_per_phoneme = numpy.array( - [ - length - for mora in moras - for length in ( - [mora.consonant_length] if mora.consonant is not None else [] - ) - + [mora.vowel_length] - ], - dtype=numpy.float32, - ) - # 音素あたりのフレーム長。端数丸め。 - framerate = 24000 / 256 # framerate 93.75 [frame/sec] - frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32) - + frame_per_phoneme: list[ndarray] = [] + for mora in moras: + if mora.consonant: + frame_per_phoneme.append(_to_frame(mora.consonant_length)) + frame_per_phoneme.append(_to_frame(mora.vowel_length)) + frame_per_phoneme = numpy.array(frame_per_phoneme) return frame_per_phoneme From 89a8b5342872b3d0c14a8f6e0af92903d2b199dc Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 12 Dec 2023 00:19:12 +0900 Subject: [PATCH 007/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=82=B3?= =?UTF-8?q?=E3=82=A2CDLL=E5=9E=8B=E4=BB=98=E3=81=91=E3=81=AE=E5=88=87?= =?UTF-8?q?=E3=82=8A=E5=87=BA=E3=81=97=20(#843)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor: コアCDLL型付けの切り出し * Fix: 参照渡しの戻し廃止 --- .../synthesis_engine/core_wrapper.py | 90 +++++++++++++------ 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/voicevox_engine/synthesis_engine/core_wrapper.py b/voicevox_engine/synthesis_engine/core_wrapper.py index 14afa427f..dcb2da101 100644 --- a/voicevox_engine/synthesis_engine/core_wrapper.py +++ b/voicevox_engine/synthesis_engine/core_wrapper.py @@ -378,6 +378,64 @@ def load_core(core_dir: Path, use_gpu: bool) -> CDLL: raise RuntimeError(f"このコンピュータのアーキテクチャ {platform.machine()} で利用可能なコアがありません") +def _type_yukarin_s_forward(core_cdll: CDLL) -> None: + """ + コアDLL `yukarin_s_forward` 関数の型付け + Parameters + ---------- + core_cdll : CDLL + コアDLL + """ + core_cdll.yukarin_s_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.yukarin_s_forward.restype = c_bool + + +def _type_yukarin_sa_forward(core_cdll: CDLL) -> None: + """ + コアDLL `yukarin_sa_forward` 関数の型付け + Parameters + ---------- + core_cdll : CDLL + コアDLL + """ + core_cdll.yukarin_sa_forward.argtypes = ( + c_int, + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.yukarin_sa_forward.restype = c_bool + + +def _type_decode_forward(core_cdll: CDLL) -> None: + """ + コアDLL `decode_forward` 関数の型付け + Parameters + ---------- + core_cdll : CDLL + コアDLL + """ + core_cdll.decode_forward.argtypes = ( + c_int, + c_int, + POINTER(c_float), + POINTER(c_float), + POINTER(c_long), + POINTER(c_float), + ) + core_cdll.decode_forward.restype = c_bool + + class CoreWrapper: def __init__( self, @@ -393,9 +451,9 @@ def __init__( self.core.initialize.restype = c_bool self.core.metas.restype = c_char_p - self.core.yukarin_s_forward.restype = c_bool - self.core.yukarin_sa_forward.restype = c_bool - self.core.decode_forward.restype = c_bool + _type_yukarin_s_forward(self.core) + _type_yukarin_sa_forward(self.core) + _type_decode_forward(self.core) self.core.last_error_message.restype = c_char_p self.exist_supported_devices = False @@ -426,32 +484,6 @@ def __init__( self.exist_finalize = True exist_cpu_num_threads = True - self.core.yukarin_s_forward.argtypes = ( - c_int, - POINTER(c_long), - POINTER(c_long), - POINTER(c_float), - ) - self.core.yukarin_sa_forward.argtypes = ( - c_int, - POINTER(c_long), - POINTER(c_long), - POINTER(c_long), - POINTER(c_long), - POINTER(c_long), - POINTER(c_long), - POINTER(c_long), - POINTER(c_float), - ) - self.core.decode_forward.argtypes = ( - c_int, - c_int, - POINTER(c_float), - POINTER(c_float), - POINTER(c_long), - POINTER(c_float), - ) - cwd = os.getcwd() os.chdir(core_dir) try: From 3c971cc2e634d9e20df22ab638d7319f523747aa Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 12 Dec 2023 00:23:00 +0900 Subject: [PATCH 008/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`default=5Fsettin?= =?UTF-8?q?g.yaml`=20=E3=81=AE=E5=BB=83=E6=AD=A2=20(#855)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor: `default_setting.yaml` の廃止 --- Dockerfile | 2 +- default_setting.yaml | 2 -- run.spec | 1 - test/e2e/conftest.py | 2 +- voicevox_engine/setting/SettingLoader.py | 15 ++++++++++++--- 5 files changed, 14 insertions(+), 8 deletions(-) delete mode 100644 default_setting.yaml diff --git a/Dockerfile b/Dockerfile index ef920ecf4..545449a7c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -227,7 +227,7 @@ COPY --from=download-onnxruntime-env /opt/onnxruntime /opt/onnxruntime # Add local files ADD ./voicevox_engine /opt/voicevox_engine/voicevox_engine ADD ./docs /opt/voicevox_engine/docs -ADD ./run.py ./generate_licenses.py ./presets.yaml ./default.csv ./default_setting.yaml ./engine_manifest.json /opt/voicevox_engine/ +ADD ./run.py ./generate_licenses.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ ADD ./speaker_info /opt/voicevox_engine/speaker_info ADD ./ui_template /opt/voicevox_engine/ui_template ADD ./engine_manifest_assets /opt/voicevox_engine/engine_manifest_assets diff --git a/default_setting.yaml b/default_setting.yaml deleted file mode 100644 index 3421e7a6a..000000000 --- a/default_setting.yaml +++ /dev/null @@ -1,2 +0,0 @@ -allow_origin: null -cors_policy_mode: localapps diff --git a/run.spec b/run.spec index 9f73c6b07..970f2adfa 100644 --- a/run.spec +++ b/run.spec @@ -10,7 +10,6 @@ datas = [ ('default.csv', '.'), ('licenses.json', '.'), ('presets.yaml', '.'), - ('default_setting.yaml', '.'), ('ui_template', 'ui_template'), ] datas += collect_data_files('pyopenjtalk') diff --git a/test/e2e/conftest.py b/test/e2e/conftest.py index b656886b6..b6eab18ae 100644 --- a/test/e2e/conftest.py +++ b/test/e2e/conftest.py @@ -14,7 +14,7 @@ def client(): synthesis_engines = make_synthesis_engines(use_gpu=False) latest_core_version = get_latest_core_version(versions=synthesis_engines.keys()) - setting_loader = SettingLoader(Path("./default_setting.yaml")) + setting_loader = SettingLoader(Path("./not_exist.yaml")) preset_manager = PresetManager( # FIXME: impl MockPresetManager preset_path=Path("./presets.yaml"), ) diff --git a/voicevox_engine/setting/SettingLoader.py b/voicevox_engine/setting/SettingLoader.py index 3f1669e26..453b7a94d 100644 --- a/voicevox_engine/setting/SettingLoader.py +++ b/voicevox_engine/setting/SettingLoader.py @@ -2,21 +2,30 @@ import yaml -from ..utility import engine_root, get_save_dir +from ..utility import get_save_dir from .Setting import Setting -DEFAULT_SETTING_PATH: Path = engine_root() / "default_setting.yaml" USER_SETTING_PATH: Path = get_save_dir() / "setting.yml" class SettingLoader: def __init__(self, setting_file_path: Path) -> None: + """ + 設定ファイルの管理 + Parameters + ---------- + setting_file_path : Path + 設定ファイルのパス。存在しない場合はデフォルト値を設定。 + """ self.setting_file_path = setting_file_path def load_setting_file(self) -> Setting: + # 設定値の読み込み if not self.setting_file_path.is_file(): - setting = yaml.safe_load(DEFAULT_SETTING_PATH.read_text(encoding="utf-8")) + # 設定ファイルが存在しないためデフォルト値を取得 + setting = {"allow_origin": None, "cors_policy_mode": "localapps"} else: + # 指定された設定ファイルから値を取得 setting = yaml.safe_load(self.setting_file_path.read_text(encoding="utf-8")) setting = Setting( From 5f0b4a7d4cc1bb8ecc6df234de1b6c21ed847ce9 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 00:50:52 +0900 Subject: [PATCH 009/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`OjtPhoneme`=20?= =?UTF-8?q?=E3=82=AF=E3=83=A9=E3=82=B9=E5=A4=89=E6=95=B0=E3=81=AE=E3=83=97?= =?UTF-8?q?=E3=83=A9=E3=82=A4=E3=83=99=E3=83=BC=E3=83=88=E5=8C=96=20(#846)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_acoustic_feature_extractor.py | 20 ++--- test/test_synthesis_engine.py | 2 +- voicevox_engine/acoustic_feature_extractor.py | 82 +++++-------------- 3 files changed, 29 insertions(+), 75 deletions(-) diff --git a/test/test_acoustic_feature_extractor.py b/test/test_acoustic_feature_extractor.py index 94ef7ac63..24c70d284 100644 --- a/test/test_acoustic_feature_extractor.py +++ b/test/test_acoustic_feature_extractor.py @@ -2,6 +2,8 @@ from voicevox_engine.acoustic_feature_extractor import OjtPhoneme +TRUE_NUM_PHONEME = 45 + class TestOjtPhoneme(TestCase): def setUp(self): @@ -10,17 +12,13 @@ def setUp(self): hello_hiho = "sil k o N n i ch i w a pau h i h o d e s U sil".split() self.ojt_hello_hiho = [OjtPhoneme(s) for s in hello_hiho] - def test_phoneme_list(self): - self.assertEqual(OjtPhoneme.phoneme_list[1], "A") - self.assertEqual(OjtPhoneme.phoneme_list[14], "e") - self.assertEqual(OjtPhoneme.phoneme_list[26], "m") - self.assertEqual(OjtPhoneme.phoneme_list[38], "ts") - self.assertEqual(OjtPhoneme.phoneme_list[41], "v") - def test_const(self): - TRUE_NUM_PHONEME = 45 - self.assertEqual(OjtPhoneme.num_phoneme, TRUE_NUM_PHONEME) - self.assertEqual(OjtPhoneme.space_phoneme, "pau") + self.assertEqual(OjtPhoneme._NUM_PHONEME, TRUE_NUM_PHONEME) + self.assertEqual(OjtPhoneme._PHONEME_LIST[1], "A") + self.assertEqual(OjtPhoneme._PHONEME_LIST[14], "e") + self.assertEqual(OjtPhoneme._PHONEME_LIST[26], "m") + self.assertEqual(OjtPhoneme._PHONEME_LIST[38], "ts") + self.assertEqual(OjtPhoneme._PHONEME_LIST[41], "v") def test_convert(self): sil_phoneme = OjtPhoneme("sil") @@ -56,7 +54,7 @@ def test_onehot(self): 0, ] for i, phoneme in enumerate(self.ojt_hello_hiho): - for j in range(OjtPhoneme.num_phoneme): + for j in range(TRUE_NUM_PHONEME): if phoneme_id_list[i] == j: self.assertEqual(phoneme.onehot[j], 1.0) else: diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index eee0cae66..bdb6de486 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -928,7 +928,7 @@ def synthesis_test_base(self, audio_query: AudioQuery): ) # Expects: Apply/Convert/Rescale - num_phoneme = OjtPhoneme.num_phoneme + num_phoneme = 45 # mora_phoneme_listのPhoneme ID版 mora_phoneme_id_list = [OjtPhoneme(p).phoneme_id for p in mora_phoneme_list] diff --git a/voicevox_engine/acoustic_feature_extractor.py b/voicevox_engine/acoustic_feature_extractor.py index 516727056..066bfbdab 100644 --- a/voicevox_engine/acoustic_feature_extractor.py +++ b/voicevox_engine/acoustic_feature_extractor.py @@ -1,74 +1,30 @@ import numpy +# 音素のリスト +_P_LIST1 = ("pau", "A", "E", "I", "N", "O", "U", "a", "b", "by") +_P_LIST2 = ("ch", "cl", "d", "dy", "e", "f", "g", "gw", "gy", "h") +_P_LIST3 = ("hy", "i", "j", "k", "kw", "ky", "m", "my", "n", "ny") +_P_LIST4 = ("o", "p", "py", "r", "ry", "s", "sh", "t", "ts", "ty") +_P_LIST5 = ("u", "v", "w", "y", "z") +_PHONEME_LIST = _P_LIST1 + _P_LIST2 + _P_LIST3 + _P_LIST4 + _P_LIST5 + +# 音素リストの要素数 +_NUM_PHONEME = len(_PHONEME_LIST) + class OjtPhoneme: """ - OpenJTalkに含まれる音素群クラス - - Attributes - ---------- - phoneme_list : Sequence[str] - 音素のリスト - num_phoneme : int - 音素リストの要素数 - space_phoneme : str - 読点に値する音素 + OpenJTalkに含まれる音素 """ - phoneme_list = ( - "pau", - "A", - "E", - "I", - "N", - "O", - "U", - "a", - "b", - "by", - "ch", - "cl", - "d", - "dy", - "e", - "f", - "g", - "gw", - "gy", - "h", - "hy", - "i", - "j", - "k", - "kw", - "ky", - "m", - "my", - "n", - "ny", - "o", - "p", - "py", - "r", - "ry", - "s", - "sh", - "t", - "ts", - "ty", - "u", - "v", - "w", - "y", - "z", - ) - num_phoneme = len(phoneme_list) - space_phoneme = "pau" + _PHONEME_LIST = _PHONEME_LIST + _NUM_PHONEME = _NUM_PHONEME def __init__(self, phoneme: str): - # `sil`-to-`pau` (silent to space_phoneme) conversion + # 無音をポーズに変換 if "sil" in phoneme: - phoneme = self.space_phoneme + phoneme = "pau" + self.phoneme = phoneme def __eq__(self, o: object): @@ -84,7 +40,7 @@ def phoneme_id(self): id : int phoneme_idを返す """ - return self.phoneme_list.index(self.phoneme) + return self._PHONEME_LIST.index(self.phoneme) @property def onehot(self): @@ -95,6 +51,6 @@ def onehot(self): onehot : numpy.ndarray 音素onehotベクトル(listの長さ分の0埋め配列のうち、phoneme id番目が1.0の配列) """ - array = numpy.zeros(self.num_phoneme, dtype=numpy.float32) + array = numpy.zeros(self._NUM_PHONEME, dtype=numpy.float32) array[self.phoneme_id] = 1.0 return array From 3e6a7395f9c9a3d800b111ca653fbba47d77a4a6 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 02:55:00 +0900 Subject: [PATCH 010/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E8=B2=A2?= =?UTF-8?q?=E7=8C=AE=E8=80=85=E3=82=AC=E3=82=A4=E3=83=89=E7=AF=80=E3=81=B8?= =?UTF-8?q?=E3=81=AE=E9=9B=86=E7=B4=84=20(#838)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 136 +++++++++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index bb2488199..e7aef765c 100644 --- a/README.md +++ b/README.md @@ -350,26 +350,16 @@ docker run --rm --gpus all -p '127.0.0.1:50021:50021' voicevox/voicevox_engine:n GPU 版を利用する場合、環境によってエラーが発生することがあります。その場合、`--runtime=nvidia`を`docker run`につけて実行すると解決できることがあります。 -## 貢献者の方へ +## 実行環境構築 -Issue を解決するプルリクエストを作成される際は、別の方と同じ Issue に取り組むことを避けるため、 -Issue 側で取り組み始めたことを伝えるか、最初に Draft プルリクエストを作成してください。 - -[VOICEVOX 非公式 Discord サーバー](https://discord.gg/WMwWetrzuh)にて、開発の議論や雑談を行っています。気軽にご参加ください。 - -## 環境構築 - -`Python 3.11.3` を用いて開発されています。 -インストールするには、各 OS ごとの C/C++ コンパイラ、CMake が必要になります。 +`Python 3.11.3` が動作確認済みの環境です。 +この環境へ必要なライブラリをインストールしてください。 ```bash -# 開発に必要なライブラリのインストール -python -m pip install -r requirements-dev.txt -r requirements-test.txt - -# とりあえず実行したいだけなら代わりにこちら python -m pip install -r requirements.txt ``` + ## 実行 コマンドライン引数の詳細は以下のコマンドで確認してください。 @@ -458,51 +448,43 @@ Mac では、`--runtime_dir`引数の代わりに`DYLD_LIBRARY_PATH`の指定が DYLD_LIBRARY_PATH="/path/to/onnx" python run.py --voicelib_dir="/path/to/voicevox_core" ``` -## コードフォーマット +## API ドキュメントの確認 -このソフトウェアでは、リモートにプッシュする前にコードフォーマットを確認する仕組み(静的解析ツール)を利用できます。 -利用するには、開発に必要なライブラリのインストールに加えて、以下のコマンドを実行してください。 -プルリクエストを作成する際は、利用することを推奨します。 +[API ドキュメント](https://voicevox.github.io/voicevox_engine/api/)(実体は`docs/api/index.html`)は自動で更新されます。 +次のコマンドで API ドキュメントを手動で作成することができます。 ```bash -pre-commit install -t pre-push +python make_docs.py ``` -エラーが出た際は、以下のコマンドで修正することが可能です。なお、完全に修正できるわけではないので注意してください。 - -```bash -pysen run format lint -``` +## ユーザー辞書の更新について -## テスト +以下のコマンドで openjtalk のユーザー辞書をコンパイルできます。 ```bash -python -m pytest +python -c "import pyopenjtalk; pyopenjtalk.create_user_dict('default.csv','user.dic')" ``` -## タイポチェック +## 貢献者ガイド -[typos](https://github.com/crate-ci/typos) を使ってタイポのチェックを行っています。 -[typos をインストール](https://github.com/crate-ci/typos#install) した後 +### 貢献者の方へ -```bash -typos -``` +Issue を解決するプルリクエストを作成される際は、別の方と同じ Issue に取り組むことを避けるため、 +Issue 側で取り組み始めたことを伝えるか、最初に Draft プルリクエストを作成してください。 -でタイポチェックを行えます。 -もし誤判定やチェックから除外すべきファイルがあれば -[設定ファイルの説明](https://github.com/crate-ci/typos#false-positives) に従って`_typos.toml`を編集してください。 +[VOICEVOX 非公式 Discord サーバー](https://discord.gg/WMwWetrzuh)にて、開発の議論や雑談を行っています。気軽にご参加ください。 -## API ドキュメントの確認 +### 開発環境構築 -[API ドキュメント](https://voicevox.github.io/voicevox_engine/api/)(実体は`docs/api/index.html`)は自動で更新されます。 -次のコマンドで API ドキュメントを手動で作成することができます。 +`Python 3.11.3` を用いて開発されています。 +インストールするには、各 OS ごとの C/C++ コンパイラ、CMake が必要になります。 ```bash -python make_docs.py +# ライブラリのインストール +python -m pip install -r requirements-dev.txt -r requirements-test.txt ``` -## ビルド +### ビルド この方法でビルドしたものは、リリースで公開されているものとは異なります。 また、GPU で利用するには cuDNN や CUDA、DirectML などのライブラリが追加で必要となります。 @@ -523,9 +505,44 @@ LIBONNXRUNTIME_PATH="/path/to/libonnxruntime" \ pyinstaller --noconfirm run.spec ``` -## 依存関係 +### コードフォーマット -### 更新 +このソフトウェアでは、リモートにプッシュする前にコードフォーマットを確認する仕組み(静的解析ツール)を利用できます。 +利用するには、開発に必要なライブラリのインストールに加えて、以下のコマンドを実行してください。 +プルリクエストを作成する際は、利用することを推奨します。 + +```bash +pre-commit install -t pre-push +``` + +エラーが出た際は、以下のコマンドで修正することが可能です。なお、完全に修正できるわけではないので注意してください。 + +```bash +pysen run format lint +``` + +### テスト + +```bash +python -m pytest +``` + +### タイポチェック + +[typos](https://github.com/crate-ci/typos) を使ってタイポのチェックを行っています。 +[typos をインストール](https://github.com/crate-ci/typos#install) した後 + +```bash +typos +``` + +でタイポチェックを行えます。 +もし誤判定やチェックから除外すべきファイルがあれば +[設定ファイルの説明](https://github.com/crate-ci/typos#false-positives) に従って`_typos.toml`を編集してください。 + +### 依存関係 + +#### 更新 [Poetry](https://python-poetry.org/) を用いて依存ライブラリのバージョンを固定しています。 以下のコマンドで操作できます: @@ -546,8 +563,7 @@ poetry export --without-hashes --with dev -o requirements-dev.txt poetry export --without-hashes --with test -o requirements-test.txt poetry export --without-hashes --with license -o requirements-license.txt ``` - -### ライセンス +#### ライセンス 依存ライブラリは「コアビルド時にリンクして一体化しても、コア部のコード非公開 OK」なライセンスを持つ必要があります。 主要ライセンスの可否は以下の通りです。 @@ -556,13 +572,19 @@ poetry export --without-hashes --with license -o requirements-license.txt - LGPL: OK (コアと動的分離されているため) - GPL: NG (全関連コードの公開が必要なため) -## ユーザー辞書の更新について +### GitHub Actions -以下のコマンドで openjtalk のユーザー辞書をコンパイルできます。 +#### Variables -```bash -python -c "import pyopenjtalk; pyopenjtalk.create_user_dict('default.csv','user.dic')" -``` +| name | description | +| :----------------- | :------------------ | +| DOCKERHUB_USERNAME | Docker Hub ユーザ名 | + +#### Secrets + +| name | description | +| :-------------- | :---------------------------------------------------------------------- | +| DOCKERHUB_TOKEN | [Docker Hub アクセストークン](https://hub.docker.com/settings/security) | ## マルチエンジン機能に関して @@ -581,7 +603,7 @@ VOICEVOX API に準拠した複数のエンジンの Web API をポートを分 ### マルチエンジン機能への対応方法 VOICEVOX API 準拠エンジンを起動する実行バイナリを作ることで対応が可能です。 -VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するのが簡単です。 +VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するのが簡単です(#貢献者ガイド を参照ください)。 改造すべき点はエンジン情報・キャラクター情報・音声合成の3点です。 @@ -611,20 +633,6 @@ VOICEVOX エディターにうまく読み込ませられないときは、エ -## GitHub Actions - -### Variables - -| name | description | -| :----------------- | :------------------ | -| DOCKERHUB_USERNAME | Docker Hub ユーザ名 | - -### Secrets - -| name | description | -| :-------------- | :---------------------------------------------------------------------- | -| DOCKERHUB_TOKEN | [Docker Hub アクセストークン](https://hub.docker.com/settings/security) | - ## 事例紹介 **[voicevox-client](https://github.com/tuna2134/voicevox-client) [@tuna2134](https://github.com/tuna2134)** ・・・ VOICEVOX ENGINE のための Python ラッパー From a81905da44b48c9ed00e60d7b80f4b6b5770fb4a Mon Sep 17 00:00:00 2001 From: takana-v <44311840+takana-v@users.noreply.github.com> Date: Wed, 13 Dec 2023 03:31:44 +0900 Subject: [PATCH 011/177] =?UTF-8?q?issue-labeler=E3=81=AE=E3=83=90?= =?UTF-8?q?=E3=83=BC=E3=82=B8=E3=83=A7=E3=83=B3=E3=82=92=E4=BF=AE=E6=AD=A3?= =?UTF-8?q?=20(#831)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 06d547dcf..f485a56e1 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -10,7 +10,7 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: github/issue-labeler@v3 + - uses: github/issue-labeler@v3.3 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: .github/labeler.yml From 3e8b32fa1d9404c93d4fc1eb7d193cafd9ac418f Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 03:53:52 +0900 Subject: [PATCH 012/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`MetaStore.py`=20?= =?UTF-8?q?docstring=E3=81=AE=E8=BF=BD=E5=8A=A0=20(#845)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- voicevox_engine/metas/MetasStore.py | 55 +++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/voicevox_engine/metas/MetasStore.py b/voicevox_engine/metas/MetasStore.py index 88a7bc37d..c8367e831 100644 --- a/voicevox_engine/metas/MetasStore.py +++ b/voicevox_engine/metas/MetasStore.py @@ -16,7 +16,14 @@ class MetasStore: """ def __init__(self, engine_speakers_path: Path) -> None: + """ + Parameters + ---------- + engine_speakers_path : Path + エンジンに含まれる話者メタ情報ディレクトリのパス。 + """ self._engine_speakers_path = engine_speakers_path + # エンジンに含まれる各話者のメタ情報 self._loaded_metas: Dict[str, EngineSpeaker] = { folder.name: EngineSpeaker( **json.loads((folder / "metas.json").read_text(encoding="utf-8")) @@ -25,14 +32,32 @@ def __init__(self, engine_speakers_path: Path) -> None: } def speaker_engine_metas(self, speaker_uuid: str) -> EngineSpeaker: + """ + エンジンに含まれる指定話者のメタ情報を取得 + Parameters + ---------- + speaker_uuid : str + 話者UUID + Returns + ------- + ret : EngineSpeaker + エンジンに含まれる指定話者のメタ情報 + """ return self.loaded_metas[speaker_uuid] def combine_metas(self, core_metas: List[CoreSpeaker]) -> List[Speaker]: """ - 与えられたmetaにエンジンのコア情報を付加して返す - core_metas: コアのmetas()が返すJSONのModel + コアに含まれる話者メタ情報に、エンジンに含まれる話者メタ情報を統合して返す + Parameters + ---------- + core_metas : List[CoreSpeaker] + コアに含まれる話者メタ情報 + Returns + ------- + ret : List[Speaker] + エンジンとコアに含まれる話者メタ情報 """ - + # 話者単位でエンジン・コアに含まれるメタ情報を統合 return [ Speaker( **self.speaker_engine_metas(speaker_meta.speaker_uuid).dict(), @@ -45,10 +70,19 @@ def combine_metas(self, core_metas: List[CoreSpeaker]) -> List[Speaker]: # SynthesisEngineBaseによる循環importを修正する def load_combined_metas(self, engine: "SynthesisEngineBase") -> List[Speaker]: """ - 与えられたエンジンから、コア・エンジン両方の情報を含んだMetasを返す + コアに含まれる話者メタ情報とエンジンに含まれる話者メタ情報を統合 + Parameters + ---------- + engine : SynthesisEngineBase + コアに含まれる話者メタ情報をもったエンジン + Returns + ------- + ret : List[Speaker] + エンジンとコアに含まれる話者メタ情報 """ - + # コアに含まれる話者メタ情報の収集 core_metas = [CoreSpeaker(**speaker) for speaker in json.loads(engine.speakers)] + # エンジンに含まれる話者メタ情報との統合 return self.combine_metas(core_metas) @property @@ -62,9 +96,16 @@ def loaded_metas(self) -> Dict[str, EngineSpeaker]: def construct_lookup(speakers: List[Speaker]) -> Dict[int, Tuple[Speaker, StyleInfo]]: """ - `{style.id: StyleInfo}`の変換テーブル + スタイルID に話者メタ情報・スタイルメタ情報を紐付ける対応表を生成 + Parameters + ---------- + speakers : List[Speaker] + 話者メタ情報 + Returns + ------- + ret : Dict[int, Tuple[Speaker, StyleInfo]] + スタイルID に話者メタ情報・スタイルメタ情報が紐付いた対応表 """ - lookup_table = dict() for speaker in speakers: for style in speaker.styles: From 290333348b93f00b654f1bd062c117b536b74b42 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 04:03:01 +0900 Subject: [PATCH 013/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`LibraryManager`?= =?UTF-8?q?=20docstring=E3=83=BB=E5=9E=8B=E3=81=AE=E8=BF=BD=E5=8A=A0=20(#8?= =?UTF-8?q?33)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- voicevox_engine/library_manager.py | 60 +++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/voicevox_engine/library_manager.py b/voicevox_engine/library_manager.py index 9168a0a4b..3bab1508b 100644 --- a/voicevox_engine/library_manager.py +++ b/voicevox_engine/library_manager.py @@ -23,6 +23,8 @@ class LibraryManager: + """音声ライブラリ (`.vvlib`) の管理""" + def __init__( self, library_root_dir: Path, @@ -42,7 +44,13 @@ def __init__( self.engine_name = engine_name self.engine_uuid = engine_uuid - def downloadable_libraries(self): + def downloadable_libraries(self) -> list[DownloadableLibraryInfo]: + """ + ダウンロード可能ライブラリの一覧を取得 + Returns + ------- + - : list[DownloadableLibraryInfo] + """ # == ダウンロード情報をネットワーク上から取得する場合 # url = "https://example.com/downloadable_libraries.json" # response = requests.get(url) @@ -83,9 +91,17 @@ def downloadable_libraries(self): return list(map(DownloadableLibraryInfo.parse_obj, libraries)) def installed_libraries(self) -> Dict[str, InstalledLibraryInfo]: - library = {} + """ + インストール済み音声ライブラリの情報を取得 + Returns + ------- + library : Dict[str, InstalledLibraryInfo] + インストール済みライブラリの情報 + """ + library: Dict[str, InstalledLibraryInfo] = {} for library_dir in self.library_root_dir.iterdir(): if library_dir.is_dir(): + # ライブラリ情報の取得 from `library_root_dir / f"{library_uuid}" / "metas.json"` library_uuid = os.path.basename(library_dir) with open(library_dir / INFO_FILE, encoding="utf-8") as f: library[library_uuid] = json.load(f) @@ -93,7 +109,20 @@ def installed_libraries(self) -> Dict[str, InstalledLibraryInfo]: library[library_uuid]["uninstallable"] = True return library - def install_library(self, library_id: str, file: BytesIO): + def install_library(self, library_id: str, file: BytesIO) -> Path: + """ + 音声ライブラリ (`.vvlib`) のインストール + Parameters + ---------- + library_id : str + インストール対象ライブラリID + file : BytesIO + ライブラリファイルBlob + Returns + ------- + library_dir : Path + インストール済みライブラリの情報 + """ for downloadable_library in self.downloadable_libraries(): if downloadable_library.uuid == library_id: library_info = downloadable_library.dict() @@ -102,10 +131,16 @@ def install_library(self, library_id: str, file: BytesIO): raise HTTPException( status_code=404, detail=f"指定された音声ライブラリ {library_id} が見つかりません。" ) + + # ライブラリディレクトリの生成 library_dir = self.library_root_dir / library_id library_dir.mkdir(exist_ok=True) + + # metas.jsonの生成 with open(library_dir / INFO_FILE, "w", encoding="utf-8") as f: json.dump(library_info, f, indent=4, ensure_ascii=False) + + # zipファイル形式のバリデーション if not zipfile.is_zipfile(file): raise HTTPException( status_code=422, detail=f"音声ライブラリ {library_id} は不正なファイルです。" @@ -117,7 +152,7 @@ def install_library(self, library_id: str, file: BytesIO): status_code=422, detail=f"音声ライブラリ {library_id} は不正なファイルです。" ) - # validate manifest version + # マニフェストファイルの存在とファイル形式をバリデーション vvlib_manifest = None try: vvlib_manifest = json.loads( @@ -134,6 +169,7 @@ def install_library(self, library_id: str, file: BytesIO): detail=f"指定された音声ライブラリ {library_id} のvvlib_manifest.jsonは不正です。", ) + # マニフェスト形式のバリデーション try: VvlibManifest.validate(vvlib_manifest) except ValidationError: @@ -142,11 +178,13 @@ def install_library(self, library_id: str, file: BytesIO): detail=f"指定された音声ライブラリ {library_id} のvvlib_manifest.jsonに不正なデータが含まれています。", ) + # ライブラリバージョンのバリデーション if not Version.is_valid(vvlib_manifest["version"]): raise HTTPException( status_code=422, detail=f"指定された音声ライブラリ {library_id} のversionが不正です。" ) + # マニフェストバージョンのバリデーション try: vvlib_manifest_version = Version.parse( vvlib_manifest["manifest_version"] @@ -156,33 +194,45 @@ def install_library(self, library_id: str, file: BytesIO): status_code=422, detail=f"指定された音声ライブラリ {library_id} のmanifest_versionが不正です。", ) - if vvlib_manifest_version > self.supported_vvlib_version: raise HTTPException( status_code=422, detail=f"指定された音声ライブラリ {library_id} は未対応です。" ) + # ライブラリ-エンジン対応のバリデーション if vvlib_manifest["engine_uuid"] != self.engine_uuid: raise HTTPException( status_code=422, detail=f"指定された音声ライブラリ {library_id} は{self.engine_name}向けではありません。", ) + # 展開によるインストール zf.extractall(library_dir) + return library_dir def uninstall_library(self, library_id: str): + """ + インストール済み音声ライブラリのアンインストール + Parameters + ---------- + library_id : str + インストール対象ライブラリID + """ + # 対象ライブラリがインストール済みであることの確認 installed_libraries = self.installed_libraries() if library_id not in installed_libraries.keys(): raise HTTPException( status_code=404, detail=f"指定された音声ライブラリ {library_id} はインストールされていません。" ) + # アンインストール許可フラグのバリデーション if not installed_libraries[library_id]["uninstallable"]: raise HTTPException( status_code=403, detail=f"指定された音声ライブラリ {library_id} はアンインストールできません。" ) + # ディレクトリ削除によるアンインストール try: shutil.rmtree(self.library_root_dir / library_id) except Exception: From 56e3971c0a5b1339e863e53bcf0b716f0b687bbd Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 04:18:29 +0900 Subject: [PATCH 014/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`round`=20?= =?UTF-8?q?=E6=8C=99=E5=8B=95=E3=83=A1=E3=83=A2=E3=81=AE=E8=BF=BD=E5=8A=A0?= =?UTF-8?q?=20(#852)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voicevox_engine/synthesis_engine/synthesis_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 410750383..da74a1d78 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -183,6 +183,7 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): def _to_frame(sec: float) -> ndarray: FRAMERATE = 93.75 # 24000 / 256 [frame/sec] + # NOTE: `round` は偶数丸め。移植時に取扱い注意。詳細は voicevox_engine#552 return numpy.round(sec * FRAMERATE).astype(numpy.int32) From e90a2f7e925c7a3c7c8c0dea9aa93a9177e121b2 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Wed, 13 Dec 2023 12:55:41 +0900 Subject: [PATCH 015/177] =?UTF-8?q?README=E3=82=92=E3=83=A6=E3=83=BC?= =?UTF-8?q?=E3=82=B6=E3=83=BC=E3=82=AC=E3=82=A4=E3=83=89=E3=83=BB=E9=96=8B?= =?UTF-8?q?=E7=99=BA=E8=80=85=EF=BC=8B=E8=B2=A2=E7=8C=AE=E8=80=85=E3=82=AC?= =?UTF-8?q?=E3=82=A4=E3=83=89=E3=81=AB=E5=8C=BA=E5=88=86=E5=8C=96=20(#858)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 164 ++++++++++++++++++++++++++---------------------------- 1 file changed, 80 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index e7aef765c..5f4594717 100644 --- a/README.md +++ b/README.md @@ -17,18 +17,38 @@ コアは [VOICEVOX CORE](https://github.com/VOICEVOX/voicevox_core/) 、 全体構成は [こちら](https://github.com/VOICEVOX/voicevox/blob/main/docs/%E5%85%A8%E4%BD%93%E6%A7%8B%E6%88%90.md) に詳細があります。) -## ダウンロード +## ユーザーガイド + +### ダウンロード [こちら](https://github.com/VOICEVOX/voicevox_engine/releases/latest)から対応するエンジンをダウンロードしてください。 -## API ドキュメント +### API ドキュメント [API ドキュメント](https://voicevox.github.io/voicevox_engine/api/)をご参照ください。 VOICEVOX エンジンもしくはエディタを起動した状態で http://127.0.0.1:50021/docs にアクセスすると、起動中のエンジンのドキュメントも確認できます。 今後の方針などについては [VOICEVOX 音声合成エンジンとの連携](./docs/VOICEVOX音声合成エンジンとの連携.md) も参考になるかもしれません。 -リクエスト・レスポンスの文字コードはすべて UTF-8 です。 +### Docker イメージ + +#### CPU + +```bash +docker pull voicevox/voicevox_engine:cpu-ubuntu20.04-latest +docker run --rm -p '127.0.0.1:50021:50021' voicevox/voicevox_engine:cpu-ubuntu20.04-latest +``` + +#### GPU + +```bash +docker pull voicevox/voicevox_engine:nvidia-ubuntu20.04-latest +docker run --rm --gpus all -p '127.0.0.1:50021:50021' voicevox/voicevox_engine:nvidia-ubuntu20.04-latest +``` + +##### トラブルシューティング + +GPU 版を利用する場合、環境によってエラーが発生することがあります。その場合、`--runtime=nvidia`を`docker run`につけて実行すると解決できることがあります。 ### HTTP リクエストで音声合成するサンプルコード @@ -53,7 +73,7 @@ curl -s \ `style_id` に指定する値は `/speakers` エンドポイントで得られます。 -### 読み方を AquesTalk風記法で取得・修正するサンプルコード +### 読み方を AquesTalk 風記法で取得・修正するサンプルコード `/audio_query`のレスポンスにはエンジンが判断した読み方が AquesTalk 風記法([本家の記法](https://www.a-quest.com/archive/manual/siyo_onseikigou.pdf)とは一部異なります)で記録されています。 記法は次のルールに従います。 @@ -283,6 +303,10 @@ VOICEVOX ではセキュリティ保護のため`localhost`・`127.0.0.1`・`app 3. 保存ボタンを押して、変更を確定してください。 4. 設定の適用にはエンジンの再起動が必要です。必要に応じて再起動をしてください。 +### 文字コード + +リクエスト・レスポンスの文字コードはすべて UTF-8 です。 + ### その他の引数 エンジン起動時に引数を指定できます。詳しいことは`-h`引数でヘルプを確認してください。 @@ -326,41 +350,33 @@ options: プリセットファイルを指定できます。指定がない場合、環境変数 VV_PRESET_FILE、--voicevox_dirのpresets.yaml、実行ファイルのディレクトリのpresets.yamlを順に探します。 ``` -## アップデート +### アップデート エンジンディレクトリ内にあるファイルを全て消去し、新しいものに置き換えてください。 -## Docker イメージ - -### CPU +## 開発者・貢献者向けガイド -```bash -docker pull voicevox/voicevox_engine:cpu-ubuntu20.04-latest -docker run --rm -p '127.0.0.1:50021:50021' voicevox/voicevox_engine:cpu-ubuntu20.04-latest -``` - -### GPU - -```bash -docker pull voicevox/voicevox_engine:nvidia-ubuntu20.04-latest -docker run --rm --gpus all -p '127.0.0.1:50021:50021' voicevox/voicevox_engine:nvidia-ubuntu20.04-latest -``` +### 貢献者の方へ -#### トラブルシューティング +Issue を解決するプルリクエストを作成される際は、別の方と同じ Issue に取り組むことを避けるため、 +Issue 側で取り組み始めたことを伝えるか、最初に Draft プルリクエストを作成してください。 -GPU 版を利用する場合、環境によってエラーが発生することがあります。その場合、`--runtime=nvidia`を`docker run`につけて実行すると解決できることがあります。 +[VOICEVOX 非公式 Discord サーバー](https://discord.gg/WMwWetrzuh)にて、開発の議論や雑談を行っています。気軽にご参加ください。 -## 実行環境構築 +### 環境構築 -`Python 3.11.3` が動作確認済みの環境です。 -この環境へ必要なライブラリをインストールしてください。 +`Python 3.11.3` を用いて開発されています。 +インストールするには、各 OS ごとの C/C++ コンパイラ、CMake が必要になります。 ```bash +# 実行環境のインストール python -m pip install -r requirements.txt -``` +# 開発環境・テスト環境のインストール +python -m pip install -r requirements-dev.txt -r requirements-test.txt +``` -## 実行 +### 実行 コマンドライン引数の詳細は以下のコマンドで確認してください。 @@ -393,30 +409,28 @@ python run.py --output_log_utf8 # もしくは VV_OUTPUT_LOG_UTF8=1 python run.py ``` -### CPU スレッド数を指定する +#### CPU スレッド数を指定する CPU スレッド数が未指定の場合は、論理コア数の半分か物理コア数が使われます。(殆どの CPU で、これは全体の処理能力の半分です) もし IaaS 上で実行していたり、専用サーバーで実行している場合など、 エンジンが使う処理能力を調節したい場合は、CPU スレッド数を指定することで実現できます。 - 実行時引数で指定する - ```bash python run.py --voicevox_dir=$VOICEVOX_DIR --cpu_num_threads=4 ``` - - 環境変数で指定する ```bash export VV_CPU_NUM_THREADS=4 python run.py --voicevox_dir=$VOICEVOX_DIR ``` -### 過去のバージョンのコアを使う +#### 過去のバージョンのコアを使う VOICEVOX Core 0.5.4 以降のコアを使用する事が可能です。 Mac での libtorch 版コアのサポートはしていません。 -#### 過去のバイナリを指定する +##### 過去のバイナリを指定する 製品版 VOICEVOX もしくはコンパイル済みエンジンのディレクトリを`--voicevox_dir`引数で指定すると、そのバージョンのコアが使用されます。 @@ -430,7 +444,7 @@ Mac では、`DYLD_LIBRARY_PATH`の指定が必要です。 DYLD_LIBRARY_PATH="/path/to/voicevox" python run.py --voicevox_dir="/path/to/voicevox" ``` -#### 音声ライブラリを直接指定する +##### 音声ライブラリを直接指定する [VOICEVOX Core の zip ファイル](https://github.com/VOICEVOX/voicevox_core/releases)を解凍したディレクトリを`--voicelib_dir`引数で指定します。 また、コアのバージョンに合わせて、[libtorch](https://pytorch.org/)や[onnxruntime](https://github.com/microsoft/onnxruntime)のディレクトリを`--runtime_dir`引数で指定します。 @@ -448,42 +462,6 @@ Mac では、`--runtime_dir`引数の代わりに`DYLD_LIBRARY_PATH`の指定が DYLD_LIBRARY_PATH="/path/to/onnx" python run.py --voicelib_dir="/path/to/voicevox_core" ``` -## API ドキュメントの確認 - -[API ドキュメント](https://voicevox.github.io/voicevox_engine/api/)(実体は`docs/api/index.html`)は自動で更新されます。 -次のコマンドで API ドキュメントを手動で作成することができます。 - -```bash -python make_docs.py -``` - -## ユーザー辞書の更新について - -以下のコマンドで openjtalk のユーザー辞書をコンパイルできます。 - -```bash -python -c "import pyopenjtalk; pyopenjtalk.create_user_dict('default.csv','user.dic')" -``` - -## 貢献者ガイド - -### 貢献者の方へ - -Issue を解決するプルリクエストを作成される際は、別の方と同じ Issue に取り組むことを避けるため、 -Issue 側で取り組み始めたことを伝えるか、最初に Draft プルリクエストを作成してください。 - -[VOICEVOX 非公式 Discord サーバー](https://discord.gg/WMwWetrzuh)にて、開発の議論や雑談を行っています。気軽にご参加ください。 - -### 開発環境構築 - -`Python 3.11.3` を用いて開発されています。 -インストールするには、各 OS ごとの C/C++ コンパイラ、CMake が必要になります。 - -```bash -# ライブラリのインストール -python -m pip install -r requirements-dev.txt -r requirements-test.txt -``` - ### ビルド この方法でビルドしたものは、リリースで公開されているものとは異なります。 @@ -563,6 +541,7 @@ poetry export --without-hashes --with dev -o requirements-dev.txt poetry export --without-hashes --with test -o requirements-test.txt poetry export --without-hashes --with license -o requirements-license.txt ``` + #### ライセンス 依存ライブラリは「コアビルド時にリンクして一体化しても、コア部のコード非公開 OK」なライセンスを持つ必要があります。 @@ -572,21 +551,15 @@ poetry export --without-hashes --with license -o requirements-license.txt - LGPL: OK (コアと動的分離されているため) - GPL: NG (全関連コードの公開が必要なため) -### GitHub Actions - -#### Variables +### ユーザー辞書の更新について -| name | description | -| :----------------- | :------------------ | -| DOCKERHUB_USERNAME | Docker Hub ユーザ名 | - -#### Secrets +以下のコマンドで openjtalk のユーザー辞書をコンパイルできます。 -| name | description | -| :-------------- | :---------------------------------------------------------------------- | -| DOCKERHUB_TOKEN | [Docker Hub アクセストークン](https://hub.docker.com/settings/security) | +```bash +python -c "import pyopenjtalk; pyopenjtalk.create_user_dict('default.csv','user.dic')" +``` -## マルチエンジン機能に関して +### マルチエンジン機能に関して VOICEVOX エディターでは、複数のエンジンを同時に起動することができます。 この機能を利用することで、自作の音声合成エンジンや既存の音声合成エンジンを VOICEVOX エディター上で動かすことが可能です。 @@ -595,15 +568,15 @@ VOICEVOX エディターでは、複数のエンジンを同時に起動する
-### マルチエンジン機能の仕組み +#### マルチエンジン機能の仕組み VOICEVOX API に準拠した複数のエンジンの Web API をポートを分けて起動し、統一的に扱うことでマルチエンジン機能を実現しています。 エディターがそれぞれのエンジンを実行バイナリ経由で起動し、EngineID と結びつけて設定や状態を個別管理します。 -### マルチエンジン機能への対応方法 +#### マルチエンジン機能への対応方法 VOICEVOX API 準拠エンジンを起動する実行バイナリを作ることで対応が可能です。 -VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するのが簡単です(#貢献者ガイド を参照ください)。 +VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するのが簡単です。 改造すべき点はエンジン情報・キャラクター情報・音声合成の3点です。 @@ -619,7 +592,7 @@ VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するの VOICEVOX API での音声合成は、エンジン側で音声合成クエリ`AudioQuery`の初期値を作成してユーザーに返し、ユーザーが必要に応じてクエリを編集したあと、エンジンがクエリに従って音声合成することで実現しています。 クエリ作成は`/audio_query`エンドポイントで、音声合成は`/synthesis`エンドポイントで行っており、最低この2つに対応すれば VOICEVOX API に準拠したことになります。 -### マルチエンジン機能対応エンジンの配布方法 +#### マルチエンジン機能対応エンジンの配布方法 VVPP ファイルとして配布するのがおすすめです。 VVPP は「VOICEVOX プラグインパッケージ」の略で、中身はビルドしたエンジンなどを含んだディレクトリの Zip ファイルです。 @@ -633,6 +606,29 @@ VOICEVOX エディターにうまく読み込ませられないときは、エ
+### API ドキュメントの確認 + +[API ドキュメント](https://voicevox.github.io/voicevox_engine/api/)(実体は`docs/api/index.html`)は自動で更新されます。 +次のコマンドで API ドキュメントを手動で作成することができます。 + +```bash +python make_docs.py +``` + +### GitHub Actions + +#### Variables + +| name | description | +| :----------------- | :------------------ | +| DOCKERHUB_USERNAME | Docker Hub ユーザ名 | + +#### Secrets + +| name | description | +| :-------------- | :---------------------------------------------------------------------- | +| DOCKERHUB_TOKEN | [Docker Hub アクセストークン](https://hub.docker.com/settings/security) | + ## 事例紹介 **[voicevox-client](https://github.com/tuna2134/voicevox-client) [@tuna2134](https://github.com/tuna2134)** ・・・ VOICEVOX ENGINE のための Python ラッパー From 6f5c384d90555981759db0b2ef66fb699659ee74 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 13 Dec 2023 12:57:27 +0900 Subject: [PATCH 016/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=A6?= =?UTF-8?q?=E3=83=BC=E3=82=B6=E3=83=BC=E8=BE=9E=E6=9B=B8=E3=81=AEdocstring?= =?UTF-8?q?=E3=83=BB=E5=A4=89=E6=95=B0=E5=90=8D=E3=83=BB=E5=9E=8B=E3=83=BB?= =?UTF-8?q?=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88=20(#836)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_user_dict.py | 6 +- voicevox_engine/user_dict.py | 205 ++++++++++++++++++++++++++++++----- 2 files changed, 181 insertions(+), 30 deletions(-) diff --git a/test/test_user_dict.py b/test/test_user_dict.py index 4280bbe53..6014ac721 100644 --- a/test/test_user_dict.py +++ b/test/test_user_dict.py @@ -11,8 +11,8 @@ from voicevox_engine.model import UserDictWord, WordTypes from voicevox_engine.part_of_speech_data import MAX_PRIORITY, part_of_speech_data from voicevox_engine.user_dict import ( + _create_word, apply_word, - create_word, delete_word, import_user_dict, read_dict, @@ -90,7 +90,7 @@ def test_read_not_exist_json(self): def test_create_word(self): # 将来的に品詞などが追加された時にテストを増やす self.assertEqual( - create_word(surface="test", pronunciation="テスト", accent_type=1), + _create_word(surface="test", pronunciation="テスト", accent_type=1), UserDictWord( surface="test", priority=5, @@ -219,7 +219,7 @@ def test_priority(self): for pos in part_of_speech_data: for i in range(MAX_PRIORITY + 1): self.assertEqual( - create_word( + _create_word( surface="test", pronunciation="テスト", accent_type=1, diff --git a/voicevox_engine/user_dict.py b/voicevox_engine/user_dict.py index 7db07a721..f720ac4aa 100644 --- a/voicevox_engine/user_dict.py +++ b/voicevox_engine/user_dict.py @@ -21,27 +21,39 @@ if not save_dir.is_dir(): save_dir.mkdir(parents=True) -default_dict_path = root_dir / "default.csv" -user_dict_path = save_dir / "user_dict.json" -compiled_dict_path = save_dir / "user.dic" +default_dict_path = root_dir / "default.csv" # VOICEVOXデフォルト辞書ファイルのパス +user_dict_path = save_dir / "user_dict.json" # ユーザー辞書ファイルのパス +compiled_dict_path = save_dir / "user.dic" # コンパイル済み辞書ファイルのパス +# 同時書き込みの制御 mutex_user_dict = threading.Lock() mutex_openjtalk_dict = threading.Lock() @mutex_wrapper(mutex_user_dict) -def write_to_json(user_dict: Dict[str, UserDictWord], user_dict_path: Path): +def _write_to_json(user_dict: Dict[str, UserDictWord], user_dict_path: Path) -> None: + """ + ユーザー辞書ファイルへのユーザー辞書データ書き込み + Parameters + ---------- + user_dict : Dict[str, UserDictWord] + ユーザー辞書データ + user_dict_path : Path + ユーザー辞書ファイルのパス + """ converted_user_dict = {} for word_uuid, word in user_dict.items(): word_dict = word.dict() - word_dict["cost"] = priority2cost( + word_dict["cost"] = _priority2cost( word_dict["context_id"], word_dict["priority"] ) del word_dict["priority"] converted_user_dict[word_uuid] = word_dict # 予めjsonに変換できることを確かめる user_dict_json = json.dumps(converted_user_dict, ensure_ascii=False) + + # ユーザー辞書ファイルへの書き込み user_dict_path.write_text(user_dict_json, encoding="utf-8") @@ -50,14 +62,29 @@ def update_dict( default_dict_path: Path = default_dict_path, user_dict_path: Path = user_dict_path, compiled_dict_path: Path = compiled_dict_path, -): +) -> None: + """ + 辞書の更新 + Parameters + ---------- + default_dict_path : Path + デフォルト辞書ファイルのパス + user_dict_path : Path + ユーザー辞書ファイルのパス + compiled_dict_path : Path + コンパイル済み辞書ファイルのパス + """ random_string = uuid4() - tmp_csv_path = save_dir / f".tmp.dict_csv-{random_string}" - tmp_compiled_path = save_dir / f".tmp.dict_compiled-{random_string}" + tmp_csv_path = save_dir / f".tmp.dict_csv-{random_string}" # csv形式辞書データの一時保存ファイル + tmp_compiled_path = ( + save_dir / f".tmp.dict_compiled-{random_string}" + ) # コンパイル済み辞書データの一時保存ファイル try: # 辞書.csvを作成 csv_text = "" + + # デフォルト辞書データの追加 if not default_dict_path.is_file(): print("Warning: Cannot find default dictionary.", file=sys.stderr) return @@ -65,6 +92,8 @@ def update_dict( if default_dict == default_dict.rstrip(): default_dict += "\n" csv_text += default_dict + + # ユーザー辞書データの追加 user_dict = read_dict(user_dict_path=user_dict_path) for word_uuid in user_dict: word = user_dict[word_uuid] @@ -77,7 +106,7 @@ def update_dict( ).format( surface=word.surface, context_id=word.context_id, - cost=priority2cost(word.context_id, word.priority), + cost=_priority2cost(word.context_id, word.priority), part_of_speech=word.part_of_speech, part_of_speech_detail_1=word.part_of_speech_detail_1, part_of_speech_detail_2=word.part_of_speech_detail_2, @@ -91,6 +120,7 @@ def update_dict( mora_count=word.mora_count, accent_associative_rule=word.accent_associative_rule, ) + # 辞書データを辞書.csv へ一時保存 tmp_csv_path.write_text(csv_text, encoding="utf-8") # 辞書.csvをOpenJTalk用にコンパイル @@ -119,10 +149,23 @@ def update_dict( @mutex_wrapper(mutex_user_dict) def read_dict(user_dict_path: Path = user_dict_path) -> Dict[str, UserDictWord]: + """ + ユーザー辞書の読み出し + Parameters + ---------- + user_dict_path : Path + ユーザー辞書ファイルのパス + Returns + ------- + result : Dict[str, UserDictWord] + ユーザー辞書 + """ + # 指定ユーザー辞書が存在しない場合、空辞書を返す if not user_dict_path.is_file(): return {} + with user_dict_path.open(encoding="utf-8") as f: - result = {} + result: Dict[str, UserDictWord] = {} for word_uuid, word in json.load(f).items(): # cost2priorityで変換を行う際にcontext_idが必要となるが、 # 0.12以前の辞書は、context_idがハードコーディングされていたためにユーザー辞書内に保管されていない @@ -131,20 +174,39 @@ def read_dict(user_dict_path: Path = user_dict_path) -> Dict[str, UserDictWord]: word["context_id"] = part_of_speech_data[ WordTypes.PROPER_NOUN ].context_id - word["priority"] = cost2priority(word["context_id"], word["cost"]) + word["priority"] = _cost2priority(word["context_id"], word["cost"]) del word["cost"] result[str(UUID(word_uuid))] = UserDictWord(**word) return result -def create_word( +def _create_word( surface: str, pronunciation: str, accent_type: int, word_type: Optional[WordTypes] = None, priority: Optional[int] = None, ) -> UserDictWord: + """ + 単語オブジェクトの生成 + Parameters + ---------- + surface : str + 単語情報 + pronunciation : str + 単語情報 + accent_type : int + 単語情報 + word_type : Optional[WordTypes] + 品詞 + priority : Optional[int] + 優先度 + Returns + ------- + : UserDictWord + 単語オブジェクト + """ if word_type is None: word_type = WordTypes.PROPER_NOUN if word_type not in part_of_speech_data.keys(): @@ -181,7 +243,31 @@ def apply_word( user_dict_path: Path = user_dict_path, compiled_dict_path: Path = compiled_dict_path, ) -> str: - word = create_word( + """ + 新規単語の追加 + Parameters + ---------- + surface : str + 単語情報 + pronunciation : str + 単語情報 + accent_type : int + 単語情報 + word_type : Optional[WordTypes] + 品詞 + priority : Optional[int] + 優先度 + user_dict_path : Path + ユーザー辞書ファイルのパス + compiled_dict_path : Path + コンパイル済み辞書ファイルのパス + Returns + ------- + word_uuid : UserDictWord + 追加された単語に発行されたUUID + """ + # 新規単語の追加による辞書データの更新 + word = _create_word( surface=surface, pronunciation=pronunciation, accent_type=accent_type, @@ -191,8 +277,11 @@ def apply_word( user_dict = read_dict(user_dict_path=user_dict_path) word_uuid = str(uuid4()) user_dict[word_uuid] = word - write_to_json(user_dict, user_dict_path) + + # 更新された辞書データの保存と適用 + _write_to_json(user_dict, user_dict_path) update_dict(user_dict_path=user_dict_path, compiled_dict_path=compiled_dict_path) + return word_uuid @@ -205,19 +294,44 @@ def rewrite_word( priority: Optional[int] = None, user_dict_path: Path = user_dict_path, compiled_dict_path: Path = compiled_dict_path, -): - word = create_word( +) -> None: + """ + 既存単語の上書き更新 + Parameters + ---------- + word_uuid : str + 単語UUID + surface : str + 単語情報 + pronunciation : str + 単語情報 + accent_type : int + 単語情報 + word_type : Optional[WordTypes] + 品詞 + priority : Optional[int] + 優先度 + user_dict_path : Path + ユーザー辞書ファイルのパス + compiled_dict_path : Path + コンパイル済み辞書ファイルのパス + """ + word = _create_word( surface=surface, pronunciation=pronunciation, accent_type=accent_type, word_type=word_type, priority=priority, ) + + # 既存単語の上書きによる辞書データの更新 user_dict = read_dict(user_dict_path=user_dict_path) if word_uuid not in user_dict: raise HTTPException(status_code=422, detail="UUIDに該当するワードが見つかりませんでした") user_dict[word_uuid] = word - write_to_json(user_dict, user_dict_path) + + # 更新された辞書データの保存と適用 + _write_to_json(user_dict, user_dict_path) update_dict(user_dict_path=user_dict_path, compiled_dict_path=compiled_dict_path) @@ -225,12 +339,26 @@ def delete_word( word_uuid: str, user_dict_path: Path = user_dict_path, compiled_dict_path: Path = compiled_dict_path, -): +) -> None: + """ + 単語の削除 + Parameters + ---------- + word_uuid : str + 単語UUID + user_dict_path : Path + ユーザー辞書ファイルのパス + compiled_dict_path : Path + コンパイル済み辞書ファイルのパス + """ + # 既存単語の削除による辞書データの更新 user_dict = read_dict(user_dict_path=user_dict_path) if word_uuid not in user_dict: raise HTTPException(status_code=422, detail="IDに該当するワードが見つかりませんでした") del user_dict[word_uuid] - write_to_json(user_dict, user_dict_path) + + # 更新された辞書データの保存と適用 + _write_to_json(user_dict, user_dict_path) update_dict(user_dict_path=user_dict_path, compiled_dict_path=compiled_dict_path) @@ -240,8 +368,23 @@ def import_user_dict( user_dict_path: Path = user_dict_path, default_dict_path: Path = default_dict_path, compiled_dict_path: Path = compiled_dict_path, -): - # 念のため型チェックを行う +) -> None: + """ + ユーザー辞書のインポート + Parameters + ---------- + dict_data : Dict[str, UserDictWord] + インポートするユーザー辞書のデータ + override : bool + 重複したエントリがあった場合、上書きするかどうか + user_dict_path : Path + ユーザー辞書ファイルのパス + default_dict_path : Path + デフォルト辞書ファイルのパス + compiled_dict_path : Path + コンパイル済み辞書ファイルのパス + """ + # インポートする辞書データのバリデーション for word_uuid, word in dict_data.items(): UUID(word_uuid) assert isinstance(word, UserDictWord) @@ -263,12 +406,20 @@ def import_user_dict( break else: raise ValueError("対応していない品詞です") + + # 既存辞書の読み出し old_dict = read_dict(user_dict_path=user_dict_path) + + # 辞書データの更新 + # 重複エントリの上書き if override: new_dict = {**old_dict, **dict_data} + # 重複エントリの保持 else: new_dict = {**dict_data, **old_dict} - write_to_json(user_dict=new_dict, user_dict_path=user_dict_path) + + # 更新された辞書データの保存と適用 + _write_to_json(user_dict=new_dict, user_dict_path=user_dict_path) update_dict( default_dict_path=default_dict_path, user_dict_path=user_dict_path, @@ -276,23 +427,23 @@ def import_user_dict( ) -def search_cost_candidates(context_id: int) -> List[int]: +def _search_cost_candidates(context_id: int) -> List[int]: for value in part_of_speech_data.values(): if value.context_id == context_id: return value.cost_candidates raise HTTPException(status_code=422, detail="品詞IDが不正です") -def cost2priority(context_id: int, cost: conint(ge=-32768, le=32767)) -> int: - cost_candidates = search_cost_candidates(context_id) +def _cost2priority(context_id: int, cost: conint(ge=-32768, le=32767)) -> int: + cost_candidates = _search_cost_candidates(context_id) # cost_candidatesの中にある値で最も近い値を元にpriorityを返す # 参考: https://qiita.com/Krypf/items/2eada91c37161d17621d # この関数とpriority2cost関数によって、辞書ファイルのcostを操作しても最も近いpriorityのcostに上書きされる return MAX_PRIORITY - np.argmin(np.abs(np.array(cost_candidates) - cost)) -def priority2cost( +def _priority2cost( context_id: int, priority: conint(ge=MIN_PRIORITY, le=MAX_PRIORITY) ) -> int: - cost_candidates = search_cost_candidates(context_id) + cost_candidates = _search_cost_candidates(context_id) return cost_candidates[MAX_PRIORITY - priority] From 6e74fbc98ea9ebbb02ce0ad5beeae921d5bd8d07 Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 01:58:46 +0900 Subject: [PATCH 017/177] =?UTF-8?q?=E8=BF=BD=E5=8A=A0:=20=E8=BE=9E?= =?UTF-8?q?=E6=9B=B8=E3=82=A4=E3=83=B3=E3=83=9D=E3=83=BC=E3=83=88=E3=83=BB?= =?UTF-8?q?=E3=82=A8=E3=82=AF=E3=82=B9=E3=83=9D=E3=83=BC=E3=83=88=E3=82=AC?= =?UTF-8?q?=E3=82=A4=E3=83=89=20(#861)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5f4594717..b667f22e5 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,14 @@ word_uuid="cce59b5f-86ab-42b9-bb75-9fd3407f1e2d" curl -s -X DELETE "127.0.0.1:50021/user_dict_word/$word_uuid" ``` +#### 辞書のインポート&エクスポート + +エンジンの[設定ページ](http://127.0.0.1:50021/setting)内の「ユーザー辞書のエクスポート&インポート」節で、ユーザー辞書のインポート&エクスポートが可能です。 + +他にも API でユーザー辞書のインポート&エクスポートが可能です。 +インポートには `POST /import_user_dict`、エクスポートには `GET /user_dict` を利用します。 +引数等の詳細は API ドキュメントをご覧ください。 + ### プリセット機能について `presets.yaml`を編集することで話者や話速などのプリセットを使うことができます。 @@ -551,14 +559,6 @@ poetry export --without-hashes --with license -o requirements-license.txt - LGPL: OK (コアと動的分離されているため) - GPL: NG (全関連コードの公開が必要なため) -### ユーザー辞書の更新について - -以下のコマンドで openjtalk のユーザー辞書をコンパイルできます。 - -```bash -python -c "import pyopenjtalk; pyopenjtalk.create_user_dict('default.csv','user.dic')" -``` - ### マルチエンジン機能に関して VOICEVOX エディターでは、複数のエンジンを同時に起動することができます。 From 3809c027cd15e334fd8a5fef674431c5cc20d82f Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 03:33:46 +0900 Subject: [PATCH 018/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=89?= =?UTF-8?q?=E3=82=AD=E3=83=A5=E3=83=A1=E3=83=B3=E3=83=88=E7=94=9F=E6=88=90?= =?UTF-8?q?=E3=81=AE=20`build=5Futil`=20=E7=A7=BB=E6=A4=8D=20(#866)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/upload-gh-pages.yml | 2 +- README.md | 2 +- make_docs.py => build_util/make_docs.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename make_docs.py => build_util/make_docs.py (100%) diff --git a/.github/workflows/upload-gh-pages.yml b/.github/workflows/upload-gh-pages.yml index 9e78d0a1b..e545c0bf6 100644 --- a/.github/workflows/upload-gh-pages.yml +++ b/.github/workflows/upload-gh-pages.yml @@ -34,7 +34,7 @@ jobs: - name: Make documents run: | - python make_docs.py + PYTHONPATH=. python build_util/make_docs.py - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 diff --git a/README.md b/README.md index b667f22e5..20267570c 100644 --- a/README.md +++ b/README.md @@ -612,7 +612,7 @@ VOICEVOX エディターにうまく読み込ませられないときは、エ 次のコマンドで API ドキュメントを手動で作成することができます。 ```bash -python make_docs.py +PYTHONPATH=. python build_util/make_docs.py ``` ### GitHub Actions diff --git a/make_docs.py b/build_util/make_docs.py similarity index 100% rename from make_docs.py rename to build_util/make_docs.py From 0887b07ef20b59aa0ac6b12ed8dc4c17b800f90c Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 03:53:51 +0900 Subject: [PATCH 019/177] =?UTF-8?q?=E4=BF=AE=E6=AD=A3:=20tag=20=E4=BB=98?= =?UTF-8?q?=E3=81=91=E7=9B=B4=E3=81=97=E3=81=AB=E3=82=88=E3=82=8B=20releas?= =?UTF-8?q?e=20latest-dev=20=E3=81=AE=E6=9B=B4=E6=96=B0=20(#856)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- .github/workflows/build.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9f88c238e..218efd2c4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -634,6 +634,17 @@ jobs: ${{ steps.vars.outputs.package_name }}.vvpp.txt commit: ${{ github.sha }} + update-tag-to-current-commit: + if: needs.config.outputs.version != '' + needs: [config, build-and-upload] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Change tag to this commit for refreshing the release # c.f. voicevox_engine#854 + run: | + git tag -f ${{ needs.config.outputs.version }} + git push -f --tag + run-release-test-workflow: if: needs.config.outputs.version != '' needs: [config, build-and-upload] From dbeda9da871abec5830a3e9994c6c0a53938105e Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 04:01:52 +0900 Subject: [PATCH 020/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20Query=E5=85=A8?= =?UTF-8?q?=E4=BD=93=E3=83=91=E3=83=A9=E3=83=A1=E3=83=BC=E3=82=BF=E9=81=A9?= =?UTF-8?q?=E7=94=A8=E3=81=AE=E7=A7=BB=E6=A4=8D=20(#840)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 20 ++++++++--------- .../synthesis_engine/synthesis_engine.py | 22 +++++++------------ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index bdb6de486..b64298417 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -332,7 +332,6 @@ def test_apply_output_stereo(): def test_calc_frame_per_phoneme(): """Test `calc_frame_per_phoneme`.""" # Inputs - query = _gen_query(speedScale=2.0) moras = [ _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), @@ -345,11 +344,11 @@ def test_calc_frame_per_phoneme(): # Expects # Pre k o N pau h i h O Pst - true_frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3] + true_frame_per_phoneme = [2, 2, 4, 4, 2, 2, 4, 4, 2, 6] true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32) # Outputs - frame_per_phoneme = calc_frame_per_phoneme(query, moras) + frame_per_phoneme = calc_frame_per_phoneme(moras) assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme) @@ -381,7 +380,6 @@ def test_calc_frame_per_mora(): def test_calc_frame_pitch(): """Test `test_calc_frame_pitch`.""" # Inputs - query = _gen_query(pitchScale=2.0, intonationScale=0.5) moras = [ _gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0), _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), @@ -392,17 +390,16 @@ def test_calc_frame_pitch(): _gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0), ] - # Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling # pau ko ko ko N N - true1_f0 = [0.0, 250.0, 250.0, 250.0, 250.0, 250.0] + true1_f0 = [0.0, 50.0, 50.0, 50.0, 50.0, 50.0] # pau hi hi hi - true2_f0 = [0.0, 400.0, 400.0, 400.0] + true2_f0 = [0.0, 125.0, 125.0, 125.0] # hO hO hO paw paw paw true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - f0 = calc_frame_pitch(query, moras) + f0 = calc_frame_pitch(moras) assert numpy.array_equal(f0, true_f0) @@ -480,8 +477,11 @@ def test_feat_to_framescale(): # Outputs flatten_moras = apply_prepost_silence(flatten_moras, query) - frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch(query, flatten_moras) + flatten_moras = apply_speed_scale(flatten_moras, query) + flatten_moras = apply_pitch_scale(flatten_moras, query) + flatten_moras = apply_intonation_scale(flatten_moras, query) + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) + f0 = calc_frame_pitch(flatten_moras) frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) assert numpy.array_equal(frame_phoneme, true_frame_phoneme) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index da74a1d78..19a29432f 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -155,13 +155,11 @@ def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): +def calc_frame_per_phoneme(moras: List[Mora]): """ 音素あたりのフレーム長を算出 Parameters ---------- - query : AudioQuery - 音声合成クエリ moras : List[Mora] モーラ列 Returns @@ -169,9 +167,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): frame_per_phoneme : NDArray[] 音素あたりのフレーム長。端数丸め。 """ - # Apply: グローバル特徴量による補正(話速) - moras = apply_speed_scale(moras, query) - frame_per_phoneme: list[ndarray] = [] for mora in moras: if mora.consonant: @@ -247,13 +242,11 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray: +def calc_frame_pitch(moras: list[Mora]) -> ndarray: """ フレームごとのピッチの生成 Parameters ---------- - query : AudioQuery - 音声合成クエリ moras : List[Mora] モーラ列 Returns @@ -261,9 +254,6 @@ def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray: frame_f0 : NDArray[] フレームごとの基本周波数系列 """ - moras = apply_pitch_scale(moras, query) - moras = apply_intonation_scale(moras, query) - # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) # モーラごとの基本周波数 f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32) @@ -620,8 +610,12 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): flatten_moras, phoneme_data_list = pre_process(query.accent_phrases) flatten_moras = apply_prepost_silence(flatten_moras, query) - frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch(query, flatten_moras) + flatten_moras = apply_speed_scale(flatten_moras, query) + flatten_moras = apply_pitch_scale(flatten_moras, query) + flatten_moras = apply_intonation_scale(flatten_moras, query) + + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) + f0 = calc_frame_pitch(flatten_moras) phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する From 6f47a79c7c619cac177c6c78a80ccea7a05c33cd Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 04:08:23 +0900 Subject: [PATCH 021/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`speaker=5Finfo`?= =?UTF-8?q?=20API=E5=86=85=E9=83=A8=E5=AE=9F=E8=A3=85=E5=8D=98=E7=B4=94?= =?UTF-8?q?=E5=8C=96=20(#849)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 69 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/run.py b/run.py index 85ae2002a..fad40b3b5 100644 --- a/run.py +++ b/run.py @@ -785,6 +785,31 @@ def speaker_info( ------- ret_data: SpeakerInfo """ + + # エンジンに含まれる話者メタ情報は、次のディレクトリ構造に従わなければならない: + # {root_dir}/ + # speaker_info/ + # {speaker_uuid_0}/ + # policy.md + # portrait.png + # icons/ + # {id_0}.png + # {id_1}.png + # ... + # portraits/ + # {id_0}.png + # {id_1}.png + # ... + # voice_samples/ + # {id_0}_001.wav + # {id_0}_002.wav + # {id_0}_003.wav + # {id_1}_001.wav + # ... + # {speaker_uuid_1}/ + # ... + + # 該当話者の検索 speakers = json.loads(get_engine(core_version).speakers) for i in range(len(speakers)): if speakers[i]["speaker_uuid"] == speaker_uuid: @@ -794,35 +819,32 @@ def speaker_info( raise HTTPException(status_code=404, detail="該当する話者が見つかりません") try: - policy = (root_dir / f"speaker_info/{speaker_uuid}/policy.md").read_text( - "utf-8" - ) - portrait = b64encode_str( - (root_dir / f"speaker_info/{speaker_uuid}/portrait.png").read_bytes() - ) + speaker_path = root_dir / "speaker_info" / speaker_uuid + # 話者情報の取得 + # speaker policy + policy_path = speaker_path / "policy.md" + policy = policy_path.read_text("utf-8") + # speaker portrait + portrait_path = speaker_path / "portrait.png" + portrait = b64encode_str(portrait_path.read_bytes()) + # スタイル情報の取得 style_infos = [] for style in speaker["styles"]: id = style["id"] - icon = b64encode_str( - ( - root_dir / f"speaker_info/{speaker_uuid}/icons/{id}.png" - ).read_bytes() - ) - style_portrait_path = ( - root_dir / f"speaker_info/{speaker_uuid}/portraits/{id}.png" - ) - style_portrait = ( - b64encode_str(style_portrait_path.read_bytes()) - if style_portrait_path.exists() - else None - ) + # style icon + style_icon_path = speaker_path / "icons" / f"{id}.png" + icon = b64encode_str(style_icon_path.read_bytes()) + # style portrait + style_portrait_path = speaker_path / "portraits" / f"{id}.png" + style_portrait = None + if style_portrait_path.exists(): + style_portrait = b64encode_str(style_portrait_path.read_bytes()) + # voice samples voice_samples = [ b64encode_str( ( - root_dir - / "speaker_info/{}/voice_samples/{}_{}.wav".format( - speaker_uuid, id, str(j + 1).zfill(3) - ) + speaker_path + / "voice_samples/{}_{}.wav".format(id, str(j + 1).zfill(3)) ).read_bytes() ) for j in range(3) @@ -842,6 +864,7 @@ def speaker_info( raise HTTPException(status_code=500, detail="追加情報が見つかりませんでした") ret_data = {"policy": policy, "portrait": portrait, "style_infos": style_infos} + return ret_data @app.get( From 0769a4d58da7adf00605241ada738419f4350af6 Mon Sep 17 00:00:00 2001 From: tarepan Date: Thu, 14 Dec 2023 04:53:57 +0900 Subject: [PATCH 022/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`pre=5Fprocess`?= =?UTF-8?q?=20=E3=81=AE=E7=B4=B0=E5=88=86=E5=8C=96=20(#851)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor: モーラ・音素抽出の関数化と簡略化 * Refactor: クエリ処理テストの範囲拡大 * Refactor: 合成時の `pre_process` 置き換え --- test/test_synthesis_engine.py | 53 ++++++++++--- .../synthesis_engine/synthesis_engine.py | 78 ++++++++++--------- 2 files changed, 85 insertions(+), 46 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index b64298417..66192d32e 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -28,6 +28,7 @@ pre_process, split_mora, to_flatten_moras, + to_flatten_phonemes, unvoiced_mora_phoneme_list, ) @@ -180,6 +181,24 @@ def _gen_mora( ) +def test_to_flatten_phonemes(): + """Test `to_flatten_phonemes`.""" + # Inputs + moras = [ + _gen_mora(" ", None, None, "sil", 2 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0), + _gen_mora(" ", None, None, "sil", 6 * 0.01067, 0.0), + ] + + # Expects + true_phonemes = ["pau", "h", "i", "pau"] + + # Outputs + phonemes = list(map(lambda p: p.phoneme, to_flatten_phonemes(moras))) + + assert true_phonemes == phonemes + + def test_apply_prepost_silence(): """Test `apply_prepost_silence`.""" # Inputs @@ -430,22 +449,32 @@ def test_calc_frame_phoneme(): def test_feat_to_framescale(): """Test Mora/Phonemefeature-to-framescaleFeature pipeline.""" # Inputs + accent_phrases = [ + AccentPhrase( + moras=[ + _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), + ], + accent=1, + pause_mora=_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + ), + AccentPhrase( + moras=[ + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), + _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + ], + accent=1, + pause_mora=None, + ), + ] query = _gen_query( + accent_phrases=accent_phrases, speedScale=2.0, pitchScale=2.0, intonationScale=0.5, prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067, ) - flatten_moras = [ - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), - ] - phoneme_str = "pau k o N pau h i h O pau" - phoneme_data_list = [OjtPhoneme(p) for p in phoneme_str.split()] # Expects # frame_per_phoneme @@ -473,13 +502,15 @@ def test_feat_to_framescale(): true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) - assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites" - # Outputs + flatten_moras = to_flatten_moras(query.accent_phrases) flatten_moras = apply_prepost_silence(flatten_moras, query) flatten_moras = apply_speed_scale(flatten_moras, query) flatten_moras = apply_pitch_scale(flatten_moras, query) flatten_moras = apply_intonation_scale(flatten_moras, query) + + phoneme_data_list = to_flatten_phonemes(flatten_moras) + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) f0 = calc_frame_pitch(flatten_moras) frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 19a29432f..181273fe3 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -1,7 +1,6 @@ import math import threading -from itertools import chain -from typing import List, Optional, Tuple +from typing import List, Optional import numpy from numpy import ndarray @@ -17,30 +16,44 @@ # TODO: move mora utility to mora module -def to_flatten_moras(accent_phrases: List[AccentPhrase]) -> List[Mora]: +def to_flatten_moras(accent_phrases: list[AccentPhrase]) -> list[Mora]: """ - accent_phrasesに含まれるMora(とpause_moraがあればそれも)を - すべて一つのリストに結合する + アクセント句系列に含まれるモーラの抽出 Parameters ---------- - accent_phrases : List[AccentPhrase] - AccentPhraseのリスト + accent_phrases : list[AccentPhrase] + アクセント句系列 Returns ------- - moras : List[Mora] - 結合されたMoraのリストを返す + moras : list[Mora] + モーラ系列。ポーズモーラを含む。 """ - return list( - chain.from_iterable( - accent_phrase.moras - + ( - [accent_phrase.pause_mora] - if accent_phrase.pause_mora is not None - else [] - ) - for accent_phrase in accent_phrases - ) - ) + moras: list[Mora] = [] + for accent_phrase in accent_phrases: + moras += accent_phrase.moras + if accent_phrase.pause_mora: + moras += [accent_phrase.pause_mora] + return moras + + +def to_flatten_phonemes(moras: list[Mora]) -> list[OjtPhoneme]: + """ + モーラ系列に含まれる音素の抽出 + Parameters + ---------- + moras : list[Mora] + モーラ系列 + Returns + ------- + phonemes : list[OjtPhoneme] + 音素系列 + """ + phonemes: list[OjtPhoneme] = [] + for mora in moras: + if mora.consonant: + phonemes += [OjtPhoneme(mora.consonant)] + phonemes += [(OjtPhoneme(mora.vowel))] + return phonemes def split_mora(phoneme_list: List[OjtPhoneme]): @@ -80,8 +93,8 @@ def split_mora(phoneme_list: List[OjtPhoneme]): def pre_process( - accent_phrases: List[AccentPhrase], -) -> Tuple[List[Mora], List[OjtPhoneme]]: + accent_phrases: list[AccentPhrase], +) -> tuple[list[Mora], list[OjtPhoneme]]: """ AccentPhraseモデルのリストを整形し、処理に必要なデータの原型を作り出す Parameters @@ -92,21 +105,16 @@ def pre_process( ------- flatten_moras : List[Mora] モーラ列(前後の無音含まない) - phoneme_data_list : List[OjtPhoneme] + phonemes : List[OjtPhoneme] 音素列(前後の無音含む) """ flatten_moras = to_flatten_moras(accent_phrases) + phonemes = to_flatten_phonemes(flatten_moras) - phoneme_each_mora = [ - ([mora.consonant] if mora.consonant is not None else []) + [mora.vowel] - for mora in flatten_moras - ] - phoneme_str_list = list(chain.from_iterable(phoneme_each_mora)) - phoneme_str_list = ["pau"] + phoneme_str_list + ["pau"] - - phoneme_data_list = list(map(OjtPhoneme, phoneme_str_list)) + # 前後無音の追加 + phonemes = [OjtPhoneme("pau")] + phonemes + [OjtPhoneme("pau")] - return flatten_moras, phoneme_data_list + return flatten_moras, phonemes def generate_silence_mora(length: float) -> Mora: @@ -605,15 +613,15 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): """ # モデルがロードされていない場合はロードする self.initialize_style_id_synthesis(style_id, skip_reinit=True) - # phoneme - # AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする - flatten_moras, phoneme_data_list = pre_process(query.accent_phrases) + flatten_moras = to_flatten_moras(query.accent_phrases) flatten_moras = apply_prepost_silence(flatten_moras, query) flatten_moras = apply_speed_scale(flatten_moras, query) flatten_moras = apply_pitch_scale(flatten_moras, query) flatten_moras = apply_intonation_scale(flatten_moras, query) + phoneme_data_list = to_flatten_phonemes(flatten_moras) + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) f0 = calc_frame_pitch(flatten_moras) phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) From 5580eeecd8a1c2da2091fff0781baab260cf4429 Mon Sep 17 00:00:00 2001 From: tarepan Date: Fri, 15 Dec 2023 23:59:46 +0900 Subject: [PATCH 023/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20TTS=E7=B3=BB?= =?UTF-8?q?=E3=83=87=E3=82=A3=E3=83=AC=E3=82=AF=E3=83=88=E3=83=AA=E6=A7=8B?= =?UTF-8?q?=E9=80=A0=E3=81=AE=E6=94=B9=E5=96=84=20(#867)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 4 ++-- test/e2e/conftest.py | 2 +- test/test_acoustic_feature_extractor.py | 2 +- test/test_full_context_label.py | 2 +- test/test_kana_parser.py | 4 ++-- test/test_mock_synthesis_engine.py | 2 +- test/test_mora_list.py | 2 +- test/test_mora_to_text.py | 2 +- test/test_synthesis_engine.py | 6 +++--- test/test_synthesis_engine_base.py | 2 +- test/test_user_dict_model.py | 2 +- voicevox_engine/cancellable_engine.py | 2 +- .../{synthesis_engine => }/core_wrapper.py | 0 voicevox_engine/dev/synthesis_engine/mock.py | 4 ++-- voicevox_engine/metas/MetasStore.py | 4 +--- voicevox_engine/morphing.py | 2 +- voicevox_engine/synthesis_engine/__init__.py | 12 ------------ voicevox_engine/tts_pipeline/__init__.py | 12 ++++++++++++ .../{ => tts_pipeline}/acoustic_feature_extractor.py | 0 .../{ => tts_pipeline}/full_context_label.py | 0 voicevox_engine/{ => tts_pipeline}/kana_parser.py | 2 +- .../make_tts_engines.py} | 4 ++-- voicevox_engine/{ => tts_pipeline}/mora_list.py | 0 .../tts_engine.py} | 6 +++--- .../tts_engine_base.py} | 6 +++--- 25 files changed, 41 insertions(+), 43 deletions(-) rename voicevox_engine/{synthesis_engine => }/core_wrapper.py (100%) delete mode 100644 voicevox_engine/synthesis_engine/__init__.py create mode 100644 voicevox_engine/tts_pipeline/__init__.py rename voicevox_engine/{ => tts_pipeline}/acoustic_feature_extractor.py (100%) rename voicevox_engine/{ => tts_pipeline}/full_context_label.py (100%) rename voicevox_engine/{ => tts_pipeline}/kana_parser.py (99%) rename voicevox_engine/{synthesis_engine/make_synthesis_engines.py => tts_pipeline/make_tts_engines.py} (97%) rename voicevox_engine/{ => tts_pipeline}/mora_list.py (100%) rename voicevox_engine/{synthesis_engine/synthesis_engine.py => tts_pipeline/tts_engine.py} (99%) rename voicevox_engine/{synthesis_engine/synthesis_engine_base.py => tts_pipeline/tts_engine_base.py} (98%) diff --git a/run.py b/run.py index fad40b3b5..ec38fd41d 100644 --- a/run.py +++ b/run.py @@ -30,7 +30,6 @@ from voicevox_engine.cancellable_engine import CancellableEngine from voicevox_engine.engine_manifest import EngineManifestLoader from voicevox_engine.engine_manifest.EngineManifest import EngineManifest -from voicevox_engine.kana_parser import create_kana, parse_kana from voicevox_engine.library_manager import LibraryManager from voicevox_engine.metas.MetasStore import MetasStore, construct_lookup from voicevox_engine.model import ( @@ -66,7 +65,8 @@ Setting, SettingLoader, ) -from voicevox_engine.synthesis_engine import SynthesisEngineBase, make_synthesis_engines +from voicevox_engine.tts_pipeline import SynthesisEngineBase, make_synthesis_engines +from voicevox_engine.tts_pipeline.kana_parser import create_kana, parse_kana from voicevox_engine.user_dict import ( apply_word, delete_word, diff --git a/test/e2e/conftest.py b/test/e2e/conftest.py index b6eab18ae..9475d3b05 100644 --- a/test/e2e/conftest.py +++ b/test/e2e/conftest.py @@ -6,7 +6,7 @@ from voicevox_engine.preset import PresetManager from voicevox_engine.setting import SettingLoader -from voicevox_engine.synthesis_engine import make_synthesis_engines +from voicevox_engine.tts_pipeline import make_synthesis_engines from voicevox_engine.utility.core_version_utility import get_latest_core_version diff --git a/test/test_acoustic_feature_extractor.py b/test/test_acoustic_feature_extractor.py index 24c70d284..9e2a4867c 100644 --- a/test/test_acoustic_feature_extractor.py +++ b/test/test_acoustic_feature_extractor.py @@ -1,6 +1,6 @@ from unittest import TestCase -from voicevox_engine.acoustic_feature_extractor import OjtPhoneme +from voicevox_engine.tts_pipeline.acoustic_feature_extractor import OjtPhoneme TRUE_NUM_PHONEME = 45 diff --git a/test/test_full_context_label.py b/test/test_full_context_label.py index 7cdde34f4..0c9ce3ee0 100644 --- a/test/test_full_context_label.py +++ b/test/test_full_context_label.py @@ -2,7 +2,7 @@ from itertools import chain from unittest import TestCase -from voicevox_engine.full_context_label import ( +from voicevox_engine.tts_pipeline.full_context_label import ( AccentPhrase, BreathGroup, Mora, diff --git a/test/test_kana_parser.py b/test/test_kana_parser.py index ef800b600..3e4c19a97 100644 --- a/test/test_kana_parser.py +++ b/test/test_kana_parser.py @@ -1,9 +1,9 @@ from typing import List from unittest import TestCase -from voicevox_engine import kana_parser -from voicevox_engine.kana_parser import create_kana from voicevox_engine.model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode +from voicevox_engine.tts_pipeline import kana_parser +from voicevox_engine.tts_pipeline.kana_parser import create_kana def parse_kana(text: str) -> List[AccentPhrase]: diff --git a/test/test_mock_synthesis_engine.py b/test/test_mock_synthesis_engine.py index ce6c59825..27fee31c1 100644 --- a/test/test_mock_synthesis_engine.py +++ b/test/test_mock_synthesis_engine.py @@ -1,8 +1,8 @@ from unittest import TestCase from voicevox_engine.dev.synthesis_engine import MockSynthesisEngine -from voicevox_engine.kana_parser import create_kana from voicevox_engine.model import AccentPhrase, AudioQuery, Mora +from voicevox_engine.tts_pipeline.kana_parser import create_kana class TestMockSynthesisEngine(TestCase): diff --git a/test/test_mora_list.py b/test/test_mora_list.py index 25b287fa0..a2928205a 100644 --- a/test/test_mora_list.py +++ b/test/test_mora_list.py @@ -1,6 +1,6 @@ from unittest import TestCase -from voicevox_engine.mora_list import openjtalk_mora2text +from voicevox_engine.tts_pipeline.mora_list import openjtalk_mora2text class TestOpenJTalkMoraList(TestCase): diff --git a/test/test_mora_to_text.py b/test/test_mora_to_text.py index 691681dd1..f8f531008 100644 --- a/test/test_mora_to_text.py +++ b/test/test_mora_to_text.py @@ -1,7 +1,7 @@ from unittest import TestCase # TODO: import from voicevox_engine.synthesis_engine.mora -from voicevox_engine.synthesis_engine.synthesis_engine_base import mora_to_text +from voicevox_engine.tts_pipeline.tts_engine_base import mora_to_text class TestMoraToText(TestCase): diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 66192d32e..00730bb5d 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -7,12 +7,12 @@ import numpy -from voicevox_engine.acoustic_feature_extractor import OjtPhoneme from voicevox_engine.model import AccentPhrase, AudioQuery, Mora -from voicevox_engine.synthesis_engine import SynthesisEngine +from voicevox_engine.tts_pipeline import SynthesisEngine +from voicevox_engine.tts_pipeline.acoustic_feature_extractor import OjtPhoneme # TODO: import from voicevox_engine.synthesis_engine.mora -from voicevox_engine.synthesis_engine.synthesis_engine import ( +from voicevox_engine.tts_pipeline.tts_engine import ( apply_intonation_scale, apply_output_sampling_rate, apply_output_stereo, diff --git a/test/test_synthesis_engine_base.py b/test/test_synthesis_engine_base.py index c49dcbe01..7fa8fd676 100644 --- a/test/test_synthesis_engine_base.py +++ b/test/test_synthesis_engine_base.py @@ -5,7 +5,7 @@ import numpy from voicevox_engine.model import AccentPhrase, AudioQuery, Mora -from voicevox_engine.synthesis_engine import SynthesisEngine +from voicevox_engine.tts_pipeline import SynthesisEngine def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.ndarray): diff --git a/test/test_user_dict_model.py b/test/test_user_dict_model.py index 9a3a49021..646340c6c 100644 --- a/test/test_user_dict_model.py +++ b/test/test_user_dict_model.py @@ -3,8 +3,8 @@ from pydantic import ValidationError -from voicevox_engine.kana_parser import parse_kana from voicevox_engine.model import UserDictWord +from voicevox_engine.tts_pipeline.kana_parser import parse_kana class TestUserDictWords(TestCase): diff --git a/voicevox_engine/cancellable_engine.py b/voicevox_engine/cancellable_engine.py index c473c3e4a..140a7f138 100644 --- a/voicevox_engine/cancellable_engine.py +++ b/voicevox_engine/cancellable_engine.py @@ -17,7 +17,7 @@ from fastapi import HTTPException, Request from .model import AudioQuery -from .synthesis_engine import make_synthesis_engines +from .tts_pipeline import make_synthesis_engines from .utility import get_latest_core_version diff --git a/voicevox_engine/synthesis_engine/core_wrapper.py b/voicevox_engine/core_wrapper.py similarity index 100% rename from voicevox_engine/synthesis_engine/core_wrapper.py rename to voicevox_engine/core_wrapper.py diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index 1b6c4abeb..ec366b31b 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -6,8 +6,8 @@ from soxr import resample from ...model import AccentPhrase, AudioQuery -from ...synthesis_engine import SynthesisEngineBase -from ...synthesis_engine.synthesis_engine import to_flatten_moras +from ...tts_pipeline import SynthesisEngineBase +from ...tts_pipeline.tts_engine import to_flatten_moras class MockSynthesisEngine(SynthesisEngineBase): diff --git a/voicevox_engine/metas/MetasStore.py b/voicevox_engine/metas/MetasStore.py index c8367e831..497b2723e 100644 --- a/voicevox_engine/metas/MetasStore.py +++ b/voicevox_engine/metas/MetasStore.py @@ -5,9 +5,7 @@ from voicevox_engine.metas.Metas import CoreSpeaker, EngineSpeaker, Speaker, StyleInfo if TYPE_CHECKING: - from voicevox_engine.synthesis_engine.synthesis_engine_base import ( - SynthesisEngineBase, - ) + from voicevox_engine.tts_pipeline.tts_engine_base import SynthesisEngineBase class MetasStore: diff --git a/voicevox_engine/morphing.py b/voicevox_engine/morphing.py index 74c82fb7d..89a2498c3 100644 --- a/voicevox_engine/morphing.py +++ b/voicevox_engine/morphing.py @@ -10,7 +10,7 @@ from .metas.Metas import Speaker, SpeakerSupportPermittedSynthesisMorphing, StyleInfo from .metas.MetasStore import construct_lookup from .model import AudioQuery, MorphableTargetInfo, StyleIdNotFoundError -from .synthesis_engine import SynthesisEngine +from .tts_pipeline import SynthesisEngine # FIXME: ndarray type hint, https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/blob/2b64f86197573497c685c785c6e0e743f407b63e/pyworld/pyworld.pyx#L398 # noqa diff --git a/voicevox_engine/synthesis_engine/__init__.py b/voicevox_engine/synthesis_engine/__init__.py deleted file mode 100644 index 3e7f6a1ef..000000000 --- a/voicevox_engine/synthesis_engine/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .core_wrapper import CoreWrapper, load_runtime_lib -from .make_synthesis_engines import make_synthesis_engines -from .synthesis_engine import SynthesisEngine -from .synthesis_engine_base import SynthesisEngineBase - -__all__ = [ - "CoreWrapper", - "load_runtime_lib", - "make_synthesis_engines", - "SynthesisEngine", - "SynthesisEngineBase", -] diff --git a/voicevox_engine/tts_pipeline/__init__.py b/voicevox_engine/tts_pipeline/__init__.py new file mode 100644 index 000000000..2fce842ba --- /dev/null +++ b/voicevox_engine/tts_pipeline/__init__.py @@ -0,0 +1,12 @@ +from ..core_wrapper import CoreWrapper, load_runtime_lib +from .make_tts_engines import make_synthesis_engines +from .tts_engine import SynthesisEngine +from .tts_engine_base import SynthesisEngineBase + +__all__ = [ + "CoreWrapper", + "load_runtime_lib", + "make_synthesis_engines", + "SynthesisEngine", + "SynthesisEngineBase", +] diff --git a/voicevox_engine/acoustic_feature_extractor.py b/voicevox_engine/tts_pipeline/acoustic_feature_extractor.py similarity index 100% rename from voicevox_engine/acoustic_feature_extractor.py rename to voicevox_engine/tts_pipeline/acoustic_feature_extractor.py diff --git a/voicevox_engine/full_context_label.py b/voicevox_engine/tts_pipeline/full_context_label.py similarity index 100% rename from voicevox_engine/full_context_label.py rename to voicevox_engine/tts_pipeline/full_context_label.py diff --git a/voicevox_engine/kana_parser.py b/voicevox_engine/tts_pipeline/kana_parser.py similarity index 99% rename from voicevox_engine/kana_parser.py rename to voicevox_engine/tts_pipeline/kana_parser.py index 14efb4672..430960156 100644 --- a/voicevox_engine/kana_parser.py +++ b/voicevox_engine/tts_pipeline/kana_parser.py @@ -5,7 +5,7 @@ from typing import List, Optional -from .model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode +from ..model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode from .mora_list import openjtalk_text2mora _LOOP_LIMIT = 300 diff --git a/voicevox_engine/synthesis_engine/make_synthesis_engines.py b/voicevox_engine/tts_pipeline/make_tts_engines.py similarity index 97% rename from voicevox_engine/synthesis_engine/make_synthesis_engines.py rename to voicevox_engine/tts_pipeline/make_tts_engines.py index 848a601af..09183574a 100644 --- a/voicevox_engine/synthesis_engine/make_synthesis_engines.py +++ b/voicevox_engine/tts_pipeline/make_tts_engines.py @@ -3,9 +3,9 @@ from pathlib import Path from typing import Dict, List, Optional +from ..core_wrapper import CoreWrapper, load_runtime_lib from ..utility import engine_root, get_save_dir -from .core_wrapper import CoreWrapper, load_runtime_lib -from .synthesis_engine import SynthesisEngine, SynthesisEngineBase +from .tts_engine import SynthesisEngine, SynthesisEngineBase def make_synthesis_engines( diff --git a/voicevox_engine/mora_list.py b/voicevox_engine/tts_pipeline/mora_list.py similarity index 100% rename from voicevox_engine/mora_list.py rename to voicevox_engine/tts_pipeline/mora_list.py diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py similarity index 99% rename from voicevox_engine/synthesis_engine/synthesis_engine.py rename to voicevox_engine/tts_pipeline/tts_engine.py index 181273fe3..372900c6f 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -6,10 +6,10 @@ from numpy import ndarray from soxr import resample -from ..acoustic_feature_extractor import OjtPhoneme +from ..core_wrapper import CoreWrapper, OldCoreError from ..model import AccentPhrase, AudioQuery, Mora -from .core_wrapper import CoreWrapper, OldCoreError -from .synthesis_engine_base import SynthesisEngineBase +from .acoustic_feature_extractor import OjtPhoneme +from .tts_engine_base import SynthesisEngineBase unvoiced_mora_phoneme_list = ["A", "I", "U", "E", "O", "cl", "pau"] mora_phoneme_list = ["a", "i", "u", "e", "o", "N"] + unvoiced_mora_phoneme_list diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/tts_pipeline/tts_engine_base.py similarity index 98% rename from voicevox_engine/synthesis_engine/synthesis_engine_base.py rename to voicevox_engine/tts_pipeline/tts_engine_base.py index 6a139a830..502580f8e 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/tts_pipeline/tts_engine_base.py @@ -4,10 +4,10 @@ import numpy as np -from .. import full_context_label -from ..full_context_label import extract_full_context_label from ..model import AccentPhrase, AudioQuery, Mora -from ..mora_list import openjtalk_mora2text +from . import full_context_label +from .full_context_label import extract_full_context_label +from .mora_list import openjtalk_mora2text def mora_to_text(mora: str) -> str: From a20c82b8bb061de49134d37e25ebbb3567bd9acb Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 16 Dec 2023 00:49:05 +0900 Subject: [PATCH 024/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=93?= =?UTF-8?q?=E3=83=AB=E3=83=89=E3=83=84=E3=83=BC=E3=83=AB=E3=81=AE=E7=A7=BB?= =?UTF-8?q?=E5=8B=95=20(#874)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 4 ++-- build_util/create_venv_and_generate_licenses.bash | 2 +- generate_licenses.py => build_util/generate_licenses.py | 0 get_cost_candidates.py => build_util/get_cost_candidates.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename generate_licenses.py => build_util/generate_licenses.py (100%) rename get_cost_candidates.py => build_util/get_cost_candidates.py (97%) diff --git a/Dockerfile b/Dockerfile index 545449a7c..dd056d78f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -227,7 +227,7 @@ COPY --from=download-onnxruntime-env /opt/onnxruntime /opt/onnxruntime # Add local files ADD ./voicevox_engine /opt/voicevox_engine/voicevox_engine ADD ./docs /opt/voicevox_engine/docs -ADD ./run.py ./generate_licenses.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ +ADD ./run.py ./build_util/generate_licenses.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ ADD ./speaker_info /opt/voicevox_engine/speaker_info ADD ./ui_template /opt/voicevox_engine/ui_template ADD ./engine_manifest_assets /opt/voicevox_engine/engine_manifest_assets @@ -249,7 +249,7 @@ RUN < /opt/voicevox_engine/engine_manifest_assets/dependency_licenses.json + gosu user /opt/python/bin/python3 build_util/generate_licenses.py > /opt/voicevox_engine/engine_manifest_assets/dependency_licenses.json cp /opt/voicevox_engine/engine_manifest_assets/dependency_licenses.json /opt/voicevox_engine/licenses.json EOF diff --git a/build_util/create_venv_and_generate_licenses.bash b/build_util/create_venv_and_generate_licenses.bash index d2c837dbf..71a5f61c9 100644 --- a/build_util/create_venv_and_generate_licenses.bash +++ b/build_util/create_venv_and_generate_licenses.bash @@ -17,7 +17,7 @@ else fi pip install -r requirements-license.txt -python generate_licenses.py >$OUTPUT_LICENSE_JSON_PATH +python build_util/generate_licenses.py >$OUTPUT_LICENSE_JSON_PATH deactivate diff --git a/generate_licenses.py b/build_util/generate_licenses.py similarity index 100% rename from generate_licenses.py rename to build_util/generate_licenses.py diff --git a/get_cost_candidates.py b/build_util/get_cost_candidates.py similarity index 97% rename from get_cost_candidates.py rename to build_util/get_cost_candidates.py index 072c4b4d5..785a0c4df 100644 --- a/get_cost_candidates.py +++ b/build_util/get_cost_candidates.py @@ -3,7 +3,7 @@ 引数のnaist_jdic_pathには、open_jtalkのsrc/mecab-naist-jdic/naist-jdic.csvを指定してください。 実行例: -python get_cost_candidates.py --naist_jdic_path=/path/to/naist-jdic.csv \ +python build_util/get_cost_candidates.py --naist_jdic_path=/path/to/naist-jdic.csv \ --pos=名詞 \ --pos_detail_1=固有名詞 \ --pos_detail_2=一般 \ From ec1f70e52b1df5628317311fced841f4e9d8877c Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 16 Dec 2023 01:05:09 +0900 Subject: [PATCH 025/177] =?UTF-8?q?=E5=BB=83=E6=AD=A2:=20`MetasStore`=20?= =?UTF-8?q?=E3=81=AE=E4=B8=8D=E4=BD=BF=E7=94=A8=E9=96=A2=E6=95=B0=20(#875)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voicevox_engine/metas/MetasStore.py | 52 ++++------------------------- 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/voicevox_engine/metas/MetasStore.py b/voicevox_engine/metas/MetasStore.py index 497b2723e..78f838a2a 100644 --- a/voicevox_engine/metas/MetasStore.py +++ b/voicevox_engine/metas/MetasStore.py @@ -20,7 +20,6 @@ def __init__(self, engine_speakers_path: Path) -> None: engine_speakers_path : Path エンジンに含まれる話者メタ情報ディレクトリのパス。 """ - self._engine_speakers_path = engine_speakers_path # エンジンに含まれる各話者のメタ情報 self._loaded_metas: Dict[str, EngineSpeaker] = { folder.name: EngineSpeaker( @@ -29,41 +28,6 @@ def __init__(self, engine_speakers_path: Path) -> None: for folder in engine_speakers_path.iterdir() } - def speaker_engine_metas(self, speaker_uuid: str) -> EngineSpeaker: - """ - エンジンに含まれる指定話者のメタ情報を取得 - Parameters - ---------- - speaker_uuid : str - 話者UUID - Returns - ------- - ret : EngineSpeaker - エンジンに含まれる指定話者のメタ情報 - """ - return self.loaded_metas[speaker_uuid] - - def combine_metas(self, core_metas: List[CoreSpeaker]) -> List[Speaker]: - """ - コアに含まれる話者メタ情報に、エンジンに含まれる話者メタ情報を統合して返す - Parameters - ---------- - core_metas : List[CoreSpeaker] - コアに含まれる話者メタ情報 - Returns - ------- - ret : List[Speaker] - エンジンとコアに含まれる話者メタ情報 - """ - # 話者単位でエンジン・コアに含まれるメタ情報を統合 - return [ - Speaker( - **self.speaker_engine_metas(speaker_meta.speaker_uuid).dict(), - **speaker_meta.dict(), - ) - for speaker_meta in core_metas - ] - # FIXME: engineではなくList[CoreSpeaker]を渡す形にすることで # SynthesisEngineBaseによる循環importを修正する def load_combined_metas(self, engine: "SynthesisEngineBase") -> List[Speaker]: @@ -81,15 +45,13 @@ def load_combined_metas(self, engine: "SynthesisEngineBase") -> List[Speaker]: # コアに含まれる話者メタ情報の収集 core_metas = [CoreSpeaker(**speaker) for speaker in json.loads(engine.speakers)] # エンジンに含まれる話者メタ情報との統合 - return self.combine_metas(core_metas) - - @property - def engine_speakers_path(self) -> Path: - return self._engine_speakers_path - - @property - def loaded_metas(self) -> Dict[str, EngineSpeaker]: - return self._loaded_metas + return [ + Speaker( + **self.self._loaded_metas[speaker_meta.speaker_uuid].dict(), + **speaker_meta.dict(), + ) + for speaker_meta in core_metas + ] def construct_lookup(speakers: List[Speaker]) -> Dict[int, Tuple[Speaker, StyleInfo]]: From f8750a1beac6c51ab7db6541d7978c92a7d02a0d Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 16 Dec 2023 01:19:23 +0900 Subject: [PATCH 026/177] =?UTF-8?q?hotfix/build=5Futil=E3=83=87=E3=82=A3?= =?UTF-8?q?=E3=83=AC=E3=82=AF=E3=83=88=E3=83=AA=E5=86=85=E3=81=AE=E3=82=B9?= =?UTF-8?q?=E3=82=AF=E3=83=AA=E3=83=97=E3=83=88=E3=81=AEDockerfile?= =?UTF-8?q?=E5=86=85=E3=81=A7=E3=81=AE=E3=82=B3=E3=83=94=E3=83=BC=E3=82=92?= =?UTF-8?q?=E6=AD=A3=E3=81=97=E3=81=84=E5=BD=A2=E3=81=AB=20(#878)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index dd056d78f..ef8ca0727 100644 --- a/Dockerfile +++ b/Dockerfile @@ -227,7 +227,8 @@ COPY --from=download-onnxruntime-env /opt/onnxruntime /opt/onnxruntime # Add local files ADD ./voicevox_engine /opt/voicevox_engine/voicevox_engine ADD ./docs /opt/voicevox_engine/docs -ADD ./run.py ./build_util/generate_licenses.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ +ADD ./run.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ +ADD ./build_util/generate_licenses.py /opt/voicevox_engine/build_util ADD ./speaker_info /opt/voicevox_engine/speaker_info ADD ./ui_template /opt/voicevox_engine/ui_template ADD ./engine_manifest_assets /opt/voicevox_engine/engine_manifest_assets From 6a2a010c400edae6e9a34f787edf8e0d9847ab4e Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 16 Dec 2023 01:30:52 +0900 Subject: [PATCH 027/177] =?UTF-8?q?[hotfix]=20Dockerfile=E3=81=AE=E3=83=87?= =?UTF-8?q?=E3=82=A3=E3=83=AC=E3=82=AF=E3=83=88=E3=83=AA=E4=BB=A5=E4=B8=8B?= =?UTF-8?q?=E3=81=B8=E3=81=AE=E3=82=B3=E3=83=94=E3=83=BC=E3=81=AE=E3=82=B9?= =?UTF-8?q?=E3=83=A9=E3=83=83=E3=82=B7=E3=83=A5=E5=BF=98=E3=82=8C=20(#879)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ef8ca0727..225b84bf9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -228,7 +228,7 @@ COPY --from=download-onnxruntime-env /opt/onnxruntime /opt/onnxruntime ADD ./voicevox_engine /opt/voicevox_engine/voicevox_engine ADD ./docs /opt/voicevox_engine/docs ADD ./run.py ./presets.yaml ./default.csv ./engine_manifest.json /opt/voicevox_engine/ -ADD ./build_util/generate_licenses.py /opt/voicevox_engine/build_util +ADD ./build_util/generate_licenses.py /opt/voicevox_engine/build_util/ ADD ./speaker_info /opt/voicevox_engine/speaker_info ADD ./ui_template /opt/voicevox_engine/ui_template ADD ./engine_manifest_assets /opt/voicevox_engine/engine_manifest_assets From 6c80586851fdecc52ef9f65ae0594f833296fe1b Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 16 Dec 2023 20:50:52 +0900 Subject: [PATCH 028/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=95?= =?UTF-8?q?=E3=83=AB=E3=82=B3=E3=83=B3=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88?= =?UTF-8?q?=E3=83=A9=E3=83=99=E3=83=AB=E9=96=A2=E9=80=A3=E3=82=B3=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88/docstring/=E5=9E=8B=E3=83=92=E3=83=B3?= =?UTF-8?q?=E3=83=88=20(#880)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba Kazuyuki --- .../tts_pipeline/full_context_label.py | 160 +++++++++--------- 1 file changed, 84 insertions(+), 76 deletions(-) diff --git a/voicevox_engine/tts_pipeline/full_context_label.py b/voicevox_engine/tts_pipeline/full_context_label.py index 5ca599276..1e61a17b0 100644 --- a/voicevox_engine/tts_pipeline/full_context_label.py +++ b/voicevox_engine/tts_pipeline/full_context_label.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass from itertools import chain -from typing import Dict, List, Optional +from typing import Self import pyopenjtalk @@ -14,11 +14,11 @@ class Phoneme: Attributes ---------- - contexts: Dict[str, str] + contexts: dict[str, str] 音素の元 """ - contexts: Dict[str, str] + contexts: dict[str, str] @classmethod def from_label(cls, label: str): @@ -81,11 +81,11 @@ def label(self): @property def phoneme(self): """ - 音素クラスの中で、発声に必要な要素を返す + 音素クラスの中で、発声に必要なcontextを返す Returns ------- phoneme : str - 発声に必要な要素を返す + 発声に必要なcontextを返す """ return self.contexts["p3"] @@ -111,13 +111,13 @@ class Mora: Attributes ---------- - consonant : Optional[Phoneme] + consonant : Phoneme | None 子音 vowel : Phoneme 母音 """ - consonant: Optional[Phoneme] + consonant: Phoneme | None vowel: Phoneme def set_context(self, key: str, value: str): @@ -141,7 +141,7 @@ def phonemes(self): 音素群を返す Returns ------- - phonemes : List[Phoneme] + phonemes : list[Phoneme] 母音しかない場合は母音のみ、子音もある場合は子音、母音の順番でPhonemeのリストを返す """ if self.consonant is not None: @@ -155,7 +155,7 @@ def labels(self): ラベル群を返す Returns ------- - labels : List[str] + labels : list[str] Moraに含まれるすべてのラベルを返す """ return [p.label for p in self.phonemes] @@ -168,62 +168,69 @@ class AccentPhrase: 同じアクセントのMoraを複数保持する Attributes ---------- - moras : List[Mora] + moras : list[Mora] 音韻のリスト accent : int アクセント """ - moras: List[Mora] + moras: list[Mora] accent: int is_interrogative: bool @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - PhonemeのリストからAccentPhraseクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す + def from_phonemes(cls, phonemes: list[Phoneme]) -> Self: + """音素系列をcontextで区切りAccentPhraseインスタンスを生成する""" - Returns - ------- - accent_phrase : AccentPhrase - AccentPhraseクラスを返す - """ - moras: List[Mora] = [] + # NOTE:「モーラごとの音素系列」は音素系列をcontextで区切り生成される。 + + moras: list[Mora] = [] # モーラ系列 + mora_phonemes: list[Phoneme] = [] # モーラごとの音素系列を一時保存するコンテナ - mora_phonemes: List[Phoneme] = [] for phoneme, next_phoneme in zip(phonemes, phonemes[1:] + [None]): - # workaround for Hihosiba/voicevox_engine#57 - # (py)openjtalk によるアクセント句内のモーラへの附番は 49 番目まで - # 49 番目のモーラについて、続く音素のモーラ番号を単一モーラの特定に使えない + # モーラ抽出を打ち切る(ワークアラウンド、VOICEVOX/voicevox_engine#57) + # context a2(モーラ番号)の最大値が 49 であるため、49番目以降のモーラでは音素のモーラ番号を区切りに使えない if int(phoneme.contexts["a2"]) == 49: break + # 区切りまで音素系列を一時保存する mora_phonemes.append(phoneme) + # 一時的な音素系列を確定させて処理する + # a2はアクセント句内でのモーラ番号(1~49) if ( next_phoneme is None or phoneme.contexts["a2"] != next_phoneme.contexts["a2"] ): + # モーラごとの音素系列長に基づいて子音と母音を得る if len(mora_phonemes) == 1: consonant, vowel = None, mora_phonemes[0] elif len(mora_phonemes) == 2: consonant, vowel = mora_phonemes[0], mora_phonemes[1] else: raise ValueError(mora_phonemes) + # 子音と母音からモーラを生成して保存する mora = Mora(consonant=consonant, vowel=vowel) moras.append(mora) + # 次に向けてリセット mora_phonemes = [] + # アクセント位置を決定する + # f2はアクセント句のアクセント位置(1~49) accent = int(moras[0].vowel.contexts["f2"]) - # workaround for Hihosiba/voicevox_engine#55 - # アクセント位置とするキー f2 の値がアクセント句内のモーラ数を超える場合がある + # f2 の値がアクセント句内のモーラ数を超える場合はクリップ(ワークアラウンド、VOICEVOX/voicevox_engine#55 を参照) accent = accent if accent <= len(moras) else len(moras) + + # 疑問文か否か判定する(末尾モーラ母音のcontextに基づく) + # f3はアクセント句が疑問文かどうか(1で疑問文) is_interrogative = moras[-1].vowel.contexts["f3"] == "1" - return cls(moras=moras, accent=accent, is_interrogative=is_interrogative) + + # AccentPhrase インスタンスを生成する + accent_phrase = cls( + moras=moras, accent=accent, is_interrogative=is_interrogative + ) + + return accent_phrase def set_context(self, key: str, value: str): """ @@ -244,7 +251,7 @@ def phonemes(self): 音素群を返す Returns ------- - phonemes : List[Phoneme] + phonemes : list[Phoneme] AccentPhraseに間接的に含まれる全てのPhonemeを返す """ return list(chain.from_iterable(m.phonemes for m in self.moras)) @@ -255,7 +262,7 @@ def labels(self): ラベル群を返す Returns ------- - labels : List[str] + labels : list[str] AccentPhraseに間接的に含まれる全てのラベルを返す """ return [p.label for p in self.phonemes] @@ -288,41 +295,43 @@ class BreathGroup: アクセントの異なるアクセント句を複数保持する Attributes ---------- - accent_phrases : List[AccentPhrase] + accent_phrases : list[AccentPhrase] アクセント句のリスト """ - accent_phrases: List[AccentPhrase] + accent_phrases: list[AccentPhrase] @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - PhonemeのリストからBreathGroupクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す + def from_phonemes(cls, phonemes: list[Phoneme]) -> Self: + """音素系列をcontextで区切りBreathGroupインスタンスを生成する""" + + # NOTE:「アクセント句ごとの音素系列」は音素系列をcontextで区切り生成される。 + + accent_phrases: list[AccentPhrase] = [] # アクセント句系列 + accent_phonemes: list[Phoneme] = [] # アクセント句ごとの音素系列を一時保存するコンテナ - Returns - ------- - breath_group : BreathGroup - BreathGroupクラスを返す - """ - accent_phrases: List[AccentPhrase] = [] - accent_phonemes: List[Phoneme] = [] for phoneme, next_phoneme in zip(phonemes, phonemes[1:] + [None]): + # 区切りまで音素系列を一時保存する accent_phonemes.append(phoneme) + # 一時的な音素系列を確定させて処理する + # i3はBreathGroupの番号 + # f5はBreathGroup内でのアクセント句の番号 if ( next_phoneme is None or phoneme.contexts["i3"] != next_phoneme.contexts["i3"] or phoneme.contexts["f5"] != next_phoneme.contexts["f5"] ): + # アクセント句を生成して保存する accent_phrase = AccentPhrase.from_phonemes(accent_phonemes) accent_phrases.append(accent_phrase) + # 次に向けてリセット accent_phonemes = [] - return cls(accent_phrases=accent_phrases) + # BreathGroup インスタンスを生成する + breath_group = cls(accent_phrases=accent_phrases) + + return breath_group def set_context(self, key: str, value: str): """ @@ -343,7 +352,7 @@ def phonemes(self): 音素群を返す Returns ------- - phonemes : List[Phoneme] + phonemes : list[Phoneme] BreathGroupに間接的に含まれる全てのPhonemeを返す """ return list( @@ -358,7 +367,7 @@ def labels(self): ラベル群を返す Returns ------- - labels : List[str] + labels : list[str] BreathGroupに間接的に含まれる全てのラベルを返す """ return [p.label for p in self.phonemes] @@ -371,46 +380,45 @@ class Utterance: 発声の区切りと無音を複数保持する Attributes ---------- - breath_groups : List[BreathGroup] + breath_groups : list[BreathGroup] 発声の区切りのリスト - pauses : List[Phoneme] + pauses : list[Phoneme] 無音のリスト """ - breath_groups: List[BreathGroup] - pauses: List[Phoneme] + breath_groups: list[BreathGroup] + pauses: list[Phoneme] @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - Phonemeの完全なリストからUtteranceクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す + def from_phonemes(cls, phonemes: list[Phoneme]) -> Self: + """音素系列をポーズで区切りUtteranceインスタンスを生成する""" - Returns - ------- - utterance : Utterance - Utteranceクラスを返す - """ - pauses: List[Phoneme] = [] + # NOTE:「BreathGroupごとの音素系列」は音素系列をポーズで区切り生成される。 + + pauses: list[Phoneme] = [] # ポーズ音素のリスト + breath_groups: list[BreathGroup] = [] # BreathGroup のリスト + group_phonemes: list[Phoneme] = [] # BreathGroupごとの音素系列を一時保存するコンテナ - breath_groups: List[BreathGroup] = [] - group_phonemes: List[Phoneme] = [] for phoneme in phonemes: + # ポーズが出現するまで音素系列を一時保存する if not phoneme.is_pause(): group_phonemes.append(phoneme) + # 一時的な音素系列を確定させて処理する else: + # ポーズ音素を保存する pauses.append(phoneme) - if len(group_phonemes) > 0: + # 音素系列からBreathGroupを生成して保存する breath_group = BreathGroup.from_phonemes(group_phonemes) breath_groups.append(breath_group) + # 次に向けてリセット group_phonemes = [] - return cls(breath_groups=breath_groups, pauses=pauses) + # Utteranceインスタンスを生成する + utterance = cls(breath_groups=breath_groups, pauses=pauses) + + return utterance def set_context(self, key: str, value: str): """ @@ -431,7 +439,7 @@ def phonemes(self): 音素群を返す Returns ------- - phonemes : List[Phoneme] + phonemes : list[Phoneme] Utteranceクラスに直接的・間接的に含まれる、全てのPhonemeを返す """ accent_phrases = list( @@ -496,7 +504,7 @@ def phonemes(self): ), ) - phonemes: List[Phoneme] = [] + phonemes: list[Phoneme] = [] for i in range(len(self.pauses)): if self.pauses[i] is not None: phonemes += [self.pauses[i]] @@ -512,7 +520,7 @@ def labels(self): ラベル群を返す Returns ------- - labels : List[str] + labels : list[str] Utteranceクラスに直接的・間接的に含まれる全てのラベルを返す """ return [p.label for p in self.phonemes] From b8e1831d1eabb178b0eba8c039e8b97aedf3279b Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 16 Dec 2023 21:37:33 +0900 Subject: [PATCH 029/177] =?UTF-8?q?=E8=BF=BD=E5=8A=A0:=20=E7=B5=B1?= =?UTF-8?q?=E4=B8=80=E3=83=89=E3=83=A1=E3=82=A4=E3=83=B3=E7=94=A8=E8=AA=9E?= =?UTF-8?q?=20`=E9=9F=B3=E5=A3=B0=E5=90=88=E6=88=90=E7=94=A8=E3=81=AE?= =?UTF-8?q?=E3=82=AF=E3=82=A8=E3=83=AA`=20(#863)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- run.py | 4 ++-- voicevox_engine/dev/synthesis_engine/mock.py | 2 +- voicevox_engine/model.py | 2 +- voicevox_engine/tts_pipeline/tts_engine.py | 18 +++++++++--------- .../tts_pipeline/tts_engine_base.py | 8 ++++---- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 20267570c..3e0eb1931 100644 --- a/README.md +++ b/README.md @@ -217,7 +217,7 @@ curl -s -X GET "127.0.0.1:50021/presets" > presets.json preset_id=$(cat presets.json | sed -r 's/^.+"id"\:\s?([0-9]+?).+$/\1/g') style_id=$(cat presets.json | sed -r 's/^.+"style_id"\:\s?([0-9]+?).+$/\1/g') -# AudioQueryの取得 +# 音声合成用のクエリを取得 curl -s \ -X POST \ "127.0.0.1:50021/audio_query_from_preset?preset_id=$preset_id"\ @@ -589,7 +589,7 @@ VOICEVOX ENGINE リポジトリを fork し、一部の機能を改造するの ダミーのアイコンなどが用意されているので適宜変更してください。 音声合成は`voicevox_engine/synthesis_engine/synthesis_engine.py`で行われています。 -VOICEVOX API での音声合成は、エンジン側で音声合成クエリ`AudioQuery`の初期値を作成してユーザーに返し、ユーザーが必要に応じてクエリを編集したあと、エンジンがクエリに従って音声合成することで実現しています。 +VOICEVOX API での音声合成は、エンジン側で音声合成用のクエリ `AudioQuery` の初期値を作成してユーザーに返し、ユーザーが必要に応じてクエリを編集したあと、エンジンがクエリに従って音声合成することで実現しています。 クエリ作成は`/audio_query`エンドポイントで、音声合成は`/synthesis`エンドポイントで行っており、最低この2つに対応すれば VOICEVOX API に準拠したことになります。 #### マルチエンジン機能対応エンジンの配布方法 diff --git a/run.py b/run.py index ec38fd41d..aa3c3afec 100644 --- a/run.py +++ b/run.py @@ -246,7 +246,7 @@ def audio_query( core_version: str | None = None, ) -> AudioQuery: """ - クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + 音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 """ style_id = get_style_id_from_deprecated(style_id=style_id, speaker_id=speaker) engine = get_engine(core_version) @@ -276,7 +276,7 @@ def audio_query_from_preset( core_version: str | None = None, ) -> AudioQuery: """ - クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + 音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 """ engine = get_engine(core_version) try: diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index ec366b31b..3cb72dc79 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -87,7 +87,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int) -> np.ndarray: Parameters ---------- query : AudioQuery - /audio_query APIで得たjson + 音声合成用のクエリ style_id : int スタイルID diff --git a/voicevox_engine/model.py b/voicevox_engine/model.py index 1c02aa168..c4d19ba69 100644 --- a/voicevox_engine/model.py +++ b/voicevox_engine/model.py @@ -59,7 +59,7 @@ class AudioQuery(BaseModel): postPhonemeLength: float = Field(title="音声の後の無音時間") outputSamplingRate: int = Field(title="音声データの出力サンプリングレート") outputStereo: bool = Field(title="音声データをステレオ出力するか否か") - kana: Optional[str] = Field(title="[読み取り専用]AquesTalk風記法によるテキスト。音声合成クエリとしては無視される") + kana: Optional[str] = Field(title="[読み取り専用]AquesTalk風記法によるテキスト。音声合成用のクエリとしては無視される") def __hash__(self): items = [ diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 372900c6f..c05c122dd 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -130,7 +130,7 @@ def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: moras : List[Mora] モーラ時系列 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- moras : List[Mora] @@ -150,7 +150,7 @@ def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: moras : list[Mora] モーラ系列 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- moras : list[Mora] @@ -216,7 +216,7 @@ def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: moras : list[Mora] モーラ系列 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- moras : list[Mora] @@ -235,7 +235,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: moras : list[Mora] モーラ系列 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- moras : list[Mora] @@ -281,7 +281,7 @@ def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray: wave : numpy.ndarray 音声波形 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- wave : numpy.ndarray @@ -326,7 +326,7 @@ def apply_output_sampling_rate( sr_wave : int `wave`のサンプリングレート query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- wave : ndarray @@ -348,7 +348,7 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: wave : ndarray 音声波形 query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ Returns ------- wave : ndarray @@ -599,11 +599,11 @@ def _create_one_hot(accent_phrase: AccentPhrase, position: int): def _synthesis_impl(self, query: AudioQuery, style_id: int): """ - 音声合成クエリから音声合成に必要な情報を構成し、実際に音声合成を行う + 音声合成用のクエリから音声合成に必要な情報を構成し、実際に音声合成を行う Parameters ---------- query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ style_id : int スタイルID Returns diff --git a/voicevox_engine/tts_pipeline/tts_engine_base.py b/voicevox_engine/tts_pipeline/tts_engine_base.py index 502580f8e..f4eeda039 100644 --- a/voicevox_engine/tts_pipeline/tts_engine_base.py +++ b/voicevox_engine/tts_pipeline/tts_engine_base.py @@ -307,12 +307,12 @@ def synthesis( enable_interrogative_upspeak: bool = True, ) -> np.ndarray: """ - 音声合成クエリ内の疑問文指定されたMoraを変形した後、 + 音声合成用のクエリ内の疑問文指定されたMoraを変形した後、 継承先における実装`_synthesis_impl`を使い音声合成を行う Parameters ---------- query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ style_id : int スタイルID enable_interrogative_upspeak : bool @@ -337,11 +337,11 @@ def _synthesis_impl( style_id: int, ) -> np.ndarray: """ - 音声合成クエリから音声合成に必要な情報を構成し、実際に音声合成を行う + 音声合成用のクエリから音声合成に必要な情報を構成し、実際に音声合成を行う Parameters ---------- query : AudioQuery - 音声合成クエリ + 音声合成用のクエリ style_id : int スタイルID Returns From 58a993d3d5ef299f92c731c61113809cd618808f Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 17 Dec 2023 08:27:15 +0900 Subject: [PATCH 030/177] =?UTF-8?q?=E8=BF=BD=E5=8A=A0:=20=E9=9F=B3?= =?UTF-8?q?=E5=A3=B0=E3=83=A9=E3=82=A4=E3=83=96=E3=83=A9=E3=83=AA=E8=87=AA?= =?UTF-8?q?=E5=8B=95=E8=AA=AD=E3=81=BF=E8=BE=BC=E3=81=BF=20docs=20(#869)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 3e0eb1931..589da6944 100644 --- a/README.md +++ b/README.md @@ -470,6 +470,19 @@ Mac では、`--runtime_dir`引数の代わりに`DYLD_LIBRARY_PATH`の指定が DYLD_LIBRARY_PATH="/path/to/onnx" python run.py --voicelib_dir="/path/to/voicevox_core" ``` +##### ユーザーディレクトリに配置する + +以下のディレクトリにある音声ライブラリは自動で読み込まれます。 + +- ビルド版: `/voicevox-engine/core_libraries/` +- Python 版: `/voicevox-engine-dev/core_libraries/` + +``は OS によって異なります。 + +- Windows: `C:\Users\\AppData\Local\` +- macOS: `/Users//Library/Application\ Support/` +- Linux: `/home//.local/share/` + ### ビルド この方法でビルドしたものは、リリースで公開されているものとは異なります。 From 3cb454f1323d4c2ed0ff3252707b0c3219b1fe83 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 17 Dec 2023 08:41:03 +0900 Subject: [PATCH 031/177] =?UTF-8?q?Refactor:=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E4=B8=8D=E4=BD=BF=E7=94=A8=E5=A4=89=E6=95=B0=E5=89=8A=E9=99=A4?= =?UTF-8?q?=E3=81=A8utility=E3=81=AB=E3=82=88=E3=82=8B=E7=B0=A1=E7=95=A5?= =?UTF-8?q?=E5=8C=96=20(#882)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 210 +++++----------------------------- 1 file changed, 27 insertions(+), 183 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 00730bb5d..84064bdea 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -481,7 +481,6 @@ def test_feat_to_framescale(): # Pre k o N pau h i h O Pst true_frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3] n_frame = sum(true_frame_per_phoneme) - true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32) # phoneme # Pr k o o N N pau h i i h h O Pt Pt Pt frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0] @@ -489,9 +488,6 @@ def test_feat_to_framescale(): for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs): true_frame_phoneme[frame_idx, phoneme_idx] = 1.0 # Pitch - # Pre ko N pau hi hO Pst - true_f0 = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0] # mean 300 - true_f0 = [0.0, 250.0, 250.0, 0.0, 400.0, 0.0, 0.0] # intonationScale 0.5 # paw ko N pau hi hO paw # frame_per_vowel = [1, 3, 2, 1, 3, 3, 3] # pau ko ko ko N N @@ -532,91 +528,21 @@ def setUp(self): self.accent_phrases_hello_hiho = [ AccentPhrase( moras=[ - Mora( - text="コ", - consonant="k", - consonant_length=0.0, - vowel="o", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="ン", - consonant=None, - consonant_length=None, - vowel="N", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="ニ", - consonant="n", - consonant_length=0.0, - vowel="i", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="チ", - consonant="ch", - consonant_length=0.0, - vowel="i", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="ワ", - consonant="w", - consonant_length=0.0, - vowel="a", - vowel_length=0.0, - pitch=0.0, - ), + _gen_mora("コ", "k", 0.0, "o", 0.0, 0.0), + _gen_mora("ン", None, None, "N", 0.0, 0.0), + _gen_mora("ニ", "n", 0.0, "i", 0.0, 0.0), + _gen_mora("チ", "ch", 0.0, "i", 0.0, 0.0), + _gen_mora("ワ", "w", 0.0, "a", 0.0, 0.0), ], accent=5, - pause_mora=Mora( - text="、", - consonant=None, - consonant_length=None, - vowel="pau", - vowel_length=0.0, - pitch=0.0, - ), + pause_mora=_gen_mora("、", None, None, "pau", 0.0, 0.0), ), AccentPhrase( moras=[ - Mora( - text="ヒ", - consonant="h", - consonant_length=0.0, - vowel="i", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="ホ", - consonant="h", - consonant_length=0.0, - vowel="o", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="デ", - consonant="d", - consonant_length=0.0, - vowel="e", - vowel_length=0.0, - pitch=0.0, - ), - Mora( - text="ス", - consonant="s", - consonant_length=0.0, - vowel="U", - vowel_length=0.0, - pitch=0.0, - ), + _gen_mora("ヒ", "h", 0.0, "i", 0.0, 0.0), + _gen_mora("ホ", "h", 0.0, "o", 0.0, 0.0), + _gen_mora("デ", "d", 0.0, "e", 0.0, 0.0), + _gen_mora("ス", "s", 0.0, "U", 0.0, 0.0), ], accent=1, pause_mora=None, @@ -626,9 +552,7 @@ def setUp(self): self.yukarin_s_mock = core.yukarin_s_forward self.yukarin_sa_mock = core.yukarin_sa_forward self.decode_mock = core.decode_forward - self.synthesis_engine = SynthesisEngine( - core=core, - ) + self.synthesis_engine = SynthesisEngine(core=core) def test_to_flatten_moras(self): flatten_moras = to_flatten_moras(self.accent_phrases_hello_hiho) @@ -640,30 +564,17 @@ def test_to_flatten_moras(self): ) def test_split_mora(self): + # Outputs consonant_phoneme_list, vowel_phoneme_list, vowel_indexes = split_mora( self.phoneme_data_list_hello_hiho ) self.assertEqual(vowel_indexes, [0, 2, 3, 5, 7, 9, 10, 12, 14, 16, 18, 19]) + ps = ["pau", "o", "N", "i", "i", "a", "pau", "i", "o", "e", "U", "pau"] + true_vowel_phoneme_list = [OjtPhoneme(p) for p in ps] self.assertTrue( - is_same_ojt_phoneme_list( - vowel_phoneme_list, - [ - OjtPhoneme("pau"), - OjtPhoneme("o"), - OjtPhoneme("N"), - OjtPhoneme("i"), - OjtPhoneme("i"), - OjtPhoneme("a"), - OjtPhoneme("pau"), - OjtPhoneme("i"), - OjtPhoneme("o"), - OjtPhoneme("e"), - OjtPhoneme("U"), - OjtPhoneme("pau"), - ], - ) + is_same_ojt_phoneme_list(vowel_phoneme_list, true_vowel_phoneme_list) ) self.assertTrue( is_same_ojt_phoneme_list( @@ -702,15 +613,13 @@ def test_pre_process(self): if mora.consonant is not None: self.assertTrue( is_same_phoneme( - phoneme_data_list[phoneme_index], - OjtPhoneme(mora.consonant), + phoneme_data_list[phoneme_index], OjtPhoneme(mora.consonant) ) ) phoneme_index += 1 self.assertTrue( is_same_phoneme( - phoneme_data_list[phoneme_index], - OjtPhoneme(mora.vowel), + phoneme_data_list[phoneme_index], OjtPhoneme(mora.vowel) ) ) phoneme_index += 1 @@ -718,17 +627,11 @@ def test_pre_process(self): self.assertEqual(flatten_moras[mora_index], accent_phrase.pause_mora) mora_index += 1 self.assertTrue( - is_same_phoneme( - phoneme_data_list[phoneme_index], - OjtPhoneme("pau"), - ) + is_same_phoneme(phoneme_data_list[phoneme_index], OjtPhoneme("pau")) ) phoneme_index += 1 self.assertTrue( - is_same_phoneme( - phoneme_data_list[phoneme_index], - OjtPhoneme("pau"), - ) + is_same_phoneme(phoneme_data_list[phoneme_index], OjtPhoneme("pau")) ) def test_replace_phoneme_length(self): @@ -742,33 +645,12 @@ def test_replace_phoneme_length(self): phoneme_list = yukarin_s_args["phoneme_list"] self.assertEqual(list_length, 20) self.assertEqual(list_length, len(phoneme_list)) + true_phoneme_list_1 = [0, 23, 30, 4, 28, 21, 10, 21, 42, 7] + true_phoneme_list_2 = [0, 19, 21, 19, 30, 12, 14, 35, 6, 0] + true_phoneme_list = true_phoneme_list_1 + true_phoneme_list_2 numpy.testing.assert_array_equal( phoneme_list, - numpy.array( - [ - 0, - 23, - 30, - 4, - 28, - 21, - 10, - 21, - 42, - 7, - 0, - 19, - 21, - 19, - 30, - 12, - 14, - 35, - 6, - 0, - ], - dtype=numpy.int64, - ), + numpy.array(true_phoneme_list, dtype=numpy.int64), ) self.assertEqual(yukarin_s_args["style_id"], 1) @@ -827,41 +709,11 @@ def test_replace_mora_pitch(self): numpy.testing.assert_array_equal( vowel_phoneme_list, - numpy.array( - [ - 0, - 30, - 4, - 21, - 21, - 7, - 0, - 21, - 30, - 14, - 6, - 0, - ] - ), + numpy.array([0, 30, 4, 21, 21, 7, 0, 21, 30, 14, 6, 0]), ) numpy.testing.assert_array_equal( consonant_phoneme_list, - numpy.array( - [ - -1, - 23, - -1, - 28, - 10, - 42, - -1, - 19, - 19, - 12, - 35, - -1, - ] - ), + numpy.array([-1, 23, -1, 28, 10, 42, -1, 19, 19, 12, 35, -1]), ) numpy.testing.assert_array_equal( start_accent_list, numpy.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]) @@ -1050,18 +902,10 @@ def synthesis_test_base(self, audio_query: AudioQuery): self.assertTrue(assert_result_count >= int(len(true_result) / 5) * 4) def test_synthesis(self): - audio_query = AudioQuery( - accent_phrases=deepcopy(self.accent_phrases_hello_hiho), - speedScale=1.0, - pitchScale=1.0, - intonationScale=1.0, - volumeScale=1.0, + audio_query = _gen_query( + deepcopy(self.accent_phrases_hello_hiho), prePhonemeLength=0.1, postPhonemeLength=0.1, - outputSamplingRate=24000, - outputStereo=False, - # このテスト内では使わないので生成不要 - kana="", ) self.synthesis_test_base(audio_query) From 11f080ea0168e6c73574be5f84b7b17d27ead69e Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 17 Dec 2023 08:53:35 +0900 Subject: [PATCH 032/177] =?UTF-8?q?=E4=BF=AE=E6=AD=A3:=20=E8=BE=9E?= =?UTF-8?q?=E6=9B=B8=E6=9B=B4=E6=96=B0=E6=99=82=E3=81=AE=E3=83=95=E3=82=A1?= =?UTF-8?q?=E3=82=A4=E3=83=AB=E3=83=AA=E3=83=8D=E3=83=BC=E3=83=A0=E3=82=A8?= =?UTF-8?q?=E3=83=A9=E3=83=BC=20(#884)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- voicevox_engine/user_dict.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/voicevox_engine/user_dict.py b/voicevox_engine/user_dict.py index f720ac4aa..abdf39f48 100644 --- a/voicevox_engine/user_dict.py +++ b/voicevox_engine/user_dict.py @@ -75,9 +75,11 @@ def update_dict( コンパイル済み辞書ファイルのパス """ random_string = uuid4() - tmp_csv_path = save_dir / f".tmp.dict_csv-{random_string}" # csv形式辞書データの一時保存ファイル - tmp_compiled_path = ( - save_dir / f".tmp.dict_compiled-{random_string}" + tmp_csv_path = compiled_dict_path.with_suffix( + f".dict_csv-{random_string}.tmp" + ) # csv形式辞書データの一時保存ファイル + tmp_compiled_path = compiled_dict_path.with_suffix( + f".dict_compiled-{random_string}.tmp" ) # コンパイル済み辞書データの一時保存ファイル try: From 3bd9199e6517e0bc0e91a6dc1cbff7831c4a9776 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 17 Dec 2023 23:36:47 +0900 Subject: [PATCH 033/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20AquesTalk?= =?UTF-8?q?=E9=A2=A8=E8=A8=98=E6=B3=95=E3=83=91=E3=83=BC=E3=82=B9=E8=A6=8F?= =?UTF-8?q?=E5=89=87=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88=E3=81=AE=E8=BF=BD?= =?UTF-8?q?=E5=8A=A0=20(#864)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor: AquesTalk風記法パース規則コメント * 提案 * ミス * Update voicevox_engine/tts_pipeline/kana_parser.py --------- Co-authored-by: Hiroshiba --- voicevox_engine/tts_pipeline/kana_parser.py | 36 +++++++++++++-------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/voicevox_engine/tts_pipeline/kana_parser.py b/voicevox_engine/tts_pipeline/kana_parser.py index 430960156..87a4624a8 100644 --- a/voicevox_engine/tts_pipeline/kana_parser.py +++ b/voicevox_engine/tts_pipeline/kana_parser.py @@ -1,6 +1,17 @@ """ 「AquesTalk風記法」を実装した AquesTalk風記法テキスト <-> アクセント句系列 変換。 -記法定義: `https://github.com/VOICEVOX/voicevox_engine/blob/master/README.md#読み方を-aquestalk風記法で取得修正するサンプルコード` # noqa + +記法の規則は以下の通り。 + +- 読みはカタカナのみ +- `/` で区切り +- `、` で無音付き区切り +- `_` で無声化 +- `'` でアクセント位置 +- `?` で疑問文 +- アクセント位置はちょうど1つ + +NOTE: ユーザー向け案内 `https://github.com/VOICEVOX/voicevox_engine/blob/master/README.md#読み方を-aquestalk風記法で取得修正するサンプルコード` # noqa """ from typing import List, Optional @@ -30,7 +41,7 @@ is_interrogative=False, ) if vowel in ["a", "i", "u", "e", "o"]: - # 手前に`_`を入れると無声化 + # 「`_` で無声化」の実装 # 例: "_ホ" -> "hO" _text2mora_with_unvoice[_UNVOICE_SYMBOL + text] = Mora( text=text, @@ -69,13 +80,14 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase: while base_index < len(phrase): outer_loop += 1 - # `'`の手前がアクセント位置 + # 「`'` でアクセント位置」の実装 if phrase[base_index] == _ACCENT_SYMBOL: + # 「アクセント位置はちょうど1つ」の実装 if len(moras) == 0: raise ParseKanaError(ParseKanaErrorCode.ACCENT_TOP, text=phrase) - # すでにアクセント位置がある場合はエラー if accent_index is not None: raise ParseKanaError(ParseKanaErrorCode.ACCENT_TWICE, text=phrase) + accent_index = len(moras) base_index += 1 continue @@ -89,8 +101,6 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase: break stack += phrase[watch_index] if stack in _text2mora_with_unvoice: - # より長い要素からなるモーラが見つかれば上書き(longest match) - # 例: phrase "キャ" -> "キ" 検出 -> "キャ" 検出/上書き -> Mora("キャ") matched_text = stack if matched_text is None: raise ParseKanaError(ParseKanaErrorCode.UNKNOWN_TEXT, text=stack) @@ -137,7 +147,7 @@ def parse_kana(text: str) -> List[AccentPhrase]: ) phrase_base = i + 1 - # アクセント句末に`?`で疑問文 + # 「`?` で疑問文」の実装 is_interrogative = _WIDE_INTERROGATION_MARK in phrase if is_interrogative: if _WIDE_INTERROGATION_MARK in phrase[:-1]: @@ -149,7 +159,7 @@ def parse_kana(text: str) -> List[AccentPhrase]: accent_phrase: AccentPhrase = _text_to_accent_phrase(phrase) - # `、`で無音区間を挿入 + # 「`、` で無音付き区切り」の実装 if i < len(text) and text[i] == _PAUSE_DELIMITER: accent_phrase.pause_mora = Mora( text="、", @@ -182,23 +192,23 @@ def create_kana(accent_phrases: List[AccentPhrase]) -> str: # アクセント句を先頭から逐次パースし、`text`末尾にAquesTalk風記法の文字を都度追加(ループ) for i, phrase in enumerate(accent_phrases): for j, mora in enumerate(phrase.moras): - # Rule3: "カナの手前に`_`を入れるとそのカナは無声化される" + # 「`_` で無声化」の実装 if mora.vowel in ["A", "I", "U", "E", "O"]: text += _UNVOICE_SYMBOL text += mora.text - # `'`でアクセント位置 + # 「`'` でアクセント位置」の実装 if j + 1 == phrase.accent: text += _ACCENT_SYMBOL - # Rule5: "アクセント句末に`?`(全角)を入れることにより疑問文の発音ができる" + # 「`?` で疑問文」の実装 if phrase.is_interrogative: text += _WIDE_INTERROGATION_MARK if i < len(accent_phrases) - 1: + # 「`/` で区切り」の実装 if phrase.pause_mora is None: - # アクセント句区切り text += _NOPAUSE_DELIMITER + # 「`、` で無音付き区切り」の実装 else: - # 無音でアクセント句区切り text += _PAUSE_DELIMITER return text From 4ef4218822de666ea1272338061a14ffd8f690a3 Mon Sep 17 00:00:00 2001 From: sabonerune <102559104+sabonerune@users.noreply.github.com> Date: Mon, 18 Dec 2023 04:33:15 +0900 Subject: [PATCH 034/177] =?UTF-8?q?BLD:=20PyInstaller=E3=82=92v6=E3=81=B8?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=20(#857)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- .github/workflows/build.yml | 8 +-- poetry.lock | 61 ++++++++--------- pyproject.toml | 6 +- requirements-dev.txt | 10 +-- run.py | 3 +- run.spec | 88 +++++++++++++++---------- voicevox_engine/user_dict.py | 4 +- voicevox_engine/utility/__init__.py | 5 +- voicevox_engine/utility/path_utility.py | 51 ++++++++++---- 9 files changed, 140 insertions(+), 96 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 218efd2c4..1054b5e47 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -464,10 +464,10 @@ jobs: LIBONNXRUNTIME_PATH=download/onnxruntime/lib/libonnxruntime.so fi - CORE_MODEL_DIR_PATH="download/core/model" \ - LIBCORE_PATH="$LIBCORE_PATH" \ - LIBONNXRUNTIME_PATH="$LIBONNXRUNTIME_PATH" \ - pyinstaller --noconfirm run.spec + pyinstaller --noconfirm run.spec -- \ + --libcore_path="$LIBCORE_PATH" \ + --libonnxruntime_path="$LIBONNXRUNTIME_PATH" \ + --core_model_dir_path="download/core/model" - name: Gather DLL dependencies to dist/run/ (Windows) if: startsWith(matrix.os, 'windows-') diff --git a/poetry.lock b/poetry.lock index 6e7957c9a..128c9d660 100644 --- a/poetry.lock +++ b/poetry.lock @@ -13,13 +13,13 @@ files = [ [[package]] name = "altgraph" -version = "0.17.3" +version = "0.17.4" description = "Python graph (network) package" optional = false python-versions = "*" files = [ - {file = "altgraph-0.17.3-py2.py3-none-any.whl", hash = "sha256:c8ac1ca6772207179ed8003ce7687757c04b0b71536f81e2ac5755c6226458fe"}, - {file = "altgraph-0.17.3.tar.gz", hash = "sha256:ad33358114df7c9416cdb8fa1eaa5852166c505118717021c6a8c7c7abbd03dd"}, + {file = "altgraph-0.17.4-py2.py3-none-any.whl", hash = "sha256:642743b4750de17e655e6711601b077bc6598dbfa3ba5fa2b2a35ce12b508dff"}, + {file = "altgraph-0.17.4.tar.gz", hash = "sha256:1b5afbb98f6c4dcadb2e2ae6ab9fa994bbb8c1d75f4fa96d340f9437ae454406"}, ] [[package]] @@ -1020,13 +1020,13 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", [[package]] name = "macholib" -version = "1.16.2" +version = "1.16.3" description = "Mach-O header analysis and editing" optional = false python-versions = "*" files = [ - {file = "macholib-1.16.2-py2.py3-none-any.whl", hash = "sha256:44c40f2cd7d6726af8fa6fe22549178d3a4dfecc35a9cd15ea916d9c83a688e0"}, - {file = "macholib-1.16.2.tar.gz", hash = "sha256:557bbfa1bb255c20e9abafe7ed6cd8046b48d9525db2f9b77d3122a63a2a8bf8"}, + {file = "macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c"}, + {file = "macholib-1.16.3.tar.gz", hash = "sha256:07ae9e15e8e4cd9a788013d81f5908b3609aa76f9b1421bae9c4d7606ec86a30"}, ] [package.dependencies] @@ -1612,46 +1612,47 @@ files = [ [[package]] name = "pyinstaller" -version = "5.13.2" +version = "6.2.0" description = "PyInstaller bundles a Python application and all its dependencies into a single package." optional = false -python-versions = "<3.13,>=3.7" +python-versions = "<3.13,>=3.8" files = [ - {file = "pyinstaller-5.13.2-py3-none-macosx_10_13_universal2.whl", hash = "sha256:16cbd66b59a37f4ee59373a003608d15df180a0d9eb1a29ff3bfbfae64b23d0f"}, - {file = "pyinstaller-5.13.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8f6dd0e797ae7efdd79226f78f35eb6a4981db16c13325e962a83395c0ec7420"}, - {file = "pyinstaller-5.13.2-py3-none-manylinux2014_i686.whl", hash = "sha256:65133ed89467edb2862036b35d7c5ebd381670412e1e4361215e289c786dd4e6"}, - {file = "pyinstaller-5.13.2-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:7d51734423685ab2a4324ab2981d9781b203dcae42839161a9ee98bfeaabdade"}, - {file = "pyinstaller-5.13.2-py3-none-manylinux2014_s390x.whl", hash = "sha256:2c2fe9c52cb4577a3ac39626b84cf16cf30c2792f785502661286184f162ae0d"}, - {file = "pyinstaller-5.13.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c63ef6133eefe36c4b2f4daf4cfea3d6412ece2ca218f77aaf967e52a95ac9b8"}, - {file = "pyinstaller-5.13.2-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:aadafb6f213549a5906829bb252e586e2cf72a7fbdb5731810695e6516f0ab30"}, - {file = "pyinstaller-5.13.2-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:b2e1c7f5cceb5e9800927ddd51acf9cc78fbaa9e79e822c48b0ee52d9ce3c892"}, - {file = "pyinstaller-5.13.2-py3-none-win32.whl", hash = "sha256:421cd24f26144f19b66d3868b49ed673176765f92fa9f7914cd2158d25b6d17e"}, - {file = "pyinstaller-5.13.2-py3-none-win_amd64.whl", hash = "sha256:ddcc2b36052a70052479a9e5da1af067b4496f43686ca3cdda99f8367d0627e4"}, - {file = "pyinstaller-5.13.2-py3-none-win_arm64.whl", hash = "sha256:27cd64e7cc6b74c5b1066cbf47d75f940b71356166031deb9778a2579bb874c6"}, - {file = "pyinstaller-5.13.2.tar.gz", hash = "sha256:c8e5d3489c3a7cc5f8401c2d1f48a70e588f9967e391c3b06ddac1f685f8d5d2"}, + {file = "pyinstaller-6.2.0-py3-none-macosx_10_13_universal2.whl", hash = "sha256:a1adbd3cf25dc90926d783eae0f444d65cdfecc7bcdf6da522c3ae3ff47b4c25"}, + {file = "pyinstaller-6.2.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:29d164394f1e949072f78a64c1e040f1c47b7f4aff08514c7666a031c8b44996"}, + {file = "pyinstaller-6.2.0-py3-none-manylinux2014_i686.whl", hash = "sha256:ba602a38d7403de89c38b8956b221ce6de0280730d269bab522492fcad82ee33"}, + {file = "pyinstaller-6.2.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:ebac06d99b80d2035594c3cc2fb5f2612d86289edd0510dbcbeb20a873f51d5a"}, + {file = "pyinstaller-6.2.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:fcfabc0ff1d38a4262c051dea3fdc1f7f106405c1f1b491b4c79cd28df19cab6"}, + {file = "pyinstaller-6.2.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:104430686149b2f1c135b2c17aa2967c85d54ef77dc92feb4e179ec846c0c467"}, + {file = "pyinstaller-6.2.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:e87fd60292b53bb9965cb5a84122875469a2bd475fd0d0db0052a3f1be351f75"}, + {file = "pyinstaller-6.2.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:8ec9d6c98972bb922cedb16a6638257aa66e5deadd79e2953f3464696237c413"}, + {file = "pyinstaller-6.2.0-py3-none-win32.whl", hash = "sha256:e5561e9a9b946d835c8dbc11ae4c16cc21e62bc77d10cc043406dc2992dfb4c6"}, + {file = "pyinstaller-6.2.0-py3-none-win_amd64.whl", hash = "sha256:3b586196277c4c54b69880650984c39c28bb6258c2b4b64200032e6ac69d53a0"}, + {file = "pyinstaller-6.2.0-py3-none-win_arm64.whl", hash = "sha256:d0c87b605bf13c3a04dfaa1d2fa7cd36765b8137000eeadccba865e1d6a19bf0"}, + {file = "pyinstaller-6.2.0.tar.gz", hash = "sha256:1ce77043929bf525be38289d78feecde0fcf15506215eda6500176a8715c5047"}, ] [package.dependencies] altgraph = "*" macholib = {version = ">=1.8", markers = "sys_platform == \"darwin\""} +packaging = ">=22.0" pefile = {version = ">=2022.5.30", markers = "sys_platform == \"win32\""} pyinstaller-hooks-contrib = ">=2021.4" pywin32-ctypes = {version = ">=0.2.1", markers = "sys_platform == \"win32\""} setuptools = ">=42.0.0" [package.extras] -encryption = ["tinyaes (>=1.0.0)"] +completion = ["argcomplete"] hook-testing = ["execnet (>=1.5.0)", "psutil", "pytest (>=2.7.3)"] [[package]] name = "pyinstaller-hooks-contrib" -version = "2023.7" +version = "2023.10" description = "Community maintained hooks for PyInstaller" optional = false python-versions = ">=3.7" files = [ - {file = "pyinstaller-hooks-contrib-2023.7.tar.gz", hash = "sha256:0c436a4c3506020e34116a8a7ddfd854c1ad6ddca9a8cd84500bd6e69c9e68f9"}, - {file = "pyinstaller_hooks_contrib-2023.7-py2.py3-none-any.whl", hash = "sha256:3c10df14c0f71ab388dfbf1625375b087e7330d9444cbfd2b310ba027fa0cff0"}, + {file = "pyinstaller-hooks-contrib-2023.10.tar.gz", hash = "sha256:4b4a998036abb713774cb26534ca06b7e6e09e4c628196017a10deb11a48747f"}, + {file = "pyinstaller_hooks_contrib-2023.10-py2.py3-none-any.whl", hash = "sha256:6dc1786a8f452941245d5bb85893e2a33632ebdcbc4c23eea41f2ee08281b0c0"}, ] [[package]] @@ -2048,19 +2049,19 @@ files = [ [[package]] name = "setuptools" -version = "68.1.2" +version = "69.0.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-68.1.2-py3-none-any.whl", hash = "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"}, - {file = "setuptools-68.1.2.tar.gz", hash = "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d"}, + {file = "setuptools-69.0.2-py3-none-any.whl", hash = "sha256:1e8fdff6797d3865f37397be788a4e3cba233608e9b509382a2777d25ebde7f2"}, + {file = "setuptools-69.0.2.tar.gz", hash = "sha256:735896e78a4742605974de002ac60562d286fa8051a7e2299445e8e8fbb01aa6"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "shellingham" @@ -2431,4 +2432,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "b3ef9f8c5445b3e481d666a4a3b6a73d44fa1159646cf64f480a19aa1999d0ee" +content-hash = "eb3e0209e98c6df8760ef8dae1ccbd175af6a28e09ea5efc5e84b566b6c5b8d0" diff --git a/pyproject.toml b/pyproject.toml index 88926aa0a..c69cf96af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,14 +51,14 @@ pyyaml = "^6.0" pyworld = "^0.3.0" requests = "^2.28.1" jinja2 = "^3.1.2" -pyopenjtalk = {git = "https://github.com/VOICEVOX/pyopenjtalk", rev = "b35fc89fe42948a28e33aed886ea145a51113f88"} +pyopenjtalk = { git = "https://github.com/VOICEVOX/pyopenjtalk", rev = "b35fc89fe42948a28e33aed886ea145a51113f88" } semver = "^3.0.0" platformdirs = "^3.10.0" soxr = "^0.3.6" [tool.poetry.group.dev.dependencies] cython = "^0.29.34,>=0.29.33" # NOTE: for Python 3.11 -pyinstaller = "^5.13" +pyinstaller = "^6.2.0" pre-commit = "^2.16.0" atomicwrites = "^1.4.0" colorama = "^0.4.4" @@ -74,7 +74,7 @@ mypy = "^1.6.0" pytest = "^6.2.5" coveralls = "^3.2.0" poetry = "^1.3.1" -httpx = "^0.25.0" # NOTE: required by fastapi.testclient.TestClient +httpx = "^0.25.0" # NOTE: required by fastapi.testclient.TestClient [tool.poetry.group.license.dependencies] pip-licenses = "^4.2.0" diff --git a/requirements-dev.txt b/requirements-dev.txt index 51806c4da..a42435195 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ aiofiles==0.7.0 ; python_version >= "3.11" and python_version < "3.12" -altgraph==0.17.3 ; python_version >= "3.11" and python_version < "3.12" +altgraph==0.17.4 ; python_version >= "3.11" and python_version < "3.12" anyio==3.7.1 ; python_version >= "3.11" and python_version < "3.12" asgiref==3.7.2 ; python_version >= "3.11" and python_version < "3.12" atomicwrites==1.4.1 ; python_version >= "3.11" and python_version < "3.12" @@ -30,7 +30,7 @@ jeepney==0.8.0 ; python_version >= "3.11" and python_version < "3.12" and sys_pl jinja2==3.1.2 ; python_version >= "3.11" and python_version < "3.12" jsonschema==4.17.3 ; python_version >= "3.11" and python_version < "3.12" keyring==24.2.0 ; python_version >= "3.11" and python_version < "3.12" -macholib==1.16.2 ; python_version >= "3.11" and python_version < "3.12" and sys_platform == "darwin" +macholib==1.16.3 ; python_version >= "3.11" and python_version < "3.12" and sys_platform == "darwin" markupsafe==2.1.3 ; python_version >= "3.11" and python_version < "3.12" more-itertools==10.1.0 ; python_version >= "3.11" and python_version < "3.12" msgpack==1.0.5 ; python_version >= "3.11" and python_version < "3.12" @@ -48,8 +48,8 @@ pre-commit==2.21.0 ; python_version >= "3.11" and python_version < "3.12" ptyprocess==0.7.0 ; python_version >= "3.11" and python_version < "3.12" pycparser==2.21 ; python_version >= "3.11" and python_version < "3.12" pydantic==1.10.12 ; python_version >= "3.11" and python_version < "3.12" -pyinstaller-hooks-contrib==2023.7 ; python_version >= "3.11" and python_version < "3.12" -pyinstaller==5.13.2 ; python_version >= "3.11" and python_version < "3.12" +pyinstaller-hooks-contrib==2023.10 ; python_version >= "3.11" and python_version < "3.12" +pyinstaller==6.2.0 ; python_version >= "3.11" and python_version < "3.12" pyopenjtalk @ git+https://github.com/VOICEVOX/pyopenjtalk@b35fc89fe42948a28e33aed886ea145a51113f88 ; python_version >= "3.11" and python_version < "3.12" pyproject-hooks==1.0.0 ; python_version >= "3.11" and python_version < "3.12" pyrsistent==0.19.3 ; python_version >= "3.11" and python_version < "3.12" @@ -62,7 +62,7 @@ requests-toolbelt==1.0.0 ; python_version >= "3.11" and python_version < "3.12" requests==2.31.0 ; python_version >= "3.11" and python_version < "3.12" secretstorage==3.3.3 ; python_version >= "3.11" and python_version < "3.12" and sys_platform == "linux" semver==3.0.1 ; python_version >= "3.11" and python_version < "3.12" -setuptools==68.1.2 ; python_version >= "3.11" and python_version < "3.12" +setuptools==69.0.2 ; python_version >= "3.11" and python_version < "3.12" shellingham==1.5.3 ; python_version >= "3.11" and python_version < "3.12" six==1.16.0 ; python_version >= "3.11" and python_version < "3.12" sniffio==1.3.0 ; python_version >= "3.11" and python_version < "3.12" diff --git a/run.py b/run.py index aa3c3afec..bfb8cba14 100644 --- a/run.py +++ b/run.py @@ -82,6 +82,7 @@ engine_root, get_latest_core_version, get_save_dir, + internal_root, ) @@ -209,7 +210,7 @@ async def block_origin_middleware(request: Request, call_next): metas_store = MetasStore(root_dir / "speaker_info") - setting_ui_template = Jinja2Templates(directory=engine_root() / "ui_template") + setting_ui_template = Jinja2Templates(directory=internal_root() / "ui_template") # キャッシュを有効化 # モジュール側でlru_cacheを指定するとキャッシュを制御しにくいため、HTTPサーバ側で指定する diff --git a/run.spec b/run.spec index 970f2adfa..65642c61d 100644 --- a/run.spec +++ b/run.spec @@ -1,49 +1,42 @@ # -*- mode: python ; coding: utf-8 -*- # このファイルはPyInstallerによって自動生成されたもので、それをカスタマイズして使用しています。 +from argparse import ArgumentParser +from pathlib import Path +from shutil import copy2, copytree + from PyInstaller.utils.hooks import collect_data_files -import os + +parser = ArgumentParser() +parser.add_argument("--libcore_path", type=Path) +parser.add_argument("--libonnxruntime_path", type=Path) +parser.add_argument("--core_model_dir_path", type=Path) +options = parser.parse_args() + +libonnxruntime_path: Path | None = options.libonnxruntime_path +if libonnxruntime_path is not None and not libonnxruntime_path.is_file(): + raise Exception(f"libonnxruntime_path: {libonnxruntime_path} is not file") + +libcore_path: Path | None = options.libcore_path +if libcore_path is not None and not libcore_path.is_file(): + raise Exception(f"libcore_path: {libcore_path} is not file") + +core_model_dir_path: Path | None = options.core_model_dir_path +if core_model_dir_path is not None and not core_model_dir_path.is_dir(): + raise Exception(f"core_model_dir_path: {core_model_dir_path} is not dir") datas = [ - ('engine_manifest_assets', 'engine_manifest_assets'), - ('speaker_info', 'speaker_info'), - ('engine_manifest.json', '.'), - ('default.csv', '.'), - ('licenses.json', '.'), - ('presets.yaml', '.'), - ('ui_template', 'ui_template'), + ("default.csv", "."), + ("presets.yaml", "."), + ("ui_template", "ui_template"), ] -datas += collect_data_files('pyopenjtalk') - -core_model_dir_path = os.environ.get('CORE_MODEL_DIR_PATH') -if core_model_dir_path: - print('CORE_MODEL_DIR_PATH is found:', core_model_dir_path) - if not os.path.isdir(core_model_dir_path): - raise Exception("CORE_MODEL_DIR_PATH was found, but it is not directory!") - datas += [(core_model_dir_path, "model")] - -# コアとONNX Runtimeはバイナリであるが、`binaries`に加えると -# 依存関係のパスがPyInstallerに書き換えらるので、`datas`に加える -# 参考: https://github.com/VOICEVOX/voicevox_engine/pull/446#issuecomment-1210052318 -libcore_path = os.environ.get('LIBCORE_PATH') -if libcore_path: - print('LIBCORE_PATH is found:', libcore_path) - if not os.path.isfile(libcore_path): - raise Exception("LIBCORE_PATH was found, but it is not file!") - datas += [(libcore_path, ".")] - -libonnxruntime_path = os.environ.get('LIBONNXRUNTIME_PATH') -if libonnxruntime_path: - print('LIBONNXRUNTIME_PATH is found:', libonnxruntime_path) - if not os.path.isfile(libonnxruntime_path): - raise Exception("LIBCORE_PATH was found, but it is not file!") - datas += [(libonnxruntime_path, ".")] +datas += collect_data_files("pyopenjtalk") block_cipher = None a = Analysis( - ['run.py'], + ["run.py"], pathex=[], binaries=[], datas=datas, @@ -65,7 +58,7 @@ exe = EXE( a.scripts, [], exclude_binaries=True, - name='run', + name="run", debug=False, bootloader_ignore_signals=False, strip=False, @@ -76,6 +69,7 @@ exe = EXE( target_arch=None, codesign_identity=None, entitlements_file=None, + contents_directory="engine_internal", ) coll = COLLECT( @@ -86,5 +80,27 @@ coll = COLLECT( strip=False, upx=True, upx_exclude=[], - name='run', + name="run", ) + +# 実行ファイル作成後の処理 + +# 実行ファイルと同じrootディレクトリ +target_dir = Path(DISTPATH) / "run" + +# 動的ライブラリをコピー +if libonnxruntime_path is not None: + copy2(libonnxruntime_path, target_dir) +if libcore_path is not None: + copy2(libcore_path, target_dir) +if core_model_dir_path is not None: + copytree(core_model_dir_path, target_dir / "model") + +# 互換性維持のために必要なファイルをコピー +license_file_path = Path("licenses.json") +if license_file_path.is_file(): + copy2("licenses.json", target_dir) + +copytree("speaker_info", target_dir / "speaker_info") +copy2("engine_manifest.json", target_dir) +copytree("engine_manifest_assets", target_dir / "engine_manifest_assets") diff --git a/voicevox_engine/user_dict.py b/voicevox_engine/user_dict.py index abdf39f48..a64cb2363 100644 --- a/voicevox_engine/user_dict.py +++ b/voicevox_engine/user_dict.py @@ -13,9 +13,9 @@ from .model import UserDictWord, WordTypes from .part_of_speech_data import MAX_PRIORITY, MIN_PRIORITY, part_of_speech_data -from .utility import engine_root, get_save_dir, mutex_wrapper +from .utility import get_save_dir, internal_root, mutex_wrapper -root_dir = engine_root() +root_dir = internal_root() save_dir = get_save_dir() if not save_dir.is_dir(): diff --git a/voicevox_engine/utility/__init__.py b/voicevox_engine/utility/__init__.py index d40fea3e6..7ed74f118 100644 --- a/voicevox_engine/utility/__init__.py +++ b/voicevox_engine/utility/__init__.py @@ -5,7 +5,7 @@ ) from .core_version_utility import get_latest_core_version, parse_core_version from .mutex_utility import mutex_wrapper -from .path_utility import delete_file, engine_root, get_save_dir +from .path_utility import delete_file, engine_root, get_save_dir, internal_root __all__ = [ "ConnectBase64WavesException", @@ -13,8 +13,9 @@ "decode_base64_waves", "get_latest_core_version", "parse_core_version", + "mutex_wrapper", "delete_file", "engine_root", "get_save_dir", - "mutex_wrapper", + "internal_root", ] diff --git a/voicevox_engine/utility/path_utility.py b/voicevox_engine/utility/path_utility.py index 7c46ad40b..6c5c36ca2 100644 --- a/voicevox_engine/utility/path_utility.py +++ b/voicevox_engine/utility/path_utility.py @@ -2,35 +2,60 @@ import sys import traceback from pathlib import Path +from typing import Literal from platformdirs import user_data_dir +def _runtime_type() -> Literal["nuitka", "pyinstaller", "python"]: + """ + コンパイルに使用したライブラリ名を返す。 + コンパイルしていない場合は"python"を返す。 + """ + # nuitkaビルドをした際はグローバルに__compiled__が含まれる + if "__compiled__" in globals(): + return "nuitka" + + # pyinstallerでビルドをした際はsys.frozenが設定される + elif getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): + return "pyinstaller" + + return "python" + + def engine_root() -> Path: - if is_development(): - root_dir = Path(__file__).parents[2] + """ + 開発環境ではリポジトリのルートディレクトリを返す。 + コンパイル後は実行ファイルがあるディレクトリを返す。 + """ + runtime = _runtime_type() + if runtime == "nuitka": + root_dir = Path(sys.argv[0]).parent + + elif runtime == "pyinstaller": + root_dir = Path(sys.executable).parent - # Nuitka/Pyinstallerでビルドされている場合 else: - root_dir = Path(sys.argv[0]).parent + root_dir = Path(__file__).parents[2] return root_dir.resolve(strict=True) +def internal_root() -> Path: + """ + コンパイル時に収集された実行ファイル内部用のルートディレクトリを返す。 + 開発環境ではリポジトリのルートディレクトリを返す。 + """ + root_dir = Path(__file__).parents[2] + return root_dir.resolve(strict=True) + + def is_development() -> bool: """ 開発版かどうか判定する関数 Nuitka/Pyinstallerでコンパイルされていない場合は開発環境とする。 """ - # nuitkaビルドをした際はグローバルに__compiled__が含まれる - if "__compiled__" in globals(): - return False - - # pyinstallerでビルドをした際はsys.frozenが設定される - elif getattr(sys, "frozen", False): - return False - - return True + return _runtime_type() == "python" def get_save_dir(): From d0b8fffa56de30fa93bb67afbc9a48b3e54b7965 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 18 Dec 2023 06:30:32 +0900 Subject: [PATCH 035/177] =?UTF-8?q?hotifx:=20NumPy=20=E3=81=AE=20deprecate?= =?UTF-8?q?d=20=E3=81=AA=20cast=20(#888)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 10 +++++----- test/test_synthesis_engine_base.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 84064bdea..8c01d18ef 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -65,7 +65,7 @@ def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.nda result = [] # mockとしての適当な処理、特に意味はない for i in range(length): - result.append(float(phoneme_list[i] * 0.5 + style_id)) + result.append((phoneme_list[i] * 0.5 + style_id).item()) return numpy.array(result) @@ -83,7 +83,7 @@ def yukarin_sa_mock( # mockとしての適当な処理、特に意味はない for i in range(length): result.append( - float( + ( ( vowel_phoneme_list[0][i] + consonant_phoneme_list[0][i] @@ -94,7 +94,7 @@ def yukarin_sa_mock( ) * 0.5 + style_id - ) + ).item() ) return numpy.array(result)[numpy.newaxis] @@ -112,10 +112,10 @@ def decode_mock( # decode forwardはデータサイズがlengthの256倍になるのでとりあえず256回データをresultに入れる for _ in range(256): result.append( - float( + ( f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size) + style_id - ) + ).item() ) return numpy.array(result) diff --git a/test/test_synthesis_engine_base.py b/test/test_synthesis_engine_base.py index 7fa8fd676..ecee4df66 100644 --- a/test/test_synthesis_engine_base.py +++ b/test/test_synthesis_engine_base.py @@ -12,7 +12,7 @@ def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.nda result = [] # mockとしての適当な処理、特に意味はない for i in range(length): - result.append(round(float(phoneme_list[i] * 0.0625 + style_id), 2)) + result.append(round((phoneme_list[i] * 0.0625 + style_id).item(), 2)) return numpy.array(result) @@ -31,7 +31,7 @@ def yukarin_sa_mock( for i in range(length): result.append( round( - float( + ( ( vowel_phoneme_list[0][i] + consonant_phoneme_list[0][i] @@ -42,7 +42,7 @@ def yukarin_sa_mock( ) * 0.0625 + style_id - ), + ).item(), 2, ) ) @@ -62,10 +62,10 @@ def decode_mock( # decode forwardはデータサイズがlengthの256倍になるのでとりあえず256回データをresultに入れる for _ in range(256): result.append( - float( + ( f0[i][0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size) + style_id - ) + ).item() ) return numpy.array(result) From b6a04775b91a5e1e3984e41c51ccaf9676c3882c Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 18 Dec 2023 06:32:55 +0900 Subject: [PATCH 036/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`=5Fsynthesis=5Fi?= =?UTF-8?q?mpl`=20=E5=89=8D=E5=87=A6=E7=90=86/=E5=BE=8C=E5=87=A6=E7=90=86?= =?UTF-8?q?=E3=81=AE=E9=96=A2=E6=95=B0=E5=8C=96=20(#873)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 23 +++------ voicevox_engine/tts_pipeline/tts_engine.py | 58 ++++++++++++++++------ 2 files changed, 49 insertions(+), 32 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 8c01d18ef..416a18feb 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -26,6 +26,7 @@ calc_frame_pitch, mora_phoneme_list, pre_process, + query_to_decoder_feature, split_mora, to_flatten_moras, to_flatten_phonemes, @@ -446,8 +447,8 @@ def test_calc_frame_phoneme(): assert numpy.array_equal(frame_phoneme, true_frame_phoneme) -def test_feat_to_framescale(): - """Test Mora/Phonemefeature-to-framescaleFeature pipeline.""" +def test_query_to_decoder_feature(): + """Test `query_to_decoder_feature`.""" # Inputs accent_phrases = [ AccentPhrase( @@ -484,9 +485,9 @@ def test_feat_to_framescale(): # phoneme # Pr k o o N N pau h i i h h O Pt Pt Pt frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0] - true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32) + true_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32) for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs): - true_frame_phoneme[frame_idx, phoneme_idx] = 1.0 + true_phoneme[frame_idx, phoneme_idx] = 1.0 # Pitch # paw ko N pau hi hO paw # frame_per_vowel = [1, 3, 2, 1, 3, 3, 3] @@ -499,19 +500,9 @@ def test_feat_to_framescale(): true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - flatten_moras = to_flatten_moras(query.accent_phrases) - flatten_moras = apply_prepost_silence(flatten_moras, query) - flatten_moras = apply_speed_scale(flatten_moras, query) - flatten_moras = apply_pitch_scale(flatten_moras, query) - flatten_moras = apply_intonation_scale(flatten_moras, query) - - phoneme_data_list = to_flatten_phonemes(flatten_moras) + phoneme, f0 = query_to_decoder_feature(query) - frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) - f0 = calc_frame_pitch(flatten_moras) - frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) - - assert numpy.array_equal(frame_phoneme, true_frame_phoneme) + assert numpy.array_equal(phoneme, true_phoneme) assert numpy.array_equal(f0, true_f0) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index c05c122dd..bbd1d537e 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -359,6 +359,44 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: return wave +def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: + """ + 音声合成用のクエリをデコーダー用特徴量へ変換する。 + Parameters + ---------- + query : AudioQuery + 音声合成クエリ + Returns + ------- + phoneme : ndarray + フレームごとの音素、shape=(Frame,) + f0 : ndarray + フレームごとの基本周波数、shape=(Frame,) + """ + flatten_moras = to_flatten_moras(query.accent_phrases) + + flatten_moras = apply_prepost_silence(flatten_moras, query) + flatten_moras = apply_speed_scale(flatten_moras, query) + flatten_moras = apply_pitch_scale(flatten_moras, query) + flatten_moras = apply_intonation_scale(flatten_moras, query) + + phoneme_data_list = to_flatten_phonemes(flatten_moras) + + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) + f0 = calc_frame_pitch(flatten_moras) + phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) + + return phoneme, f0 + + +def raw_wave_to_output_wave(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray: + """生音声波形に音声合成用のクエリを適用して出力音声波形を生成する""" + wave = apply_volume_scale(wave, query) + wave = apply_output_sampling_rate(wave, sr_wave, query) + wave = apply_output_stereo(wave, query) + return wave + + class SynthesisEngine(SynthesisEngineBase): """音声合成器(core)の管理/実行/プロキシと音声合成フロー""" @@ -614,31 +652,19 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): # モデルがロードされていない場合はロードする self.initialize_style_id_synthesis(style_id, skip_reinit=True) - flatten_moras = to_flatten_moras(query.accent_phrases) - flatten_moras = apply_prepost_silence(flatten_moras, query) - flatten_moras = apply_speed_scale(flatten_moras, query) - flatten_moras = apply_pitch_scale(flatten_moras, query) - flatten_moras = apply_intonation_scale(flatten_moras, query) - - phoneme_data_list = to_flatten_phonemes(flatten_moras) - - frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) - f0 = calc_frame_pitch(flatten_moras) - phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) + phoneme, f0 = query_to_decoder_feature(query) # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する with self.mutex: - wave = self.core.decode_forward( + raw_wave = self.core.decode_forward( length=phoneme.shape[0], phoneme_size=phoneme.shape[1], f0=f0[:, numpy.newaxis], phoneme=phoneme, style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1), ) - sr_wave = self.default_sampling_rate + sr_raw_wave = self.default_sampling_rate - wave = apply_volume_scale(wave, query) - wave = apply_output_sampling_rate(wave, sr_wave, query) - wave = apply_output_stereo(wave, query) + wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave From 92af86fb107853e6367bf682bfdc05611eb1df31 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 18 Dec 2023 06:39:07 +0900 Subject: [PATCH 037/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20Ojt=E3=83=89?= =?UTF-8?q?=E3=83=A1=E3=82=A4=E3=83=B3=E5=A4=89=E6=8F=9B=E5=88=87=E3=82=8A?= =?UTF-8?q?=E5=87=BA=E3=81=97=E3=81=A8=E9=9B=86=E7=B4=84=20(#889)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tts_pipeline/tts_engine_base.py | 79 +++++++++++-------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/voicevox_engine/tts_pipeline/tts_engine_base.py b/voicevox_engine/tts_pipeline/tts_engine_base.py index f4eeda039..3a846905e 100644 --- a/voicevox_engine/tts_pipeline/tts_engine_base.py +++ b/voicevox_engine/tts_pipeline/tts_engine_base.py @@ -6,7 +6,7 @@ from ..model import AccentPhrase, AudioQuery, Mora from . import full_context_label -from .full_context_label import extract_full_context_label +from .full_context_label import Utterance, extract_full_context_label from .mora_list import openjtalk_mora2text @@ -131,6 +131,47 @@ def full_context_label_moras_to_moras( ] +def utterance_to_accent_phrases(utterance: Utterance) -> list[AccentPhrase]: + """Utteranceインスタンスをアクセント句系列へドメイン変換する""" + return [ + AccentPhrase( + moras=full_context_label_moras_to_moras(accent_phrase.moras), + accent=accent_phrase.accent, + pause_mora=( + Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=0, + pitch=0, + ) + if ( + i_accent_phrase == len(breath_group.accent_phrases) - 1 + and i_breath_group != len(utterance.breath_groups) - 1 + ) + else None + ), + is_interrogative=accent_phrase.is_interrogative, + ) + for i_breath_group, breath_group in enumerate(utterance.breath_groups) + for i_accent_phrase, accent_phrase in enumerate(breath_group.accent_phrases) + ] + + +def test_to_accent_phrases(text: str) -> list[AccentPhrase]: + """日本語テキストからアクセント句系列を生成""" + if len(text.strip()) == 0: + return [] + + # 音素とアクセントの推定 + utterance = extract_full_context_label(text) + if len(utterance.breath_groups) == 0: + return [] + + return utterance_to_accent_phrases(utterance) + + class SynthesisEngineBase(metaclass=ABCMeta): @property @abstractmethod @@ -260,42 +301,12 @@ def create_accent_phrases(self, text: str, style_id: int) -> List[AccentPhrase]: accent_phrases : List[AccentPhrase] アクセント句系列 """ - if len(text.strip()) == 0: - return [] - # 音素とアクセントの推定 - utterance = extract_full_context_label(text) - if len(utterance.breath_groups) == 0: - return [] + accent_phrases = test_to_accent_phrases(text) - # Utterance -> List[AccentPharase] のキャスト & 音素長・モーラ音高の推定と更新 + # 音素長・モーラ音高の推定と更新 accent_phrases = self.replace_mora_data( - accent_phrases=[ - AccentPhrase( - moras=full_context_label_moras_to_moras(accent_phrase.moras), - accent=accent_phrase.accent, - pause_mora=( - Mora( - text="、", - consonant=None, - consonant_length=None, - vowel="pau", - vowel_length=0, - pitch=0, - ) - if ( - i_accent_phrase == len(breath_group.accent_phrases) - 1 - and i_breath_group != len(utterance.breath_groups) - 1 - ) - else None - ), - is_interrogative=accent_phrase.is_interrogative, - ) - for i_breath_group, breath_group in enumerate(utterance.breath_groups) - for i_accent_phrase, accent_phrase in enumerate( - breath_group.accent_phrases - ) - ], + accent_phrases=accent_phrases, style_id=style_id, ) return accent_phrases From 43b4e72ae65ede20b555a6816b5aacce19f17b94 Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 18 Dec 2023 06:40:14 +0900 Subject: [PATCH 038/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20OpenJTalk=20?= =?UTF-8?q?=E7=B3=BB=E3=81=AE=E4=B8=8D=E4=BD=BF=E7=94=A8=E3=83=A1=E3=82=BD?= =?UTF-8?q?=E3=83=83=E3=83=89=E5=89=8A=E9=99=A4=20(#890)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_full_context_label.py | 14 ------------- .../tts_pipeline/full_context_label.py | 20 ------------------- 2 files changed, 34 deletions(-) diff --git a/test/test_full_context_label.py b/test/test_full_context_label.py index 0c9ce3ee0..153c8b35f 100644 --- a/test/test_full_context_label.py +++ b/test/test_full_context_label.py @@ -266,20 +266,6 @@ def test_labels(self): self.accent_phrase_hiho.labels, self.test_case_hello_hiho[11:19] ) - def test_merge(self): - # 「こんにちはヒホです」 - # 読点を無くしたものと同等 - merged_accent_phrase = self.accent_phrase_hello.merge(self.accent_phrase_hiho) - self.assertEqual(merged_accent_phrase.accent, 5) - self.assertEqual( - " ".join([phoneme.phoneme for phoneme in merged_accent_phrase.phonemes]), - "k o N n i ch i w a h i h o d e s U", - ) - self.assertEqual( - merged_accent_phrase.labels, - self.test_case_hello_hiho[1:10] + self.test_case_hello_hiho[11:19], - ) - class TestBreathGroup(TestBasePhonemes): def setUp(self) -> None: diff --git a/voicevox_engine/tts_pipeline/full_context_label.py b/voicevox_engine/tts_pipeline/full_context_label.py index 1e61a17b0..50517f819 100644 --- a/voicevox_engine/tts_pipeline/full_context_label.py +++ b/voicevox_engine/tts_pipeline/full_context_label.py @@ -267,26 +267,6 @@ def labels(self): """ return [p.label for p in self.phonemes] - def merge(self, accent_phrase: "AccentPhrase"): - """ - AccentPhraseを合成する - (このクラスが保持するmorasの後ろに、引数として渡されたAccentPhraseのmorasを合成する) - Parameters - ---------- - accent_phrase : AccentPhrase - 合成したいAccentPhraseを渡す - - Returns - ------- - accent_phrase : AccentPhrase - 合成されたAccentPhraseを返す - """ - return AccentPhrase( - moras=self.moras + accent_phrase.moras, - accent=self.accent, - is_interrogative=accent_phrase.is_interrogative, - ) - @dataclass class BreathGroup: From 35b7158f8ad0bd521dda999f5c332569d92bb6bd Mon Sep 17 00:00:00 2001 From: tarepan Date: Mon, 18 Dec 2023 07:16:15 +0900 Subject: [PATCH 039/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=86?= =?UTF-8?q?=E3=82=B9=E3=83=88=E7=94=A8=E3=83=A1=E3=82=BD=E3=83=83=E3=83=89?= =?UTF-8?q?=E5=88=87=E3=82=8A=E5=87=BA=E3=81=97=20(#891)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- test/test_full_context_label.py | 42 ++++++++++-- .../tts_pipeline/full_context_label.py | 68 ------------------- 2 files changed, 35 insertions(+), 75 deletions(-) diff --git a/test/test_full_context_label.py b/test/test_full_context_label.py index 153c8b35f..87bf57785 100644 --- a/test/test_full_context_label.py +++ b/test/test_full_context_label.py @@ -11,6 +11,29 @@ ) +def contexts_to_feature(contexts: dict[str, str]) -> str: + """ラベルの contexts を feature へ変換する""" + return ( + "{p1}^{p2}-{p3}+{p4}={p5}" + "/A:{a1}+{a2}+{a3}" + "/B:{b1}-{b2}_{b3}" + "/C:{c1}_{c2}+{c3}" + "/D:{d1}+{d2}_{d3}" + "/E:{e1}_{e2}!{e3}_{e4}-{e5}" + "/F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}" + "/G:{g1}_{g2}%{g3}_{g4}_{g5}" + "/H:{h1}_{h2}" + "/I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}" + "/J:{j1}_{j2}" + "/K:{k1}+{k2}-{k3}" + ).format(**contexts) + + +def features(ojt_container: Mora | AccentPhrase | BreathGroup | Utterance): + """コンテナインスタンスに直接的・間接的に含まれる全ての feature を返す""" + return [contexts_to_feature(p.contexts) for p in ojt_container.phonemes] + + class TestBasePhonemes(TestCase): def setUp(self): super().setUp() @@ -140,7 +163,10 @@ def test_is_pause(self): def test_label(self) -> None: self.assertEqual( - [phoneme.label for phoneme in self.phonemes_hello_hiho], + [ + contexts_to_feature(phoneme.contexts) + for phoneme in self.phonemes_hello_hiho + ], self.test_case_hello_hiho, ) @@ -189,7 +215,9 @@ def assert_phonemes(self, mora: Mora, mora_str: str) -> None: ) def assert_labels(self, mora: Mora, label_start: int, label_end: int) -> None: - self.assertEqual(mora.labels, self.test_case_hello_hiho[label_start:label_end]) + self.assertEqual( + features(mora), self.test_case_hello_hiho[label_start:label_end] + ) def test_phonemes(self) -> None: self.assert_phonemes(self.mora_hello_1, "ko") @@ -260,10 +288,10 @@ def test_phonemes(self): def test_labels(self): self.assertEqual( - self.accent_phrase_hello.labels, self.test_case_hello_hiho[1:10] + features(self.accent_phrase_hello), self.test_case_hello_hiho[1:10] ) self.assertEqual( - self.accent_phrase_hiho.labels, self.test_case_hello_hiho[11:19] + features(self.accent_phrase_hiho), self.test_case_hello_hiho[11:19] ) @@ -299,10 +327,10 @@ def test_phonemes(self): def test_labels(self): self.assertEqual( - self.breath_group_hello.labels, self.test_case_hello_hiho[1:10] + features(self.breath_group_hello), self.test_case_hello_hiho[1:10] ) self.assertEqual( - self.breath_group_hiho.labels, self.test_case_hello_hiho[11:19] + features(self.breath_group_hiho), self.test_case_hello_hiho[11:19] ) @@ -387,4 +415,4 @@ def test_phonemes(self): ) def test_labels(self): - self.assertEqual(self.utterance_hello_hiho.labels, self.test_case_hello_hiho) + self.assertEqual(features(self.utterance_hello_hiho), self.test_case_hello_hiho) diff --git a/voicevox_engine/tts_pipeline/full_context_label.py b/voicevox_engine/tts_pipeline/full_context_label.py index 50517f819..6c1204d7b 100644 --- a/voicevox_engine/tts_pipeline/full_context_label.py +++ b/voicevox_engine/tts_pipeline/full_context_label.py @@ -54,30 +54,6 @@ def from_label(cls, label: str): ).groupdict() return cls(contexts=contexts) - @property - def label(self): - """ - pyopenjtalk.extract_fullcontextで得られるラベルと等しい - Returns - ------- - lebel: str - ラベルを返す - """ - return ( - "{p1}^{p2}-{p3}+{p4}={p5}" - "/A:{a1}+{a2}+{a3}" - "/B:{b1}-{b2}_{b3}" - "/C:{c1}_{c2}+{c3}" - "/D:{d1}+{d2}_{d3}" - "/E:{e1}_{e2}!{e3}_{e4}-{e5}" - "/F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}" - "/G:{g1}_{g2}%{g3}_{g4}_{g5}" - "/H:{h1}_{h2}" - "/I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}" - "/J:{j1}_{j2}" - "/K:{k1}+{k2}-{k3}" - ).format(**self.contexts) - @property def phoneme(self): """ @@ -149,17 +125,6 @@ def phonemes(self): else: return [self.vowel] - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : list[str] - Moraに含まれるすべてのラベルを返す - """ - return [p.label for p in self.phonemes] - @dataclass class AccentPhrase: @@ -256,17 +221,6 @@ def phonemes(self): """ return list(chain.from_iterable(m.phonemes for m in self.moras)) - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : list[str] - AccentPhraseに間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - @dataclass class BreathGroup: @@ -341,17 +295,6 @@ def phonemes(self): ) ) - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : list[str] - BreathGroupに間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - @dataclass class Utterance: @@ -494,17 +437,6 @@ def phonemes(self): return phonemes - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : list[str] - Utteranceクラスに直接的・間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - def extract_full_context_label(text: str): """ From 216b40652e23b6510b0bd0285b1551f6db9cd1b3 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 01:37:33 +0900 Subject: [PATCH 040/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E3=83=95?= =?UTF-8?q?=E3=83=AC=E3=83=BC=E3=83=A0=E6=95=B0=E3=82=AB=E3=82=A6=E3=83=B3?= =?UTF-8?q?=E3=83=88=E5=85=B1=E9=80=9A=E5=8C=96=20(#898)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 40 +++++---------- voicevox_engine/tts_pipeline/tts_engine.py | 57 ++++++++++------------ 2 files changed, 36 insertions(+), 61 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 416a18feb..1f0d5016b 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -20,10 +20,9 @@ apply_prepost_silence, apply_speed_scale, apply_volume_scale, - calc_frame_per_mora, - calc_frame_per_phoneme, calc_frame_phoneme, calc_frame_pitch, + count_frame_per_unit, mora_phoneme_list, pre_process, query_to_decoder_feature, @@ -349,8 +348,8 @@ def test_apply_output_stereo(): assert numpy.array_equal(wave, true_wave) -def test_calc_frame_per_phoneme(): - """Test `calc_frame_per_phoneme`.""" +def test_count_frame_per_unit(): + """Test `count_frame_per_unit`.""" # Inputs moras = [ _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] @@ -366,35 +365,15 @@ def test_calc_frame_per_phoneme(): # Pre k o N pau h i h O Pst true_frame_per_phoneme = [2, 2, 4, 4, 2, 2, 4, 4, 2, 6] true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32) - - # Outputs - frame_per_phoneme = calc_frame_per_phoneme(moras) - - assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme) - - -def test_calc_frame_per_mora(): - """Test `calc_frame_per_mora`.""" - # Inputs - moras = [ - _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] - _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), - _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), - _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), - _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), - _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), - ] - - # Expects # Pre ko N pau hi hO Pst true_frame_per_mora = [2, 6, 4, 2, 6, 6, 6] true_frame_per_mora = numpy.array(true_frame_per_mora, dtype=numpy.int32) # Outputs - frame_per_phoneme = numpy.array(list(map(calc_frame_per_mora, moras))) + frame_per_phoneme, frame_per_mora = count_frame_per_unit(moras) - assert numpy.array_equal(frame_per_phoneme, true_frame_per_mora) + assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme) + assert numpy.array_equal(frame_per_mora, true_frame_per_mora) def test_calc_frame_pitch(): @@ -409,8 +388,11 @@ def test_calc_frame_pitch(): _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), _gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0), ] + # Pre ko N pau hi hO Pst + frame_per_mora = [1, 3, 2, 1, 3, 3, 3] + frame_per_mora = numpy.array(frame_per_mora, dtype=numpy.int32) - # pau ko ko ko N N + # pau ko ko ko N N true1_f0 = [0.0, 50.0, 50.0, 50.0, 50.0, 50.0] # pau hi hi hi true2_f0 = [0.0, 125.0, 125.0, 125.0] @@ -419,7 +401,7 @@ def test_calc_frame_pitch(): true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - f0 = calc_frame_pitch(moras) + f0 = calc_frame_pitch(moras, frame_per_mora) assert numpy.array_equal(f0, true_f0) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index bbd1d537e..67ebb9564 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -163,25 +163,36 @@ def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_per_phoneme(moras: List[Mora]): +def count_frame_per_unit(moras: list[Mora]) -> tuple[ndarray, ndarray]: """ - 音素あたりのフレーム長を算出 + 音素あたり・モーラあたりのフレーム長を算出する Parameters ---------- - moras : List[Mora] - モーラ列 + moras : list[Mora] + モーラ系列 Returns ------- - frame_per_phoneme : NDArray[] - 音素あたりのフレーム長。端数丸め。 + frame_per_phoneme : ndarray + 音素あたりのフレーム長。端数丸め。shape = (Phoneme,) + frame_per_mora : ndarray + モーラあたりのフレーム長。端数丸め。shape = (Mora,) """ frame_per_phoneme: list[ndarray] = [] + frame_per_mora: list[ndarray] = [] for mora in moras: + vowel_frames = _to_frame(mora.vowel_length) + consonant_frames = _to_frame(mora.consonant_length) if mora.consonant else 0 + mora_frames = vowel_frames + consonant_frames # 音素ごとにフレーム長を算出し、和をモーラのフレーム長とする + if mora.consonant: - frame_per_phoneme.append(_to_frame(mora.consonant_length)) - frame_per_phoneme.append(_to_frame(mora.vowel_length)) + frame_per_phoneme += [consonant_frames] + frame_per_phoneme += [vowel_frames] + frame_per_mora += [mora_frames] + frame_per_phoneme = numpy.array(frame_per_phoneme) - return frame_per_phoneme + frame_per_mora = numpy.array(frame_per_mora) + + return frame_per_phoneme, frame_per_mora def _to_frame(sec: float) -> ndarray: @@ -190,24 +201,6 @@ def _to_frame(sec: float) -> ndarray: return numpy.round(sec * FRAMERATE).astype(numpy.int32) -def calc_frame_per_mora(mora: Mora) -> ndarray: - """ - モーラあたりのフレーム長を算出 - Parameters - ---------- - mora : Mora - モーラ - Returns - ------- - frame_per_mora : NDArray[] - モーラあたりのフレーム長。端数丸め。 - """ - # 音素ごとにフレーム長を算出し、和をモーラのフレーム長とする - vowel_frames = _to_frame(mora.vowel_length) - consonant_frames = _to_frame(mora.consonant_length) if mora.consonant else 0 - return vowel_frames + consonant_frames - - def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """ 音高スケール(`pitchScale`)の適用 @@ -250,13 +243,15 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_pitch(moras: list[Mora]) -> ndarray: +def calc_frame_pitch(moras: list[Mora], frame_per_mora: ndarray) -> ndarray: """ フレームごとのピッチの生成 Parameters ---------- moras : List[Mora] モーラ列 + frame_per_mora : ndarray + モーラあたりのフレーム長 Returns ------- frame_f0 : NDArray[] @@ -267,8 +262,6 @@ def calc_frame_pitch(moras: list[Mora]) -> ndarray: f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32) # Rescale: 時間スケールの変更(モーラ -> フレーム) - # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約 - frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras))) frame_f0 = numpy.repeat(f0, frame_per_mora) return frame_f0 @@ -382,8 +375,8 @@ def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: phoneme_data_list = to_flatten_phonemes(flatten_moras) - frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) - f0 = calc_frame_pitch(flatten_moras) + frame_per_phoneme, frame_per_mora = count_frame_per_unit(flatten_moras) + f0 = calc_frame_pitch(flatten_moras, frame_per_mora) phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) return phoneme, f0 From 92a05c1fa9b250758a6f48e4bebdf98b8cd1326f Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 02:42:59 +0900 Subject: [PATCH 041/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20`SynthesisEngine`?= =?UTF-8?q?=20=E2=86=92=20`TTSEngine`=20=E6=94=B9=E5=90=8D=20(#870)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build_util/make_docs.py | 4 ++-- run.py | 6 +++--- test/test_mock_synthesis_engine.py | 6 +++--- test/test_synthesis_engine.py | 8 ++++---- test/test_synthesis_engine_base.py | 6 +++--- voicevox_engine/dev/core/mock.py | 2 +- voicevox_engine/dev/synthesis_engine/__init__.py | 4 ++-- voicevox_engine/dev/synthesis_engine/mock.py | 8 ++++---- voicevox_engine/metas/MetasStore.py | 8 ++++---- voicevox_engine/morphing.py | 4 ++-- voicevox_engine/tts_pipeline/__init__.py | 8 ++++---- voicevox_engine/tts_pipeline/make_tts_engines.py | 10 +++++----- voicevox_engine/tts_pipeline/tts_engine.py | 4 ++-- voicevox_engine/tts_pipeline/tts_engine_base.py | 2 +- 14 files changed, 40 insertions(+), 40 deletions(-) diff --git a/build_util/make_docs.py b/build_util/make_docs.py index d21ba85b9..7bf1f0b6e 100644 --- a/build_util/make_docs.py +++ b/build_util/make_docs.py @@ -1,7 +1,7 @@ import json from voicevox_engine.dev.core import mock as core -from voicevox_engine.dev.synthesis_engine.mock import MockSynthesisEngine +from voicevox_engine.dev.synthesis_engine.mock import MockTTSEngine from voicevox_engine.preset import PresetManager from voicevox_engine.setting import USER_SETTING_PATH, SettingLoader from voicevox_engine.utility import engine_root @@ -10,7 +10,7 @@ import run app = run.generate_app( - synthesis_engines={"mock": MockSynthesisEngine(speakers=core.metas())}, + synthesis_engines={"mock": MockTTSEngine(speakers=core.metas())}, latest_core_version="mock", setting_loader=SettingLoader(USER_SETTING_PATH), preset_manager=PresetManager( # FIXME: impl MockPresetManager diff --git a/run.py b/run.py index bfb8cba14..0e7a33cba 100644 --- a/run.py +++ b/run.py @@ -65,7 +65,7 @@ Setting, SettingLoader, ) -from voicevox_engine.tts_pipeline import SynthesisEngineBase, make_synthesis_engines +from voicevox_engine.tts_pipeline import TTSEngineBase, make_synthesis_engines from voicevox_engine.tts_pipeline.kana_parser import create_kana, parse_kana from voicevox_engine.user_dict import ( apply_word, @@ -131,7 +131,7 @@ def set_output_log_utf8() -> None: def generate_app( - synthesis_engines: Dict[str, SynthesisEngineBase], + synthesis_engines: Dict[str, TTSEngineBase], latest_core_version: str, setting_loader: SettingLoader, preset_manager: PresetManager, @@ -227,7 +227,7 @@ async def block_origin_middleware(request: Request, call_next): def apply_user_dict(): update_dict() - def get_engine(core_version: Optional[str]) -> SynthesisEngineBase: + def get_engine(core_version: Optional[str]) -> TTSEngineBase: if core_version is None: return synthesis_engines[latest_core_version] if core_version in synthesis_engines: diff --git a/test/test_mock_synthesis_engine.py b/test/test_mock_synthesis_engine.py index 27fee31c1..e9cf71688 100644 --- a/test/test_mock_synthesis_engine.py +++ b/test/test_mock_synthesis_engine.py @@ -1,11 +1,11 @@ from unittest import TestCase -from voicevox_engine.dev.synthesis_engine import MockSynthesisEngine +from voicevox_engine.dev.synthesis_engine import MockTTSEngine from voicevox_engine.model import AccentPhrase, AudioQuery, Mora from voicevox_engine.tts_pipeline.kana_parser import create_kana -class TestMockSynthesisEngine(TestCase): +class TestMockTTSEngine(TestCase): def setUp(self): super().setUp() @@ -102,7 +102,7 @@ def setUp(self): pause_mora=None, ), ] - self.engine = MockSynthesisEngine(speakers="", supported_devices="") + self.engine = MockTTSEngine(speakers="", supported_devices="") def test_replace_phoneme_length(self): self.assertEqual( diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 1f0d5016b..24136abe1 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -8,7 +8,7 @@ import numpy from voicevox_engine.model import AccentPhrase, AudioQuery, Mora -from voicevox_engine.tts_pipeline import SynthesisEngine +from voicevox_engine.tts_pipeline import TTSEngine from voicevox_engine.tts_pipeline.acoustic_feature_extractor import OjtPhoneme # TODO: import from voicevox_engine.synthesis_engine.mora @@ -488,7 +488,7 @@ def test_query_to_decoder_feature(): assert numpy.array_equal(f0, true_f0) -class TestSynthesisEngine(TestCase): +class TestTTSEngine(TestCase): def setUp(self): super().setUp() self.str_list_hello_hiho = ( @@ -525,7 +525,7 @@ def setUp(self): self.yukarin_s_mock = core.yukarin_s_forward self.yukarin_sa_mock = core.yukarin_sa_forward self.decode_mock = core.decode_forward - self.synthesis_engine = SynthesisEngine(core=core) + self.synthesis_engine = TTSEngine(core=core) def test_to_flatten_moras(self): flatten_moras = to_flatten_moras(self.accent_phrases_hello_hiho) @@ -772,7 +772,7 @@ def synthesis_test_base(self, audio_query: AudioQuery): for i in range(len(phoneme_length_list)): phoneme_length_list[i] /= audio_query.speedScale - # Outputs: MockCore入りSynthesisEngine の `.synthesis` 出力および core.decode_forward 引数 + # Outputs: MockCore入りTTSEngine の `.synthesis` 出力および core.decode_forward 引数 result = self.synthesis_engine.synthesis(query=audio_query, style_id=1) decode_args = self.decode_mock.call_args[1] list_length = decode_args["length"] diff --git a/test/test_synthesis_engine_base.py b/test/test_synthesis_engine_base.py index ecee4df66..bc6d88f2c 100644 --- a/test/test_synthesis_engine_base.py +++ b/test/test_synthesis_engine_base.py @@ -5,7 +5,7 @@ import numpy from voicevox_engine.model import AccentPhrase, AudioQuery, Mora -from voicevox_engine.tts_pipeline import SynthesisEngine +from voicevox_engine.tts_pipeline import TTSEngine def yukarin_s_mock(length: int, phoneme_list: numpy.ndarray, style_id: numpy.ndarray): @@ -184,10 +184,10 @@ def is_model_loaded(self, style_id): return True -class TestSynthesisEngineBase(TestCase): +class TestTTSEngineBase(TestCase): def setUp(self): super().setUp() - self.synthesis_engine = SynthesisEngine( + self.synthesis_engine = TTSEngine( core=MockCore(), ) self.synthesis_engine._synthesis_impl = Mock() diff --git a/voicevox_engine/dev/core/mock.py b/voicevox_engine/dev/core/mock.py index 2bc2102f0..c0531fbc8 100644 --- a/voicevox_engine/dev/core/mock.py +++ b/voicevox_engine/dev/core/mock.py @@ -32,7 +32,7 @@ def yukarin_sa_forward(length: int, **kwargs: Dict[str, Any]) -> np.ndarray: def decode_forward(length: int, **kwargs: Dict[str, Any]) -> np.ndarray: """ 合成音声の波形データをNumPy配列で返します。ただし、常に固定の文言を読み上げます(DUMMY_TEXT) - 参照→SynthesisEngine のdocstring [Mock] + 参照→TTSEngine のdocstring [Mock] Parameters ---------- diff --git a/voicevox_engine/dev/synthesis_engine/__init__.py b/voicevox_engine/dev/synthesis_engine/__init__.py index e7b2ac5b1..ae0b29ec2 100644 --- a/voicevox_engine/dev/synthesis_engine/__init__.py +++ b/voicevox_engine/dev/synthesis_engine/__init__.py @@ -1,3 +1,3 @@ -from .mock import MockSynthesisEngine +from .mock import MockTTSEngine -__all__ = ["MockSynthesisEngine"] +__all__ = ["MockTTSEngine"] diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index 3cb72dc79..b861dc7c9 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -6,13 +6,13 @@ from soxr import resample from ...model import AccentPhrase, AudioQuery -from ...tts_pipeline import SynthesisEngineBase +from ...tts_pipeline import TTSEngineBase from ...tts_pipeline.tts_engine import to_flatten_moras -class MockSynthesisEngine(SynthesisEngineBase): +class MockTTSEngine(TTSEngineBase): """ - SynthesisEngine [Mock] + TTSEngine [Mock] """ def __init__( @@ -110,7 +110,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int) -> np.ndarray: def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray: """ forward tts via pyopenjtalk.tts() - 参照→SynthesisEngine のdocstring [Mock] + 参照→TTSEngine のdocstring [Mock] Parameters ---------- diff --git a/voicevox_engine/metas/MetasStore.py b/voicevox_engine/metas/MetasStore.py index 78f838a2a..76eceff8a 100644 --- a/voicevox_engine/metas/MetasStore.py +++ b/voicevox_engine/metas/MetasStore.py @@ -5,7 +5,7 @@ from voicevox_engine.metas.Metas import CoreSpeaker, EngineSpeaker, Speaker, StyleInfo if TYPE_CHECKING: - from voicevox_engine.tts_pipeline.tts_engine_base import SynthesisEngineBase + from voicevox_engine.tts_pipeline.tts_engine_base import TTSEngineBase class MetasStore: @@ -29,13 +29,13 @@ def __init__(self, engine_speakers_path: Path) -> None: } # FIXME: engineではなくList[CoreSpeaker]を渡す形にすることで - # SynthesisEngineBaseによる循環importを修正する - def load_combined_metas(self, engine: "SynthesisEngineBase") -> List[Speaker]: + # TTSEngineBaseによる循環importを修正する + def load_combined_metas(self, engine: "TTSEngineBase") -> List[Speaker]: """ コアに含まれる話者メタ情報とエンジンに含まれる話者メタ情報を統合 Parameters ---------- - engine : SynthesisEngineBase + engine : TTSEngineBase コアに含まれる話者メタ情報をもったエンジン Returns ------- diff --git a/voicevox_engine/morphing.py b/voicevox_engine/morphing.py index 89a2498c3..ee7bf446d 100644 --- a/voicevox_engine/morphing.py +++ b/voicevox_engine/morphing.py @@ -10,7 +10,7 @@ from .metas.Metas import Speaker, SpeakerSupportPermittedSynthesisMorphing, StyleInfo from .metas.MetasStore import construct_lookup from .model import AudioQuery, MorphableTargetInfo, StyleIdNotFoundError -from .tts_pipeline import SynthesisEngine +from .tts_pipeline import TTSEngine # FIXME: ndarray type hint, https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/blob/2b64f86197573497c685c785c6e0e743f407b63e/pyworld/pyworld.pyx#L398 # noqa @@ -128,7 +128,7 @@ def is_synthesis_morphing_permitted( def synthesis_morphing_parameter( - engine: SynthesisEngine, + engine: TTSEngine, query: AudioQuery, base_speaker: int, target_speaker: int, diff --git a/voicevox_engine/tts_pipeline/__init__.py b/voicevox_engine/tts_pipeline/__init__.py index 2fce842ba..8aeea1b06 100644 --- a/voicevox_engine/tts_pipeline/__init__.py +++ b/voicevox_engine/tts_pipeline/__init__.py @@ -1,12 +1,12 @@ from ..core_wrapper import CoreWrapper, load_runtime_lib from .make_tts_engines import make_synthesis_engines -from .tts_engine import SynthesisEngine -from .tts_engine_base import SynthesisEngineBase +from .tts_engine import TTSEngine +from .tts_engine_base import TTSEngineBase __all__ = [ "CoreWrapper", "load_runtime_lib", "make_synthesis_engines", - "SynthesisEngine", - "SynthesisEngineBase", + "TTSEngine", + "TTSEngineBase", ] diff --git a/voicevox_engine/tts_pipeline/make_tts_engines.py b/voicevox_engine/tts_pipeline/make_tts_engines.py index 09183574a..8ee63c907 100644 --- a/voicevox_engine/tts_pipeline/make_tts_engines.py +++ b/voicevox_engine/tts_pipeline/make_tts_engines.py @@ -5,7 +5,7 @@ from ..core_wrapper import CoreWrapper, load_runtime_lib from ..utility import engine_root, get_save_dir -from .tts_engine import SynthesisEngine, SynthesisEngineBase +from .tts_engine import TTSEngine, TTSEngineBase def make_synthesis_engines( @@ -16,7 +16,7 @@ def make_synthesis_engines( cpu_num_threads: Optional[int] = None, enable_mock: bool = True, load_all_models: bool = False, -) -> Dict[str, SynthesisEngineBase]: +) -> Dict[str, TTSEngineBase]: """ 音声ライブラリをロードして、音声合成エンジンを生成 @@ -88,7 +88,7 @@ def load_core_library(core_dir: Path, suppress_error: bool = False): file=sys.stderr, ) else: - synthesis_engines[core_version] = SynthesisEngine(core=core) + synthesis_engines[core_version] = TTSEngine(core=core) except Exception: if not suppress_error: raise @@ -113,11 +113,11 @@ def load_core_library(core_dir: Path, suppress_error: bool = False): # モック追加 from ..dev.core import metas as mock_metas from ..dev.core import supported_devices as mock_supported_devices - from ..dev.synthesis_engine import MockSynthesisEngine + from ..dev.synthesis_engine import MockTTSEngine if "0.0.0" not in synthesis_engines: print("Info: Loading mock.") - synthesis_engines["0.0.0"] = MockSynthesisEngine( + synthesis_engines["0.0.0"] = MockTTSEngine( speakers=mock_metas(), supported_devices=mock_supported_devices() ) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 67ebb9564..519c2836b 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -9,7 +9,7 @@ from ..core_wrapper import CoreWrapper, OldCoreError from ..model import AccentPhrase, AudioQuery, Mora from .acoustic_feature_extractor import OjtPhoneme -from .tts_engine_base import SynthesisEngineBase +from .tts_engine_base import TTSEngineBase unvoiced_mora_phoneme_list = ["A", "I", "U", "E", "O", "cl", "pau"] mora_phoneme_list = ["a", "i", "u", "e", "o", "N"] + unvoiced_mora_phoneme_list @@ -390,7 +390,7 @@ def raw_wave_to_output_wave(query: AudioQuery, wave: ndarray, sr_wave: int) -> n return wave -class SynthesisEngine(SynthesisEngineBase): +class TTSEngine(TTSEngineBase): """音声合成器(core)の管理/実行/プロキシと音声合成フロー""" def __init__(self, core: CoreWrapper): diff --git a/voicevox_engine/tts_pipeline/tts_engine_base.py b/voicevox_engine/tts_pipeline/tts_engine_base.py index 3a846905e..7ffca18a5 100644 --- a/voicevox_engine/tts_pipeline/tts_engine_base.py +++ b/voicevox_engine/tts_pipeline/tts_engine_base.py @@ -172,7 +172,7 @@ def test_to_accent_phrases(text: str) -> list[AccentPhrase]: return utterance_to_accent_phrases(utterance) -class SynthesisEngineBase(metaclass=ABCMeta): +class TTSEngineBase(metaclass=ABCMeta): @property @abstractmethod def default_sampling_rate(self) -> int: From 53f8c540a93a5eece40a779b01877f29beb77d6c Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 03:15:57 +0900 Subject: [PATCH 042/177] =?UTF-8?q?=E8=BF=BD=E5=8A=A0:=20=E6=B3=A2?= =?UTF-8?q?=E5=BD=A2=E5=90=88=E6=88=90=E5=BE=8C=E5=87=A6=E7=90=86=E3=81=AE?= =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=20(#902)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- test/test_synthesis_engine.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 24136abe1..f0f62427a 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -26,6 +26,7 @@ mora_phoneme_list, pre_process, query_to_decoder_feature, + raw_wave_to_output_wave, split_mora, to_flatten_moras, to_flatten_phonemes, @@ -488,6 +489,38 @@ def test_query_to_decoder_feature(): assert numpy.array_equal(f0, true_f0) +def test_raw_wave_to_output_wave_with_resample(): + """Test `raw_wave_to_output_wave` with resampling option.""" + # Inputs + query = _gen_query(volumeScale=2, outputSamplingRate=48000, outputStereo=True) + raw_wave = numpy.random.rand(240) + sr_raw_wave = 24000 + + # Expects + true_wave_shape = (480, 2) + + # Outputs + wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) + + assert wave.shape == true_wave_shape + + +def test_raw_wave_to_output_wave_without_resample(): + """Test `raw_wave_to_output_wave` without resampling option.""" + # Inputs + query = _gen_query(volumeScale=2, outputStereo=True) + raw_wave = numpy.random.rand(240) + sr_raw_wave = 24000 + + # Expects + true_wave = numpy.array([2 * raw_wave, 2 * raw_wave]).T + + # Outputs + wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) + + assert numpy.allclose(wave, true_wave) + + class TestTTSEngine(TestCase): def setUp(self): super().setUp() From d8488ff16e0fdf164621665a26ef410c73c908ed Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 04:08:13 +0900 Subject: [PATCH 043/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E7=B0=A1?= =?UTF-8?q?=E6=98=93=20docstring=20=E3=81=A8=E5=8D=98=E7=B4=94=E5=A4=89?= =?UTF-8?q?=E6=95=B0=E5=90=8D=20(#903)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hiroshiba --- voicevox_engine/tts_pipeline/tts_engine.py | 119 +++------------------ 1 file changed, 16 insertions(+), 103 deletions(-) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 519c2836b..acf6e8c5a 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -123,19 +123,7 @@ def generate_silence_mora(length: float) -> Mora: def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: - """ - 前後無音(`prePhonemeLength` & `postPhonemeLength`)の適用 - Parameters - ---------- - moras : List[Mora] - モーラ時系列 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - moras : List[Mora] - 前後無音が付加されたモーラ時系列 - """ + """モーラ系列へ音声合成用のクエリがもつ前後無音(`prePhonemeLength` & `postPhonemeLength`)を付加する""" pre_silence_moras = [generate_silence_mora(query.prePhonemeLength)] post_silence_moras = [generate_silence_mora(query.postPhonemeLength)] moras = pre_silence_moras + moras + post_silence_moras @@ -143,19 +131,7 @@ def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]: def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: - """ - 話速スケール(`speedScale`)の適用 - Parameters - ---------- - moras : list[Mora] - モーラ系列 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - moras : list[Mora] - 話速スケールが適用されたモーラ系列 - """ + """モーラ系列へ音声合成用のクエリがもつ話速スケール(`speedScale`)を適用する""" for mora in moras: mora.vowel_length /= query.speedScale if mora.consonant_length: @@ -202,38 +178,14 @@ def _to_frame(sec: float) -> ndarray: def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: - """ - 音高スケール(`pitchScale`)の適用 - Parameters - ---------- - moras : list[Mora] - モーラ系列 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - moras : list[Mora] - 音高スケールが適用されたモーラ系列 - """ + """モーラ系列へ音声合成用のクエリがもつ音高スケール(`pitchScale`)を適用する""" for mora in moras: mora.pitch *= 2**query.pitchScale return moras def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: - """ - 抑揚スケール(`intonationScale`)の適用 - Parameters - ---------- - moras : list[Mora] - モーラ系列 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - moras : list[Mora] - 抑揚スケールが適用されたモーラ系列 - """ + """モーラ系列へ音声合成用のクエリがもつ抑揚スケール(`intonationScale`)を適用する""" # 有声音素 (f0>0) の平均値に対する乖離度をスケール voiced = list(filter(lambda mora: mora.pitch > 0, moras)) mean_f0 = numpy.mean(list(map(lambda mora: mora.pitch, voiced))).item() @@ -267,19 +219,7 @@ def calc_frame_pitch(moras: list[Mora], frame_per_mora: ndarray) -> ndarray: def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray: - """ - 音量スケール(`volumeScale`)の適用 - Parameters - ---------- - wave : numpy.ndarray - 音声波形 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - wave : numpy.ndarray - 音量スケールが適用された音声波形 - """ + """音声波形へ音声合成用のクエリがもつ音量スケール(`volumeScale`)を適用する""" wave *= query.volumeScale return wave @@ -310,43 +250,16 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar def apply_output_sampling_rate( wave: ndarray, sr_wave: int, query: AudioQuery ) -> ndarray: - """ - 出力サンプリングレート(`outputSamplingRate`)の適用 - Parameters - ---------- - wave : ndarray - 音声波形 - sr_wave : int - `wave`のサンプリングレート - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - wave : ndarray - 出力サンプリングレートが適用された音声波形 - """ + """音声波形へ音声合成用のクエリがもつ出力サンプリングレート(`outputSamplingRate`)を適用する""" # サンプリングレート一致のときはスルー if sr_wave == query.outputSamplingRate: return wave - wave = resample(wave, sr_wave, query.outputSamplingRate) return wave def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: - """ - ステレオ出力(`outputStereo`)の適用 - Parameters - ---------- - wave : ndarray - 音声波形 - query : AudioQuery - 音声合成用のクエリ - Returns - ------- - wave : ndarray - ステレオ出力設定が適用された音声波形 - """ + """音声波形へ音声合成用のクエリがもつステレオ出力設定(`outputStereo`)を適用する""" if query.outputStereo: wave = numpy.array([wave, wave]).T return wave @@ -366,18 +279,18 @@ def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: f0 : ndarray フレームごとの基本周波数、shape=(Frame,) """ - flatten_moras = to_flatten_moras(query.accent_phrases) + moras = to_flatten_moras(query.accent_phrases) - flatten_moras = apply_prepost_silence(flatten_moras, query) - flatten_moras = apply_speed_scale(flatten_moras, query) - flatten_moras = apply_pitch_scale(flatten_moras, query) - flatten_moras = apply_intonation_scale(flatten_moras, query) + moras = apply_prepost_silence(moras, query) + moras = apply_speed_scale(moras, query) + moras = apply_pitch_scale(moras, query) + moras = apply_intonation_scale(moras, query) - phoneme_data_list = to_flatten_phonemes(flatten_moras) + phonemes = to_flatten_phonemes(moras) - frame_per_phoneme, frame_per_mora = count_frame_per_unit(flatten_moras) - f0 = calc_frame_pitch(flatten_moras, frame_per_mora) - phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) + frame_per_phoneme, frame_per_mora = count_frame_per_unit(moras) + f0 = calc_frame_pitch(moras, frame_per_mora) + phoneme = calc_frame_phoneme(phonemes, frame_per_phoneme) return phoneme, f0 From 1639300b896d94abf80a44e5039971763c9de788 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 17:02:37 +0900 Subject: [PATCH 044/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E6=B3=A2?= =?UTF-8?q?=E5=BD=A2=E5=90=88=E6=88=90=E5=89=8D=E5=87=A6=E7=90=86=E3=81=AE?= =?UTF-8?q?=E9=96=A2=E6=95=B0=E3=83=8D=E3=82=B9=E3=83=88=E5=BB=83=E6=AD=A2?= =?UTF-8?q?=20(#907)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor: 関数ネストの廃止 * Update voicevox_engine/tts_pipeline/tts_engine.py --------- Co-authored-by: Hiroshiba --- test/test_synthesis_engine.py | 55 ----------------- voicevox_engine/tts_pipeline/tts_engine.py | 70 +++------------------- 2 files changed, 8 insertions(+), 117 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index f0f62427a..ed6345674 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -20,8 +20,6 @@ apply_prepost_silence, apply_speed_scale, apply_volume_scale, - calc_frame_phoneme, - calc_frame_pitch, count_frame_per_unit, mora_phoneme_list, pre_process, @@ -377,59 +375,6 @@ def test_count_frame_per_unit(): assert numpy.array_equal(frame_per_mora, true_frame_per_mora) -def test_calc_frame_pitch(): - """Test `test_calc_frame_pitch`.""" - # Inputs - moras = [ - _gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0), - _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), - _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), - _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), - _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), - _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), - _gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0), - ] - # Pre ko N pau hi hO Pst - frame_per_mora = [1, 3, 2, 1, 3, 3, 3] - frame_per_mora = numpy.array(frame_per_mora, dtype=numpy.int32) - - # pau ko ko ko N N - true1_f0 = [0.0, 50.0, 50.0, 50.0, 50.0, 50.0] - # pau hi hi hi - true2_f0 = [0.0, 125.0, 125.0, 125.0] - # hO hO hO paw paw paw - true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) - - # Outputs - f0 = calc_frame_pitch(moras, frame_per_mora) - - assert numpy.array_equal(f0, true_f0) - - -def test_calc_frame_phoneme(): - """Test `calc_frame_phoneme`.""" - # Inputs - phoneme_str = "pau k o N pau h i h O pau" - phonemes = [OjtPhoneme(p) for p in phoneme_str.split()] - # Pre k o N pau h i h O Pst - frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3] - n_frame = sum(frame_per_phoneme) - frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32) - - # Expects - # Pr k o o N N pau h i i h h O Pt Pt Pt - phoneme_ids = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0] - true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32) - for frame_idx, phoneme_idx in enumerate(phoneme_ids): - true_frame_phoneme[frame_idx, phoneme_idx] = 1.0 - - # Outputs - frame_phoneme = calc_frame_phoneme(phonemes, frame_per_phoneme) - - assert numpy.array_equal(frame_phoneme, true_frame_phoneme) - - def test_query_to_decoder_feature(): """Test `query_to_decoder_feature`.""" # Inputs diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index acf6e8c5a..803f721e5 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -195,58 +195,12 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_pitch(moras: list[Mora], frame_per_mora: ndarray) -> ndarray: - """ - フレームごとのピッチの生成 - Parameters - ---------- - moras : List[Mora] - モーラ列 - frame_per_mora : ndarray - モーラあたりのフレーム長 - Returns - ------- - frame_f0 : NDArray[] - フレームごとの基本周波数系列 - """ - # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) - # モーラごとの基本周波数 - f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32) - - # Rescale: 時間スケールの変更(モーラ -> フレーム) - frame_f0 = numpy.repeat(f0, frame_per_mora) - return frame_f0 - - def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray: """音声波形へ音声合成用のクエリがもつ音量スケール(`volumeScale`)を適用する""" wave *= query.volumeScale return wave -def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray): - """ - フレームごとの音素列の生成(onehot化 + フレーム化) - Parameters - ---------- - phonemes : List[OjtPhoneme] - 音素列 - frame_per_phoneme: NDArray - 音素あたりのフレーム長。端数丸め。 - Returns - ------- - frame_phoneme : NDArray[] - フレームごとの音素系列 - """ - # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790) - # Convert: Core入力形式への変換(onehotベクトル系列) - onehot_phoneme = numpy.stack([p.onehot for p in phonemes]) - - # Rescale: 時間スケールの変更(音素 -> フレーム) - frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0) - return frame_phoneme - - def apply_output_sampling_rate( wave: ndarray, sr_wave: int, query: AudioQuery ) -> ndarray: @@ -266,31 +220,23 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: - """ - 音声合成用のクエリをデコーダー用特徴量へ変換する。 - Parameters - ---------- - query : AudioQuery - 音声合成クエリ - Returns - ------- - phoneme : ndarray - フレームごとの音素、shape=(Frame,) - f0 : ndarray - フレームごとの基本周波数、shape=(Frame,) - """ + """音声合成用のクエリからフレームごとの音素 (shape=(フレーム長, 音素数)) と音高 (shape=(フレーム長,)) を得る""" moras = to_flatten_moras(query.accent_phrases) + # 設定を適用する moras = apply_prepost_silence(moras, query) moras = apply_speed_scale(moras, query) moras = apply_pitch_scale(moras, query) moras = apply_intonation_scale(moras, query) - phonemes = to_flatten_phonemes(moras) + # 表現を変更する(音素クラス → 音素 onehot ベクトル、モーラクラス → 音高スカラ) + phoneme = numpy.stack([p.onehot for p in to_flatten_phonemes(moras)]) + f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32) + # 時間スケールを変更する(音素・モーラ → フレーム) frame_per_phoneme, frame_per_mora = count_frame_per_unit(moras) - f0 = calc_frame_pitch(moras, frame_per_mora) - phoneme = calc_frame_phoneme(phonemes, frame_per_phoneme) + phoneme = numpy.repeat(phoneme, frame_per_phoneme, axis=0) + f0 = numpy.repeat(f0, frame_per_mora) return phoneme, f0 From 831d28aad743a7583e8260cfc72da53f205fa966 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 20 Dec 2023 20:34:13 +0900 Subject: [PATCH 045/177] =?UTF-8?q?=E6=95=B4=E7=90=86:=20OpenJTalk=20`Phon?= =?UTF-8?q?eme`=20=E2=86=92=20`Label`=20=E3=83=AA=E3=83=8D=E3=83=BC?= =?UTF-8?q?=E3=83=A0=20(#893)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_full_context_label.py | 16 +- .../tts_pipeline/full_context_label.py | 206 ++++++++---------- 2 files changed, 96 insertions(+), 126 deletions(-) diff --git a/test/test_full_context_label.py b/test/test_full_context_label.py index 87bf57785..77acad8b9 100644 --- a/test/test_full_context_label.py +++ b/test/test_full_context_label.py @@ -5,8 +5,8 @@ from voicevox_engine.tts_pipeline.full_context_label import ( AccentPhrase, BreathGroup, + Label, Mora, - Phoneme, Utterance, ) @@ -123,7 +123,7 @@ def setUp(self): + "@xx+xx&xx-xx|xx+xx/J:xx_xx/K:2+2-9", ] self.phonemes_hello_hiho = [ - Phoneme.from_label(label) for label in self.test_case_hello_hiho + Label.from_feature(feature) for feature in self.test_case_hello_hiho ] @@ -254,10 +254,10 @@ def setUp(self) -> None: super().setUp() # TODO: ValueErrorを吐く作為的ではない自然な例の模索 # 存在しないなら放置でよい - self.accent_phrase_hello = AccentPhrase.from_phonemes( + self.accent_phrase_hello = AccentPhrase.from_labels( self.phonemes_hello_hiho[1:10] ) - self.accent_phrase_hiho = AccentPhrase.from_phonemes( + self.accent_phrase_hiho = AccentPhrase.from_labels( self.phonemes_hello_hiho[11:19] ) @@ -298,10 +298,10 @@ def test_labels(self): class TestBreathGroup(TestBasePhonemes): def setUp(self) -> None: super().setUp() - self.breath_group_hello = BreathGroup.from_phonemes( + self.breath_group_hello = BreathGroup.from_labels( self.phonemes_hello_hiho[1:10] ) - self.breath_group_hiho = BreathGroup.from_phonemes( + self.breath_group_hiho = BreathGroup.from_labels( self.phonemes_hello_hiho[11:19] ) @@ -337,7 +337,7 @@ def test_labels(self): class TestUtterance(TestBasePhonemes): def setUp(self) -> None: super().setUp() - self.utterance_hello_hiho = Utterance.from_phonemes(self.phonemes_hello_hiho) + self.utterance_hello_hiho = Utterance.from_labels(self.phonemes_hello_hiho) def test_phonemes(self): self.assertEqual( @@ -346,7 +346,7 @@ def test_phonemes(self): ), "sil k o N n i ch i w a pau h i h o d e s U sil", ) - changed_utterance = Utterance.from_phonemes(self.utterance_hello_hiho.phonemes) + changed_utterance = Utterance.from_labels(self.utterance_hello_hiho.phonemes) self.assertEqual(len(changed_utterance.breath_groups), 2) accent_phrases = list( chain.from_iterable( diff --git a/voicevox_engine/tts_pipeline/full_context_label.py b/voicevox_engine/tts_pipeline/full_context_label.py index 6c1204d7b..ff843dcdc 100644 --- a/voicevox_engine/tts_pipeline/full_context_label.py +++ b/voicevox_engine/tts_pipeline/full_context_label.py @@ -7,36 +7,22 @@ @dataclass -class Phoneme: +class Label: """ - 音素(母音・子音)クラス、音素の元となるcontextを保持する - 音素には、母音や子音以外にも無音(silent/pause)も含まれる + OpenJTalk Label Attributes ---------- contexts: dict[str, str] - 音素の元 + ラベルの属性 """ contexts: dict[str, str] @classmethod - def from_label(cls, label: str): - """ - pyopenjtalk.extract_fullcontextで得られる音素の元(ラベル)から、Phonemeクラスを作成する - Parameters - ---------- - label : str - pyopenjtalk.extract_fullcontextで得られるラベルを渡す - - Returns - ------- - phoneme: Phoneme - Phonemeクラスを返す - """ - - # フルコンテキストラベルの仕様は、 - # http://hts.sp.nitech.ac.jp/?Download の HTS-2.3のJapanese tar.bz2 (126 MB)をダウンロードして、data/lab_format.pdfを見るとリストが見つかります。 # noqa + def from_feature(cls, feature: str): + """OpenJTalk feature から Label インスタンスを生成する""" + # フルコンテキストラベルの仕様は、http://hts.sp.nitech.ac.jp/?Download の HTS-2.3のJapanese tar.bz2 (126 MB)をダウンロードして、data/lab_format.pdfを見るとリストが見つかります。 # noqa contexts = re.search( r"^(?P.+?)\^(?P.+?)\-(?P.+?)\+(?P.+?)\=(?P.+?)" r"/A\:(?P.+?)\+(?P.+?)\+(?P.+?)" @@ -50,33 +36,21 @@ def from_label(cls, label: str): r"/I\:(?P.+?)\-(?P.+?)\@(?P.+?)\+(?P.+?)\&(?P.+?)\-(?P.+?)\|(?P.+?)\+(?P.+?)" # noqa r"/J\:(?P.+?)\_(?P.+?)" r"/K\:(?P.+?)\+(?P.+?)\-(?P.+?)$", - label, + feature, ).groupdict() return cls(contexts=contexts) @property def phoneme(self): - """ - 音素クラスの中で、発声に必要なcontextを返す - Returns - ------- - phoneme : str - 発声に必要なcontextを返す - """ + """このラベルに含まれる音素。子音 or 母音 (無音含む)。""" return self.contexts["p3"] def is_pause(self): - """ - 音素がポーズ(無音、silent/pause)であるかを返す - Returns - ------- - is_pose : bool - 音素がポーズ(無音、silent/pause)であるか(True)否か(False) - """ + """このラベルが無音 (silent/pause) であれば True、そうでなければ False を返す""" return self.contexts["f1"] == "xx" def __repr__(self): - return f"" + return f"