VOICEVOX · Hiroshiba · Dec 9, 2023 · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023
@@ -724,9 +724,14 @@ def result_value(i: int):
         self.assertEqual(result, true_result)
 
     def synthesis_test_base(self, audio_query: AudioQuery):
+        # Inputs 音素長・モーラ音高の設定 & Expects 音素長・音素ID・モーラ音高の記録
+        #     Inputs
+        #         `audio_query`: 子音長0.1秒/母音長0.1秒/モーラ音高ランダム
+        #     Expects
+        #         `phoneme_length_list`: 音素長系列
+        #         `phoneme_id_list`: 音素ID系列
+        #         `f0_list`: モーラ音高系列
         accent_phrases = audio_query.accent_phrases
-
-        # decode forwardのために適当にpitchとlengthを設定し、リストで持っておく
         phoneme_length_list = [0.0]
         phoneme_id_list = [0]
         f0_list = [0.0]
@@ -751,89 +756,105 @@ def synthesis_test_base(self, audio_query: AudioQuery):
         phoneme_length_list.append(0.0)
         phoneme_id_list.append(0)
         f0_list.append(0.0)
-
         phoneme_length_list[0] = audio_query.prePhonemeLength
         phoneme_length_list[-1] = audio_query.postPhonemeLength
 
+        # Expects: speedScale適用
         for i in range(len(phoneme_length_list)):
             phoneme_length_list[i] /= audio_query.speedScale
 
+        # Outputs: MockCore入りSynthesisEngine の `.synthesis` 出力および core.decode_forward 引数
         result = self.synthesis_engine.synthesis(query=audio_query, style_id=1)
-
-        # decodeに渡される値の検証
         decode_args = self.decode_mock.call_args[1]
         list_length = decode_args["length"]
+
+        # Test: フレーム長
         self.assertEqual(
             list_length,
             int(sum([round(p * 24000 / 256) for p in phoneme_length_list])),
         )
 
+        # Expects: pitchScale適用・音素onehot化・フレームスケール化・抑揚スケール適用
         num_phoneme = OjtPhoneme.num_phoneme
         # mora_phoneme_listのPhoneme ID版
         mora_phoneme_id_list = [
             OjtPhoneme(p, 0, 0).phoneme_id for p in mora_phoneme_list
         ]
 
-        # numpy.repeatをfor文でやる
-        f0 = []
-        phoneme = []
+        f0 = []  # フレームごとの音高系列
+        phoneme = []  # フレームごとの音素onehotベクトル系列
         f0_index = 0
         mean_f0 = []
         for i, phoneme_length in enumerate(phoneme_length_list):
+            # Expects: pitchScale適用
             f0_single = numpy.array(f0_list[f0_index], dtype=numpy.float32) * (
                 2**audio_query.pitchScale
             )
+            # Expects: フレームスケール化
             for _ in range(int(round(phoneme_length * (24000 / 256)))):
                 f0.append([f0_single])
+                # Expects: 音素onehot化
                 phoneme_s = []
                 for _ in range(num_phoneme):
                     phoneme_s.append(0)
-                # one hot
+                # Expects: 音素フレームスケール化
                 phoneme_s[phoneme_id_list[i]] = 1
                 phoneme.append(phoneme_s)
             # consonantとvowelを判別し、vowelであればf0_indexを一つ進める
             if phoneme_id_list[i] in mora_phoneme_id_list:
                 if f0_single > 0:
                     mean_f0.append(f0_single)
                 f0_index += 1
-
+        # Expects: 抑揚スケール適用
         mean_f0 = numpy.array(mean_f0, dtype=numpy.float32).mean()
         f0 = numpy.array(f0, dtype=numpy.float32)
         for i in range(len(f0)):
             if f0[i][0] != 0.0:
                 f0[i][0] = (f0[i][0] - mean_f0) * audio_query.intonationScale + mean_f0
-
         phoneme = numpy.array(phoneme, dtype=numpy.float32)
 
+        assert_f0_count = 0
+
+        # Outputs: decode_forward `f0` 引数
+        decode_f0 = decode_args["f0"]
+
+        # Test: フレームごとの音高系列
         # 乱数の影響で数値の位置がずれが生じるので、大半(4/5)があっていればよしとする
         # また、上の部分のint(round(phoneme_length * (24000 / 256)))の影響で
         # 本来のf0/phonemeとテスト生成したf0/phonemeの長さが変わることがあり、
         # テスト生成したものが若干長くなることがあるので、本来のものの長さを基準にassertする
-        assert_f0_count = 0
-        decode_f0 = decode_args["f0"]
         for i in range(len(decode_f0)):
             # 乱数の影響等で数値にずれが生じるので、10の-5乗までの近似値であれば許容する
             assert_f0_count += math.isclose(f0[i][0], decode_f0[i][0], rel_tol=10e-5)
         self.assertTrue(assert_f0_count >= int(len(decode_f0) / 5) * 4)
+
         assert_phoneme_count = 0
+
+        # Outputs: decode_forward `phoneme` 引数
         decode_phoneme = decode_args["phoneme"]
+
+        # Test: フレームごとの音素系列
         for i in range(len(decode_phoneme)):
             assert_true_count = 0
             for j in range(len(decode_phoneme[i])):
                 assert_true_count += bool(phoneme[i][j] == decode_phoneme[i][j])
             assert_phoneme_count += assert_true_count == num_phoneme
+
         self.assertTrue(assert_phoneme_count >= int(len(decode_phoneme) / 5) * 4)
+
+        # Test: スタイルID
         self.assertEqual(decode_args["style_id"], 1)
 
-        # decode forwarderのmockを使う
+        # Expects: waveform (by mock)
         true_result = decode_mock(list_length, num_phoneme, f0, phoneme, 1)
-
+        # Expects: 音量スケール適用
         true_result *= audio_query.volumeScale
 
         # TODO: resampyの部分は値の検証しようがないので、パスする
         if audio_query.outputSamplingRate != 24000:
             return
 
+        # Test: 音声波形
         assert_result_count = 0
         for i in range(len(true_result)):
             if audio_query.outputStereo:

@@ -519,6 +519,17 @@ def labels(self):
 
 
 def extract_full_context_label(text: str):
+    """
+    日本語テキストから発話クラスを抽出
+    Parameters
+    ----------
+    text : str
+        日本語テキスト
+    Returns
+    -------
+    utterance : Utterance
+        発話
+    """
     labels = pyopenjtalk.extract_fullcontext(text)
     phonemes = [Phoneme.from_label(label=label) for label in labels]
     utterance = Utterance.from_phonemes(phonemes)

@@ -1,15 +1,23 @@
+"""
+「AquesTalk風記法」を実装した AquesTalk風記法テキスト <-> アクセント句系列 変換。
+記法定義: `https://github.com/VOICEVOX/voicevox_engine/blob/master/README.md#読み方を-aquestalk風記法で取得修正するサンプルコード` # noqa
+"""
+
 from typing import List, Optional
 
 from .model import AccentPhrase, Mora, ParseKanaError, ParseKanaErrorCode
 from .mora_list import openjtalk_text2mora
 
 _LOOP_LIMIT = 300
-_UNVOICE_SYMBOL = "_"
-_ACCENT_SYMBOL = "'"
-_NOPAUSE_DELIMITER = "/"
-_PAUSE_DELIMITER = "、"
-_WIDE_INTERROGATION_MARK = "？"
 
+# AquesTalk風記法特殊文字
+_UNVOICE_SYMBOL = "_"  # 無声化
+_ACCENT_SYMBOL = "'"  # アクセント位置
+_NOPAUSE_DELIMITER = "/"  # ポーズ無しアクセント句境界
+_PAUSE_DELIMITER = "、"  # ポーズ有りアクセント句境界
+_WIDE_INTERROGATION_MARK = "？"  # 疑問形
+
+# AquesTalk風記法とモーラの対応（音素長・音高 0 初期化、疑問形 off 初期化）
 _text2mora_with_unvoice = {}
 for text, (consonant, vowel) in openjtalk_text2mora.items():
     _text2mora_with_unvoice[text] = Mora(
@@ -22,6 +30,8 @@
         is_interrogative=False,
     )
     if vowel in ["a", "i", "u", "e", "o"]:
+        # 手前に`_`を入れると無声化
+        # 例: "_ホ" -> "hO"
         _text2mora_with_unvoice[_UNVOICE_SYMBOL + text] = Mora(
             text=text,
             consonant=consonant if len(consonant) > 0 else None,
@@ -35,9 +45,19 @@
 
 def _text_to_accent_phrase(phrase: str) -> AccentPhrase:
     """
-    longest matchにより読み仮名からAccentPhraseを生成
-    入力長Nに対し計算量O(N^2)
+    単一アクセント句に相当するAquesTalk風記法テキストからアクセント句オブジェクトを生成
+    longest matchによりモーラ化。入力長Nに対し計算量O(N^2)。
+    Parameters
+    ----------
+    phrase : str
+        単一アクセント句に相当するAquesTalk風記法テキスト
+    Returns
+    -------
+    accent_phrase : AccentPhrase
+        アクセント句
     """
+    # NOTE: ポーズと疑問形はこの関数内で処理しない
+
     accent_index: Optional[int] = None
     moras: List[Mora] = []
 
@@ -48,24 +68,33 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase:
     outer_loop = 0
     while base_index < len(phrase):
         outer_loop += 1
+
+        # `'`の手前がアクセント位置
         if phrase[base_index] == _ACCENT_SYMBOL:
             if len(moras) == 0:
                 raise ParseKanaError(ParseKanaErrorCode.ACCENT_TOP, text=phrase)
+            # すでにアクセント位置がある場合はエラー
             if accent_index is not None:
                 raise ParseKanaError(ParseKanaErrorCode.ACCENT_TWICE, text=phrase)
             accent_index = len(moras)
             base_index += 1
             continue
+
+        # モーラ探索
+        # より長い要素からなるモーラが見つかれば上書き（longest match）
+        # 例: phrase "キャ" -> "キ" 検出 -> "キャ" 検出/上書き -> Mora("キャ")
         for watch_index in range(base_index, len(phrase)):
+            # アクセント位置特殊文字が来たら探索打ち切り
             if phrase[watch_index] == _ACCENT_SYMBOL:
                 break
-            # 普通の文字の場合
             stack += phrase[watch_index]
             if stack in _text2mora_with_unvoice:
+                # より長い要素からなるモーラが見つかれば上書き（longest match）
+                # 例: phrase "キャ" -> "キ" 検出 -> "キャ" 検出/上書き -> Mora("キャ")
                 matched_text = stack
-        # push mora
         if matched_text is None:
             raise ParseKanaError(ParseKanaErrorCode.UNKNOWN_TEXT, text=stack)
+        # push mora
         else:
             moras.append(_text2mora_with_unvoice[matched_text].copy(deep=True))
             base_index += len(matched_text)
@@ -81,7 +110,15 @@ def _text_to_accent_phrase(phrase: str) -> AccentPhrase:
 
 def parse_kana(text: str) -> List[AccentPhrase]:
     """
-    AquesTalk風記法テキストをパースして音長・音高未指定のaccent phraseに変換
+    AquesTalk風記法テキストからアクセント句系列を生成
+    Parameters
+    ----------
+    text : str
+        AquesTalk風記法テキスト
+    Returns
+    -------
+    parsed_results : List[AccentPhrase]
+        アクセント句（音素長・モーラ音高 0初期化）系列
     """
 
     parsed_results: List[AccentPhrase] = []
@@ -90,6 +127,7 @@ def parse_kana(text: str) -> List[AccentPhrase]:
         raise ParseKanaError(ParseKanaErrorCode.EMPTY_PHRASE, position=1)
 
     for i in range(len(text) + 1):
+        # アクセント句境界（`/`か`、`）の出現までインデックス進展
         if i == len(text) or text[i] in [_PAUSE_DELIMITER, _NOPAUSE_DELIMITER]:
             phrase = text[phrase_base:i]
             if len(phrase) == 0:
@@ -99,15 +137,19 @@ def parse_kana(text: str) -> List[AccentPhrase]:
                 )
             phrase_base = i + 1
 
+            # アクセント句末に`？`で疑問文
             is_interrogative = _WIDE_INTERROGATION_MARK in phrase
             if is_interrogative:
                 if _WIDE_INTERROGATION_MARK in phrase[:-1]:
                     raise ParseKanaError(
                         ParseKanaErrorCode.INTERROGATION_MARK_NOT_AT_END, text=phrase
                     )
+                # 疑問形はモーラでなくアクセント句属性で表現
                 phrase = phrase.replace(_WIDE_INTERROGATION_MARK, "")
 
             accent_phrase: AccentPhrase = _text_to_accent_phrase(phrase)
+
+            # `、`で無音区間を挿入
             if i < len(text) and text[i] == _PAUSE_DELIMITER:
                 accent_phrase.pause_mora = Mora(
                     text="、",
@@ -125,22 +167,38 @@ def parse_kana(text: str) -> List[AccentPhrase]:
 
 
 def create_kana(accent_phrases: List[AccentPhrase]) -> str:
+    """
+    アクセント句系列からAquesTalk風記法テキストを生成
+    Parameters
+    ----------
+    accent_phrases : List[AccentPhrase]
+        アクセント句系列
+    Returns
+    -------
+    text : str
+        AquesTalk風記法テキスト
+    """
     text = ""
+    # アクセント句を先頭から逐次パースし、`text`末尾にAquesTalk風記法の文字を都度追加（ループ）
     for i, phrase in enumerate(accent_phrases):
         for j, mora in enumerate(phrase.moras):
+            # Rule3: "カナの手前に`_`を入れるとそのカナは無声化される"
-            # Rule3: "カナの手前に`_`を入れるとそのカナは無声化される"
+            # 無声化
-            # Rule3: "カナの手前に`_`を入れるとそのカナは無声化される"
+            # 無声化
             if mora.vowel in ["A", "I", "U", "E", "O"]:
                 text += _UNVOICE_SYMBOL
-
             text += mora.text
+            # `'`でアクセント位置
             if j + 1 == phrase.accent:
                 text += _ACCENT_SYMBOL
 
+        # Rule5: "アクセント句末に`？`(全角)を入れることにより疑問文の発音ができる"
-        # Rule5: "アクセント句末に`？`(全角)を入れることにより疑問文の発音ができる"
+        # `？`で疑問文
-        # Rule5: "アクセント句末に`？`(全角)を入れることにより疑問文の発音ができる"
+        # `？`で疑問文
         if phrase.is_interrogative:
             text += _WIDE_INTERROGATION_MARK
 
         if i < len(accent_phrases) - 1:
             if phrase.pause_mora is None:
+                # アクセント句区切り
                 text += _NOPAUSE_DELIMITER
             else:
+                # 無音でアクセント句区切り
                 text += _PAUSE_DELIMITER
     return text