project-sブランチをmainブランチにマージ (#1029)

VOICEVOX · Jan 27, 2024 · commit ea76515
2 parents: 0c94f68 + 2f4c1ff

Showing 11 changed files with 863 additions and 17 deletions.
diff --git a/engine_manifest.json b/engine_manifest.json
@@ -9,6 +9,7 @@
     "port": 50021,
     "icon": "engine_manifest_assets/icon.png",
     "default_sampling_rate": 24000,
+    "frame_rate": 93.75,
     "terms_of_service": "engine_manifest_assets/terms_of_service.md",
     "update_infos": "engine_manifest_assets/update_infos.json",
     "dependency_licenses": "engine_manifest_assets/dependency_licenses.json",

diff --git a/run.py b/run.py
@@ -41,10 +41,12 @@
     AudioQuery,
     BaseLibraryInfo,
     DownloadableLibraryInfo,
+    FrameAudioQuery,
     InstalledLibraryInfo,
     MorphableTargetInfo,
     ParseKanaBadRequest,
     ParseKanaError,
+    Score,
     Speaker,
     SpeakerInfo,
     StyleIdNotFoundError,
@@ -640,6 +642,69 @@ def _synthesis_morphing(
             background=BackgroundTask(delete_file, f.name),
         )
 
+    @app.post(
+        "/sing_frame_audio_query",
+        response_model=FrameAudioQuery,
+        tags=["クエリ作成"],
+        summary="歌唱音声合成用のクエリを作成する",
+    )
+    def sing_frame_audio_query(
+        score: Score,
+        style_id: StyleId = Query(alias="speaker"),  # noqa: B008
+        core_version: str | None = None,
+    ) -> FrameAudioQuery:
+        """
+        歌唱音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま歌唱音声合成に利用できます。各値の意味は`Schemas`を参照してください。
+        """
+        engine = get_engine(core_version)
+        core = get_core(core_version)
+        phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume(
+            score, style_id
+        )
+
+        return FrameAudioQuery(
+            f0=f0,
+            volume=volume,
+            phonemes=phonemes,
+            volumeScale=1,
+            outputSamplingRate=core.default_sampling_rate,
+            outputStereo=False,
+        )
+
+    @app.post(
+        "/frame_synthesis",
+        response_class=FileResponse,
+        responses={
+            200: {
+                "content": {
+                    "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+                },
+            }
+        },
+        tags=["音声合成"],
+    )
+    def frame_synthesis(
+        query: FrameAudioQuery,
+        style_id: StyleId = Query(alias="speaker"),  # noqa: B008
+        core_version: str | None = None,
+    ) -> FileResponse:
+        """
+        歌唱音声合成を行います。
+        """
+        engine = get_engine(core_version)
+        wave = engine.frame_synthsize_wave(query, style_id)
+
+        with NamedTemporaryFile(delete=False) as f:
+            soundfile.write(
+                file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
+            )
+
+        return FileResponse(
+            f.name,
+            media_type="audio/wav",
+            background=BackgroundTask(delete_file, f.name),
+        )
+
     @app.post(
         "/connect_waves",
         response_class=FileResponse,

diff --git a/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json b/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json
@@ -224,6 +224,10 @@
             "title": "依存関係のライセンス情報",
             "type": "array"
           },
+          "frame_rate": {
+            "title": "エンジンのフレームレート",
+            "type": "number"
+          },
           "icon": {
             "title": "エンジンのアイコンをBASE64エンコードしたもの",
             "type": "string"
@@ -276,6 +280,7 @@
           "url",
           "icon",
           "default_sampling_rate",
+          "frame_rate",
           "terms_of_service",
           "update_infos",
           "dependency_licenses",
@@ -284,6 +289,73 @@
         "title": "EngineManifest",
         "type": "object"
       },
+      "FrameAudioQuery": {
+        "description": "フレームごとの音声合成用のクエリ",
+        "properties": {
+          "f0": {
+            "items": {
+              "type": "number"
+            },
+            "title": "フレームごとの基本周波数",
+            "type": "array"
+          },
+          "outputSamplingRate": {
+            "title": "音声データの出力サンプリングレート",
+            "type": "integer"
+          },
+          "outputStereo": {
+            "title": "音声データをステレオ出力するか否か",
+            "type": "boolean"
+          },
+          "phonemes": {
+            "items": {
+              "$ref": "#/components/schemas/FramePhoneme"
+            },
+            "title": "音素のリスト",
+            "type": "array"
+          },
+          "volume": {
+            "items": {
+              "type": "number"
+            },
+            "title": "フレームごとの音量",
+            "type": "array"
+          },
+          "volumeScale": {
+            "title": "全体の音量",
+            "type": "number"
+          }
+        },
+        "required": [
+          "f0",
+          "volume",
+          "phonemes",
+          "volumeScale",
+          "outputSamplingRate",
+          "outputStereo"
+        ],
+        "title": "FrameAudioQuery",
+        "type": "object"
+      },
+      "FramePhoneme": {
+        "description": "音素の情報",
+        "properties": {
+          "frame_length": {
+            "title": "音素のフレーム長",
+            "type": "integer"
+          },
+          "phoneme": {
+            "title": "音素",
+            "type": "string"
+          }
+        },
+        "required": [
+          "phoneme",
+          "frame_length"
+        ],
+        "title": "FramePhoneme",
+        "type": "object"
+      },
       "HTTPValidationError": {
         "properties": {
           "detail": {
@@ -448,6 +520,29 @@
         "title": "MorphableTargetInfo",
         "type": "object"
       },
+      "Note": {
+        "description": "音符ごとの情報",
+        "properties": {
+          "frame_length": {
+            "title": "音符のフレーム長",
+            "type": "integer"
+          },
+          "key": {
+            "title": "音階",
+            "type": "integer"
+          },
+          "lyric": {
+            "title": "音符の歌詞",
+            "type": "string"
+          }
+        },
+        "required": [
+          "frame_length",
+          "lyric"
+        ],
+        "title": "Note",
+        "type": "object"
+      },
       "ParseKanaBadRequest": {
         "properties": {
           "error_args": {
@@ -534,6 +629,23 @@
         "title": "Preset",
         "type": "object"
       },
+      "Score": {
+        "description": "楽譜情報",
+        "properties": {
+          "notes": {
+            "items": {
+              "$ref": "#/components/schemas/Note"
+            },
+            "title": "音符のリスト",
+            "type": "array"
+          }
+        },
+        "required": [
+          "notes"
+        ],
+        "title": "Score",
+        "type": "object"
+      },
       "Speaker": {
         "description": "話者情報",
         "properties": {
@@ -611,6 +723,15 @@
           "name": {
             "title": "スタイル名",
             "type": "string"
+          },
+          "type": {
+            "enum": [
+              "talk",
+              "humming",
+              "sing_teacher"
+            ],
+            "title": "モデルの種類",
+            "type": "string"
           }
         },
         "required": [
@@ -1433,6 +1554,69 @@
         ]
       }
     },
+    "/frame_synthesis": {
+      "post": {
+        "description": "歌唱音声合成を行います。",
+        "operationId": "frame_synthesis_frame_synthesis_post",
+        "parameters": [
+          {
+            "in": "query",
+            "name": "speaker",
+            "required": true,
+            "schema": {
+              "title": "Speaker",
+              "type": "integer"
+            }
+          },
+          {
+            "in": "query",
+            "name": "core_version",
+            "required": false,
+            "schema": {
+              "title": "Core Version",
+              "type": "string"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/FrameAudioQuery"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "content": {
+              "audio/wav": {
+                "schema": {
+                  "format": "binary",
+                  "type": "string"
+                }
+              }
+            },
+            "description": "Successful Response"
+          },
+          "422": {
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/HTTPValidationError"
+                }
+              }
+            },
+            "description": "Validation Error"
+          }
+        },
+        "summary": "Frame Synthesis",
+        "tags": [
+          "音声合成"
+        ]
+      }
+    },
     "/import_user_dict": {
       "post": {
         "description": "他のユーザー辞書をインポートします。\n\nParameters\n----------\nimport_dict_data: dict[str, UserDictWord]\n    インポートするユーザー辞書のデータ\noverride: bool\n    重複したエントリがあった場合、上書きするかどうか",
@@ -2066,6 +2250,68 @@
         ]
       }
     },
+    "/sing_frame_audio_query": {
+      "post": {
+        "description": "歌唱音声合成用のクエリの初期値を得ます。ここで得られたクエリはそのまま歌唱音声合成に利用できます。各値の意味は`Schemas`を参照してください。",
+        "operationId": "sing_frame_audio_query_sing_frame_audio_query_post",
+        "parameters": [
+          {
+            "in": "query",
+            "name": "speaker",
+            "required": true,
+            "schema": {
+              "title": "Speaker",
+              "type": "integer"
+            }
+          },
+          {
+            "in": "query",
+            "name": "core_version",
+            "required": false,
+            "schema": {
+              "title": "Core Version",
+              "type": "string"
+            }
+          }
+        ],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/Score"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/FrameAudioQuery"
+                }
+              }
+            },
+            "description": "Successful Response"
+          },
+          "422": {
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/HTTPValidationError"
+                }
+              }
+            },
+            "description": "Validation Error"
+          }
+        },
+        "summary": "歌唱音声合成用のクエリを作成する",
+        "tags": [
+          "クエリ作成"
+        ]
+      }
+    },
     "/speaker_info": {
       "get": {
         "description": "指定されたspeaker_uuidに関する情報をjson形式で返します。\n画像や音声はbase64エンコードされたものが返されます。\n\nReturns\n-------\nret_data: SpeakerInfo",