Skip to content

Commit

Permalink
feat!: AudioQueryのJSON表現をENGINEと同じにする (#946)
Browse files Browse the repository at this point in the history
#261 が行われた理由は次の2点である。

1. VOICEVOX/voicevox_engine#460 もやろうとしていた
2. (少なくとも正攻法では)`pydantic.dataclasses`の対応ができない
   (#239 (comment))

1.は中止、2.もこの度`pydantic.TypeAdapter`を利用および推奨すればよいこと
がわかったため、 #261 の取り消しを行う。これでENGINEとCOREでAudioQueryを
使い回せるようになる。

BREAKING-CHANGE: AudioQueryのJSON表現がENGINEと同じになる。
See-also: https://docs.pydantic.dev/2.10/api/type_adapter/
Co-authored-by: Hiroshiba <[email protected]>
  • Loading branch information
qryxip and Hiroshiba authored Feb 3, 2025
1 parent 91158a3 commit d810e4b
Show file tree
Hide file tree
Showing 12 changed files with 156 additions and 131 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ futures-core = "0.3.31"
futures-util = "0.3.31"
futures-lite = "2.3.0"
futures-io = "0.3.31"
heck = "0.4.1"
humansize = "2.1.3"
indexmap = "2.6.0"
indicatif = "0.17.8"
Expand Down
1 change: 0 additions & 1 deletion crates/voicevox_core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ voicevox-ort = { workspace = true, features = ["download-binaries", "__init-for-
voicevox_core_macros.workspace = true

[dev-dependencies]
heck.workspace = true
pollster = { workspace = true, features = ["macro"] }
pretty_assertions.workspace = true
rstest.workspace = true
Expand Down
103 changes: 36 additions & 67 deletions crates/voicevox_core/src/engine/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,32 +55,42 @@ pub struct AudioQuery {
/// アクセント句の配列。
pub accent_phrases: Vec<AccentPhrase>,
/// 全体の話速。
#[serde(rename = "speedScale")]
pub speed_scale: f32,
/// 全体の音高。
#[serde(rename = "pitchScale")]
pub pitch_scale: f32,
/// 全体の抑揚。
#[serde(rename = "intonationScale")]
pub intonation_scale: f32,
/// 全体の音量。
#[serde(rename = "volumeScale")]
pub volume_scale: f32,
/// 音声の前の無音時間。
#[serde(rename = "prePhonemeLength")]
pub pre_phoneme_length: f32,
/// 音声の後の無音時間。
#[serde(rename = "postPhonemeLength")]
pub post_phoneme_length: f32,
/// 音声データの出力サンプリングレート。
#[serde(rename = "outputSamplingRate")]
pub output_sampling_rate: u32,
/// 音声データをステレオ出力するか否か。
#[serde(rename = "outputStereo")]
pub output_stereo: bool,
// TODO: VOICEVOX/voicevox_engine#1308 を実装する
/// 句読点などの無音時間。`null`のときは無視される。デフォルト値は`null`。
#[serde(
default,
rename = "pauseLength",
deserialize_with = "deserialize_pause_length",
serialize_with = "serialize_pause_length"
)]
pub pause_length: (),
/// 読点などの無音時間(倍率)。デフォルト値は`1`。
#[serde(
default,
rename = "pauseLengthScale",
deserialize_with = "deserialize_pause_length_scale",
serialize_with = "serialize_pause_length_scale"
)]
Expand Down Expand Up @@ -183,52 +193,11 @@ impl AudioQuery {

#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use rstest::rstest;
use serde_json::json;

use super::AudioQuery;

/// Serializes an `AudioQuery` whose fields are all zero/empty and checks that
/// every key of the resulting JSON is snake_case.
///
/// Only the key names are inspected, so the field values themselves are
/// irrelevant placeholders.
#[rstest]
fn check_audio_query_model_json_field_snake_case() {
    let serialized = serde_json::to_value(AudioQuery {
        accent_phrases: Vec::new(),
        speed_scale: 0.0,
        pitch_scale: 0.0,
        intonation_scale: 0.0,
        volume_scale: 0.0,
        pre_phoneme_length: 0.0,
        post_phoneme_length: 0.0,
        output_sampling_rate: 0,
        output_stereo: false,
        pause_length: (),
        pause_length_scale: (),
        kana: None,
    })
    .unwrap();

    check_json_field_snake_case(&serialized);
}

fn check_json_field_snake_case(val: &serde_json::Value) {
use serde_json::Value::*;

match val {
Object(obj) => {
for (k, v) in obj.iter() {
use heck::ToSnakeCase as _;
assert_eq!(k.to_snake_case(), *k, "should be snake case {k}");
check_json_field_snake_case(v);
}
}
Array(array) => {
for val in array.iter() {
check_json_field_snake_case(val);
}
}
_ => {}
}
}

#[rstest]
fn it_accepts_json_without_optional_fields() -> anyhow::Result<()> {
serde_json::from_value::<AudioQuery>(json!({
Expand All @@ -245,14 +214,14 @@ mod tests {
"accent": 1
}
],
"speed_scale": 1.0,
"pitch_scale": 0.0,
"intonation_scale": 1.0,
"volume_scale": 1.0,
"pre_phoneme_length": 0.1,
"post_phoneme_length": 0.1,
"output_sampling_rate": 24000,
"output_stereo": false
"speedScale": 1.0,
"pitchScale": 0.0,
"intonationScale": 1.0,
"volumeScale": 1.0,
"prePhonemeLength": 0.1,
"postPhonemeLength": 0.1,
"outputSamplingRate": 24000,
"outputStereo": false
}))?;
Ok(())
}
Expand All @@ -262,15 +231,15 @@ mod tests {
fn it_denies_non_null_for_pause_length() {
serde_json::from_value::<AudioQuery>(json!({
"accent_phrases": [],
"speed_scale": 1.0,
"pitch_scale": 0.0,
"intonation_scale": 1.0,
"volume_scale": 1.0,
"pre_phoneme_length": 0.1,
"post_phoneme_length": 0.1,
"output_sampling_rate": 24000,
"output_stereo": false,
"pause_length": "aaaaa"
"speedScale": 1.0,
"pitchScale": 0.0,
"intonationScale": 1.0,
"volumeScale": 1.0,
"prePhonemeLength": 0.1,
"postPhonemeLength": 0.1,
"outputSamplingRate": 24000,
"outputStereo": false,
"pauseLength": "aaaaa"
}))
.map(|_| ())
.unwrap_err();
Expand All @@ -281,15 +250,15 @@ mod tests {
fn it_denies_non_float_for_pause_length_scale() {
serde_json::from_value::<AudioQuery>(json!({
"accent_phrases": [],
"speed_scale": 1.0,
"pitch_scale": 0.0,
"intonation_scale": 1.0,
"volume_scale": 1.0,
"pre_phoneme_length": 0.1,
"post_phoneme_length": 0.1,
"output_sampling_rate": 24000,
"output_stereo": false,
"pause_length_scale": "aaaaa",
"speedScale": 1.0,
"pitchScale": 0.0,
"intonationScale": 1.0,
"volumeScale": 1.0,
"prePhonemeLength": 0.1,
"postPhonemeLength": 0.1,
"outputSamplingRate": 24000,
"outputStereo": false,
"pauseLengthScale": "aaaaa",
}))
.map(|_| ())
.unwrap_err();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,66 +16,42 @@ public class AudioQuery {
public List<AccentPhrase> accentPhrases;

/** 全体の話速。 */
@SerializedName("speed_scale")
@Expose
public double speedScale;
@Expose public double speedScale;

/** 全体の音高。 */
@SerializedName("pitch_scale")
@Expose
public double pitchScale;
@Expose public double pitchScale;

/** 全体の抑揚。 */
@SerializedName("intonation_scale")
@Expose
public double intonationScale;
@Expose public double intonationScale;

/** 全体の音量。 */
@SerializedName("volume_scale")
@Expose
public double volumeScale;
@Expose public double volumeScale;

/** 音声の前の無音時間。 */
@SerializedName("pre_phoneme_length")
@Expose
public double prePhonemeLength;
@Expose public double prePhonemeLength;

/** 音声の後の無音時間。 */
@SerializedName("post_phoneme_length")
@Expose
public double postPhonemeLength;
@Expose public double postPhonemeLength;

/** 音声データの出力サンプリングレート。 */
@SerializedName("output_sampling_rate")
@Expose
public int outputSamplingRate;
@Expose public int outputSamplingRate;

/** 音声データをステレオ出力するか否か。 */
@SerializedName("output_stereo")
@Expose
public boolean outputStereo;
@Expose public boolean outputStereo;

/** 句読点などの無音時間。{@code null}のときは無視される。デフォルト値は{@code null}。 */
@SerializedName("pause_length")
@Expose
@Nullable
public Double pauseLength;
@Expose @Nullable public Double pauseLength;

/** 読点などの無音時間(倍率)。デフォルト値は{@code 1.}。 */
@SerializedName("pause_length_scale")
@Expose
public double pauseLengthScale;
@Expose public double pauseLengthScale;

/**
* [読み取り専用] AquesTalk風記法。
*
* <p>{@link jp.hiroshiba.voicevoxcore.blocking.Synthesizer#createAudioQuery} が返すもののみ String
* となる。入力としてのAudioQueryでは無視される。
*/
@SerializedName("kana")
@Expose
@Nullable
public final String kana;
@Expose @Nullable public final String kana;

public AudioQuery() {
this.accentPhrases = new ArrayList<>();
Expand Down
1 change: 1 addition & 0 deletions crates/voicevox_core_python_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
camino.workspace = true
duplicate.workspace = true
easy-ext.workspace = true
futures-lite.workspace = true
log.workspace = true
Expand Down
44 changes: 36 additions & 8 deletions crates/voicevox_core_python_api/python/test/test_audio_query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import dataclasses
import json
import textwrap

import pytest
from pydantic import TypeAdapter
from voicevox_core import AudioQuery


Expand All @@ -23,16 +26,41 @@ def test_accept_json_without_optional_fields() -> None:
"accent": 1
}
],
"speed_scale": 1.0,
"pitch_scale": 0.0,
"intonation_scale": 1.0,
"volume_scale": 1.0,
"pre_phoneme_length": 0.1,
"post_phoneme_length": 0.1,
"output_sampling_rate": 24000,
"output_stereo": false
"speedScale": 1.0,
"pitchScale": 0.0,
"intonationScale": 1.0,
"volumeScale": 1.0,
"prePhonemeLength": 0.1,
"postPhonemeLength": 0.1,
"outputSamplingRate": 24000,
"outputStereo": false
}
""",
)
)
)


def test_dumps() -> None:
    """Check that an ``AudioQuery`` survives a JSON round trip unchanged.

    The JSON text is validated into an ``AudioQuery`` through a pydantic
    ``TypeAdapter`` and dumped again with ``indent=2`` and ``by_alias=True``;
    the dumped text must be identical to the input text.
    """
    # The literal is compared verbatim against the `indent=2` dump, so after
    # `dedent` the braces sit at column 0 and each member is indented by
    # exactly two spaces.
    BEFORE = textwrap.dedent(
        """\
        {
          "accent_phrases": [],
          "speedScale": 1.0,
          "pitchScale": 0.0,
          "intonationScale": 1.0,
          "volumeScale": 1.0,
          "prePhonemeLength": 0.1,
          "postPhonemeLength": 0.1,
          "outputSamplingRate": 24000,
          "outputStereo": false,
          "pauseLength": null,
          "pauseLengthScale": 1.0,
          "kana": ""
        }""",
    )

    adapter = TypeAdapter(AudioQuery)
    query = adapter.validate_json(BEFORE)
    # `dump_json` returns bytes; decode so it can be compared with the str input.
    after = adapter.dump_json(query, indent=2, by_alias=True).decode()
    assert BEFORE == after
Loading

0 comments on commit d810e4b

Please sign in to comment.