Skip to content

Commit

Permalink
feat!: decode.onnxを復活させる (#918)
Browse files Browse the repository at this point in the history
現在の4モデルの`TalkDomain` (`talk`)を`ExperimentalTalk`
(`experimental_talk`)とし、decode.onnxの3モデル版を`TalkDomain`
(`talk`)とする。

両方のdomainは`StyleType::Talk`と対応する。ある`"type": "talk"`のスタイ
ルに対して`TalkDomain`か`ExperimentalTalk`のどちらかが有ればよいことに
し、両方有る場合は`TalkDomain`が優先されるようにする。

Resolves: #916
  • Loading branch information
qryxip authored Jan 5, 2025
1 parent 6ff13e0 commit 87443fd
Show file tree
Hide file tree
Showing 9 changed files with 452 additions and 91 deletions.
59 changes: 43 additions & 16 deletions crates/voicevox_core/src/infer/domains.rs
Original file line number Diff line number Diff line change
@@ -1,70 +1,87 @@
pub(crate) mod experimental_talk;
mod frame_decode;
mod singing_teacher;
mod talk;
pub(crate) mod talk;

use educe::Educe;
use serde::{Deserialize, Deserializer};

pub(crate) use self::{
experimental_talk::{
ExperimentalTalkDomain, ExperimentalTalkOperation, GenerateFullIntermediateInput,
GenerateFullIntermediateOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
},
frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
singing_teacher::{
PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
SingingTeacherOperation,
},
talk::{
GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
},
talk::{DecodeInput, DecodeOutput, TalkDomain, TalkOperation},
};

/// Record holding one value per inference domain: `talk`, `experimental_talk`,
/// `singing_teacher` and `frame_decode`.
///
/// The type of each field is chosen by the matching associated type of
/// `V: InferenceDomainMapValues`. `Clone` is provided through `educe`, with an explicit bound
/// that requires every field type to be `Clone`.
#[derive(Educe)]
// TODO: Trying to also put `V: ?Sized` into `bounds` gets rejected for a reason that is not
// understood. Does that still happen with the latest educe? And if the latest version fails
// as well, what is the reason for the rejection?
#[educe(Clone(
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::ExperimentalTalk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
))]
pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
pub(crate) talk: V::Talk,
pub(crate) experimental_talk: V::ExperimentalTalk,
pub(crate) singing_teacher: V::SingingTeacher,
pub(crate) frame_decode: V::FrameDecode,
}

impl<T, S, F> InferenceDomainMap<(T, S, F)> {
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
// Combinators available when the field types are supplied as a plain tuple, in the order
// `talk`, `experimental_talk`, `singing_teacher`, `frame_decode`.
impl<T, X, S, F> InferenceDomainMap<(T, X, S, F)> {
/// Borrows every field and returns a new map holding the four references.
pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &X, &S, &F)> {
let talk = &self.talk;
let experimental_talk = &self.experimental_talk;
let singing_teacher = &self.singing_teacher;
let frame_decode = &self.frame_decode;
InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
}
}

pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
/// Consumes `self` and `fs`, applying the function stored in each field of `fs` to the value
/// stored in the same field of `self`.
///
/// The functions are called in the order `talk`, `experimental_talk`, `singing_teacher`,
/// `frame_decode`; each result becomes the corresponding field of the returned map.
pub(crate) fn map<
T2,
X2,
S2,
F2,
Ft: FnOnce(T) -> T2,
Fx: FnOnce(X) -> X2,
Fs: FnOnce(S) -> S2,
Ff: FnOnce(F) -> F2,
>(
self,
fs: InferenceDomainMap<(Ft, Fs, Ff)>,
) -> InferenceDomainMap<(T2, S2, F2)> {
fs: InferenceDomainMap<(Ft, Fx, Fs, Ff)>,
) -> InferenceDomainMap<(T2, X2, S2, F2)> {
let talk = (fs.talk)(self.talk);
let experimental_talk = (fs.experimental_talk)(self.experimental_talk);
let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
let frame_decode = (fs.frame_decode)(self.frame_decode);
InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
}
}
}

impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
impl<T, X, S, F, E> InferenceDomainMap<(Result<T, E>, Result<X, E>, Result<S, E>, Result<F, E>)> {
pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, X, S, F)>, E> {
let talk = self.talk?;
let experimental_talk = self.experimental_talk?;
let singing_teacher = self.singing_teacher?;
let frame_decode = self.frame_decode?;
Ok(InferenceDomainMap {
talk,
experimental_talk,
singing_teacher,
frame_decode,
})
Expand All @@ -74,6 +91,7 @@ impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)>
impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
where
V::Talk: Deserialize<'de>,
V::ExperimentalTalk: Deserialize<'de>,
V::SingingTeacher: Deserialize<'de>,
V::FrameDecode: Deserialize<'de>,
{
Expand All @@ -83,18 +101,21 @@ where
{
let Repr {
talk,
experimental_talk,
singing_teacher,
frame_decode,
} = Repr::deserialize(deserializer)?;
return Ok(Self {
talk,
experimental_talk,
singing_teacher,
frame_decode,
});

#[derive(Deserialize)]
struct Repr<T, S, F> {
struct Repr<T, E, S, F> {
talk: T,
experimental_talk: E,
singing_teacher: S,
frame_decode: F,
}
Expand All @@ -103,12 +124,14 @@ where

/// Type-level description of what an `InferenceDomainMap` stores: one associated type per
/// field of the map.
pub(crate) trait InferenceDomainMapValues {
/// Type of the `talk` field.
type Talk;
/// Type of the `experimental_talk` field.
type ExperimentalTalk;
/// Type of the `singing_teacher` field.
type SingingTeacher;
/// Type of the `frame_decode` field.
type FrameDecode;
}

impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
// A 4-tuple supplies the field types positionally:
// `(Talk, ExperimentalTalk, SingingTeacher, FrameDecode)`.
impl<T, X, S, F> InferenceDomainMapValues for (T, X, S, F) {
type Talk = T;
type ExperimentalTalk = X;
type SingingTeacher = S;
type FrameDecode = F;
}
Expand All @@ -120,6 +143,10 @@ macro_rules! inference_domain_map_values {
$body
where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::ExperimentalTalkDomain as crate::infer::InferenceDomain
),
::macros::substitute_type!(
$body
where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain
Expand Down
116 changes: 116 additions & 0 deletions crates/voicevox_core/src/infer/domains/experimental_talk.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use std::{collections::BTreeSet, sync::LazyLock};

use enum_map::Enum;
use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
use ndarray::{Array0, Array1, Array2};

use crate::{manifest::ExperimentalTalkManifest, StyleType};

use super::super::{
InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
};

/// Marker type naming the experimental talk inference domain.
///
/// The enum has no variants, so no value of it can ever exist; it is only usable at the type
/// level (for example as the implementor of `InferenceDomain`).
pub(crate) enum ExperimentalTalkDomain {}

/// Registers `ExperimentalTalkDomain` as an inference domain whose operations are
/// `ExperimentalTalkOperation` and whose manifest section is `ExperimentalTalkManifest`.
impl InferenceDomain for ExperimentalTalkDomain {
    type Operation = ExperimentalTalkOperation;
    type Manifest = ExperimentalTalkManifest;

    /// Returns the style types served by this domain, which is exactly `StyleType::Talk`.
    ///
    /// The set is built on first access and the same instance is returned on every call.
    fn style_types() -> &'static BTreeSet<StyleType> {
        static TALK_ONLY: LazyLock<BTreeSet<StyleType>> =
            LazyLock::new(|| BTreeSet::from([StyleType::Talk]));
        &TALK_ONLY
    }
}

/// The operations that make up `ExperimentalTalkDomain`.
///
/// Each variant is tied to its input and output signature types through the
/// `inference_operation` attribute.
#[derive(Clone, Copy, Enum, InferenceOperation)]
#[inference_operation(
type Domain = ExperimentalTalkDomain;
)]
pub(crate) enum ExperimentalTalkOperation {
/// Takes `PredictDurationInput` and yields `PredictDurationOutput`.
#[inference_operation(
type Input = PredictDurationInput;
type Output = PredictDurationOutput;
)]
PredictDuration,

/// Takes `PredictIntonationInput` and yields `PredictIntonationOutput`.
#[inference_operation(
type Input = PredictIntonationInput;
type Output = PredictIntonationOutput;
)]
PredictIntonation,

/// Takes `GenerateFullIntermediateInput` and yields `GenerateFullIntermediateOutput`.
#[inference_operation(
type Input = GenerateFullIntermediateInput;
type Output = GenerateFullIntermediateOutput;
)]
GenerateFullIntermediate,

/// Takes `RenderAudioSegmentInput` and yields `RenderAudioSegmentOutput`.
#[inference_operation(
type Input = RenderAudioSegmentInput;
type Output = RenderAudioSegmentOutput;
)]
RenderAudioSegment,
}

/// Input tensors of the `PredictDuration` operation: two one-dimensional `i64` arrays.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictDuration;
)]
pub(crate) struct PredictDurationInput {
pub(crate) phoneme_list: Array1<i64>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output tensor of the `PredictDuration` operation: a one-dimensional `f32` array.
// NOTE(review): presumably one length per entry of the input `phoneme_list` — confirm against
// the caller.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictDurationOutput {
pub(crate) phoneme_length: Array1<f32>,
}

/// Input tensors of the `PredictIntonation` operation.
///
/// `length` is a zero-dimensional (scalar) `i64` array; every other field is a one-dimensional
/// `i64` array.
// NOTE(review): presumably `length` is the common length of the `*_list` fields — confirm
// against the caller.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = PredictIntonation;
)]
pub(crate) struct PredictIntonationInput {
pub(crate) length: Array0<i64>,
pub(crate) vowel_phoneme_list: Array1<i64>,
pub(crate) consonant_phoneme_list: Array1<i64>,
pub(crate) start_accent_list: Array1<i64>,
pub(crate) end_accent_list: Array1<i64>,
pub(crate) start_accent_phrase_list: Array1<i64>,
pub(crate) end_accent_phrase_list: Array1<i64>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output tensor of the `PredictIntonation` operation: a one-dimensional `f32` array.
#[derive(InferenceOutputSignature)]
pub(crate) struct PredictIntonationOutput {
pub(crate) f0_list: Array1<f32>,
}

/// Input tensors of the `GenerateFullIntermediate` operation: two two-dimensional `f32` arrays
/// plus a one-dimensional `i64` speaker id array.
// NOTE(review): the axis layout of `f0` and `phoneme` is not visible here — confirm against the
// caller before relying on a particular shape.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = GenerateFullIntermediate;
)]
pub(crate) struct GenerateFullIntermediateInput {
pub(crate) f0: Array2<f32>,
pub(crate) phoneme: Array2<f32>,
pub(crate) speaker_id: Array1<i64>,
}

/// Output tensor of the `GenerateFullIntermediate` operation: a two-dimensional `f32` array.
// NOTE(review): `spec` has the same type as `RenderAudioSegmentInput::spec`; presumably it is
// what gets passed on to `RenderAudioSegment` — confirm against the caller.
#[derive(InferenceOutputSignature)]
pub(crate) struct GenerateFullIntermediateOutput {
pub(crate) spec: Array2<f32>,
}

/// Input tensor of the `RenderAudioSegment` operation: a single two-dimensional `f32` array.
#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = RenderAudioSegment;
)]
pub(crate) struct RenderAudioSegmentInput {
pub(crate) spec: Array2<f32>,
}

/// Output tensor of the `RenderAudioSegment` operation: a one-dimensional `f32` array.
#[derive(InferenceOutputSignature)]
pub(crate) struct RenderAudioSegmentOutput {
pub(crate) wave: Array1<f32>,
}
31 changes: 6 additions & 25 deletions crates/voicevox_core/src/infer/domains/talk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,10 @@ pub(crate) enum TalkOperation {
PredictIntonation,

#[inference_operation(
type Input = GenerateFullIntermediateInput;
type Output = GenerateFullIntermediateOutput;
type Input = DecodeInput;
type Output = DecodeOutput;
)]
GenerateFullIntermediate,

#[inference_operation(
type Input = RenderAudioSegmentInput;
type Output = RenderAudioSegmentOutput;
)]
RenderAudioSegment,
Decode,
}

#[derive(InferenceInputSignature)]
Expand Down Expand Up @@ -89,28 +83,15 @@ pub(crate) struct PredictIntonationOutput {

#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = GenerateFullIntermediate;
type Signature = Decode;
)]
pub(crate) struct GenerateFullIntermediateInput {
/// Input tensors of the `Decode` operation: two two-dimensional `f32` arrays plus a
/// one-dimensional `i64` speaker id array.
pub(crate) struct DecodeInput {
pub(crate) f0: Array2<f32>,
pub(crate) phoneme: Array2<f32>,
pub(crate) speaker_id: Array1<i64>,
}

#[derive(InferenceOutputSignature)]
pub(crate) struct GenerateFullIntermediateOutput {
pub(crate) spec: Array2<f32>,
}

#[derive(InferenceInputSignature)]
#[inference_input_signature(
type Signature = RenderAudioSegment;
)]
pub(crate) struct RenderAudioSegmentInput {
pub(crate) spec: Array2<f32>,
}

#[derive(InferenceOutputSignature)]
pub(crate) struct RenderAudioSegmentOutput {
/// Output tensor of the `Decode` operation: a one-dimensional `f32` array.
pub(crate) struct DecodeOutput {
pub(crate) wave: Array1<f32>,
}
Loading

0 comments on commit 87443fd

Please sign in to comment.