Skip to content

Commit

Permalink
best tts
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxMax2016 committed Feb 22, 2023
1 parent 14284a5 commit 32f0be6
Show file tree
Hide file tree
Showing 27 changed files with 41,520 additions and 10,880 deletions.
36 changes: 16 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,35 +1,31 @@
### VITS实现的中文TTS,集成微软NaturalSpeech推理Loss优化措施,以及iSTFT加速
### Best TTS based on BERT and VITS, with some NaturalSpeech features from Microsoft

this is the copy of https://github.com/jaywalnut310/vits
Based on BERT, NaturalSpeech, and VITS.

VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech

Espnet连接:github.com/espnet/espnet/tree/master/espnet2/gan_tts/vits

coqui-ai/TTS连接:github.com/coqui-ai/TTS/tree/main/recipes/ljspeech/vits_tts


### 基于VITS 实现 16K TTS 的流程记录
### Infer

pip install -r requirements.txt

cd monotonic_align

python setup.py build_ext --inplace

### Data Link
https://github.com/PlayVoice/HuaYan_TTS
#### Download the pretrained models

### 将16K音频拷贝到./baker_waves/,启动训练
BaiduYun:https://pan.baidu.com/s/1Cj4MnwFyZ0XZmTR6EpygbQ?pwd=yn60

python train.py -c configs/baker_base.json -m baker_base
Copy prosody_model.pt to ./bert/prosody_model.pt

两张1080卡,训练两天,基本可以使用了
Copy vits_bert.pth to ./vits_bert.pth

![LOSS值](/configs/loss.png)
python vits_infer.py

The synthesized waveforms are written to ./vits_infer_out

### Train
Work in progress.

### Other data link
https://github.com/PlayVoice/HuaYan_TTS

### 测试
python vits_strings.py

### iSTFT
完成
75 changes: 75 additions & 0 deletions bert/ProsodyModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BertModel, BertConfig, BertTokenizer


class CharEmbedding(nn.Module):
    """BERT encoder followed by a linear projection to 256 features.

    The tokenizer and the BERT configuration are both read from
    ``model_dir``. The BERT weights are built from the configuration only;
    no pretrained weights are loaded inside this class.
    """

    def __init__(self, model_dir):
        """Create the tokenizer, the BERT encoder and the linear heads.

        Args:
            model_dir: Directory passed to ``from_pretrained`` for both the
                tokenizer and the BERT configuration.
        """
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)

        config = BertConfig.from_pretrained(model_dir)
        self.bert_config = config
        self.hidden_size = config.hidden_size

        # Submodules are created in the order bert -> proj -> linear.
        self.bert = BertModel(config)
        self.proj = nn.Linear(config.hidden_size, 256)
        # Not used by forward(); kept so the module still owns a `linear`
        # layer (presumably present in the training checkpoint -- confirm).
        self.linear = nn.Linear(256, 3)

    def text2Token(self, text):
        """Tokenize ``text`` and return the corresponding vocabulary ids."""
        pieces = self.tokenizer.tokenize(text)
        return self.tokenizer.convert_tokens_to_ids(pieces)

    def forward(self, inputs_ids, inputs_masks, tokens_type_ids):
        """Run BERT and project its first output.

        Args:
            inputs_ids: Forwarded to BERT as ``input_ids``.
            inputs_masks: Forwarded to BERT as ``attention_mask``.
            tokens_type_ids: Forwarded to BERT as ``token_type_ids``.

        Returns:
            ``self.proj`` applied to element ``[0]`` of the BERT output.
        """
        bert_outputs = self.bert(
            input_ids=inputs_ids,
            attention_mask=inputs_masks,
            token_type_ids=tokens_type_ids,
        )
        sequence_output = bert_outputs[0]
        return self.proj(sequence_output)


class TTSProsody(object):
    """Inference wrapper that produces prosody embeddings for text.

    Builds a ``CharEmbedding`` from ``path``, loads ``prosody_model.pt`` from
    the same directory into it, and keeps the model in eval mode on
    ``device``.
    """

    def __init__(self, path, device):
        """Load the prosody model.

        Args:
            path: Directory holding the tokenizer/config files and
                ``prosody_model.pt``.
            device: Device the model is moved to and inputs are created on.
        """
        self.device = device
        self.char_model = CharEmbedding(path)
        # NOTE(review): torch.load unpickles the checkpoint, so only load
        # files from a trusted source.
        state_dict = torch.load(
            os.path.join(path, 'prosody_model.pt'),
            map_location="cpu"
        )
        # strict=False: do not raise when the checkpoint keys and the
        # module's keys do not match exactly.
        self.char_model.load_state_dict(state_dict, strict=False)
        self.char_model.eval()
        self.char_model.to(self.device)

    def get_char_embeds(self, text):
        """Return the model output for ``text`` as a CPU tensor.

        Args:
            text: Input string; it is tokenized with the model's tokenizer.

        Returns:
            The model output with the leading batch dimension squeezed out,
            moved to the CPU. There is one row per token id.
        """
        token_ids = self.char_model.text2Token(text)
        # Single-sentence batch of size 1: every position is attended to
        # (mask of ones) and belongs to segment 0 (type ids of zeros).
        input_ids = torch.tensor(
            [token_ids], dtype=torch.long).to(self.device)
        input_masks = torch.ones_like(input_ids)
        type_ids = torch.zeros_like(input_ids)

        with torch.no_grad():
            char_embeds = self.char_model(
                input_ids, input_masks, type_ids).squeeze(0).cpu()
        return char_embeds

    def expand_for_phone(self, char_embeds, length):  # length of phones for char
        """Repeat every embedding row once per phone of its character.

        Args:
            char_embeds: 2-D tensor with one row per character.
            length: Sequence of non-negative ints; ``length[i]`` is the
                number of phones of character ``i``.

        Returns:
            A numpy array with ``sum(length)`` rows, where row ``i`` of
            ``char_embeds`` appears ``length[i]`` times in a row.

        Raises:
            AssertionError: If ``char_embeds`` and ``length`` disagree on the
                number of characters.
        """
        assert char_embeds.size(0) == len(length), (
            "char_embeds has %d rows but %d phone lengths were given"
            % (char_embeds.size(0), len(length))
        )
        # detach()/cpu() make the numpy conversion work for tensors that
        # require grad or live on another device as well.
        if len(length) == 0:
            # Nothing to expand; return the (empty) input as an array instead
            # of failing on an empty concatenation.
            return char_embeds.detach().cpu().numpy()

        repeats = torch.as_tensor(
            length, dtype=torch.long, device=char_embeds.device)
        expand_embeds = torch.repeat_interleave(char_embeds, repeats, dim=0)
        assert expand_embeds.size(0) == sum(length), (
            "expanded embedding count does not match the total phone count"
        )
        return expand_embeds.detach().cpu().numpy()


if __name__ == "__main__":
    # Interactive check: read text from stdin forever and run it through the
    # prosody model. The embeddings are computed but not printed or stored.
    backend = "cpu"
    if torch.cuda.is_available():
        backend = "cuda"
    device = torch.device(backend)
    prosody = TTSProsody('./bert/', device)
    while True:
        prosody.get_char_embeds(input("请输入文本:"))
1 change: 1 addition & 0 deletions bert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ProsodyModel import TTSProsody
19 changes: 19 additions & 0 deletions bert/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"type_vocab_size": 2,
"vocab_size": 21128
}
Loading

0 comments on commit 32f0be6

Please sign in to comment.