Commit 32f0be6 (parent 14284a5): 27 changed files with 41,520 additions and 10,880 deletions.
README.md
@@ -1,35 +1,31 @@
-### Chinese TTS implemented with VITS, integrating Microsoft NaturalSpeech inference-loss optimizations and iSTFT acceleration
+### Best TTS based on BERT and VITS, with some NaturalSpeech features from Microsoft
-This is a copy of https://github.com/jaywalnut310/vits
+based on BERT, NaturalSpeech, VITS
-VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech
-Espnet link: github.com/espnet/espnet/tree/master/espnet2/gan_tts/vits
-coqui-ai/TTS link: github.com/coqui-ai/TTS/tree/main/recipes/ljspeech/vits_tts
-### Notes on building a 16K TTS with VITS
+### Infer
 pip install -r requirements.txt
 cd monotonic_align
 python setup.py build_ext --inplace
-### Data Link
-https://github.com/PlayVoice/HuaYan_TTS
+#### Download pretrained models
-### Copy the 16K waves into ./baker_waves/ and start training
+BaiduYun: https://pan.baidu.com/s/1Cj4MnwFyZ0XZmTR6EpygbQ?pwd=yn60
-python train.py -c configs/baker_base.json -m baker_base
+prosody_model.pt goes to ./bert/prosody_model.pt
-Two 1080 GPUs and about two days of training give a basically usable model
+vits_bert.pth goes to ./vits_bert.pth
-![loss curve](/configs/loss.png)
+python vits_infer.py
+./vits_infer_out contains the inferred waves
+### Train
+in progress
+### Other data link
+https://github.com/PlayVoice/HuaYan_TTS
-### Test
-python vits_strings.py
-### iSTFT
-done
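The infer steps above end at `python vits_infer.py`, whose contents are not shown in this commit. As rough orientation only, here is a minimal sketch of upstream-VITS-style inference; the README says this repository started as a copy of jaywalnut310/vits, so the `utils`, `models`, and `text` modules below are that upstream API. The config path and the omission of the BERT prosody conditioning are assumptions, not this repository's actual script.

# Hedged sketch, not the repository's vits_infer.py: plain upstream-VITS
# inference. The real script presumably also feeds the BERT prosody
# embeddings produced by TTSProsody; that conditioning is omitted here.
import torch
from scipy.io import wavfile

import utils                          # upstream VITS helper module
from models import SynthesizerTrn    # upstream VITS generator
from text import text_to_sequence
from text.symbols import symbols

hps = utils.get_hparams_from_file("configs/baker_base.json")  # assumed config
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
net_g.eval()
utils.load_checkpoint("./vits_bert.pth", net_g, None)

seq = text_to_sequence("your text here", hps.data.text_cleaners)
x = torch.LongTensor(seq).unsqueeze(0)      # (1, seq_len)
x_lengths = torch.LongTensor([len(seq)])
with torch.no_grad():
    audio = net_g.infer(x, x_lengths, noise_scale=0.667,
                        noise_scale_w=0.8, length_scale=1.0)[0][0, 0]
wavfile.write("./vits_infer_out/sample.wav",
              hps.data.sampling_rate, audio.cpu().numpy())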
bert/ProsodyModel.py (new file)
@@ -0,0 +1,75 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BertModel, BertConfig, BertTokenizer


class CharEmbedding(nn.Module):
    """Character-level BERT encoder that projects hidden states to 256 dims."""

    def __init__(self, model_dir):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.bert_config = BertConfig.from_pretrained(model_dir)
        self.hidden_size = self.bert_config.hidden_size
        self.bert = BertModel(self.bert_config)
        self.proj = nn.Linear(self.hidden_size, 256)
        # Head from the prosody pre-training objective; not used in forward().
        self.linear = nn.Linear(256, 3)

    def text2Token(self, text):
        # Tokenize into characters and map them to vocabulary ids.
        token = self.tokenizer.tokenize(text)
        txtid = self.tokenizer.convert_tokens_to_ids(token)
        return txtid

    def forward(self, inputs_ids, inputs_masks, tokens_type_ids):
        # [0] selects the last hidden state: (batch, seq_len, hidden_size).
        out_seq = self.bert(input_ids=inputs_ids,
                            attention_mask=inputs_masks,
                            token_type_ids=tokens_type_ids)[0]
        out_seq = self.proj(out_seq)  # (batch, seq_len, 256)
        return out_seq


class TTSProsody(object):
    def __init__(self, path, device):
        self.device = device
        self.char_model = CharEmbedding(path)
        # strict=False: the checkpoint may carry extra pre-training weights.
        self.char_model.load_state_dict(
            torch.load(
                os.path.join(path, 'prosody_model.pt'),
                map_location="cpu"
            ),
            strict=False
        )
        self.char_model.eval()
        self.char_model.to(self.device)

    def get_char_embeds(self, text):
        input_ids = self.char_model.text2Token(text)
        input_masks = [1] * len(input_ids)
        type_ids = [0] * len(input_ids)
        input_ids = torch.LongTensor([input_ids]).to(self.device)
        input_masks = torch.LongTensor([input_masks]).to(self.device)
        type_ids = torch.LongTensor([type_ids]).to(self.device)

        with torch.no_grad():
            char_embeds = self.char_model(
                input_ids, input_masks, type_ids).squeeze(0).cpu()
        return char_embeds

    def expand_for_phone(self, char_embeds, length):  # length: phones per char
        assert char_embeds.size(0) == len(length)
        expand_vecs = list()
        for vec, leng in zip(char_embeds, length):
            # Repeat each character embedding once per phone of that character.
            vec = vec.expand(leng, -1)
            expand_vecs.append(vec)
        expand_embeds = torch.cat(expand_vecs, 0)
        assert expand_embeds.size(0) == sum(length)
        return expand_embeds.numpy()


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    prosody = TTSProsody('./bert/', device)
    while True:
        text = input("Enter text: ")
        prosody.get_char_embeds(text)
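TTSProsody exposes two steps: character-level BERT embeddings from get_char_embeds, then expansion to phone level with expand_for_phone. A small illustration of how they compose; the sample text and per-character phone counts are invented for the example, not values from the repository:

# Illustrative only: the phone counts per character are assumptions
# (e.g. pinyin initial + final), not the repository's front-end output.
import torch
from bert import TTSProsody

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prosody = TTSProsody("./bert/", device)

text = "你好世界"                               # 4 characters -> 4 BERT tokens
char_embeds = prosody.get_char_embeds(text)     # tensor of shape (4, 256)

phone_counts = [2, 2, 2, 2]                     # assumed phones per character
phone_embeds = prosody.expand_for_phone(char_embeds, phone_counts)
print(phone_embeds.shape)                       # (8, 256): one vector per phone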
bert/__init__.py (new file)
@@ -0,0 +1 @@
from .ProsodyModel import TTSProsody
bert/config.json (new file)
@@ -0,0 +1,19 @@
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}
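This is the standard bert-base-chinese configuration (hidden_size 768, vocab_size 21128). Assuming it is saved as ./bert/config.json next to a matching vocab.txt for the tokenizer, a quick sanity check that it lines up with CharEmbedding's projection layer:

# Assumes the JSON above lives at ./bert/config.json; a vocab.txt in the
# same directory is also needed when the tokenizer is loaded from it.
from transformers import BertConfig

config = BertConfig.from_pretrained("./bert/")
assert config.hidden_size == 768    # input width of CharEmbedding.proj
assert config.vocab_size == 21128   # bert-base-chinese vocabulary size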