Commit 32f0be6 (parent 14284a5): 27 changed files with 41,520 additions and 10,880 deletions.
README.md
@@ -1,35 +1,31 @@
-### Chinese TTS implemented with VITS, integrating Microsoft NaturalSpeech inference-loss optimizations and iSTFT acceleration
+### Best TTS based on BERT and VITS, with some NaturalSpeech features from Microsoft
-This is a copy of https://github.com/jaywalnut310/vits
+based on BERT, NaturalSpeech, VITS
-VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech
-Espnet link: github.com/espnet/espnet/tree/master/espnet2/gan_tts/vits
-coqui-ai/TTS link: github.com/coqui-ai/TTS/tree/main/recipes/ljspeech/vits_tts
-### Notes on building a 16K TTS with VITS
+### Infer
 pip install -r requirements.txt
 cd monotonic_align
 python setup.py build_ext --inplace
-### Data Link
-https://github.com/PlayVoice/HuaYan_TTS
+#### Download pretrained models
-### Copy the 16K waves into ./baker_waves/ and start training
+BaiduYun: https://pan.baidu.com/s/1Cj4MnwFyZ0XZmTR6EpygbQ?pwd=yn60
-python train.py -c configs/baker_base.json -m baker_base
+prosody_model.pt goes to ./bert/prosody_model.pt
-Two 1080 GPUs and about two days of training give a basically usable model
+vits_bert.pth goes to ./vits_bert.pth
-![loss curve](/configs/loss.png)
+python vits_infer.py
+./vits_infer_out contains the inferred waves
+### Train
+in progress
+### Other data link
+https://github.com/PlayVoice/HuaYan_TTS
-### Test
-python vits_strings.py
-### iSTFT
-done
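The infer steps above end at `python vits_infer.py`, whose contents are not shown in this commit. As rough orientation only, here is a minimal sketch of upstream-VITS-style inference; the README says this repository started as a copy of jaywalnut310/vits, so the `utils`, `models`, and `text` modules below are that upstream API. The config path and the omission of the BERT prosody conditioning are assumptions, not this repository's actual script.

# Hedged sketch, not the repository's vits_infer.py: plain upstream-VITS
# inference. The real script presumably also feeds the BERT prosody
# embeddings produced by TTSProsody; that conditioning is omitted here.
import torch
from scipy.io import wavfile

import utils                          # upstream VITS helper module
from models import SynthesizerTrn    # upstream VITS generator
from text import text_to_sequence
from text.symbols import symbols

hps = utils.get_hparams_from_file("configs/baker_base.json")  # assumed config
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
net_g.eval()
utils.load_checkpoint("./vits_bert.pth", net_g, None)

seq = text_to_sequence("your text here", hps.data.text_cleaners)
x = torch.LongTensor(seq).unsqueeze(0)      # (1, seq_len)
x_lengths = torch.LongTensor([len(seq)])
with torch.no_grad():
    audio = net_g.infer(x, x_lengths, noise_scale=0.667,
                        noise_scale_w=0.8, length_scale=1.0)[0][0, 0]
wavfile.write("./vits_infer_out/sample.wav",
              hps.data.sampling_rate, audio.cpu().numpy())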
bert/ProsodyModel.py (new file)
@@ -0,0 +1,75 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BertModel, BertConfig, BertTokenizer


class CharEmbedding(nn.Module):
    """Character-level BERT encoder that projects hidden states to 256 dims."""

    def __init__(self, model_dir):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.bert_config = BertConfig.from_pretrained(model_dir)
        self.hidden_size = self.bert_config.hidden_size
        self.bert = BertModel(self.bert_config)
        self.proj = nn.Linear(self.hidden_size, 256)
        # Head from the prosody pre-training objective; not used in forward().
        self.linear = nn.Linear(256, 3)

    def text2Token(self, text):
        # Tokenize into characters and map them to vocabulary ids.
        token = self.tokenizer.tokenize(text)
        txtid = self.tokenizer.convert_tokens_to_ids(token)
        return txtid

    def forward(self, inputs_ids, inputs_masks, tokens_type_ids):
        # [0] selects the last hidden state: (batch, seq_len, hidden_size).
        out_seq = self.bert(input_ids=inputs_ids,
                            attention_mask=inputs_masks,
                            token_type_ids=tokens_type_ids)[0]
        out_seq = self.proj(out_seq)  # (batch, seq_len, 256)
        return out_seq


class TTSProsody(object):
    def __init__(self, path, device):
        self.device = device
        self.char_model = CharEmbedding(path)
        # strict=False: the checkpoint may carry extra pre-training weights.
        self.char_model.load_state_dict(
            torch.load(
                os.path.join(path, 'prosody_model.pt'),
                map_location="cpu"
            ),
            strict=False
        )
        self.char_model.eval()
        self.char_model.to(self.device)

    def get_char_embeds(self, text):
        input_ids = self.char_model.text2Token(text)
        input_masks = [1] * len(input_ids)
        type_ids = [0] * len(input_ids)
        input_ids = torch.LongTensor([input_ids]).to(self.device)
        input_masks = torch.LongTensor([input_masks]).to(self.device)
        type_ids = torch.LongTensor([type_ids]).to(self.device)

        with torch.no_grad():
            char_embeds = self.char_model(
                input_ids, input_masks, type_ids).squeeze(0).cpu()
        return char_embeds

    def expand_for_phone(self, char_embeds, length):  # length: phones per char
        assert char_embeds.size(0) == len(length)
        expand_vecs = list()
        for vec, leng in zip(char_embeds, length):
            # Repeat each character embedding once per phone of that character.
            vec = vec.expand(leng, -1)
            expand_vecs.append(vec)
        expand_embeds = torch.cat(expand_vecs, 0)
        assert expand_embeds.size(0) == sum(length)
        return expand_embeds.numpy()


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    prosody = TTSProsody('./bert/', device)
    while True:
        text = input("Enter text: ")
        prosody.get_char_embeds(text)
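TTSProsody exposes two steps: character-level BERT embeddings from get_char_embeds, then expansion to phone level with expand_for_phone. A small illustration of how they compose; the sample text and per-character phone counts are invented for the example, not values from the repository:

# Illustrative only: the phone counts per character are assumptions
# (e.g. pinyin initial + final), not the repository's front-end output.
import torch
from bert import TTSProsody

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prosody = TTSProsody("./bert/", device)

text = "你好世界"                               # 4 characters -> 4 BERT tokens
char_embeds = prosody.get_char_embeds(text)     # tensor of shape (4, 256)

phone_counts = [2, 2, 2, 2]                     # assumed phones per character
phone_embeds = prosody.expand_for_phone(char_embeds, phone_counts)
print(phone_embeds.shape)                       # (8, 256): one vector per phone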
bert/__init__.py (new file)
@@ -0,0 +1 @@
from .ProsodyModel import TTSProsody
bert/config.json (new file)
@@ -0,0 +1,19 @@
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}
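This is the standard bert-base-chinese configuration (hidden_size 768, vocab_size 21128). Assuming it is saved as ./bert/config.json next to a matching vocab.txt for the tokenizer, a quick sanity check that it lines up with CharEmbedding's projection layer:

# Assumes the JSON above lives at ./bert/config.json; a vocab.txt in the
# same directory is also needed when the tokenizer is loaded from it.
from transformers import BertConfig

config = BertConfig.from_pretrained("./bert/")
assert config.hidden_size == 768    # input width of CharEmbedding.proj
assert config.vocab_size == 21128   # bert-base-chinese vocabulary size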