-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatabaker_G2P_1.py
123 lines (98 loc) · 3.37 KB
/
databaker_G2P_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import re
import numpy as np
from tqdm import tqdm
def write_metadata(metadata, out_dir):
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in tqdm(metadata):
f.write('|'.join([str(x) for x in m]) + '\n')
print('len:', len(metadata))
return True
def build_from_path_CN(input_path, use_prosody):
content = _read_labels(input_path)
# databaker:
# 002387 毛#2一般#1为#1白色#3只是#1两个#1耳壳#3眼圈#3肩部#3四肢#1为#1黑色#4
# mao2 yi4 ban1 wei2 bai2 se4 zhi3 shi4 liang3 ge5 er3 ke2 yan3 quan1 jian1 bu4 si4 zhi1 wei2 hei1 se4
metadata = []
num = int(len(content)//2)
for idx in range(num):
res = _parse_cn_prosody_label(content[idx*2], content[idx*2+1], use_prosody)
if res is not None:
basename, text = res
metadata.append([basename, text])
break
return metadata
def _read_labels(input_path):
# 从多个文件读入, 改为指定一个文件读入, 因此有些冗余
files = []
files.append(input_path)
# load from all files
labels = []
for item in files:
with open(item, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line != '': labels.append(line)
return labels
def _parse_cn_prosody_label(text, pinyin, use_prosody=False):
"""
Parse label from text and pronunciation lines with prosodic structure labelings
Input text: 100001 妈妈#1当时#1表示#3,儿子#1开心得#2像花儿#1一样#4。
Input pinyin: ma1 ma1 dang1 shi2 biao3 shi4 er2 zi5 kai1 xin1 de5 xiang4 huar1 yi2 yang4
Return sen_id: 100001
Return pinyin: ma1-ma1 dang1-shi2 biao3-shi4, er2-zi5 kai1-xin1-de5 / xiang4-huar1 yi2-yang4.
Args:
- text: Chinese characters with prosodic structure labeling, begin with sentence id for wav and interval file
- pinyin: Pinyin pronunciations, with tone 1-5
- use_prosody: Whether the prosodic structure labeling information will be used
Returns:
- (sen_id, pinyin&tag): latter contains pinyin string with optional prosodic structure tags
"""
text = text.strip()
pinyin = pinyin.strip()
if len(text) == 0:
return None
# remove punctuations
text = re.sub('[“”、,。:;?!—…#()]', '', text)
# split into sub-terms
sen_id, texts = text.split()
phones = pinyin.split()
# prosody boundary tag (SYL: 音节, PWD: 韵律词, PPH: 韵律短语, IPH: 语调短语, SEN: 语句)
SYL = '-'
PWD = ' '
PPH = ' / ' if use_prosody==True else ' '
IPH = ', '
SEN = '.'
# parse details
pinyin = ''
i = 0 # texts index
j = 0 # phones index
b = 1 # left is boundary
while i < len(texts):
if texts[i].isdigit():
if texts[i] == '1': pinyin += PWD # Prosodic Word, 韵律词边界
if texts[i] == '2': pinyin += PPH # Prosodic Phrase, 韵律短语边界
if texts[i] == '3': pinyin += IPH # Intonation Phrase, 语调短语边界
if texts[i] == '4': pinyin += SEN # Sentence, 语句结束
b = 1
i += 1
elif texts[i]!='儿' or j==0 or not _is_erhua(phones[j-1][:-1]): # Chinese segment
if b == 0: pinyin += SYL # Syllable, 音节边界(韵律词内部)
pinyin += phones[j]
b = 0
i += 1
j += 1
else: # 儿化音
i += 1
return (sen_id, pinyin)
def _is_erhua(pinyin):
"""
Decide whether pinyin (without tone number) is retroflex (Erhua)
"""
if len(pinyin)<=1 or pinyin == 'er':
return False
elif pinyin[-1] == 'r':
return True
else:
return False