-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathschema_flypy_pro.py
188 lines (176 loc) · 7.99 KB
/
schema_flypy_pro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2024-10-11 11:32:43
# @Author : Litles ([email protected])
# @Link : https://github.com/Litles
# @Version : 1.0
import os
import re
import itertools
from collections import defaultdict
from func_lib import load_char_code
class SchemaFlypyPro:
def __init__(self, dir_in: str, fname_full: str, dir_out: str, map_shengmu: dict, map_yunmu: dict):
print("初始化中...")
# 1.双拼方案映射表
self.map_shengmu = map_shengmu
self.map_yunmu = map_yunmu
self.dir_in = dir_in
# 2.取两码形码作为辅助码
self.dict_char_codes = load_char_code(os.path.join(self.dir_in, fname_full)) # 单字全码, 加参数len=2则是取前两码
# (a)分析每个汉字的字根数
file_chaifen = "material_yujoy/yujoy_chaifen_v3.6.0.dict.yaml" # 卿云
# file_chaifen = "material_yustar/yustar_chaifen_v3.6.0.dict.yaml" # 星陈
dict_char_len = {} # 每个汉字的字根数
with open(file_chaifen, 'r', encoding='utf-8') as fr:
pat = re.compile(r"({[^\}]+})")
for line in fr:
line = line.strip()
if (not line.startswith("#")) and ("\t" in line):
char, others = line.split("\t", 1)
chaifen = others.lstrip("[").split(",", 1)[0]
if len(chaifen) > 0:
chaifen = pat.sub("字", chaifen)
dict_char_len[char] = len(chaifen)
# (b)提取首末字根的大码(单根字则补小码)
self.dict_char_codes_sup = defaultdict(set)
for char, codes in self.dict_char_codes.items():
for code in codes:
if len(code) in (2, 3):
self.dict_char_codes_sup[char].add(code[:2])
elif dict_char_len.get(char,4) == 3: # 4码,三根字的情况
self.dict_char_codes_sup[char].add(code[0]+code[2])
else: # 4码,四根以上字的情况
self.dict_char_codes_sup[char].add(code[0]+code[-1])
# 3.输出
self.dir_out = dir_out
if not os.path.exists(self.dir_out):
os.makedirs(self.dir_out)
def build(self):
# 1.白霜(frost)词库
dir_ciku = "material_common/cn_dicts" # tencent.dict.yaml
files_in = ['8105.dict.yaml', '41448.dict.yaml', 'base.dict.yaml', 'ext.dict.yaml', 'others.dict.yaml']
# (待生成) 形码整句版词库
files_out = ['8105.dict.yaml', '41448.dict.yaml', 'base.dict.yaml', 'ext.dict.yaml', 'others.dict.yaml']
# files_out = ['chars.dict.yaml', 'chars_ext.dict.yaml', 'words.dict.yaml', 'words_ext.dict.yaml', 'words_ext2.dict.yaml']
# 2.开始处理
for i in range(len(files_in)):
file_in = os.path.join(dir_ciku, files_in[i])
file_out = os.path.join(self.dir_out, files_out[i])
print("开始处理", file_in)
self.convert_dict_yaml_file(file_in, file_out)
def convert_dict_yaml_file(self, file_in: str, file_out: str, filter_flag: bool=False):
str_result = ""
with open(file_in, 'r', encoding='utf-8') as fr:
# 按行处理
for line in fr:
# 1.解析(全拼)编码行
if '\t' not in line or line.startswith("#"):
str_result += line # 非编码行直接拷贝
continue
line = line.strip()
freq = "0"
if len(line.split('\t')) == 3:
word, str_pinyin, freq = line.split('\t')
else:
word, str_pinyin = line.split('\t')
# 2.处理拼音部分
lst_pinyin = str_pinyin.split(" ")
lst_double_pinyin = []
for pinyin in lst_pinyin:
if len(pinyin) == 2: # 包含 ya, en, er, ai, ... 等情况
double_pinyin = pinyin
elif len(pinyin) == 1: # 包含 a, o, e 情况
double_pinyin = pinyin * 2
else:
# 1.解析全拼的声母、韵母
shengmu = pinyin[0] # 声母部分
yunmu = pinyin[1:] # 韵母部分
# 对特殊情况做修正
if pinyin[0] in ('a','o','e'): # (三字母)零声母, 包含 ang, eng 情况
yunmu = pinyin
elif pinyin[1] == "h": # 翘舌音, 包含 zh.., ch.., sh.. 情况
shengmu = pinyin[:2]
yunmu = pinyin[2:]
# 2.映射得到双拼的声母、韵母两键
double_shengmu = self.map_shengmu.get(shengmu, shengmu) # 双拼第一码(声母)
double_yummu = self.map_yunmu.get(yunmu, yunmu) # 双拼第二码(韵母)
if len(double_shengmu) != 1 or len(double_yummu) != 1:
print("该拼音有误(双拼将使用默认码ak):", pinyin)
double_pinyin = "ak"
else:
double_pinyin = double_shengmu + double_yummu
lst_double_pinyin.append(double_pinyin)
# 3.整合形成新的(双拼+形)编码行
# (a)拼接成完整编码(双拼+形)
lst_codes = []
punc_pat = re.compile(r"[,。《〈«‹》〉»›?;:‘’“”、~!……·()-—「【〔[」】〕]『〖{』〗}]")
word_pure = punc_pat.sub("", word)
if len(word_pure) == len(lst_double_pinyin):
# 获取所有可能的形码编码组合
lst_set_sup = []
for char in word_pure:
lst_set_sup.append(self.dict_char_codes_sup.get(char, {"ak"}))
# 按每个组合情况分别拼接
for pair in itertools.product(*lst_set_sup):
codes = []
for i in range(len(lst_double_pinyin)):
codes.append(lst_double_pinyin[i] + pair[i] + '0')
lst_codes.append(codes)
# (b)匹配词频
for codes in lst_codes:
if filter_flag and "41448" in file_in:
str_result += f"[超集字]{word}[超]\t{' '.join(codes)}\t{freq}\n"
else:
str_result += f"{word}\t{' '.join(codes)}\t{freq}\n"
else:
print("拼音数和汉字数不相等,已跳过该行:", line)
# 保存结果
with open(file_out, 'w', encoding='utf-8') as fw:
fw.write(str_result)
if __name__ == '__main__':
import time
start = time.perf_counter()
map_shengmu = { "zh": "v", "ch": "i", "sh": "u" }
map_yunmu = {
'iu': 'q',
'ei': 'w',
'uan':'r', 'van':'r',
'ue': 't', 've': 't',
'un': 'y', 'vn': 'y',
'uo': 'o',
'ie': 'p',
'iong': 's', 'ong': 's',
'ai': 'd',
'en': 'f',
'eng': 'g',
'ang': 'h',
'an': 'j',
'ing': 'k', 'uai': 'k',
'iang': 'l', 'uang': 'l',
'ou': 'z',
'ia': 'x', 'ua': 'x',
'ao': 'c',
'ui': 'v',
'in': 'b',
'iao': 'n',
'ian': 'm',
}
# 卿云
myschema = SchemaFlypyPro(
"material_yujoy",
"yujoy.full.dict_v3.6.0.yaml",
"schema_flypy_pro/dicts_flypy_pro",
map_shengmu,
map_yunmu
)
# # 星陈
# myschema = SchemaFlypyPro(
# "material_yustar",
# "yustar.full.dict_v3.6.0.yaml",
# "schema_flypy_pro2/dicts_flypy_pro",
# map_shengmu,
# map_yunmu
# )
myschema.build()
print("\nRuntime:", time.perf_counter() - start)