forked from tedljw/data_augment
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathciLin.py
136 lines (124 loc) · 5.45 KB
/
ciLin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
'''
@author: Yalei Meng
@contact: [email protected]
@license: (C) Copyright 2017, HUST Corporation Limited.
@DateTime: Created on 2018/5/3,at 19:13
@desc:使用基于信息内容的算法来计算词语相似度。参考文献:
【1】彭琦, 朱新华, 陈意山,等. 基于信息内容的词林词语相似度计算[J]. 计算机应用研究, 2018(2):400-404.
'''
#import math
class CilinSimilarity(object):
""" 基于哈工大同义词词林扩展版计算语义相似度 """
def __init__(self):
"""
'code_word' 以编码为key,单词list为value的dict,一个编码有多个单词
'word_code' 以单词为key,编码为value的dict,一个单词可能有多个编码
'vocab' 所有不重复的单词,便于统计词汇总数。
'mydict' 每个大中小类编码对应的下位节点数量。
"""
self.code_word = {}
self.word_code = {}
self.vocab = set()
self.file = './new_cilin.txt'
# self.mydict = {}
self.read_cilin()
def read_cilin(self):
"""
读入同义词词林,编码为key,词群为value,保存在self.code_word
单词为key,编码群为value,保存在self.word_code
所有单词保存在self.vocab
"""
head = set()
with open(self.file, 'r', encoding='gbk') as f:
for line in f.readlines():
res = line.split()
code = res[0] # 词义编码
words = res[1:] # 同组的多个词
self.vocab.update(words) # 一组词更新到词汇表中
self.code_word[code] = words # 字典,目前键是词义编码,值是一组单词。
for w in words:
if w in self.word_code.keys(): # 最终目的:键是单词本身,值是词义编码。
self.word_code[w].append(code) # 如果单词已经在,就把当前编码增加到字典中
else:
self.word_code[w] = [code] # 反之,则在字典中添加该项。
# 第一次遍历,得到大中小类的代码。
if len(code) < 6:
continue
fathers = [code[:1], code[:2], code[:4], code[:5], code[:7]]
head.update(fathers)
fatherlist = sorted(list(head))
'''
with open(self.file, 'r', encoding='gbk') as f:
# 第二次遍历:得到大中小类的数量。更新到字典mydict里面。
for ele in fatherlist:
self.mydict[ele] = 0
for line in f.readlines():
res = line.split()
code = res[0] # 词义编码
words = res[1:] # 同组的多个词
if len(code) > 5 and code[:5] in self.mydict.keys():
self.mydict[code[:7]] += len(words)
self.mydict[code[:5]] += len(words)
if len(code) > 4 and code[:4] in self.mydict.keys():
self.mydict[code[:4]] += len(words)
if len(code) > 2 and code[:2] in self.mydict.keys():
self.mydict[code[:2]] += len(words)
if len(code) > 1 and code[:1] in self.mydict.keys():
self.mydict[code[:1]] += len(words)
def get_common_str(self, c1, c2):
""" 获取两个字符的公共部分,注意有些层是2位数字 """
res = ''
for i, j in zip(c1, c2):
if i == j:
res += i
else:
break
if 3 == len(res) or 6 == len(res):
res = res[:-1]
return res
def Info_Content(self, concept):
if concept == '':
return 0
total =0
for ele in self.mydict.keys():
if len(ele)==1:
total += self.mydict[ele]
FenMu = math.log(total,2)
#print('总结点数',total,FenMu)
hypo = 1
if concept in self.mydict.keys():
hypo += self.mydict[concept]
info = math.log(hypo, 2) / FenMu
# print(concept, '下位节点数:', hypo,'信息内容:',1-info)
return 1 - info
def sim_by_IC(self, c1, c2):
# 找到公共字符串
LCS = self.get_common_str(c1, c2)
distance = self.Info_Content(LCS) - (self.Info_Content(c1) + self.Info_Content(c2)) / 2
return distance + 1
def sim2018(self, w1, w2):
""" 按照论文彭琦, 朱新华, 陈意山,等. 基于信息内容的词林词语相似度计算[J]. 计算机应用研究, 2018(2):400-404.计算相似度 """
for word in [w1, w2]:
if word not in self.vocab:
print(word, '未被词林词林收录!')
return 0 # 如果有一个词不在词林中,则相似度为0
# 获取两个词的编码列表
code1 = self.word_code[w1]
code2 = self.word_code[w2]
simlist = []
for c1 in code1: # 选取相似度最大值
for c2 in code2:
cur_sim = self.sim_by_IC(c1, c2)
simlist.append(cur_sim)
aver = sum(simlist) / len(simlist)
# print(sorted(simlist,reverse=True))
if len(simlist) < 2:
return simlist[0]
if max(simlist) > 0.7:
return max(simlist)
elif aver > 0.2:
return (sum(simlist) - max(simlist)) / (len(simlist) - 1)
else:
return min(simlist)
'''