-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
109 lines (85 loc) · 2.37 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# coding: utf-8
from nltk.corpus import words
from nltk.corpus import webtext
from nltk.corpus import wordnet
import re
import nltk
# worddict = ['a', 'an', 'any', 'anyone','on','one','bar', 'barks', 'ark', 'bark']
import json
# Generate every substring of a string.
def substrings(string):
    """Return a Segment for every non-empty contiguous substring of *string*.

    Segments are ordered by substring length (1, 2, ..., len(string)) and,
    within one length, by start offset. Each Segment is built from the
    substring text, its start offset, and its exclusive end offset.

    An empty *string* yields an empty list.
    """
    total = len(string)
    return [
        Segment(string[begin:begin + width], begin, begin + width)
        # Substring length: 1, 2, ..., total.
        for width in range(1, total + 1)
        # There are (total - width + 1) substrings of that length.
        for begin in range(total - width + 1)
    ]
class Segment:
    """A piece of text together with the offsets it was taken from.

    ``substrings()`` in this file builds instances as
    ``Segment(string[j:j + n], j, j + n)``, i.e. *start* is inclusive and
    *end* is exclusive.
    """

    def __init__(self, content, start, end):
        """Store the segment text and its start/end offsets unchanged."""
        self.content = content  # the substring text
        self.start = start      # offset of the first character
        self.end = end          # offset one past the last character

    def __repr__(self):
        """Return a constructor-style representation for debugging output."""
        return "{}({!r}, {!r}, {!r})".format(
            type(self).__name__, self.content, self.start, self.end
        )
class Candidate:
    """An ordered chain of segments in which no segment starts before the
    previously accepted one ended.
    """

    def __init__(self):
        """Start with an empty chain."""
        self.seg_list = []  # accepted segments, in insertion order
        self.num = 0        # number of accepted segments
        self.end = 0        # ``end`` of the most recently accepted segment
        self.coverage = 0   # summed len(content) of accepted segments

    def add_candidate(self, segment):
        """Append *segment* if it starts at or after the current end.

        Args:
            segment: Object exposing ``start``, ``end`` and ``content``.

        Returns:
            True if the segment was accepted, False if it was rejected
            because it starts before the current end. Rejection leaves the
            candidate untouched.
        """
        # Compare the new segment's start with the end of the chain so far.
        if segment.start < self.end:
            return False
        self.seg_list.append(segment)
        self.num += 1
        self.end = segment.end
        self.coverage += len(segment.content)
        return True
# Password object.
class Password:
    """Container for a password string and the results of segmenting it."""

    def __init__(self, content):
        """Store *content* and create empty per-instance result holders."""
        # NOTE(review): ``len`` starts at 0 rather than len(content);
        # presumably it is filled in by the caller - confirm.
        self.content, self.len = content, 0
        # ``candidates_word`` holds the optimal segmentation; ``gap`` starts
        # empty as well (presumably the non-word leftovers - confirm).
        self.candidates_word, self.gap = [], []
# def is_word(seg):
# corpus = webtext.words()
# try:
# return corpus.index(seg)
# except ValueError:
# return -1
# The corpus list keeps its original name for any code that uses it directly.
# A hashed copy is built once so that is_word() does a constant-time lookup
# instead of scanning the whole list on every call.
worddict = words.words()
_WORD_SET = frozenset(worddict)


def is_word(word):
    """Return True if *word* is an entry of the NLTK ``words`` corpus.

    The comparison is an exact string match, so it is case-sensitive.

    Args:
        word: The string to look up (must be hashable).

    Returns:
        bool: whether *word* is present in ``worddict``.
    """
    return word in _WORD_SET
def classify_semantic(segment):
    """Attach a semantic category to each ``(text, pos)`` pair.

    Args:
        segment: Iterable of two-item ``(seg, pos)`` pairs, where ``pos`` is
            a tag such as ``"gap"``, ``"noun"`` or ``"verb"``.

    Returns:
        A list of ``[seg, pos, category]`` lists in input order. The
        category is ``'number'`` for ``"gap"``, ``'dog.n.03'`` for
        ``"noun"`` and ``""`` for every other tag.
    """
    semantic_seg = []
    for seg, pos in segment:
        c = ""
        # Compare by value: ``is`` tests object identity and is unreliable
        # for strings that are not interned.
        if pos == "gap":
            c = 'number'
        elif pos == 'noun':
            # NOTE(review): hard-coded value that looks like a WordNet synset
            # name used as a placeholder - confirm the intended lookup.
            c = 'dog.n.03'
        elif pos == 'verb':
            # The original condition ``pos == 'noun' or 'verb'`` was always
            # truthy; nouns are handled above, so only verbs belong here.
            # No category is assigned yet; the debug print is kept as-is.
            print("hdkjask")
        semantic_seg.append([seg, pos, c])
    return semantic_seg
# Determine the gap type: digits, letters, special symbols, or mixed.
def get_gap_type(gap=""):
    """Classify *gap* by the kind of characters it consists of.

    The original body called ``re.match()`` with no arguments, which always
    raised TypeError. The function now takes the string to classify; the
    parameter is defaulted so a zero-argument call remains valid.

    Args:
        gap: The string to classify.

    Returns:
        ``"digital"`` if *gap* consists only of ASCII digits (the digit
        pattern uses ``*``, so the empty string also lands here),
        ``"letter"`` if it consists only of ASCII letters,
        ``"special"`` if it consists only of the listed symbols,
        ``"mixed"`` otherwise.
    """
    digital = '^[0-9]*$'
    letter = '^[A-Za-z]+$'
    # Anchored with ``$`` like the other two patterns; without it any string
    # that merely starts with a symbol would count as "special".
    special = "^[%&;()=?><.,':\"_+@#$\x22]+$"
    patterns = (("digital", digital), ("letter", letter), ("special", special))
    for label, pattern in patterns:
        if re.match(pattern, gap):
            return label
    return "mixed"
if __name__ == '__main__':
    # nltk.download('wordnet')
    # print(wordnet.synsets("dog"))
    # The hyphen is placed last in the character class so that it is a
    # literal '-'. Written as ';-(' it formed a reversed range and the
    # pattern failed to compile ("bad character range").
    special = "^[%&;()=?><.,':\"_+@#$\x22-]+"
    re.match(special, "sd&><.,")