-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchecker.py
executable file
·124 lines (106 loc) · 3.46 KB
/
checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
# reference https://www.cs.bgu.ac.il/~elhadad/nlp19/How+to+Do+Things+with+Words.html
import re, collections, os, sys
def tokens(text):
"""
Get all words from the corpus
"""
t = re.sub("[A-Z]", lambda x: "_" + x.group(0).lower(), text)
return set(re.findall('[a-z]+', t))
def known(words):
"""
Return the subset of words that are actually
in our WORD_COUNTS dictionary.
"""
return {w for w in words if w in WORD_COUNTS}
def edits0(word):
"""
Return all strings that are zero edits away
from the input word (i.e., the word itself).
"""
return {word}
def edits1(word):
"""
Return all strings that are one edit away
from the input word.
"""
alphabet = ''.join([chr(ord('a')+i) for i in range(26)])
def splits(word):
"""
Return a list of all possible (first, rest) pairs
that the input word is made of.
"""
return [(word[:i], word[i:])
for i in range(len(word)+1)]
pairs = splits(word)
deletes = [a+b[1:] for (a, b) in pairs if b]
transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
replaces = [a+c+b[1:] for (a, b) in pairs for c in alphabet if b]
inserts = [a+c+b for (a, b) in pairs for c in alphabet]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"""Return all strings that are two edits away
from the input word.
"""
return {e2 for e1 in edits1(word) for e2 in edits1(e1)}
def correct(word):
"""
Get the best correct spelling for the input word
"""
# Priority is for edit distance 0, then 1, then 2
# else defaults to the input word itself.
# candidates = (known(edits0(word)) or
# known(edits1(word)) or
# known(edits2(word)) or
# {word})
candidates = (known(edits0(word)) or
known(edits1(word)) or
{word})
return max(candidates, key=WORD_COUNTS.get)
def correct_match(match):
"""
Spell-correct word in match,
and preserve proper upper/lower/title case.
"""
word = match.group()
def case_of(text):
"""
Return the case-function appropriate
for text: upper, lower, title, or just str.:
"""
return (str.upper if text.isupper() else
str.lower if text.islower() else
str.title if text.istitle() else
str)
return case_of(word)(correct(word.lower()))
def correct_text_generic(text):
"""
Correct all the words within a text,
returning the corrected text.
"""
return re.sub('[a-zA-Z]+', correct_match, text)
def print_correction(fn):
with open(fn, 'r') as f:
TOKENS = tokens(f.read())
corr = {}
for original_word in TOKENS:
correct_word = correct_text_generic(original_word)
if correct_word != original_word and len(original_word) >4 :
corr[original_word] = correct_word
if len(corr) > 0:
print('\n%s'% (fn))
for k in corr:
print(' %s (suggestion: %s)'% (k, corr[k]))
#print('%s'% (k))
with open('spell.dict', 'r') as f:
WORDS = tokens(f.read())
WORD_COUNTS = collections.Counter(WORDS)
top_dir = sys.argv[1]
if os.path.isfile(top_dir):
print_correction(top_dir)
else:
g = os.walk(top_dir)
for path,dir_list,file_list in g:
for file_name in file_list:
f = os.path.join(path, file_name)
print_correction(f)