-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
106 lines (76 loc) · 3.08 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import string
import pandas as pd
def load_files(fname='usps.csv', output='dict'):
    """
    Load an abbreviation table from the CSV file ``fname``.

    fname: path of the CSV file to read.
    output: selects the shape of the result.
        "dict" - the file must contain a ``pattern`` column, which is used
                 as the index. Returns ``DataFrame.to_dict()``, i.e. a nested
                 dictionary {column name: {pattern: value}} (one inner
                 dictionary per remaining column), not a flat mapping.
        "list" - the first row is treated as the header. Returns the values
                 of the first column as a list.

    Raises ValueError for any other ``output`` value, instead of falling
    through and silently returning None.
    """
    if output == "dict":
        df = pd.read_csv(fname, index_col='pattern')
        return df.to_dict()
    if output == "list":
        df = pd.read_csv(fname, header=0)
        first_col = df.columns[0]
        return df[first_col].tolist()
    raise ValueError(
        "output must be 'dict' or 'list', got {!r}".format(output))
# Abbreviation lookup tables, read from CSV once when this module is imported.
# Both calls use load_files' default output="dict", so each name is bound to
# the dictionary produced by DataFrame.to_dict() on a frame indexed by the
# file's `pattern` column.
USPS = load_files('usps.csv')
SCHOOL_ABREV = load_files('school_abbrev.csv')
def handle_strings(x, exclude=set(string.punctuation)):
    """
    Helper function to strip punctuation from a string.

    Hyphens are turned into spaces first, so hyphenated words stay separated
    instead of being glued together; every remaining character found in
    ``exclude`` is then dropped. Letter case is left untouched.

    x: any string
    exclude: collection of single characters to remove
             (defaults to string.punctuation)
    """
    dehyphenated = x.replace('-', ' ')
    kept_chars = [char for char in dehyphenated if char not in exclude]
    return ''.join(kept_chars)
def handle_words(x, exclude=(), case='u'):
    ''' Helper function to remove words from match comparisons that don't have signal but are noisy

    x: any string
    exclude: iterable of words that are removed from comparison strings for
             matching; the comparison is case-insensitive.
    case: only its first letter is inspected. 'u' (default) returns the
          result in upper case; anything else returns it in lower case.

    The string is split on single spaces, so runs of spaces in ``x`` are
    preserved in the result.
    '''
    case = case[0].lower()
    # A set gives constant-time membership tests while filtering.
    excluded = {word.lower() for word in exclude}
    kept = ' '.join(
        word for word in x.lower().split(" ") if word not in excluded)
    # ``kept`` is always a str here, so no per-type handling is needed.
    return kept.upper() if case == 'u' else kept
def normalizeText(inputValue, d=None, case='u'):
    '''
    Replace known abbreviations word by word and normalise letter case.

    inputValue: string to normalise; it is split on whitespace.
    d = dictionary to use for replacements, keyed by the lower-cased,
        punctuation-stripped word. Words without an entry are kept, minus
        their punctuation. None (the default) means no replacements.
    case: only its first letter is inspected.
        if case=='l', returns lowercase
        if case=='u', returns uppercase
        else returns proper case (first character of each word upper-cased,
        the rest left as is)

    Returns the processed words joined by single spaces.
    '''
    case = case[0].lower()
    abbv = {} if d is None else d
    words = inputValue.split()
    for i, word in enumerate(words):
        key = handle_strings(word.lower())
        rep = abbv[key] if key in abbv else handle_strings(word)
        if case == 'u':
            words[i] = rep.upper()
        elif case == 'l':
            words[i] = rep.lower()
        else:
            # Slice rather than index: a token made only of punctuation is
            # reduced to '' and rep[0] would raise IndexError.
            words[i] = rep[:1].upper() + rep[1:]
    return ' '.join(words)
def norm_shorthand(inputValue, case='u', short_hand=None):
    ''' fixes cases when short-hand is used at the end of a string
    eg. "Prevost elementary" vs. "Prevost elementary school"

    inputValue: string to inspect; it is split on whitespace.
    case: only its first letter is inspected. When the last word is
          expanded, 'u' (default) upper-cases it, 'l' lower-cases it, and
          anything else capitalises its first letter and appends " School".
          Words other than the last one are never re-cased.
    short_hand: iterable of trailing words (compared case-insensitively)
          that should be followed by "school". When omitted or empty the
          built-in list elementary/middle/high/intermediate is used; a
          non-empty value replaces that list.

    Returns the words joined by single spaces; an empty or whitespace-only
    input returns ''.
    '''
    case = case[0].lower()
    words = inputValue.split()
    if not words:
        # Nothing to inspect; avoids IndexError on words[-1].
        return ''
    if not short_hand:
        short_hand = ['elementary', 'middle', 'high', 'intermediate']
    endings = {term.lower() for term in short_hand}
    last = words[-1]
    if last.lower() in endings:
        if case == 'l':
            words[-1] = (last + ' school').lower()
        elif case == 'u':
            words[-1] = (last + ' school').upper()
        else:
            words[-1] = last[:1].upper() + last[1:] + ' School'
    return ' '.join(words)
def k12_clean(text_col, case='u'):
    ''' Normalizes strings of K-12 type school or location descriptions

    Applies, in order: school-abbreviation replacement, USPS-abbreviation
    replacement, and expansion of trailing short-hand such as "elementary".

    text_col: Pandas column of text strings
    case: forwarded unchanged to normalizeText and norm_shorthand
    returns: normalized pandas column
    '''
    school_map = SCHOOL_ABREV['replacement']
    # load_files returns DataFrame.to_dict(), i.e. {column: {pattern: value}}.
    # Passing USPS itself would make normalizeText look words up among the
    # column names, so no USPS pattern could ever match.
    # NOTE(review): assumes usps.csv has the same `replacement` column as
    # school_abbrev.csv - confirm. If the column is absent, this falls back
    # to the whole USPS dictionary, as before.
    usps_map = USPS.get('replacement', USPS)
    text_col = text_col.map(
        lambda x: normalizeText(x, case=case, d=school_map))
    text_col = text_col.map(
        lambda x: normalizeText(x, case=case, d=usps_map))
    text_col = text_col.map(lambda x: norm_shorthand(x, case=case))
    return text_col