TextPreprocessor.py
import string
import unicodedata
import multiprocessing as mp

import contractions
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

# inspired by:
# https://towardsdatascience.com/text-preprocessing-steps-and-universal-pipeline-94233cb6725a
# https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html
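
# NOTE (assumption, not in the original file): the NLTK models and corpora used
# below must be downloaded once before first use, e.g.:
#   nltk.download("punkt")                        # word_tokenize
#   nltk.download("stopwords")                    # stopwords.words(...)
#   nltk.download("wordnet")                      # WordNetLemmatizer
#   nltk.download("averaged_perceptron_tagger")   # nltk.pos_tag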
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 language="english",
                 unicodedata=True,
                 clean_html=False,
                 replace_contractions=True,
                 remove_numbers=True,
                 punctuations=None,
                 stopwords=None,
                 pos_tags=None,
                 lemmatize=True,
                 stem=False,
                 min_length=1,
                 max_length=15,
                 n_jobs=1):
        """
        Text preprocessing transformer with the following steps:
        1. Text normalization and cleanup
        2. POS tag filtering
        3. Lemmatization
        4. Stemming

        language - the language used for stop words and stemming
        unicodedata - folds characters that look identical into one form (é -> e)
        clean_html - strips HTML tags
        replace_contractions - replaces e.g. "I'm" with "I am"
        remove_numbers - removes purely numeric tokens
        punctuations - a user-defined punctuation set (None for default)
        stopwords - a user-defined stop word set (None for default)
        pos_tags - a user-defined set of WordNet POS tags to keep (None keeps all)
        lemmatize - whether lemmatization should be applied
        stem - whether stemming should be applied
        min_length - tokens must be longer than this (exclusive bound)
        max_length - tokens must be shorter than this (exclusive bound)
        n_jobs - number of parallel jobs (-1 for all cores, 0 for serial)
        """
        self.language = language
        self.unicodedata = unicodedata
        self.clean_html = clean_html
        self.replace_contractions = replace_contractions
        self.remove_numbers = remove_numbers
        self.punctuations = punctuations
        self.stopwords = stopwords
        self.pos_tags = pos_tags
        self.lemmatize = lemmatize
        self.stem = stem
        self.min_length = min_length
        self.max_length = max_length
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()
        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs == 0:
            # serial path: no process pool
            return X_copy.apply(self.preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)
        # split the Series into one chunk per worker and process them in parallel
        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(partitions)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()
        return data

    def _preprocess_part(self, part):
        return part.apply(self.preprocess_text)

    def preprocess_text(self, text):
        return ' '.join(self.normalize(text))

    def _get_wordnet_pos(self, tag):
        """Map a Penn Treebank POS tag to the tag set lemmatize() accepts."""
        tag = tag[0].upper()
        if tag == "J":
            return wordnet.ADJ
        elif tag == "N":
            return wordnet.NOUN
        elif tag == "V":
            return wordnet.VERB
        elif tag == "R":
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def _tag(self, text):
        # tags each token in isolation: cheap, but ignores sentence context
        return [self._get_wordnet_pos(nltk.pos_tag([w])[0][1]) for w in text]

    def normalize(self, text):
        if self.clean_html:
            # strip markup, keep only the visible text
            soup = BeautifulSoup(text, 'html.parser')
            text = soup.get_text()
        if self.unicodedata:
            # fold accented characters to their closest ASCII equivalent
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        if self.replace_contractions:
            text = contractions.fix(text)
        if self.stopwords is None:
            # default set; pass an empty set to keep all words
            stop_words = set(stopwords.words(self.language))
        else:
            stop_words = self.stopwords
        if self.punctuations is None:
            # default set; pass an empty set to keep all punctuation
            punct_words = set(string.punctuation)
        else:
            punct_words = self.punctuations
        normalized = [w for w in word_tokenize(text.lower())
                      if w not in punct_words
                      and w not in stop_words
                      and (not self.remove_numbers or not w.isdigit())
                      and self.min_length < len(w) < self.max_length]
        tagged = None
        if self.pos_tags is not None:
            # keep only tokens whose WordNet POS tag is in the requested set
            tagged = self._tag(normalized)
            normalized = [w for (w, tag) in zip(normalized, tagged) if tag in self.pos_tags]
            tagged = [tag for tag in tagged if tag in self.pos_tags]
        if self.lemmatize:
            if tagged is None:
                tagged = self._tag(normalized)
            lemmatizer = WordNetLemmatizer()
            normalized = [lemmatizer.lemmatize(w, tag) for (w, tag) in zip(normalized, tagged)]
        if self.stem:
            stemmer = SnowballStemmer(self.language)
            normalized = [stemmer.stem(w) for w in normalized]
        return normalized
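

# A minimal usage sketch (an addition, not part of the original file); it assumes
# the NLTK data noted at the top of this file has been downloaded. The __main__
# guard matters because transform() may spawn worker processes on some platforms.
if __name__ == "__main__":
    docs = pd.Series([
        "I'm loving the 3 new <b>features</b>!",
        "The cafés weren't open, so we stayed home.",
    ])
    # n_jobs=0 takes the serial path; n_jobs=-1 would use every available core
    tp = TextPreprocessor(clean_html=True, lemmatize=True, n_jobs=0)
    print(tp.transform(docs).tolist())
    # As a TransformerMixin, it also slots into a scikit-learn Pipeline, e.g.
    # Pipeline([("prep", TextPreprocessor()), ("tfidf", TfidfVectorizer())])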