# -*- coding: utf-8 -*-
"""
Created on May 2nd 2017

@author: Alexandre Bovet <[email protected]>

Sentiment analysis of tweets.
"""
import re
from itertools import chain

import numpy as np
from nltk import ngrams
from nltk.tokenize.casual import (TweetTokenizer, _replace_html_entities,
                                  remove_handles, reduce_lengthening,
                                  HANG_RE, WORD_RE, EMOTICON_RE)


def bag_of_words(words):
    """Map each word to True, as an NLTK-style feature dict."""
    return {word: True for word in words}


def bag_of_words_and_bigrams(words):
    """Feature dict over both unigrams and bigrams."""
    bigrams = ngrams(words, 2)
    return bag_of_words(chain(words, bigrams))
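
# Illustrative example of the feature extraction (doctest-style; not part of
# the original module):
# >>> bag_of_words_and_bigrams(['good', 'movie'])
# {'good': True, 'movie': True, ('good', 'movie'): True}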


#==============================================================================
# Custom tokenizer for tweets
#==============================================================================

def normalize_mentions(text):
    """
    Replace Twitter username handles with '@USER'.
    """
    pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
    return pattern.sub('@USER', text)


def normalize_urls(text):
    """
    Replace URLs with 'URL'.
    """
    pattern = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
    # First shorten runs of repeated punctuation to length 3, so the pattern
    # cannot hang in an exponential backtracking loop in extreme cases.
    text = HANG_RE.sub(r'\1\1\1', text)
    return pattern.sub('URL', text)
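
# Illustrative examples (doctest-style):
# >>> normalize_mentions('thanks @alice!')
# 'thanks @USER!'
# >>> normalize_urls('see https://example.com/page')
# 'see URL'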


def _lowerize(word, keep_all_upper=False):
    """Lowercase a word, except emoticons, the 'URL' and '@USER' placeholders,
    and (optionally) all-uppercase words."""
    if EMOTICON_RE.search(word):
        return word
    elif word.isupper() and keep_all_upper:
        return word
    elif word in ('URL', '@USER'):
        return word
    else:
        return word.lower()


class CustomTweetTokenizer(TweetTokenizer):
    """Custom tweet tokenizer based on NLTK's TweetTokenizer."""

    def __init__(self, preserve_case=False, reduce_len=True, strip_handles=False,
                 normalize_usernames=True, normalize_urls=True, keep_allupper=True):

        TweetTokenizer.__init__(self, preserve_case=preserve_case,
                                reduce_len=reduce_len,
                                strip_handles=strip_handles)

        self.keep_allupper = keep_allupper
        self.normalize_urls = normalize_urls
        self.normalize_usernames = normalize_usernames

        # normalizing usernames replaces them, so do not also strip them
        if normalize_usernames:
            self.strip_handles = False

        # if case is preserved, all-uppercase words are kept by construction
        if self.preserve_case:
            self.keep_allupper = True

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings.

        Normalizes URLs, usernames and word lengthening depending on the
        attributes of the instance.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove or replace username handles
        if self.strip_handles:
            text = remove_handles(text)
        elif self.normalize_usernames:
            text = normalize_mentions(text)
        if self.normalize_urls:
            # shorten problematic sequences of characters and replace URLs
            text = normalize_urls(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = HANG_RE.sub(r'\1\1\1', text)
            text = reduce_lengthening(text)
        # Tokenize:
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d;
        # optionally keep all-uppercase words as they are.
        if not self.preserve_case:
            words = [_lowerize(w, self.keep_allupper) for w in words]
        return words
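
# Illustrative example with the default settings (exact tokens may vary
# slightly with the NLTK version's WORD_RE):
# >>> tok = CustomTweetTokenizer()
# >>> tok.tokenize('@bob check https://example.com SOOOOO cool :D')
# ['@USER', 'check', 'URL', 'SOOO', 'cool', ':D']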


#==============================================================================
# Emoticon classification
#==============================================================================

POS_EMOTICONS = [":D", ":-D", ":-)", ":=)", "=)", "XD", "=D", "=]", ":]",
                 ":<)", ":>)", "=}", ":)", ":}", ":o)", "8D", "8-)",
                 ":-}", ":-]", ":-.)", "^_^", "^-^"]

NEG_EMOTICONS = [":(", ":-(", ":'(", "=(", "={",
                 ":-{", ":'{", "=[", ":["]

POS_EMOJIS_RE = re.compile(u'['
                           u'\U0001F600-\U0001F606'
                           u'\U0001F60A-\U0001F60E'
                           u'\U0001F638-\U0001F63B'
                           u'\U0001F642'
                           u'\U0000263A-\U0000263B]+',
                           re.UNICODE)

NEG_EMOJIS_RE = re.compile(u'['
                           u'\U0001F61E-\U0001F622'
                           u'\U0001F63E-\U0001F63F'
                           u'\U0001F641'
                           u'\U00002639]+',
                           re.UNICODE)


def classifyEmoticons(text):
    """Return 'pos', 'neg', 'N/A' (mixed signals) or None (no emoticons or
    emojis found), based only on the emoticons and emojis in `text`."""
    # find all emoticons
    emoticons = EMOTICON_RE.findall(text)
    pos = any(emo in POS_EMOTICONS for emo in emoticons) or bool(POS_EMOJIS_RE.search(text))
    neg = any(emo in NEG_EMOTICONS for emo in emoticons) or bool(NEG_EMOJIS_RE.search(text))

    if pos and neg:
        return 'N/A'
    elif pos:
        return 'pos'
    elif neg:
        return 'neg'
    else:
        return None
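
# Illustrative examples (doctest-style):
# >>> classifyEmoticons('what a great day :D')
# 'pos'
# >>> classifyEmoticons('great :D ... or not :(')
# 'N/A'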


class TweetClassifier(object):
    """Bundles a trained scikit-learn classifier with a tokenizer and a
    feature extractor so that raw tweet text can be classified directly."""

    def __init__(self, classifier,
                 tokenizer=CustomTweetTokenizer(preserve_case=False,
                                                reduce_len=True,
                                                strip_handles=False,
                                                normalize_usernames=False,
                                                normalize_urls=False,
                                                keep_allupper=False),
                 feature_extractor=bag_of_words_and_bigrams,
                 label_inv_mapper={0: 'neg', 1: 'pos'},
                 polarity_threshold=0.5):

        self.classifier = classifier
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.label_inv_mapper = label_inv_mapper
        self.polarity_threshold = polarity_threshold
        # map the classifier's numeric classes to their string labels
        self.labels = [self.label_inv_mapper[c] for c in self.classifier.classes_]

    def classify_text(self, text, return_pred_labels=True):
        """Classify a single string or a list of strings.

        Returns (predicted_labels, probabilities) if `return_pred_labels` is
        True, otherwise only the probabilities. A text whose highest class
        probability does not exceed `polarity_threshold` is labeled 'N/A'.
        """
        if isinstance(text, str):
            # single text: wrap the feature dict in a list, since scikit-learn
            # estimators expect an iterable of samples
            tokens = self.tokenizer.tokenize(text)
            features = self.feature_extractor(tokens)
            proba = self.classifier.predict_proba([features]).flatten()
            if return_pred_labels:
                if np.max(proba) > self.polarity_threshold:
                    predicted_label = self.labels[np.argmax(proba)]
                else:
                    predicted_label = 'N/A'
        elif isinstance(text, list):
            # list of multiple texts
            tokens = [self.tokenizer.tokenize(t) for t in text]
            features = [self.feature_extractor(toks) for toks in tokens]
            proba = self.classifier.predict_proba(features)
            if return_pred_labels:
                # string array wide enough for both the labels and 'N/A'
                len_labels = max(max(len(l) for l in self.labels), len('N/A'))
                predicted_label = np.full(len(text), 'N/A', dtype='<U' + str(len_labels))
                mask = np.max(proba, axis=1) > self.polarity_threshold
                predicted_label[mask] = [self.labels[i]
                                         for i in np.argmax(proba[mask], axis=1)]
        else:
            raise TypeError('text must be a str or a list of str')

        if return_pred_labels:
            return predicted_label, proba
        else:
            return proba
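

# Minimal usage sketch (an assumption, not the author's training setup): a
# scikit-learn Pipeline of DictVectorizer + LogisticRegression is one kind of
# classifier that works with TweetClassifier. The tiny corpus below is a
# stand-in for a real labeled tweet dataset.
if __name__ == '__main__':
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    tok = CustomTweetTokenizer()
    train_texts = ['I love this :D', 'so happy today :)',
                   'awful day :(', 'I hate this :(']
    train_labels = [1, 1, 0, 0]  # 1 -> 'pos', 0 -> 'neg'

    # extract features with the same tokenizer/extractor used at predict time
    X = [bag_of_words_and_bigrams(tok.tokenize(t)) for t in train_texts]
    pipeline = Pipeline([('vect', DictVectorizer()),
                         ('clf', LogisticRegression())])
    pipeline.fit(X, train_labels)

    clf = TweetClassifier(pipeline, tokenizer=tok)
    labels, proba = clf.classify_text(['what a nice day :)',
                                       'this is terrible'])
    print(labels)
    print(proba)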