forked from chemjen/skincare_scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNLP_lite.py
121 lines (91 loc) · 4.72 KB
/
NLP_lite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import matplotlib.pyplot as plt
from collections import Counter
import nltk
import pandas as pd
import re
from textblob import TextBlob
# Load the product table and lower-case the free-text 'details' column so that
# all later phrase matching is case-insensitive.
df = pd.read_csv('sephora_clean.csv')
df['details'] = df['details'].str.lower()

# Rows that have all four identifying/text columns present.
dets = df[['family', 'brand', 'name', 'details']].dropna()

# Men's skincare rows: family == 'Men' and genus == 'Skincare'.
# The explicit .copy() makes this an independent frame, so assigning to its
# 'details' column later cannot hit pandas' chained-assignment /
# SettingWithCopyWarning behaviour.
menskincare = df.loc[(df['family'] == 'Men') & (df['genus'] == 'Skincare')]
menskincare = menskincare[['family', 'brand', 'name', 'details']].dropna().copy()

# General skincare rows (family == 'Skincare'); copied for the same reason.
skincare = dets.loc[dets['family'] == 'Skincare'].copy()
# Phrases that are blanked out of every description before n-grams are
# counted (presumably the site's boilerplate section headings -- confirm
# against the data).
generic_phrases = ['what it is', 'which skin type is it good for?', 'ingredient callouts',
                   'what is it good for', 'highlighted ingredients',
                   'formulation', 'skin type', 'what it is formulated without', 'skincare concerns',
                   'solutions for', 'what else you need to know']

# Replace the longest phrases first. In list order 'what it is' would be
# removed before 'what it is formulated without', so the longer phrase could
# never match and would leave 'formulated without' behind in the text.
_phrases_longest_first = sorted(generic_phrases, key=len, reverse=True)


def _clean_details(text):
    """Return one description with boilerplate and punctuation removed.

    Steps, in order:
      1. keep only the text before the first 'clean at sephora';
      2. replace every entry of ``generic_phrases`` with a space;
      3. replace literal backslash-n sequences with a space;
      4. collapse each run of whitespace to a single space;
      5. delete every character that is neither a word character nor
         whitespace.

    Args:
        text: a single (already lower-cased) description string.

    Returns:
        The cleaned string.
    """
    text = text.split('clean at sephora')[0]
    for phrase in _phrases_longest_first:
        text = text.replace(phrase, ' ')
    # '\\n' is a backslash followed by 'n', not a newline character.
    text = text.replace('\\n', ' ')
    # Raw strings keep the regex escapes from being read as (invalid) string
    # escapes by the Python parser.
    text = re.sub(r'\s+', ' ', text)
    return re.sub(r'[^\w\s]', '', text)


skincare['details'] = skincare['details'].apply(_clean_details)
menskincare['details'] = menskincare['details'].apply(_clean_details)
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Membership is tested once per word of every description, so look words up
# in a frozenset (constant time) instead of scanning the list each time.
# `stop` itself is left as the list returned by NLTK.
_stop_lookup = frozenset(stop)


def _drop_stopwords(text):
    """Return *text* with every whitespace-separated stopword removed.

    The remaining words are re-joined with single spaces.
    """
    return " ".join(word for word in text.split() if word not in _stop_lookup)


# NOTE(review): punctuation is stripped from the descriptions before this
# point, so any stopword that contains an apostrophe will no longer match its
# punctuation-free form in the text -- confirm whether that matters here.
skincare['details'] = skincare['details'].apply(_drop_stopwords)
menskincare['details'] = menskincare['details'].apply(_drop_stopwords)
#from nltk import WordNetLemmatizer
#lemztr = WordNetLemmatizer()
def _plot_top_ngrams(texts, n, top, column):
    """Count word n-grams over *texts* and bar-plot the most frequent ones.

    Args:
        texts: iterable of description strings.
        n: n-gram size passed to ``TextBlob.ngrams`` (2 for bigrams, ...).
        top: number of highest-count rows to include in the bar chart.
        column: name of the DataFrame column that holds the n-gram text;
            also used as the x axis of the plot.

    Returns:
        A ``(ngrams, counts)`` tuple: the flat list of every n-gram found,
        and a DataFrame with one row per distinct n-gram (columns
        ``column`` and ``'count'``).
    """
    ngrams = []
    for text in texts:
        ngrams.extend(TextBlob(text).ngrams(n))

    # Join the words of each n-gram directly instead of converting the
    # n-gram to its list repr and parsing the quotes/brackets back out.
    counts = Counter(' '.join(gram) for gram in ngrams)
    table = pd.DataFrame(list(counts.items()), columns=[column, 'count'])

    table.sort_values(by='count', ascending=False)[:top].plot.bar(x=column, y='count')
    plt.tight_layout()
    plt.show()
    return ngrams, table


# --- Skincare: top 30 bigrams, then top 17 trigrams ---
details = skincare['details'].values.flatten()

bigrams, bigram_counts = _plot_top_ngrams(details, 2, 30, 'bigram')
total_bigrams = len(bigrams)

trigrams, trigram_counts = _plot_top_ngrams(details, 3, 17, 'trigram')
total_trigrams = len(trigrams)

# --- Men's skincare: same two charts; the names above are rebound ---
details = menskincare['details'].values.flatten()

bigrams, bigram_counts = _plot_top_ngrams(details, 2, 30, 'bigram')
total_bigrams = len(bigrams)

trigrams, trigram_counts = _plot_top_ngrams(details, 3, 17, 'trigram')
total_trigrams = len(trigrams)
from scipy import stats


def one_sided_binom_pvalue(successes, trials, null_p, alternative='greater'):
    """Return the exact one-sided binomial-test p-value.

    Uses ``scipy.stats.binomtest`` (the replacement for the deprecated
    ``scipy.stats.binom_test``). The one-sided value is requested directly
    rather than halving the two-sided p-value: halving is only exact when
    the null distribution is symmetric, i.e. ``null_p == 0.5``.

    Args:
        successes: observed number of successes.
        trials: number of trials.
        null_p: success probability under the null hypothesis.
        alternative: 'greater' (default) or 'less'; forwarded to
            ``binomtest``.

    Returns:
        The p-value as a float.
    """
    return stats.binomtest(successes, n=trials, p=null_p, alternative=alternative).pvalue


# The counts below are hard-coded; where they come from is not visible in
# this file. In every case the observed count is above trials * null_p, so
# the upper-tail ('greater') alternative is the one tested.
print(one_sided_binom_pvalue(1200, 1956, 800 / 1956))
print(one_sided_binom_pvalue(1000, 1956, 800 / 1956))
print(one_sided_binom_pvalue(31, 68, 22 / 68))
#from wordcloud import WordCloud
#wc = WordCloud(background_color="white", max_words=1000, width=800, height=400)
#wc.generate(' '.join(skincare['details']))
#import matplotlib.pyplot as plt
#plt.figure(figsize=(12, 6))
#plt.imshow(wc, interpolation='bilinear')
#plt.axis("off")
#plt.show()
#d = defaultdict(int) #values will be integers. can also be list or string, etc
#for s in day_i:
# cust, order = s.split(':')
# d[cust] += int(order)