# main_page.py
import streamlit as st
import matplotlib.pyplot as plt
from io import StringIO
import emoji

from scripts.message import Message
from scripts.analysis import Analysis
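
# Streamlit app: upload an exported WhatsApp chat (.txt), parse it line by
# line into Message objects, aggregate statistics with Analysis, and render
# the charts and widgets below.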
st.set_page_config(page_title="WhatsApp Stats")

st.sidebar.markdown("# Chat Explorer")
st.sidebar.markdown("""
Use this page to explore your WhatsApp chats!
""")
st.title("WhatsApp Stats")
st.markdown(''' #### _Discover the **secrets** behind your chats!_
With this simple app you will be able to analyze:
- :iphone: The total number of messages sent by each participant
- :incoming_envelope: How many messages were sent in each conversation
- :calendar: How many messages were sent on each weekday
- :watch: What time of day you chat the most
- :file_folder: How many of your messages are media
- :speech_balloon: Who is the conversation starter?
- :memo: Who writes the longest messages on average?
-----
#### How does it work?
- Open the chat you want to analyze in WhatsApp on your smartphone.
- Tap the 3 dots and select More > Export chat.
- Choose to export without media.
- Import the text file into the app!
''')
chatfile = st.file_uploader("Upload your chat file.", type='txt')
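
# Streamlit reruns this script top to bottom on every widget interaction;
# st.stop() halts the run until a chat file has been uploaded.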
if not chatfile:
    st.warning('No chat file uploaded')
    st.stop()
else:
    analysis = Analysis()
    stringio = StringIO(chatfile.getvalue().decode("utf-8"))
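
    # A WhatsApp export holds one message per line, but long messages wrap
    # onto extra lines without a timestamp header; those continuation lines
    # are folded back into the previous message below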
    last_message_analyzed = None
    for i, line in enumerate(stringio):
        if i == 0:
            continue  # skip the export's first line
        if Message.is_valid_message(Message, line):  # check if valid message
            message = Message(line.strip())
            last_message_analyzed = analysis.update_list(message)
        else:
            # not a valid message header, so it continues the previous message
            if last_message_analyzed:
                analysis.update_last_message(line)
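
    # Aggregate the parsed messages: build the dataframe, normalize the text,
    # and keep a copy formatted for the classifier page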
    analysis.generate_dataframe()
    analysis.text_regularization()
    classifier_df = analysis.export_to_classifier()

    # Charts
    # Number of messages sent by each user
    messages_by_user = analysis.get_messages_per_user()
    fig1, ax1 = plt.subplots()
    ax1.pie(messages_by_user, labels=messages_by_user.index, autopct='%1.2f%%')
    ax1.axis('equal')

    # Number of messages per conversation
    number_of_messages_per_conversation = analysis.get_messages_per_day()
    fig2, ax2 = plt.subplots()
    ax2.plot(number_of_messages_per_conversation.index,
             number_of_messages_per_conversation)
    ax2.set_ylabel('N° of messages')

    # Number of messages per weekday
    number_of_messages_per_weekday = analysis.get_messages_by_weekday()
    fig3, ax3 = plt.subplots()
    # reversed so the first weekday sits at the top of the horizontal bars
    ax3.barh(number_of_messages_per_weekday.index[::-1],
             number_of_messages_per_weekday[::-1])
    ax3.set_title('Weekly chatting patterns')

    # Number of messages per hour
    number_of_messages_per_hour = analysis.get_messages_by_hour()
    fig4, ax4 = plt.subplots()
    ax4.bar(number_of_messages_per_hour.index, number_of_messages_per_hour)
    ax4.set_title('Hourly chatting patterns')
    ax4.set_ylabel('N° of messages')

    # Number of messages for each message type (text, media, link)
    number_of_messages_per_type = analysis.get_messages_by_type()
    fig5, ax5 = plt.subplots()
    ax5.pie(number_of_messages_per_type,
            labels=number_of_messages_per_type.index, autopct='%1.2f%%')
    ax5.axis('equal')

    # Number of conversations started by each user
    conversation_started_by_users = analysis.chats_started_by_user()
    fig6, ax6 = plt.subplots()
    ax6.pie(conversation_started_by_users,
            labels=conversation_started_by_users.index, autopct='%1.2f%%')
    ax6.axis('equal')

    # Average message length for each user
    average_message_length = analysis.average_message_length_by_user()
    fig7, ax7 = plt.subplots()
    ax7.barh(average_message_length.index, average_message_length,
             align='center')
    ax7.set_yticks(average_message_length.index)
    ax7.invert_yaxis()  # labels read top-to-bottom
    ax7.set_xlabel('Average message length')
    ax7.set_title('Average participant message length')

    st.success('Done')
    # Display charts
    st.header("Number of messages")
    col1, col2 = st.columns(2)
    with col1:
        st.metric('Total messages (includes update messages)',
                  sum(messages_by_user))
        for i in messages_by_user.index:
            if i is not None and i != 'None':
                st.write(i + ':', messages_by_user[i])
    with col2:
        st.pyplot(fig1)

    st.header("Messages sent per conversation")
    st.bar_chart(data=number_of_messages_per_conversation)

    st.header("Messages sent each weekday")
    col3, col4 = st.columns([1, 2])
    with col3:
        for i in number_of_messages_per_weekday.index:
            st.write(i + ':', number_of_messages_per_weekday[i])
    with col4:
        st.pyplot(fig3)

    st.header('Hourly chatting patterns')
    st.pyplot(fig4)

    st.header("Type of message")
    col5, col6 = st.columns(2)
    with col5:
        st.write('Media includes photos, documents, and audio files.')
        for i in number_of_messages_per_type.index:
            st.write(i + ':', number_of_messages_per_type[i])
    with col6:
        st.pyplot(fig5)

    st.header("Who were the conversations started by?")
    col7, col8 = st.columns(2)
    with col7:
        st.metric('Total number of conversations',
                  sum(conversation_started_by_users))
        for i in conversation_started_by_users.index:
            if i is not None:
                st.write(i, 'started', conversation_started_by_users[i],
                         'conversations.')
    with col8:
        st.pyplot(fig6)

    st.header('Average message length')
    col9, col10 = st.columns(2)
    with col9:
        for i in average_message_length.index:
            if i is not None:
                st.write(i + "'s average message length is:",
                         round(average_message_length[i], 2))
    with col10:
        st.pyplot(fig7)
st.header("How often was a certain word used?")
input_word = st.text_input("Search for word")
if input_word != None:
word_frequency = analysis.get_count_of_word_per_conversation(input_word.lower())
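        # assumed: get_count_of_word_per_conversation returns a Series of
        # counts indexed by conversation, suitable for the bar chart below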
        fig8, ax8 = plt.subplots()
        ax8.bar(word_frequency.index, word_frequency)
        ax8.set_ylim(bottom=0)
        st.pyplot(fig8)
    # TODO: add distribution of words per user (?)
    # TODO: add count of a single word
    # TODO: add most common words for a certain user
    st.header("What is a user's most common word?")
    st.write("Shows the selected user's 5 most common words.")
    col11, col12 = st.columns(2)
    with col11:
        selected_user = st.radio(
            label="Select a user",
            options=analysis.get_chat_participants()
        )
    most_common_words_per_user = analysis.get_most_common_words_per_user(selected_user)
    with col12:
        # the "words" may be emoji shortcodes with colons stripped by the
        # tokenizer; rebuild them so they render as emoji
        # (better way to detect if a string is an emoji or not?)
        for word in most_common_words_per_user:
            if word.startswith(":") and word.endswith(":") and emoji.is_emoji(emoji.emojize(word)):
                emoji_string = word
            elif word.startswith(":") and emoji.is_emoji(emoji.emojize(f"{word}:")):
                emoji_string = word + ":"
            elif word.endswith(":") and emoji.is_emoji(emoji.emojize(f":{word}")):
                emoji_string = ":" + word
            elif emoji.is_emoji(emoji.emojize(f":{word}:")):
                emoji_string = ":" + word + ":"
            else:
                emoji_string = None
            if emoji_string:
                st.write(emoji.emojize(emoji_string))
            else:
                st.write(word)
    # Cosine similarity
    st.header("How similar are the messages?")
    st.write("Using cosine similarity we can measure how similar the messages written by two participants are.")
    options = st.multiselect("Choose two authors to compare:",
                             analysis.get_chat_participants())
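    # cosine similarity ranges from 0 (no shared vocabulary) to 1 (identical
    # word usage); the exact series plotted depends on Analysis.cosine_similarity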
    if len(options) == 2:
        similarities = analysis.cosine_similarity(options[0], options[1])
        fig9, ax9 = plt.subplots()
        ax9.plot(similarities)
        st.pyplot(fig9)
st.markdown("---")
st.header("Export chat data to classifier")
st.markdown("""
This section allows you to dowload the necessary data to train the Multinomial Naive Bayes Classifier.
Press the download button, save the parquet file, switch over to page2 and import!
""")
    parquet_file = classifier_df.to_parquet(engine="pyarrow")
    st.download_button(label="Download data for classifier",
                       data=parquet_file,
                       file_name="classifier_data.parquet",
                       mime="application/octet-stream")

    st.markdown("---")
    st.markdown('''
Created by [Alex Caldarone](https://alexcaldarone.github.io/)''')