# ml-web-app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.io as pio
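# Streamlit app: launch locally with `streamlit run ml-web-app.py`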
# Set the default Plotly template and seed NumPy's global RNG
# (KMeans and t-SNE fall back to the global RNG when no random_state is given,
# so this keeps the clustering and embedding reproducible across reruns)
pio.templates.default = "ggplot2"
np.random.seed(69)
# INTRO: write introduction
st.write("""
# ✂️Clustering cross-sectional data
## Table of contents
1. [Data description](#data-description)
1. [Number of clusters choice](#number-of-clusters-choice)
1. [Cluster visualization](#cluster-visualization)
The purpose of this app is to compute and plot clusters for your **cross-sectional data**.
I assume you have a CSV file with your data **already cleaned** (i.e., no missing values or duplicate rows). Categorical (string) columns are allowed: they are one-hot encoded before clustering.
The algorithms used are **KMeans** and **tSNE**:
- The **KMeans** clustering method is an unsupervised machine learning technique used to identify clusters of data objects in a dataset. It is an example of partitional clustering, which divides data objects into non-overlapping groups: no object can be a member of more than one cluster, and every cluster must have at least one object.
- The **t-distributed stochastic neighbor embedding (t-SNE)** is a statistical method for visualizing high-dimensional data by giving each datapoint a location in a two-dimensional map. It models each high-dimensional object by a two-dimensional point in such a way that similar objects are modeled by nearby points and dissimilar objects are modeled by distant points with high probability.
""")
# SIDEBAR: develop sidebar
st.sidebar.header('User input features')
# Collect the uploaded CSV file and the separator choice from the sidebar
file_object = st.sidebar.file_uploader("Upload your cleaned CSV file", type=["csv"])
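# Note: st.file_uploader returns None until a file is uploaded, so the read
# below is wrapped in try/except to show a friendly error instead of a traceback.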
sep_option = st.sidebar.selectbox(
    'Which separator would you like to use?',
    [",", ";", ".", ":", r"\s+"])
@st.cache(allow_output_mutation=True)
def read_csv_cache():
    df = pd.read_csv(file_object, sep=sep_option)
    return df
try:
    df = read_csv_cache()
except Exception:
    st.error('Please upload a valid CSV file!')
    st.stop()
if df.shape[1] == 1:
    st.error('Please select the appropriate separator for your CSV!')
    st.stop()
st.sidebar.header('Preferred number of clusters')
n_clusters = st.sidebar.slider('Number of clusters', 2, 15, 2)
# SUBSECTION: DATA
st.subheader('Data description')
# Display the raw data and optional descriptive statistics
if st.checkbox('Show data'):
    st.write(df)
if st.checkbox('Show descriptive statistics'):
    st.write(df.describe())
# Data processing: one-hot encode the categorical columns, pass the numeric
# columns through, then standardize everything so each feature contributes
# comparably to the distance computations in KMeans and t-SNE.
@st.cache
def data_processing(data):
    numerical = data.select_dtypes(include=np.number).columns
    categorical = data.select_dtypes(exclude=np.number).columns
    ct = ColumnTransformer([
        ("ohe", OneHotEncoder(sparse=False), categorical)
    ], remainder='passthrough')
    df_enc = ct.fit_transform(data)
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_enc)
    return numerical, categorical, df_scaled
numerical, categorical, df_scaled = data_processing(df)
# Plot selected variables
option = st.selectbox(
    'Which variable would you like to plot the distribution of?', df.columns)
fig = px.histogram(df[option], title=option.replace("_", " ").title())
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="", showgrid=False)
fig.update_yaxes(title_text="Number of observations")
st.plotly_chart(fig, use_container_width=True)
# SUBSECTION: CHOOSING NUMBER OF CLUSTERS
st.subheader('Number of clusters choice')
# Try different numbers of clusters, from 2 up to 15
r = 16  # exclusive upper bound for the candidate number of clusters
@st.cache
def cluster_choice(df_scaled):
    sse = []
    silhouette_coefficients = []
    DB_score = []
    for k in range(2, r):
        kmeans = KMeans(n_clusters=k, init='k-means++')
        kmeans.fit(df_scaled)
        sse.append(kmeans.inertia_)
        score = silhouette_score(df_scaled, kmeans.labels_)
        silhouette_coefficients.append(score)
        score = davies_bouldin_score(df_scaled, kmeans.labels_)
        DB_score.append(score)
    return sse, silhouette_coefficients, DB_score
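# cluster_choice returns one value per candidate k (k = 2, ..., r - 1) for each
# of the three diagnostics; they are plotted against k in the charts below.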
sse, silhouette_coefficients, DB_score = cluster_choice(df_scaled)
st.write("""
Three commonly used methods to evaluate the appropriate number of clusters are:
1. *The elbow method*
2. *The silhouette coefficient*
3. *Davies Bouldin index*
These are often used as complementary evaluation techniques rather than one being preferred over the others.
1. **Elbow method**
To perform the *elbow method*, we run KMeans several times, incrementing the number of clusters with each run, and record the sum of squared errors (SSE).
The SSE continues to decrease as we increase the number of clusters: as more centroids are added, the distance from each point to its closest centroid decreases.
There's a sweet spot **where the SSE curve starts to bend**, known as the elbow point. The x-value of this point is considered a reasonable trade-off between error and the number of clusters.
""")
# Plot 1: SSE - choose elbow
fig = px.line(x=range(2, r), y=sse)
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="Number of clusters")
fig.update_yaxes(title_text="SSE", showgrid=False)
st.plotly_chart(fig, use_container_width=True)
st.write("""
2. **Silhouette coefficient**
The *silhouette coefficient* is a measure of cluster cohesion and separation. It quantifies how well a data point fits into its assigned cluster based on two factors:
- How close the data point is to other points in the cluster
- How far away the data point is from points in other clusters
Its value ranges from -1 to 1. **Larger values** indicate that samples are closer to their own cluster than to other clusters.
In the ```sklearn``` implementation, the average *silhouette coefficient* of all the samples is summarized into one score.
""")
# Plot 2: Silhouette Coefficient - choose local maximum
fig = px.line(x=range(2, r), y=silhouette_coefficients)
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="Number of clusters")
fig.update_yaxes(title_text="Silhouette coefficient", showgrid=False)
st.plotly_chart(fig, use_container_width=True)
st.write("""
3. **Davies-Bouldin index**
The *Davies-Bouldin index* is easier to calculate than the *silhouette score*.
It is based on the ratio of within-cluster distances to between-cluster distances, averaged across clusters. Its minimum value is zero, and it has no fixed upper bound.
A **lower index** indicates a model with better separation between the clusters.
""")
# Plot 3: Davies Bouldin Score - choose local minimum
fig = px.line(x=range(2, r), y=DB_score)
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="Number of clusters")
fig.update_yaxes(title_text="Davies-Bouldin score", showgrid=False)
st.plotly_chart(fig, use_container_width=True)
st.write("""
> Please choose your desired number of clusters (based on your combined interpretation of the three methods/graphs above) using the **Preferred number of clusters** slider in the left panel ⬅️.
""")
# SUBSECTION: VISUALIZATION
st.subheader('Cluster visualization')
# Fit KMeans on the scaled data with the user-selected number of clusters
kmeans = KMeans(n_clusters=n_clusters, init='k-means++')
kmeans.fit(df_scaled)
# Build the groupby aggregation: mean for numerical features, count for categorical features
@st.cache
def categorical_groupby(lst):
    res_dct = {col: "count" for col in lst}
    return res_dct
@st.cache(allow_output_mutation=True)
def numerical_groupby(lst):
    res_dct = {col: "mean" for col in lst}
    return res_dct
categorical_agg = categorical_groupby(categorical)
numerical_agg = numerical_groupby(numerical)
numerical_agg.update(categorical_agg)
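# numerical_agg now maps every original column to an aggregation function,
# e.g. (with hypothetical column names) {"age": "mean", "income": "mean", "gender": "count"}.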
st.write("""
Below I plot your **Preferred number of clusters** ⬅️ using the **t-SNE** algorithm.
""")
# t-SNE
tsne = TSNE(perplexity = 30)
X_tsne = tsne.fit_transform(df_scaled)
# Plot 4: t-SNE
fig = px.scatter(x = X_tsne[:, 0], y = X_tsne[:, 1], color=kmeans.labels_, color_continuous_scale=px.colors.sequential.Agsunset_r)
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="t-SNE feature 1")
fig.update_yaxes(title_text="t-SNE feature 2")
fig.update(layout_coloraxis_showscale=False)
st.plotly_chart(fig, use_container_width=True)
# Attach the cluster label to each observation so both views below can use it
df["cluster"] = pd.Series(kmeans.labels_, index=df.index)
if st.checkbox('Show clusters description'):
    df_clusters = df.groupby("cluster").agg(numerical_agg).reset_index(drop=True)
    if 'cluster' in df_clusters.columns:
        df_clusters = df_clusters.drop(["cluster"], axis=1)
    st.dataframe(df_clusters)
    st.write("""
*Notes: numerical and categorical features aggregated using mean and number of observations, respectively; index corresponds to the cluster number.*
""")
if st.checkbox('Show original data with cluster assignment'):
    st.write(df)