-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_pca.py
59 lines (51 loc) · 1.64 KB
/
plot_pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""
Attribution: https://gist.github.com/andreasvc/8317989
@andreasvc on github
Apply PCA to a CSV file and plot its datapoints (one per line).
The first column should be a category (determines the color of each datapoint),
the second a label (shown alongside each datapoint).
"""
import sys
import pandas
import pylab as pl
import sklearn
from sklearn import preprocessing
from sklearn.decomposition import PCA
def plotpca(csvfile, title):
    """Read a CSV file and plot a two-component PCA of its rows.

    The first CSV column is the category of each row (it determines the
    colour and marker of the datapoint) and the second column is the text
    label shown next to the datapoint. The remaining columns are used as
    the feature values.

    :param csvfile: path or buffer accepted by ``pandas.read_csv``.
    :param title: title of the resulting plot.
    """
    # The first two columns become a two-level index: (category, label).
    data = pandas.read_csv(csvfile, index_col=[0, 1])
    ylabels = [category for category, _ in data.index]
    labels = [text for _, text in data.index]
    encoder = preprocessing.LabelEncoder().fit(ylabels)
    # The first positional argument of DataFrame.to_numpy() is ``dtype``,
    # not a column selection, so the column index must not be passed here.
    # The index columns are already excluded from the frame's values.
    xdata = data.to_numpy()
    ydata = encoder.transform(ylabels)
    target_names = encoder.classes_
    generate_pca(xdata, ydata, target_names, labels, csvfile, title)
def generate_pca(xdata, ydata, target_names, items, filename, title):
    """Project the data onto two principal components and show the plot.

    :param xdata: feature matrix with one row per datapoint; rows line up
        with ``ydata`` and ``items``.
    :param ydata: array with the integer class index of each row; class
        ``i`` is labelled ``target_names[i]`` in the legend.
    :param target_names: class names used as legend entries.
    :param items: text labels, one per row, drawn next to each datapoint.
    :param filename: not used by this function; kept so existing callers
        keep working.
    :param title: title of the figure.
    """
    pca = PCA(n_components=2)
    # Fit and project the *same* normalised matrix. Fitting on normalised
    # rows but transforming the raw rows would plot points in a space the
    # model was never fitted on.
    normalized = preprocessing.normalize(xdata)
    components = pca.fit_transform(normalized)

    pl.figure()  # Make a plotting figure
    pl.subplots_adjust(bottom=0.1)

    # NB: a maximum of 7 targets will be plotted, because zip() stops at
    # the shortest of the colour, marker and target sequences.
    # CHANGE COLORS HERE
    colors = 'yrbmmcg'
    markers = 'o^s*v+x'
    for i, (c, m, target_name) in enumerate(
            zip(colors, markers, target_names)):
        mask = ydata == i  # rows belonging to class i
        xs = components[mask, 0]
        ys = components[mask, 1]
        pl.scatter(xs, ys, color=c, marker=m, label=target_name)
        # mask.nonzero()[0] gives the original row numbers, which index
        # into ``items`` to find the label of each plotted point.
        for n, x, y in zip(mask.nonzero()[0], xs, ys):
            pl.annotate(
                items[n],
                xy=(x, y),
                xytext=(5, 5),
                textcoords='offset points',
                color=c,
                fontsize='small',
                ha='left',
                va='top')
    pl.legend()
    pl.title(title)
    pl.show()