-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
executable file
·126 lines (111 loc) · 4.21 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
import random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import stop_point_detection as sp
import cf_tree as cft
import seaborn as sns
import sample as smpl
import mplleaflet
from sklearn.cluster import KMeans, Birch
from collections import Counter
"""
Clusters points that are within a specified geographical region.
"""
def refined_tree(store, order, threshold, run, radius, centroid_long, centroid_lat):
tree = cft.CFTree(order, threshold)
for chunk in store.select('sp', chunksize=10000):
chunk = chunk.query(
'sqrt( (longitude - @centroid_long)**2 + (latitude - @centroid_lat)**2) < @radius')
for _, row in (chunk[['longitude', 'latitude']]).iterrows():
tree.insert_point(row.values)
tree.save_tree(store, run)
"""
Clusters points per day using the structure of the HDF store
"""
def create_tree_per_day(store, order, threshold, run, day):
tree = cft.CFTree(order, threshold)
for chunk in store.select('sp/d{}'.format(day), chunksize=10000):
for _, row in (chunk[['longitude', 'latitude']]).iterrows():
tree.insert_point(row.values)
tree.save_tree(store, run)
"""
Main.
"""
def create_clusters(store, day, order, threshold):
# Automatic threshold calculation
if threshold is None:
threshold = k_means_thresh(
smpl.elbow_mode(store, day=day), store, day=day)
# Saves CFT to store
create_tree_per_day(store, order, threshold, order + day, day)
# Plotting
plt.clf()
df = clusters_at_max_height(store, order + day)
cluster_filter = df.filter(
items=['centroid_0', 'centroid_1', 'n', 'radius'])
cluster_filter = cluster_filter[cluster_filter['n'] > 1000]
catchment = []
for _, row in cluster_filter.iterrows():
# radius in meters
dist = sp.distance((row['centroid_0'], row['centroid_1']), (
row['centroid_0'] + row['radius'], row['centroid_1'] + row['radius']))
# area
catchment.append(np.pi * dist**2)
cluster_filter['size'] = catchment
pd.set_option('display.max_rows', df.shape[0] + 1)
print("Date: 0{}-02-2008".format(day))
print("Clusters:")
print(cluster_filter)
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
plt.scatter(x="centroid_0", y="centroid_1", s="size", c="n",
cmap='Reds', alpha=0.7, data=cluster_filter)
plt.plot()
mplleaflet.show()
"""
Calculates the threshold to use for birch clustering, based on the samples taken from the data.
"""
def k_means_thresh(mode, store, n_samples=1000, day=None):
sample_set = smpl.sample(store, n_samples, day)
long_lats = sample_set.filter(items=['longitude', 'latitude'])
kmeans = KMeans(n_clusters=mode).fit(long_lats)
avg_r = np.sqrt(kmeans.inertia_ / len(sample_set.index))
return avg_r
def clusters_at_max_height(store, run):
print("Run: {}".format(run))
df = store.select('clusters/r{}'.format(run))
max = df['layer'].max()
pd.set_option('display.max_rows', df.shape[0] + 1)
return df[df.layer == max]
def closest_cluster(clusters, long, lat):
c_index = 0
c_distance = sp.distance(
(clusters.iat[0, 7], clusters.iat[0, 8]), (long, lat))
index = 1
for _, row in clusters.iloc[1:].iterrows():
dist = sp.distance((row['centroid_0'], row['centroid_1']), (long, lat))
if dist < c_distance:
c_index = index
c_distance = dist
index += 1
return clusters.iat[c_index, 0]
def distance_to_stop_plot(store, order, day):
df = clusters_at_max_height(store, order + day)
nrows = store.get_storer('sp/d{}'.format(day)).nrows
r = np.random.randint(0, nrows, size=1000)
samples = store.select('sp/d{}'.format(day), where=pd.Index(r))
hist = []
i = 0
for _, row in (samples[['longitude', 'latitude']]).iterrows():
dist = closest_cluster(df, row['longitude'], row['latitude'])
print(i)
i += 1
hist.append(dist)
plt.hist(x=hist, bins='auto', color='#0504aa',
alpha=0.7, rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Distance (meters)')
plt.ylabel('Frequency')
plt.title('Distance to nearest bus stop')
plt.show()