-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsax_dash.py
214 lines (157 loc) · 6.71 KB
/
sax_dash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import numpy as np
from scipy.stats import norm
import plotly.graph_objs as go
from numpy.random import default_rng
rng = default_rng()
# ************************************************
# Helper methods for the SAX transformation
def get_breakpoints(n):
""" Calculates breakpoints
Uses scipy's inverse cummulative gaussian distribution to find the boundaries.
Args:
n (int) cardinality
Returns:
List of sax words
"""
b = norm.ppf(np.linspace(0,1, n+1))
# do not slice inf/-inf as they are used later in range comparisons
return b
def get_sax_representation(T, w, breakpoints, cardinality):
""" Transforms a time series to sax
Args:
T (array) time series
w (int) dimension - time series length must be a multiple of w
breakpoints (array) boundaries of the breakpoints. Lower and upper bounds must be -inf/inf
cardinality (int) sax word length
Returns:
List of sax words
"""
n = len(T)
sax_words = []
for i in range(0, w):
start = n / w * i
end = n / w * (i + 1)
paa = w/n * np.sum(T[int(start):int(end)])
# find the word/range that describes the paa value
for k in range(0, cardinality+1):
if(paa > breakpoints[k] and paa <= breakpoints[k+1]):
sax_word = k
break
sax_words.append(sax_word)
#print(f"word from {start} to {end}. PAA: {paa} SAX: {sax_word}" )
return sax_words
def get_sax_figure(T, w, breakpoints, sax_words):
""" Get sax plot
Args:
T (array) time series
w (int) dimension - time series length must be a multiple of w
breakpoints (list) boundaries of the breakpoints. Lower and upper bounds must be -inf/inf
sax_words (list) the sax representation
Returns:
Plotly figure
"""
n = len(T)
fig = go.Figure()
plot_breakpoints = np.copy(breakpoints)
plot_breakpoints[0] = np.min(T) - 0.1
plot_breakpoints[-1] = np.max(T) + 0.1
shapes = []
for i, sax_word in enumerate(sax_words):
start = i * (n/w)
end = (i+1) * (n/w)
fig.add_shape(type="rect", x0=start-.1, y0=plot_breakpoints[sax_word], x1=end-.2, y1=plot_breakpoints[sax_word+1],
line=dict(color=None, width=0),
fillcolor="LightSkyBlue",
opacity=0.5, layer="below")
fig.add_trace(go.Scatter(x=np.arange(0, len(T)), y=T))
for k in range(1, len(breakpoints)-1):
fig.add_shape(type="line", x0=0, y0=breakpoints[k], x1=len(T), y1=breakpoints[k], line_dash="dot")
fig.update_layout(title='SAX representation of T',
xaxis_title='Time Series Index',
yaxis_title='Value')
return fig
def get_hist(sax_words, cardinality):
""" Get a histogram of the sax symbols
Args:
breakpoints (list) boundaries of the breakpoints. Lower and upper bounds must be -inf/inf
sax_words (list) the sax representation
Returns:
List of sax words
"""
hist, bin_edges = np.histogram(sax_words, bins=cardinality)
fig = go.Figure(data=go.Bar(x=np.arange(0, cardinality), y=(hist / np.sum(hist))))
fig.update_layout(title='SAX symbol frequency',
xaxis_title='Symbol Index',
yaxis_title='Relative Frequency')
return fig
# This short series approximates the example from the paper
T = np.array([-.6, -.5, -.7, -1, -.8, -1.5, -.75, -.1, -.15, .2, .3, .2, 1.3, 1.7, 1.5, 1.4])
# Synthetic longer series
x_len = 128
x = np.linspace(0, 16, x_len)
x = np.sin(x) + np.sin(0.25*x) + np.sin(-.5*x) + 0.1*rng.standard_normal(x_len)
x = (x - np.mean(x)) / np.std(x)
T = x
# ************************************************
# Below Dash app code
app = dash.Dash(__name__)
server = app.server
app.layout = html.Div([
dcc.Markdown('''
# SAX Representation for Time Series
SAX can be used as a compact representation of time series.
This demo can be used to experiment with the parameters of the SAX transformation.
Additional information on the algorithm can be found in _iSAX: Indexing and Mining Terabyte Sized Time Series, Jin Shieh & Eamonn Keogh, SIGKDD, 2008_, section 2.3.
* Dimension **w**: this parameter controls the dimensionality of the resulting vector space. More dimensions increase the temporal resolution of the representation at the cost of increased storage requirements
* Cardinality **a**: the cardinality controls the number of symbols that are used. The number of symbols is equal to the distinct values that are used to approximate the time series.
The two plots below illustrate the results of the SAX transformation:
### Upper Plot: Transformation Result
The blue line shows the input time series. Dashed black lines illustrate the breakpoints (thresholds) that are used to determine the underlying PAA representations.
The box overlays indicate the SAX symbol which is used for a particular section of the series. The bottom-most box represents the first symbol (0). The top-most box represents the last symbol (a-1).
### Lower Plot: Histogram of SAX Symbols
The lower plot shows the relative frequency of the symbols that were used for representing the signal.
'''),
html.Label(
[
"Dimensions (w)",
dcc.Dropdown(
id='dimensions',
options=[{'label': i, 'value': i} for i in [2,4,8,16,32]],
value='8',
),
]
),
html.Label(
[
"Cardinality (a)",
dcc.Dropdown(
id='cardinality',
options=[{'label': i, 'value': i} for i in [2,4,8,16,32]],
value='4',
),
]
),
dcc.Graph(id='graph-sax'),
dcc.Graph(id='graph-hist')
])
@app.callback(
Output('graph-sax', 'figure'),
Output('graph-hist', 'figure'),
Input('dimensions', 'value'),
Input('cardinality', 'value'))
def update_figure(dimensions, cardinality):
# Length of time series
n = len(T)
# Dimensions of the vector space
w = int(dimensions)
# Length of the sax symbol
cardinality = int(cardinality)
breakpoints = get_breakpoints(cardinality)
sax_words = get_sax_representation(T, w, breakpoints, cardinality)
return get_sax_figure(T, w, breakpoints, sax_words), get_hist(sax_words, cardinality)
if __name__ == '__main__':
app.run_server(debug=False)