# tools.py
import numpy as np
import xarray as xr
from scipy.stats import binned_statistic
from warnings import warn
def barycentric_avg(v):
'''
Calculate the barycentric average
Parameters
----------
v : `xarray.Dataset`
Measurements taken at each vertex of a tetrahedron
Returns
-------
v_bary : `xarray.DataArray`
Barycentric average of the measurements
'''
v_bary = 0
for v_name in v:
v_bary += v[v_name]
v_bary /= len(v)
return v_bary
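
# A minimal usage sketch for `barycentric_avg`; the toy Dataset below (four
# constant vertex measurements) is hypothetical, not a real data product.
def _example_barycentric_avg():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:00:04'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    # Measurements v1..v4 at the four vertices of the tetrahedron
    v = xr.Dataset({'v{0:1d}'.format(i + 1):
                    xr.DataArray(np.full((len(time), 3), float(i + 1)),
                                 dims=('time', 'component'),
                                 coords={'time': time,
                                         'component': ['x', 'y', 'z']})
                    for i in range(4)})
    # Every element of the result is (1 + 2 + 3 + 4) / 4 = 2.5
    return barycentric_avg(v)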
def binned_avg_ds(ds, t_out):
    '''
    Resample each variable of a dataset by averaging into temporal bins.
    Companion to `binned_avg`; `t_out` gives the bin edges.
    '''
    # t_in and t_bins are closed over from the enclosing scope below
    def ds_bin_avg(data_to_avg):
        avg, bin_edges, binnum = binned_statistic(t_in, data_to_avg,
                                                  statistic='mean',
                                                  bins=t_bins)
        return avg
vars_out = {}
# scipy does not like datetime64 so time has to be converted to floats
# - Subtracting lots of datetime64 can take a long time! (???)
t_ref = np.min([ds['time'][0].data.astype('datetime64[D]'),
t_out[0].astype('datetime64[D]')])
t_bins = (t_out - t_ref).astype('float')
t_in = (ds['time'].data - t_ref).astype('float')
# Step through each variable in the dataset
for name, var in ds.data_vars.items():
# Remember what the variable looked like
var_shape = (len(t_bins)-1, *var.shape[1:])
var_ndim = var.ndim
var_dims = var.dims
var_dtype = var.dtype
# Reshape the variable to be 2D
if var_ndim == 1:
var = var.expand_dims('temp', axis=1)
elif var_ndim > 2:
var = var.stack(temp=var_dims[1:])
        # scipy does not like datetimes/timedeltas of any type, so convert to
        # float (keep the DataArray wrapper so the indexing below still works)
        if np.issubdtype(var_dtype, np.timedelta64):
            var = var.copy(data=var.data.astype(float))
# Bin-average each component of the vector
temp_vars = []
for idx in range(var.shape[1]):
temp_vars.append(ds_bin_avg(var.data[:,idx]))
# Reshape the data back to how it was
temp_vars = np.column_stack(temp_vars)
if var_ndim == 1:
temp_vars = temp_vars.squeeze()
elif var_ndim > 2:
temp_vars = temp_vars.reshape(var_shape)
var = var.unstack()
        # Convert back to timedelta with the original units
        if np.issubdtype(var_dtype, np.timedelta64):
            temp_vars = temp_vars.astype(var_dtype)
# Combine bin-averaged components into an array
temp = xr.DataArray(temp_vars,
dims=var_dims,
coords={key: value if key != var_dims[0] else t_out[:-1]
for key, value in var.coords.items()},
name=name)
# Save the variables in the output dictionary
vars_out[name] = temp
# Create a new dataset
return xr.Dataset(vars_out)
def binned_avg(data, t_out):
'''
Resample data by averaging into temporal bins
Parameters
----------
    data : `xarray.DataArray`
        Data to be averaged, with a 'time' coordinate
    t_out : `numpy.datetime64`
        Edges of the bins into which data should be averaged. The final
        element is the right-most edge of the last bin.
Returns
-------
avg : `xarray.DataArray`
Resampled data
'''
# scipy does not like datetime64 so time has to be converted to floats
t_ref = np.min([data['time'][0].data.astype('datetime64[D]'),
t_out[0].astype('datetime64[D]')])
t_bins = (t_out - t_ref).astype('float')
t_in = (data['time'].data - t_ref).astype('float')
# Calculate the average
def bin_avg(data_to_avg):
avg, _, _ = binned_statistic(t_in, data_to_avg,
statistic='mean',
bins=t_bins)
return avg
# A time series
if data.ndim == 1:
if np.issubdtype(data.dtype, np.timedelta64):
temp = bin_avg(data.data.astype(float))
temp = temp.astype(np.timedelta64)
else:
temp = bin_avg(data.data)
# The final time in t_out is the right-most bin edge of the final bin. Remove it.
temp = xr.DataArray(temp,
dims=('time',),
coords={'time': t_out[:-1]},
name=data.name)
# A vector of time series
elif data.ndim == 2:
temp_vars = []
# Bin each component of the vector
for idx in range(data.shape[1]):
temp_vars.append(bin_avg(data.data[:,idx]))
# Combine bin-averaged components into an array
# The final time in t_out is the right-most bin edge of the final bin. Remove it.
temp = xr.DataArray(np.column_stack(temp_vars),
dims=('time', data.dims[1]),
coords={'time': t_out[:-1],
data.dims[1]: data.coords[data.dims[1]]},
name=data.name)
    else:
        warn('Variable {0} has > 2 dimensions. Skipping.'
             .format(data.name))
        return data
    return temp
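
# A minimal sketch of down-sampling with `binned_avg`, assuming hypothetical
# 1 s data averaged into 4 s bins. Note that `t_out` must share the data's
# time base (nanoseconds here) for the float bin edges to line up.
def _example_binned_avg():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:00:16'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    data = xr.DataArray(np.arange(len(time), dtype=float),
                        dims=('time',), coords={'time': time}, name='example')
    # Five bin edges -> four bins; the last edge closes the final bin
    t_out = np.arange(np.datetime64('2020-01-01T00:00:00'),
                      np.datetime64('2020-01-01T00:00:20'),
                      np.timedelta64(4, 's')).astype('datetime64[ns]')
    return binned_avg(data, t_out)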
def curl(K, V):
    '''
    Calculate the curl of V from reciprocal vectors K as the sum of k x v
    over the four vertices. K and V must list their variables in the same
    vertex order.
    '''
    curl = 0
for k_name, v_name in zip(K, V):
k = K[k_name]
v = V[v_name]
curl += xr.concat([k[:,1]*v[:,2] - k[:,2]*v[:,1],
k[:,2]*v[:,0] - k[:,0]*v[:,2],
k[:,0]*v[:,1] - k[:,1]*v[:,0]], dim='component').transpose()
curl = curl.assign_coords({'component': ['x', 'y', 'z']})
return curl
def divergence(K, V, dim='component'):
    '''
    Calculate the divergence of V from reciprocal vectors K as the sum of
    k . v over the four vertices.
    '''
    div = 0
    for k_name, v_name in zip(K, V):
        k = K[k_name]
        v = V[v_name]
        # DataArray.dot() takes `dims`, not `dim`, so multiply and sum
        # explicitly instead
        div += (k * v).sum(dim=dim)
return div
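
# A minimal sketch of how `curl` and `divergence` are meant to be driven
# together with `recip_vec` (defined below). `R` and `B` are hypothetical
# Datasets of position vectors 'r1'..'r4' and field vectors at the four
# vertices; the zip-based pairing relies on both Datasets listing their
# variables in the same vertex order.
def _example_curlometer(R, B):
    K = recip_vec(R)           # reciprocal vectors k1..k4
    div_B = divergence(K, B)   # scalar time series
    curl_B = curl(K, B)        # (time, component) DataArray
    return div_B, curl_B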
def expand_times(da, t_out):
    '''
    Repeat packet-rate values in `da` onto the finer set of sample times
    `t_out`.
    '''
# Locate the data times within the packet times by using the packet times
# as bin edges
# - scipy does not like datetime64 so time has to be converted to floats
# - Subtracting lots of datetime64 can take a long time! (???)
t0 = np.min([da['time'][0].data.astype('datetime64[D]'),
t_out[0].astype('datetime64[D]')])
dt_out = (t_out - t0).astype('float')
dt_in = (da['time'].data - t0).astype('float')
# Bin the data. The key here will be the bin number
cnt, bin_edges, binnum = binned_statistic(dt_out, dt_out,
statistic='count',
bins=dt_in)
# Test bins
# - Bin 0 is for data before the first packet time
# - Bin len(t_out) is for all data past the last packet time. There
# should be `sample_rate` points after the last packet time. If
# there are more, there is a problem: TODO - test
if binnum[0] == 0:
raise ValueError('Data times start before output times.')
# Expand the input data
result = da[binnum-1].assign_coords({'time': t_out})
return result
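
# A minimal sketch of `expand_times`: per-packet values are repeated onto a
# finer set of sample times. The 4 s packets and 1 s samples are hypothetical.
def _example_expand_times():
    t_pkt = np.arange(np.datetime64('2020-01-01T00:00:00'),
                      np.datetime64('2020-01-01T00:00:12'),
                      np.timedelta64(4, 's')).astype('datetime64[ns]')
    da = xr.DataArray(np.array([1.0, 2.0, 3.0]),
                      dims=('time',), coords={'time': t_pkt})
    t_out = np.arange(np.datetime64('2020-01-01T00:00:00'),
                      np.datetime64('2020-01-01T00:00:12'),
                      np.timedelta64(1, 's')).astype('datetime64[ns]')
    # Packet values are repeated over their packet intervals
    return expand_times(da, t_out)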
def generate_time_stamps(t_start, t_stop, t_res=np.timedelta64(30, 'ms')):
'''
Create an array of times spanning a time interval with a specified resolution.
Parameters
----------
t_start, t_stop : `numpy.datetime64`
Start and end of the time interval, given as the begin times of the first
and last samples
t_res : `numpy.timedelta64`
Sample interval
Returns
-------
t_stamps : `numpy.datetime64`
Timestamps spanning the given time interval and with the given resolution. Note
that the last point in the array is the end time of the last sample. This is to
work better with `scipy.binned_statistic`.
'''
# Find the start time
t_ref = t_start.astype('datetime64[m]').astype('datetime64[ns]')
t_start = t_start - ((t_start - t_ref) % t_res)
# Find the end time -- it should be after the final time
t_ref = t_stop.astype('datetime64[m]').astype('datetime64[ns]')
dt_round = (t_stop - t_ref) % t_res
t_stop += (t_res - dt_round)
    # Timestamps at `t_res` intervals
    # - We want t_stop to be included in the array as the right-most edge of the time interval
t_stamps = np.arange(t_start, t_stop + t_res, t_res)
return t_stamps
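
# A quick sketch of `generate_time_stamps` with the default 30 ms resolution;
# the interval endpoints are hypothetical. The first stamp is rounded down to
# the resolution, and the final stamp falls just past the given t_stop.
def _example_generate_time_stamps():
    t0 = np.datetime64('2020-01-01T00:00:00.010000000')
    t1 = np.datetime64('2020-01-01T00:00:01.000000000')
    return generate_time_stamps(t0, t1)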
def gradient(K, G):
    '''
    Calculate the gradient of G from reciprocal vectors K.

    Note that xarray automatically broadcasts arrays based on their dimension names.
    Therefore, when taking the spatial gradient (Nx3) of a scalar (N), the scalar is
    broadcast to Nx3 and multiplied. When taking the spatial gradient (Nx3) of a
    vector (Nx3), the second dimensions must have different names so that xarray
    broadcasts them to (Nx3x3) and calculates the outer product.
    See:
    https://stackoverflow.com/questions/51144786/computing-the-outer-product-of-two-variables-in-xarray-dataset
    '''
grad = 0
for k_name, g_name in zip(K, G):
k = K[k_name]
g = G[g_name]
grad += k*g
return grad
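
# A minimal sketch of the broadcasting rule described in the docstring above:
# renaming the vector's 'component' dimension before calling `gradient` turns
# the k*g product into an outer product. `K` and `V` are hypothetical Datasets
# of reciprocal vectors and vector measurements with dims (time, component).
def _example_gradient_of_vector(K, V):
    V = V.rename({'component': 'component_v'})
    # Result has dims (time, component, component_v)
    return gradient(K, V)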
def interp(ds, t_out, method='linear', extrapolate=False):
    '''
    Interpolate a dataset to the times `t_out`.
    '''
if extrapolate:
kwargs = {'fill_value': 'extrapolate'}
else:
kwargs = None
return ds.interp(time=t_out, method=method, kwargs=kwargs)
def interp_over_gaps(data, t_out, extrapolate=False):
    '''
    Interpolate data, being careful to avoid interpolating over data gaps

    Parameters
    ----------
    data : `xarray.DataArray`
        Data to be interpolated, with coordinates 'time' and 'dt_plus'
        (the sampling interval)
    t_out : `numpy.datetime64`
        Times to which the data should be interpolated
    extrapolate : bool
        If True, extrapolate beyond the ends of each contiguous segment

    Returns
    -------
    result : `xarray.DataArray`
        Data interpolated to the given timestamps via nearest-neighbor
        interpolation
    '''
if extrapolate:
kwargs = {'fill_value': 'extrapolate'}
else:
kwargs = None
# Data gaps are any time interval larger than a sampling interval
N_dt = np.round(data['time'].diff(dim='time') / data['dt_plus']).astype(int)
# Find the last sample in each contiguous segment
# - The last data point in the array is the last sample in the last contiguous segment
idx_gaps = np.argwhere(N_dt.data > 1)[:,0]
idx_gaps = np.append(idx_gaps, len(data['time']))
# Number of data gaps
N_gaps = len(idx_gaps)
if (N_gaps - 1) > 0:
warn('{0} data gaps found in dataset. Gap size: {1}.'
.format(N_gaps - 1, N_dt[idx_gaps[:-1]]))
# Interpolate each contiguous segment
temp = []
istart = 0
for igap in idx_gaps:
# If no extrapolation, the first and last data points turn into NaNs
# or are dropped because one of the nearest neighbors is beyond the
# data interval
# - d_in: * * * * *
# - t_in: * * * * *
# - t_out: * * * * * *
# - d_out: NaN * * * * NaN
# - len(data) = len(t_in) - 1 for each gap
temp.append(data[dict(time=slice(istart, igap+1))]
.interp({'time': t_out}, method='nearest', kwargs=kwargs)
.dropna(dim='time')
)
istart = igap+1
# Combine each contiguous segment into a single array
return xr.concat(temp, dim='time')
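
# A minimal sketch of `interp_over_gaps` with a hypothetical 4 s gap carved
# out of a 1 s time series. Note the 'dt_plus' coordinate that the gap test
# relies on.
def _example_interp_over_gaps():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:00:20'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    time = np.delete(time, slice(8, 12))  # create the gap
    data = xr.DataArray(np.arange(len(time), dtype=float),
                        dims=('time',),
                        coords={'time': time,
                                'dt_plus': np.timedelta64(1, 's').astype('timedelta64[ns]')})
    t_out = np.arange(np.datetime64('2020-01-01T00:00:00'),
                      np.datetime64('2020-01-01T00:00:20'),
                      np.timedelta64(2, 's')).astype('datetime64[ns]')
    # Output times falling inside the gap are dropped rather than bridged
    return interp_over_gaps(data, t_out)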
def recip_vec(R):
    '''
    Calculate the reciprocal vectors of the tetrahedron defined by the four
    position vectors 'r1'..'r4' in `R`.
    '''
    r = ['r1', 'r2', 'r3', 'r4']
    i = 0  # alpha
    j = 1  # beta
    k = 2  # gamma
    m = 3  # lambda
    recvec = xr.Dataset()
    for i in range(len(R)):
        # r_ji = position vector pointing from S/C j to S/C i, and similarly
        # for r_jk and r_jm
        r_ji = R[r[i]] - R[r[j]]
        r_jk = R[r[k]] - R[r[j]]
        r_jm = R[r[m]] - R[r[j]]
# The reciprocal vector for vertex m of the tetrahedron points normal to the area
# of the face of the tetrahedron opposite to vertex m. The normal vector to this
# surface can be found by taking the cross product between two vectors that lie in
# the plane of the surface.
area = np.stack([(r_jk[:,1]*r_jm[:,2] - r_jk[:,2]*r_jm[:,1]),
(r_jk[:,2]*r_jm[:,0] - r_jk[:,0]*r_jm[:,2]),
(r_jk[:,0]*r_jm[:,1] - r_jk[:,1]*r_jm[:,0])],
axis=1)
# Calculate the volume of the tetrahedron
volume = r_ji[:,0]*area[:,0] + r_ji[:,1]*area[:,1] + r_ji[:,2]*area[:,2]
        # The reciprocal vector is the area's normal vector normalized to the
        # tetrahedron volume
k_i = xr.DataArray(np.stack([area[:,0] / volume,
area[:,1] / volume,
area[:,2] / volume], axis=1),
dims=('time', 'component'),
coords={'time': R['time'],
'component': R['component']}
)
recvec['k{0:1d}'.format(i+1)] = k_i
        # Newer versions of xarray provide these operations directly:
        # area = xr.cross(r_jk, r_jm, dim='component')
        # volume = xr.dot(r_ji, area, dims='component')
        # recvec = area / volume
# increase the indices cyclically
j = (j + 1) % 4
k = (k + 1) % 4
m = (m + 1) % 4
return recvec
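
# A minimal sketch of the input `recip_vec` expects: a Dataset of position
# vectors 'r1'..'r4' with dims (time, component). The regular-tetrahedron
# coordinates below are hypothetical.
def _example_recip_vec():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:00:02'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    verts = {'r1': [0.0, 0.0, 0.0],
             'r2': [1.0, 0.0, 0.0],
             'r3': [0.5, np.sqrt(3.0) / 2.0, 0.0],
             'r4': [0.5, np.sqrt(3.0) / 6.0, np.sqrt(2.0 / 3.0)]}
    R = xr.Dataset({name: xr.DataArray(np.tile(xyz, (len(time), 1)),
                                       dims=('time', 'component'),
                                       coords={'time': time,
                                               'component': ['x', 'y', 'z']})
                    for name, xyz in verts.items()})
    # Each k_i is normal to the face opposite vertex i, scaled by 1/volume
    return recip_vec(R)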
def resample(data, t0, t1, dt_out, extrapolate=False, method='nearest'):
    '''
    Resample data to a new sample interval `dt_out` spanning `t0` to `t1`,
    interpolating when up-sampling and bin-averaging when down-sampling.
    '''
    # Generate a set of timestamps
    # - The end time will be the next sample after t1
    # - For binned averages, we want the closed interval [t_out[0], t_out[-1]]
    # - For all other resample methods, we want the half-open interval [t_out[0], t_out[-1])
t_out = generate_time_stamps(np.datetime64(t0),
np.datetime64(t1),
np.timedelta64(dt_out))
# Align the time
# - Same sample rate but different time stamps
# - Assumes sample time is at beginning of sample interval (dt_minus = 0)
# - Want to bring the ratio to 1
dt_ratio = np.timedelta64(dt_out) / data['dt_plus']
# Upsample / Interpolate
# - Less than two samples per target sampling interval
if dt_ratio <= 2:
# Repeat values
if method == 'repeat':
data = expand_times(data, t_out[:-1])
elif method in ('nearest', 'linear'):
kwargs = {}
if extrapolate:
kwargs['fill_value'] = 'extrapolate'
data = data.interp({'time': t_out[:-1]}, method=method, kwargs=kwargs)
# Interpolate
elif method == 'interp_gaps':
data = interp_over_gaps(data, t_out[:-1],
extrapolate=extrapolate)
else:
raise ValueError('Method {0} not recognized. Pick one of '
'(repeat, nearest, linear, interp_gaps).'
.format(method))
# Downsample
# - Two or more samples per target sampling interval
elif dt_ratio > 2:
if isinstance(data, xr.DataArray):
func = binned_avg
else:
func = binned_avg_ds
# Simple average
data = func(data, t_out)
# Suppress annoying xarray warning about converting to ns precision by doing the conversion ourselves
return data.assign_coords({'dt_plus': dt_out.astype('timedelta64[ns]')})
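
# A minimal sketch of `resample`, assuming hypothetical 1 s data down-sampled
# to 5 s by bin-averaging. The 'dt_plus' coordinate advertises the input's
# own sample interval, which `resample` compares against `dt_out`.
def _example_resample():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:01:00'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    data = xr.DataArray(np.random.default_rng(0).random(len(time)),
                        dims=('time',),
                        coords={'time': time,
                                'dt_plus': np.timedelta64(1, 's').astype('timedelta64[ns]')})
    return resample(data, '2020-01-01T00:00:00', '2020-01-01T00:00:55',
                    np.timedelta64(5, 's'))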
def smooth(data, n, step=1, fs=None, weights=1, end_points=None):
'''
Smooth an array.
Parameters
----------
data : (N,...), `xarray.DataArray`
Data to be smoothed. Smoothing is performed along the "time" dimension
n : int or float
Number of points to smooth. If a float, this is the time interval over
which to smooth
step : int
Number of points to skip between smoothing windows
    fs : float
        Sample rate. Used if `n` is a time interval.
    weights : scalar or (n,) array-like
        Weights applied to each point in the smoothing window
    end_points : str
        How to treat end points. (None, "copy", "wrap", "repeat")
* None: Skip over end points (output is 0)
* copy: Copy points from the input array
* wrap: Wrap the smoothing window to the other end of the array
* repeat: Repeat the first or last data point
Returns
-------
out : (N, ...), `xarray.DataArray`
The smoothed input array
'''
if isinstance(n, float) or isinstance(step, float):
# Calculate the sample rate
if fs is None:
fs = 1.0 / (np.diff(data['time']).mean().astype(float) * 1e-9)
# Number of points in the smoothing window is given as a time interval
if isinstance(n, float):
n = np.int64(np.round(n * fs))
# Number of points to shift is given as a time interval
if isinstance(step, float):
step = np.int64(np.round(step * fs))
# Need to average at least two points
if n < 2:
        raise ValueError('Too few points (n={0}). Must be at least 2.'.format(n))
# Need to shift by at least one point
if step < 1:
raise ValueError('Too few points to shift (step={0}). '
'Must be at least 1.'.format(step))
# Weights must be scalar or an array the same length as data
if np.isscalar(weights):
weights = np.repeat(weights, n)
if len(weights) != n:
raise ValueError('Weights must be scalar or have length n.')
if data.ndim > 1:
weights = np.expand_dims(weights, np.arange(1,data.ndim).tolist())
# Number of windows that fit within the data
N = len(data)
N_out = int(np.floor((N-n)/step + 1))
# Allocate memory to the output array
out = np.zeros((N_out, *data.shape[1:]), dtype=data.dtype)
t_out = np.zeros(N_out, dtype='datetime64[ns]')
# Initial loop conditions
i0 = 0
i1 = n
# idx = np.int64((i0 + i1) / 2)
idx = 0
while i1 <= N:
out[idx,...] = np.sum(weights * data[i0:i1,...], axis=0) / np.sum(weights, axis=0)
t_out[idx] = data['time'][i0:i1].mean(dim='time').item()
i0 += step
i1 += step
idx += 1
# Turn into a DataArray
out = (xr.DataArray(out,
dims=('time', *data.dims[1:]),
coords={key: data[key] for key in data.coords if key != 'time'})
.assign_coords({'time': t_out})
)
    # NOTE: the end-point handling below is an inactive draft; the
    # `end_points` keyword currently has no effect
    '''
imid0 = np.int64(n / 2)
imid1 = N - (n - imid0)
i0 = -imid0
i1 = i0 + n
for ii in range(N):
# Skip the end points (copied later)
if end_points in (None, 'copy'):
if (i0 >= 0) & (i1 < N):
idx = slice(i0, i1)
else:
i0 += step
i1 += step
continue
# Repeat the first/last point in the array
elif end_points == 'repeat':
if i0 < 0:
idx = np.append(np.repeat(0, -i0), np.arange(0, i1))
elif i1 >= N:
idx = np.append(np.arange(i0, N), np.repeat(-1, i1-N))
else:
idx = slice(i0, i1)
# Wrap around to the beginning of the array
elif end_points == 'wrap':
if i0 < 0:
idx = np.append(np.arange(i0, 0), np.arange(i1))
elif i1 >= N:
print(i0, N, 0, i1-N)
idx = np.append(np.arange(i0, N), np.arange(0, i1-N))
else:
idx = slice(i0, i1)
else:
raise ValueError('end_points must be in (None, "repeat", "wrap", "copy").')
out[ii,...] = data[idx,...].mean(dim='time')
i0 += step
i1 += step
# Copy values to end-points
if end_points == 'copy':
out[0:imid0,...] = data[0:imid0,...]
out[imid1:,...] = data[imid1:,...]
'''
return out
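
# A minimal sketch of `smooth`: a 4-point boxcar stepped every 2 points over
# a hypothetical 1 s time series, yielding a half-overlapping running mean.
def _example_smooth():
    time = np.arange(np.datetime64('2020-01-01T00:00:00'),
                     np.datetime64('2020-01-01T00:00:20'),
                     np.timedelta64(1, 's')).astype('datetime64[ns]')
    data = xr.DataArray(np.arange(len(time), dtype=float),
                        dims=('time',), coords={'time': time})
    return smooth(data, 4, step=2)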