-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHtests.py
124 lines (102 loc) · 4.55 KB
/
Htests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import numpy as np
from scipy.stats import rankdata
from scipy.stats import mood, mannwhitneyu
'''
This code is an implementation of the detector as seen in
Ross, Gordon & Tasoulis, Dimitris & Adams, Niall. (2012). Nonparametric Monitoring of Data Streams for Changes in Location and Scale. Technometrics. 53. 379-389. 10.1198/TECH.2011.10069.
'''
def MW(data):
# returns max normalised Mann-Whitney statistic across all possible paritions and its argument
# for location shift
n = len(data)
ranked = rankdata(data)[:-1]
x = np.arange(1, n)
helper = (n + 1) / 2
sd = np.sqrt(x * (n - x) * (n + 1) / 12 )
MW = np.abs((np.cumsum(ranked - helper)) / sd)
return np.argmax(MW), np.max(MW)
def Mood(data):
# returns max normalised Mood statistic across all possible paritions and its argument
# for scale shift
n = len(data)
ranked = rankdata(data)[:-1]
x = np.arange(1, n)
helper = (n + 1) / 2
mean = x * (n ** 2 - 1) / 12
sd = np.sqrt((n - x) * x * (n + 1) * (n + 2) * (n - 2) / 180)
Mood = np.abs((np.cumsum((ranked - helper) ** 2) - mean) / sd)
return np.argmax(Mood) , np.max(Mood)
def Lepage(data):
# returns max Lepage statistic across all possible paritions and its argument
# for location location and/or scale shift
n = len(data)
ranked = rankdata(data)[:-1]
x = np.arange(1, n)
helper = (n + 1) / 2
sd_MW = np.sqrt(x * (n - x) * (n + 1) / 12 )
mean_Mood = x * (n ** 2 - 1) / 12
sd_Mood = np.sqrt((n - x) * x * (n + 1) * (n + 2) * (n - 2) / 180)
MW = np.abs((np.cumsum(ranked - helper)) / sd_MW)
Mood = np.abs((np.cumsum((ranked - helper) ** 2) - mean_Mood) / sd_Mood)
Lepages = MW ** 2 + Mood ** 2
return np.argmax(Lepages), np.max(Lepages)
def changedetected(D, threshold):
# Flags if the threshold limit has been breached, resulting in a changepoint being declared
if D > threshold:
return True
else:
False
def stream(data, Test, thresholds):
# start monitoring at time = 20
i = 20
while (i < len(data) + 1):
# increment window to be tested
window = data[:i]
arg, D = Test(window)
# test at ith threshold value or at the last loaded threshold value
if i >len(thresholds):
cp = changedetected(D, thresholds[-1])
else:
cp = changedetected(D, thresholds[i-1])
if cp == True:
change = ''
if Test == Lepage:
# If test is Lepage, test to see what change was detected (either location or shit)
# If the p value from eg mann whitney test is 100 times smaller than that from the mood test, we declare a location shift
# If this is not met, then further investigation is needed (tested on synthetic data and the value of 100 seemed appropiate)
p_val_Mood = mood(window[:arg + 1], window[arg + 1:])[1]
p_val_MW = mannwhitneyu(window[:arg + 1], window[arg + 1:])[1]
if 0.01 * p_val_Mood > p_val_MW:
change = "Location"
elif 0.01 * p_val_MW > p_val_Mood:
change = "Scale"
else:
change = 'Needs further investigation'
else:
pass
# return the location of the cp, index for which is was detected, True (cp was declared), what type of change
return arg + 1, i - 1, True, change
else:
# if no change, increase the window size by 1
i += 1
# return this if no cp is found and no more datapoints to process
return None, None, False, None
def online(data, Test, thresholds):
# This processes the data and works with stream
# stream is called and montiors for a cp, if one is found, we reinitialise stream and start at the detected changepoitn location
# we need the data, which test we want to use, the threshold values we want to use, see ThresholdMonte.py for code to estimate appropriate values and timeseries dates
restart = True
current_data = data
cps = [0]
detection_times = [0]
change_type = []
while (len(current_data) >=20) and (restart == True):
arg, d_time, restart, change = stream(current_data, Test, thresholds)
if restart == True:
change_type.append(change)
last_cp = cps[-1]
detection_times.append(last_cp + d_time)
cps.append(last_cp + arg)
current_data = data[cps[-1]:]
#returns the CP location in the array
return cps, detection_times, change_type