"""This module implements two-sided K-S tests. The 1-D version is from scipy, the 2-D version
is the one originated by Peacock 1983: http://adsabs.harvard.edu/abs/1983MNRAS.202..615P
and Fasano & Franceschini 1987: http://adsabs.harvard.edu/abs/1987MNRAS.225..155F
See also Numerical Recipes (NR) 14.8 (although they are a bit vague). Note that this uses NR results,
but not their code, so is ok for distribution.
"""
import math
import scipy.stats as st
import numpy as np

def ks_2samp(data1, data2):
    """Computes the Kolmogorov-Smirnov statistic on 2 samples.
    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.
    Parameters:
        data1, data2 : sequences of 1-D ndarrays
            Two arrays of sample observations, assumed to be drawn from a
            continuous distribution. Sample sizes can be different.
    Returns:
        D : float
            KS statistic.
        p-value : float
            Two-tailed p-value. A high value means we cannot reject the
            hypothesis that the numbers are from the same distribution,
            ie, low D => high p.
    """
    return st.ks_2samp(data1, data2)
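
# A minimal usage sketch (the seed and sample sizes are illustrative):
#
#   rng = np.random.RandomState(42)
#   d, p = ks_2samp(rng.normal(size=200), rng.normal(size=300))
#   # Same parent distribution, so we expect a small d and a large p.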

def ks_2d_2samp(data1, data2):
    """Computes the 2-dimensional Kolmogorov-Smirnov statistic on 2 samples.
    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.
    Parameters:
        data1, data2 : (n, 2) ndarrays
            Two arrays of 2-D sample observations, assumed to be drawn from a
            continuous distribution. Sample sizes can be different.
    Returns:
        D : float
            KS statistic. Low D means the samples are consistent with
            having been drawn from the same distribution.
    """
    #Compute D using data1 as the origins
    D1 = np.max([max_diff_for_orig(dd, data1, data2) for dd in data1])
    #Compute D using data2 as the origins
    D2 = np.max([max_diff_for_orig(dd, data1, data2) for dd in data2])
    #Their mean
    D = (D1+D2)/2.
    #The approximate p-value: this is detailed in NR 14.8
    # npt1 = np.shape(data1)[0]
    # npt2 = np.shape(data2)[0]
    # neff = npt1*npt2/(1.*npt1+npt2)
    # (rr1,p) = st.pearsonr(data1[:,0], data1[:,1])
    # (rr2,p) = st.pearsonr(data2[:,0], data2[:,1])
    # reff = (rr1**2+rr2**2)/2.
    # ksarg = np.sqrt(neff)*D/(1+np.sqrt(1-reff)*(0.25-0.75/np.sqrt(neff)))
    # pval = ksdist(ksarg)
    return D #(D, pval)
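
# A minimal usage sketch (names and sizes are illustrative): each sample is
# an (n, 2) array of points, and the two sample sizes may differ.
#
#   rng = np.random.RandomState(1)
#   D = ks_2d_2samp(rng.normal(size=(100, 2)), rng.normal(size=(150, 2)))
#   # Same parent distribution, so D should be small compared to 1.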

def max_diff_for_orig(orig, data1, data2):
    """For a given origin, orig, compute the fraction of each sample's points
    in each quadrant around orig, then return the maximum distance between
    the two quadrant distributions."""
    count1 = count_quadrant(orig, data1)
    count2 = count_quadrant(orig, data2)
    return np.max(np.abs(count1-count2))

def count_quadrant(orig, xx):
    """Count the fraction of points in each quadrant around an origin
    orig = (x0, y0), from an nd-array of points.
    Returns:
        count: array of length 4.
            Each element is the fraction of items in each quadrant, measured
            clockwise from x > x0, y >= y0 in quadrant 0.
    """
    count = np.zeros(4)
    #The clockwise arrangement of the >= signs means that the origin itself
    #falls in no quadrant
    # x > x0, y >= y0
    count[0] = np.sum(np.logical_and(xx[:,0] > orig[0], xx[:,1] >= orig[1]))
    # x >= x0, y < y0
    count[1] = np.sum(np.logical_and(xx[:,0] >= orig[0], xx[:,1] < orig[1]))
    # x < x0, y <= y0
    count[2] = np.sum(np.logical_and(xx[:,0] < orig[0], xx[:,1] <= orig[1]))
    # x <= x0, y > y0
    count[3] = np.sum(np.logical_and(xx[:,0] <= orig[0], xx[:,1] > orig[1]))
    #One element (the origin) will be in no quadrant, when orig is drawn
    #from xx
    nn = np.shape(xx)[0]-1
    count /= (1.*nn)
    return count
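
# A tiny worked example (values chosen purely for illustration): with the
# origin at the first point of a 3-point sample, the origin itself lands in
# no quadrant and the remaining two points are counted out of nn = 2.
#
#   pts = np.array([[0., 0.], [1., 1.], [-1., -1.]])
#   count_quadrant(pts[0], pts)
#   # -> array([0.5, 0. , 0.5, 0. ])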

def ksdist(x):
    """The K-S distribution, from the description in NR.
    Scipy seems to do this too, but I don't understand their documentation,
    so I implement it myself.
    Defined such that ksdist(0) == 1 and ksdist(infty) == 0.
    """
    if x == 0:
        return 1
    if x > 1.18:
        return _ksdist1(x)
    else:
        return _ksdist2(x)

def _ksdist1(x):
    """Helper for above. Series is eq. 6.14.56 of NR.
    Best for when x > 1.18"""
    total = 0
    #Should be < 3 iterations in practice; the terms alternate in sign,
    #so test the magnitude of each term against the tolerance
    for j in range(20):
        extra = (-1)**j*np.exp(-2*(j+1)**2*x**2)
        total += extra
        if extra == 0 or abs(extra)/total < 1e-7:
            break
    total *= 2
    return total

def _ksdist2(x):
    """Helper for above. Series is eq. 6.14.57 of NR.
    Best for when x < 1.18"""
    total = 0
    #Should be < 3 iterations in practice; the sum runs over j = 1, 2, ...
    #so that the odd squares (2j-1)**2 are 1, 9, 25, ...
    for j in range(1, 21):
        extra = np.exp(-(2*j-1)**2*math.pi**2/(8*x**2))
        total += extra
        if extra == 0 or extra/total < 1e-7:
            break
    total *= np.sqrt(2*math.pi)/x
    return 1-total
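
if __name__ == "__main__":
    # Quick sanity checks: the two series expansions should agree near the
    # crossover point x = 1.18, and ksdist should fall from 1 at x = 0
    # towards 0 for large x. The tolerances here are illustrative.
    assert ksdist(0) == 1
    assert abs(_ksdist1(1.18) - _ksdist2(1.18)) < 1e-4
    assert ksdist(3.0) < 1e-6
    print("ksdist(1.18) =", ksdist(1.18))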