-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathskipped_Pearson.m
199 lines (177 loc) · 6.73 KB
/
skipped_Pearson.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
function [rp,tp,CI,pval,outid,h]=skipped_Pearson(varargin)
%SKIPPED_PEARSON Pearson correlation after bivariate outlier removal
%
% Performs a robust Pearson correlation on data cleaned up for bivariate outliers,
% that is after finding the central point in the distribution using the mid covariance
% determinant, orthogonal distances are computed to this point, and any data outside the
% bound defined by the idealf estimator of the interquartile range is removed.
%
% FORMAT: [rp,tp,CI,pval,outid,h]=skipped_Pearson(X,pairs,method,alphav,p_alpha);
%
% INPUTS: X is a matrix and correlations between all pairs (default) are computed
% pairs (optional) is a n*2 matrix of pairs of columns to correlate
% method (optional) is 'ECP' or 'Hochberg' (only for n>60)
% alphav (optional, 5% by default) is the requested alpha level
% p_alpha (optional) the critical p_value to correct for multiple
% comparisons (see MC_corrpval)
%
% OUTPUTS: rp is the Pearson correlation
% tp is the T value associated to the skipped correlation
% CI is the robust confidence interval of r computed by bootstrapping
% the cleaned-up data set and taking the alphav centile values
% pval is the p value associated to t
% outid is the index of bivariate outliers
% h is the significance after correction for multiple comparisons
%
% This code rely on the mid covariance determinant as implemented in LIBRA
% - Verboven, S., Hubert, M. (2005), LIBRA: a MATLAB Library for Robust Analysis,
% Chemometrics and Intelligent Laboratory Systems, 75, 127-136.
% - Rousseeuw, P.J. (1984), "Least Median of Squares Regression,"
% Journal of the American Statistical Association, Vol. 79, pp. 871-881.
%
% The quantile of observations whose covariance is minimized is
% floor((n+size(X,2)*2+1)/2)),
% i.e. ((number of observations + number of variables*2)+1) / 2,
% thus for a correlation this is floor(n/2 + 5/2).
%
% The method for multiple comparisons correction is described in
% Rand R. Wilcox, Guillaume A. Rousselet & Cyril R. Pernet (2018)
% Improved methods for making inferences about multiple skipped correlations,
% Journal of Statistical Computation and Simulation, 88:16, 3116-3131,
% DOI: 10.1080/00949655.2018.1501051
%
% See also MCDCOV, IDEALF.
%
% Cyril Pernet v3 - Novembre 2017
% ---------------------------------------------------
% Copyright (C) Corr_toolbox 2017
%% check the data input
% _if no input simply return the help, otherwise load the data X_
if nargin <1
help skipped_Pearson
return
else
x = varargin{1};
[n,p]=size(x);
end
% _set the default options_
method = 'ECP';
alphav = 5/100;
pairs = nchoosek([1:p],2);
nboot = 599;
% _check other inputs of the function_
for inputs = 2:nargin
if inputs == 2
pairs = varargin{inputs};
elseif inputs == 3
method = varargin{inputs};
elseif inputs == 4
alphav = varargin{inputs};
elseif inputs == 5
p_alpha = varargin{inputs};
end
end
% _do a quick quality check_
if isempty(pairs)
pairs = nchoosek([1:p],2);
end
if size(pairs,2)~=2
pairs = pairs';
end
if sum(strcmpi(method,{'ECP','Hochberg'})) == 0
error('unknown method selected, see help skipped_Pearson')
end
if strcmp(method,'Hochberg') && n<60 || strcmp(method,'Hochberg') && n<60 && alphav == 5/100
error('Hochberg is only valid for n>60 and aplha 5%')
end
%% start the algorithm
% _create a table of resamples_
boostrap_sampling = NaN(n,nboot);
if nargout > 2
boot_index = 1;
while boot_index <= nboot
resample = randi(n,n,1);
if length(unique(resample)) > p % always more observations than variables
boostrap_sampling(:,boot_index) = resample;
boot_index = boot_index +1;
end
end
lower_bound = round((alphav*nboot)/2);
upper_bound = nboot - lower_bound;
end
% now for each pair to test, get the observed and boostrapped r and t
% values, then derive the p value from the bootstrap (and hboot and CI if
% requested)
% place holders
rp = NaN(size(pairs,1),1);
for outputs = 2:nargout
if outputs == 2
tp = NaN(size(pairs,1),1);
elseif outputs == 3
CI = NaN(size(pairs,1),2);
elseif outputs == 4
pval = NaN(size(pairs,1),1);
elseif outputs == 5
outid = cell(size(pairs,1),1);
end
end
% loop for each pair to test
for row = 1:size(pairs,1)
% select relevant columns
X = [x(:,pairs(row,1)) x(:,pairs(row,2))];
% get the bivariate outliers
flag = bivariate_outliers(X);
vec = 1:n;
if sum(flag)==0
outid{row}=[];
else
flag=(flag>=1);
outid{row}=vec(flag);
end
keep=vec(~flag); % the vector of data to keep
% Pearson correlation on cleaned data
rp(row) = sum(detrend(X(keep,1),'constant').*detrend(X(keep,2),'constant')) ./ ...
(sum(detrend(X(keep,1),'constant').^2).*sum(detrend(X(keep,2),'constant').^2)).^(1/2);
tp(row) = rp(row)*sqrt((n-2)/(1-rp(row).^2));
if nargout > 2
% redo this for bootstrap samples
% fprintf('computing p values by bootstrapping data, pair %g %g\n',pairs(row,1),pairs(row,2))
parfor b=1:nboot
Xb = X(boostrap_sampling(:,b),:);
r(b) = sum(detrend(Xb(keep,1),'constant').*detrend(Xb(keep,2),'constant')) ./ ...
(sum(detrend(Xb(keep,1),'constant').^2).*sum(detrend(Xb(keep,2),'constant').^2)).^(1/2);
end
% get the CI
r = sort(r);
CI(row,:) = [r(lower_bound) r(upper_bound)];
% get the p value
Q = sum(r<0)/nboot;
pval(row) = 2*min([Q 1-Q]);
end
end
%% once we have all the r and t values, we need to adjust for multiple comparisons
if nargout == 6
if strcmp(method,'ECP')
if exist('p_alpha','var')
h = pval < p_alpha;
else
disp('ECP method requested, computing p alpha ... (takes a while)')
[~,~,p_alpha] = MC_corrpval(n,p,'Skipped Pearson',alphav,pairs);
h = pval < p_alpha;
end
elseif strcmp('method','Hochberg')
[sorted_pval,index] = sort(pval,'descend');
k = 1; sig = 0; h = zeros(1,length(pval));
while sig == 0
if sorted_pval(k) < alphav/k
h(k:end) = 1; sig = 1;
else
k = k+1;
end
end
h = h(index);
end
end
%% quick clean-up of individual p-values
pval(pval==0) = 1/nboot;
disp('Skipped Pearson done')