-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathpca.daph
90 lines (79 loc) · 3.35 KB
/
pca.daph
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Modifications Copyright 2022 The DAPHNE Consortium
#
#-------------------------------------------------------------
# Principal Component Analysis (PCA) for dimensionality reduction
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix --- Input feature matrix
# K Int 2 Number of reduced dimensions (i.e., columns)
# center Boolean true Indicates whether or not to center the feature matrix
# scale Boolean true Indicates whether or not to scale the feature matrix
# RETURN VALUES
# ---------------------------------------------------------------------------------------------
# Xout Matrix --- Output feature matrix with K columns
# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
# ---------------------------------------------------------------------------------------------
# Read the input feature matrix and the requested number of reduced dimensions.
X = readMatrix($X);
K = $K;
N = nrow(X);
Nm1 = as.f64(N) - as.f64(1); # sample-variance denominator (N - 1)
M = as.si64(ncol(X));

# Clamp K into the valid range [1, ncol(X)] so the later column slice is well-defined.
if(K > M) {
    print("PCA: invalid parameter, K should be <= ncol(X) -- setting K = ncol(X).");
    K = M;
}
if(K < 1) {
    print("PCA: invalid parameter, K should be >= 1 -- setting K = 1.");
    K = 1;
}

# Perform z-scoring (centering and scaling) of the feature matrix.
if( $center ) {
    # Subtract the column-wise mean from every row.
    X = X - mean(X, 1);
}
if( $scale ) {
    # Column-wise scale factor (sample standard deviation when X was centered).
    # TODO This does not use the original X before centering anymore. Is that okay?
    scaleFactor = sqrt(sum(X^2.0, 1)/Nm1);
    # Replace entries in the scale factor that are NaN or 0 with 1, to avoid a
    # division by 0 or NaN, which would introduce NaN into the output.
    scaleFactor = replace(scaleFactor, nan, 1.0);
    scaleFactor = replace(scaleFactor, 0.0, 1.0);
    X = X / scaleFactor;
}

# Co-variance matrix: C = (X'X)/(N-1) - (mu'mu) * N/(N-1).
mu = mean(X, 1);
C = (t(X) @ X) / Nm1 - (t(mu) @ mu) * (as.f64(N)/Nm1);

# Compute eigen values and eigen vectors of the covariance matrix.
evalues, evectors = eigen(C);

# Sort eigen values (and, consistently, the eigen vectors) in decreasing order
# by building a permutation matrix from the sort order via ctable.
# Note: can be emulated by: cbind(evectors, t(evalues)) + cast, sort frame, + extract
decreasing_Idx = order(evalues, 0, false, true);
# TODO This cast should not be necessary (type promotion in ctable).
diagmat = ctable(seq(0.0, M - 1, 1.0), as.f64(decreasing_Idx));
evalues = diagmat @ evalues;
evectors = evectors @ t(diagmat);

# Keep only the K dominant eigen vectors as the new basis.
evec_dominant = evectors[,0:K];

# Construct the reduced data set by projecting X onto the dominant eigen vectors.
Xout = X @ evec_dominant;
Mout = evec_dominant;

# Write out the results.
writeMatrix(Xout, $Xout);
writeMatrix(Mout, $Mout);