-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheigen-miniapp-serial.c
281 lines (237 loc) · 12 KB
/
eigen-miniapp-serial.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
/* Eigenrockets mini-app for full-to-banded transformation: serial implementation
USAGE: <executable> <mode> <block size> <matrix size>
<mode> : 0 for Householder QR algorithm
1 for Cholesky QR algorithm (not implemented yet)
2 for modified Cholesky QR algorithm (not implemented yet)
<block size> : the size of most matrix blocks in level-3 BLAS operations
& the bandwidth of transformed matrix
<matrix size> : the size of the matrix being diagonalized
NOTES: - for simplicity & rapidity of development, this is written in ANSI C (C89)
and uses the standard Fortran interfaces to BLAS & LAPACK
- this implementation is restricted to real doubles
- "ELPA thesis" refers to the 2012 PhD thesis of Thomas Auckenthaler,
"Highly Scalable Eigensolvers for Petaflop Applications"
- only the lower triangle of the symmetric matrix is stored and transformed
& the Householder vectors are stored in a separate matrix whereas in ELPA,
the symmetric matrix is fully stored & overlaid w/ Householder vectors
- block Householder transformations are stored as W & Y in I - W*Y^T
- a complete set of block Householder transformations are stored in the pattern
[ W_n W_(n-1) ... W_2 W_1 ]
[ Y_1 Y_2 ... Y_(n-1) Y_n ]
contained within the memory footprint of the transformed matrix
- 1D array indexing is used to avoid confusion between row-major C & column-major Fortran (BLAS)
- whenever possible, all matrix and/or vector operations are written as BLAS operations,
even if there is no expectation of a performance benefit (e.g. BLAS level-1 operations)
- with the standard 4-byte integer using in the BLAS and LAPACK interface,
the largest square matrix that can be indexed properly is 46340-by-46340
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/* generous machine precision used in accuracy tests */
#define EPS 1e-14
/* pairwise minimization & maximization macro */
#define MIN(a, b) ((a) < (b)) ? (a) : (b)
#define MAX(a, b) ((a) > (b)) ? (a) : (b)
/* column-major matrix indexing macro */
#define INDEX(row, col, stride) ((row) + (col)*(stride))
/* array offsetting macro */
#define OFFSET(array, offset) (&((array)[(offset)]))
/* TO DO:
- implement & debug Cholesky QR
- implement & debug modified Cholesky QR
*/
/* BLAS level-1 vector copy */
void dcopy_(int*, double*, int*, double*, int*);
/* BLAS level-1 scalar-vector multiplication */
void dscal_(int*, double*, double*, int*);
/* BLAS level-1 vector-vector inner product */
double ddot_(int*, double*, int*, double*, int*);
/* BLAS level-2 vector-vector outer product */
void dger_(int*, int*, double*, double*, int*, double*, int*, double*, int*);
/* BLAS level-2 matrix-vector multiplication */
double dgemv_(char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* BLAS level-3 half-symmetric matrix-matrix multiplication */
void dsymm_(char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* BLAS level-3 triangular matrix inversion */
void dtrsm_(char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int*);
/* BLAS level-3 symmetry-preserving matrix self-multiplication */
void dsyrk_(char*, char*, int*, int*, double*, double*, int*, double*, double*, int*);
/* BLAS level-3 symmetry-preserving matrix-matrix multiplication */
void dsyr2k_(char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* BLAS level-3 matrix-matrix multiplication */
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* LAPACK symmetric eigensolver */
void dsyev_(char*, char*, int*, double*, int*, double*, double*, int*, int*);
/* 1-sided (left side) application of a block Householder transformation: (I - W*Y^T)*M */
/* NOTE: this is similar to Algorithm 5 in the ELPA thesis */
/* M is a nrow-by-stride matrix */
/* A is a stride-by-ncol matrix workspace */
void householder_1sided(int nrow, int ncol, int stride, double *M, double *W, double *Y, double *A)
{
char trans = 'T', notrans = 'N';
double zero = 0.0, one = 1.0, minus_one = -1.0;
/* A = M^T*Y */
dgemm_(&trans, ¬rans, &stride, &ncol, &nrow, &one, M, &stride, Y, &stride, &zero, A, &stride);
/* M = M - W*A^T */
dgemm_(¬rans, &trans, &nrow, &stride, &ncol, &minus_one, W, &stride, A, &stride, &one, M, &stride);
}
/* 2-sided application of a block Householder transformation: (I - W*Y^T)*M*(I - Y*W^T) */
/* NOTE: this is similar to Algorithm 7 in the ELPA thesis */
/* M is a symmetric nrow-by-nrow matrix, only the lower triangle is read & transformed */
/* A is a ncol-by-ncol matrix workspace & Z is a nrow-by-ncol matrix workspace */
void householder_2sided(int nrow, int ncol, int stride, double *M, double *W, double *Y, double *A, double *Z)
{
char left = 'L', lo = 'L', trans = 'T', notrans = 'N';
double zero = 0.0, one = 1.0, minus_one = -1.0, minus_half = -0.5;
/* Z = M*Y */
dsymm_(&left, &lo, &nrow, &ncol, &one, M, &stride, Y, &stride, &zero, Z, &stride);
/* A = Y^T*Z */
dgemm_(&trans, ¬rans, &ncol, &ncol, &nrow, &one, Y, &stride, Z, &stride, &zero, A, &stride);
/* Z = Z - 0.5*W*A */
dgemm_(¬rans, ¬rans, &nrow, &ncol, &ncol, &minus_half, W, &stride, A, &stride, &one, Z, &stride);
/* M = M - W*Z^T - Z*W^T */
dsyr2k_(&lo, ¬rans, &nrow, &ncol, &minus_one, W, &stride, Z, &stride, &one, M, &stride);
}
/* Householder QR decomposition of a matrix stored as a block Householder transformation, Q = I - W*Y^T */
/* NOTE: this is similar to Algorithm 20 & unlabelled pseudocode on page 19 of the ELPA thesis */
/* M is overwritten by R upon return */
/* T is part of the compact representation: W = Y*T */
void qr_householder(int nrow, int ncol, int stride, double *M, double *W, double *Y, double *T)
{
int i, j, rank = MIN(nrow, ncol);
char trans = 'T', notrans = 'N', up = 'U', left = 'L', nounit = 'N';
double zero = 0.0, half = 0.5, one = 1.0, minus_two = -2.0;
for(i=0 ; i<rank ; i++)
{
int nrow2 = nrow-i, ncol2 = ncol-i, unit = 1;
double wt, *M2 = OFFSET(M, INDEX(i,i,stride)), *Y2 = OFFSET(Y, INDEX(i,i,stride));
/* calculate matrix element of reduced column */
wt = sqrt(ddot_(&nrow2, M2, &unit, M2, &unit));
if(M2[0] > 0.0) { wt = -wt; }
/* construct Householder vector in leading column of Y2 */
dcopy_(&nrow2, M2, &unit, Y2, &unit);
Y2[0] -= wt;
wt = 1.0/sqrt(2.0*wt*(wt - M2[0]));
dscal_(&nrow2, &wt, Y2, &unit);
/* T = M2^T*Y2 (leading columns of T & Y2 only) */
dgemv_(&trans, &nrow2, &ncol2, &one, M2, &stride, Y2, &unit, &zero, T, &unit);
/* M2 = M2 - 2.0*Y2*T^T (leading columns of T & Y2 only) */
dger_(&nrow2, &ncol2, &minus_two, Y2, &unit, T, &unit, M2, &stride);
/* set unassigned matrix elements of Y to zero */
for(j=0 ; j<i ; j++) { Y[INDEX(j,i,stride)] = 0.0; }
/* clean the matrix elements formally removed from M */
nrow2--;
dscal_(&nrow2, &zero, OFFSET(M2,1), &unit);
}
/* W = Y^T*Y (upper triangle only) */
dsyrk_(&up, &trans, &rank, &nrow, &one, Y, &stride, &zero, W, &stride);
/* W *= 0.5 (diagonals only) */
i = stride+1;
dscal_(&rank, &half, W, &i);
/* T = W^{-1} */
for(i=0 ; i<ncol ; i++) for(j=0 ; j<ncol ; j++) { T[INDEX(j,i,stride)] = 0.0; }
for(i=0 ; i<ncol ; i++) { T[INDEX(i,i,stride)] = 1.0; }
dtrsm_(&left, &up, ¬rans, &nounit, &rank, &rank, &one, W, &stride, T, &stride);
/* W = Y*T */
dgemm_(¬rans, ¬rans, &nrow, &rank, &rank, &one, Y, &stride, T, &stride, &zero, W, &stride);
}
/* Cholesky QR decomposition of a matrix stored as a block Householder transformation, Q = I - W*Y^T */
/* NOTE: this is equivalent to Algorithm 26 of the ELPA thesis, but calls LAPACK for Cholesky decomposition */
/* M is overwritten by R upon return */
/* T is part of the compact representation: W = Y*T */
void qr_cholesky(int nrow, int ncol, int stride, double *M, double *W, double *Y, double *T)
{
/* T = M^T*M */
/* Cholesky decomposition: T = R^T*R */
/* ??? */
/* profit */
}
/* print a matrix for debugging purposes */
void print_mat(int nrow, int ncol, int stride, double *mat)
{
int i, j;
for(i=0 ; i<ncol ; i++) for(j=0 ; j<nrow ; j++)
{ printf("%d %d %e\n",j,i,mat[INDEX(j,i,stride)]); }
}
/* full-to-band transformation of a test matrix & band-to-full transformation of its eigenvectors */
int main(int argc, char** argv)
{
char jobv = 'V', lo = 'L';
int i, j, mode, nblock, ndim, lwork = -1, info, unit = 1;
double *mat1, *mat2, *hmat, *eval1, *eval2, *work, norm, kappa;
/* read command-line inputs */
if(argc < 4) { printf("USAGE: <executable> <mode> <block size> <matrix size>\n"); exit(1); }
sscanf(argv[1],"%d",&mode); sscanf(argv[2],"%d",&nblock); sscanf(argv[3],"%d",&ndim);
if(mode < 0 || mode > 1) { printf("ERROR: unknown mode (%d)\n", mode); exit(1); }
/* workspace query */
dsyev_(&jobv, &lo, &ndim, mat1, &ndim, eval1, &kappa, &lwork, &info);
lwork = MAX((int)kappa, nblock*ndim);
/* allocate memory */
mat1 = malloc(sizeof(double)*ndim*ndim);
mat2 = malloc(sizeof(double)*ndim*ndim);
hmat = malloc(sizeof(double)*ndim*ndim);
eval1 = malloc(sizeof(double)*ndim);
eval2 = malloc(sizeof(double)*ndim);
work = malloc(sizeof(double)*lwork);
/* construct lower triangle of an arbitrary test matrix */
for(i=0 ; i<ndim ; i++) for(j=i ; j<ndim ; j++)
{ mat1[INDEX(j,i,ndim)] = mat2[INDEX(j,i,ndim)] = cos(i + sqrt(2.0)*j + sqrt(3.0)*ndim); }
/* full-to-banded matrix transformation */
for(i=nblock ; i<ndim ; i+=nblock)
{
/* submatrix dimensions & pointers to submatrices */
int nrow = ndim-i;
double *M = OFFSET(mat2, INDEX(i,i-nblock,ndim)),
*M2 = OFFSET(mat2, INDEX(i,i,ndim)),
*W = OFFSET(hmat, INDEX(0,ndim-i,ndim)),
*Y = OFFSET(hmat, INDEX(i,i-nblock,ndim)),
*A = work,
*Z = OFFSET(work, nblock);
/* band reduction of column block with block Householder transformation */
switch(mode)
{
case 0: { qr_householder(nrow, nblock, ndim, M, W, Y, A); break; }
case 1: { qr_cholesky(nrow, nblock, ndim, M, W, Y, A); break; }
default: { printf("ERROR: invalid mode (%d)\n", mode); exit(1); }
}
/* apply block Householder transformation to the rest of the matrix (2-sided) */
householder_2sided(nrow, MIN(nrow,nblock), ndim, M2, Y, W, A, Z);
}
/* diagonalize the original & transformed matrices */
dsyev_(&jobv, &lo, &ndim, mat1, &ndim, eval1, work, &lwork, &info);
dsyev_(&jobv, &lo, &ndim, mat2, &ndim, eval2, work, &lwork, &info);
/* back-transform the eigenvectors of the transformed matrix */
for(i=ndim - ndim%nblock ; i>=nblock ; i-=nblock)
{
/* submatrix dimensions & pointers to submatrices */
int nrow = ndim-i;
double *M = OFFSET(mat2, i),
*W = OFFSET(hmat, INDEX(0,ndim-i,ndim)),
*Y = OFFSET(hmat, INDEX(i,i-nblock,ndim));
/* apply block Householder transformation to the eigenvector matrix (1-sided) */
householder_1sided(nrow, MIN(nrow,nblock), ndim, M, W, Y, work);
}
/* compare eigenvalues & eigenvectors to heuristic error bounds */
norm = MAX(-eval1[0], eval1[ndim-1]);
for(i=0 ; i<ndim ; i++)
{
double error, kappa, *vec1 = OFFSET(mat1, INDEX(0,i,ndim)), *vec2 = OFFSET(mat2, INDEX(0,i,ndim));
error = fabs(eval1[i] - eval2[i]);
if(error > norm*EPS) { printf("WARNING: large eigenvalue error (%d,%e > %e)\n", i, error, norm*EPS); }
error = 1.0 - fabs(ddot_(&ndim,vec1,&unit,vec2,&unit)) /
sqrt(ddot_(&ndim,vec1,&unit,vec1,&unit) * ddot_(&ndim,vec2,&unit,vec2,&unit));
kappa = 1.0;
if(i>0) { kappa = MAX(kappa, norm/(eval1[i] - eval1[i-1])); }
if(i<ndim-1) { kappa = MAX(kappa, norm/(eval1[i+1] - eval1[i])); }
if(error > kappa*EPS) { printf("WARNING: large eigenvector error (%d,%e > %e)\n", i, error, kappa*EPS); }
}
/* deallocate memory */
free(work);
free(eval2);
free(eval1);
free(hmat);
free(mat2);
free(mat1);
return 1;
}