-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathcuBLAS_sgemm.cu
108 lines (89 loc) · 2.47 KB
/
cuBLAS_sgemm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#include <cstdio>
#include <cstdlib>

#include <cublas_v2.h>
#include <cuda_runtime.h>
/*
 * A stand-alone program that invokes cuBLAS SGEMM on small, hard-coded
 * matrices and prints the operands and the result (no timing is performed).
 */
// Abort with a diagnostic if a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what,
            cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Abort with a diagnostic if a cuBLAS call did not return
// CUBLAS_STATUS_SUCCESS.
static void checkCublas(cublasStatus_t status, const char *what) {
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "cuBLAS error in %s: status %d\n", what, (int)status);
    exit(EXIT_FAILURE);
  }
}

// Allocate a host buffer of `count` floats holding 11, 12, 13, ...
// Exits the process if the allocation fails.
static float *allocSequence(int count) {
  float *buf = (float *)malloc((size_t)count * sizeof(float));
  if (buf == NULL) {
    fprintf(stderr, "host malloc of %d floats failed\n", count);
    exit(EXIT_FAILURE);
  }
  for (int idx = 0; idx < count; idx++) {
    buf[idx] = (float)(11 + idx);
  }
  return buf;
}

// Print a rows x cols matrix stored row-major, i.e. with a row stride of
// `cols` elements, one matrix row per output line.
static void printMatrix(const float *mat, int rows, int cols) {
  for (int r = 0; r < rows; r++) {
    for (int col = 0; col < cols; col++) {
      printf("%4.1f ", mat[r * cols + col]);
    }
    printf("\n");
  }
}

/*
 * Computes C = alpha * A * B + beta * C with cuBLAS SGEMM for hard-coded
 * row-major matrices A (m x k), B (k x n) and C (m x n), printing the
 * operands before the call and C after it.
 *
 * Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE as soon as any
 * host allocation, CUDA runtime call or cuBLAS call fails.
 */
int main(int argc, char *argv[]) {
  (void)argc; // command-line arguments are not used
  (void)argv;

  const int m = 2;
  const int k = 3;
  const int n = 4;
  const int print = 1;

  const size_t bytesA = (size_t)m * k * sizeof(float);
  const size_t bytesB = (size_t)k * n * sizeof(float);
  const size_t bytesC = (size_t)m * n * sizeof(float);

  // HOST: every matrix is filled with 11, 12, 13, ... in storage order.
  float *a = allocSequence(m * k);
  float *b = allocSequence(k * n);
  float *c = allocSequence(m * n);

  // DEVICE
  float *d_a = NULL, *d_b = NULL, *d_c = NULL;
  checkCuda(cudaMalloc((void **)&d_a, bytesA), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc((void **)&d_b, bytesB), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc((void **)&d_c, bytesC), "cudaMalloc(d_c)");

  cublasHandle_t handle; // cuBLAS context
  checkCublas(cublasCreate(&handle), "cublasCreate");

  checkCuda(cudaMemcpy(d_a, a, bytesA, cudaMemcpyHostToDevice),
            "cudaMemcpy(a -> d_a)");
  checkCuda(cudaMemcpy(d_b, b, bytesB, cudaMemcpyHostToDevice),
            "cudaMemcpy(b -> d_b)");
  checkCuda(cudaMemcpy(d_c, c, bytesC, cudaMemcpyHostToDevice),
            "cudaMemcpy(c -> d_c)");

  float alpha = 1.0f;
  float beta = 0.5f;

  if (print == 1) {
    // One decimal place so that beta = 0.5 is not displayed as 0.
    printf("alpha = %4.1f, beta = %4.1f\n", alpha, beta);
    printf("A = (mxk: %d x %d)\n", m, k);
    printMatrix(a, m, k); // row stride of A is k, not m
    printf("B = (kxn: %d x %d)\n", k, n);
    printMatrix(b, k, n);
    printf("C = (mxn: %d x %d)\n", m, n);
    printMatrix(c, m, n);
  }

  // cuBLAS is column-major. A row-major X is the column-major X^T, so the
  // row-major product C = A * B is obtained by asking cuBLAS for
  // C^T = B^T * A^T: swap the operands and pass (n, m, k) with leading
  // dimensions n, k, n.
  checkCublas(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                          d_b, n, d_a, k, &beta, d_c, n),
              "cublasSgemm");

  checkCuda(cudaMemcpy(c, d_c, bytesC, cudaMemcpyDeviceToHost),
            "cudaMemcpy(d_c -> c)");

  if (print == 1) {
    printf("\nC after SGEMM = \n");
    printMatrix(c, m, n);
  }

  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_c), "cudaFree(d_c)");
  checkCublas(cublasDestroy(handle), "cublasDestroy"); // destroy cuBLAS context
  free(a);
  free(b);
  free(c);
  return EXIT_SUCCESS;
}