-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtutorials_six.cu
318 lines (242 loc) · 8.8 KB
/
tutorials_six.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/*!
我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。
只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。
因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。
可以保证,认真学完教程,cuda编程毫无压力。
详情请链接:http://t.csdn.cn/NaCZ5
@Description : CUDA矩阵的加减乘除
@Author : tangjun
@Date :
*/
#include <iostream>
#include <time.h>
#include "opencv2/highgui.hpp" //实际上在/usr/include下
#include "opencv2/opencv.hpp"
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
#include <device_functions.h>
using namespace cv;
using namespace std;
void Print_2dim(int* ptr, int m, int n) {
std::cout << "result:\n";
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
std::cout << "\t" << ptr[i * n + j];
}
std::cout << "\n";
}
}
__global__ void gpu_matrix_plus_thread(int* a, int* b, int* c)
{
//方法一:通过id方式计算
//grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid
int blocksize = blockDim.x * blockDim.y;
int blockid = gridDim.x * blockIdx.y + blockIdx.x;
int threadid = blockDim.x * threadIdx.y + threadIdx.x;
int id = blocksize * blockid + threadid;
c[id] = a[id] + b[id];
}
__global__ void gpu_matrix_plus1(int* a, int* b, int* c, int m, int n)
{ //方法二:通过row与col的方式计算-->通过变换列给出id
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
c[row * n + col] = a[row * n + col] + b[row * n + col];
}
__global__ void gpu_matrix_plus2(int* a, int* b, int* c, int m, int n)
{ //方法三:通过row与col的方式计算-->通过变换行给出id
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
c[row + col * m] = a[row + col * m] + b[row + col * m];
}
void init_variables(int* a, int* b, int m, int n) {
//初始化变量
std::cout << "value of a:" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
a[i * n + j] = rand() % 256;
std::cout << "\t" << a[i * n + j];
}
std::cout << "\n";
}
std::cout << "value of b:" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
b[i * n + j] = rand() % 256;
std::cout << "\t" << b[i * n + j];
}
std::cout << "\n";
}
std::cout << "value of a+b:" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
std::cout << "\t" << a[i * n + j] + b[i * n + j];
}
std::cout << "\n";
}
}
int kernel_plus()
{
/*
matrix a[m,n], matrix b[m,n]
a[m,n]+b[m,n]=[m,n]
*/
const int BLOCK_SIZE = 2;
int m = 8; //行
int n = 10; //列
int* a, * b;
//分配host内存
cudaMallocHost((void**)&a, sizeof(int) * m * n);
cudaMallocHost((void**)&b, sizeof(int) * m * n);
init_variables(a, b, m, n);//随机初始化变量
int* g_a, * g_b;
//分配gpu内存
cudaMalloc((void**)&g_a, sizeof(int) * m * n);
cudaMalloc((void**)&g_b, sizeof(int) * m * n);
cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
cudaMemcpy(g_b, b, sizeof(int) * m * n, cudaMemcpyHostToDevice);
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y
unsigned int grid_cols = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl;
std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl;
int* c1, * g_c;
cudaMalloc((void**)&g_c, sizeof(int) * m * n);
cudaMallocHost((void**)&c1, sizeof(int) * m * n);
gpu_matrix_plus_thread << <dimGrid, dimBlock >> > (g_a, g_b, g_c);
cudaMemcpy(c1, g_c, sizeof(int) * m * n, cudaMemcpyDeviceToHost);
Print_2dim(c1, m, n);
int* c2, * g_c2;
cudaMallocHost((void**)&c2, sizeof(int) * m * n);
cudaMalloc((void**)&g_c2, sizeof(int) * m * n);
gpu_matrix_plus1 << <dimGrid, dimBlock >> > (g_a, g_b, g_c2, m, n);
cudaMemcpy(c2, g_c2, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端
Print_2dim(c2, m, n);
int* c3, * g_c3;
cudaMallocHost((void**)&c3, sizeof(int) * m * n);
cudaMalloc((void**)&g_c3, sizeof(int) * m * n);
gpu_matrix_plus2 << <dimGrid, dimBlock >> > (g_a, g_b, g_c3, m, n);
cudaMemcpy(c3, g_c3, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端
Print_2dim(c3, m, n);
//释放内存
cudaFree(g_a);
cudaFree(g_b);
cudaFree(g_c);
cudaFreeHost(a);
cudaFreeHost(b);
cudaFreeHost(c1);
return 0;
}
__global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y; // 行线程 y
int col = blockIdx.x * blockDim.x + threadIdx.x; // 列线程 x
int sum = 0;
if (col < k && row < m) {
for (int i = 0; i < n; i++) {
sum += a[row * n + i] * b[i * k + col]; //看出row与col不动的方式计算
}
c[row * k + col] = sum;
}
}
__global__ void gpu_matrix_multiply_thread(int* a, int* b, int* c, int m, int n, int k)
{
// [m*n]* [n*k]<---m n
//方法一:通过id方式计算
//grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid
int blocksize = blockDim.x * blockDim.y;
int blockid = gridDim.x * blockIdx.y + blockIdx.x;
int threadid = blockDim.x * threadIdx.y + threadIdx.x;
int id = blocksize * blockid + threadid;
int row = id / k;
int col = id % k;
int sum = 0;
for (int i = 0; i < n; i++) {
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
int kernel_multiply()
{
/*
matrix a[m,n], matrix b[n,k]
a[m,n]*b[n,k]=[m,k]
*/
const int BLOCK_SIZE = 2;
int m = 8; //行
int n = 4; //中间变量
int k = 10; //列
int* a, * b;
// 初始化 a与b
cudaMallocHost((void**)&a, sizeof(int) * m * n);
cudaMallocHost((void**)&b, sizeof(int) * n * k);
std::cout << "value of a:" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
a[i * n + j] = rand() % 6;
std::cout << "\t" << a[i * n + j];
}
std::cout << "\n";
}
std::cout << "value of b:" << endl;
for (int i = 0; i < n; i++) {
for (int j = 0; j < k; j++) {
b[i * k + j] = rand() % 10;
std::cout << "\t" << b[i * k + j];
}
std::cout << "\n";
}
//a*b相乘
std::cout << "value of a*b:" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++) {
int tmp = 0;
for (int h = 0; h < n; h++) {
tmp += a[i * n + h] * b[h * k + j];
}
//c[i * k + j] = tmp;
std::cout << "\t" << tmp;
}
std::cout << "\n";
}
int* g_a, * g_b;
cudaMalloc((void**)&g_a, sizeof(int) * m * n);
cudaMalloc((void**)&g_b, sizeof(int) * n * k);
cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
cudaMemcpy(g_b, b, sizeof(int) * k * n, cudaMemcpyHostToDevice);
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl;
std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl;
// 使用row与col计算
int* c1, * g_c;
cudaMalloc((void**)&g_c, sizeof(int) * m * k);
cudaMallocHost((void**)&c1, sizeof(int) * m * k);
gpu_matrix_mult << <dimGrid, dimBlock >> > (g_a, g_b, g_c, m, n, k);
cudaMemcpy(c1, g_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);
Print_2dim(c1, m, k);
cudaFree(g_c);
cudaFreeHost(c1);
//使用id计算
int* c2, * g_c2;
cudaMalloc((void**)&g_c2, sizeof(int) * m * k);
cudaMallocHost((void**)&c2, sizeof(int) * m * k);
gpu_matrix_multiply_thread << <dimGrid, dimBlock >> > (g_a, g_b, g_c2, m, n, k);
cudaMemcpy(c2, g_c2, sizeof(int) * m * k, cudaMemcpyDeviceToHost);
Print_2dim(c2, m, k);
cudaFree(g_c2);
cudaFreeHost(c2);
//释放内存
cudaFree(g_a);
cudaFree(g_b);
cudaFreeHost(a);
cudaFreeHost(b);
return 0;
}
void main_six() {
kernel_plus();
kernel_multiply();
}