-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProblem1b
102 lines (87 loc) · 3.01 KB
/
Problem1b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
//----HEADER FILE--------------------------//
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
//---------------------------------------//
#define N 1048576
//------------- KERNEL FUNCTION ADD-------------------//
// Element-wise integer vector addition: c[i] = a[i] + b[i] for i in [0, n).
//
//   a, b : input vectors, dereferenced on the device; at least n elements each
//   c    : output vector, dereferenced on the device; at least n elements
//   n    : number of elements to add (n <= 0 does nothing)
//
// Expected launch: 1-D grid of 1-D blocks, no dynamic shared memory.
// Each thread starts at its global index and advances by the total number of
// launched threads, so every element is written exactly once even when the
// grid is smaller than n. The index is kept in 64 bits so that
// blockIdx.x * blockDim.x cannot wrap for very large n.
__global__ void add(int*a,int*b,int*c,long n) {
    const long long total_threads = (long long)gridDim.x * blockDim.x;
    for (long long index = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         index < n;
         index += total_threads) {
        c[index] = a[index] + b[index];
    }
}
//----------------------------------------------------//
//--------------FUNCTION TO ASSIGN RANDOM INTEGERS----//
// Fills the first x elements of a with successive values returned by rand().
//
//   a : destination buffer with room for at least x ints; NULL is ignored
//   x : number of elements to fill; values <= 0 leave the buffer untouched
//
// The sequence depends on the current rand() state, so seed with srand()
// beforehand if reproducible input is needed.
void random_ints(int* a, long x)
{
    if (a == NULL)
        return;
    // The counter has the same width as the bound so that it cannot overflow
    // before reaching x.
    for (long i = 0; i < x; ++i)
        a[i] = rand();
}
//----------------------------------------------------//
//----------- HELPER: STOP ON CUDA RUNTIME ERROR -----------//
// Prints the failing step together with the CUDA error string to stderr and
// terminates the process when status is not cudaSuccess.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
//----------------------------------------------------//
// Benchmarks the addition of two N-element integer vectors on the CPU and on
// the GPU and prints both timings in milliseconds.
//
//   argv[1] : threads per block used for the kernel launch (positive integer)
//
// Returns 0 on success (and when the usage message is printed), EXIT_FAILURE
// on an invalid block size, a host allocation failure, or a CPU/GPU result
// mismatch. CUDA runtime errors terminate the process through check_cuda().
int main(int argc, char *argv[]) {
    if (argc < 2) {
        printf("Please enter the size of vector in the format ./exec.o THREAD_PER_BLOCK(128/256/512/1024)\n\n");
        return 0;
    }

    // atoi() yields 0 for non-numeric input; a non-positive block size would
    // divide by zero in the grid-size computation below.
    const int THREAD_PER_BLOCK = atoi(argv[1]);
    if (THREAD_PER_BLOCK <= 0) {
        fprintf(stderr, "THREAD_PER_BLOCK must be a positive integer, got '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }

    const size_t size = (size_t)N * sizeof(int);

    // Host copies: inputs a and b, GPU result c, CPU reference output.
    int *a = (int*)malloc(size);
    int *b = (int*)malloc(size);
    int *c = (int*)malloc(size);
    int *output = (int*)malloc(size);
    if (a == NULL || b == NULL || c == NULL || output == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        free(a); free(b); free(c); free(output);
        return EXIT_FAILURE;
    }
    random_ints(a, N);
    random_ints(b, N);

    // Device copies of a, b, c.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    check_cuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

    //Executes CPU
    struct timeval startc, end;
    gettimeofday(&startc, NULL);
    for (int i = 0; i < N; i++) {
        // Add as unsigned so that sums beyond INT_MAX wrap instead of being
        // signed-overflow undefined behaviour on the host.
        output[i] = (int)((unsigned int)a[i] + (unsigned int)b[i]);
    }
    gettimeofday(&end, NULL);
    const long seconds = end.tv_sec - startc.tv_sec;
    const long useconds = end.tv_usec - startc.tv_usec;
    // Floating-point arithmetic keeps the sub-millisecond part; useconds may
    // be negative when the microsecond field wrapped, which the sum absorbs.
    const double mtime = 1000.0 * seconds + useconds / 1000.0;
    printf("\n-------------------Vector addition in CPU-----------------------------\n");
    printf("Value of N= %lu TB_size= %d Time Taken= %g millisec \n",(unsigned long)N,THREAD_PER_BLOCK,mtime);
    printf("------------------------------------------------------------------------\n");

    // Copy inputs to device
    check_cuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "copy of a to device");
    check_cuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "copy of b to device");

    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Launch add() with enough blocks of THREAD_PER_BLOCK threads to cover N
    // elements (ceiling division written so the numerator cannot overflow).
    const int blocks = (N - 1) / THREAD_PER_BLOCK + 1;
    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    add<<<blocks, THREAD_PER_BLOCK>>>(d_a, d_b, d_c, N);
    // A bad launch configuration (e.g. block size above the device limit) is
    // only reported through cudaGetLastError().
    check_cuda(cudaGetLastError(), "add kernel launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    float gpu_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&gpu_ms, start, stop), "cudaEventElapsedTime");

    // Copy result back to host
    check_cuda(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "copy of c to host");
    printf("----------------------------Vector Addition in GPU----------------------\n");
    printf("Value of N= %lu TB_size= %d Time Taken= %f millisecond \n",(unsigned long)N,THREAD_PER_BLOCK,gpu_ms);
    printf("------------------------------------------------------------------------\n");

    // Compare the GPU result with the CPU reference; silent when they agree.
    long mismatches = 0;
    for (int i = 0; i < N; i++) {
        if (c[i] != output[i])
            ++mismatches;
    }
    if (mismatches != 0)
        fprintf(stderr, "GPU result differs from CPU result in %ld element(s)\n", mismatches);

    // Cleanup
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    free(a); free(b); free(c); free(output);
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    return mismatches == 0 ? 0 : EXIT_FAILURE;
}