// Problem 1c

//----HEADER FILE--------------------------//
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
//---------------------------------------//

#define THREAD_PER_BLOCK 512

//------------- KERNEL FUNCTION ADD-------------------//
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and then advances by the total number of launched threads, so every
 * element is processed even when the grid holds fewer than n threads.
 *
 * a, b : input arrays of at least n ints; dereferenced inside the kernel,
 *        so they must be valid addresses for device code
 * c    : output array of at least n ints
 * n    : element count; nothing is read or written when n <= 0
 */
__global__ void add(int*a,int*b,int*c,long n) {
	// Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
	// in 32-bit unsigned arithmetic and wraps once the index passes 2^32.
	const long long stride = (long long)gridDim.x * blockDim.x;
	for (long long index = (long long)blockIdx.x * blockDim.x + threadIdx.x;
	     index < n;
	     index += stride) {
		c[index] = a[index] + b[index];
	}
}
//----------------------------------------------------//

//--------------FUNCTION TO ASSIGN RANDOM INTEGERS----//
/*
 * Fills the first x elements of a with successive values returned by rand().
 *
 * a : destination buffer with room for at least x ints
 * x : number of elements to write; nothing is written when x <= 0
 *
 * The generated sequence depends on the current rand() state; this function
 * does not call srand().
 */
void random_ints(int* a, long x)
{
   // The counter has the same type as x, so it cannot overflow before
   // reaching a bound larger than INT_MAX.
   for (long i = 0; i < x; ++i)
    a[i] = rand();
}
//-----------------------------------------------------//

// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Adds two vectors of N random ints on the CPU and on the GPU and prints the
 * elapsed time of each.
 *
 * argv[1] : vector length N, a positive decimal integer
 *
 * The CPU time covers only the addition loop (gettimeofday). The GPU time
 * covers both host-to-device copies, the kernel, and the device-to-host copy
 * (CUDA events). After timing, the GPU result is compared with the CPU result.
 *
 * Returns 0 on success (and after printing the usage message), EXIT_FAILURE
 * on an invalid N, an allocation failure, or a CPU/GPU result mismatch.
 * CUDA runtime failures terminate the program through check_cuda().
 */
int main(int argc, char *argv[]) {

	if(argc < 2) {
		printf("Please enter the size of vector in the format ./exec.o N(1024/10240/102400)\n");
		return 0;
	}

	// strtol (unlike atoi) lets us reject non-numeric and trailing garbage input
	char *parse_end = NULL;
	const long N = strtol(argv[1], &parse_end, 10);
	if (parse_end == argv[1] || *parse_end != '\0' || N <= 0) {
		fprintf(stderr, "Invalid vector size '%s': expected a positive integer\n", argv[1]);
		return EXIT_FAILURE;
	}

	// Byte count is held in size_t; reject N whose byte count would wrap
	if ((size_t)N > (size_t)-1 / sizeof(int)) {
		fprintf(stderr, "Vector size %ld is too large\n", N);
		return EXIT_FAILURE;
	}
	const size_t size = (size_t)N * sizeof(int);

	// Ceil-division written so that N + THREAD_PER_BLOCK cannot overflow
	const long blocks = N / THREAD_PER_BLOCK + (N % THREAD_PER_BLOCK != 0 ? 1 : 0);
	const long max_grid_x = 2147483647L; // upper limit of gridDim.x (2^31 - 1)
	if (blocks > max_grid_x) {
		fprintf(stderr, "Vector size %ld needs more blocks than a 1-D grid allows\n", N);
		return EXIT_FAILURE;
	}

	int *a, *b, *c, *output; // host copies; output holds the CPU reference result
	int *d_a = NULL, *d_b = NULL, *d_c = NULL; // device copies

	struct timeval startc, end; // gettimeofday structures for the CPU timing
	long seconds, useconds;
	double mtime;
	cudaEvent_t start, stop;
	float gpu_ms = 0.0f;

	// Create the CUDA events used as GPU timers
	check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	// Alloc space for device copies of a, b, c
	check_cuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
	check_cuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
	check_cuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

	// Alloc space for host copies of a, b, c and the CPU reference
	a = (int*)malloc(size);
	b = (int*)malloc(size);
	c = (int*)malloc(size);
	output = (int*)malloc(size);
	if (a == NULL || b == NULL || c == NULL || output == NULL) {
		fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
		free(a); free(b); free(c); free(output);
		cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
		cudaEventDestroy(start); cudaEventDestroy(stop);
		return EXIT_FAILURE;
	}

	// Setup input values
	random_ints(a, N);
	random_ints(b, N);

	// Executes CPU
	gettimeofday(&startc, NULL);
	for (long i = 0; i < N; i++) {
		// Two rand() values can sum past INT_MAX; adding as unsigned wraps
		// instead of invoking signed-overflow undefined behaviour.
		output[i] = (int)((unsigned int)a[i] + (unsigned int)b[i]);
	}
	gettimeofday(&end, NULL);
	seconds = end.tv_sec - startc.tv_sec;
	useconds = end.tv_usec - startc.tv_usec;
	// Floating-point division keeps the sub-millisecond part of the elapsed time
	mtime = 1000.0 * seconds + useconds / 1000.0;
	printf("\n-------------------Vector addition in CPU-----------------------------\n");
	printf("Value of N= %ld      Time Taken= %g millisec \n", N, mtime);
	printf("------------------------------------------------------------------------\n");

	// Start timing: covers the host-to-device copies, the kernel and the copy back
	check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

	// Copy inputs to device
	check_cuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
	check_cuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

	// Launch add() kernel on GPU with ceil(N / THREAD_PER_BLOCK) blocks
	add<<<(unsigned int)blocks, THREAD_PER_BLOCK>>>(d_a, d_b, d_c, N);
	// A launch does not return a status; configuration errors surface here
	check_cuda(cudaGetLastError(), "add kernel launch");

	// Copy result back to host (also reports errors raised while the kernel ran)
	check_cuda(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

	// Stop timing and wait until the stop event has actually been reached
	check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	check_cuda(cudaEventElapsedTime(&gpu_ms, start, stop), "cudaEventElapsedTime");
	printf("----------------------------Vector Addition in GPU----------------------\n");
	printf("Value of N= %ld      Time Taken= %f millisecond \n", N, gpu_ms);
	printf("------------------------------------------------------------------------\n");

	// Verify the GPU result against the CPU reference; report only on mismatch
	int exit_code = 0;
	for (long i = 0; i < N; i++) {
		if (c[i] != output[i]) {
			fprintf(stderr, "Result mismatch at index %ld: CPU=%d GPU=%d\n", i, output[i], c[i]);
			exit_code = EXIT_FAILURE;
			break;
		}
	}

	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	// Cleanup
	free(a); free(b); free(c); free(output);
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	return exit_code;
}