// Problem1b

//----HEADER FILE--------------------------//
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <sys/time.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
//---------------------------------------//

// Number of elements in each vector (1048576 = 2^20).
#define N 1048576
//------------- KERNEL FUNCTION ADD-------------------//
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Any grid size is valid: each thread starts at its global index and then
// advances by the total number of launched threads until it passes n, so the
// kernel covers all n elements even when the grid is smaller than n.
//
// Parameters:
//   a, b - input vectors, each holding at least n ints (device-accessible)
//   c    - output vector, at least n ints (device-accessible)
//   n    - number of elements to process
__global__ void add(int*a,int*b,int*c,long n) {
	// Widen to 64 bits BEFORE multiplying: blockIdx.x * blockDim.x is a 32-bit
	// unsigned product and would wrap for very large grids, and storing it in
	// an int could yield a negative index that still passes the bounds check.
	long long index = (long long)blockIdx.x * blockDim.x + threadIdx.x;
	const long long stride = (long long)gridDim.x * blockDim.x;

	for (; index < n; index += stride) {
		c[index] = a[index] + b[index];
	}
}
//----------------------------------------------------//
//--------------FUNCTION TO ASSIGN RANDOM INTEGERS----//
/*
 * Fills the first x elements of a with values returned by rand().
 *
 * Parameters:
 *   a - destination buffer with room for at least x ints; a null pointer
 *       is ignored
 *   x - number of elements to fill; zero or negative fills nothing
 *
 * The sequence depends on the current rand() state, so callers that need
 * reproducible data should call srand() first.
 */
void random_ints(int* a, long x)
{
   long i;   /* same width as x, so the counter cannot overflow before reaching it */

   if (!a)
    return;

   for (i = 0; i < x; ++i)
    a[i] = rand();
}
//----------------------------------------------------//
// Abort with file/line and the CUDA error string if a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
	do {                                                                     \
		cudaError_t err_ = (call);                                           \
		if (err_ != cudaSuccess) {                                           \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(err_));                               \
			exit(EXIT_FAILURE);                                              \
		}                                                                    \
	} while (0)
#endif

/*
 * Adds two N-element vectors of rand() values once on the CPU and once on
 * the GPU, printing the elapsed time of each, then checks that both results
 * agree.
 *
 * Usage: ./exec.o THREAD_PER_BLOCK
 *   THREAD_PER_BLOCK - threads per block for the kernel launch
 *                      (the usage text suggests 128/256/512/1024)
 *
 * Returns 0 on success; 1 on a usage error, a host allocation failure, or a
 * CPU/GPU result mismatch. Any failing CUDA call terminates the program via
 * CUDA_CHECK.
 */
int main(int argc, char *argv[]) {

	if(argc < 2) {
		printf("Please enter the size of vector in the format ./exec.o THREAD_PER_BLOCK(128/256/512/1024)\n\n");
		return 1;
	}

	int THREAD_PER_BLOCK = atoi(argv[1]);	 //Change the thread block as per the problem query
	if (THREAD_PER_BLOCK <= 0) {
		// atoi yields 0 for non-numeric text; 0 would divide by zero in the
		// grid-size computation below.
		fprintf(stderr, "THREAD_PER_BLOCK must be a positive integer (got \"%s\")\n", argv[1]);
		return 1;
	}

	int *a, *b, *c, *output; //host copies (output holds the CPU reference result)
	int *d_a, *d_b, *d_c;    //device copies
	struct timeval startc, end;
	long seconds, useconds;
	double mtime;

	cudaEvent_t start, stop;
	float gpu_ms = 0.0f;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	const size_t size = (size_t)N * sizeof(int);

	// Alloc space for device copies of a, b, c
	CUDA_CHECK(cudaMalloc((void**)&d_a, size));
	CUDA_CHECK(cudaMalloc((void**)&d_b, size));
	CUDA_CHECK(cudaMalloc((void**)&d_c, size));

	// Alloc space for host copies of a, b, c and setup input values
	a = (int*)malloc(size);
	b = (int*)malloc(size);
	c = (int*)malloc(size);
	output = (int*)malloc(size);
	if (!a || !b || !c || !output) {
		fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
		free(a); free(b); free(c); free(output);
		cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
		cudaEventDestroy(start); cudaEventDestroy(stop);
		return 1;
	}
	random_ints(a, N);
	random_ints(b, N);

	//Executes CPU
	// NOTE(review): two rand() values can sum past INT_MAX; signed overflow is
	// undefined on the host. Consider bounding the inputs if that matters.
	gettimeofday(&startc, NULL);
	for (int i = 0; i < N; i++)
	{
		output[i] = a[i] + b[i];
	}
	gettimeofday(&end, NULL);
	seconds = end.tv_sec - startc.tv_sec;
	useconds = end.tv_usec - startc.tv_usec;
	// Floating-point division keeps the sub-millisecond part of the interval.
	mtime = 1000.0 * seconds + useconds / 1000.0;
	printf("\n-------------------Vector addition in CPU-----------------------------\n");
	// N is an int constant; cast so the argument matches the %lu conversion.
	printf("Value of N= %lu  TB_size= %d     Time Taken= %g millisec \n",(unsigned long)N,THREAD_PER_BLOCK,mtime);
	printf("------------------------------------------------------------------------\n");

	// Copy inputs to device
	CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

	// Ceil-divide in 64 bits so a very large THREAD_PER_BLOCK cannot overflow
	// the intermediate sum.
	const int blocks = (int)(((long long)N + THREAD_PER_BLOCK - 1) / THREAD_PER_BLOCK);

	CUDA_CHECK(cudaEventRecord(start, 0));
	// Launch add() kernel on GPU with enough blocks to cover all N elements
	add<<<blocks, THREAD_PER_BLOCK>>>(d_a, d_b, d_c, N);
	// A kernel launch returns no status; an invalid configuration (e.g. too
	// many threads per block) is only reported through cudaGetLastError().
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&gpu_ms, start, stop));

	// Copy result back to host
	CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
	printf("----------------------------Vector Addition in GPU----------------------\n");
	printf("Value of N= %lu  TB_size= %d  Time Taken= %f millisecond \n",(unsigned long)N,THREAD_PER_BLOCK,gpu_ms);
	printf("------------------------------------------------------------------------\n");

	// Compare the GPU result with the CPU reference computed above.
	long mismatches = 0;
	for (int i = 0; i < N; i++) {
		if (c[i] != output[i])
			++mismatches;
	}
	if (mismatches != 0)
		fprintf(stderr, "GPU result differs from CPU result in %ld element(s)\n", mismatches);

	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));

	// Cleanup
	free(a); free(b); free(c); free(output);
	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));
	CUDA_CHECK(cudaFree(d_c));

	return mismatches == 0 ? 0 : 1;
}