diff --git a/README.md b/README.md index d63a6a1..6b3b89e 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,41 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Guanlin Huang + * [LinkedIn](https://www.linkedin.com/in/guanlin-huang-4406668502/), [personal website](virulentkid.github.io/personal_web/index.html) +* Tested on: Windows 11, i9-10900K @ 4.9GHz 32GB, RTX3080 10GB; Compute Capability: 8.6 +# Screenshots +## 50000 boids with naive, scattered and coherent method +![](images/naive50000.gif) +![](images/sca50000.gif) +![](images/coh50000.gif) +### Performance Analysis -### (TODO: Your README) +The average FPS of the first 10 second is measured; reasonable amount of waiting is performed to minimize thermal throttling. +The results show that the FPS drops significantly as boid number increases; the difference between scattered and coherent memory method is more noticable as the number of boids increases. +However, no significant differences among different block sizes at the same boid size. +![](images/fps.png) +![](images/fps2.png) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Questions + +* For each implementation, how does changing the number of boids affect +performance? Why do you think this is? +The FPS drops significantly as boid number increases. It is because at each tick, the calculation needed to get the change in velocity increases as boid number increases. + +* For each implementation, how does changing the block count and block size +affect performance? Why do you think this is? + No significant differences among different block sizes at the same boid size. 
It could be that we haven't hit the throttling point, or hardware-level optimization handles the different block sizes similarly. 
+
+* For the coherent uniform grid: did you experience any performance improvements
+with the more coherent uniform grid? Was this the outcome you expected?
+Why or why not?
+Yes. The difference between the scattered and coherent memory methods is more noticeable as the number of boids increases.
+As the number of boids increases, the time complexity for the scattered method increases whereas the coherent method stays relatively constant.
+
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance?
+Why or why not? Be careful: it is insufficient (and possibly incorrect) to say
+that 27-cell is slower simply because there are more cells to check!
+I did the grid optimization to avoid hard coding. But if I were to guess, the 27-cell might be faster in cases where the number of boids is high enough that
+checking only 8 cells would result in more complicated calculations overall. 
\ No newline at end of file diff --git a/images/coh50000.gif b/images/coh50000.gif new file mode 100644 index 0000000..c99ef1d Binary files /dev/null and b/images/coh50000.gif differ diff --git a/images/fps.png b/images/fps.png new file mode 100644 index 0000000..d3efc77 Binary files /dev/null and b/images/fps.png differ diff --git a/images/fps2.png b/images/fps2.png new file mode 100644 index 0000000..4a6eef3 Binary files /dev/null and b/images/fps2.png differ diff --git a/images/naive50000.gif b/images/naive50000.gif new file mode 100644 index 0000000..95df360 Binary files /dev/null and b/images/naive50000.gif differ diff --git a/images/sca50000.gif b/images/sca50000.gif new file mode 100644 index 0000000..7680392 Binary files /dev/null and b/images/sca50000.gif differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..1319a9a 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -5,6 +5,7 @@ #include #include "utilityCore.hpp" #include "kernel.h" +#include // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax @@ -37,7 +38,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 64 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -85,7 +86,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. - +glm::vec3* dev_posHolder; +glm::vec3* dev_vel1Holder; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation int gridCellCount; @@ -169,6 +171,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_posHolder, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_posHolder failed!"); + + cudaMalloc((void**)&dev_vel1Holder, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel1Holder failed!"); + cudaDeviceSynchronize(); } @@ -210,8 +230,8 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO <<>>(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO <<>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale); checkCUDAErrorWithLine("copyBoidsToVBO failed!"); @@ -222,7 +242,6 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) /****************** * stepSimulation * ******************/ - /** * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. 
* __device__ code can be called from a __global__ context @@ -233,7 +252,53 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves // Rule 2: boids try to stay a distance d away from each other // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + //accumulate 3 rules data into one loop + for (int boid = 0; boid < N; boid++) { + if (boid != iSelf) { + float distance = glm::distance(pos[boid], pos[iSelf]); + + if (distance < rule1Distance) { + perceived_center += pos[boid]; + r1Neighbors++; + } + + if (distance < rule2Distance) { + c -= (pos[boid] - pos[iSelf]); + } + + if (distance < rule3Distance) { + perceived_velocity += vel[boid]; + r3Neighbors++; + } + } + } + + //combine three rules' vel together + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[iSelf]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel[iSelf]) > maxSpeed) { + return glm::normalize(new_vel) * maxSpeed; + } + else { + return new_vel + vel[iSelf]; + } } /** @@ -245,6 +310,9 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? 
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + vel2[index] = computeVelocityChange(N, index, pos, vel1); + } /** @@ -289,10 +357,20 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < N) { + glm::vec3 gridId3d = (pos[index] - gridMin) * inverseCellWidth; + int iX = glm::floor(gridId3d.x); + int iY = glm::floor(gridId3d.y); + int iZ = glm::floor(gridId3d.z); + indices[index] = index; + gridIndices[index] = gridIndex3Dto1D(iX, iY, iZ, gridResolution); + } + } // LOOK-2.1 Consider how this could be useful for indicating that a cell -// does not enclose any boids +// does not enclose any boid __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index < N) { @@ -306,6 +384,16 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index == 0) { + gridCellStartIndices[particleGridIndices[index]] = 0; + } + else if (particleGridIndices[index] != particleGridIndices[index - 1]) { + gridCellEndIndices[particleGridIndices[index - 1]] = index - 1; + gridCellStartIndices[particleGridIndices[index]] = index; + } + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +410,92 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + + //grid-looping optimization vars + float max_distance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + glm::vec3 max_dist_vec = glm::vec3(max_distance); + glm::vec3 min_gridId3d = (pos[index] - max_dist_vec - gridMin ) * inverseCellWidth; + glm::vec3 max_gridId3d = (pos[index] + max_dist_vec - gridMin ) * inverseCellWidth; + int minX = glm::floor(min_gridId3d.x); + int minY = glm::floor(min_gridId3d.y); + int minZ = glm::floor(min_gridId3d.z); + int maxX = glm::floor(max_gridId3d.x); + int maxY = glm::floor(max_gridId3d.y); + int maxZ = glm::floor(max_gridId3d.z); + + //cumpute vel changes + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + for (int x = minX; x <= maxX; x++) { + for (int y = minY; y <= maxY; y++) { + for (int z = minZ; z <= maxZ; z++) { + bool isOutOfBound = (x < 0 || y < 0 || z < 0 + || x >= gridResolution || y >= gridResolution || z >= gridResolution); + if (isOutOfBound) //skip this iteration if out of boundary + continue; + + int neighborId1d = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[neighborId1d]; + int endIndex = gridCellEndIndices[neighborId1d]; + + if (startIndex < 0 || endIndex < 0) { // skip this iteration if no particle + continue; + } + + for (int i = startIndex; i <= endIndex; i++) + { + int neighborId = particleArrayIndices[i]; + if (index == neighborId) + continue; + float distance = glm::distance(pos[index], pos[neighborId]); + if (distance < rule1Distance) + { + perceived_center += pos[neighborId]; + r1Neighbors++; + } + + if (distance < rule2Distance) + { + c -= (pos[neighborId] - pos[index]); + } + + if (distance < rule3Distance) + { + perceived_velocity += vel1[neighborId]; + r3Neighbors++; + } + } + } 
+ } + } + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[index]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel1[index]) > maxSpeed) { + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } + else { + vel2[index] = new_vel + vel1[index]; + } + } + } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +515,92 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + //grid-looping optimization vars + float max_distance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + glm::vec3 max_dist_vec = glm::vec3(max_distance); + glm::vec3 min_gridId3d = (pos[index] - max_dist_vec - gridMin) * inverseCellWidth; + glm::vec3 max_gridId3d = (pos[index] + max_dist_vec - gridMin) * inverseCellWidth; + int minX = glm::floor(min_gridId3d.x); + int minY = glm::floor(min_gridId3d.y); + int minZ = glm::floor(min_gridId3d.z); + int maxX = glm::floor(max_gridId3d.x); + int maxY = glm::floor(max_gridId3d.y); + int maxZ = glm::floor(max_gridId3d.z); + + //cumpute vel changes + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + for (int x = minX; x <= maxX; x++) { + for (int y = minY; y <= maxY; y++) { + for (int z = minZ; z <= maxZ; z++) { + bool isOutOfBound = (x < 0 || y < 0 || z < 0 + || x >= gridResolution || y >= gridResolution || z >= gridResolution); + if (isOutOfBound) //skip this iteration if out of boundary + 
continue; + + int neighborId1d = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[neighborId1d]; + int endIndex = gridCellEndIndices[neighborId1d]; + + if (startIndex < 0 || endIndex < 0) { // skip this iteration if no particle + continue; + } + + for (int i = startIndex; i <= endIndex; i++) + { + int neighborId = i; + if (index == neighborId) + continue; + float distance = glm::distance(pos[index], pos[neighborId]); + if (distance < rule1Distance) + { + perceived_center += pos[neighborId]; + r1Neighbors++; + } + + if (distance < rule2Distance) + { + c -= (pos[neighborId] - pos[index]); + } + + if (distance < rule3Distance) + { + perceived_velocity += vel1[neighborId]; + r3Neighbors++; + } + } + } + } + } + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[index]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel1[index]) > maxSpeed) { + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } + else { + vel2[index] = new_vel + vel1[index]; + } + } + } /** @@ -349,39 +609,109 @@ __global__ void kernUpdateVelNeighborSearchCoherent( void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
// TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdateVelocityBruteForce<<>> (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("naive simulation failed!"); + cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); + kernUpdatePos <<>> (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } + void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + // TODO-2.1 + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_block_num((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + // Uniform Grid Neighbor search using Thrust sort. + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed + dev_vel1 = dev_vel2; +} +__global__ void kernMakeCopyOfPosAndVel(int N, int* indices, + glm::vec3* pos, glm::vec3* vel1, + glm::vec3* posHolder, glm::vec3* vel1Holder) +{ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + posHolder[index] = pos[indices[index]]; + vel1Holder[index] = vel1[indices[index]]; + } } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. 
- // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_block_num((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + // - Label each particle with its array index as well as its grid index. + // Use 2x width grids + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + // - Perform velocity updates using neighbor search + kernMakeCopyOfPosAndVel << > > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_posHolder, dev_vel1Holder); + cudaMemcpy(dev_pos, dev_posHolder, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + cudaMemcpy(dev_vel1, dev_vel1Holder, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + kernUpdateVelNeighborSearchCoherent << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos, dev_vel1, dev_vel2); + + // - Update positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + dev_vel1 = dev_vel2; } void Boids::endSimulation() { @@ -390,6 +720,13 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. 
+ cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_vel1Holder); + cudaFree(dev_posHolder); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..fafcc7b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 50000; const float DT = 0.2f; /**