diff --git a/README.md b/README.md index d63a6a1..6b3b89e 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,41 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Guanlin Huang + * [LinkedIn](https://www.linkedin.com/in/guanlin-huang-4406668502/), [personal website](virulentkid.github.io/personal_web/index.html) +* Tested on: Windows 11, i9-10900K @ 4.9GHz 32GB, RTX3080 10GB; Compute Capability: 8.6 +# Screenshots +## 50000 boids with naive, scattered and coherent method +![](images/naive50000.gif) +![](images/sca50000.gif) +![](images/coh50000.gif) +### Performance Analysis -### (TODO: Your README) +The average FPS of the first 10 second is measured; reasonable amount of waiting is performed to minimize thermal throttling. +The results show that the FPS drops significantly as boid number increases; the difference between scattered and coherent memory method is more noticable as the number of boids increases. +However, no significant differences among different block sizes at the same boid size. +![](images/fps.png) +![](images/fps2.png) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Questions + +* For each implementation, how does changing the number of boids affect +performance? Why do you think this is? +The FPS drops significantly as boid number increases. It is because at each tick, the calculation needed to get the change in velocity increases as boid number increases. + +* For each implementation, how does changing the block count and block size +affect performance? Why do you think this is? + No significant differences among different block sizes at the same boid size. 
It could be that we haven't hit the throttling point, or hardware-level optimization handles the different block sizes similarly. 
+
+* For the coherent uniform grid: did you experience any performance improvements
+with the more coherent uniform grid? Was this the outcome you expected?
+Why or why not?
+Yes. The difference between the scattered and coherent memory methods is more noticeable as the number of boids increases.
+As the number of boids increases, the time complexity for the scattered method increases whereas the coherent method stays relatively constant.
+
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance?
+Why or why not? Be careful: it is insufficient (and possibly incorrect) to say
+that 27-cell is slower simply because there are more cells to check!
+I did the grid optimization to avoid hard coding. But if I were to guess, the 27-cell might be faster in cases where the number of boids is high enough that
+checking only 8 cells would result in more complicated calculations overall. 
\ No newline at end of file diff --git a/images/coh50000.gif b/images/coh50000.gif new file mode 100644 index 0000000..c99ef1d Binary files /dev/null and b/images/coh50000.gif differ diff --git a/images/fps.png b/images/fps.png new file mode 100644 index 0000000..d3efc77 Binary files /dev/null and b/images/fps.png differ diff --git a/images/fps2.png b/images/fps2.png new file mode 100644 index 0000000..4a6eef3 Binary files /dev/null and b/images/fps2.png differ diff --git a/images/naive50000.gif b/images/naive50000.gif new file mode 100644 index 0000000..95df360 Binary files /dev/null and b/images/naive50000.gif differ diff --git a/images/sca50000.gif b/images/sca50000.gif new file mode 100644 index 0000000..7680392 Binary files /dev/null and b/images/sca50000.gif differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..1319a9a 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -5,6 +5,7 @@ #include #include "utilityCore.hpp" #include "kernel.h" +#include // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax @@ -37,7 +38,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 64 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -85,7 +86,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. - +glm::vec3* dev_posHolder; +glm::vec3* dev_vel1Holder; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation int gridCellCount; @@ -169,6 +171,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_posHolder, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_posHolder failed!"); + + cudaMalloc((void**)&dev_vel1Holder, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel1Holder failed!"); + cudaDeviceSynchronize(); } @@ -210,8 +230,8 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO <<>>(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO <<>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale); checkCUDAErrorWithLine("copyBoidsToVBO failed!"); @@ -222,7 +242,6 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) /****************** * stepSimulation * ******************/ - /** * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. 
* __device__ code can be called from a __global__ context @@ -233,7 +252,53 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves // Rule 2: boids try to stay a distance d away from each other // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + //accumulate 3 rules data into one loop + for (int boid = 0; boid < N; boid++) { + if (boid != iSelf) { + float distance = glm::distance(pos[boid], pos[iSelf]); + + if (distance < rule1Distance) { + perceived_center += pos[boid]; + r1Neighbors++; + } + + if (distance < rule2Distance) { + c -= (pos[boid] - pos[iSelf]); + } + + if (distance < rule3Distance) { + perceived_velocity += vel[boid]; + r3Neighbors++; + } + } + } + + //combine three rules' vel together + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[iSelf]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel[iSelf]) > maxSpeed) { + return glm::normalize(new_vel) * maxSpeed; + } + else { + return new_vel + vel[iSelf]; + } } /** @@ -245,6 +310,9 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? 
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + vel2[index] = computeVelocityChange(N, index, pos, vel1); + } /** @@ -289,10 +357,20 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < N) { + glm::vec3 gridId3d = (pos[index] - gridMin) * inverseCellWidth; + int iX = glm::floor(gridId3d.x); + int iY = glm::floor(gridId3d.y); + int iZ = glm::floor(gridId3d.z); + indices[index] = index; + gridIndices[index] = gridIndex3Dto1D(iX, iY, iZ, gridResolution); + } + } // LOOK-2.1 Consider how this could be useful for indicating that a cell -// does not enclose any boids +// does not enclose any boid __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index < N) { @@ -306,6 +384,16 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index == 0) { + gridCellStartIndices[particleGridIndices[index]] = 0; + } + else if (particleGridIndices[index] != particleGridIndices[index - 1]) { + gridCellEndIndices[particleGridIndices[index - 1]] = index - 1; + gridCellStartIndices[particleGridIndices[index]] = index; + } + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +410,92 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + + //grid-looping optimization vars + float max_distance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + glm::vec3 max_dist_vec = glm::vec3(max_distance); + glm::vec3 min_gridId3d = (pos[index] - max_dist_vec - gridMin ) * inverseCellWidth; + glm::vec3 max_gridId3d = (pos[index] + max_dist_vec - gridMin ) * inverseCellWidth; + int minX = glm::floor(min_gridId3d.x); + int minY = glm::floor(min_gridId3d.y); + int minZ = glm::floor(min_gridId3d.z); + int maxX = glm::floor(max_gridId3d.x); + int maxY = glm::floor(max_gridId3d.y); + int maxZ = glm::floor(max_gridId3d.z); + + //cumpute vel changes + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + for (int x = minX; x <= maxX; x++) { + for (int y = minY; y <= maxY; y++) { + for (int z = minZ; z <= maxZ; z++) { + bool isOutOfBound = (x < 0 || y < 0 || z < 0 + || x >= gridResolution || y >= gridResolution || z >= gridResolution); + if (isOutOfBound) //skip this iteration if out of boundary + continue; + + int neighborId1d = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[neighborId1d]; + int endIndex = gridCellEndIndices[neighborId1d]; + + if (startIndex < 0 || endIndex < 0) { // skip this iteration if no particle + continue; + } + + for (int i = startIndex; i <= endIndex; i++) + { + int neighborId = particleArrayIndices[i]; + if (index == neighborId) + continue; + float distance = glm::distance(pos[index], pos[neighborId]); + if (distance < rule1Distance) + { + perceived_center += pos[neighborId]; + r1Neighbors++; + } + + if (distance < rule2Distance) + { + c -= (pos[neighborId] - pos[index]); + } + + if (distance < rule3Distance) + { + perceived_velocity += vel1[neighborId]; + r3Neighbors++; + } + } + } 
+ } + } + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[index]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel1[index]) > maxSpeed) { + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } + else { + vel2[index] = new_vel + vel1[index]; + } + } + } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +515,92 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + //grid-looping optimization vars + float max_distance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + glm::vec3 max_dist_vec = glm::vec3(max_distance); + glm::vec3 min_gridId3d = (pos[index] - max_dist_vec - gridMin) * inverseCellWidth; + glm::vec3 max_gridId3d = (pos[index] + max_dist_vec - gridMin) * inverseCellWidth; + int minX = glm::floor(min_gridId3d.x); + int minY = glm::floor(min_gridId3d.y); + int minZ = glm::floor(min_gridId3d.z); + int maxX = glm::floor(max_gridId3d.x); + int maxY = glm::floor(max_gridId3d.y); + int maxZ = glm::floor(max_gridId3d.z); + + //cumpute vel changes + glm::vec3 perceived_center = glm::vec3(0.f); + glm::vec3 c = glm::vec3(0.f); + glm::vec3 perceived_velocity = glm::vec3(0.f); + int r1Neighbors = 0; + int r3Neighbors = 0; + + for (int x = minX; x <= maxX; x++) { + for (int y = minY; y <= maxY; y++) { + for (int z = minZ; z <= maxZ; z++) { + bool isOutOfBound = (x < 0 || y < 0 || z < 0 + || x >= gridResolution || y >= gridResolution || z >= gridResolution); + if (isOutOfBound) //skip this iteration if out of boundary + 
continue; + + int neighborId1d = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[neighborId1d]; + int endIndex = gridCellEndIndices[neighborId1d]; + + if (startIndex < 0 || endIndex < 0) { // skip this iteration if no particle + continue; + } + + for (int i = startIndex; i <= endIndex; i++) + { + int neighborId = i; + if (index == neighborId) + continue; + float distance = glm::distance(pos[index], pos[neighborId]); + if (distance < rule1Distance) + { + perceived_center += pos[neighborId]; + r1Neighbors++; + } + + if (distance < rule2Distance) + { + c -= (pos[neighborId] - pos[index]); + } + + if (distance < rule3Distance) + { + perceived_velocity += vel1[neighborId]; + r3Neighbors++; + } + } + } + } + } + glm::vec3 new_vel = glm::vec3(0.f, 0.f, 0.f); + if (r1Neighbors != 0) { + perceived_center /= r1Neighbors; + new_vel += (perceived_center - pos[index]) * rule1Scale; + } + + new_vel += c * rule2Scale; + + if (r3Neighbors != 0) { + perceived_velocity /= r3Neighbors; + new_vel += perceived_velocity * rule3Scale; + } + + if (glm::length(new_vel + vel1[index]) > maxSpeed) { + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } + else { + vel2[index] = new_vel + vel1[index]; + } + } + } /** @@ -349,39 +609,109 @@ __global__ void kernUpdateVelNeighborSearchCoherent( void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
// TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdateVelocityBruteForce<<>> (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("naive simulation failed!"); + cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); + kernUpdatePos <<>> (numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } + void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + // TODO-2.1 + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_block_num((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + // Uniform Grid Neighbor search using Thrust sort. + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed + dev_vel1 = dev_vel2; +} +__global__ void kernMakeCopyOfPosAndVel(int N, int* indices, + glm::vec3* pos, glm::vec3* vel1, + glm::vec3* posHolder, glm::vec3* vel1Holder) +{ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + posHolder[index] = pos[indices[index]]; + vel1Holder[index] = vel1[indices[index]]; + } } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. 
- // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_block_num((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + // - Label each particle with its array index as well as its grid index. + // Use 2x width grids + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << < fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + // - Perform velocity updates using neighbor search + kernMakeCopyOfPosAndVel << > > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_posHolder, dev_vel1Holder); + cudaMemcpy(dev_pos, dev_posHolder, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + cudaMemcpy(dev_vel1, dev_vel1Holder, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + kernUpdateVelNeighborSearchCoherent << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos, dev_vel1, dev_vel2); + + // - Update positions + kernUpdatePos << < fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + dev_vel1 = dev_vel2; } void Boids::endSimulation() { @@ -390,6 +720,13 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. 
+ cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_vel1Holder); + cudaFree(dev_posHolder); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..fafcc7b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 50000; const float DT = 0.2f; /**