diff --git a/README.md b/README.md
index d63a6a1..b0f5f9d 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,49 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zhuohao Lin
+  * [LinkedIn](https://www.linkedin.com/in/zhuohao-lin-960b54194/)
+* Tested on: Windows 10, i7-10875H @ 2.30GHz 16GB, NVIDIA GeForce RTX 2060 6GB (personal machine)
 
-### (TODO: Your README)
+# Features
+I implemented three ways of finding the neighbors of each boid for the velocity computation in a flocking (boids) simulation.
+* Naive method: check every other boid in the simulation to find the neighbors of a boid.
+* Scattered uniform grid: bin the boids into the cells of a uniform grid based on their positions, then search only the neighboring cells to find neighbor boids.
+* Coherent uniform grid: use the uniform grid method, and additionally rearrange the position and velocity data so that they are contiguous in memory.
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Screenshots
+![](images/screenshot1.PNG)
+![](images/simulation2.gif)
+![](images/simulation3.gif)
+
+# Performance Analysis
+
+### Figure 1
+![](images/figure1.PNG)
+### Figure 2
+![](images/figure2.PNG)
+### Figure 3
+![](images/figure3.PNG)
+
+### *For each implementation, how does changing the number of boids affect performance? Why do you think this is?*
+
+The performance drops as the number of boids increases, as we can see in Figures 1 and 2. This is because the number of threads required for the computation exceeds the thread limit of each block. As the number of boids increases, more and more threads have to wait for computation.
+
+<br/>
+
+### *For each implementation, how does changing the block count and block size affect performance? Why do you think this is?*
+
+At first, the performance improves as the block size increases. However, once the block size reaches a threshold, the performance no longer improves and stays at the same level. The initial improvement comes from having more threads running in parallel as the block size increases. The performance stops improving after the block size reaches the threshold because the computing power of the GPU is limited.
+
+<br/>
+
+### *For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?*
+
+As the number of boids becomes larger, the performance improvement from the coherent uniform grid becomes more significant. This is expected, since the GPU accesses global memory more frequently as the number of boids increases, and that gradually becomes a large cost. With the coherent uniform grid, the GPU needs fewer global memory accesses. Therefore, it is faster.
+
+<br/>
+
+### *Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not?*
+
+Changing the cell width and checking 27 vs 8 neighboring cells affects performance. Checking 27 cells is faster than checking 8 neighboring cells, and the difference becomes significant when the number of boids is large enough. This is because we are actually checking fewer neighbor boids when checking 27 cells.<br/>
+The cell width we use for checking 27 cells is (1 * maxRuleDistance), thus the total checking space is (27 * maxRuleDistance<sup>3</sup>). The cell width we use for checking 8 cells is (2 * maxRuleDistance), thus the total checking space is (64 * maxRuleDistance<sup>3</sup>). With fewer boids to check, we get a performance improvement.
diff --git a/images/figure1.PNG b/images/figure1.PNG
new file mode 100644
index 0000000..593601b
Binary files /dev/null and b/images/figure1.PNG differ
diff --git a/images/figure2.PNG b/images/figure2.PNG
new file mode 100644
index 0000000..d372645
Binary files /dev/null and b/images/figure2.PNG differ
diff --git a/images/figure3.PNG b/images/figure3.PNG
new file mode 100644
index 0000000..60e085c
Binary files /dev/null and b/images/figure3.PNG differ
diff --git a/images/screenshot1.PNG b/images/screenshot1.PNG
new file mode 100644
index 0000000..e114469
Binary files /dev/null and b/images/screenshot1.PNG differ
diff --git a/images/simulation1.gif b/images/simulation1.gif
new file mode 100644
index 0000000..a6d5f12
Binary files /dev/null and b/images/simulation1.gif differ
diff --git a/images/simulation2.gif b/images/simulation2.gif
new file mode 100644
index 0000000..b10d903
Binary files /dev/null and b/images/simulation2.gif differ
diff --git a/images/simulation3.gif b/images/simulation3.gif
new file mode 100644
index 0000000..c63c2a5
Binary files /dev/null and b/images/simulation3.gif differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..22928d9 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
 #include <glm/glm.hpp>
 #include "utilityCore.hpp"
 #include "kernel.h"
+#include <device_launch_parameters.h>
 
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
@@ -17,6 +18,8 @@
 
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
+#define USE_DOUBLE_CELL_WIDTH 1
+
 /**
 * Check for CUDA errors; print and exit if there was a problem.
 */
@@ -85,6 +88,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_reshuffledPos;
+glm::vec3 *dev_reshuffledVel;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -157,7 +162,11 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
+#if USE_DOUBLE_CELL_WIDTH
   gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+#else
+  gridCellWidth = std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+#endif
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -169,7 +178,20 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+
+
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+
+  cudaMalloc((void**)&dev_reshuffledPos, N * sizeof(glm::vec3));
+  cudaMalloc((void**)&dev_reshuffledVel, N * sizeof(glm::vec3));
+
   cudaDeviceSynchronize();
+
 }
 
 
@@ -230,10 +252,53 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
+  glm::vec3 velChange(0.0f);  // sum of the three rule contributions; zeroed explicitly
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+  glm::vec3 perceivedCenter(0.0f);  // do not rely on GLM's default constructor zeroing the vector
+  int numNeighbors = 0;
+  for (int i = 0; i < N; ++i)
+  {
+    if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule1Distance)
+    {
+      perceivedCenter += pos[i];
+      numNeighbors++;
+    }
+  }
+  if (numNeighbors > 0)  // without neighbors there is no center to steer towards
+  {
+    perceivedCenter /= numNeighbors;
+    velChange += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+
   // Rule 2: boids try to stay a distance d away from each other
+  glm::vec3 c(0.0f);  // accumulated offsets pointing away from boids closer than rule2Distance
+  for (int i = 0; i < N; ++i)
+  {
+    if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule2Distance)
+    {
+      c -= (pos[i] - pos[iSelf]);
+    }
+  }
+  velChange += c * rule2Scale;
+
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceivedVel(0.0f);  // sum, then mean, of the velocities within rule3Distance
+  numNeighbors = 0;
+  for (int i = 0; i < N; ++i)
+  {
+    if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule3Distance)
+    {
+      perceivedVel += vel[i];
+      numNeighbors++;
+    }
+  }
+  if (numNeighbors > 0)
+  {
+    perceivedVel /= numNeighbors;
+    velChange += perceivedVel * rule3Scale;
+  }
+
+  return velChange;
 }
 
 /**
@@ -245,6 +310,15 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int currIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (currIndex >= N) return;  // the grid is rounded up to whole blocks; surplus threads must not touch the buffers
+  glm::vec3 newVel = computeVelocityChange(N, currIndex, pos, vel1) + vel1[currIndex];
+  float newSpeed = glm::length(newVel);
+  if (newSpeed > maxSpeed)
+  {
+    newVel = newVel / newSpeed * maxSpeed;  // keep the direction, cap the magnitude at maxSpeed
+  }
+  vel2[currIndex] = newVel;  // vel1 stays untouched: other threads still read it for their neighbors
 }
 
 /**
@@ -289,6 +363,13 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;
+  
+  indices[index] = index;
+  
+  glm::vec3 gridIndex3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+  gridIndices[index] = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +387,31 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;
+  
+  int currGridIndex = particleGridIndices[index];
+  
+  if (index > 0)
+  {
+    int lastGridIndex = particleGridIndices[index - 1];
+    if (lastGridIndex != currGridIndex)
+    {
+      gridCellStartIndices[currGridIndex] = index;
+      gridCellEndIndices[lastGridIndex] = index - 1;
+    }
+  }
+  // First one must be the start
+  else
+  {
+    gridCellStartIndices[currGridIndex] = index;
+  }
+  // Last one must be the end
+  if (index == N - 1)
+  {
+    gridCellEndIndices[currGridIndex] = index;
+  }  
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +428,97 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;
+  const glm::vec3 selfPos = pos[index];  // read this boid's position once
+  glm::vec3 gridIndex3D = (selfPos - gridMin) * inverseCellWidth;
+
+  // relativeNeighborGrid is the direction (-1 or 1) in which grid may contain neighbors of the boid
+  glm::ivec3 relativeNeighborGrid = glm::round(glm::fract(gridIndex3D)) * 2.f - 1.f;
+  glm::ivec3 zeroVec3(0);
+  glm::ivec3 directions3D[2] = { zeroVec3, relativeNeighborGrid };
+  gridIndex3D = glm::floor(gridIndex3D);
+  const int firstStep = USE_DOUBLE_CELL_WIDTH ? 0 : -1;  // single-width cells: scan offsets -1..1 (27 cells)
+
+  glm::vec3 velChange(0.0f);
+  // variables for rule 1
+  glm::vec3 perceivedCenter(0.0f);
+  int numNeighbors1 = 0;
+  // variables for rule 2
+  glm::vec3 c(0.0f);
+  // variables for rule 3
+  glm::vec3 perceivedVel(0.0f);
+  int numNeighbors3 = 0;
+  // Find out the cells which may contain neighbors
+  for (int i = firstStep; i <= 1; ++i)
+  {
+    glm::ivec3 neighborGridIndex3D;
+    neighborGridIndex3D.x = gridIndex3D.x + (USE_DOUBLE_CELL_WIDTH ? directions3D[i].x : i);
+    // out of bound check
+    if (neighborGridIndex3D.x < 0 || neighborGridIndex3D.x >= gridResolution) continue;
+    for (int j = firstStep; j <= 1; ++j)
+    {
+      neighborGridIndex3D.y = gridIndex3D.y + (USE_DOUBLE_CELL_WIDTH ? directions3D[j].y : j);
+      if (neighborGridIndex3D.y < 0 || neighborGridIndex3D.y >= gridResolution) continue;
+      for (int k = firstStep; k <= 1; ++k)
+      {
+        neighborGridIndex3D.z = gridIndex3D.z + (USE_DOUBLE_CELL_WIDTH ? directions3D[k].z : k);
+        if (neighborGridIndex3D.z < 0 || neighborGridIndex3D.z >= gridResolution) continue;
+        int gridCell = gridIndex3Dto1D(neighborGridIndex3D.x, neighborGridIndex3D.y, neighborGridIndex3D.z, gridResolution);
+        const int cellStart = gridCellStartIndices[gridCell];
+        if (cellStart == -1) continue;  // -1 marks a cell that holds no boids
+        const int cellEnd = gridCellEndIndices[gridCell];
+        // Check all of boids in this cell
+        for (int neighborArrayIndex = cellStart; neighborArrayIndex <= cellEnd; ++neighborArrayIndex)
+        {
+          int iNeighbor = particleArrayIndices[neighborArrayIndex];
+          if (iNeighbor == index) continue;
+          float dist = glm::distance(pos[iNeighbor], selfPos);
+          // Rule 1
+          if (dist < rule1Distance)
+          {
+            perceivedCenter += pos[iNeighbor];
+            numNeighbors1++;
+          }
+          //Rule 2
+          if (dist < rule2Distance)
+          {
+            c -= (pos[iNeighbor] - selfPos);
+          }
+          //Rule 3
+          if (dist < rule3Distance)
+          {
+            perceivedVel += vel1[iNeighbor];
+            numNeighbors3++;
+          }
+        }
+      }
+    }
+  }
+
+  // Rule 1
+  if (numNeighbors1 > 0)
+  {
+    perceivedCenter /= numNeighbors1;
+    velChange += (perceivedCenter - selfPos) * rule1Scale;
+  }
+  // Rule 2
+  velChange += c * rule2Scale;
+  // Rule 3
+  if (numNeighbors3 > 0)
+  {
+    perceivedVel /= numNeighbors3;
+    velChange += perceivedVel * rule3Scale;
+  }
+
+  // compute new velocity
+  glm::vec3 newVel = velChange + vel1[index];
+  float newSpeed = glm::length(newVel);
+  if (newSpeed > maxSpeed)
+  {
+    newVel = newVel / newSpeed * maxSpeed;
+  }
+  vel2[index] = newVel;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +538,113 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;
+  const glm::vec3 selfPos = pos[index];  // read this boid's position once
+  glm::vec3 gridIndex3D = (selfPos - gridMin) * inverseCellWidth;
+
+  // relativeNeighborGrid is the direction (-1 or 1) in which grid may contain neighbors of the boid
+  glm::ivec3 relativeNeighborGrid = glm::round(glm::fract(gridIndex3D)) * 2.f - 1.f;
+  glm::ivec3 zeroVec3(0);
+  glm::ivec3 directions3D[2] = { zeroVec3, relativeNeighborGrid };
+  gridIndex3D = glm::floor(gridIndex3D);
+
+  glm::vec3 velChange(0.0f);
+  // variables for rule 1
+  glm::vec3 perceivedCenter(0.0f);
+  int numNeighbors1 = 0;
+  // variables for rule 2
+  glm::vec3 c(0.0f);
+  // variables for rule 3
+  glm::vec3 perceivedVel(0.0f);
+  int numNeighbors3 = 0;
+  // Find out the cells which may contain neighbors
+#if USE_DOUBLE_CELL_WIDTH
+  for (int i = 0; i < 2; ++i)
+  {
+    glm::ivec3 neighborGridIndex3D;
+    neighborGridIndex3D.x = gridIndex3D.x + directions3D[i].x;
+    // out of bound check
+    if (neighborGridIndex3D.x < 0 || neighborGridIndex3D.x >= gridResolution) continue;
+    for (int j = 0; j < 2; ++j)
+    {
+      neighborGridIndex3D.y = gridIndex3D.y + directions3D[j].y;
+      if (neighborGridIndex3D.y < 0 || neighborGridIndex3D.y >= gridResolution) continue;
+      for (int k = 0; k < 2; ++k)
+      {
+        neighborGridIndex3D.z = gridIndex3D.z + directions3D[k].z;
+        if (neighborGridIndex3D.z < 0 || neighborGridIndex3D.z >= gridResolution) continue;
+#else
+  for (int i = -1; i <= 1; ++i)
+  {
+    glm::ivec3 neighborGridIndex3D;
+    neighborGridIndex3D.x = gridIndex3D.x + i;
+    // out of bound check
+    if (neighborGridIndex3D.x < 0 || neighborGridIndex3D.x >= gridResolution) continue;
+    for (int j = -1; j <= 1; ++j)
+    {
+      neighborGridIndex3D.y = gridIndex3D.y + j;
+      if (neighborGridIndex3D.y < 0 || neighborGridIndex3D.y >= gridResolution) continue;
+      for (int k = -1; k <= 1; ++k)
+      {
+        neighborGridIndex3D.z = gridIndex3D.z + k;
+        if (neighborGridIndex3D.z < 0 || neighborGridIndex3D.z >= gridResolution) continue;
+#endif
+        int gridCell = gridIndex3Dto1D(neighborGridIndex3D.x, neighborGridIndex3D.y, neighborGridIndex3D.z, gridResolution);
+        const int cellStart = gridCellStartIndices[gridCell];
+        if (cellStart == -1) continue;  // -1 marks a cell that holds no boids
+        const int cellEnd = gridCellEndIndices[gridCell];
+        // Check all of boids in this cell
+        for (int iNeighbor = cellStart; iNeighbor <= cellEnd; ++iNeighbor)
+        {
+          if (iNeighbor == index) continue;
+          float dist = glm::distance(pos[iNeighbor], selfPos);
+          // Rule 1
+          if (dist < rule1Distance)
+          {
+            perceivedCenter += pos[iNeighbor];
+            numNeighbors1++;
+          }
+          //Rule 2
+          if (dist < rule2Distance)
+          {
+            c -= (pos[iNeighbor] - selfPos);
+          }
+          //Rule 3
+          if (dist < rule3Distance)
+          {
+            perceivedVel += vel1[iNeighbor];
+            numNeighbors3++;
+          }
+        }
+      }
+    }
+  }
+
+  // Rule 1
+  if (numNeighbors1 > 0)
+  {
+    perceivedCenter /= numNeighbors1;
+    velChange += (perceivedCenter - selfPos) * rule1Scale;
+  }
+  // Rule 2
+  velChange += c * rule2Scale;
+  // Rule 3
+  if (numNeighbors3 > 0)
+  {
+    perceivedVel /= numNeighbors3;
+    velChange += perceivedVel * rule3Scale;
+  }
+
+  // compute new velocity
+  glm::vec3 newVel = velChange + vel1[index];
+  float newSpeed = glm::length(newVel);
+  if (newSpeed > maxSpeed)
+  {
+    newVel = newVel / newSpeed * maxSpeed;
+  }
+  vel2[index] = newVel;
+
 }
 
 /**
@@ -349,6 +653,10 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  const dim3 blocksForBoids((numObjects + blockSize - 1) / blockSize);  // ceil(numObjects / blockSize)
+  kernUpdateVelocityBruteForce<<<blocksForBoids, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<blocksForBoids, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);  // integrate with the new velocities
+  std::swap(dev_vel1, dev_vel2);  // the freshly written buffer becomes the next step's input
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +672,35 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, 
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, 
+    dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, 
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  std::swap(dev_vel1, dev_vel2);
+
+}
+
+__global__ void kernReshuffleVec3Buffer(int N, const int* oldIndices, const glm::vec3* oldBuffer, glm::vec3* newBuffer)
+{
+  // Gather: thread `index` copies oldBuffer[oldIndices[index]] into newBuffer[index]; the inputs are only read.
+  const int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;  // the launch may contain more threads than the N elements
+  // NOTE(review): assumes every oldIndices entry is a valid position in oldBuffer - confirm at the call sites.
+  newBuffer[index] = oldBuffer[oldIndices[index]];
+}
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +719,31 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects,
+    dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  //Use the rearranged array index buffer to reshuffle all the particle data in the simulation array
+  kernReshuffleVec3Buffer<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_reshuffledPos);
+  kernReshuffleVec3Buffer<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_vel1, dev_reshuffledVel);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_reshuffledPos, dev_reshuffledVel, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_reshuffledPos, dev_vel2);
+  std::swap(dev_vel1, dev_vel2);
+  std::swap(dev_pos, dev_reshuffledPos);
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +752,13 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_reshuffledPos);
+  cudaFree(dev_reshuffledVel);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;