diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index 5f2ccf08b..f0145a9ae 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -483,6 +483,10 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
     add_efield.parse(param, num_param, group);
   } else if (strcmp(param[0], "mc") == 0) {
     mc.parse_mc(param, num_param, group, atom);
+  } else if (strcmp(param[0], "mc_minimize") == 0) {
+    MC_Minimize mc_minimize;
+    mc_minimize.parse_mc_minimize(param, num_param, group, atom, box, force);
+    mc_minimize.compute(force, atom, box, group);
   } else if (strcmp(param[0], "dftd3") == 0) {
     // nothing here; will be handled elsewhere
   } else if (strcmp(param[0], "compute_lsqt") == 0) {
diff --git a/src/main_gpumd/run.cuh b/src/main_gpumd/run.cuh
index 146efcaf0..e7558b993 100644
--- a/src/main_gpumd/run.cuh
+++ b/src/main_gpumd/run.cuh
@@ -26,6 +26,7 @@ class Measure;
 #include "force/force.cuh"
 #include "integrate/integrate.cuh"
 #include "mc/mc.cuh"
+#include "mc_minimize/mc_minimize.cuh"
 #include "measure/measure.cuh"
 #include "model/atom.cuh"
 #include "model/box.cuh"
diff --git a/src/makefile b/src/makefile
index 30164800c..95fcccda2 100644
--- a/src/makefile
+++ b/src/makefile
@@ -37,6 +37,7 @@ SOURCES_GPUMD =                   \
 	$(wildcard phonon/*.cu)       \
 	$(wildcard integrate/*.cu)    \
 	$(wildcard mc/*.cu)           \
+	$(wildcard mc_minimize/*.cu)  \
 	$(wildcard force/*.cu)        \
 	$(wildcard measure/*.cu)      \
 	$(wildcard model/*.cu)        \
@@ -66,6 +67,7 @@ HEADERS =                         \
 	$(wildcard main_gpumd/*.cuh)  \
 	$(wildcard integrate/*.cuh)   \
 	$(wildcard mc/*.cuh)          \
+	$(wildcard mc_minimize/*.cuh) \
 	$(wildcard minimize/*.cuh)    \
 	$(wildcard force/*.cuh)       \
 	$(wildcard measure/*.cuh)     \
@@ -98,6 +100,8 @@ integrate/%.obj: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 mc/%.obj: mc/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
+mc_minimize/%.obj: mc_minimize/%.cu $(HEADERS)
+	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.obj: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 force/%.obj: force/%.cu $(HEADERS)
@@ -119,6 +123,8 @@ integrate/%.o: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 mc/%.o: mc/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
+mc_minimize/%.o: mc_minimize/%.cu $(HEADERS)
+	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.o: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 force/%.o: force/%.cu $(HEADERS)
diff --git a/src/makefile.hip b/src/makefile.hip
index 7fcfc6d37..2806e54ad 100644
--- a/src/makefile.hip
+++ b/src/makefile.hip
@@ -29,6 +29,7 @@ SOURCES_GPUMD =                   \
 	$(wildcard phonon/*.cu)       \
 	$(wildcard integrate/*.cu)    \
 	$(wildcard mc/*.cu)           \
+	$(wildcard mc_minimize/*.cu)  \
 	$(wildcard force/*.cu)        \
 	$(wildcard measure/*.cu)      \
 	$(wildcard model/*.cu)        \
@@ -53,6 +54,7 @@ HEADERS =                         \
 	$(wildcard main_gpumd/*.cuh)  \
 	$(wildcard integrate/*.cuh)   \
 	$(wildcard mc/*.cuh)          \
+	$(wildcard mc_minimize/*.cuh) \
 	$(wildcard minimize/*.cuh)    \
 	$(wildcard force/*.cuh)       \
 	$(wildcard measure/*.cuh)     \
@@ -84,6 +86,8 @@ integrate/%.o: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 mc/%.o: mc/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
+mc_minimize/%.o: mc_minimize/%.cu $(HEADERS)
+	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.o: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 force/%.o: force/%.cu $(HEADERS)
diff --git a/src/mc_minimize/mc_minimize.cu b/src/mc_minimize/mc_minimize.cu
new file mode 100644
index 000000000..9e135c115
--- /dev/null
+++ b/src/mc_minimize/mc_minimize.cu
@@ -0,0 +1,187 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mc_minimize.cuh"
+#include "utilities/read_file.cuh"
+#include "mc_minimizer_local.cuh"
+#include "mc_minimizer_global.cuh"
+#include "mc_minimizer_test.cuh"
+
+void MC_Minimize::parse_group(
+  const char** param, int num_param, std::vector<Group>& groups, int num_param_before_group)
+{
+  if (strcmp(param[num_param_before_group], "group") != 0) {
+    PRINT_INPUT_ERROR("invalid option for mc.\n");
+  }
+  if (!is_valid_int(param[num_param_before_group + 1], &grouping_method)) {
+    PRINT_INPUT_ERROR("grouping method of MC_Minimize should be an integer.\n");
+  }
+  if (grouping_method < 0) {
+    PRINT_INPUT_ERROR("grouping method of MC_Minimize should >= 0.\n");
+  }
+  if (grouping_method >= groups.size()) {
+    PRINT_INPUT_ERROR("Grouping method should < number of grouping methods.");
+  }
+  if (!is_valid_int(param[num_param_before_group + 2], &group_id)) {
+    PRINT_INPUT_ERROR("group ID of MC_Minimize should be an integer.\n");
+  }
+  if (group_id < 0) {
+    PRINT_INPUT_ERROR("group ID of MC_Minimize should >= 0.\n");
+  }
+  if (group_id >= groups[grouping_method].number) {
+    PRINT_INPUT_ERROR("Group ID should < number of groups.");
+  }
+}
+
+
+void MC_Minimize::compute(
+  Force& force,
+  Atom& atom,
+  Box& box,
+  std::vector<Group>& group)
+{
+    mc_minimizer->compute(
+    num_trials_mc,
+    force,
+    atom,
+    box,
+    group,
+    grouping_method,
+    group_id);
+}
+
+void MC_Minimize::parse_mc_minimize(const char** param, int num_param, std::vector<Group>& group, Atom& atom, Box& box, Force& force)
+{
+  if (num_param < 6) {
+    PRINT_INPUT_ERROR("mc_minimize should have at least 5 parameters.\n");
+  }
+
+  //0 for local, 1 for global, 2 for test
+  int mc_minimizer_type = 0;
+  if (strcmp(param[1], "local") == 0) {
+    printf("Perform simple MC with local relaxation:\n");
+    mc_minimizer_type = 0;
+  } else if (strcmp(param[1], "global") == 0) {
+    printf("Perform simple MC with global relaxation:\n");
+    mc_minimizer_type = 1;
+  } else if (strcmp(param[1], "test") == 0) {
+    printf("Perform simple MC test:\n");
+    mc_minimizer_type = 2;
+  } else {
+    PRINT_INPUT_ERROR("invalid MC Minimizer type for MC.\n");
+  }
+  if (mc_minimizer_type == 0) {
+    if (num_param < 7) {
+      PRINT_INPUT_ERROR("reading error for local relaxation, missing parameter\n");
+    }
+  }
+  if (mc_minimizer_type == 1) {
+    if (num_param < 6) {
+      PRINT_INPUT_ERROR("reading error for global relaxation, missing parameter\n");
+    }
+  }
+  if (mc_minimizer_type == 2) {
+    if (num_param < 7) {
+      PRINT_INPUT_ERROR("reading error for test, missing parameter\n");
+    }
+  }
+
+  //check if num_trials_mc reasonable
+  if (!is_valid_int(param[2], &num_trials_mc)) {
+    PRINT_INPUT_ERROR("number of MC trials for MC Minimize should be an integer.\n");
+  }
+  if (num_trials_mc <= 0) {
+    PRINT_INPUT_ERROR("number of MC trials for MC Minimize should be positive.\n");
+  }
+
+  //check if temperature reasonable
+  if (!is_valid_real(param[3], &temperature)) {
+    PRINT_INPUT_ERROR("temperature for MC Minimize should be a number.\n");
+  }
+  if (temperature <= 0) {
+    PRINT_INPUT_ERROR("temperature for MC Minimize should be positive.\n");
+  }
+
+  //check if force_tolerance reasonable
+  if (!is_valid_real(param[4], &force_tolerance)) {
+    PRINT_INPUT_ERROR("force tolerance for MC Minimize should be a number.\n");
+  }
+  if (force_tolerance <= 0) {
+    PRINT_INPUT_ERROR("force tolerance for MC Minimize should be positive.\n");
+  }
+
+  //check if max nmber of relax reasonable
+  if (!is_valid_int(param[5], &max_relax_steps)) {
+    PRINT_INPUT_ERROR("max relaxation steps for MC Minimize should be an integer.\n");
+  }
+  if (max_relax_steps <= 0) {
+    PRINT_INPUT_ERROR("max relaxation steps for MC Minimize should be positive.\n");
+  }
+
+  //check if scale factor reasonable 
+  if (mc_minimizer_type == 0)
+  {
+  if (!is_valid_real(param[6], &scale_factor)) {
+    PRINT_INPUT_ERROR("scale factor for MC Minimize should be a number.\n");
+  }
+  if (scale_factor <= 0) {
+    PRINT_INPUT_ERROR("scale factor for MC Minimize should be positive.\n");
+  }
+  }
+  if (mc_minimizer_type == 2)
+  {
+  if (!is_valid_real(param[6], &scale_factor)) {
+    PRINT_INPUT_ERROR("scale factor for MC Minimize should be a number.\n");
+  }
+  if (scale_factor <= 0) {
+    PRINT_INPUT_ERROR("scale factor for MC Minimize should be positive.\n");
+  }
+  }
+
+  
+
+  int num_param_before_group;
+  if (mc_minimizer_type == 0)
+  {
+    num_param_before_group = 7;
+  }
+  if (mc_minimizer_type == 1)
+  {
+    num_param_before_group = 6;
+  }
+  if (mc_minimizer_type == 2)
+  {
+    num_param_before_group = 7;
+  }
+
+  if (num_param > num_param_before_group)
+  {
+    parse_group(param, num_param, group, num_param_before_group);
+    printf("    only for atoms in group %d of grouping method %d.\n", group_id, grouping_method);
+  }
+
+  if (mc_minimizer_type == 0)
+  {
+    mc_minimizer.reset(new MC_Minimizer_Local(param, num_param, scale_factor, temperature, force_tolerance, max_relax_steps));
+  }
+  if (mc_minimizer_type == 1)
+  {
+    mc_minimizer.reset(new MC_Minimizer_Global(param, num_param, temperature, force_tolerance, max_relax_steps));
+  }
+  if (mc_minimizer_type == 2)
+  {
+    mc_minimizer.reset(new MC_Minimizer_Test(param, num_param, scale_factor, temperature, force_tolerance, max_relax_steps));
+  }
+}
\ No newline at end of file
diff --git a/src/mc_minimize/mc_minimize.cuh b/src/mc_minimize/mc_minimize.cuh
new file mode 100644
index 000000000..bd91c86ce
--- /dev/null
+++ b/src/mc_minimize/mc_minimize.cuh
@@ -0,0 +1,52 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#pragma once
+
+#include "mc_minimizer.cuh"
+#include "force/force.cuh"
+#include "model/atom.cuh"
+#include "model/box.cuh"
+#include "model/group.cuh"
+#include <memory>
+#include <vector>
+
+class MC_Minimize
+{
+private:
+
+    void parse_group(const char** param, int num_param, std::vector<Group>& groups, int num_param_before_group);
+    int grouping_method = -1;
+    int group_id = -1;
+
+    //parameters
+    int num_trials_mc = 0; 
+    double temperature = 0; 
+    double force_tolerance = 0;
+    int max_relax_steps = 0;
+    double scale_factor = 0;
+
+
+public:
+    void parse_mc_minimize(const char** param, int num_param, std::vector<Group>& group, Atom& atom, Box& box, Force& force);
+
+    void compute(
+      Force& force,
+      Atom& atom,
+      Box& box,
+      std::vector<Group>& group);
+
+    std::unique_ptr<MC_Minimizer> mc_minimizer;
+};
+
diff --git a/src/mc_minimize/mc_minimizer.cu b/src/mc_minimize/mc_minimizer.cu
new file mode 100644
index 000000000..467a6a5af
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer.cu
@@ -0,0 +1,50 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mc_minimizer.cuh"
+
+MC_Minimizer::MC_Minimizer(const char** param, int num_param)
+{
+  mc_output.open("mc_minimize.out", std::ios::app);
+  mc_output << "# ";
+  for (int n = 0; n < num_param; ++n) {
+    mc_output << param[n] << " ";
+  }
+  mc_output << "\n";
+  mc_output << std::defaultfloat << std::setprecision(12);
+  if (strcmp(param[1], "local") == 0) {
+    mc_output << "Step" << "\t" << "Maximum displacement" << "\t" << "Average displacement" 
+    << "\t" << "Energy difference" << "\t" <<  "Accept ratio" << std::endl;
+  } else if (strcmp(param[1], "global") == 0) {
+    mc_output << "Step" << "\t" << "Energy before" << "\t" << "Energy after" << "\t" <<  "Accept ratio" << std::endl;
+  } else if (strcmp(param[1], "test") == 0) {
+    mc_output << "Step" << "\t" << "Maximum displacement" << "\t" << "Average displacement" 
+    << "\t" << "Local energy difference" << "\t" << "Global energy difference" << "\t" <<  "Accept ratio"
+    << "\t" << "Local probability" << "\t" << "Global probability" << "\t" << "Accelerate ratio" << std::endl;
+  }
+
+
+
+#ifdef DEBUG
+  rng = std::mt19937(13579);
+#else
+  rng = std::mt19937(std::chrono::system_clock::now().time_since_epoch().count());
+#endif
+}
+
+MC_Minimizer::~MC_Minimizer()
+{
+    //default destructor
+}
\ No newline at end of file
diff --git a/src/mc_minimize/mc_minimizer.cuh b/src/mc_minimize/mc_minimizer.cuh
new file mode 100644
index 000000000..a36e18e81
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer.cuh
@@ -0,0 +1,48 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "model/atom.cuh"
+#include "model/box.cuh"
+#include "force/force.cuh"
+#include <random>
+#include "minimize/minimizer_fire.cuh"
+#include <iostream>
+#include <fstream>
+#include <chrono>
+#include <iomanip>
+
+class MC_Minimizer
+{
+private:
+
+public:
+  MC_Minimizer(const char** param, int num_param);
+  virtual ~MC_Minimizer();
+
+  virtual void compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id) = 0;
+
+protected:
+  std::mt19937 rng;
+  std::ofstream mc_output;
+};
+
diff --git a/src/mc_minimize/mc_minimizer_global.cu b/src/mc_minimize/mc_minimizer_global.cu
new file mode 100644
index 000000000..3af43486d
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_global.cu
@@ -0,0 +1,201 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mc_minimizer_global.cuh"
+
+MC_Minimizer_Global::MC_Minimizer_Global(
+  const char** param, int num_param,
+  double temperature_input,
+  double force_tolerance_input,
+  int max_relax_steps_input)
+  : MC_Minimizer(param, num_param)
+{
+  temperature = temperature_input;
+  force_tolerance = force_tolerance_input;
+  max_relax_steps = max_relax_steps_input;
+}
+
+MC_Minimizer_Global::~MC_Minimizer_Global()
+{
+  //default destructor
+}
+
+
+// a kernel with a single thread <<<1, 1>>>
+static __global__ void exchange(
+  const int i,
+  const int j,
+  const int type_i,
+  const int type_j,
+  int* g_type,
+  double* g_mass,
+  double* g_vx,
+  double* g_vy,
+  double* g_vz)
+{
+  g_type[i] = type_j;
+  g_type[j] = type_i;
+
+  double mass_i = g_mass[i];
+  g_mass[i] = g_mass[j];
+  g_mass[j] = mass_i;
+
+  double vx_i = g_vx[i];
+  g_vx[i] = g_vx[j];
+  g_vx[j] = vx_i;
+
+  double vy_i = g_vy[i];
+  g_vy[i] = g_vy[j];
+  g_vy[j] = vy_i;
+
+  double vz_i = g_vz[i];
+  g_vz[i] = g_vz[j];
+  g_vz[j] = vz_i;
+}
+
+void MC_Minimizer_Global::compute(
+  int trials,
+  Force& force,
+  Atom& atom,
+  Box& box,
+  std::vector<Group>& group,
+  int grouping_method,
+  int group_id)
+{
+  //get the swap index
+  int group_size =
+    grouping_method >= 0 ? group[grouping_method].cpu_size[group_id] : atom.number_of_atoms;
+  std::uniform_int_distribution<int> r1(0, group_size - 1);
+
+  int num_accepted = 0;
+  int N = atom.number_of_atoms;
+  for (int step = 1; step <= trials; ++step) {
+
+    int i = grouping_method >= 0
+              ? group[grouping_method]
+                  .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+              : r1(rng);
+    int type_i = atom.cpu_type[i];
+    int j = 0, type_j = type_i;
+    while (type_i == type_j) {
+      j = grouping_method >= 0
+            ? group[grouping_method]
+                .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+            : r1(rng);
+      type_j = atom.cpu_type[j];
+    }
+  //initialize
+  if (step == 1)
+  {
+    energy_last_step = 0;
+    force.compute(
+      box,
+      atom.position_per_atom,
+      atom.type,
+      group,
+      atom.potential_per_atom,
+      atom.force_per_atom,
+      atom.virial_per_atom);
+    std::vector<double> pe_before_cpu(N);
+    atom.potential_per_atom.copy_to_host(pe_before_cpu.data(), N);
+    for (int n = 0; n < N; ++n) {
+      energy_last_step += pe_before_cpu[n];
+    }
+  }
+
+  //construct a copy
+  Atom atom_copy;
+  atom_copy.number_of_atoms = N;
+  atom_copy.mass.resize(N);
+  atom_copy.type.resize(N);
+  atom_copy.potential_per_atom.resize(N, 0);
+  atom_copy.force_per_atom.resize(3 * N, 0);
+  atom_copy.velocity_per_atom.resize(3 * N, 0);
+  atom_copy.position_per_atom.resize(3 * N);
+  atom_copy.virial_per_atom.resize(9 * N, 0);
+
+  atom_copy.mass.copy_from_device(atom.mass.data(), N);
+  atom_copy.type.copy_from_device(atom.type.data(), N);
+  atom_copy.position_per_atom.copy_from_device(atom.position_per_atom.data(), 3 * N);
+
+  //calculate the energy after swap
+  exchange<<<1, 1>>>(
+    i,
+    j,
+    type_i,
+    type_j,
+    atom_copy.type.data(),
+    atom_copy.mass.data(),
+    atom_copy.velocity_per_atom.data(),
+    atom_copy.velocity_per_atom.data() + N,
+    atom_copy.velocity_per_atom.data() + N * 2);
+
+  Minimizer_FIRE minimizer(N, max_relax_steps, force_tolerance);
+  minimizer.compute(
+    force,
+    box,
+    atom_copy.position_per_atom,
+    atom_copy.type,
+    group,
+    atom_copy.potential_per_atom,
+    atom_copy.force_per_atom,
+    atom_copy.virial_per_atom);
+  double pe_after_total = 0;
+  std::vector<double> pe_after_cpu(N);
+  atom_copy.potential_per_atom.copy_to_host(pe_after_cpu.data(), N);
+  for (int n = 0; n < N; ++n) {
+    pe_after_total += pe_after_cpu[n];
+  }
+
+  double energy_difference = pe_after_total - energy_last_step;
+  std::uniform_real_distribution<float> r2(0, 1);
+  float random_number = r2(rng);
+  double probability = exp(-energy_difference / (K_B * temperature));
+  printf("%.8f\n", probability);
+
+  if (random_number < probability) {
+    ++num_accepted;
+
+    atom.cpu_type[i] = type_j;
+    atom.cpu_type[j] = type_i;
+
+    auto atom_symbol_i = atom.cpu_atom_symbol[i];
+    atom.cpu_atom_symbol[i] = atom.cpu_atom_symbol[j];
+    atom.cpu_atom_symbol[j] = atom_symbol_i;
+
+    double mass_i = atom.cpu_mass[i];
+    atom.cpu_mass[i] = atom.cpu_mass[j];
+    atom.cpu_mass[j] = mass_i;
+
+    atom.position_per_atom.copy_from_device(atom_copy.position_per_atom.data(), 3 * N);
+    atom.potential_per_atom.copy_from_device(atom_copy.potential_per_atom.data(), N);
+
+    exchange<<<1, 1>>>(
+      i,
+      j,
+      type_i,
+      type_j,
+      atom.type.data(),
+      atom.mass.data(),
+      atom.velocity_per_atom.data(),
+      atom.velocity_per_atom.data() + N,
+      atom.velocity_per_atom.data() + N * 2);
+
+    mc_output << step << "\t" << energy_last_step << "\t" << pe_after_total << "\t" << num_accepted / double(step) << std::endl;
+
+    energy_last_step = pe_after_total;
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/mc_minimize/mc_minimizer_global.cuh b/src/mc_minimize/mc_minimizer_global.cuh
new file mode 100644
index 000000000..8c419bd56
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_global.cuh
@@ -0,0 +1,43 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mc_minimizer.cuh"
+
+class MC_Minimizer_Global : public MC_Minimizer
+{
+private:
+  double temperature;
+  double force_tolerance;
+  double max_relax_steps;
+  double energy_last_step = 0;
+public:
+    MC_Minimizer_Global(const char** param, int num_param,
+    double temperature_input,
+    double force_tolerance_input,
+    int max_relax_steps_input);
+    ~MC_Minimizer_Global();
+
+    virtual void compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id);
+};
+
diff --git a/src/mc_minimize/mc_minimizer_local.cu b/src/mc_minimize/mc_minimizer_local.cu
new file mode 100644
index 000000000..8b3837148
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_local.cu
@@ -0,0 +1,711 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mc_minimizer_local.cuh"
+
+MC_Minimizer_Local::MC_Minimizer_Local(
+  const char** param, int num_param,
+    double scale_factor_input,
+    double temperature_input,
+    double force_tolerance_input,
+    int max_relax_steps_input)
+  : MC_Minimizer(param, num_param)
+{
+  scale_factor = scale_factor_input;
+  temperature = temperature_input;
+  force_tolerance = force_tolerance_input;
+  max_relax_steps = max_relax_steps_input;
+}
+
+MC_Minimizer_Local::~MC_Minimizer_Local()
+{
+  //default destructor
+}
+
+namespace
+{
+/*
+this function can find out the atoms whose distance from two atom centers is within rc_radial
+*/
+static __global__ void get_neighbors_of_i_and_j(
+  const int N,
+  const Box box,
+  const int i,
+  const int j,
+  const float rc_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  int* g_NN_ij,
+  int* g_NL_ij)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < N) {
+    double x0 = g_x[n];
+    double y0 = g_y[n];
+    double z0 = g_z[n];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rc_radial_square) {
+      g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+    } else {
+      if (distance_square_j < rc_radial_square) {
+        g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+      }
+    }
+  }
+}
+
+// a kernel with a single thread <<<1, 1>>>
+static __global__ void exchange(
+  const int i,
+  const int j,
+  const int type_i,
+  const int type_j,
+  int* g_type,
+  double* g_mass,
+  double* g_vx,
+  double* g_vy,
+  double* g_vz)
+{
+  g_type[i] = type_j;
+  g_type[j] = type_i;
+
+  double mass_i = g_mass[i];
+  g_mass[i] = g_mass[j];
+  g_mass[j] = mass_i;
+
+  double vx_i = g_vx[i];
+  g_vx[i] = g_vx[j];
+  g_vx[j] = vx_i;
+
+  double vy_i = g_vy[i];
+  g_vy[i] = g_vy[j];
+  g_vy[j] = vy_i;
+
+  double vz_i = g_vz[i];
+  g_vz[i] = g_vz[j];
+  g_vz[j] = vz_i;
+}
+
+// find the local index of ij
+static __global__ void find_local_ij(
+  const int i,
+  const int j,
+  const int local_N,
+  int* local_index,
+  int* local_i,
+  int* local_j)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N)
+  {
+    if (local_index[n] == i)
+    {
+        *local_i = n;
+    }
+    if (local_index[n] == j)
+    {
+        *local_j = n;
+    }
+  }
+}
+
+/*
+this function is used for label the atoms within the rc_radial, which will return an array of [0, 1, 0, 0......].
+1 stands for inside, and 0 stands for outside.
+*/
+static __global__ void get_sphere_atoms(
+  const int N,
+  const int local_N,
+  const int* local_index,
+  const Box box,
+  const int i,
+  const int j,
+  const float rc_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  double* local_sphere_flags)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N) {
+    int index = local_index[n];
+    double x0 = g_x[index];
+    double y0 = g_y[index];
+    double z0 = g_z[index];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rc_radial_square) {
+      local_sphere_flags[n] = 1;
+      local_sphere_flags[n + local_N] = 1;
+      local_sphere_flags[n + 2 * local_N] = 1;
+    } else {
+      if (distance_square_j < rc_radial_square) {
+        local_sphere_flags[n] = 1;
+        local_sphere_flags[n + local_N] = 1;
+        local_sphere_flags[n + 2 * local_N] = 1;
+      }
+    }
+  }
+}
+
+/*
+This function is used to mark atoms whose distance from two atom centers is between the range of rcmin and rcmax
+*/
+static __global__ void get_outer_atoms(
+  const int N,
+  const int local_N,
+  const int* local_index,
+  const Box box,
+  const int i,
+  const int j,
+  const float rcmin_radial_square,
+  const float rcmax_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  double* outer_atoms_flags)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N) {
+    int index = local_index[n];
+    double x0 = g_x[index];
+    double y0 = g_y[index];
+    double z0 = g_z[index];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rcmax_radial_square && distance_square_i > rcmin_radial_square && distance_square_j > rcmin_radial_square) {
+      outer_atoms_flags[n] += 1;
+    } else {
+      if (distance_square_j < rcmax_radial_square && distance_square_j > rcmin_radial_square && distance_square_i > rcmin_radial_square) {
+        outer_atoms_flags[n] += 1;
+      }
+    }
+  }
+}
+
+//copy from a to b, for position, dimension is 3, for mass, dimension is 1, and so on
+template <typename T>
+static __global__ void copy
+(
+  T* a,
+  T* b,
+  int* local_index,
+  int global_N,
+  int local_N,
+  int dimension)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    for (int i = 0; i < dimension; i++)
+    {
+      b[n + i * local_N] = a[index + i * global_N];
+    }
+  }
+}
+
+
+//copy from b to a
+template <typename T>
+static __global__ void copy_back
+(
+  T* a,
+  T* b,
+  int* local_index,
+  int global_N,
+  int local_N,
+  int dimension)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    for (int i = 0; i < dimension; i++)
+    {
+      a[index + i * global_N] = b[n + i * local_N];
+    }
+  }
+}
+
+//sum up
+static __global__ void gpu_sum(const int size, double* a, double* result)
+{
+  int number_of_patches = (size - 1) / 1024 + 1;
+  int tid = threadIdx.x;
+  int n, patch;
+  __shared__ double data[1024];
+  data[tid] = 0.0;
+  for (patch = 0; patch < number_of_patches; ++patch) {
+    n = tid + patch * 1024;
+    if (n < size)
+      data[tid] += a[n];
+  }
+  __syncthreads();
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      data[tid] += data[tid + offset];
+    }
+    __syncthreads();
+  }
+  if (tid == 0)
+    *result = data[0];
+}
+
+/*
+calculate the displacement for the labelled atomsd
+*/
+static __global__ void get_displacement(
+  const Box box,
+  double* global_position,
+  double* local_position,
+  int global_N,
+  int local_N,
+  int* local_index,
+  double* outer_atoms_flags,
+  double* displacement)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    double displace = 0;
+    double r12[3];
+    for (int i = 0; i < 3; i++)
+    {
+      r12[i] = (global_position[index + i * global_N] - local_position[n + i * local_N]);
+    }
+    apply_mic(box, r12[0], r12[1], r12[2]);
+    displace = r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2];
+    displacement[n] = sqrt(displace) * outer_atoms_flags[n];
+  }
+}
+
+//remove the energy of the background atoms
+static __global__ void modify_energy(
+  const int local_N,
+  double* pe,
+  double* local_sphere_flags)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    pe[n] *= local_sphere_flags[n];
+  } 
+}
+
+//calculate the max value
+__global__ void gpu_calculate_max(
+  const int size,
+  const int number_of_rounds,
+  const double* array,
+  double* max)
+{
+  const int tid = threadIdx.x;
+
+  __shared__ double s_max[1024]; // Shared memory for max values
+  s_max[tid] = 0;         
+
+  double max_value = 0;   // Initialize local max
+
+  for (int round = 0; round < number_of_rounds; ++round) {
+    const int n = tid + round * 1024;
+    if (n < size) {
+      const double f = array[n];
+      if (f > max_value)
+        max_value = f;           // Update local max
+    }
+  }
+
+  s_max[tid] = max_value;        // Write local max to shared memory
+  __syncthreads();
+
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      if (s_max[tid + offset] > s_max[tid]) {
+        s_max[tid] = s_max[tid + offset];
+      }
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    max[0] = s_max[0]; // Block's final max written to global memory
+  }
+}
+
+double get_outer_average_displacement(
+  const Box box,
+  double* global_position,
+  double* local_position,
+  int global_N,
+  int local_N,
+  int* local_index,
+  double* outer_atoms_flags,
+  double* max)
+{
+  GPU_Vector<double> displacement(local_N);
+  get_displacement<<<(local_N - 1) / 64 + 1, 64>>>(
+    box,
+    global_position,
+    local_position,
+    global_N,
+    local_N,
+    local_index,
+    outer_atoms_flags,
+    displacement.data());
+  CHECK(gpuDeviceSynchronize());
+  
+  //calculate the average displacement
+  const int number_of_rounds = (local_N - 1) / 1024 + 1;
+  gpu_calculate_max<<<1, 1024>>>(local_N, number_of_rounds, displacement.data(), max);
+
+  GPU_Vector<double> result(1);
+  gpu_sum<<<1, 1024>>>(local_N, displacement.data(), result.data());
+  GPU_Vector<double> outer_number(1);
+  gpu_sum<<<1, 1024>>>(local_N, outer_atoms_flags, outer_number.data());
+  CHECK(gpuDeviceSynchronize());
+  double number;
+  outer_number.copy_to_host(&number, 1);
+  double ans;
+  result.copy_to_host(&ans, 1);
+  ans /= number;
+  return ans;
+}
+
+//copy the local structure to previous system
+void build_all_atoms(
+  Atom& atom,
+  Atom& local_atoms,
+  int local_N,
+  int* local_index)
+{
+  copy_back<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.position_per_atom.data(),
+    local_atoms.position_per_atom.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    3);
+  CHECK(gpuDeviceSynchronize());
+}
+
+//construct a local structure
+void build_local_atoms(
+  Atom& atom,
+  Atom& local_atoms,
+  int local_N,
+  int* local_index)
+{
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.mass.data(),
+    local_atoms.mass.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    1);
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.type.data(),
+    local_atoms.type.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    1);
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.position_per_atom.data(),
+    local_atoms.position_per_atom.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    3);
+  CHECK(gpuDeviceSynchronize());
+}
+}
+
+//implement the local simple MC
+void MC_Minimizer_Local::compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id)
+{
+  int group_size =
+    grouping_method >= 0 ? group[grouping_method].cpu_size[group_id] : atom.number_of_atoms;
+  std::uniform_int_distribution<int> r1(0, group_size - 1);
+
+  int num_accepted = 0;
+  float rc_radius = force.potentials[0]->rc;
+  for (int step = 1; step <= trials; ++step) {
+
+    int i = grouping_method >= 0
+              ? group[grouping_method]
+                  .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+              : r1(rng);
+    int type_i = atom.cpu_type[i];
+    int j = 0, type_j = type_i;
+    while (type_i == type_j) {
+      j = grouping_method >= 0
+            ? group[grouping_method]
+                .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+            : r1(rng);
+      type_j = atom.cpu_type[j];
+    }
+
+    Atom local_atoms;
+    //find local atoms
+    GPU_Vector<int> local_i;  //the local index of i atom
+    GPU_Vector<int> local_j;  //the local index of j atom
+    GPU_Vector<int> local_N; //the length of local_index
+    GPU_Vector<int> local_index; // an array contains the index of the local atoms
+    GPU_Vector<double> local_sphere_flags; // an array labels the atoms inside shell
+    local_N.resize(1);
+    local_i.resize(1);
+    local_j.resize(1);
+    local_index.resize(atom.number_of_atoms);
+    CHECK(gpuMemset(local_N.data(), 0, sizeof(int)));
+    CHECK(gpuMemset(local_i.data(), 0, sizeof(int)));
+    CHECK(gpuMemset(local_j.data(), 0, sizeof(int)));
+    get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      box,
+      i,
+      j,
+      rc_radius * rc_radius * (scale_factor + 1.2) * (scale_factor + 1.2),
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      local_N.data(),
+      local_index.data());
+    GPU_CHECK_KERNEL
+
+    int local_N_cpu;
+    local_N.copy_to_host(&local_N_cpu);
+    local_sphere_flags.resize(local_N_cpu * 3);
+    local_sphere_flags.fill(0);
+
+    find_local_ij<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      i,
+      j,
+      local_N_cpu,
+      local_index.data(),
+      local_i.data(),
+      local_j.data());
+    int local_i_cpu;
+    int local_j_cpu;
+    local_i.copy_to_host(&local_i_cpu);
+    local_j.copy_to_host(&local_j_cpu);
+
+    //get sphere atoms
+    get_sphere_atoms<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      local_N_cpu,
+      local_index.data(),
+      box,
+      i,
+      j,
+      rc_radius * rc_radius * scale_factor * scale_factor,
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      local_sphere_flags.data());
+
+    //build the local_atoms
+    local_atoms.number_of_atoms = local_N_cpu;
+    local_atoms.position_per_atom.resize(3 * local_N_cpu);
+    local_atoms.force_per_atom.resize(3 * local_N_cpu);
+    local_atoms.type.resize(local_N_cpu);
+    local_atoms.velocity_per_atom.resize(3 * local_N_cpu);
+    local_atoms.mass.resize(local_N_cpu);
+    local_atoms.potential_per_atom.resize(local_N_cpu);
+    local_atoms.virial_per_atom.resize(9 * local_N_cpu);
+
+    //initialize
+    local_atoms.force_per_atom.fill(0);
+    local_atoms.potential_per_atom.fill(0);
+    local_atoms.velocity_per_atom.fill(0);
+    local_atoms.virial_per_atom.fill(0);
+  
+    build_local_atoms(
+      atom,
+      local_atoms,
+      local_N_cpu,
+      local_index.data());
+    
+    //calculate the origin energy, WARNING: here the group is not correct but now this variable will not be used in compute()
+    //need to modify here
+    force.potentials[0]->N2 = local_N_cpu;
+    force.compute(box,
+    local_atoms.position_per_atom,
+    local_atoms.type,
+    group,
+    local_atoms.potential_per_atom,
+    local_atoms.force_per_atom,
+    local_atoms.virial_per_atom);
+    modify_energy<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      local_N_cpu,
+      local_atoms.potential_per_atom.data(),
+      local_sphere_flags.data());
+
+    std::vector<double> pe_before_cpu(local_N_cpu);
+    local_atoms.potential_per_atom.copy_to_host(pe_before_cpu.data(), local_N_cpu);
+
+    exchange<<<1, 1>>>(
+      local_i_cpu,
+      local_j_cpu,
+      type_i,
+      type_j,
+      local_atoms.type.data(),
+      local_atoms.mass.data(),
+      local_atoms.velocity_per_atom.data(),
+      local_atoms.velocity_per_atom.data() + local_atoms.number_of_atoms,
+      local_atoms.velocity_per_atom.data() + local_atoms.number_of_atoms * 2);
+
+    Minimizer_FIRE Minimizer(
+      local_N_cpu,
+      max_relax_steps,
+      force_tolerance);
+
+    Minimizer.compute_label_atoms(
+      force,
+      box,
+      local_atoms.position_per_atom,
+      local_atoms.type,
+      group,
+      local_sphere_flags,
+      local_atoms.potential_per_atom,
+      local_atoms.force_per_atom,
+      local_atoms.virial_per_atom);
+
+    std::vector<double> pe_after_cpu(local_N_cpu);
+    local_atoms.potential_per_atom.copy_to_host(pe_after_cpu.data(), local_N_cpu);
+    double pe_before_total = 0;
+    double pe_after_total = 0;
+    for (int n = 0; n < local_N_cpu; ++n) {
+      pe_before_total += pe_before_cpu[n];
+    }
+    for (int n = 0; n < local_N_cpu; ++n) {
+      pe_after_total += pe_after_cpu[n];
+    }
+    //printf("        energy before swapping = %g.10 eV.\n", pe_before_total);
+    //printf("        energy after swapping = %g.10 eV.\n", pe_after_total);
+    double energy_difference = pe_after_total - pe_before_total;
+    std::uniform_real_distribution<float> r2(0, 1);
+    float random_number = r2(rng);
+    double probability = exp(-energy_difference / (K_B * temperature));
+
+    if (random_number < probability) {
+      ++num_accepted;
+
+      atom.cpu_type[i] = type_j;
+      atom.cpu_type[j] = type_i;
+
+      auto atom_symbol_i = atom.cpu_atom_symbol[i];
+      atom.cpu_atom_symbol[i] = atom.cpu_atom_symbol[j];
+      atom.cpu_atom_symbol[j] = atom_symbol_i;
+
+      double mass_i = atom.cpu_mass[i];
+      atom.cpu_mass[i] = atom.cpu_mass[j];
+      atom.cpu_mass[j] = mass_i;
+
+      //get the output data
+      GPU_Vector<double> outer_atoms_flags(local_N_cpu);
+      outer_atoms_flags.fill(0);
+      get_outer_atoms<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+        atom.number_of_atoms,
+        local_N_cpu,
+        local_index.data(),
+        box,
+        i,
+        j,
+        rc_radius * rc_radius * (scale_factor - 1) * (scale_factor - 1),
+        rc_radius * rc_radius * scale_factor * scale_factor,
+        atom.position_per_atom.data(),
+        atom.position_per_atom.data() + atom.number_of_atoms,
+        atom.position_per_atom.data() + atom.number_of_atoms * 2,
+        outer_atoms_flags.data());
+      CHECK(gpuDeviceSynchronize());
+
+      GPU_Vector<double> max_displacement(1);
+      double average_displacement = get_outer_average_displacement(
+        box,
+        atom.position_per_atom.data(),
+        local_atoms.position_per_atom.data(),
+        atom.number_of_atoms,
+        local_atoms.number_of_atoms,
+        local_index.data(),
+        outer_atoms_flags.data(),
+        max_displacement.data());
+      double max_value;
+      max_displacement.copy_to_host(&max_value);
+
+      mc_output<< step << "\t" << max_value << "\t" << average_displacement 
+      << "\t" << energy_difference << "\t" << num_accepted / double(step) << std::endl;
+
+      //copy the relaxed local structure to the global structure
+      build_all_atoms(
+        atom,
+        local_atoms,
+        local_N_cpu,
+        local_index.data());
+      exchange<<<1, 1>>>(
+      i,
+      j,
+      type_i,
+      type_j,
+      atom.type.data(),
+      atom.mass.data(),
+      atom.velocity_per_atom.data(),
+      atom.velocity_per_atom.data() + atom.number_of_atoms,
+      atom.velocity_per_atom.data() + atom.number_of_atoms * 2);
+    }
+    //need modification
+    force.potentials[0]->N2 = atom.number_of_atoms;
+  }
+}
\ No newline at end of file
diff --git a/src/mc_minimize/mc_minimizer_local.cuh b/src/mc_minimize/mc_minimizer_local.cuh
new file mode 100644
index 000000000..be14daf9b
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_local.cuh
@@ -0,0 +1,44 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mc_minimizer.cuh"
+
+class MC_Minimizer_Local : public MC_Minimizer
+{
+private:
+  double scale_factor;
+  double temperature;
+  double force_tolerance;
+  int max_relax_steps;
+public:
+    MC_Minimizer_Local(const char** param, int num_param,
+    double scale_factor_input,
+    double temperature_input,
+    double force_tolerance_input,
+    int max_relax_steps_input);
+    ~MC_Minimizer_Local();
+
+    virtual void compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id);
+};
+
diff --git a/src/mc_minimize/mc_minimizer_test.cu b/src/mc_minimize/mc_minimizer_test.cu
new file mode 100644
index 000000000..af6ac62d7
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_test.cu
@@ -0,0 +1,807 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mc_minimizer_test.cuh"
+
+MC_Minimizer_Test::MC_Minimizer_Test(
+  const char** param, int num_param,
+    double scale_factor_input,
+    double temperature_input,
+    double force_tolerance_input,
+    int max_relax_steps_input)
+  : MC_Minimizer(param, num_param)
+{
+  scale_factor = scale_factor_input;
+  temperature = temperature_input;
+  force_tolerance = force_tolerance_input;
+  max_relax_steps = max_relax_steps_input;
+}
+
+MC_Minimizer_Test::~MC_Minimizer_Test()
+{
+  //default destructor
+}
+
+namespace
+{
+/*
+this function can find out the atoms whose distance from two atom centers is within rc_radial
+*/
+static __global__ void get_neighbors_of_i_and_j(
+  const int N,
+  const Box box,
+  const int i,
+  const int j,
+  const float rc_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  int* g_NN_ij,
+  int* g_NL_ij)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < N) {
+    double x0 = g_x[n];
+    double y0 = g_y[n];
+    double z0 = g_z[n];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rc_radial_square) {
+      g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+    } else {
+      if (distance_square_j < rc_radial_square) {
+        g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+      }
+    }
+  }
+}
+
+// a kernel with a single thread <<<1, 1>>>
+static __global__ void exchange(
+  const int i,
+  const int j,
+  const int type_i,
+  const int type_j,
+  int* g_type,
+  double* g_mass,
+  double* g_vx,
+  double* g_vy,
+  double* g_vz)
+{
+  g_type[i] = type_j;
+  g_type[j] = type_i;
+
+  double mass_i = g_mass[i];
+  g_mass[i] = g_mass[j];
+  g_mass[j] = mass_i;
+
+  double vx_i = g_vx[i];
+  g_vx[i] = g_vx[j];
+  g_vx[j] = vx_i;
+
+  double vy_i = g_vy[i];
+  g_vy[i] = g_vy[j];
+  g_vy[j] = vy_i;
+
+  double vz_i = g_vz[i];
+  g_vz[i] = g_vz[j];
+  g_vz[j] = vz_i;
+}
+
+// find the local index of ij
+static __global__ void find_local_ij(
+  const int i,
+  const int j,
+  const int local_N,
+  int* local_index,
+  int* local_i,
+  int* local_j)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N)
+  {
+    if (local_index[n] == i)
+    {
+        *local_i = n;
+    }
+    if (local_index[n] == j)
+    {
+        *local_j = n;
+    }
+  }
+}
+
+/*
+this function is used for label the atoms within the rc_radial, which will return an array of [0, 1, 0, 0......].
+1 stands for inside, and 0 stands for outside.
+*/
+static __global__ void get_sphere_atoms(
+  const int N,
+  const int local_N,
+  const int* local_index,
+  const Box box,
+  const int i,
+  const int j,
+  const float rc_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  double* local_sphere_flags)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N) {
+    int index = local_index[n];
+    double x0 = g_x[index];
+    double y0 = g_y[index];
+    double z0 = g_z[index];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rc_radial_square) {
+      local_sphere_flags[n] = 1;
+      local_sphere_flags[n + local_N] = 1;
+      local_sphere_flags[n + 2 * local_N] = 1;
+    } else {
+      if (distance_square_j < rc_radial_square) {
+        local_sphere_flags[n] = 1;
+        local_sphere_flags[n + local_N] = 1;
+        local_sphere_flags[n + 2 * local_N] = 1;
+      }
+    }
+  }
+}
+
+/*
+This function is used to mark atoms whose distance from two atom centers is between the range of rcmin and rcmax
+*/
+static __global__ void get_outer_atoms(
+  const int N,
+  const int local_N,
+  const int* local_index,
+  const Box box,
+  const int i,
+  const int j,
+  const float rcmin_radial_square,
+  const float rcmax_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  double* outer_atoms_flags)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < local_N) {
+    int index = local_index[n];
+    double x0 = g_x[index];
+    double y0 = g_y[index];
+    double z0 = g_z[index];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rcmax_radial_square && distance_square_i > rcmin_radial_square && distance_square_j > rcmin_radial_square) {
+      outer_atoms_flags[n] += 1;
+    } else {
+      if (distance_square_j < rcmax_radial_square && distance_square_j > rcmin_radial_square && distance_square_i > rcmin_radial_square) {
+        outer_atoms_flags[n] += 1;
+      }
+    }
+  }
+}
+
+//copy from a to b, for position, dimension is 3, for mass, dimension is 1, and so on
+template <typename T>
+static __global__ void copy
+(
+  T* a,
+  T* b,
+  int* local_index,
+  int global_N,
+  int local_N,
+  int dimension)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    for (int i = 0; i < dimension; i++)
+    {
+      b[n + i * local_N] = a[index + i * global_N];
+    }
+  }
+}
+
+
+//copy from b to a
+template <typename T>
+static __global__ void copy_back
+(
+  T* a,
+  T* b,
+  int* local_index,
+  int global_N,
+  int local_N,
+  int dimension)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    for (int i = 0; i < dimension; i++)
+    {
+      a[index + i * global_N] = b[n + i * local_N];
+    }
+  }
+}
+
+//sum up
+static __global__ void gpu_sum(const int size, double* a, double* result)
+{
+  int number_of_patches = (size - 1) / 1024 + 1;
+  int tid = threadIdx.x;
+  int n, patch;
+  __shared__ double data[1024];
+  data[tid] = 0.0;
+  for (patch = 0; patch < number_of_patches; ++patch) {
+    n = tid + patch * 1024;
+    if (n < size)
+      data[tid] += a[n];
+  }
+  __syncthreads();
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      data[tid] += data[tid + offset];
+    }
+    __syncthreads();
+  }
+  if (tid == 0)
+    *result = data[0];
+}
+
+/*
+calculate the displacement for the labelled atomsd
+*/
+static __global__ void get_displacement(
+  const Box box,
+  double* global_position,
+  double* local_position,
+  int global_N,
+  int local_N,
+  int* local_index,
+  double* outer_atoms_flags,
+  double* displacement)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    int index = local_index[n];
+    double displace = 0;
+    double r12[3];
+    for (int i = 0; i < 3; i++)
+    {
+      r12[i] = (global_position[index + i * global_N] - local_position[n + i * local_N]);
+    }
+    apply_mic(box, r12[0], r12[1], r12[2]);
+    displace = r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2];
+    displacement[n] = sqrt(displace) * outer_atoms_flags[n];
+  }
+}
+
+//remove the energy of the background atoms
+static __global__ void modify_energy(
+  const int local_N,
+  double* pe,
+  double* local_sphere_flags)
+{
+  int n = blockDim.x * blockIdx.x + threadIdx.x;
+  if (n < local_N)
+  {
+    pe[n] *= local_sphere_flags[n];
+  } 
+}
+
+//calculate the max value
+__global__ void gpu_calculate_max(
+  const int size,
+  const int number_of_rounds,
+  const double* array,
+  double* max)
+{
+  const int tid = threadIdx.x;
+
+  __shared__ double s_max[1024]; // Shared memory for max values
+  s_max[tid] = 0;         
+
+  double max_value = 0;   // Initialize local max
+
+  for (int round = 0; round < number_of_rounds; ++round) {
+    const int n = tid + round * 1024;
+    if (n < size) {
+      const double f = array[n];
+      if (f > max_value)
+        max_value = f;           // Update local max
+    }
+  }
+
+  s_max[tid] = max_value;        // Write local max to shared memory
+  __syncthreads();
+
+  for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      if (s_max[tid + offset] > s_max[tid]) {
+        s_max[tid] = s_max[tid + offset];
+      }
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    max[0] = s_max[0]; // Block's final max written to global memory
+  }
+}
+
+double get_outer_average_displacement(
+  const Box box,
+  double* global_position,
+  double* local_position,
+  int global_N,
+  int local_N,
+  int* local_index,
+  double* outer_atoms_flags,
+  double* max)
+{
+  GPU_Vector<double> displacement(local_N);
+  get_displacement<<<(local_N - 1) / 64 + 1, 64>>>(
+    box,
+    global_position,
+    local_position,
+    global_N,
+    local_N,
+    local_index,
+    outer_atoms_flags,
+    displacement.data());
+  CHECK(gpuDeviceSynchronize());
+  
+  //calculate the average displacement
+  const int number_of_rounds = (local_N - 1) / 1024 + 1;
+  gpu_calculate_max<<<1, 1024>>>(local_N, number_of_rounds, displacement.data(), max);
+
+  GPU_Vector<double> result(1);
+  gpu_sum<<<1, 1024>>>(local_N, displacement.data(), result.data());
+  GPU_Vector<double> outer_number(1);
+  gpu_sum<<<1, 1024>>>(local_N, outer_atoms_flags, outer_number.data());
+  CHECK(gpuDeviceSynchronize());
+  double number;
+  outer_number.copy_to_host(&number, 1);
+  double ans;
+  result.copy_to_host(&ans, 1);
+  ans /= number;
+  return ans;
+}
+
+//copy the local structure to previous system
+void build_all_atoms(
+  Atom& atom,
+  Atom& local_atoms,
+  int local_N,
+  int* local_index)
+{
+  copy_back<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.position_per_atom.data(),
+    local_atoms.position_per_atom.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    3);
+  CHECK(gpuDeviceSynchronize());
+}
+
+//construct a local structure
+void build_local_atoms(
+  Atom& atom,
+  Atom& local_atoms,
+  int local_N,
+  int* local_index)
+{
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.mass.data(),
+    local_atoms.mass.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    1);
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.type.data(),
+    local_atoms.type.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    1);
+  copy<<<(local_N - 1) / 64 + 1, 64>>>(
+    atom.position_per_atom.data(),
+    local_atoms.position_per_atom.data(),
+    local_index,
+    atom.number_of_atoms,
+    local_N,
+    3);
+  CHECK(gpuDeviceSynchronize());
+}
+}
+
+//implement the test
+void MC_Minimizer_Test::compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id)
+{
+  int group_size =
+    grouping_method >= 0 ? group[grouping_method].cpu_size[group_id] : atom.number_of_atoms;
+  std::uniform_int_distribution<int> r1(0, group_size - 1);
+
+  int num_accepted = 0;
+  float rc_radius = force.potentials[0]->rc;
+
+  //Construct an atom object and perform independent global relaxation calculations to compare with local relaxation. The same atoms are exchanged between global and local relaxation in each iteration.
+  Atom atom_global;
+  int N = atom.number_of_atoms;
+  atom_global.number_of_atoms = N;
+  atom_global.mass.resize(N);
+  atom_global.type.resize(N);
+  atom_global.potential_per_atom.resize(N, 0);
+  atom_global.force_per_atom.resize(3 * N, 0);
+  atom_global.velocity_per_atom.resize(3 * N, 0);
+  atom_global.position_per_atom.resize(3 * N);
+  atom_global.virial_per_atom.resize(9 * N, 0);
+
+  atom_global.mass.copy_from_device(atom.mass.data(), N);
+  atom_global.type.copy_from_device(atom.type.data(), N);
+  atom_global.position_per_atom.copy_from_device(atom.position_per_atom.data(), 3 * N);
+
+  for (int step = 1; step <= trials; ++step) {
+
+    int i = grouping_method >= 0
+              ? group[grouping_method]
+                  .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+              : r1(rng);
+    int type_i = atom.cpu_type[i];
+    int j = 0, type_j = type_i;
+    while (type_i == type_j) {
+      j = grouping_method >= 0
+            ? group[grouping_method]
+                .cpu_contents[group[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+            : r1(rng);
+      type_j = atom.cpu_type[j];
+    }
+
+    //measure the running time
+    auto start_time_global = std::chrono::high_resolution_clock::now();
+    //calculate the global relaxation energy after swap
+    force.potentials[0]->N2 = atom.number_of_atoms;
+    Atom atom_global_copy;
+    atom_global_copy.number_of_atoms = N;
+    atom_global_copy.mass.resize(N);
+    atom_global_copy.type.resize(N);
+    atom_global_copy.potential_per_atom.resize(N, 0);
+    atom_global_copy.force_per_atom.resize(3 * N, 0);
+    atom_global_copy.velocity_per_atom.resize(3 * N, 0);
+    atom_global_copy.position_per_atom.resize(3 * N);
+    atom_global_copy.virial_per_atom.resize(9 * N, 0);
+
+    atom_global_copy.mass.copy_from_device(atom_global.mass.data(), N);
+    atom_global_copy.type.copy_from_device(atom_global.type.data(), N);
+    atom_global_copy.position_per_atom.copy_from_device(atom_global.position_per_atom.data(), 3 * N);
+    //calculate the global energy before swap
+    force.compute(
+      box,
+      atom_global.position_per_atom,
+      atom_global.type,
+      group,
+      atom_global.potential_per_atom,
+      atom_global.force_per_atom,
+      atom_global.virial_per_atom);
+    double pe_before_total_global = 0;
+    std::vector<double> pe_before_cpu_global(N);
+    atom_global.potential_per_atom.copy_to_host(pe_before_cpu_global.data(), N);
+    //calculate the energy after swap
+    exchange<<<1, 1>>>(
+      i,
+      j,
+      type_i,
+      type_j,
+      atom_global_copy.type.data(),
+      atom_global_copy.mass.data(),
+      atom_global_copy.velocity_per_atom.data(),
+      atom_global_copy.velocity_per_atom.data() + N,
+      atom_global_copy.velocity_per_atom.data() + N * 2);
+
+    Minimizer_FIRE minimizer_compare_global(N, max_relax_steps, force_tolerance);
+    minimizer_compare_global.compute(
+      force,
+      box,
+      atom_global_copy.position_per_atom,
+      atom_global_copy.type,
+      group,
+      atom_global_copy.potential_per_atom,
+      atom_global_copy.force_per_atom,
+      atom_global_copy.virial_per_atom);
+    double pe_after_total_global = 0;
+    std::vector<double> pe_after_cpu_global(N);
+    atom_global_copy.potential_per_atom.copy_to_host(pe_after_cpu_global.data(), N);
+    for (int n = 0; n < N; ++n) {
+      pe_after_total_global += pe_after_cpu_global[n];
+      pe_before_total_global += pe_before_cpu_global[n];
+    }
+    auto end_time_global = std::chrono::high_resolution_clock::now();
+    auto duration_global = std::chrono::duration_cast<std::chrono::microseconds>(end_time_global - start_time_global).count();
+    //measure the local running time
+    auto start_time_local = std::chrono::high_resolution_clock::now();
+    Atom local_atoms;
+    //find local atoms
+    GPU_Vector<int> local_i;  //the local index of i atom
+    GPU_Vector<int> local_j;  //the local index of j atom
+    GPU_Vector<int> local_N; //the length of local_index
+    GPU_Vector<int> local_index; // an array contains the index of the local atoms
+    GPU_Vector<double> local_sphere_flags; // an array labels the atoms inside shell
+    local_N.resize(1);
+    local_i.resize(1);
+    local_j.resize(1);
+    local_index.resize(atom.number_of_atoms);
+    CHECK(gpuMemset(local_N.data(), 0, sizeof(int)));
+    CHECK(gpuMemset(local_i.data(), 0, sizeof(int)));
+    CHECK(gpuMemset(local_j.data(), 0, sizeof(int)));
+    get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      box,
+      i,
+      j,
+      rc_radius * rc_radius * (scale_factor + 1.2) * (scale_factor + 1.2),
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      local_N.data(),
+      local_index.data());
+    GPU_CHECK_KERNEL
+
+    int local_N_cpu;
+    local_N.copy_to_host(&local_N_cpu);
+    local_sphere_flags.resize(local_N_cpu * 3);
+    local_sphere_flags.fill(0);
+
+    find_local_ij<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      i,
+      j,
+      local_N_cpu,
+      local_index.data(),
+      local_i.data(),
+      local_j.data());
+    int local_i_cpu;
+    int local_j_cpu;
+    local_i.copy_to_host(&local_i_cpu);
+    local_j.copy_to_host(&local_j_cpu);
+
+    //get sphere atoms
+    get_sphere_atoms<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      local_N_cpu,
+      local_index.data(),
+      box,
+      i,
+      j,
+      rc_radius * rc_radius * scale_factor * scale_factor,
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      local_sphere_flags.data());
+
+    //build the local_atoms
+    local_atoms.number_of_atoms = local_N_cpu;
+    local_atoms.position_per_atom.resize(3 * local_N_cpu);
+    local_atoms.force_per_atom.resize(3 * local_N_cpu);
+    local_atoms.type.resize(local_N_cpu);
+    local_atoms.velocity_per_atom.resize(3 * local_N_cpu);
+    local_atoms.mass.resize(local_N_cpu);
+    local_atoms.potential_per_atom.resize(local_N_cpu);
+    local_atoms.virial_per_atom.resize(9 * local_N_cpu);
+
+    //initialize
+    local_atoms.force_per_atom.fill(0);
+    local_atoms.potential_per_atom.fill(0);
+    local_atoms.velocity_per_atom.fill(0);
+    local_atoms.virial_per_atom.fill(0);
+  
+    build_local_atoms(
+      atom,
+      local_atoms,
+      local_N_cpu,
+      local_index.data());
+    
+    //calculate the origin energy, WARNING: here the group is not correct but now this variable will not be used in compute()
+    //need to modify here
+    force.potentials[0]->N2 = local_N_cpu;
+    force.compute(box,
+    local_atoms.position_per_atom,
+    local_atoms.type,
+    group,
+    local_atoms.potential_per_atom,
+    local_atoms.force_per_atom,
+    local_atoms.virial_per_atom);
+    modify_energy<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      local_N_cpu,
+      local_atoms.potential_per_atom.data(),
+      local_sphere_flags.data());
+
+    std::vector<double> pe_before_cpu(local_N_cpu);
+    local_atoms.potential_per_atom.copy_to_host(pe_before_cpu.data(), local_N_cpu);
+
+    exchange<<<1, 1>>>(
+      local_i_cpu,
+      local_j_cpu,
+      type_i,
+      type_j,
+      local_atoms.type.data(),
+      local_atoms.mass.data(),
+      local_atoms.velocity_per_atom.data(),
+      local_atoms.velocity_per_atom.data() + local_atoms.number_of_atoms,
+      local_atoms.velocity_per_atom.data() + local_atoms.number_of_atoms * 2);
+
+    Minimizer_FIRE Minimizer(
+      local_N_cpu,
+      max_relax_steps,
+      force_tolerance);
+
+    Minimizer.compute_label_atoms(
+      force,
+      box,
+      local_atoms.position_per_atom,
+      local_atoms.type,
+      group,
+      local_sphere_flags,
+      local_atoms.potential_per_atom,
+      local_atoms.force_per_atom,
+      local_atoms.virial_per_atom);
+
+    std::vector<double> pe_after_cpu(local_N_cpu);
+    local_atoms.potential_per_atom.copy_to_host(pe_after_cpu.data(), local_N_cpu);
+    double pe_before_total = 0;
+    double pe_after_total = 0;
+    for (int n = 0; n < local_N_cpu; ++n) {
+      pe_before_total += pe_before_cpu[n];
+    }
+    for (int n = 0; n < local_N_cpu; ++n) {
+      pe_after_total += pe_after_cpu[n];
+    }
+    auto end_time_local = std::chrono::high_resolution_clock::now();
+    auto duration_local = std::chrono::duration_cast<std::chrono::microseconds>(end_time_local - start_time_local).count();
+
+    //determine if change or not
+    //printf("        energy before swapping = %g.10 eV.\n", pe_before_total);
+    //printf("        energy after swapping = %g.10 eV.\n", pe_after_total);
+    double energy_difference = pe_after_total - pe_before_total;
+    std::uniform_real_distribution<float> r2(0, 1);
+    float random_number = r2(rng);
+    double probability = exp(-energy_difference / (K_B * temperature));
+
+      //get the output data
+    GPU_Vector<double> outer_atoms_flags(local_N_cpu);
+    outer_atoms_flags.fill(0);
+    get_outer_atoms<<<(local_N_cpu - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      local_N_cpu,
+      local_index.data(),
+      box,
+      i,
+      j,
+      rc_radius * rc_radius * (scale_factor - 1) * (scale_factor - 1),
+      rc_radius * rc_radius * scale_factor * scale_factor,
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      outer_atoms_flags.data());
+    CHECK(gpuDeviceSynchronize());
+
+    GPU_Vector<double> max_displacement(1);
+    double average_displacement = get_outer_average_displacement(
+      box,
+      atom.position_per_atom.data(),
+      local_atoms.position_per_atom.data(),
+      atom.number_of_atoms,
+      local_atoms.number_of_atoms,
+      local_index.data(),
+      outer_atoms_flags.data(),
+      max_displacement.data());
+    double max_value;
+    max_displacement.copy_to_host(&max_value);
+    double energy_difference_global = pe_after_total_global - pe_before_total_global;
+    double probability_global = exp(-energy_difference_global / (K_B * temperature));
+    double accelerate_ratio = (double)(duration_global - duration_local) / duration_global;
+      mc_output<< step << "\t" << max_value << "\t" << average_displacement 
+      << "\t" << energy_difference << "\t" << energy_difference_global << "\t" << num_accepted / double(step)
+      << "\t" << probability << "\t" << probability_global << "\t" << accelerate_ratio << std::endl;
+
+    if (random_number < probability) {
+      ++num_accepted;
+
+      atom.cpu_type[i] = type_j;
+      atom.cpu_type[j] = type_i;
+
+      auto atom_symbol_i = atom.cpu_atom_symbol[i];
+      atom.cpu_atom_symbol[i] = atom.cpu_atom_symbol[j];
+      atom.cpu_atom_symbol[j] = atom_symbol_i;
+
+      double mass_i = atom.cpu_mass[i];
+      atom.cpu_mass[i] = atom.cpu_mass[j];
+      atom.cpu_mass[j] = mass_i;
+
+      atom_global.position_per_atom.copy_from_device(atom_global_copy.position_per_atom.data(), 3*N);
+      exchange<<<1, 1>>>(
+        i,
+        j,
+        type_i,
+        type_j,
+        atom_global.type.data(),
+        atom_global.mass.data(),
+        atom_global.velocity_per_atom.data(),
+        atom_global.velocity_per_atom.data() + atom_global.number_of_atoms,
+        atom_global.velocity_per_atom.data() + atom_global.number_of_atoms * 2);
+
+      //copy the relaxed local structure to the global structure
+      build_all_atoms(
+        atom,
+        local_atoms,
+        local_N_cpu,
+        local_index.data());
+      exchange<<<1, 1>>>(
+        i,
+        j,
+        type_i,
+        type_j,
+        atom.type.data(),
+        atom.mass.data(),
+        atom.velocity_per_atom.data(),
+        atom.velocity_per_atom.data() + atom.number_of_atoms,
+        atom.velocity_per_atom.data() + atom.number_of_atoms * 2);
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/mc_minimize/mc_minimizer_test.cuh b/src/mc_minimize/mc_minimizer_test.cuh
new file mode 100644
index 000000000..5b1bd60d1
--- /dev/null
+++ b/src/mc_minimize/mc_minimizer_test.cuh
@@ -0,0 +1,44 @@
+/*
+    Copyright 2017 Zheyong Fan and GPUMD development team
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mc_minimizer.cuh"
+
+class MC_Minimizer_Test : public MC_Minimizer
+{
+private:
+  double scale_factor;
+  double temperature;
+  double force_tolerance;
+  int max_relax_steps;
+public:
+    MC_Minimizer_Test(const char** param, int num_param,
+    double scale_factor_input,
+    double temperature_input,
+    double force_tolerance_input,
+    int max_relax_steps_input);
+    ~MC_Minimizer_Test();
+
+    virtual void compute(
+    int trials,
+    Force& force,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id);
+};
+
diff --git a/src/minimize/minimizer_fire.cu b/src/minimize/minimizer_fire.cu
index 9784030c9..e6c25ce03 100644
--- a/src/minimize/minimizer_fire.cu
+++ b/src/minimize/minimizer_fire.cu
@@ -179,4 +179,88 @@ void Minimizer_FIRE::compute(
   }
 
   printf("Energy minimization finished.\n");
-}
\ No newline at end of file
+}
+
+void Minimizer_FIRE::compute_label_atoms(
+    Force& force,
+    Box& box,
+    GPU_Vector<double>& position_per_atom,
+    GPU_Vector<int>& type,
+    std::vector<Group>& group,
+    GPU_Vector<double>& local_flags,
+    GPU_Vector<double>& potential_per_atom,
+    GPU_Vector<double>& force_per_atom,
+    GPU_Vector<double>& virial_per_atom)
+{
+  double next_dt;
+  const int size = number_of_atoms_ * 3;
+  //int base = (number_of_steps_ >= 10) ? (number_of_steps_ / 10) : 1;
+  // create a velocity vector in GPU
+  GPU_Vector<double> v(size, 0);
+  GPU_Vector<double> temp1(size);
+  GPU_Vector<double> temp2(size);
+
+  //printf("\nEnergy minimization started.\n");
+
+  for (int step = 0; step < number_of_steps_; ++step) {
+    force.compute(
+      box, position_per_atom, type, group, potential_per_atom, force_per_atom, virial_per_atom);
+    pairwise_product(force_per_atom, local_flags, force_per_atom);
+    pairwise_product(potential_per_atom, local_flags, potential_per_atom);
+    calculate_force_square_max(force_per_atom);
+    const double force_max = sqrt(cpu_force_square_max_[0]);
+    calculate_total_potential(potential_per_atom);
+
+    /*if (step % base == 0 || force_max < force_tolerance_) {
+      printf(
+        "    step %d: total_potential = %.10f eV, f_max = %.10f eV/A.\n",
+        step,
+        cpu_total_potential_[0],
+        force_max);
+    */
+    if (force_max < force_tolerance_)
+      break;
+    
+    P = dot(v, force_per_atom);
+
+    if (P > 0) {
+      if (N_neg > N_min) {
+        next_dt = dt * f_inc;
+        if (next_dt < dt_max)
+          dt = next_dt;
+        alpha *= f_alpha;
+      }
+      N_neg++;
+    } else {
+      next_dt = dt * f_dec;
+      if (next_dt > dt_min)
+        dt = next_dt;
+      alpha = alpha_start;
+      // move position back
+      scalar_multiply(-0.5 * dt, v, temp1);
+      pairwise_product(temp1, local_flags, temp1);
+      vector_sum(position_per_atom, temp1, position_per_atom);
+      v.fill(0);
+      N_neg = 0;
+    }
+
+    // md step
+    // implicit Euler integration
+    double F_modulus = sqrt(dot(force_per_atom, force_per_atom));
+    double v_modulus = sqrt(dot(v, v));
+    // dv = F/m*dt
+    scalar_multiply(dt / m, force_per_atom, temp2);
+    vector_sum(v, temp2, v);
+    pairwise_product(v, local_flags, v);
+    scalar_multiply(1 - alpha, v, temp1);
+    scalar_multiply(alpha * v_modulus / F_modulus, force_per_atom, temp2);
+    vector_sum(temp1, temp2, v);
+    pairwise_product(v, local_flags, v);
+    // dx = v*dt
+    scalar_multiply(dt, v, temp1);
+    pairwise_product(temp1, local_flags, temp1);
+    vector_sum(position_per_atom, temp1, position_per_atom);
+  }
+
+  //printf("Energy minimization finished.\n");
+}
diff --git a/src/minimize/minimizer_fire.cuh b/src/minimize/minimizer_fire.cuh
index a01cd8f90..79faeb2b9 100644
--- a/src/minimize/minimizer_fire.cuh
+++ b/src/minimize/minimizer_fire.cuh
@@ -50,4 +50,21 @@ public:
     GPU_Vector<double>& potential_per_atom,
     GPU_Vector<double>& force_per_atom,
     GPU_Vector<double>& virial_per_atom);
+
+
+/*
+the additional array, local_flags is needed. The length of the array should be same with atom total number.
+With 1 means need to update and 0 means keep static
+This is currently dedicated to the mc_minimize module
+*/
+  void compute_label_atoms(
+    Force& force,
+    Box& box,
+    GPU_Vector<double>& position_per_atom,
+    GPU_Vector<int>& type,
+    std::vector<Group>& group,
+    GPU_Vector<double>& local_flags,
+    GPU_Vector<double>& potential_per_atom,
+    GPU_Vector<double>& force_per_atom,
+    GPU_Vector<double>& virial_per_atom);
 };
\ No newline at end of file