From b774e3c0bc9c7fca1d79709341638d4ff83add77 Mon Sep 17 00:00:00 2001
From: Daniel Peter <peterda@ethz.ch>
Date: Fri, 6 Jul 2012 13:58:19 +0000
Subject: [PATCH] adds GPU functionality to SPECFEM3D solver (requires to rerun
 ./configure); merges GPU branch modifications to main trunk; adds src/cuda
 directory & files; updates Par_files; renames compute_forces_elastic.F90 and
 prepare_timerun.F90

---
 AUTHORS                                       |    1 +
 config.h.in                                   |    6 +
 configure                                     |  194 +-
 configure.ac                                  |   35 +-
 in_data_files/Par_file                        |    3 +
 src/cuda/check_fields_cuda.cu                 |  566 +++
 src/cuda/compute_add_sources_acoustic_cuda.cu |  370 ++
 src/cuda/compute_add_sources_elastic_cuda.cu  |  422 ++
 src/cuda/compute_coupling_cuda.cu             |  345 ++
 src/cuda/compute_forces_acoustic_cuda.cu      |  966 +++++
 src/cuda/compute_forces_elastic_cuda.cu       | 2255 +++++++++++
 src/cuda/compute_kernels_cuda.cu              |  646 +++
 src/cuda/compute_stacey_acoustic_cuda.cu      |  192 +
 src/cuda/compute_stacey_elastic_cuda.cu       |  214 +
 src/cuda/it_update_displacement_cuda.cu       |  228 ++
 src/cuda/mesh_constants_cuda.h                |  467 +++
 src/cuda/noise_tomography_cuda.cu             |  305 ++
 src/cuda/prepare_constants_cuda.h             |   99 +
 src/cuda/prepare_mesh_constants_cuda.cu       | 2037 ++++++++++
 src/cuda/save_and_compare_cpu_vs_gpu.c        |  318 ++
 src/cuda/specfem3D_gpu_cuda_method_stubs.c    |  805 ++++
 .../specfem3D_gpu_cuda_method_stubs.c.bak     |  805 ++++
 src/cuda/transfer_fields_cuda.cu              |  586 +++
 src/cuda/write_seismograms_cuda.cu            |  537 +++
 .../decompose_mesh_SCOTCH.f90                 |   44 +-
 .../part_decompose_mesh_SCOTCH.f90            |   26 +-
 .../program_decompose_mesh_SCOTCH.f90         |    3 +
 src/generate_databases/Makefile.in            |   17 +-
 .../create_mass_matrices.f90                  |   14 +-
 .../create_regions_mesh.f90                   |  855 +++-
 src/generate_databases/generate_databases.f90 |   15 +-
 .../get_coupling_surfaces.f90                 | 1419 ++-----
 src/generate_databases/get_global.f90         |    4 +-
 src/generate_databases/get_model.f90          |  143 +-
 src/generate_databases/get_perm_color.f90     | 1210 ++++++
 src/generate_databases/memory_eval.f90        |   10 +-
 src/generate_databases/model_1d_cascadia.f90  |   10 +-
 src/generate_databases/model_1d_prem.f90      |  129 +-
 src/generate_databases/model_1d_socal.f90     |   12 +-
 src/generate_databases/model_aniso.f90        |   23 +
 src/generate_databases/model_default.f90      |   20 +-
 .../model_external_values.f90                 |   12 +-
 src/generate_databases/model_gll.f90          |   28 +-
 src/generate_databases/model_ipati.f90        |  113 +
 .../model_salton_trough.f90                   |   40 +-
 src/generate_databases/model_tomography.f90   |    7 +-
 src/generate_databases/save_arrays_solver.f90 |  571 +--
 src/meshfem3D/check_mesh_quality.f90          |   22 +-
 src/meshfem3D/compute_parameters.f90          |    2 +-
 src/meshfem3D/create_regions_mesh.f90         |    4 +-
 src/meshfem3D/create_visual_files.f90         |    8 +-
 src/meshfem3D/meshfem3D.f90                   |    2 +-
 src/meshfem3D/store_boundaries.f90            |    6 +-
 src/shared/assemble_MPI_scalar.f90            |    1 -
 src/shared/check_mesh_resolution.f90          |   11 +-
 src/shared/combine_vol_data.f90               |   10 +-
 src/shared/compute_arrays_source.f90          |  352 ++
 src/shared/constants.h.in                     |   90 +-
 src/shared/detect_surface.f90                 |    6 +-
 src/shared/get_element_face.f90               |   66 +-
 src/shared/param_reader.c                     |    2 +-
 src/shared/read_parameter_file.f90            |   74 +-
 src/shared/read_topo_bathy_file.f90           |   92 +-
 src/shared/serial.f90                         |   11 +-
 src/shared/smooth_vol_data.f90                |   40 +-
 src/shared/sum_kernels.f90                    |   76 +-
 src/shared/write_VTK_data.f90                 |   18 +-
 src/shared/write_c_binary.c                   |   69 +-
 src/specfem3D/Makefile.in                     |  518 +--
 src/specfem3D/assemble_MPI_vector.f90         |  475 ++-
 .../compute_add_sources_acoustic.f90          |  602 +--
 src/specfem3D/compute_add_sources_elastic.f90 |  542 +--
 src/specfem3D/compute_coupling_elastic_po.f90 |    8 +-
 .../compute_coupling_poroelastic_el.f90       |    8 +-
 src/specfem3D/compute_forces_acoustic.f90     |  290 +-
 ...elastic.f90 => compute_forces_elastic.F90} |  458 ++-
 src/specfem3D/compute_forces_elastic_Dev.f90  | 3454 +---------------
 src/specfem3D/compute_forces_elastic_Dev2.f90 | 3477 +++++++++++++++++
 .../compute_forces_elastic_Dev_openmp.f90     |  786 ++++
 src/specfem3D/compute_gradient.f90            |   15 +-
 src/specfem3D/compute_kernels.f90             |  268 +-
 src/specfem3D/compute_stacey_acoustic.f90     |   85 +-
 src/specfem3D/compute_stacey_elastic.f90      |  106 +-
 src/specfem3D/create_color_image.f90          |  163 +-
 src/specfem3D/finalize_simulation.f90         |    4 +-
 src/specfem3D/initialize_simulation.f90       |   74 +-
 src/specfem3D/iterate_time.f90                |  383 +-
 src/specfem3D/locate_receivers.f90            |   27 +-
 src/specfem3D/locate_source.f90               |   14 +-
 src/specfem3D/make_gravity.f90                |  680 ++++
 src/specfem3D/noise_tomography.f90            |  395 +-
 ...repare_timerun.f90 => prepare_timerun.F90} |  410 +-
 src/specfem3D/read_mesh_databases.f90         |   80 +-
 src/specfem3D/setup_GLL_points.f90            |    6 +-
 src/specfem3D/setup_sources_receivers.f90     |  233 +-
 src/specfem3D/specfem3D_par.f90               |   95 +-
 src/specfem3D/write_movie_output.f90          |   54 +-
 src/specfem3D/write_seismograms.f90           |  140 +-
 utils/Cluster/pbs/valgrind_go_solver_pbs.bash |   77 +
 .../create_specfem3D_gpu_cuda_method_stubs.pl |  141 +
 utils/readme_cuda_cscs.txt                    |   63 +
 utils/remake_makefiles.sh                     |   12 +
 utils/update_headers_change_word_f90.pl       |  114 +
 103 files changed, 25331 insertions(+), 7015 deletions(-)
 create mode 100644 src/cuda/check_fields_cuda.cu
 create mode 100644 src/cuda/compute_add_sources_acoustic_cuda.cu
 create mode 100644 src/cuda/compute_add_sources_elastic_cuda.cu
 create mode 100644 src/cuda/compute_coupling_cuda.cu
 create mode 100644 src/cuda/compute_forces_acoustic_cuda.cu
 create mode 100644 src/cuda/compute_forces_elastic_cuda.cu
 create mode 100644 src/cuda/compute_kernels_cuda.cu
 create mode 100644 src/cuda/compute_stacey_acoustic_cuda.cu
 create mode 100644 src/cuda/compute_stacey_elastic_cuda.cu
 create mode 100644 src/cuda/it_update_displacement_cuda.cu
 create mode 100644 src/cuda/mesh_constants_cuda.h
 create mode 100644 src/cuda/noise_tomography_cuda.cu
 create mode 100644 src/cuda/prepare_constants_cuda.h
 create mode 100644 src/cuda/prepare_mesh_constants_cuda.cu
 create mode 100644 src/cuda/save_and_compare_cpu_vs_gpu.c
 create mode 100644 src/cuda/specfem3D_gpu_cuda_method_stubs.c
 create mode 100644 src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak
 create mode 100644 src/cuda/transfer_fields_cuda.cu
 create mode 100644 src/cuda/write_seismograms_cuda.cu
 create mode 100644 src/generate_databases/get_perm_color.f90
 create mode 100644 src/generate_databases/model_ipati.f90
 create mode 100644 src/shared/compute_arrays_source.f90
 rename src/specfem3D/{compute_forces_elastic.f90 => compute_forces_elastic.F90} (64%)
 create mode 100644 src/specfem3D/compute_forces_elastic_Dev2.f90
 create mode 100644 src/specfem3D/compute_forces_elastic_Dev_openmp.f90
 create mode 100644 src/specfem3D/make_gravity.f90
 rename src/specfem3D/{prepare_timerun.f90 => prepare_timerun.F90} (59%)
 create mode 100755 utils/Cluster/pbs/valgrind_go_solver_pbs.bash
 create mode 100755 utils/create_specfem3D_gpu_cuda_method_stubs.pl
 create mode 100644 utils/readme_cuda_cscs.txt
 create mode 100755 utils/remake_makefiles.sh
 create mode 100755 utils/update_headers_change_word_f90.pl

diff --git a/AUTHORS b/AUTHORS
index cf2dc7332..0938d8293 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -28,6 +28,7 @@ David Michéa,
 Hom Nath Gharti,
 Tarje Nissen-Meyer,
 Daniel Peter,
+Max Rietmann,
 Brian Savage,
 Bernhard Schuberth,
 Anne Sieminski,
diff --git a/config.h.in b/config.h.in
index 26a366c62..4d074a31d 100644
--- a/config.h.in
+++ b/config.h.in
@@ -85,3 +85,9 @@
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
 #undef YYTEXT_POINTER
+
+
+/* Uncomment and define to select optimized file i/o for regional simulations */
+/* map fails when output files are > 4GB, which is often the case for GPU simulations */
+// #define USE_MAP_FUNCTION
+
diff --git a/configure b/configure
index a9af4633e..6a581d7c6 100755
--- a/configure
+++ b/configure
@@ -604,11 +604,16 @@ ac_includes_default="\
 
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
+NVCC
 PYTHON_EGG_LDFLAGS
 PYTHON_EGG_CPPFLAGS
 PYTHON_EGG_CFLAGS
 PYTHONPATH
 LOCAL_PATH_IS_ALSO_GLOBAL
+OPENMP_LIB
+MPI_INC
+CUDA_INC
+CUDA_LIB
 FLAGS_NO_CHECK
 FLAGS_CHECK
 MPICC
@@ -673,6 +678,10 @@ PYTHON_EXEC_PREFIX
 PYTHON_PREFIX
 PYTHON_VERSION
 PYTHON
+COND_OPENMP_FALSE
+COND_OPENMP_TRUE
+COND_CUDA_FALSE
+COND_CUDA_TRUE
 CUSTOM_MPI_TYPE
 CUSTOM_REAL
 COND_MPI_FALSE
@@ -723,6 +732,8 @@ enable_option_checking
 with_pyre
 with_mpi
 enable_double_precision
+with_cuda
+with_openmp
 with_scotch_dir
 with_scotch_includedir
 with_scotch_libdir
@@ -749,6 +760,10 @@ MPILIBS
 MPICC
 FLAGS_CHECK
 FLAGS_NO_CHECK
+CUDA_LIB
+CUDA_INC
+MPI_INC
+OPENMP_LIB
 LOCAL_PATH_IS_ALSO_GLOBAL
 PYTHON
 PYTHONPATH'
@@ -1375,6 +1390,8 @@ Optional Packages:
   --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
   --with-pyre             build Pyrized version [default=no]
   --with-mpi              build parallel version [default=yes]
+  --with-cuda             build CUDA GPU enabled version [default=no]
+  --with-openmp           build OpenMP enabled version [default=no]
   --with-scotch-dir       define the root path to Scotch (e.g. /opt/scotch/)
   --with-scotch-includedir
                           define the path to the Scotch headers (e.g.
@@ -1413,6 +1430,11 @@ Some influential environment variables:
   FLAGS_NO_CHECK
               Fortran compiler flags for creating fast, production-run code
               for critical subroutines
+  CUDA_LIB    Location of CUDA library libcudart
+  CUDA_INC    Location of CUDA include files
+  MPI_INC     Location of MPI include mpi.h, which is needed by nvcc when
+              compiling cuda files
+  OPENMP_LIB  Location of extra OpenMP libraries
   LOCAL_PATH_IS_ALSO_GLOBAL
               files on a local path on each node are also seen as global with
               same path [default=true]
@@ -2279,6 +2301,40 @@ fi
 
 
 
+# Check whether --with-cuda was given.
+if test "${with_cuda+set}" = set; then :
+  withval=$with_cuda; want_cuda="$withval"
+else
+  want_cuda=no
+fi
+
+ if test "$want_cuda" = yes; then
+  COND_CUDA_TRUE=
+  COND_CUDA_FALSE='#'
+else
+  COND_CUDA_TRUE='#'
+  COND_CUDA_FALSE=
+fi
+
+
+
+# Check whether --with-openmp was given.
+if test "${with_openmp+set}" = set; then :
+  withval=$with_openmp; want_openmp="$withval"
+else
+  want_openmp=no
+fi
+
+ if test "$want_openmp" = yes; then
+  COND_OPENMP_TRUE=
+  COND_OPENMP_FALSE='#'
+else
+  COND_OPENMP_TRUE='#'
+  COND_OPENMP_FALSE=
+fi
+
+
+
 # Checks for programs.
 
 if test "$want_pyre" = yes; then
@@ -6390,6 +6446,9 @@ LDFLAGS=${ac_save_ldflags}
 
 
 
+
+
+
 if test x"$MPIFC" = x; then
     MPIFC=mpif90
 fi
@@ -6398,6 +6457,7 @@ if test x"$MPICC" = x; then
 fi
 
 
+
 if test x"$LOCAL_PATH_IS_ALSO_GLOBAL" = x; then
     LOCAL_PATH_IS_ALSO_GLOBAL=true
 fi
@@ -6708,9 +6768,131 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 fi
 
+if test "$want_cuda" = yes; then
+
+  # Extract the first word of "nvcc", so it can be a program name with args.
+set dummy nvcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_path_NVCC+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $NVCC in
+  [\\/]* | ?:[\\/]*)
+  ac_cv_path_NVCC="$NVCC" # Let the user override the test with a path.
+  ;;
+  *)
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_path_NVCC="$as_dir/$ac_word$ac_exec_ext"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  ;;
+esac
+fi
+NVCC=$ac_cv_path_NVCC
+if test -n "$NVCC"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NVCC" >&5
+$as_echo "$NVCC" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
-# Checks for library functions.
 
+  if test -z "$NVCC" ; then
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot find 'nvcc' program.
+See \`config.log' for more details" "$LINENO" 5; }
+    NVCC=`echo "Error: nvcc is not installed." ; false`
+  fi
+
+fi
+
+if test "$want_openmp" = yes; then
+
+ac_ext=${ac_fc_srcext-f}
+ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
+ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_fc_compiler_gnu
+
+cit_fc_save_fc=$FC
+cit_fc_save_fcflags=$FCFLAGS
+FC=$FC
+FCFLAGS="$FCFLAGS $FLAGS_NO_CHECK"
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP directives work" >&5
+$as_echo_n "checking whether OpenMP directives work... " >&6; }
+
+#AC_COMPILE_IFELSE(_CIT_FC_TRIVIAL_OPENMP_PROGRAM, [
+#    AC_MSG_RESULT(yes)
+#], [
+#    AC_MSG_RESULT(no)
+#    AC_MSG_FAILURE([cannot compile a trivial OpenMP program using $FC])
+#])
+
+cat > conftest.$ac_ext <<_ACEOF
+
+      program main
+
+       implicit none
+       integer OMP_get_thread_num
+       integer OMP_GET_MAX_THREADS
+       integer NUM_THREADS
+       integer thread_id
+
+       NUM_THREADS = OMP_GET_MAX_THREADS()
+       !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(thread_id)
+       thread_id = OMP_get_thread_num()+1
+       !$OMP END PARALLEL
+
+      end
+
+_ACEOF
+if ac_fn_fc_try_link "$LINENO"; then :
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot link a trivial OpenMP program using $FC with flags: $FLAGS_NO_CHECK
+See \`config.log' for more details" "$LINENO" 5; }
+
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+FC=$cit_fc_save_fc
+FCFLAGS=$cit_fc_save_fcflags
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+fi
+
+
+# Checks for library functions.
 
 ac_config_files="$ac_config_files Makefile src/meshfem3D/Makefile src/meshfem3D/constants.h src/meshfem3D/precision.h src/decompose_mesh_SCOTCH/Makefile src/specfem3D/Makefile src/generate_databases/Makefile src/shared/constants.h src/shared/precision.h src/decompose_mesh_SCOTCH/scotch_5.1.11/src/Makefile.inc src/check_mesh_quality_CUBIT_Abaqus/Makefile"
 
@@ -6831,6 +7013,14 @@ if test -z "${COND_MPI_TRUE}" && test -z "${COND_MPI_FALSE}"; then
   as_fn_error $? "conditional \"COND_MPI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${COND_CUDA_TRUE}" && test -z "${COND_CUDA_FALSE}"; then
+  as_fn_error $? "conditional \"COND_CUDA\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${COND_OPENMP_TRUE}" && test -z "${COND_OPENMP_FALSE}"; then
+  as_fn_error $? "conditional \"COND_OPENMP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -8017,4 +8207,6 @@ $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
 fi
 
 
+#daniel: adding custom lines to config.h.in through autoheader
+
 
diff --git a/configure.ac b/configure.ac
index 3dd651ac6..f1057fc99 100644
--- a/configure.ac
+++ b/configure.ac
@@ -45,6 +45,20 @@ fi
 AC_SUBST([CUSTOM_REAL])
 AC_SUBST([CUSTOM_MPI_TYPE])
 
+AC_ARG_WITH([cuda],
+    [AC_HELP_STRING([--with-cuda],
+        [build CUDA GPU enabled version @<:@default=no@:>@])],
+    [want_cuda="$withval"],
+    [want_cuda=no])
+AM_CONDITIONAL([COND_CUDA], [test "$want_cuda" = yes])
+
+AC_ARG_WITH([openmp],
+    [AC_HELP_STRING([--with-openmp],
+        [build OpenMP enabled version @<:@default=no@:>@])],
+    [want_openmp="$withval"],
+    [want_openmp=no])
+AM_CONDITIONAL([COND_OPENMP], [test "$want_openmp" = yes])
+
 
 # Checks for programs.
 
@@ -172,7 +186,7 @@ else
    AC_MSG_ERROR([No suitable yacc or bison found])
  fi
  ACX_PTHREAD(AC_MSG_RESULT([pthread found]), AC_MSG_ERROR([pthread not found]))
-
+ 
  #daniel: scotch bundle
  # uses bundled scotch: current version 5.1.11
  USE_BUNDLED_SCOTCH=1
@@ -339,12 +353,16 @@ AC_ARG_VAR(MPILIBS, [extra libraries for linking MPI programs])
 AC_ARG_VAR(MPICC, [MPI C compiler command])
 AC_ARG_VAR(FLAGS_CHECK, [Fortran compiler flags for non-critical subroutines])
 AC_ARG_VAR(FLAGS_NO_CHECK, [Fortran compiler flags for creating fast, production-run code for critical subroutines])
+AC_ARG_VAR(CUDA_LIB,[Location of CUDA library libcudart])
+AC_ARG_VAR(CUDA_INC,[Location of CUDA include files])
+AC_ARG_VAR(MPI_INC,[Location of MPI include mpi.h, which is needed by nvcc when compiling cuda files])
 if test x"$MPIFC" = x; then
     MPIFC=mpif90
 fi
 if test x"$MPICC" = x; then
     MPICC=mpicc
 fi
+AC_ARG_VAR(OPENMP_LIB,[Location of extra OpenMP libraries])
 
 AC_ARG_VAR(LOCAL_PATH_IS_ALSO_GLOBAL, [files on a local path on each node are also seen as global with same path @<:@default=true@:>@])
 if test x"$LOCAL_PATH_IS_ALSO_GLOBAL" = x; then
@@ -415,9 +433,16 @@ if test "$want_pyre" = yes; then
     CIT_FC_MAIN
 fi
 
+if test "$want_cuda" = yes; then
+    CIT_CUDA_COMPILER
+fi
+
+if test "$want_openmp" = yes; then
+    CIT_FC_OPENMP_MODULE([$FC],[$FLAGS_NO_CHECK])        
+fi
 
-# Checks for library functions.
 
+# Checks for library functions.
 
 AC_CONFIG_FILES([
 	Makefile
@@ -434,5 +459,11 @@ AC_CONFIG_FILES([
 ])
 AC_OUTPUT
 
+#daniel: adding custom lines to config.h.in through autoheader
+AH_BOTTOM([
+/* Uncomment and define to select optimized file i/o for regional simulations */
+/* map fails when output files are > 4GB, which is often the case for GPU simulations */
+// #define USE_MAP_FUNCTION
+])
 
 dnl end of configure.ac
diff --git a/in_data_files/Par_file b/in_data_files/Par_file
index 93996c51e..3c68407d0 100644
--- a/in_data_files/Par_file
+++ b/in_data_files/Par_file
@@ -33,6 +33,7 @@ TOPOGRAPHY                      = .false.
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .false.
@@ -65,4 +66,6 @@ NTSTEP_BETWEEN_READ_ADJSRC      = 0
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.
 
diff --git a/src/cuda/check_fields_cuda.cu b/src/cuda/check_fields_cuda.cu
new file mode 100644
index 000000000..ba747d09c
--- /dev/null
+++ b/src/cuda/check_fields_cuda.cu
@@ -0,0 +1,566 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Check functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_displ_gpu,
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
+
+TRACE("check_max_norm_displ_gpu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(displ[i]));
+  }
+  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_vector,
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
+
+TRACE("check_max_norm_vector");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  realw maxnorm=0;
+  int maxloc;
+  for(int i=0;i<*size;i++) {
+    if(maxnorm<fabsf(vector1[i])) {
+      maxnorm = vector1[i];
+      maxloc = i;
+    }
+  }
+  printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_displ,
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
+
+TRACE("check_max_norm_displ");
+
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(displ[i]));
+  }
+  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_displ_gpu,
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
+
+TRACE("check_max_norm_b_displ_gpu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  realw* b_accel = (realw*)malloc(*size*sizeof(realw));
+
+  cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm=0;
+  realw maxnorm_accel=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
+    maxnorm_accel = MAX(maxnorm,fabsf(b_accel[i]));
+  }
+  free(b_accel);
+  printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
+  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_accel_gpu,
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
+
+TRACE("check_max_norm_b_accel_gpu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
+  }
+  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_veloc_gpu,
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
+
+TRACE("check_max_norm_b_veloc_gpu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_veloc[i]));
+  }
+  printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_displ,
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
+
+TRACE("check_max_norm_b_displ");
+
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
+  }
+  printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_accel,
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
+
+TRACE("check_max_norm_b_accel");
+
+  realw maxnorm=0;
+
+  for(int i=0;i<*size;i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
+  }
+  printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_error_vectors,
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
+
+TRACE("check_error_vectors");
+
+  int size = *sizef;
+
+  double diff2 = 0;
+  double sum = 0;
+  double temp;
+  double maxerr=0;
+  int maxerrorloc;
+
+  for(int i=0;i<size;++i) {
+    temp = vector1[i]-vector2[i];
+    diff2 += temp*temp;
+    sum += vector1[i]*vector1[i];
+    if(maxerr < fabsf(temp)) {
+      maxerr = abs(temp);
+      maxerrorloc = i;
+    }
+  }
+
+  printf("rel error = %f, maxerr = %e @ %d\n",diff2/sum,maxerr,maxerrorloc);
+  int myrank;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+  myrank = 0;
+#endif
+  if(myrank == 0) {
+    for(int i=maxerrorloc;i>maxerrorloc-5;i--) {
+      printf("[%d]: %e vs. %e\n",i,vector1[i],vector2[i]);
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
+
+TRACE("get_max_accel");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  int size = *sizef;
+  int it = *itf;
+  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  realw maxval=0;
+  for(int i=0;i<size;++i) {
+    maxval = MAX(maxval,accel_cpy[i]);
+  }
+  printf("%d/%d: max=%e\n",it,procid,maxval);
+  free(accel_cpy);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
+
+  /* simplest version: uses only 1 thread
+   realw max;
+   max = 0;
+   // finds maximum value in array
+   if( size > 0 ){
+   max = abs(array[0]);
+   for( int i=1; i < size; i++){
+   if( abs(array[i]) > max ) max = abs(array[i]);
+   }
+   }
+   *d_max = max;
+   */
+
+  // reduction example:
+  __shared__ realw sdata[256] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // loads absolute values into shared memory
+  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+                                                  long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE) {
+
+TRACE("get_norm_acoustic_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  realw max;
+  realw *d_max;
+
+  max = 0;
+
+  /* way 1 : timing Elapsed time: 8.464813e-03
+   realw* h_array;
+   h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
+
+   print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
+   sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
+
+   // finds maximum value in array
+   max = h_array[0];
+   for( int i=1; i < mp->NGLOB_AB; i++){
+   if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+   }
+   free(h_array);
+   */
+
+  /* way 2: timing Elapsed time: 8.818102e-02
+   // launch simple kernel
+   cudaMalloc((void**)&d_max,sizeof(realw));
+
+   dim3 grid(1,1);
+   dim3 threads(1,1,1);
+
+   get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+   mp->NGLOB_AB,
+   d_max);
+   print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+   cudaFree(d_max);
+   */
+
+  // way 2 b: timing Elapsed time: 1.236916e-03
+  // launch simple reduction kernel
+  realw* h_max;
+  int blocksize = 256;
+
+  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+  //printf("num_blocks_x %i \n",num_blocks_x);
+
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid(num_blocks_x,1);
+  dim3 threads(blocksize,1,1);
+
+  if(*SIMULATION_TYPE == 1 ){
+    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                         mp->NGLOB_AB,
+                                         d_max);
+  }
+
+  if(*SIMULATION_TYPE == 3 ){
+    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                         mp->NGLOB_AB,
+                                         d_max);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+
+  // determines max for all blocks
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+
+  cudaFree(d_max);
+  free(h_max);
+
+  /* way 3: doesn't work properly...
+   cublasStatus status;
+
+   // Initialize CUBLAS
+   status = cublasInit();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! CUBLAS initialization error\n");
+   exit(1);
+   }
+
+   // cublas function: cublasIsamax
+   //       finds the smallest index of the maximum magnitude element of single 
+   //      precision vector x
+   int incr = 1;
+   int imax = 0;
+   imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
+   status= cublasGetError();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
+   exit(1);
+   }
+
+   print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
+                      sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+   printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
+
+   // Shutdown
+   status = cublasShutdown();
+   if (status != CUBLAS_STATUS_SUCCESS) {
+   fprintf (stderr, "!!!! shutdown error (A)\n");
+   exit(1);
+   }
+
+   */
+
+  // return result
+  *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("after get_norm_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
+
+  // reduction example:
+  __shared__ realw sdata[256] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // loads values into shared memory: assume array is a vector array
+  sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
+                                 + array[i*3+1]*array[i*3+1]
+                                 + array[i*3+2]*array[i*3+2]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                                 long* Mesh_pointer_f,
+                                                 int* SIMULATION_TYPE) {
+
+  TRACE("get_norm_elastic_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  realw max;
+  realw *d_max;
+
+  max = 0;
+
+  // launch simple reduction kernel
+  realw* h_max;
+  int blocksize = 256;
+
+  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+  //printf("num_blocks_x %i \n",num_blocks_x);
+
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid(num_blocks_x,1);
+  dim3 threads(blocksize,1,1);
+
+  if(*SIMULATION_TYPE == 1 ){
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,
+                                                mp->NGLOB_AB,
+                                                d_max);
+  }
+
+  if(*SIMULATION_TYPE == 3 ){
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,
+                                                mp->NGLOB_AB,
+                                                d_max);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+
+  // determines max for all blocks
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+
+  cudaFree(d_max);
+  free(h_max);
+
+  // return result
+  *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("after get_norm_elastic_from_device");
+#endif
+}
+
+
diff --git a/src/cuda/compute_add_sources_acoustic_cuda.cu b/src/cuda/compute_add_sources_acoustic_cuda.cu
new file mode 100644
index 000000000..e91f6646a
--- /dev/null
+++ b/src/cuda/compute_add_sources_acoustic_cuda.cu
@@ -0,0 +1,370 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner,
+                                                    realw* sourcearrays,
+                                                    double* stf_pre_compute,
+                                                    int myrank,
+                                                    int* islice_selected_source,
+                                                    int* ispec_selected_source,
+                                                    int* ispec_is_acoustic,
+                                                    realw* kappastore,
+                                                    int NSOURCES) {
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+
+  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+  int ispec;
+  int iglob;
+  realw stf;
+  realw kappal;
+
+  if( isource < NSOURCES ){
+
+    if(myrank == islice_selected_source[isource]) {
+
+      ispec = ispec_selected_source[isource]-1;
+
+      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+
+        stf = (realw) stf_pre_compute[isource];
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+        kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        atomicAdd(&potential_dot_dot_acoustic[iglob],
+                  -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
+
+        //      potential_dot_dot_acoustic[iglob] +=
+        //                -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* NSOURCESf,
+                                                 int* SIMULATION_TYPEf,
+                                                 double* h_stf_pre_compute,
+                                                 int* myrankf) {
+
+TRACE("compute_add_sources_ac_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nsources_local == 0 ) return;
+
+  int phase_is_inner = *phase_is_innerf;
+  int NSOURCES = *NSOURCESf;
+  int myrank = *myrankf;
+
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // copies pre-computed source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        phase_is_inner,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        myrank,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
+                                                      int* phase_is_innerf,
+                                                      int* NSOURCESf,
+                                                      int* SIMULATION_TYPEf,
+                                                      double* h_stf_pre_compute,
+                                                      int* myrankf) {
+
+TRACE("compute_add_sources_ac_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nsources_local == 0 ) return;
+
+  int phase_is_inner = *phase_is_innerf;
+  int NSOURCES = *NSOURCESf;
+  int myrank = *myrankf;
+
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // copies source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        phase_is_inner,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        myrank,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic adjoint sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
+                                                      int nrec,
+                                                      realw* adj_sourcearrays,
+                                                      int* ibool,
+                                                      int* ispec_is_inner,
+                                                      int* ispec_is_acoustic,
+                                                      int* ispec_selected_rec,
+                                                      int phase_is_inner,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local,
+                                                      realw* kappastore) {
+
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // because of grid shape, irec_local can be too big
+  if(irec_local < nadj_rec_local) {
+
+    int irec = pre_computed_irec[irec_local];
+
+    int ispec = ispec_selected_rec[irec]-1;
+    if( ispec_is_acoustic[ispec] ){
+
+      // checks if element is in phase_is_inner run
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+        //                                            pre_computed_irec_local_index[irec],
+        //                                            pre_computed_index,
+        //                                            0,
+        //                                            i,j,k)]/kappal;
+
+        // beware, for acoustic medium, a pressure source would be taking the negative
+        // and divide by Kappa of the fluid;
+        // this would have to be done when constructing the adjoint source.
+        //
+        // note: we take the first component of the adj_sourcearrays
+        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
+        realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,irec_local)]; // / kappal
+
+        atomicAdd(&potential_dot_dot_acoustic[iglob],stf);
+
+                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
+                  //                         0,i,j,k)] // / kappal
+                  //                         );
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+TRACE("add_sources_ac_sim_2_or_3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+
+      // takes only acoustic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_acoustic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            0,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)]
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            1,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)]
+                  = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,(*time_index)-1,
+                                            2,i,j,k)];
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_acoustic
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                              (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+
+  // launches cuda kernel for acoustic adjoint sources
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_acoustic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local,
+                                                          mp->d_kappastore);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
diff --git a/src/cuda/compute_add_sources_elastic_cuda.cu b/src/cuda/compute_add_sources_elastic_cuda.cu
new file mode 100644
index 000000000..056b6a07c
--- /dev/null
+++ b/src/cuda/compute_add_sources_elastic_cuda.cu
@@ -0,0 +1,422 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic domain sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_add_sources_kernel(realw* accel,
+                                           int* ibool,
+                                           int* ispec_is_inner,
+                                           int phase_is_inner,
+                                           realw* sourcearrays,
+                                           double* stf_pre_compute,
+                                           int myrank,
+                                           int* islice_selected_source,
+                                           int* ispec_selected_source,
+                                           int* ispec_is_elastic,
+                                           int NSOURCES) {
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+
+  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
+  int ispec;
+  int iglob;
+  realw stf;
+
+  if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
+    if(myrank == islice_selected_source[isource]) {
+
+      ispec = ispec_selected_source[isource]-1;
+
+      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+        stf = (realw) stf_pre_compute[isource];
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        atomicAdd(&accel[iglob*3],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
+        atomicAdd(&accel[iglob*3+1],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
+        atomicAdd(&accel[iglob*3+2],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
+      }
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* NSOURCESf,
+                                            double* h_stf_pre_compute,
+                                            int* myrankf) {
+
+TRACE("compute_add_sources_el_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nsources_local == 0 ) return;
+
+  int phase_is_inner = *phase_is_innerf;
+  int NSOURCES = *NSOURCESf;
+  int myrank = *myrankf;
+
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  //double* d_stf_pre_compute;
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
+                                               mp->d_ibool,
+                                               mp->d_ispec_is_inner,
+                                               phase_is_inner,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               myrank,
+                                               mp->d_islice_selected_source,
+                                               mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {
+  TRACE("compute_add_sources_el_s3_cuda");
+  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int NSOURCES = *NSOURCESf;
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_ibool,
+                                               mp->d_ispec_is_inner, *phase_is_inner,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrank,
+                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+                                                        int* ispec_selected_rec,
+                                                        int irec_master_noise,
+                                                        realw* accel,
+                                                        realw* noise_sourcearray,
+                                                        int it) {
+  int tx = threadIdx.x;
+  int iglob = ibool[tx + NGLL3*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
+
+  // not sure if we need atomic operations but just in case...
+  // accel[3*iglob] += noise_sourcearray[3*tx + 3*125*it];
+  // accel[1+3*iglob] += noise_sourcearray[1+3*tx + 3*125*it];
+  // accel[2+3*iglob] += noise_sourcearray[2+3*tx + 3*125*it];
+
+  atomicAdd(&accel[iglob*3],noise_sourcearray[3*tx + 3*NGLL3*it]);
+  atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*NGLL3*it]);
+  atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*NGLL3*it]);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
+                                                int* islice_selected_rec) {
+
+TRACE("add_source_master_rec_noise_cu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  int it = *it_f-1; // -1 for Fortran -> C indexing differences
+  int irec_master_noise = *irec_master_noise_f;
+  int myrank = *myrank_f;
+
+  dim3 grid(1,1,1);
+  dim3 threads(NGLL3,1,1);
+
+  if(myrank == islice_selected_rec[irec_master_noise-1]) {
+    add_source_master_rec_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool,
+                                                              mp->d_ispec_selected_rec,
+                                                              irec_master_noise,
+                                                              mp->d_accel,
+                                                              mp->d_noise_sourcearray,
+                                                              it);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
+#endif
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ADJOINT sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
+                                                     int nrec,
+                                                     realw* adj_sourcearrays,
+                                                     int* ibool,
+                                                     int* ispec_is_inner,
+                                                     int* ispec_is_elastic,
+                                                     int* ispec_selected_rec,
+                                                     int phase_is_inner,
+                                                     int* pre_computed_irec,
+                                                     int nadj_rec_local) {
+
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
+    int irec = pre_computed_irec[irec_local];
+
+    int ispec = ispec_selected_rec[irec]-1;
+    if( ispec_is_elastic[ispec] ){
+
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        // atomic operations are absolutely necessary for correctness!
+        atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                              i,j,k,
+                                                              0,
+                                                              irec_local)]);
+
+        atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
+                                                             i,j,k,
+                                                             1,
+                                                             irec_local)]);
+
+        atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                            i,j,k,
+                                                            2,
+                                                            irec_local)]);
+      }
+    } // ispec_is_elastic
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+TRACE("add_sources_el_sim_type_2_or_3");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+
+      // takes only elastic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_elastic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                i,j,k,0,
+                                                irec_local-1)]
+                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
+                                                    3,5,5,
+                                                    irec_local-1,
+                                                    *time_index-1,
+                                                    0,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                i,j,k,1,
+                                                irec_local-1)]
+                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
+                                                    3,5,5,
+                                                    irec_local-1,
+                                                    *time_index-1,
+                                                    1,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                i,j,k,2,
+                                                irec_local-1)]
+                        = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                                    *NTSTEP_BETWEEN_READ_ADJSRC,
+                                                    3,5,5,
+                                                    irec_local-1,
+                                                    *time_index-1,
+                                                    2,i,j,k)];
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_elastic
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU
+  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+             (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+
+
+  // the irec_local variable needs to be precomputed (as
+  // h_pre_comp..), because normally it is in the loop updating accel,
+  // and due to how it's incremented, it cannot be parallelized
+
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
+                                                         *nrec,
+                                                         mp->d_adj_sourcearrays,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         mp->d_ispec_is_elastic,
+                                                         mp->d_ispec_selected_rec,
+                                                         *phase_is_inner,
+                                                         mp->d_pre_computed_irec,
+                                                         mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
+
diff --git a/src/cuda/compute_coupling_cuda.cu b/src/cuda/compute_coupling_cuda.cu
new file mode 100644
index 000000000..bdf7c798e
--- /dev/null
+++ b/src/cuda/compute_coupling_cuda.cu
@@ -0,0 +1,345 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC - ELASTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_acoustic_el_kernel(realw* displ,
+                                                    realw* potential_dot_dot_acoustic,
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner) {
+
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+  realw displ_x,displ_y,displ_z,displ_n;
+  realw nx,ny,nz;
+  realw jacobianw;
+
+  if( iface < num_coupling_ac_el_faces){
+
+    // don't compute points outside NGLLSQUARE==NGLL2==25
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface] - 1;
+
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+      // elastic displacement on global point
+      displ_x = displ[iglob*3] ; // (1,iglob)
+      displ_y = displ[iglob*3+1] ; // (2,iglob)
+      displ_z = displ[iglob*3+2] ; // (3,iglob)
+
+      // gets associated normal on GLL point
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+      // calculates displacement component along normal
+      // (normal points outwards of acoustic element)
+      displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
+
+      // gets associated, weighted jacobian
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // continuity of pressure and normal displacement on global point
+
+      // note: Newmark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the updated displacement at time step [t+delta_t],
+      //          which is done at the very beginning of the time loop
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it also means you have to calculate and update this here first before
+      //          calculating the coupling on the elastic side for the acceleration...
+      atomicAdd(&potential_dot_dot_acoustic[iglob],+ jacobianw*displ_n);
+
+    }
+  //  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_coupling_ac_el_cuda,
+              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_ac_el_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+
+  // way 1: exact blocksize to match NGLLSQUARE
+  int blocksize = NGLL2;
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // launches GPU kernel
+  compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
+                                                       mp->d_potential_dot_dot_acoustic,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec,
+                                                       mp->d_coupling_ac_el_ijk,
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw,
+                                                       mp->d_ibool,
+                                                       mp->d_ispec_is_inner,
+                                                       phase_is_inner);
+
+  //  adjoint simulations
+  if (SIMULATION_TYPE == 3 ){
+    compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_b_displ,
+                                                          mp->d_b_potential_dot_dot_acoustic,
+                                                          num_coupling_ac_el_faces,
+                                                          mp->d_coupling_ac_el_ispec,
+                                                          mp->d_coupling_ac_el_ijk,
+                                                          mp->d_coupling_ac_el_normal,
+                                                          mp->d_coupling_ac_el_jacobian2Dw,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          phase_is_inner);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_acoustic_el_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC - ACOUSTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_elastic_ac_kernel(realw* potential_dot_dot_acoustic,
+                                                    realw* accel,
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner,
+                                                    int gravity,
+                                                    realw* minus_g,
+                                                    realw* rhostore,
+                                                    realw* displ) {
+
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+  realw pressure;
+  realw nx,ny,nz;
+  realw jacobianw;
+  realw rhol;
+
+  if( iface < num_coupling_ac_el_faces){
+
+    // don't compute points outside NGLLSQUARE==NGLL2==25
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface] - 1;
+
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+      // gets associated normal on GLL point
+      // note: normal points away from acoustic element
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+      // gets associated, weighted jacobian
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // acoustic pressure on global point
+      if( gravity ){
+        // takes density (from acoustic? element)
+        rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+        // note: uses potential chi such that displacement s = grad(chi),
+        //         pressure becomes: p = - kappa ( div( s ) ) = rho ( - dot_dot_chi + g * s )
+        //  g only acting in negative z-direction
+
+        // daniel: TODO - check gravity and coupling would be displ * nz  correct?
+        pressure = rhol*( - potential_dot_dot_acoustic[iglob]
+                         + minus_g[iglob] * displ[iglob*3+2] );
+
+        //daniel: TODO - check gravity and coupling
+        //pressure = - potential_dot_dot_acoustic[iglob] ;
+        //if( iface == 128 && igll == 5 ){
+        //  printf("coupling acoustic: %f %f \n",potential_dot_dot_acoustic[iglob],
+        //             minus_g[iglob] * displ[iglob*3+2]);
+        //}
+
+      }else{
+        // no gravity: uses potential chi such that displacement s = 1/rho grad(chi)
+        //                  pressure p = - kappa ( div( s ) ) then becomes: p = - dot_dot_chi
+        //                  ( multiplied with factor 1/kappa due to setup of equation of motion )
+        pressure = - potential_dot_dot_acoustic[iglob];
+      }
+
+      // continuity of displacement and pressure on global point
+      //
+      // note: Newmark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
+      //          pressure at time step [t + delta_t]
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it means you have to calculate and update the acoustic pressure first before
+      //          calculating this term...
+      atomicAdd(&accel[iglob*3],+ jacobianw*nx*pressure);
+      atomicAdd(&accel[iglob*3+1],+ jacobianw*ny*pressure);
+      atomicAdd(&accel[iglob*3+2],+ jacobianw*nz*pressure);
+    }
+    //  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_coupling_el_ac_cuda,
+              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_el_ac_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+
+  // way 1: exact blocksize to match NGLLSQUARE
+  int blocksize = 25;
+
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // launches GPU kernel
+  compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                       mp->d_accel,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec,
+                                                       mp->d_coupling_ac_el_ijk,
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw,
+                                                       mp->d_ibool,
+                                                       mp->d_ispec_is_inner,
+                                                       phase_is_inner,
+                                                       mp->gravity,
+                                                       mp->d_minus_g,
+                                                       mp->d_rhostore,
+                                                       mp->d_displ);
+
+  //  adjoint simulations
+  if (SIMULATION_TYPE == 3 ){
+    compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->d_b_accel,
+                                                         num_coupling_ac_el_faces,
+                                                         mp->d_coupling_ac_el_ispec,
+                                                         mp->d_coupling_ac_el_ijk,
+                                                         mp->d_coupling_ac_el_normal,
+                                                         mp->d_coupling_ac_el_jacobian2Dw,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         phase_is_inner,
+                                                         mp->gravity,
+                                                         mp->d_minus_g,
+                                                         mp->d_rhostore,
+                                                         mp->d_b_displ);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_el_ac_cuda");
+#endif
+}
diff --git a/src/cuda/compute_forces_acoustic_cuda.cu b/src/cuda/compute_forces_acoustic_cuda.cu
new file mode 100644
index 000000000..e07cf712a
--- /dev/null
+++ b/src/cuda/compute_forces_acoustic_cuda.cu
@@ -0,0 +1,966 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares a device array with with all inter-element edge-nodes -- this
+// is followed by a memcpy and MPI operations
+__global__ void prepare_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                     realw* d_send_potential_dot_dot_buffer,
+                                                     int num_interfaces_ext_mesh,
+                                                     int max_nibool_interfaces_ext_mesh,
+                                                     int* d_nibool_interfaces_ext_mesh,
+                                                     int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+      d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)] =
+        d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+extern "C"
+void FC_FUNC_(transfer_boun_pot_from_device,
+              TRANSFER_BOUN_POT_FROM_DEVICE)(
+                                              int* size,
+                                              long* Mesh_pointer_f,
+                                              realw* potential_dot_dot_acoustic,
+                                              realw* send_potential_dot_dot_buffer,
+                                              int* num_interfaces_ext_mesh,
+                                              int* max_nibool_interfaces_ext_mesh,
+                                              int* nibool_interfaces_ext_mesh,
+                                              int* ibool_interfaces_ext_mesh,
+                                              int* FORWARD_OR_ADJOINT){
+
+TRACE("transfer_boun_pot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  if( *num_interfaces_ext_mesh == 0 ) return;
+
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(*FORWARD_OR_ADJOINT == 1) {
+    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                         mp->d_send_potential_dot_dot_buffer,
+                                                         mp->num_interfaces_ext_mesh,
+                                                         mp->max_nibool_interfaces_ext_mesh,
+                                                         mp->d_nibool_interfaces_ext_mesh,
+                                                         mp->d_ibool_interfaces_ext_mesh);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) {
+    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                           mp->d_send_potential_dot_dot_buffer,
+                                                           mp->num_interfaces_ext_mesh,
+                                                           mp->max_nibool_interfaces_ext_mesh,
+                                                           mp->d_nibool_interfaces_ext_mesh,
+                                                           mp->d_ibool_interfaces_ext_mesh);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
+      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000);
+
+  // finish timing of kernel+memcpy
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("boundary xfer d->h Time: %f ms\n",time);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_boundary_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void assemble_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                      realw* d_send_potential_dot_dot_buffer,
+                                                      int num_interfaces_ext_mesh,
+                                                      int max_nibool_interfaces_ext_mesh,
+                                                      int* d_nibool_interfaces_ext_mesh,
+                                                      int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+
+      // for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms)
+      // d_potential_dot_dot_acoustic[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
+      // d_send_potential_dot_dot_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
+
+      atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
+                d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+    }
+  }
+  // ! This step is done via previous function transfer_and_assemble...
+  // ! do iinterface = 1, num_interfaces_ext_mesh
+  // !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+  // !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+  // !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+  // !   enddo
+  // ! enddo
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_asmbl_pot_to_device,
+              TRANSFER_ASMBL_POT_TO_DEVICE)(
+                                                long* Mesh_pointer,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* buffer_recv_scalar_ext_mesh,
+                                                int* num_interfaces_ext_mesh,
+                                                int* max_nibool_interfaces_ext_mesh,
+                                                int* nibool_interfaces_ext_mesh,
+                                                int* ibool_interfaces_ext_mesh,
+                                                int* FORWARD_OR_ADJOINT) {
+
+TRACE("transfer_asmbl_pot_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  //double start_time = get_time();
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  // copies buffer onto GPU
+  cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
+             (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
+
+  // assembles on GPU
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(*FORWARD_OR_ADJOINT == 1) {
+    //assemble forward field
+    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          mp->d_send_potential_dot_dot_buffer,
+                                                          mp->num_interfaces_ext_mesh,
+                                                          mp->max_nibool_interfaces_ext_mesh,
+                                                          mp->d_nibool_interfaces_ext_mesh,
+                                                          mp->d_ibool_interfaces_ext_mesh);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) {
+    //assemble reconstructed/backward field
+    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                            mp->d_send_potential_dot_dot_buffer,
+                                                            mp->num_interfaces_ext_mesh,
+                                                            mp->max_nibool_interfaces_ext_mesh,
+                                                            mp->d_nibool_interfaces_ext_mesh,
+                                                            mp->d_ibool_interfaces_ext_mesh);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_asmbl_pot_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* KERNEL 2 */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
+                                       int NGLOB, int* d_ibool,
+                                       int* d_phase_ispec_inner_acoustic,
+                                       int num_phase_ispec_acoustic,
+                                       int d_iphase,
+                                       int use_mesh_coloring_gpu,
+                                       realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
+                                       realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                       realw* d_etax, realw* d_etay, realw* d_etaz,
+                                       realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                       realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+                                       realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
+                                       realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+                                       realw* d_rhostore,
+                                       int gravity,
+                                       realw* minus_g,
+                                       realw* d_kappastore,
+                                       realw* wgll_cube){
+
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  int tx = threadIdx.x;
+
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+  reald temp1l,temp2l,temp3l;
+  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  reald dpotentialdxl,dpotentialdyl,dpotentialdzl;
+  reald fac1,fac2,fac3;
+  reald rho_invl,kappa_invl;
+  reald sum_terms;
+  reald gravity_term;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+  int l;
+  int offset1,offset2,offset3;
+  realw hp1,hp2,hp3;
+#endif
+
+  __shared__ reald s_dummy_loc[NGLL3];
+
+  __shared__ reald s_temp1[NGLL3];
+  __shared__ reald s_temp2[NGLL3];
+  __shared__ reald s_temp3[NGLL3];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+  if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+    working_element = bx;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+      working_element = bx;
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1;
+    }
+#endif
+
+    // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+    s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
+#else
+    // changing iglob indexing to match fortran row changes fast style
+    s_dummy_loc[tx] = d_potential_acoustic[iglob];
+#endif
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+
+  if (active) {
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//      if(iglob == 0 )printf("kernel 2: iglob %i  hprime_xx %f %f %f \n",iglob,hprime_xx[0],hprime_xx[1],hprime_xx[2]);
+#endif
+
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    temp1l = 0.f;
+    temp2l = 0.f;
+    temp3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+        hp1 = hprime_xx[l*NGLLX+I];
+        offset1 = K*NGLL2+J*NGLLX+l;
+        temp1l += s_dummy_loc[offset1]*hp1;
+
+        //no more assumes that hprime_xx = hprime_yy = hprime_zz
+        hp2 = hprime_yy[l*NGLLX+J];
+        offset2 = K*NGLL2+l*NGLLX+I;
+        temp2l += s_dummy_loc[offset2]*hp2;
+
+        hp3 = hprime_zz[l*NGLLX+K];
+        offset3 = l*NGLL2+J*NGLLX+I;
+        temp3l += s_dummy_loc[offset3]*hp3;
+    }
+#else
+
+    temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
+
+    temp2l = s_dummy_loc[K*NGLL2+I]*hprime_yy[J]
+            + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_yy[NGLLX+J]
+            + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_yy[2*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_yy[3*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_yy[4*NGLLX+J];
+
+    temp3l = s_dummy_loc[J*NGLLX+I]*hprime_zz[K]
+            + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_zz[NGLLX+K]
+            + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_zz[2*NGLLX+K]
+            + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_zz[3*NGLLX+K]
+            + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_zz[4*NGLLX+K];
+
+#endif
+
+    // compute derivatives of ux, uy and uz with respect to x, y and z
+    offset = working_element*NGLL3_ALIGN + tx;
+
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
+
+    jacobianl = 1.f / (xixl*(etayl*gammazl-etazl*gammayl)
+                      -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                      +xizl*(etaxl*gammayl-etayl*gammaxl));
+
+    // derivatives of potential
+    dpotentialdxl = xixl*temp1l + etaxl*temp2l + gammaxl*temp3l;
+    dpotentialdyl = xiyl*temp1l + etayl*temp2l + gammayl*temp3l;
+    dpotentialdzl = xizl*temp1l + etazl*temp2l + gammazl*temp3l;
+
+    // pre-computes gravity sum term
+    if( gravity ){
+      // uses potential definition: s = grad(chi)
+
+      // gravity term: 1/kappa grad(chi) * g
+      // assumes that g only acts in (negative) z-direction
+      kappa_invl = 1.f / d_kappastore[working_element*NGLL3 + tx];
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+      // daniel: TODO - check gravity
+//      if( kappa_invl <= 0.0f ){
+//        printf("kappa error: %f %f\n",kappa_invl,d_kappastore[working_element*NGLL3 + tx]);
+//        printf("kappa error: thread %d %d \n",tx,working_element);
+//        asm("trap;");
+//      }
+//      if( iglob <= 0 ){
+//        printf("iglob error: %d %d %d \n",iglob,tx,working_element);
+//        asm("trap;");
+//      }
+
+      gravity_term = minus_g[iglob] * kappa_invl * jacobianl * wgll_cube[tx] * dpotentialdzl;
+
+      // daniel: TODO - check gravity
+      //gravity_term = 0.f;
+      //if( iglob == 5 ){
+      //  printf("iglob infos: %f %f %f %f %f \n",minus_g[iglob],kappa_invl,jacobianl,wgll_cube[tx],dpotentialdzl);
+      //}
+    }
+
+    // density (reciproc)
+    rho_invl = 1.f / d_rhostore[offset];
+
+    // form the dot product with the test vector
+    s_temp1[tx] = jacobianl * rho_invl * (dpotentialdxl*xixl + dpotentialdyl*xiyl + dpotentialdzl*xizl);
+    s_temp2[tx] = jacobianl * rho_invl * (dpotentialdxl*etaxl + dpotentialdyl*etayl + dpotentialdzl*etazl);
+    s_temp3[tx] = jacobianl * rho_invl * (dpotentialdxl*gammaxl + dpotentialdyl*gammayl + dpotentialdzl*gammazl);
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    temp1l = 0.f;
+    temp2l = 0.f;
+    temp3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+        fac1 = hprimewgll_xx[I*NGLLX+l];
+        offset1 = K*NGLL2+J*NGLLX+l;
+        temp1l += s_temp1[offset1]*fac1;
+
+        //no more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
+        fac2 = hprimewgll_yy[J*NGLLX+l];
+        offset2 = K*NGLL2+l*NGLLX+I;
+        temp2l += s_temp2[offset2]*fac2;
+
+        fac3 = hprimewgll_zz[K*NGLLX+l];
+        offset3 = l*NGLL2+J*NGLLX+I;
+        temp3l += s_temp3[offset3]*fac3;
+    }
+#else
+
+    temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
+            + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
+            + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
+            + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
+            + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
+
+
+    temp2l = s_temp2[K*NGLL2+I]*hprimewgll_yy[J*NGLLX]
+            + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_yy[J*NGLLX+1]
+            + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_yy[J*NGLLX+2]
+            + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_yy[J*NGLLX+3]
+            + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_yy[J*NGLLX+4];
+
+
+    temp3l = s_temp3[J*NGLLX+I]*hprimewgll_zz[K*NGLLX]
+            + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+1]
+            + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+2]
+            + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+3]
+            + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+4];
+
+
+#endif
+
+    fac1 = wgllwgll_yz[K*NGLLX+J];
+    fac2 = wgllwgll_xz[K*NGLLX+I];
+    fac3 = wgllwgll_xy[J*NGLLX+I];
+
+    sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
+    if( gravity ) sum_terms += gravity_term;
+
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+    d_potential_dot_dot_acoustic[iglob] = tex1Dfetch(tex_potential_dot_dot_acoustic, iglob)
+                                            + sum_terms;
+#else
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+    d_potential_dot_dot_acoustic[iglob] += sum_terms;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
+      // no atomic operation needed, colors don't share global points between elements
+      d_potential_dot_dot_acoustic[iglob] += sum_terms;
+
+    }else{
+
+      atomicAdd(&d_potential_dot_dot_acoustic[iglob],sum_terms);
+
+    }
+#endif
+
+#endif
+  }
+
+#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+  d_potential_dot_dot_acoustic[iglob] = 123.123f;
+#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
+                       int SIMULATION_TYPE,
+                       int* d_ibool,
+                       realw* d_xix,
+                       realw* d_xiy,
+                       realw* d_xiz,
+                       realw* d_etax,
+                       realw* d_etay,
+                       realw* d_etaz,
+                       realw* d_gammax,
+                       realw* d_gammay,
+                       realw* d_gammaz,
+                       realw* d_rhostore,
+                       realw* d_kappastore)
+{
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before acoustic kernel Kernel 2");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
+  dim3 grid_2(num_blocks_x,num_blocks_y);
+
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                        mp->NGLOB_AB,
+                                                        d_ibool,
+                                                        mp->d_phase_ispec_inner_acoustic,
+                                                        mp->num_phase_ispec_acoustic,
+                                                        d_iphase,
+                                                        mp->use_mesh_coloring_gpu,
+                                                        mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
+                                                        d_xix, d_xiy, d_xiz,
+                                                        d_etax, d_etay, d_etaz,
+                                                        d_gammax, d_gammay, d_gammaz,
+                                                        mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                        mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                        d_rhostore,
+                                                        mp->gravity,
+                                                        mp->d_minus_g,
+                                                        d_kappastore,
+                                                        mp->d_wgll_cube);
+
+  if(SIMULATION_TYPE == 3) {
+    Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                          mp->NGLOB_AB,
+                                                          d_ibool,
+                                                          mp->d_phase_ispec_inner_acoustic,
+                                                          mp->num_phase_ispec_acoustic,
+                                                          d_iphase,
+                                                          mp->use_mesh_coloring_gpu,
+                                                          mp->d_b_potential_acoustic, mp->d_b_potential_dot_dot_acoustic,
+                                                          d_xix, d_xiy, d_xiz,
+                                                          d_etax, d_etay, d_etaz,
+                                                          d_gammax, d_gammay, d_gammaz,
+                                                          mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                          mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                          mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                          d_rhostore,
+                                                          mp->gravity,
+                                                          mp->d_minus_g,
+                                                          d_kappastore,
+                                                          mp->d_wgll_cube);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* TRACE("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
+  exit_on_cuda_error("kernel Kernel_2");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// main compute_forces_acoustic CUDA routine
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_forces_acoustic_cuda,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            int* iphase,
+                                            int* nspec_outer_acoustic,
+                                            int* nspec_inner_acoustic,
+                                            int* SIMULATION_TYPE) {
+
+  TRACE("compute_forces_acoustic_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  if( *iphase == 1 )
+    num_elements = *nspec_outer_acoustic;
+  else
+    num_elements = *nspec_inner_acoustic;
+
+  if( num_elements == 0 ) return;
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         acoustic elements also start with outer than inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    int color_offset,color_offset_nonpadded;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_acoustic;
+      istart = 0;
+
+      // array offsets (acoustic elements start after elastic ones)
+      color_offset = mp->nspec_elastic * NGLL3_PADDED;
+      color_offset_nonpadded = mp->nspec_elastic * NGLL3;
+    }else{
+      // inner element colors (start after outer elements)
+      nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
+      istart = mp->num_colors_outer_acoustic;
+
+      // array offsets (inner elements start after outer ones)
+      color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
+      color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
+
+      Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
+                         *SIMULATION_TYPE,
+                         mp->d_ibool + color_offset_nonpadded,
+                         mp->d_xix + color_offset,
+                         mp->d_xiy + color_offset,
+                         mp->d_xiz + color_offset,
+                         mp->d_etax + color_offset,
+                         mp->d_etay + color_offset,
+                         mp->d_etaz + color_offset,
+                         mp->d_gammax + color_offset,
+                         mp->d_gammay + color_offset,
+                         mp->d_gammaz + color_offset,
+                         mp->d_rhostore + color_offset,
+                         mp->d_kappastore + color_offset_nonpadded);
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for no-aligned arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+    Kernel_2_acoustic(num_elements, mp, *iphase,
+                      *SIMULATION_TYPE,
+                      mp->d_ibool,
+                      mp->d_xix,
+                      mp->d_xiy,
+                      mp->d_xiz,
+                      mp->d_etax,
+                      mp->d_etay,
+                      mp->d_etaz,
+                      mp->d_gammax,
+                      mp->d_gammay,
+                      mp->d_gammaz,
+                      mp->d_rhostore,
+                      mp->d_kappastore);
+
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* KERNEL 3 */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic,
+                                                int size,
+                                                realw* rmass_acoustic) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // multiplies pressure with the inverse of the mass matrix
+    potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                int size,
+                                                realw deltatover2,
+                                                realw* rmass_acoustic) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // Newmark time scheme: corrector term
+    potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                             long* Mesh_pointer,
+                             int* size_F,
+                             int* SIMULATION_TYPE) {
+
+TRACE("kernel_3_a_acoustic_cuda");
+
+   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+   int size = *size_F;
+
+   int blocksize = BLOCKSIZE_KERNEL3;
+   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+   int num_blocks_x = size_padded/blocksize;
+   int num_blocks_y = 1;
+   while(num_blocks_x > 65535) {
+     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+     num_blocks_y = num_blocks_y*2;
+   }
+   dim3 grid(num_blocks_x,num_blocks_y);
+   dim3 threads(blocksize,1,1);
+
+   kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                       size,
+                                                       mp->d_rmass_acoustic);
+
+   if(*SIMULATION_TYPE == 3) {
+     kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                         size,
+                                                         mp->d_rmass_acoustic);
+   }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel 3 a");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                                                             long* Mesh_pointer,
+                                                             int* size_F,
+                                                             realw* deltatover2_F,
+                                                             int* SIMULATION_TYPE,
+                                                             realw* b_deltatover2_F) {
+
+TRACE("kernel_3_b_acoustic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  int size = *size_F;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic,
+                                                    mp->d_potential_dot_dot_acoustic,
+                                                    size, deltatover2,
+                                                    mp->d_rmass_acoustic);
+
+  if(*SIMULATION_TYPE == 3) {
+    kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic,
+                                                      mp->d_b_potential_dot_dot_acoustic,
+                                                      size, b_deltatover2,
+                                                      mp->d_rmass_acoustic);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel 3 b");
+#endif
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* KERNEL for enforce free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void enforce_free_surface_cuda_kernel(
+                                       realw* potential_acoustic,
+                                       realw* potential_dot_acoustic,
+                                       realw* potential_dot_dot_acoustic,
+                                       int num_free_surface_faces,
+                                       int* free_surface_ispec,
+                                       int* free_surface_ijk,
+                                       int* ibool,
+                                       int* ispec_is_acoustic) {
+  // gets spectral element face id
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // for all faces on free surface
+  if( iface < num_free_surface_faces ){
+
+    int ispec = free_surface_ispec[iface]-1;
+
+    // checks if element is in acoustic domain
+    if( ispec_is_acoustic[ispec] ){
+
+      // gets global point index
+      int igll = threadIdx.x + threadIdx.y*blockDim.x;
+
+      int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+      int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+      int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+      // sets potentials to zero at free surface
+      potential_acoustic[iglob] = 0;
+      potential_dot_acoustic[iglob] = 0;
+      potential_dot_dot_acoustic[iglob] = 0;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(acoustic_enforce_free_surf_cuda,
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE,
+                                                  int* ABSORB_FREE_SURFACE) {
+
+TRACE("acoustic_enforce_free_surf_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( *ABSORB_FREE_SURFACE == 0 ){
+
+    // does not absorb free surface, thus we enforce the potential to be zero at surface
+
+    // block sizes
+    int num_blocks_x = mp->num_free_surface_faces;
+    int num_blocks_y = 1;
+    while(num_blocks_x > 65535) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
+    dim3 grid(num_blocks_x,num_blocks_y,1);
+    dim3 threads(NGLL2,1,1);
+
+    // sets potentials to zero at free surface
+    enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
+                                                       mp->d_potential_dot_acoustic,
+                                                       mp->d_potential_dot_dot_acoustic,
+                                                       mp->num_free_surface_faces,
+                                                       mp->d_free_surface_ispec,
+                                                       mp->d_free_surface_ijk,
+                                                       mp->d_ibool,
+                                                       mp->d_ispec_is_acoustic);
+    // for backward/reconstructed potentials
+    if(*SIMULATION_TYPE == 3) {
+      enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
+                                                         mp->d_b_potential_dot_acoustic,
+                                                         mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->num_free_surface_faces,
+                                                         mp->d_free_surface_ispec,
+                                                         mp->d_free_surface_ijk,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_acoustic);
+
+    }
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("enforce_free_surface_cuda");
+#endif
+}
+
diff --git a/src/cuda/compute_forces_elastic_cuda.cu b/src/cuda/compute_forces_elastic_cuda.cu
new file mode 100644
index 000000000..6c3e753e5
--- /dev/null
+++ b/src/cuda/compute_forces_elastic_cuda.cu
@@ -0,0 +1,2255 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+//  cuda constant arrays
+__device__ realw d_hprime_xx[NGLL2];
+
+// only needed if NGLLX != NGLLY != NGLLZ
+// __device__ realw d_hprime_yy[NGLL2];
+// __device__ realw d_hprime_zz[NGLL2];
+__device__ realw d_hprimewgll_xx[NGLL2];
+__device__ realw d_hprimewgll_yy[NGLL2];
+__device__ realw d_hprimewgll_zz[NGLL2];
+__device__ realw d_wgllwgll_xy[NGLL2];
+__device__ realw d_wgllwgll_xz[NGLL2];
+__device__ realw d_wgllwgll_yz[NGLL2];
+
+__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+
+//daniel: todo - check if necessary...
+// prototype for the fortran function to do non-blocking mpi send
+//extern "C"
+//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {};
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares a device array with with all inter-element edge-nodes -- this
+// is followed by a memcpy and MPI operations
+
+__global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+                                                 int num_interfaces_ext_mesh,
+                                                 int max_nibool_interfaces_ext_mesh,
+                                                 int* d_nibool_interfaces_ext_mesh,
+                                                 int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  //int iinterface=0;
+
+  for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)] =
+        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1] =
+        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1];
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2] =
+        d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2];
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+// (elements on boundary)
+extern "C"
+void FC_FUNC_(transfer_boun_accel_from_device,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+                                                    realw* send_accel_buffer,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT){
+TRACE("transfer_boun_accel_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  if( *num_interfaces_ext_mesh == 0 ) return;
+
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //timing for memory xfer
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+  if(*FORWARD_OR_ADJOINT == 1) {
+    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+                                                       mp->num_interfaces_ext_mesh,
+                                                       mp->max_nibool_interfaces_ext_mesh,
+                                                       mp->d_nibool_interfaces_ext_mesh,
+                                                       mp->d_ibool_interfaces_ext_mesh);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) {
+    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+                                                       mp->num_interfaces_ext_mesh,
+                                                       mp->max_nibool_interfaces_ext_mesh,
+                                                       mp->d_nibool_interfaces_ext_mesh,
+                                                       mp->d_ibool_interfaces_ext_mesh);
+  }
+
+
+  cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
+             3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw),
+             cudaMemcpyDeviceToHost);
+
+  // finish timing of kernel+memcpy
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("boundary xfer d->h Time: %f ms\n",time);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_boun_accel_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_boundary_from_device_a,
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
+                                               int* nspec_outer_elastic) {
+
+// asynchronous transfer from device to host
+
+  TRACE("transfer_boundary_from_device_a");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+//daniel: todo - check below with this...
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+/*
+//daniel: todo - check originally used...
+  int num_blocks_x = *nspec_outer_elastic;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+*/
+
+  prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+                                                                          mp->num_interfaces_ext_mesh,
+                                                                          mp->max_nibool_interfaces_ext_mesh,
+                                                                          mp->d_nibool_interfaces_ext_mesh,
+                                                                          mp->d_ibool_interfaces_ext_mesh);
+  // wait until kernel is finished before starting async memcpy
+#if CUDA_VERSION >= 4000
+  cudaDeviceSynchronize();
+#else
+  cudaThreadSynchronize();
+#endif
+
+  cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+                  3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+                  cudaMemcpyDeviceToHost,mp->copy_stream);
+  // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+  // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+  // cudaMemcpyDeviceToHost,mp->compute_stream);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+                                                  int num_interfaces_ext_mesh,
+                                                  int max_nibool_interfaces_ext_mesh,
+                                                  int* d_nibool_interfaces_ext_mesh,
+                                                  int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  //int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  //int tx = threadIdx.x;
+  //int iinterface=0;
+
+  for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
+
+      // for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms)
+      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
+      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
+      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1] +=
+      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1];
+      // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2] +=
+      // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2];
+
+
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]);
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]);
+    }
+  }
+  // ! This step is done via previous function transfer_and_assemble...
+  // ! do iinterface = 1, num_interfaces_ext_mesh
+  // !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+  // !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+  // !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+  // !   enddo
+  // ! enddo
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {
+
+// asynchronous transfer from host to device
+
+  TRACE("transfer_boundary_to_device_a");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
+
+  // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+  // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+  // cudaMemcpyHostToDevice,mp->compute_stream);
+  //printf("xfer to device\n");
+  cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                  3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+                  cudaMemcpyHostToDevice,mp->copy_stream);
+
+
+
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+//daniel: not used ...
+//
+//extern "C"
+//void FC_FUNC_(assemble_accel_on_device,
+//              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+//                                              realw* buffer_recv_vector_ext_mesh,
+//                                              int* num_interfaces_ext_mesh,
+//                                              int* max_nibool_interfaces_ext_mesh,
+//                                              int* nibool_interfaces_ext_mesh,
+//                                              int* ibool_interfaces_ext_mesh,
+//                                              int* FORWARD_OR_ADJOINT) {
+//  TRACE("assemble_accel_on_device");
+//
+//  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+//
+//  int blocksize = BLOCKSIZE_TRANSFER;
+//  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+//  int num_blocks_x = size_padded/blocksize;
+//  int num_blocks_y = 1;
+//  while(num_blocks_x > 65535) {
+//    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+//    num_blocks_y = num_blocks_y*2;
+//  }
+//
+//  //double start_time = get_time();
+//  dim3 grid(num_blocks_x,num_blocks_y);
+//  dim3 threads(blocksize,1,1);
+//  // cudaEvent_t start, stop;
+//  // realw time;
+//  // cudaEventCreate(&start);
+//  // cudaEventCreate(&stop);
+//  // cudaEventRecord( start, 0 );
+//
+//
+//  // ***************************************************************************
+//  // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+//  cudaStreamSynchronize(mp->copy_stream);
+//
+//  // Assembling on the copy_stream breaks the solution and it "blows up"
+//  if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+//    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+//                                                                             mp->num_interfaces_ext_mesh,
+//                                                                             mp->max_nibool_interfaces_ext_mesh,
+//                                                                             mp->d_nibool_interfaces_ext_mesh,
+//                                                                             mp->d_ibool_interfaces_ext_mesh);
+//  }
+//  else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+//    assemble_boundary_accel_on_device<<<grid,threads,0,mp->copy_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+//                                                        mp->num_interfaces_ext_mesh,
+//                                                        mp->max_nibool_interfaces_ext_mesh,
+//                                                        mp->d_nibool_interfaces_ext_mesh,
+//                                                        mp->d_ibool_interfaces_ext_mesh);
+//  }
+//
+//  // cudaEventRecord( stop, 0 );
+//  // cudaEventSynchronize( stop );
+//  // cudaEventElapsedTime( &time, start, stop );
+//  // cudaEventDestroy( start );
+//  // cudaEventDestroy( stop );
+//  // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//  //double end_time = get_time();
+//  //printf("Elapsed time: %e\n",end_time-start_time);
+//  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+//#endif
+//}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
+extern "C"
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
+                                                    realw* buffer_recv_vector_ext_mesh,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT) {
+TRACE("transfer_asmbl_accel_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
+  if(*FORWARD_OR_ADJOINT == 1 ){
+    // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+    cudaStreamSynchronize(mp->copy_stream);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3 ){
+    cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+             3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+             cudaMemcpyHostToDevice);
+  }
+
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  //double start_time = get_time();
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+  if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+                                                        mp->num_interfaces_ext_mesh,
+                                                        mp->max_nibool_interfaces_ext_mesh,
+                                                        mp->d_nibool_interfaces_ext_mesh,
+                                                        mp->d_ibool_interfaces_ext_mesh);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+                                                        mp->num_interfaces_ext_mesh,
+                                                        mp->max_nibool_interfaces_ext_mesh,
+                                                        mp->d_nibool_interfaces_ext_mesh,
+                                                        mp->d_ibool_interfaces_ext_mesh);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+//__global__ void Kernel_test(realw* d_debug_output,int* d_phase_ispec_inner_elastic,
+//                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
+//  int bx = blockIdx.x;
+//  int tx = threadIdx.x;
+//  int working_element;
+//  //int ispec;
+//  //int NGLL3_ALIGN = 128;
+//  if(tx==0 && bx==0) {
+//
+//    d_debug_output[0] = 420.0;
+//
+//    d_debug_output[2] = num_phase_ispec_elastic;
+//    d_debug_output[3] = d_iphase;
+//    working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+//    d_debug_output[4] = working_element;
+//    d_debug_output[5] = d_phase_ispec_inner_elastic[0];
+//    /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */
+//  }
+//  /* d_debug_output[1+tx+128*bx] = 69.0; */
+//
+//}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates stress
+
+__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
+                                           realw* R_xx,
+                                           realw* R_yy,
+                                           realw* R_xy,
+                                           realw* R_xz,
+                                           realw* R_yz,
+                                           reald* sigma_xx,
+                                           reald* sigma_yy,
+                                           reald* sigma_zz,
+                                           reald* sigma_xy,
+                                           reald* sigma_xz,
+                                           reald* sigma_yz) {
+
+  int i_sls,offset_sls;
+  reald R_xx_val,R_yy_val;
+
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+    // index
+    offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+
+    R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls)
+    R_yy_val = R_yy[offset_sls];
+
+    *sigma_xx = *sigma_xx - R_xx_val;
+    *sigma_yy = *sigma_yy - R_yy_val;
+    *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
+    *sigma_xy = *sigma_xy - R_xy[offset_sls];
+    *sigma_xz = *sigma_xz - R_xz[offset_sls];
+    *sigma_yz = *sigma_yz - R_yz[offset_sls];
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates R_memory
+
+__device__ void compute_element_att_memory(int tx,int working_element,int NSPEC,
+                                          realw* d_muv,
+                                          realw* factor_common,
+                                          realw* alphaval,realw* betaval,realw* gammaval,
+                                          realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                          realw* epsilondev_xz,realw* epsilondev_yz,
+                                          reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
+                                          reald epsilondev_xz_loc,reald epsilondev_yz_loc
+                                          ){
+
+  int i_sls;
+  int ijk_ispec;
+  int offset_sls,offset_align,offset_common;
+  reald mul;
+  reald alphaval_loc,betaval_loc,gammaval_loc;
+  reald factor_loc,Sn,Snp1;
+
+  // indices
+  offset_align = tx + NGLL3_PADDED * working_element;
+  ijk_ispec = tx + NGLL3 * working_element;
+
+  mul = d_muv[offset_align];
+
+  // use Runge-Kutta scheme to march in time
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+
+    // indices
+    offset_common = i_sls + N_SLS*(tx + NGLL3*working_element); // (i_sls,i,j,k,ispec)
+    offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);   // (i,j,k,ispec,i_sls)
+
+    factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+    alphaval_loc = alphaval[i_sls]; // (i_sls)
+    betaval_loc = betaval[i_sls];
+    gammaval_loc = gammaval[i_sls];
+
+    // term in xx
+    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
+
+    //R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) +
+    //                          betaval_loc * Sn + gammaval_loc * Snp1;
+
+    R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] +
+                       betaval_loc * Sn + gammaval_loc * Snp1;
+
+    // term in yy
+    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yy_loc;
+    R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] +
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in zz not computed since zero trace
+    // term in xy
+    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xy_loc;
+    R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] +
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in xz
+    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xz_loc;
+    R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] +
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in yz
+    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yz_loc;
+    R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] +
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// pre-computes gravity term
+
+__device__ void compute_element_gravity(int tx,int working_element,
+                                        int* d_ibool,
+                                        realw* d_minus_g,
+                                        realw* d_minus_deriv_gravity,
+                                        realw* d_rhostore,
+                                        realw* wgll_cube,
+                                        reald jacobianl,
+                                        reald* s_dummyx_loc,
+                                        reald* s_dummyy_loc,
+                                        reald* s_dummyz_loc,
+                                        reald* sigma_xx,
+                                        reald* sigma_yy,
+                                        reald* sigma_xz,
+                                        reald* sigma_yz,
+                                        reald* rho_s_H1,
+                                        reald* rho_s_H2,
+                                        reald* rho_s_H3){
+
+  int iglob;
+  reald minus_g,minus_dg;
+  reald rhol;
+  reald gzl; // gxl,gyl,
+  reald sx_l,sy_l,sz_l;
+  reald Hxxl,Hyyl,Hzzl; //,Hxyl,Hxzl,Hyzl;
+  reald factor;
+
+  // compute non-symmetric terms for gravity
+
+  // get g, rho and dg/dr=dg
+  iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+  minus_g = d_minus_g[iglob];
+  minus_dg = d_minus_deriv_gravity[iglob];
+
+  // Cartesian components of the gravitational acceleration
+  //gxl = 0.f;
+  //gyl = 0.f;
+  gzl = minus_g;
+
+  // Cartesian components of gradient of gravitational acceleration
+  // H = grad g
+  // assumes g only acts in negative z-direction
+  Hxxl = 0.f;
+  Hyyl = 0.f;
+  Hzzl = minus_dg;
+  //Hxyl = 0.f;
+  //Hxzl = 0.f;
+  //Hyzl = 0.f;
+
+  rhol = d_rhostore[working_element*NGLL3_PADDED + tx];
+
+  // get displacement and multiply by density to compute G tensor
+  // G = rho [ sg - (s * g) I  ]
+  sx_l = rhol * s_dummyx_loc[tx]; // d_displ[iglob*3];
+  sy_l = rhol * s_dummyy_loc[tx]; // d_displ[iglob*3 + 1];
+  sz_l = rhol * s_dummyz_loc[tx]; // d_displ[iglob*3 + 2];
+
+  // compute G tensor from s . g and add to sigma (not symmetric)
+  //sigma_xx += sy_l*gyl + sz_l*gzl;
+  *sigma_xx += sz_l*gzl;
+  //sigma_yy += sx_l*gxl + sz_l*gzl;
+  *sigma_yy += sz_l*gzl;
+  //sigma_zz += sx_l*gxl + sy_l*gyl;
+
+  //sigma_xy -= sx_l*gyl;
+  //sigma_yx -= sy_l*gxl;
+
+  *sigma_xz -= sx_l*gzl;
+  //sigma_zx -= sz_l*gxl;
+
+  *sigma_yz -= sy_l*gzl;
+  //sigma_zy -= sz_l*gyl;
+
+  // precompute vector
+  factor = jacobianl * wgll_cube[tx];
+
+  //rho_s_H1 = fac1 * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
+  //rho_s_H2 = fac1 * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
+  //rho_s_H3 = fac1 * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
+
+  // only non-zero z-direction
+  *rho_s_H1 = factor * sx_l * Hxxl ; // 0.f;
+  *rho_s_H2 = factor * sy_l * Hyyl ; // 0.f;
+  *rho_s_H3 = factor * sz_l * Hzzl ;
+
+  // debug
+  //*rho_s_H1 = 0.f;
+  //*rho_s_H2 = 0.f;
+  //*rho_s_H3 = 0.f ;
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// double precision temporary variables leads to 10% performance
+// decrease in Kernel_2_impl (not very much..)
+//typedef realw reald;
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+#endif
+
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,
+                              int NGLOB,
+                              int* d_ibool,
+                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+                              int d_iphase,
+                              int use_mesh_coloring_gpu,
+                              realw* d_displ, realw* d_accel,
+                              realw* d_xix, realw* d_xiy, realw* d_xiz,
+                              realw* d_etax, realw* d_etay, realw* d_etaz,
+                              realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                              realw* d_kappav, realw* d_muv,
+                              int COMPUTE_AND_STORE_STRAIN,
+                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                              realw* epsilondev_xz,realw* epsilondev_yz,
+                              realw* epsilon_trace_over_3,
+                              int SIMULATION_TYPE,
+                              int ATTENUATION,
+                              int NSPEC,
+                              realw* one_minus_sum_beta,realw* factor_common,
+                              realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                              realw* alphaval,realw* betaval,realw* gammaval,
+                              int ANISOTROPY,
+                              realw* d_c11store,
+                              realw* d_c12store,
+                              realw* d_c13store,
+                              realw* d_c14store,
+                              realw* d_c15store,
+                              realw* d_c16store,
+                              realw* d_c22store,
+                              realw* d_c23store,
+                              realw* d_c24store,
+                              realw* d_c25store,
+                              realw* d_c26store,
+                              realw* d_c33store,
+                              realw* d_c34store,
+                              realw* d_c35store,
+                              realw* d_c36store,
+                              realw* d_c44store,
+                              realw* d_c45store,
+                              realw* d_c46store,
+                              realw* d_c55store,
+                              realw* d_c56store,
+                              realw* d_c66store,
+                              int gravity,
+                              realw* d_minus_g,
+                              realw* d_minus_deriv_gravity,
+                              realw* d_rhostore,
+                              realw* wgll_cube){
+
+  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  /* int bx = blockIdx.x; */
+  int tx = threadIdx.x;
+
+  //const int NGLLX = 5;
+  // const int NGLL2 = 25;
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+
+  reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+  reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
+  reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  reald sum_terms1,sum_terms2,sum_terms3;
+
+  // gravity variables
+  reald sigma_yx,sigma_zx,sigma_zy;
+  reald rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+    int l;
+    realw hp1,hp2,hp3;
+#endif
+
+    __shared__ reald s_dummyx_loc[NGLL3];
+    __shared__ reald s_dummyy_loc[NGLL3];
+    __shared__ reald s_dummyz_loc[NGLL3];
+
+    __shared__ reald s_tempx1[NGLL3];
+    __shared__ reald s_tempx2[NGLL3];
+    __shared__ reald s_tempx3[NGLL3];
+    __shared__ reald s_tempy1[NGLL3];
+    __shared__ reald s_tempy2[NGLL3];
+    __shared__ reald s_tempy3[NGLL3];
+    __shared__ reald s_tempz1[NGLL3];
+    __shared__ reald s_tempz2[NGLL3];
+    __shared__ reald s_tempz3[NGLL3];
+
+    __shared__ reald sh_hprime_xx[NGLL2];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+    active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+    if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+      working_element = bx;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+        working_element = bx;
+      }else{
+        // iphase-1 and working_element-1 for Fortran->C array conventions
+        working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+      }
+#endif
+
+      // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES_FIELDS
+      s_dummyx_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3);
+      s_dummyy_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 1);
+      s_dummyz_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 2);
+#else
+      // changing iglob indexing to match fortran row changes fast style
+      s_dummyx_loc[tx] = d_displ[iglob*3];
+      s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+      s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+    }
+
+    if (tx < NGLL2) {
+      #ifdef USE_TEXTURES_CONSTANTS
+      sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+      #else
+      sh_hprime_xx[tx] = d_hprime_xx[tx];
+      #endif
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempx2l = 0.f;
+      tempx3l = 0.f;
+
+      tempy1l = 0.f;
+      tempy2l = 0.f;
+      tempy3l = 0.f;
+
+      tempz1l = 0.f;
+      tempz2l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+          hp1 = sh_hprime_xx[l*NGLLX+I];
+          offset = K*NGLL2+J*NGLLX+l;
+          tempx1l += s_dummyx_loc[offset]*hp1;
+          tempy1l += s_dummyy_loc[offset]*hp1;
+          tempz1l += s_dummyz_loc[offset]*hp1;
+
+          hp2 = sh_hprime_xx[l*NGLLX+J];
+          offset = K*NGLL2+l*NGLLX+I;
+          tempx2l += s_dummyx_loc[offset]*hp2;
+          tempy2l += s_dummyy_loc[offset]*hp2;
+          tempz2l += s_dummyz_loc[offset]*hp2;
+
+          hp3 = sh_hprime_xx[l*NGLLX+K];
+          offset = l*NGLL2+J*NGLLX+I;
+          tempx3l += s_dummyx_loc[offset]*hp3;
+          tempy3l += s_dummyy_loc[offset]*hp3;
+          tempz3l += s_dummyz_loc[offset]*hp3;
+
+      }
+#else
+
+      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+      offset = working_element*NGLL3_ALIGN + tx;
+
+      xixl = d_xix[offset];
+      xiyl = d_xiy[offset];
+      xizl = d_xiz[offset];
+      etaxl = d_etax[offset];
+      etayl = d_etay[offset];
+      etazl = d_etaz[offset];
+      gammaxl = d_gammax[offset];
+      gammayl = d_gammay[offset];
+      gammazl = d_gammaz[offset];
+
+      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+      // precompute some sums to save CPU time
+      duxdxl_plus_duydyl = duxdxl + duydyl;
+      duxdxl_plus_duzdzl = duxdxl + duzdzl;
+      duydyl_plus_duzdzl = duydyl + duzdzl;
+      duxdyl_plus_duydxl = duxdyl + duydxl;
+      duzdxl_plus_duxdzl = duzdxl + duxdzl;
+      duzdyl_plus_duydzl = duzdyl + duydzl;
+
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+        /*
+        epsilondev_xx[offset] = duxdxl - templ;
+        epsilondev_yy[offset] = duydyl - templ;
+        epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+              */
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+        }
+      }
+
+      // compute elements with an elastic isotropic rheology
+      kappal = d_kappav[offset];
+      mul = d_muv[offset];
+
+      // attenuation
+      if(ATTENUATION){
+        // use unrelaxed parameters if attenuation
+        mul  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+      }
+
+      // full anisotropic case, stress calculations
+      if(ANISOTROPY){
+
+        c11 = d_c11store[offset];
+        c12 = d_c12store[offset];
+        c13 = d_c13store[offset];
+        c14 = d_c14store[offset];
+        c15 = d_c15store[offset];
+        c16 = d_c16store[offset];
+        c22 = d_c22store[offset];
+        c23 = d_c23store[offset];
+        c24 = d_c24store[offset];
+        c25 = d_c25store[offset];
+        c26 = d_c26store[offset];
+        c33 = d_c33store[offset];
+        c34 = d_c34store[offset];
+        c35 = d_c35store[offset];
+        c36 = d_c36store[offset];
+        c44 = d_c44store[offset];
+        c45 = d_c45store[offset];
+        c46 = d_c46store[offset];
+        c55 = d_c55store[offset];
+        c56 = d_c56store[offset];
+        c66 = d_c66store[offset];
+
+        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+                   c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+                   c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+                   c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+                   c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+                   c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+                   c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+      }else{
+
+        // isotropic case
+
+        lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
+        lambdal = lambdalplus2mul - 2.0f * mul;
+
+        // compute the six components of the stress tensor sigma
+        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+        sigma_xy = mul*duxdyl_plus_duydxl;
+        sigma_xz = mul*duzdxl_plus_duxdzl;
+        sigma_yz = mul*duzdyl_plus_duydzl;
+      }
+
+      if(ATTENUATION){
+        // subtracts memory variables if attenuation
+        compute_element_att_stress(tx,working_element,NSPEC,
+                                   R_xx,R_yy,R_xy,R_xz,R_yz,
+                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+      }
+
+      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+
+      // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+      sigma_yx = sigma_xy;
+      sigma_zx = sigma_xz;
+      sigma_zy = sigma_yz;
+
+      if( gravity ){
+        //  computes non-symmetric terms for gravity
+        compute_element_gravity(tx,working_element,d_ibool,d_minus_g,d_minus_deriv_gravity,
+                                d_rhostore,wgll_cube,jacobianl,
+                                s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                                &sigma_xx,&sigma_yy,&sigma_xz,&sigma_yz,
+                                &rho_s_H1,&rho_s_H2,&rho_s_H3);
+      }
+
+      // form dot product with test vector, non-symmetric form
+      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempy1l = 0.f;
+      tempz1l = 0.f;
+
+      tempx2l = 0.f;
+      tempy2l = 0.f;
+      tempz2l = 0.f;
+
+      tempx3l = 0.f;
+      tempy3l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+
+        fac1 = d_hprimewgll_xx[I*NGLLX+l];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l += s_tempx1[offset]*fac1;
+        tempy1l += s_tempy1[offset]*fac1;
+        tempz1l += s_tempz1[offset]*fac1;
+
+        fac2 = d_hprimewgll_yy[J*NGLLX+l];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l += s_tempx2[offset]*fac2;
+        tempy2l += s_tempy2[offset]*fac2;
+        tempz2l += s_tempz2[offset]*fac2;
+
+        fac3 = d_hprimewgll_zz[K*NGLLX+l];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l += s_tempx3[offset]*fac3;
+        tempy3l += s_tempy3[offset]*fac3;
+        tempz3l += s_tempz3[offset]*fac3;
+
+      }
+#else
+
+      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+#endif
+
+      fac1 = d_wgllwgll_yz[K*NGLLX+J];
+      fac2 = d_wgllwgll_xz[K*NGLLX+I];
+      fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+      sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+      // adds gravity term
+      if( gravity ){
+        sum_terms1 += rho_s_H1;
+        sum_terms2 += rho_s_H2;
+        sum_terms3 += rho_s_H3;
+      }
+
+
+#ifdef USE_MESH_COLORING_GPU
+      // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
+      d_accel[iglob*3]     += sum_terms1;
+      d_accel[iglob*3 + 1] += sum_terms2;
+      d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+#else // MESH_COLORING
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+
+        // no atomic operation needed, colors don't share global points between elements
+        // d_accel[iglob*3]     += sum_terms1;
+        // d_accel[iglob*3 + 1] += sum_terms2;
+        // d_accel[iglob*3 + 2] += sum_terms3;
+#ifdef USE_TEXTURES_FIELDS
+        d_accel[iglob*3]     = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+        d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+        d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
+        d_accel[iglob*3]     += sum_terms1;
+        d_accel[iglob*3 + 1] += sum_terms2;
+        d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+      }
+      else {
+
+        // for testing purposes only: w/out atomic updates
+        //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+        //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+        //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+
+        atomicAdd(&d_accel[iglob*3], sum_terms1);
+        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+      } // if(use_mesh_coloring_gpu)
+
+#endif // MESH_COLORING
+
+
+      // update memory variables based upon the Runge-Kutta scheme
+      if( ATTENUATION ){
+        compute_element_att_memory(tx,working_element,NSPEC,
+                                  d_muv,
+                                  factor_common,alphaval,betaval,gammaval,
+                                  R_xx,R_yy,R_xy,R_xz,R_yz,
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+      }
+
+      // save deviatoric strain for Runge-Kutta scheme
+      if( COMPUTE_AND_STORE_STRAIN ){
+        int ijk_ispec = tx + working_element*NGLL3;
+
+        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+      }
+
+    } // if(active)
+
+} // kernel_2_impl()
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
+              int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,
+              int ATTENUATION,int ANISOTROPY,
+              int* d_ibool,
+              realw* d_xix,
+              realw* d_xiy,
+              realw* d_xiz,
+              realw* d_etax,
+              realw* d_etay,
+              realw* d_etaz,
+              realw* d_gammax,
+              realw* d_gammay,
+              realw* d_gammaz,
+              realw* d_kappav,
+              realw* d_muv,
+              realw* d_epsilondev_xx,
+              realw* d_epsilondev_yy,
+              realw* d_epsilondev_xy,
+              realw* d_epsilondev_xz,
+              realw* d_epsilondev_yz,
+              realw* d_epsilon_trace_over_3,
+              realw* d_one_minus_sum_beta,
+              realw* d_factor_common,
+              realw* d_R_xx,
+              realw* d_R_yy,
+              realw* d_R_xy,
+              realw* d_R_xz,
+              realw* d_R_yz,
+              realw* d_b_epsilondev_xx,
+              realw* d_b_epsilondev_yy,
+              realw* d_b_epsilondev_xy,
+              realw* d_b_epsilondev_xz,
+              realw* d_b_epsilondev_yz,
+              realw* d_b_epsilon_trace_over_3,
+              realw* d_b_R_xx,
+              realw* d_b_R_yy,
+              realw* d_b_R_xy,
+              realw* d_b_R_xz,
+              realw* d_b_R_yz,
+              realw* d_c11store,
+              realw* d_c12store,
+              realw* d_c13store,
+              realw* d_c14store,
+              realw* d_c15store,
+              realw* d_c16store,
+              realw* d_c22store,
+              realw* d_c23store,
+              realw* d_c24store,
+              realw* d_c25store,
+              realw* d_c26store,
+              realw* d_c33store,
+              realw* d_c34store,
+              realw* d_c35store,
+              realw* d_c36store,
+              realw* d_c44store,
+              realw* d_c45store,
+              realw* d_c46store,
+              realw* d_c55store,
+              realw* d_c56store,
+              realw* d_c66store,
+              realw* d_rhostore){
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel Kernel 2");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  Kernel_2_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                  mp->NGLOB_AB,
+                                  d_ibool,
+                                  mp->d_phase_ispec_inner_elastic,
+                                  mp->num_phase_ispec_elastic,
+                                  d_iphase,
+                                  mp->use_mesh_coloring_gpu,
+                                  mp->d_displ, mp->d_accel,
+                                  d_xix, d_xiy, d_xiz,
+                                  d_etax, d_etay, d_etaz,
+                                  d_gammax, d_gammay, d_gammaz,
+                                  d_kappav, d_muv,
+                                  COMPUTE_AND_STORE_STRAIN,
+                                  d_epsilondev_xx,
+                                  d_epsilondev_yy,
+                                  d_epsilondev_xy,
+                                  d_epsilondev_xz,
+                                  d_epsilondev_yz,
+                                  d_epsilon_trace_over_3,
+                                  SIMULATION_TYPE,
+                                  ATTENUATION,mp->NSPEC_AB,
+                                  d_one_minus_sum_beta,
+                                  d_factor_common,
+                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                  ANISOTROPY,
+                                  d_c11store,
+                                  d_c12store,
+                                  d_c13store,
+                                  d_c14store,
+                                  d_c15store,
+                                  d_c16store,
+                                  d_c22store,
+                                  d_c23store,
+                                  d_c24store,
+                                  d_c25store,
+                                  d_c26store,
+                                  d_c33store,
+                                  d_c34store,
+                                  d_c35store,
+                                  d_c36store,
+                                  d_c44store,
+                                  d_c45store,
+                                  d_c46store,
+                                  d_c55store,
+                                  d_c56store,
+                                  d_c66store,
+                                  mp->gravity,
+                                  mp->d_minus_g,
+                                  mp->d_minus_deriv_gravity,
+                                  d_rhostore,
+                                  mp->d_wgll_cube);
+
+
+  if(SIMULATION_TYPE == 3) {
+    Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+                                     mp->NGLOB_AB,
+                                     d_ibool,
+                                     mp->d_phase_ispec_inner_elastic,
+                                     mp->num_phase_ispec_elastic,
+                                     d_iphase,
+                                     mp->use_mesh_coloring_gpu,
+                                     mp->d_b_displ, mp->d_b_accel,
+                                     d_xix, d_xiy, d_xiz,
+                                     d_etax, d_etay, d_etaz,
+                                     d_gammax, d_gammay, d_gammaz,
+                                     d_kappav, d_muv,
+                                     COMPUTE_AND_STORE_STRAIN,
+                                     d_b_epsilondev_xx,
+                                     d_b_epsilondev_yy,
+                                     d_b_epsilondev_xy,
+                                     d_b_epsilondev_xz,
+                                     d_b_epsilondev_yz,
+                                     d_b_epsilon_trace_over_3,
+                                     SIMULATION_TYPE,
+                                     ATTENUATION,mp->NSPEC_AB,
+                                     d_one_minus_sum_beta,
+                                     d_factor_common,
+                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                     ANISOTROPY,
+                                     d_c11store,
+                                     d_c12store,
+                                     d_c13store,
+                                     d_c14store,
+                                     d_c15store,
+                                     d_c16store,
+                                     d_c22store,
+                                     d_c23store,
+                                     d_c24store,
+                                     d_c25store,
+                                     d_c26store,
+                                     d_c33store,
+                                     d_c34store,
+                                     d_c35store,
+                                     d_c36store,
+                                     d_c44store,
+                                     d_c45store,
+                                     d_c46store,
+                                     d_c55store,
+                                     d_c56store,
+                                     d_c66store,
+                                     mp->gravity,
+                                     mp->d_minus_g,
+                                     mp->d_minus_deriv_gravity,
+                                     d_rhostore,
+                                     mp->d_wgll_cube);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* LOG("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("Kernel_2_impl ");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(compute_forces_elastic_cuda,
+              COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* iphase,
+                                           int* nspec_outer_elastic,
+                                           int* nspec_inner_elastic,
+                                           int* SIMULATION_TYPE,
+                                           int* COMPUTE_AND_STORE_STRAIN,
+                                           int* ATTENUATION,
+                                           int* ANISOTROPY) {
+
+  TRACE("compute_forces_elastic_cuda");
+  // EPIK_TRACER("compute_forces_elastic_cuda");
+  //printf("Running compute_forces\n");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  if( *iphase == 1 )
+    num_elements = *nspec_outer_elastic;
+  else
+    num_elements = *nspec_inner_elastic;
+
+  // checks if anything to do
+  if( num_elements == 0 ) return;
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         elastic elements also start with outer than inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_elastic;
+      istart = 0;
+
+      // array offsets
+      color_offset = 0;
+      color_offset_nonpadded = 0;
+      color_offset_nonpadded_att2 = 0;
+    }else{
+      // inner elements (start after outer elements)
+      nb_colors = mp->num_colors_outer_elastic + mp->num_colors_inner_elastic;
+      istart = mp->num_colors_outer_elastic;
+
+      // array offsets
+      color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
+      color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
+      color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      nb_blocks_to_compute = mp->h_num_elem_colors_elastic[icolor];
+
+      // checks
+      //if( nb_blocks_to_compute <= 0 ){
+      //  printf("error number of elastic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
+      //  exit(EXIT_FAILURE);
+      //}
+
+      Kernel_2(nb_blocks_to_compute,mp,*iphase,
+               *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+               *ATTENUATION,*ANISOTROPY,
+               mp->d_ibool + color_offset_nonpadded,
+               mp->d_xix + color_offset,
+               mp->d_xiy + color_offset,
+               mp->d_xiz + color_offset,
+               mp->d_etax + color_offset,
+               mp->d_etay + color_offset,
+               mp->d_etaz + color_offset,
+               mp->d_gammax + color_offset,
+               mp->d_gammay + color_offset,
+               mp->d_gammaz + color_offset,
+               mp->d_kappav + color_offset,
+               mp->d_muv + color_offset,
+               mp->d_epsilondev_xx + color_offset_nonpadded,
+               mp->d_epsilondev_yy + color_offset_nonpadded,
+               mp->d_epsilondev_xy + color_offset_nonpadded,
+               mp->d_epsilondev_xz + color_offset_nonpadded,
+               mp->d_epsilondev_yz + color_offset_nonpadded,
+               mp->d_epsilon_trace_over_3 + color_offset_nonpadded,
+               mp->d_one_minus_sum_beta + color_offset_nonpadded,
+               mp->d_factor_common + color_offset_nonpadded_att2,
+               mp->d_R_xx + color_offset_nonpadded,
+               mp->d_R_yy + color_offset_nonpadded,
+               mp->d_R_xy + color_offset_nonpadded,
+               mp->d_R_xz + color_offset_nonpadded,
+               mp->d_R_yz + color_offset_nonpadded,
+               mp->d_b_epsilondev_xx + color_offset_nonpadded,
+               mp->d_b_epsilondev_yy + color_offset_nonpadded,
+               mp->d_b_epsilondev_xy + color_offset_nonpadded,
+               mp->d_b_epsilondev_xz + color_offset_nonpadded,
+               mp->d_b_epsilondev_yz + color_offset_nonpadded,
+               mp->d_b_epsilon_trace_over_3 + color_offset_nonpadded,
+               mp->d_b_R_xx + color_offset_nonpadded,
+               mp->d_b_R_yy + color_offset_nonpadded,
+               mp->d_b_R_xy + color_offset_nonpadded,
+               mp->d_b_R_xz + color_offset_nonpadded,
+               mp->d_b_R_yz + color_offset_nonpadded,
+               mp->d_c11store + color_offset,
+               mp->d_c12store + color_offset,
+               mp->d_c13store + color_offset,
+               mp->d_c14store + color_offset,
+               mp->d_c15store + color_offset,
+               mp->d_c16store + color_offset,
+               mp->d_c22store + color_offset,
+               mp->d_c23store + color_offset,
+               mp->d_c24store + color_offset,
+               mp->d_c25store + color_offset,
+               mp->d_c26store + color_offset,
+               mp->d_c33store + color_offset,
+               mp->d_c34store + color_offset,
+               mp->d_c35store + color_offset,
+               mp->d_c36store + color_offset,
+               mp->d_c44store + color_offset,
+               mp->d_c45store + color_offset,
+               mp->d_c46store + color_offset,
+               mp->d_c55store + color_offset,
+               mp->d_c56store + color_offset,
+               mp->d_c66store + color_offset,
+               mp->d_rhostore + color_offset);
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for no-aligned arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      // for factor_common array
+      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+
+    Kernel_2(num_elements,mp,*iphase,
+             *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+             *ATTENUATION,*ANISOTROPY,
+             mp->d_ibool,
+             mp->d_xix,
+             mp->d_xiy,
+             mp->d_xiz,
+             mp->d_etax,
+             mp->d_etay,
+             mp->d_etaz,
+             mp->d_gammax,
+             mp->d_gammay,
+             mp->d_gammaz,
+             mp->d_kappav,
+             mp->d_muv,
+             mp->d_epsilondev_xx,
+             mp->d_epsilondev_yy,
+             mp->d_epsilondev_xy,
+             mp->d_epsilondev_xz,
+             mp->d_epsilondev_yz,
+             mp->d_epsilon_trace_over_3,
+             mp->d_one_minus_sum_beta,
+             mp->d_factor_common,
+             mp->d_R_xx,
+             mp->d_R_yy,
+             mp->d_R_xy,
+             mp->d_R_xz,
+             mp->d_R_yz,
+             mp->d_b_epsilondev_xx,
+             mp->d_b_epsilondev_yy,
+             mp->d_b_epsilondev_xy,
+             mp->d_b_epsilondev_xz,
+             mp->d_b_epsilondev_yz,
+             mp->d_b_epsilon_trace_over_3,
+             mp->d_b_R_xx,
+             mp->d_b_R_yy,
+             mp->d_b_R_xy,
+             mp->d_b_R_xz,
+             mp->d_b_R_yz,
+             mp->d_c11store,
+             mp->d_c12store,
+             mp->d_c13store,
+             mp->d_c14store,
+             mp->d_c15store,
+             mp->d_c16store,
+             mp->d_c22store,
+             mp->d_c23store,
+             mp->d_c24store,
+             mp->d_c25store,
+             mp->d_c26store,
+             mp->d_c33store,
+             mp->d_c34store,
+             mp->d_c35store,
+             mp->d_c36store,
+             mp->d_c44store,
+             mp->d_c45store,
+             mp->d_c46store,
+             mp->d_c55store,
+             mp->d_c56store,
+             mp->d_c66store,
+             mp->d_rhostore);
+  }
+
+  //daniel: todo - check with routine sync_copy_from_device below...
+//  // Wait until async-memcpy of outer elements is finished and start MPI.
+//  if(*iphase==2) {
+//    cudaStreamSynchronize(mp->copy_stream);
+//
+//    // There have been problems using the pinned-memory with MPI, so
+//    // we copy the buffer into a non-pinned region.
+//    memcpy(mp->send_buffer,mp->h_send_accel_buffer,
+//           mp->size_mpi_send_buffer*sizeof(float));
+//
+//    // memory copy is now finished, so non-blocking MPI send can proceed
+//    // MPI based halo exchange
+//
+//    assemble_mpi_vector_send_cuda_(&(mp->NPROCS),
+//                                   mp->send_buffer, /* "regular" memory */
+//                                   // mp->h_send_accel_buffer, /* pinned memory **CRASH** */
+//                                   mp->buffer_recv_vector_ext_mesh,
+//                                   &mp->num_interfaces_ext_mesh,
+//                                   &mp->max_nibool_interfaces_ext_mesh,
+//                                   mp->nibool_interfaces_ext_mesh,
+//                                   mp->my_neighbours_ext_mesh,
+//                                   mp->request_send_vector_ext_mesh,
+//                                   mp->request_recv_vector_ext_mesh);
+//
+//    // Decided to keep launching kernels and to wait for MPI & do memcpy while other kernels launch.
+//    // cudaDeviceSynchronize();
+//  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+//daniel: todo - use this instead above call to fortran routine to avoid compilation problems
+extern "C"
+void FC_FUNC_(sync_copy_from_device,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+                                     int* iphase,
+                                     realw* send_buffer) {
+
+  TRACE("sync_copy_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  // Wait until async-memcpy of outer elements is finished and start MPI.
+  if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
+
+  //if(*iphase==2) {
+
+  // waits for asynchronous copy to finish
+  cudaStreamSynchronize(mp->copy_stream);
+
+  // There have been problems using the pinned-memory with MPI, so
+  // we copy the buffer into a non-pinned region.
+  memcpy(send_buffer,mp->h_send_accel_buffer,
+         mp->size_mpi_send_buffer*sizeof(float));
+
+  // memory copy is now finished, so non-blocking MPI send can proceed
+
+  //}
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_cuda_device(realw* veloc,
+                                     realw* accel, int size,
+                                     realw deltatover2,
+                                     realw* rmass) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    realw new_accel = accel[id] * rmass[id / 3];
+    veloc[id] += deltatover2 * new_accel;
+    accel[id] = new_accel;
+/*
+    accel[3*id] = accel[3*id]*rmass[id];
+    accel[3*id+1] = accel[3*id+1]*rmass[id];
+    accel[3*id+2] = accel[3*id+2]*rmass[id];
+
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+*/
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_accel_cuda_device(realw* accel,
+                                           int size,
+                                           realw* rmass) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    accel[id] *= rmass[id / 3];
+/*
+    accel[3*id] = accel[3*id]*rmass[id];
+    accel[3*id+1] = accel[3*id+1]*rmass[id];
+    accel[3*id+2] = accel[3*id+2]*rmass[id];
+*/
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_veloc_cuda_device(realw* veloc,
+                                           realw* accel,
+                                           int size,
+                                           realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               int* size_F,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {
+TRACE("kernel_3_a_cuda");
+
+   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+   //int size = *size_F;
+   int size = *size_F * 3;
+   int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+   realw deltatover2 = *deltatover2_F;
+   realw b_deltatover2 = *b_deltatover2_F;
+
+   int blocksize = BLOCKSIZE_KERNEL3;
+   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+   int num_blocks_x = size_padded/blocksize;
+   int num_blocks_y = 1;
+   while(num_blocks_x > 65535) {
+     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+     num_blocks_y = num_blocks_y*2;
+   }
+
+   dim3 grid(num_blocks_x,num_blocks_y);
+   dim3 threads(blocksize,1,1);
+
+   // check whether we can update accel and veloc, or only accel at this point
+   if( *OCEANS == 0 ){
+     // updates both, accel and veloc
+     kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
+
+     if(SIMULATION_TYPE == 3) {
+       kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
+     }
+   }else{
+     // updates only accel
+     kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, size, mp->d_rmass);
+
+     if(SIMULATION_TYPE == 3) {
+       kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel, size, mp->d_rmass);
+     }
+   }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel 3 a");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                             int* size_F,
+                             realw* deltatover2_F,
+                             int* SIMULATION_TYPE_f,
+                             realw* b_deltatover2_F) {
+  TRACE("kernel_3_b_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  int size = *size_F;
+  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // updates only veloc at this point
+  kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
+
+  if(SIMULATION_TYPE == 3) {
+    kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel 3 b");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* OCEANS load on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void elastic_ocean_load_cuda_kernel(realw* accel,
+                                               realw* rmass,
+                                               realw* rmass_ocean_load,
+                                               int num_free_surface_faces,
+                                               int* free_surface_ispec,
+                                               int* free_surface_ijk,
+                                               realw* free_surface_normal,
+                                               int* ibool,
+                                               int* updated_dof_ocean_load) {
+  // gets spectral element face id
+  int igll = threadIdx.x ;  //  threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+  realw nx,ny,nz;
+  realw force_normal_comp,additional_term;
+
+  // for all faces on free surface
+  if( iface < num_free_surface_faces ){
+
+    int ispec = free_surface_ispec[iface]-1;
+
+    // gets global point index
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+    //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
+
+    // only update this global point once
+
+    // daniel: TODO - there might be better ways to implement a mutex like below,
+    //            and find a workaround to not use the temporary update array.
+    //            atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
+
+    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+
+      // get normal
+      nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+      ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+      // make updated component of right-hand side
+      // we divide by rmass() which is 1 / M
+      // we use the total force which includes the Coriolis term above
+      force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+
+      additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+
+      // probably wouldn't need atomicAdd anymore, but just to be sure...
+      atomicAdd(&accel[iglob*3], + additional_term * nx);
+      atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+      atomicAdd(&accel[iglob*3+2], + additional_term * nz);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(elastic_ocean_load_cuda,
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+                                       int* SIMULATION_TYPE) {
+
+  TRACE("elastic_ocean_load_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( mp->num_free_surface_faces == 0 ) return;
+
+  // block sizes: exact blocksize to match NGLLSQUARE
+  int blocksize = NGLL2;
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+
+  // initializes temporary array to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                     sizeof(int)*mp->NGLOB_AB),88501);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
+#endif
+
+  elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
+                                                   mp->d_rmass,
+                                                   mp->d_rmass_ocean_load,
+                                                   mp->num_free_surface_faces,
+                                                   mp->d_free_surface_ispec,
+                                                   mp->d_free_surface_ijk,
+                                                   mp->d_free_surface_normal,
+                                                   mp->d_ibool,
+                                                   mp->d_updated_dof_ocean_load);
+  // for backward/reconstructed potentials
+  if(*SIMULATION_TYPE == 3) {
+    // re-initializes array
+    print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                       sizeof(int)*mp->NGLOB_AB),88502);
+
+    elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,
+                                                     mp->d_rmass,
+                                                     mp->d_rmass_ocean_load,
+                                                     mp->num_free_surface_faces,
+                                                     mp->d_free_surface_ispec,
+                                                     mp->d_free_surface_ijk,
+                                                     mp->d_free_surface_normal,
+                                                     mp->d_ibool,
+                                                     mp->d_updated_dof_ocean_load);
+
+  }
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("elastic_ocean_load_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* note:
+ constant arrays when used in compute_forces_acoustic_cuda.cu routines stay zero,
+ constant declaration and cudaMemcpyToSymbol would have to be in the same file...
+
+ extern keyword doesn't work for __constant__ declarations.
+
+ also:
+ cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
+ ..
+ and compile with -arch=sm_20
+
+ see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
+ doesn't seem to work.
+
+ we could keep arrays separated for acoustic and elastic routines...
+
+ for now, we store pointers with cudaGetSymbolAddress() function calls.
+
+ */
+
+
+// constant arrays
+
+void setConst_hprime_xx(realw* array,Mesh* mp)
+{
+
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+    exit(1);
+  }
+
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// void setConst_hprime_yy(realw* array,Mesh* mp)
+// {
+
+//   cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+//   if (err != cudaSuccess)
+//   {
+//     fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+//     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+//     exit(1);
+//   }
+
+//   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
+//   if(err != cudaSuccess) {
+//     fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+//     exit(1);
+//   }
+// }
+
+// void setConst_hprime_zz(realw* array,Mesh* mp)
+// {
+
+//   cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+//   if (err != cudaSuccess)
+//   {
+//     fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+//     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+//     exit(1);
+//   }
+
+//   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
+//   if(err != cudaSuccess) {
+//     fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+//     exit(1);
+//   }
+// }
+
+
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+  //mp->d_wgllwgll_xy = d_wgllwgll_xy;
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+}
+
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in  setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+  //mp->d_wgllwgll_xz = d_wgllwgll_xz;
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+}
+
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+  //mp->d_wgllwgll_yz = d_wgllwgll_yz;
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+}
+
+void setConst_wgll_cube(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+  //mp->d_wgll_cube = d_wgll_cube;
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+}
diff --git a/src/cuda/compute_kernels_cuda.cu b/src/cuda/compute_kernels_cuda.cu
new file mode 100644
index 000000000..2cb339185
--- /dev/null
+++ b/src/cuda/compute_kernels_cuda.cu
@@ -0,0 +1,646 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
+                                           int* ibool,
+                                           realw* accel,
+                                           realw* b_displ,
+                                           realw* epsilondev_xx,
+                                           realw* epsilondev_yy,
+                                           realw* epsilondev_xy,
+                                           realw* epsilondev_xz,
+                                           realw* epsilondev_yz,
+                                           realw* b_epsilondev_xx,
+                                           realw* b_epsilondev_yy,
+                                           realw* b_epsilondev_xy,
+                                           realw* b_epsilondev_xz,
+                                           realw* b_epsilondev_yz,
+                                           realw* rho_kl,
+                                           realw deltat,
+                                           realw* mu_kl,
+                                           realw* kappa_kl,
+                                           realw* epsilon_trace_over_3,
+                                           realw* b_epsilon_trace_over_3,
+                                           int NSPEC_AB) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // elastic elements only
+    if( ispec_is_elastic[ispec] ) {
+
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;
+
+      // isotropic kernels:
+      // density kernel
+      rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
+                                     accel[3*iglob+1]*b_displ[3*iglob+1]+
+                                     accel[3*iglob+2]*b_displ[3*iglob+2]);
+
+
+      // shear modulus kernel
+      mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+
+                                    epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+
+                                    (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
+                                    (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
+                                    2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
+                                       epsilondev_xz[ijk_ispec]*b_epsilondev_xz[ijk_ispec]+
+                                       epsilondev_yz[ijk_ispec]*b_epsilondev_yz[ijk_ispec]));
+
+      // bulk modulus kernel
+      kappa_kl[ijk_ispec] += deltat*(9*epsilon_trace_over_3[ijk_ispec]*
+                                     b_epsilon_trace_over_3[ijk_ispec]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_kernels_elastic_cuda,
+              COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+                                            realw* deltat_f) {
+TRACE("compute_kernels_elastic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+                                               mp->d_accel, mp->d_b_displ,
+                                               mp->d_epsilondev_xx,
+                                               mp->d_epsilondev_yy,
+                                               mp->d_epsilondev_xy,
+                                               mp->d_epsilondev_xz,
+                                               mp->d_epsilondev_yz,
+                                               mp->d_b_epsilondev_xx,
+                                               mp->d_b_epsilondev_yy,
+                                               mp->d_b_epsilondev_xy,
+                                               mp->d_b_epsilondev_xz,
+                                               mp->d_b_epsilondev_yz,
+                                               mp->d_rho_kl,
+                                               deltat,
+                                               mp->d_mu_kl,
+                                               mp->d_kappa_kl,
+                                               mp->d_epsilon_trace_over_3,
+                                               mp->d_b_epsilon_trace_over_3,
+                                               mp->NSPEC_AB);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_elastic_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void compute_kernels_strength_noise_cuda_kernel(realw* displ,
+                                                           int* free_surface_ispec,
+                                                           int* free_surface_ijk,
+                                                           int* ibool,
+                                                           realw* noise_surface_movie,
+                                                           realw* normal_x_noise,
+                                                           realw* normal_y_noise,
+                                                           realw* normal_z_noise,
+                                                           realw* Sigma_kl,
+                                                           realw deltat,
+                                                           int num_free_surface_faces) {
+  int iface = blockIdx.x + blockIdx.y*gridDim.x;
+
+  if(iface < num_free_surface_faces) {
+
+    int ispec = free_surface_ispec[iface]-1;
+    int igll = threadIdx.x;
+    int ipoin = igll + NGLL2*iface;
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ;
+
+    realw eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
+
+    Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+
+                                                       normal_y_noise[ipoin]*displ[1+3*iglob]+
+                                                       normal_z_noise[ipoin]*displ[2+3*iglob]);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_kernels_strgth_noise_cu,
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {
+
+TRACE("compute_kernels_strgth_noise_cu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(NGLL2,1,1);
+
+  compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->d_ibool,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_Sigma_kl,*deltat,
+                                                               mp->num_free_surface_faces);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
+#endif
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__device__ void compute_gradient_kernel(int ijk,
+                                        int ispec,
+                                        realw* scalar_field,
+                                        realw* vector_field_element,
+                                        realw* hprime_xx,
+                                        realw* hprime_yy,
+                                        realw* hprime_zz,
+                                        realw* d_xix,
+                                        realw* d_xiy,
+                                        realw* d_xiz,
+                                        realw* d_etax,
+                                        realw* d_etay,
+                                        realw* d_etaz,
+                                        realw* d_gammax,
+                                        realw* d_gammay,
+                                        realw* d_gammaz,
+                                        realw rhol,
+                                        int gravity) {
+
+  realw temp1l,temp2l,temp3l;
+  realw hp1,hp2,hp3;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
+  realw rho_invl;
+  int l,offset,offset1,offset2,offset3;
+
+  //const int NGLLX = 5;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (ijk/NGLL2);
+  int J = ((ijk-K*NGLL2)/NGLLX);
+  int I = (ijk-K*NGLL2-J*NGLLX);
+
+  // derivative along x
+  temp1l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp1 = hprime_xx[l*NGLLX+I];
+    offset1 = K*NGLL2+J*NGLLX+l;
+    temp1l += scalar_field[offset1]*hp1;
+  }
+
+  // derivative along y
+  temp2l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp2 = hprime_yy[l*NGLLX+J];
+    offset2 = K*NGLL2+l*NGLLX+I;
+    temp2l += scalar_field[offset2]*hp2;
+  }
+
+  // derivative along z
+  temp3l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp3 = hprime_zz[l*NGLLX+K];
+    offset3 = l*NGLL2+J*NGLLX+I;
+    temp3l += scalar_field[offset3]*hp3;
+
+  }
+
+  offset = ispec*NGLL3_ALIGN + ijk;
+
+  xixl = d_xix[offset];
+  xiyl = d_xiy[offset];
+  xizl = d_xiz[offset];
+  etaxl = d_etax[offset];
+  etayl = d_etay[offset];
+  etazl = d_etaz[offset];
+  gammaxl = d_gammax[offset];
+  gammayl = d_gammay[offset];
+  gammazl = d_gammaz[offset];
+
+  if( gravity ){
+    // daniel: TODO - check gravity case here
+    rho_invl = 1.0f / rhol;
+  }else{
+    rho_invl = 1.0f / rhol;
+  }
+  // derivatives of acoustic scalar potential field on GLL points
+  vector_field_element[0] = (temp1l*xixl + temp2l*etaxl + temp3l*gammaxl) * rho_invl;
+  vector_field_element[1] = (temp1l*xiyl + temp2l*etayl + temp3l*gammayl) * rho_invl;
+  vector_field_element[2] = (temp1l*xizl + temp2l*etazl + temp3l*gammazl) * rho_invl;
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void compute_kernels_acoustic_kernel(int* ispec_is_acoustic,
+                                                int* ibool,
+                                                realw* rhostore,
+                                                realw* kappastore,
+                                                realw* hprime_xx,
+                                                realw* hprime_yy,
+                                                realw* hprime_zz,
+                                                realw* d_xix,
+                                                realw* d_xiy,
+                                                realw* d_xiz,
+                                                realw* d_etax,
+                                                realw* d_etay,
+                                                realw* d_etaz,
+                                                realw* d_gammax,
+                                                realw* d_gammay,
+                                                realw* d_gammaz,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                realw* rho_ac_kl,
+                                                realw* kappa_ac_kl,
+                                                realw deltat,
+                                                int NSPEC_AB,
+                                                int gravity) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if( ispec < NSPEC_AB ){
+
+    // acoustic elements only
+    if( ispec_is_acoustic[ispec] ) {
+
+      int ijk = threadIdx.x;
+
+      // local and global indices
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+      int iglob = ibool[ijk_ispec] - 1;
+
+      realw accel_elm[3];
+      realw b_displ_elm[3];
+      realw rhol,kappal;
+
+      // shared memory between all threads within this block
+      __shared__ realw scalar_field_displ[NGLL3];
+      __shared__ realw scalar_field_accel[NGLL3];
+
+      // copy field values
+      scalar_field_displ[ijk] = b_potential_acoustic[iglob];
+      scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
+      __syncthreads();
+
+      // gets material parameter
+      rhol = rhostore[ijk_ispec_padded];
+
+      // displacement vector from backward field
+      compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // acceleration vector
+      compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // density kernel
+      rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
+                                               accel_elm[1]*b_displ_elm[1] +
+                                               accel_elm[2]*b_displ_elm[2]);
+
+      // bulk modulus kernel
+      kappal = kappastore[ijk_ispec];
+      kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
+                                                * b_potential_dot_dot_acoustic[iglob];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(compute_kernels_acoustic_cuda,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
+                                             long* Mesh_pointer,
+                                             realw* deltat_f) {
+
+TRACE("compute_kernels_acoustic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  compute_kernels_acoustic_kernel<<<grid,threads>>>(mp->d_ispec_is_acoustic,
+                                                    mp->d_ibool,
+                                                    mp->d_rhostore,
+                                                    mp->d_kappastore,
+                                                    mp->d_hprime_xx,
+                                                    mp->d_hprime_yy,
+                                                    mp->d_hprime_zz,
+                                                    mp->d_xix,
+                                                    mp->d_xiy,
+                                                    mp->d_xiz,
+                                                    mp->d_etax,
+                                                    mp->d_etay,
+                                                    mp->d_etaz,
+                                                    mp->d_gammax,
+                                                    mp->d_gammay,
+                                                    mp->d_gammaz,
+                                                    mp->d_potential_dot_dot_acoustic,
+                                                    mp->d_b_potential_acoustic,
+                                                    mp->d_b_potential_dot_dot_acoustic,
+                                                    mp->d_rho_ac_kl,
+                                                    mp->d_kappa_ac_kl,
+                                                    deltat,
+                                                    mp->NSPEC_AB,
+                                                    mp->gravity);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_acoustic_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// preconditioner (approximate Hessian kernel)
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic,
+                                                   int* ibool,
+                                                   realw* accel,
+                                                   realw* b_accel,
+                                                   realw* hess_kl,
+                                                   realw deltat,
+                                                   int NSPEC_AB) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // elastic elements only
+    if( ispec_is_elastic[ispec] ) {
+
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;
+
+      // approximate hessian
+      hess_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_accel[3*iglob]+
+                                      accel[3*iglob+1]*b_accel[3*iglob+1]+
+                                      accel[3*iglob+2]*b_accel[3*iglob+2]);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic,
+                                                   int* ibool,
+                                                   realw* potential_dot_dot_acoustic,
+                                                   realw* b_potential_dot_dot_acoustic,
+                                                   realw* rhostore,
+                                                   realw* hprime_xx,
+                                                   realw* hprime_yy,
+                                                   realw* hprime_zz,
+                                                   realw* d_xix,
+                                                   realw* d_xiy,
+                                                   realw* d_xiz,
+                                                   realw* d_etax,
+                                                   realw* d_etay,
+                                                   realw* d_etaz,
+                                                   realw* d_gammax,
+                                                   realw* d_gammay,
+                                                   realw* d_gammaz,
+                                                   realw* hess_kl,
+                                                   realw deltat,
+                                                   int NSPEC_AB,
+                                                   int gravity) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // acoustic elements only
+    if( ispec_is_acoustic[ispec] ){
+
+      // local and global indices
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;
+
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+
+      realw accel_elm[3];
+      realw b_accel_elm[3];
+      realw rhol;
+
+      // shared memory between all threads within this block
+      __shared__ realw scalar_field_accel[NGLL3];
+      __shared__ realw scalar_field_b_accel[NGLL3];
+
+      // copy field values
+      scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
+      scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob];
+      __syncthreads();
+
+      // gets material parameter
+      rhol = rhostore[ijk_ispec_padded];
+
+      // acceleration vector
+      compute_gradient_kernel(ijk,ispec,
+                              scalar_field_accel,accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // acceleration vector from backward field
+      compute_gradient_kernel(ijk,ispec,
+                              scalar_field_b_accel,b_accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+      // approximates hessian
+      hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
+                                      accel_elm[1]*b_accel_elm[1] +
+                                      accel_elm[2]*b_accel_elm[2]);
+
+    } // ispec_is_acoustic
+
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_kernels_hess_cuda,
+              COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
+                                         realw* deltat_f,
+                                         int* ELASTIC_SIMULATION,
+                                         int* ACOUSTIC_SIMULATION) {
+  TRACE("compute_kernels_hess_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *ELASTIC_SIMULATION ) {
+    compute_kernels_hess_el_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,
+                                                         mp->d_ibool,
+                                                         mp->d_accel,
+                                                         mp->d_b_accel,
+                                                         mp->d_hess_el_kl,
+                                                         deltat,
+                                                         mp->NSPEC_AB);
+  }
+
+  if( *ACOUSTIC_SIMULATION ) {
+    compute_kernels_hess_ac_cudakernel<<<grid,threads>>>(mp->d_ispec_is_acoustic,
+                                                         mp->d_ibool,
+                                                         mp->d_potential_dot_dot_acoustic,
+                                                         mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->d_rhostore,
+                                                         mp->d_hprime_xx,
+                                                         mp->d_hprime_yy,
+                                                         mp->d_hprime_zz,
+                                                         mp->d_xix,
+                                                         mp->d_xiy,
+                                                         mp->d_xiz,
+                                                         mp->d_etax,
+                                                         mp->d_etay,
+                                                         mp->d_etaz,
+                                                         mp->d_gammax,
+                                                         mp->d_gammay,
+                                                         mp->d_gammaz,
+                                                         mp->d_hess_ac_kl,
+                                                         deltat,
+                                                         mp->NSPEC_AB,
+                                                         mp->gravity);
+  }
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_hess_cuda");
+#endif
+}
+
diff --git a/src/cuda/compute_stacey_acoustic_cuda.cu b/src/cuda/compute_stacey_acoustic_cuda.cu
new file mode 100644
index 000000000..0174923ec
--- /dev/null
+++ b/src/cuda/compute_stacey_acoustic_cuda.cu
@@ -0,0 +1,192 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic,
+                                               realw* potential_dot_dot_acoustic,
+                                               int* abs_boundary_ispec,
+                                               int* abs_boundary_ijk,
+                                               realw* abs_boundary_jacobian2Dw,
+                                               int* ibool,
+                                               realw* rhostore,
+                                               realw* kappastore,
+                                               int* ispec_is_inner,
+                                               int* ispec_is_acoustic,
+                                               int phase_is_inner,
+                                               int SIMULATION_TYPE, int SAVE_FORWARD,
+                                               int num_abs_boundary_faces,
+                                               realw* b_potential_dot_acoustic,
+                                               realw* b_potential_dot_dot_acoustic,
+                                               realw* b_absorb_potential,
+                                               int gravity) {
+
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+  realw rhol,kappal,cpl;
+  realw jacobianw;
+  realw vel;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //  if(igll<NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+
+      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+      // determines bulk sound speed
+      rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+      kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+      cpl = sqrt( kappal / rhol );
+
+      // velocity
+      if( gravity ){
+        // daniel: TODO - check gravity and stacey condition here...
+        // uses a potential definition of: s = grad(chi)
+        vel = potential_dot_acoustic[iglob] / rhol ;
+      }else{
+        // uses a potential definition of: s = 1/rho grad(chi)
+        vel = potential_dot_acoustic[iglob] / rhol;
+      }
+
+      // gets associated, weighted jacobian
+      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // Sommerfeld condition
+      atomicAdd(&potential_dot_dot_acoustic[iglob],-vel*jacobianw/cpl);
+
+      // adjoint simulations
+      if( SIMULATION_TYPE == 3 ){
+        // Sommerfeld condition
+        atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
+      }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){
+        // saves boundary values
+        b_absorb_potential[INDEX2(NGLL2,igll,iface)] = vel*jacobianw/cpl;
+      }
+    }
+//  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_acoustic_cuda,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
+                                    int* SAVE_FORWARDf,
+                                    realw* h_b_absorb_potential) {
+TRACE("compute_stacey_acoustic_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner          = *phase_is_innerf;
+  int SIMULATION_TYPE         = *SIMULATION_TYPEf;
+  int SAVE_FORWARD            = *SAVE_FORWARDf;
+
+  // way 1: Elapsed time: 4.385948e-03
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //  int blocksize = 32;
+
+  // way 2: Elapsed time: 4.379034e-03
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //  adjoint simulations: reads in absorbing boundary
+  if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){
+    // copies array to GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
+  }
+
+  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_acoustic,
+                                                   mp->d_potential_dot_dot_acoustic,
+                                                   mp->d_abs_boundary_ispec,
+                                                   mp->d_abs_boundary_ijk,
+                                                   mp->d_abs_boundary_jacobian2Dw,
+                                                   mp->d_ibool,
+                                                   mp->d_rhostore,
+                                                   mp->d_kappastore,
+                                                   mp->d_ispec_is_inner,
+                                                   mp->d_ispec_is_acoustic,
+                                                   phase_is_inner,
+                                                   SIMULATION_TYPE,SAVE_FORWARD,
+                                                   mp->d_num_abs_boundary_faces,
+                                                   mp->d_b_potential_dot_acoustic,
+                                                   mp->d_b_potential_dot_dot_acoustic,
+                                                   mp->d_b_absorb_potential,
+                                                   mp->gravity);
+
+  //  adjoint simulations: stores absorbed wavefield part
+  if (SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){
+    // copies array to CPU
+    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_acoustic_kernel");
+#endif
+}
+
diff --git a/src/cuda/compute_stacey_elastic_cuda.cu b/src/cuda/compute_stacey_elastic_cuda.cu
new file mode 100644
index 000000000..a00776495
--- /dev/null
+++ b/src/cuda/compute_stacey_elastic_cuda.cu
@@ -0,0 +1,214 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_stacey_elastic_kernel(realw* veloc,
+                                              realw* accel,
+                                              int* abs_boundary_ispec,
+                                              int* abs_boundary_ijk,
+                                              realw* abs_boundary_normal,
+                                              realw* abs_boundary_jacobian2Dw,
+                                              int* ibool,
+                                              realw* rho_vp,
+                                              realw* rho_vs,
+                                              int* ispec_is_inner,
+                                              int* ispec_is_elastic,
+                                              int phase_is_inner,
+                                              int SIMULATION_TYPE,
+                                              int SAVE_FORWARD,
+                                              int num_abs_boundary_faces,
+                                              realw* b_accel,
+                                              realw* b_absorb_field) {
+
+  int igll = threadIdx.x; // tx
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+  int i,j,k,iglob,ispec;
+  realw vx,vy,vz,vn;
+  realw nx,ny,nz;
+  realw rho_vp_temp,rho_vs_temp;
+  realw tx,ty,tz;
+  realw jacobianw;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+
+      // gets associated velocity
+
+      vx = veloc[iglob*3+0];
+      vy = veloc[iglob*3+1];
+      vz = veloc[iglob*3+2];
+
+      // gets associated normal
+      nx = abs_boundary_normal[INDEX3(NDIM,NGLL2,0,igll,iface)];
+      ny = abs_boundary_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = abs_boundary_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+      // // velocity component in normal direction (normal points out of element)
+      vn = vx*nx + vy*ny + vz*nz;
+
+      rho_vp_temp = rho_vp[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+      rho_vs_temp = rho_vs[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+      tx = rho_vp_temp*vn*nx + rho_vs_temp*(vx-vn*nx);
+      ty = rho_vp_temp*vn*ny + rho_vs_temp*(vy-vn*ny);
+      tz = rho_vp_temp*vn*nz + rho_vs_temp*(vz-vn*nz);
+
+      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      atomicAdd(&accel[iglob*3],-tx*jacobianw);
+      atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
+      atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
+
+      if(SIMULATION_TYPE == 3) {
+        atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
+      }
+      else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+        b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
+      } // SIMULATION_TYPE
+    }
+  } // num_abs_boundary_faces
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(compute_stacey_elastic_cuda,
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
+                                           int* SAVE_FORWARDf,
+                                           realw* h_b_absorb_field) {
+
+TRACE("compute_stacey_elastic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check
+  if( mp->d_num_abs_boundary_faces == 0 ) return;
+
+  int phase_is_inner    = *phase_is_innerf;
+  int SIMULATION_TYPE   = *SIMULATION_TYPEf;
+  int SAVE_FORWARD      = *SAVE_FORWARDf;
+
+  // way 1
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //int blocksize = 32;
+
+  // way 2: seems sligthly faster
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
+    // The read is done in fortran
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
+#endif
+
+  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+                                                  mp->d_accel,
+                                                  mp->d_abs_boundary_ispec,
+                                                  mp->d_abs_boundary_ijk,
+                                                  mp->d_abs_boundary_normal,
+                                                  mp->d_abs_boundary_jacobian2Dw,
+                                                  mp->d_ibool,
+                                                  mp->d_rho_vp,
+                                                  mp->d_rho_vs,
+                                                  mp->d_ispec_is_inner,
+                                                  mp->d_ispec_is_elastic,
+                                                  phase_is_inner,
+                                                  SIMULATION_TYPE,SAVE_FORWARD,
+                                                  mp->d_num_abs_boundary_faces,
+                                                  mp->d_b_accel,
+                                                  mp->d_b_absorb_field);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_elastic_kernel");
+#endif
+
+  // ! adjoint simulations: stores absorbed wavefield part
+  // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
+  //   write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
+
+  if(SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) {
+    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
+    // The write is done in fortran
+    // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after compute_stacey_elastic after cudamemcpy");
+#endif
+}
+
diff --git a/src/cuda/it_update_displacement_cuda.cu b/src/cuda/it_update_displacement_cuda.cu
new file mode 100644
index 000000000..3c0b3ff94
--- /dev/null
+++ b/src/cuda/it_update_displacement_cuda.cu
@@ -0,0 +1,228 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
+fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
+exit(EXIT_FAILURE); }
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic wavefield
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void UpdateDispVeloc_kernel(realw* displ,
+                                       realw* veloc,
+                                       realw* accel,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
+    veloc[id] = veloc[id] + deltatover2*accel[id];
+    accel[id] = 0; // can do this using memset...not sure if faster
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(it_update_displacement_cuda,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {
+
+TRACE("it_update_displacement_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  //int i,device;
+
+  int size = *size_F;
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltat = *b_deltat_F;
+  realw b_deltatsqover2 = *b_deltatsqover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+  //cublasStatus status;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//  exit_on_cuda_error("Before UpdateDispVeloc_kernel");
+//#endif
+
+  //launch kernel
+  UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
+                                           size,deltat,deltatsqover2,deltatover2);
+
+  //cudaThreadSynchronize();
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+//  // sync and check to catch errors from previous async operations
+//  exit_on_cuda_error("UpdateDispVeloc_kernel");
+//#endif
+
+  // kernel for backward fields
+  if(*SIMULATION_TYPE == 3) {
+
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//    //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+//    exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel");
+//#endif
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("it_update_displacement_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void UpdatePotential_kernel(realw* potential_acoustic,
+                                       realw* potential_dot_acoustic,
+                                       realw* potential_dot_dot_acoustic,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    potential_acoustic[id] = potential_acoustic[id]
+                            + deltat*potential_dot_acoustic[id]
+                            + deltatsqover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_acoustic[id] = potential_dot_acoustic[id]
+                                + deltatover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_dot_acoustic[id] = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(it_update_displacement_ac_cuda,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
+                                               int* size_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
+                                               int* SIMULATION_TYPE,
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {
+TRACE("it_update_displacement_ac_cuda");
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  //int i,device;
+  int size = *size_F;
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltat = *b_deltat_F;
+  realw b_deltatsqover2 = *b_deltatsqover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+  //cublasStatus status;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //launch kernel
+  UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
+                                           mp->d_potential_dot_acoustic,
+                                           mp->d_potential_dot_dot_acoustic,
+                                           size,deltat,deltatsqover2,deltatover2);
+
+  if(*SIMULATION_TYPE == 3) {
+    UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
+                                             mp->d_b_potential_dot_acoustic,
+                                             mp->d_b_potential_dot_dot_acoustic,
+                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+  }
+
+  //cudaThreadSynchronize();
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("it_update_displacement_ac_cuda");
+#endif
+}
diff --git a/src/cuda/mesh_constants_cuda.h b/src/cuda/mesh_constants_cuda.h
new file mode 100644
index 000000000..032504321
--- /dev/null
+++ b/src/cuda/mesh_constants_cuda.h
@@ -0,0 +1,467 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+/* trivia
+
+- for most working arrays we use now "realw" instead of "float" type declarations to make it easier to switch
+  between a real or double precision simulation
+  (matching CUSTOM_REAL == 4 or 8 in fortran routines).
+
+- instead of boolean "logical" declared in fortran routines, in C (or Cuda-C) we have to use "int" variables.
+  ifort / gfortran caveat:
+    to check whether it is true or false, do not check for == 1 to test for true values since ifort just uses
+    non-zero values for true (e.g. can be -1 for true). however, false will be always == 0.
+  thus, rather use: if( var ) {...}  for testing if true instead of if( var == 1){...} (alternative: one could use if( var != 0 ){...}
+
+*/
+
+#ifndef GPU_MESH_
+#define GPU_MESH_
+
+#include <sys/types.h>
+#include <unistd.h>
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for debugging and benchmarking
+
+/* ----------------------------------------------------------------------------------------------- */
+
+#define DEBUG 0
+#if DEBUG == 1
+#define TRACE(x) printf("%s\n",x);
+#else
+#define TRACE(x) // printf("%s\n",x);
+#endif
+
+#define MAXDEBUG 0
+#if MAXDEBUG == 1
+#define LOG(x) printf("%s\n",x)
+#define PRINT5(var,offset) for(;print_count<5;print_count++) printf("var(%d)=%2.20f\n",print_count,var[offset+print_count]);
+#define PRINT10(var) if(print_count<10) { printf("var=%1.20e\n",var); print_count++; }
+#define PRINT10i(var) if(print_count<10) { printf("var=%d\n",var); print_count++; }
+#else
+#define LOG(x) // printf("%s\n",x);
+#define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
+#endif
+
+// error checking after cuda function calls
+/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */
+
+#define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
+
+double get_time();
+
+void print_CUDA_error_if_any(cudaError_t err, int num);
+
+void pause_for_debugger(int pause);
+
+void exit_on_cuda_error(char* kernel_name);
+
+void exit_on_error(char* info);
+
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cuda constant arrays
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// dimensions
+#define NDIM 3
+
+// Gauss-Lobatto-Legendre
+#define NGLLX 5
+#define NGLL2 25
+#define NGLL3 125 // no padding: requires same size as in fortran for NGLLX * NGLLY * NGLLZ
+
+// padding: 128 == 2**7 might improve on older graphics cards w/ coalescent memory accesses:
+#define NGLL3_PADDED 128
+// no padding: 125 == 5*5*5 to avoid allocation of extra memory
+//#define NGLL3_PADDED 125
+
+// number of standard linear solids
+#define N_SLS 3
+
+//typedef float real;   // type of variables passed into function
+typedef float realw;  // type of "working" variables
+
+// double precision temporary variables leads to 10% performance
+// decrease in Kernel_2_impl (not very much..)
+typedef float reald;
+
+// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
+// leads up to ~ 5% performance increase
+//#define USE_MESH_COLORING_GPU
+
+// Texture memory usage:
+// requires CUDA version >= 4.0, see check below
+// Use textures for d_displ and d_accel -- 10% performance boost
+#define USE_TEXTURES_FIELDS
+//
+// Using texture memory for the hprime-style constants is slower on
+// Fermi generation hardware, but *may* be faster on Kepler
+// generation.
+// Use textures for hprime_xx
+//#define USE_TEXTURES_CONSTANTS
+
+// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
+#if CUDA_VERSION < 4000
+#undef USE_TEXTURES_FIELDS
+#undef USE_TEXTURES_CONSTANTS
+#endif
+
+
+// (optional) unrolling loops
+// leads up to ~1% performance increase
+//#define MANUALLY_UNROLLED_LOOPS
+
+// cuda kernel block size for updating displacements/potential (newmark time scheme)
+// current hardware: 128 is slightly faster than 256 ( ~ 4%)
+#define BLOCKSIZE_KERNEL1 128
+#define BLOCKSIZE_KERNEL3 128
+#define BLOCKSIZE_TRANSFER 256
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// indexing
+
+#define INDEX2(xsize,x,y) x + (y)*xsize
+
+#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
+//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+
+#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
+//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
+
+#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
+//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
+
+#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
+
+#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
+//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// mesh pointer wrapper structure
+
+/* ----------------------------------------------------------------------------------------------- */
+
+typedef struct mesh_ {
+
+  // mesh resolution
+  int NSPEC_AB;
+  int NGLOB_AB;
+
+  int myrank;
+  int NPROCS;
+
+  // interpolators
+  realw* d_xix; realw* d_xiy; realw* d_xiz;
+  realw* d_etax; realw* d_etay; realw* d_etaz;
+  realw* d_gammax; realw* d_gammay; realw* d_gammaz;
+
+  // model parameters
+  realw* d_kappav; realw* d_muv;
+
+  // global indexing
+  int* d_ibool;
+
+  // inner / outer elements
+  int* d_ispec_is_inner;
+
+  // mesh coloring
+  int use_mesh_coloring_gpu;
+
+  // pointers to constant memory arrays
+  realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+  realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
+  realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
+  realw* d_wgll_cube;
+
+  // A buffer for mpi-send/recv, which is duplicated in fortran but is
+  // allocated with pinned memory to facilitate asynchronus device <->
+  // host memory transfers
+  float* h_send_accel_buffer;
+  float* h_send_b_accel_buffer;
+  float* send_buffer;
+  float* h_recv_accel_buffer;
+  float* h_recv_b_accel_buffer;
+  float* recv_buffer;
+  int size_mpi_send_buffer;
+  int size_mpi_recv_buffer;
+
+
+
+  // buffers and constants for the MPI-send required for async-memcpy
+  // + non-blocking MPI
+  float* buffer_recv_vector_ext_mesh;
+  int num_interfaces_ext_mesh;
+  int max_nibool_interfaces_ext_mesh;
+  int* nibool_interfaces_ext_mesh;
+  int* my_neighbours_ext_mesh;
+  int* request_send_vector_ext_mesh;
+  int* request_recv_vector_ext_mesh;
+
+
+  // overlapped memcpy streams
+  cudaStream_t compute_stream;
+  cudaStream_t copy_stream;
+  cudaStream_t b_copy_stream;
+
+  // ------------------------------------------------------------------ //
+  // elastic wavefield parameters
+  // ------------------------------------------------------------------ //
+
+  // displacement, velocity, acceleration
+  realw* d_displ; realw* d_veloc; realw* d_accel;
+  // backward/reconstructed elastic wavefield
+  realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
+
+  // Texture references for fast non-coalesced scattered access
+  const textureReference* d_displ_tex_ref_ptr;
+  const textureReference* d_accel_tex_ref_ptr;
+
+  // elastic elements
+  int* d_ispec_is_elastic;
+
+  // elastic domain parameters
+  int* d_phase_ispec_inner_elastic;
+  int num_phase_ispec_elastic;
+
+  // mesh coloring
+  int* h_num_elem_colors_elastic;
+  int num_colors_outer_elastic,num_colors_inner_elastic;
+  int nspec_elastic;
+
+  realw* d_rmass;
+
+  // mpi buffer
+  realw* d_send_accel_buffer;
+
+  // interfaces
+  int* d_nibool_interfaces_ext_mesh;
+  int* d_ibool_interfaces_ext_mesh;
+
+  //used for absorbing stacey boundaries
+  int d_num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  int* d_abs_boundary_ijk;
+  realw* d_abs_boundary_normal;
+  realw* d_abs_boundary_jacobian2Dw;
+
+  realw* d_b_absorb_field;
+  int d_b_reclen_field;
+
+  realw* d_rho_vp;
+  realw* d_rho_vs;
+
+  // sources
+  int nsources_local;
+  realw* d_sourcearrays;
+  double* d_stf_pre_compute;
+  int* d_islice_selected_source;
+  int* d_ispec_selected_source;
+
+  // receivers
+  int* d_number_receiver_global;
+  int* d_ispec_selected_rec;
+  int nrec_local;
+  realw* d_station_seismo_field;
+  realw* h_station_seismo_field;
+
+  double* d_hxir, *d_hetar, *d_hgammar;
+  double* d_dxd, *d_dyd, *d_dzd;
+  double* d_vxd, *d_vyd, *d_vzd;
+  double* d_axd, *d_ayd, *d_azd;
+  realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a;
+  double* d_nu;
+
+  realw* h_seismograms_d_it;
+  realw* h_seismograms_v_it;
+  realw* h_seismograms_a_it;
+
+  // adjoint receivers/sources
+  int nadj_rec_local;
+  realw* d_adj_sourcearrays;
+  realw* h_adj_sourcearrays_slice;
+  int* d_pre_computed_irec;
+
+  // surface elements (to save for noise tomography and acoustic simulations)
+  int* d_free_surface_ispec;
+  int* d_free_surface_ijk;
+  int num_free_surface_faces;
+
+  // surface movie elements to save for noise tomography
+  realw* d_noise_surface_movie;
+
+  // attenuation
+  realw* d_R_xx;
+  realw* d_R_yy;
+  realw* d_R_xy;
+  realw* d_R_xz;
+  realw* d_R_yz;
+
+  realw* d_one_minus_sum_beta;
+  realw* d_factor_common;
+
+  realw* d_alphaval;
+  realw* d_betaval;
+  realw* d_gammaval;
+
+  // attenuation & kernel
+  realw* d_epsilondev_xx;
+  realw* d_epsilondev_yy;
+  realw* d_epsilondev_xy;
+  realw* d_epsilondev_xz;
+  realw* d_epsilondev_yz;
+  realw* d_epsilon_trace_over_3;
+
+  // anisotropy
+  realw* d_c11store;
+  realw* d_c12store;
+  realw* d_c13store;
+  realw* d_c14store;
+  realw* d_c15store;
+  realw* d_c16store;
+  realw* d_c22store;
+  realw* d_c23store;
+  realw* d_c24store;
+  realw* d_c25store;
+  realw* d_c26store;
+  realw* d_c33store;
+  realw* d_c34store;
+  realw* d_c35store;
+  realw* d_c36store;
+  realw* d_c44store;
+  realw* d_c45store;
+  realw* d_c46store;
+  realw* d_c55store;
+  realw* d_c56store;
+  realw* d_c66store;
+
+  // noise
+  realw* d_normal_x_noise;
+  realw* d_normal_y_noise;
+  realw* d_normal_z_noise;
+  realw* d_mask_noise;
+  realw* d_free_surface_jacobian2Dw;
+
+  realw* d_noise_sourcearray;
+
+  // attenuation & kernel backward fields
+  realw* d_b_R_xx;
+  realw* d_b_R_yy;
+  realw* d_b_R_xy;
+  realw* d_b_R_xz;
+  realw* d_b_R_yz;
+
+  realw* d_b_epsilondev_xx;
+  realw* d_b_epsilondev_yy;
+  realw* d_b_epsilondev_xy;
+  realw* d_b_epsilondev_xz;
+  realw* d_b_epsilondev_yz;
+  realw* d_b_epsilon_trace_over_3;
+
+  realw* d_b_alphaval;
+  realw* d_b_betaval;
+  realw* d_b_gammaval;
+
+  // sensitivity kernels
+  realw* d_rho_kl;
+  realw* d_mu_kl;
+  realw* d_kappa_kl;
+
+  // noise sensitivity kernel
+  realw* d_Sigma_kl;
+
+  // approximative hessian for preconditioning kernels
+  realw* d_hess_el_kl;
+
+  // oceans
+  realw* d_rmass_ocean_load;
+  realw* d_free_surface_normal;
+  int* d_updated_dof_ocean_load;
+
+  // ------------------------------------------------------------------ //
+  // acoustic wavefield
+  // ------------------------------------------------------------------ //
+  // potential and first and second time derivative
+  realw* d_potential_acoustic; realw* d_potential_dot_acoustic; realw* d_potential_dot_dot_acoustic;
+  // backward/reconstructed wavefield
+  realw* d_b_potential_acoustic; realw* d_b_potential_dot_acoustic; realw* d_b_potential_dot_dot_acoustic;
+
+  // acoustic domain parameters
+  int* d_ispec_is_acoustic;
+
+  int* d_phase_ispec_inner_acoustic;
+  int num_phase_ispec_acoustic;
+
+  // mesh coloring
+  int* h_num_elem_colors_acoustic;
+  int num_colors_outer_acoustic,num_colors_inner_acoustic;
+  int nspec_acoustic;
+
+  realw* d_rhostore;
+  realw* d_kappastore;
+  realw* d_rmass_acoustic;
+
+  // mpi buffer
+  realw* d_send_potential_dot_dot_buffer;
+
+  realw* d_b_absorb_potential;
+  int d_b_reclen_potential;
+
+  // for writing seismograms
+  realw* d_station_seismo_potential;
+  realw* h_station_seismo_potential;
+
+  // sensitivity kernels
+  realw* d_rho_ac_kl;
+  realw* d_kappa_ac_kl;
+
+  // approximative hessian for preconditioning kernels
+  realw* d_hess_ac_kl;
+
+  // coupling acoustic-elastic
+  int* d_coupling_ac_el_ispec;
+  int* d_coupling_ac_el_ijk;
+  realw* d_coupling_ac_el_normal;
+  realw* d_coupling_ac_el_jacobian2Dw;
+
+  // gravity
+  int gravity;
+  realw* d_minus_deriv_gravity;
+  realw* d_minus_g;
+
+} Mesh;
+
+
+#endif
diff --git a/src/cuda/noise_tomography_cuda.cu b/src/cuda/noise_tomography_cuda.cu
new file mode 100644
index 000000000..e1750b28e
--- /dev/null
+++ b/src/cuda/noise_tomography_cuda.cu
@@ -0,0 +1,305 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){
+TRACE("fortranflush");
+
+  fflush(stdout);
+  fflush(stderr);
+  printf("Flushing proc %d!\n",*rank);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {
+TRACE("fortranprint");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  printf("%d: sends msg_id %d\n",procid,*id);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {
+TRACE("fortranprintf");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  printf("%d: sends val %e\n",procid,*val);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {
+TRACE("fortranprintd");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  printf("%d: sends val %e\n",procid,*val);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// randomize displ for testing
+extern "C"
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
+TRACE("make_displ_rand");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
+  for(int i=0;i<mp->NGLOB_AB*3;i++) {
+    h_displ[i] = rand();
+  }
+  cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,
+                                                int* free_surface_ijk,
+                                                int num_free_surface_faces,
+                                                int* ibool,
+                                                realw* displ,
+                                                realw* noise_surface_movie) {
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
+
+  if(iface < num_free_surface_faces) {
+    int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
+
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+    noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_surface_to_host,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+                                        realw* h_noise_surface_movie) {
+TRACE("transfer_surface_to_host");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(NGLL2,1,1);
+
+  transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
+                                                    mp->d_free_surface_ijk,
+                                                    mp->num_free_surface_faces,
+                                                    mp->d_ibool,
+                                                    mp->d_displ,
+                                                    mp->d_noise_surface_movie);
+
+  cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_surface_to_host");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void noise_read_add_surface_movie_cuda_kernel(realw* accel, int* ibool,
+                                                         int* free_surface_ispec,
+                                                         int* free_surface_ijk,
+                                                         int num_free_surface_faces,
+                                                         realw* noise_surface_movie,
+                                                         realw* normal_x_noise,
+                                                         realw* normal_y_noise,
+                                                         realw* normal_z_noise,
+                                                         realw* mask_noise,
+                                                         realw* free_surface_jacobian2Dw) {
+
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
+
+  // when nspec_top > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+  if(iface < num_free_surface_faces) {
+    int ispec = free_surface_ispec[iface]-1;
+
+    int igll = threadIdx.x;
+
+    int ipoin = NGLL2*iface + igll;
+    int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+    realw normal_x = normal_x_noise[ipoin];
+    realw normal_y = normal_y_noise[ipoin];
+    realw normal_z = normal_z_noise[ipoin];
+
+    realw eta = (noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x +
+                noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y +
+                noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z);
+
+    // error from cuda-memcheck and ddt seems "incorrect", because we
+    // are passing a __constant__ variable pointer around like it was
+    // made using cudaMalloc, which *may* be "incorrect", but produces
+    // correct results.
+
+    // ========= Invalid __global__ read of size
+    // 4 ========= at 0x00000cd8 in
+    // compute_add_sources_cuda.cu:260:noise_read_add_surface_movie_cuda_kernel
+    // ========= by thread (0,0,0) in block (3443,0) ========= Address
+    // 0x203000c8 is out of bounds
+
+    // non atomic version for speed testing -- atomic updates are needed for correctness
+    // accel[3*iglob] +=   eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+
+    // Fortran version in SVN -- note deletion of wgllwgll_xy?
+    // accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface)
+    // accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface)
+    // accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
+
+    // atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+
+    atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(noise_read_add_surface_movie_cu,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {
+TRACE("noise_read_add_surface_movie_cu");
+
+  // EPIK_TRACER("noise_read_add_surface_movie_cu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
+
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(NGLL2,1,1);
+
+  if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
+    noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw);
+  }
+  else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
+    noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
+#endif
+}
diff --git a/src/cuda/prepare_constants_cuda.h b/src/cuda/prepare_constants_cuda.h
new file mode 100644
index 000000000..04c5ea839
--- /dev/null
+++ b/src/cuda/prepare_constants_cuda.h
@@ -0,0 +1,99 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#ifndef CUDA_HEADER_H
+#define CUDA_HEADER_H
+
+typedef float realw;  // type of "working" variables
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// setters for these const arrays (very ugly hack, but will have to do)
+
+// elastic
+void setConst_hprime_xx(realw* array,Mesh* mp);
+void setConst_hprime_yy(realw* array,Mesh* mp);
+void setConst_hprime_zz(realw* array,Mesh* mp);
+
+void setConst_hprimewgll_xx(realw* array,Mesh* mp);
+void setConst_hprimewgll_yy(realw* array,Mesh* mp);
+void setConst_hprimewgll_zz(realw* array,Mesh* mp);
+
+void setConst_wgllwgll_xy(realw* array,Mesh* mp);
+void setConst_wgllwgll_xz(realw* array, Mesh* mp);
+void setConst_wgllwgll_yz(realw* array, Mesh* mp);
+
+void setConst_wgll_cube(realw* array, Mesh* mp);
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* CUDA specific things from specfem3D_kernels.cu */
+
+// older TEXTURE usage. For now just acoustic simulations. See usage
+// of USE_TEXTURES_FIELDS elsewhere in code for elastic case
+#ifdef USE_TEXTURES
+
+// declaration of textures
+texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
+texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+
+
+  void bindTexturesPotential(realw* d_potential_acoustic)
+  {
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic,
+                          channelDescFloat, NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+  void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic)
+  {
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic,
+                          channelDescFloat, NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+#endif // USE_TEXTURES
+
+
+#endif //CUDA_HEADER_H
diff --git a/src/cuda/prepare_mesh_constants_cuda.cu b/src/cuda/prepare_mesh_constants_cuda.cu
new file mode 100644
index 000000000..cc3060f36
--- /dev/null
+++ b/src/cuda/prepare_mesh_constants_cuda.cu
@@ -0,0 +1,2037 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Helper functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+double get_time()
+{
+  struct timeval t;
+  struct timezone tzp;
+  gettimeofday(&t, &tzp);
+  return t.tv_sec + t.tv_usec*1e-6;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {
+TRACE("pause_for_debug");
+
+  pause_for_debugger(1);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void pause_for_debugger(int pause) {
+  if(pause) {
+    int myrank;
+#ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif
+    printf("I'm rank %d\n",myrank);
+    int i = 0;
+    char hostname[256];
+    gethostname(hostname, sizeof(hostname));
+    printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+    FILE *file = fopen("/scratch/eiger/rietmann/attach_gdb.txt","w+");
+    if (file != NULL){
+      fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+      fclose(file);
+    }
+    fflush(stdout);
+    while (0 == i)
+      sleep(5);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_cuda_error(char* kernel_name) {
+  // sync and check to catch errors from previous async operations
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+    {
+      fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+      pause_for_debugger(0);
+      //free(kernel_name);
+#ifdef WITH_MPI
+      MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+      exit(EXIT_FAILURE);
+    }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_error(char* info)
+{
+  printf("\nERROR: %s\n",info);
+  fflush(stdout);
+#ifdef WITH_MPI
+  MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+  //free(info);
+  exit(EXIT_FAILURE);
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void print_CUDA_error_if_any(cudaError_t err, int num)
+{
+  if (cudaSuccess != err)
+  {
+    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+    fflush(stdout);
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+    exit(EXIT_FAILURE);
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void get_free_memory(double* free_db, double* used_db, double* total_db) {
+
+  // gets memory usage in byte
+  size_t free_byte ;
+  size_t total_byte ;
+  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+  if ( cudaSuccess != cuda_status ){
+    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
+    exit(EXIT_FAILURE);
+  }
+
+  *free_db = (double)free_byte ;
+  *total_db = (double)total_byte ;
+  *used_db = *total_db - *free_db ;
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Saves GPU memory usage to file
+void output_free_memory(int myrank,char* info_str) {
+
+  FILE* fp;
+  char filename[BUFSIZ];
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+    fclose(fp);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable version of above method
+extern "C"
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
+TRACE("output_free_device_memory");
+
+  char info[6];
+  sprintf(info,"f %d:",*myrank);
+  output_free_memory(*myrank,info);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+void show_free_memory(char* info_str) {
+
+  // show memory usage of GPU
+  int myrank;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+#else
+  myrank = 0;
+#endif
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
+   used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+}
+*/
+
+/*
+extern "C"
+void FC_FUNC_(show_free_device_memory,
+              SHOW_FREE_DEVICE_MEMORY)() {
+ TRACE("show_free_device_memory");
+
+ show_free_memory("from fortran");
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(get_free_device_memory,
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
+TRACE("get_free_device_memory");
+
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  // converts to MB
+  *free = (realw) free_db/1024.0/1024.0;
+  *used = (realw) used_db/1024.0/1024.0;
+  *total = (realw) total_db/1024.0/1024.0;
+  return;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+__global__ void check_phase_ispec_kernel(int num_phase_ispec,
+                                         int* phase_ispec,
+                                         int NSPEC_AB,
+                                         int* ier) {
+
+  int i,ispec,iphase,count0,count1;
+  *ier = 0;
+
+  for(iphase=0; iphase < 2; iphase++){
+    count0 = 0;
+    count1 = 0;
+
+    for(i=0; i < num_phase_ispec; i++){
+      ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
+      if( ispec < -1 || ispec >= NSPEC_AB ){
+        printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
+        *ier = 1;
+        return;
+      }
+      if( ispec >= 0 ){ count0++;}
+      if( ispec < 0 ){ count1++;}
+    }
+
+    printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
+
+  }
+}
+
+void check_phase_ispec(long* Mesh_pointer_f,int type){
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  printf("check phase_ispec for type=%d\n",type);
+
+  dim3 grid(1,1);
+  dim3 threads(1,1,1);
+
+  int* h_debug = (int*) calloc(1,sizeof(int));
+  int* d_debug;
+  cudaMalloc((void**)&d_debug,sizeof(int));
+
+  if( type == 1 ){
+    check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
+                                             mp->d_phase_ispec_inner_elastic,
+                                             mp->NSPEC_AB,
+                                             d_debug);
+  }else if( type == 2 ){
+    check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
+                                               mp->d_phase_ispec_inner_acoustic,
+                                               mp->NSPEC_AB,
+                                               d_debug);
+  }
+
+  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+  cudaFree(d_debug);
+  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+  free(h_debug);
+  fflush(stdout);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("check_phase_ispec");
+#endif
+
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+__global__ void check_ispec_is_kernel(int NSPEC_AB,
+                                      int* ispec_is,
+                                      int* ier) {
+
+  int ispec,count0,count1;
+
+  *ier = 0;
+  count0 = 0;
+  count1 = 0;
+  for(ispec=0; ispec < NSPEC_AB; ispec++){
+    if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
+      printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
+      *ier = 1;
+      return;
+      //exit(1);
+    }
+    if( ispec_is[ispec] == 0 ){count0++;}
+    if( ispec_is[ispec] != 0 ){count1++;}
+  }
+  printf("check_ispec_is done: count = %d %d\n",count0,count1);
+}
+
+void check_ispec_is(long* Mesh_pointer_f,int type){
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  printf("check ispec_is for type=%d\n",type);
+
+  dim3 grid(1,1);
+  dim3 threads(1,1,1);
+
+  int* h_debug = (int*) calloc(1,sizeof(int));
+  int* d_debug;
+  cudaMalloc((void**)&d_debug,sizeof(int));
+
+  if( type == 0 ){
+    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+                                            mp->d_ispec_is_inner,
+                                            d_debug);
+  }else if( type == 1 ){
+    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+                                            mp->d_ispec_is_elastic,
+                                            d_debug);
+  }else if( type == 2 ){
+    check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+                                            mp->d_ispec_is_acoustic,
+                                            d_debug);
+  }
+
+  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+  cudaFree(d_debug);
+  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+  free(h_debug);
+  fflush(stdout);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("check_ispec_is");
+#endif
+}
+*/
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+__global__ void check_array_ispec_kernel(int num_array_ispec,
+                                         int* array_ispec,
+                                         int NSPEC_AB,
+                                         int* ier) {
+
+  int i,ispec,count0,count1;
+
+  *ier = 0;
+  count0 = 0;
+  count1 = 0;
+
+  for(i=0; i < num_array_ispec; i++){
+    ispec = array_ispec[i] - 1;
+    if( ispec < -1 || ispec >= NSPEC_AB ){
+      printf("Error in d_array_ispec %d %d\n",i,ispec);
+      *ier = 1;
+      return;
+    }
+    if( ispec >= 0 ){ count0++;}
+    if( ispec < 0 ){ count1++;}
+  }
+
+  printf("check_array_ispec done: count = %d %d \n",count0,count1);
+}
+
+void check_array_ispec(long* Mesh_pointer_f,int type){
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  printf("check array_ispec for type=%d\n",type);
+
+  dim3 grid(1,1);
+  dim3 threads(1,1,1);
+
+  int* h_debug = (int*) calloc(1,sizeof(int));
+  int* d_debug;
+  cudaMalloc((void**)&d_debug,sizeof(int));
+
+  if( type == 1 ){
+    check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
+                                               mp->d_abs_boundary_ispec,
+                                               mp->NSPEC_AB,
+                                               d_debug);
+  }
+
+  cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+  cudaFree(d_debug);
+  if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+  free(h_debug);
+  fflush(stdout);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("check_array_ispec");
+#endif
+
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU preparation
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+  TRACE("prepare_cuda_device");
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+/*
+  // cuda initialization (needs -lcuda library)
+  // note:   cuInit initializes the driver API.
+  //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+  //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+  //             the initialization is implicit, thus cuInit() here would not be needed...
+  CUresult status = cuInit(0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+  // returns a handle to the first cuda compute device
+  CUdevice dev;
+  status = cuDeviceGet(&dev, 0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+  // gets device properties
+  int major,minor;
+  status = cuDeviceComputeCapability(&major,&minor,dev);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+  // make sure that the device has compute capability >= 1.3
+  if (major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (major == 1 && minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+*/
+
+  // note: from here on we use the runtime API  ...
+
+  // Gets number of GPU devices
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+
+  // returns device count to fortran
+  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+  *ncuda_devices = device_count;
+
+
+  // Sets the active device
+  if(device_count > 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, cudaSetDevice will complain with the cuda error
+    //         "setting the device when a process is active is not allowed"
+    // releases previous contexts
+    cudaThreadExit();
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device
+    cudaSetDevice( myrank % device_count );
+    exit_on_cuda_error("cudaSetDevice");
+  }
+
+  // returns a handle to the active device
+  int device;
+  cudaGetDevice(&device);
+
+  // get device properties
+  struct cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp,device);
+
+  // exit if the machine has no CUDA-enabled device
+  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+    fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
+    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+  }
+
+  // outputs device infos to file
+  char filename[BUFSIZ];
+  FILE* fp;
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    // display device properties
+    fprintf(fp,"Device Name = %s\n",deviceProp.name);
+    fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+    fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+    fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+    fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+    fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+    fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+            deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+    fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+            deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+    fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+    if(deviceProp.canMapHostMemory){
+      fprintf(fp,"canMapHostMemory: TRUE\n");
+    }else{
+      fprintf(fp,"canMapHostMemory: FALSE\n");
+    }
+    if(deviceProp.deviceOverlap){
+      fprintf(fp,"deviceOverlap: TRUE\n");
+    }else{
+      fprintf(fp,"deviceOverlap: FALSE\n");
+    }
+
+    // outputs initial memory infos via cudaMemGetInfo()
+    double free_db,used_db,total_db;
+    get_free_memory(&free_db,&used_db,&total_db);
+    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+    fclose(fp);
+  }
+
+  // make sure that the device has compute capability >= 1.3
+  if (deviceProp.major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (deviceProp.major == 1 && deviceProp.minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+  // we use pinned memory for asynchronous copy
+  if( ! deviceProp.canMapHostMemory){
+    fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
+    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_constants_device,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* h_NGLLX,
+                                        int* NSPEC_AB, int* NGLOB_AB,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_kappav, realw* h_muv,
+                                        int* h_ibool,
+                                        int* num_interfaces_ext_mesh,
+                                        int* max_nibool_interfaces_ext_mesh,
+                                        int* h_nibool_interfaces_ext_mesh,
+                                        int* h_ibool_interfaces_ext_mesh,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* ABSORBING_CONDITIONS,
+                                        int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
+                                        realw* h_abs_boundary_normal,
+                                        realw* h_abs_boundary_jacobian2Dw,
+                                        int* h_num_abs_boundary_faces,
+                                        int* h_ispec_is_inner,
+                                        int* NSOURCES,
+                                        int* nsources_local_f,
+                                        realw* h_sourcearrays,
+                                        int* h_islice_selected_source,
+                                        int* h_ispec_selected_source,
+                                        int* h_number_receiver_global,
+                                        int* h_ispec_selected_rec,
+                                        int* nrec_f,
+                                        int* nrec_local_f,
+                                        int* SIMULATION_TYPE,
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* nspec_acoustic,int* nspec_elastic,
+                                        int* my_neighbours_ext_mesh,
+                                        int* request_send_vector_ext_mesh,
+                                        int* request_recv_vector_ext_mesh,
+                                        realw* buffer_recv_vector_ext_mesh
+                                        ) {
+
+TRACE("prepare_constants_device");
+
+  // allocates mesh parameter structure
+  Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
+  if (mp == NULL) exit_on_error("error allocating mesh pointer");
+  *Mesh_pointer = (long)mp;
+
+  // checks if NGLLX == 5
+  if( *h_NGLLX != NGLLX ){
+    exit_on_error("NGLLX must be 5 for CUDA devices");
+  }
+
+
+#ifdef WITH_MPI
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD,&nproc);
+  mp->NPROCS=nproc;
+#else
+  mp->NPROCS = 1;
+#endif
+
+
+  // sets global parameters
+  mp->NSPEC_AB = *NSPEC_AB;
+  mp->NGLOB_AB = *NGLOB_AB;
+
+  // sets constant arrays
+  setConst_hprime_xx(h_hprime_xx,mp);
+  // only needed if NGLLX != NGLLY != NGLLZ
+  // setConst_hprime_yy(h_hprime_yy,mp);
+  // setConst_hprime_zz(h_hprime_zz,mp);
+  setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
+  setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
+  setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
+  setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
+  setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
+  setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
+
+  // Using texture memory for the hprime-style constants is slower on
+  // Fermi generation hardware, but *may* be faster on Kepler
+  // generation. We will reevaluate this again, so might as well leave
+  // in the code with with #USE_TEXTURES_FIELDS not-defined.
+  #ifdef USE_TEXTURES_CONSTANTS
+  {
+    const textureReference* d_hprime_xx_tex_ptr;
+    print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
+  }
+  #endif
+
+
+  // Allocate pinned mpi-buffers.
+  // MPI buffers use pinned memory allocated by cudaMallocHost, which
+  // enables the use of asynchronous memory copies from host <->
+  // device
+  int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh);
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+  mp->size_mpi_send_buffer = size_mpi_buffer;
+
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+  mp->size_mpi_recv_buffer = size_mpi_buffer;
+
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+
+  mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+  mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+  mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
+  mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
+  mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
+  mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
+  mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
+
+  // setup two streams, one for compute and one for host<->device memory copies
+  cudaStreamCreate(&mp->compute_stream);
+  cudaStreamCreate(&mp->copy_stream);
+  cudaStreamCreate(&mp->b_copy_stream);
+
+  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+
+  // mesh
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(realw)),1004);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(realw)),1005);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(realw)),1006);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(realw)),1007);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(realw)),1008);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(realw)),1009);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(realw)),1010);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(realw)),1011);
+
+  // transfer constant element data with padding
+  for(int i=0;i < mp->NSPEC_AB;i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*NGLL3_PADDED,   &h_xiz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*NGLL3_PADDED,  &h_etax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*NGLL3_PADDED,  &h_etay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*NGLL3_PADDED,  &h_etaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*NGLL3_PADDED,   &h_muv[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511);
+  }
+
+  // global indexing
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
+                                     NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
+
+  // prepare interprocess-edge exchange information
+  mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+  mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+  if( mp->num_interfaces_ext_mesh > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
+                                       (mp->num_interfaces_ext_mesh)*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
+                                       (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
+                                       (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
+                                       (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1204);
+  }
+
+  // inner elements
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206);
+
+  // absorbing boundaries
+  mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces;
+  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec),
+                                       (mp->d_num_abs_boundary_faces)*sizeof(int)),1101);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
+                                       (mp->d_num_abs_boundary_faces)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1102);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk),
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1104);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal),
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
+                                       3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),1106);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw),
+                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
+                                       NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),1108);
+  }
+
+  // sources
+  mp->nsources_local = *nsources_local_f;
+  if (*SIMULATION_TYPE == 1  || *SIMULATION_TYPE == 3){
+    // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays,
+                                       sizeof(realw)* *NSOURCES*3*NGLL3),1301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
+                                       sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
+                                       *NSOURCES*sizeof(double)),1303);
+  }
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source,
+                                     sizeof(int) * *NSOURCES),1401);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source,
+                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1402);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source,
+                                     sizeof(int)* *NSOURCES),1403);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source,
+                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1404);
+
+
+  // receiver stations
+  int nrec = *nrec_f; // total number of receivers
+  mp->nrec_local = *nrec_local_f; // number of receiver located in this partition
+  //int nrec_local = *nrec_local_f;
+  // note that:
+  // size(number_receiver_global) = nrec_local
+  // size(ispec_selected_rec) = nrec
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),mp->nrec_local*sizeof(int)),1);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
+                                     mp->nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);
+  }
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),1513);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
+                                     nrec*sizeof(int),cudaMemcpyHostToDevice),1514);
+
+#ifdef USE_MESH_COLORING_GPU
+  mp->use_mesh_coloring_gpu = 1;
+  if( ! *USE_MESH_COLORING_GPU_f ) exit_on_error("error with USE_MESH_COLORING_GPU constant; please re-compile\n");
+#else
+  // mesh coloring
+  // note: this here passes the coloring as an option to the kernel routines
+  //          the performance seems to be the same if one uses the pre-processing directives above or not
+  mp->use_mesh_coloring_gpu = *USE_MESH_COLORING_GPU_f;
+#endif
+
+  // number of elements per domain
+  mp->nspec_acoustic = *nspec_acoustic;
+  mp->nspec_elastic = *nspec_elastic;
+
+  // gravity flag initialization
+  mp->gravity = 0;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_constants_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
+                                              int* num_phase_ispec_acoustic,
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              realw* b_absorb_potential,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {
+
+  TRACE("prepare_fields_acoustic_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
+  int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
+  int size_nonpadded = NGLL3 * mp->NSPEC_AB;
+  int size_glob = mp->NGLOB_AB;
+
+  // allocates arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
+
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
+                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+  // padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
+  // transfer constant element data with padding
+  for(int i=0; i < mp->NSPEC_AB; i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
+  }
+
+  // non-padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
+
+  // phase elements
+  mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
+                                      mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
+                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
+                                     mp->NSPEC_AB*sizeof(int)),2009);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
+
+  // free surface
+  if( *NOISE_TOMOGRAPHY == 0 ){
+    // allocate surface arrays
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    if( mp->num_free_surface_faces > 0 ){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
+                                       mp->num_free_surface_faces*sizeof(int)),2201);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
+
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
+    }
+  }
+
+  // absorbing boundaries
+  if( *ABSORBING_CONDITIONS ){
+    mp->d_b_reclen_potential = *b_reclen_potential;
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
+  }
+
+
+  // for seismograms
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
+                                       mp->nrec_local*NGLL3*sizeof(realw)),2107);
+
+    mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
+    if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+  }
+
+
+  // coupling with elastic parts
+  if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
+                                       (*num_coupling_ac_el_faces)*sizeof(int)),2601);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
+                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
+
+  }
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_acoustic = *num_colors_outer_acoustic;
+    mp->num_colors_inner_acoustic = *num_colors_inner_acoustic;
+    mp->h_num_elem_colors_acoustic = (int*) num_elem_colors_acoustic;
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_acoustic_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_adj_dev,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE,
+                                              int* APPROXIMATE_HESS_KL) {
+
+  TRACE("prepare_fields_acoustic_adj_dev");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  int size_glob = mp->NGLOB_AB;
+
+  // kernel simulations
+  if( *SIMULATION_TYPE != 3 ) return;
+
+  // allocates backward/reconstructed arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
+
+  // allocates kernels
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
+
+  // initializes kernel values to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
+  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
+
+  // preconditioner
+  if( *APPROXIMATE_HESS_KL ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
+    // initializes with zeros
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_acoustic_adj_dev");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_elastic_device,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
+                                             int* size,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
+                                             int* num_phase_ispec_elastic,
+                                             int* phase_ispec_inner_elastic,
+                                             int* ispec_is_elastic,
+                                             int* ABSORBING_CONDITIONS,
+                                             realw* h_b_absorb_field,
+                                             int* h_b_reclen_field,
+                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
+                                             int* OCEANS,
+                                             realw* rmass_ocean_load,
+                                             int* NOISE_TOMOGRAPHY,
+                                             realw* free_surface_normal,
+                                             int* free_surface_ispec,
+                                             int* free_surface_ijk,
+                                             int* num_free_surface_faces,
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){
+
+TRACE("prepare_fields_elastic_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+  int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
+
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
+  }
+
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+  }
+  #endif
+
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
+
+  // mass matrix
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
+  // transfer element data
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
+                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
+
+
+  // element indices
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
+
+  // phase elements
+  mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
+                                     mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
+                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
+
+  // for seismograms
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
+                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+
+    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
+    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
+  }
+
+  // absorbing conditions
+  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
+    // non-padded arrays
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
+
+    // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
+
+    // absorb_field array used for file i/o
+    if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
+      mp->d_b_reclen_field = *h_b_reclen_field;
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
+                                       mp->d_b_reclen_field),4016);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
+    }
+  }
+
+  // strains used for attenuation and kernel simulations
+  if( *COMPUTE_AND_STORE_STRAIN ){
+    // strains
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
+                                       epsilondev_size*sizeof(realw)),4301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
+                                       epsilondev_size*sizeof(realw)),4302);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
+                                       epsilondev_size*sizeof(realw)),4304);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4305);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
+                                       epsilondev_size*sizeof(realw)),4306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4307);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
+                                       epsilondev_size*sizeof(realw)),4308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4309);
+
+  }
+
+  // attenuation memory variables
+  if( *ATTENUATION ){
+    // memory arrays
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
+                                       (*R_size)*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4402);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
+                                       (*R_size)*sizeof(realw)),4403);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4404);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
+                                       (*R_size)*sizeof(realw)),4405);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4406);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
+                                       (*R_size)*sizeof(realw)),4407);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4408);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
+                                       (*R_size)*sizeof(realw)),4409);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4410);
+
+    // attenuation factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+    // alpha,beta,gamma factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
+                                       N_SLS*sizeof(realw)),4434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
+                                       N_SLS*sizeof(realw)),4436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
+                                       N_SLS*sizeof(realw)),4438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
+
+  }
+
+  // anisotropy
+  if( *ANISOTROPY ){
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
+                                       size_padded*sizeof(realw)),4700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
+                                       size_padded*sizeof(realw)),4701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
+                                       size_padded*sizeof(realw)),4702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
+                                       size_padded*sizeof(realw)),4703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
+                                       size_padded*sizeof(realw)),4704);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
+                                       size_padded*sizeof(realw)),4705);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
+                                       size_padded*sizeof(realw)),4706);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
+                                       size_padded*sizeof(realw)),4707);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
+                                       size_padded*sizeof(realw)),4708);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
+                                       size_padded*sizeof(realw)),4709);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
+                                       size_padded*sizeof(realw)),4710);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
+                                       size_padded*sizeof(realw)),4711);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
+                                       size_padded*sizeof(realw)),4712);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
+                                       size_padded*sizeof(realw)),4713);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
+                                       size_padded*sizeof(realw)),4714);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
+                                       size_padded*sizeof(realw)),4715);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
+                                       size_padded*sizeof(realw)),4716);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
+                                       size_padded*sizeof(realw)),4717);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
+                                       size_padded*sizeof(realw)),4718);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
+                                       size_padded*sizeof(realw)),4719);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
+                                       size_padded*sizeof(realw)),4720);
+
+    // transfer constant element data with padding
+    for(int i=0;i < mp->NSPEC_AB;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
+    }
+  }
+
+  // ocean load approximation
+  if( *OCEANS ){
+    // oceans needs a free surface
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    if( mp->num_free_surface_faces > 0 ){
+      // mass matrix
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
+                                         sizeof(realw)*mp->NGLOB_AB),4501);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
+                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
+      // surface normal
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
+
+      // temporary global array: used to synchronize updates on global accel array
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
+                                         sizeof(int)*mp->NGLOB_AB),4505);
+
+      if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
+                                          mp->num_free_surface_faces*sizeof(int)),4601);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
+      }
+    }
+  }
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_elastic = *num_colors_outer_elastic;
+    mp->num_colors_inner_elastic = *num_colors_inner_elastic;
+    mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic;
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_elastic_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_elastic_adj_dev,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                             int* size,
+                                             int* SIMULATION_TYPE,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* APPROXIMATE_HESS_KL){
+
+  TRACE("prepare_fields_elastic_adj_dev");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // checks if kernel simulation
+  if( *SIMULATION_TYPE != 3 ) return;
+
+  // kernel simulations
+  // allocates backward/reconstructed arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
+
+  // allocates kernels
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
+
+  // initializes kernel values to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
+  print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
+  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
+
+  // strains used for attenuation and kernel simulations
+  if( *COMPUTE_AND_STORE_STRAIN ){
+    // strains
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
+    // solid pressure
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
+    // backward solid pressure
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
+    // prepares backward strains
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
+                                       epsilondev_size*sizeof(realw)),5321);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
+                                       epsilondev_size*sizeof(realw)),5322);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
+                                       epsilondev_size*sizeof(realw)),5323);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
+                                       epsilondev_size*sizeof(realw)),5324);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
+                                       epsilondev_size*sizeof(realw)),5325);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
+  }
+
+  // attenuation memory variables
+  if( *ATTENUATION ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
+                                       (*R_size)*sizeof(realw)),5421);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5422);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
+                                       (*R_size)*sizeof(realw)),5423);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5424);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
+                                       (*R_size)*sizeof(realw)),5425);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5426);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
+                                       (*R_size)*sizeof(realw)),5427);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5428);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
+                                       (*R_size)*sizeof(realw)),5429);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5420);
+
+    // alpha,beta,gamma factors for backward fields
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
+                                       N_SLS*sizeof(realw)),5434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
+                                       N_SLS*sizeof(realw)),5436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
+                                       N_SLS*sizeof(realw)),5438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
+  }
+
+  if( *APPROXIMATE_HESS_KL ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
+    // initializes with zeros
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_elastic_adj_dev");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// purely adjoint & kernel simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {
+
+  TRACE("prepare_sim2_or_3_const_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // adjoint source arrays
+  mp->nadj_rec_local = *nadj_rec_local;
+  if( mp->nadj_rec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int)),6004);
+
+    // prepares local irec array:
+    // the irec_local variable needs to be precomputed (as
+    // h_pre_comp..), because normally it is in the loop updating accel,
+    // and due to how it's incremented, it cannot be parallelized
+    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+    if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+    int irec_local = 0;
+    for(int irec = 0; irec < *nrec; irec++) {
+      if(*myrank == islice_selected_rec[irec]) {
+        irec_local++;
+        h_pre_computed_irec[irec_local-1] = irec;
+      }
+    }
+    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+    // copies values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
+    free(h_pre_computed_irec);
+
+    // temporary array to prepare extracted source array values
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_sim2_or_3_const_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for NOISE simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,
+                                           int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {
+
+  TRACE("prepare_fields_noise_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // free surface
+  mp->num_free_surface_faces = *num_free_surface_faces;
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
+                                     mp->num_free_surface_faces*sizeof(int)),7001);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
+                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
+
+  // alloc storage for the surface buffer to be copied
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
+
+  // prepares noise source array
+  if( *NOISE_TOMOGRAPHY == 1 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
+                                       3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
+                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
+  }
+
+  // prepares noise directions
+  if( *NOISE_TOMOGRAPHY > 1 ){
+    int nface_size = NGLL2*(*num_free_surface_faces);
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
+                                       nface_size*sizeof(realw)),7301);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
+                                       nface_size*sizeof(realw)),7302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
+                                       nface_size*sizeof(realw)),7303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
+                                       nface_size*sizeof(realw)),7304);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
+                                       nface_size*sizeof(realw)),7305);
+    // transfers data onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
+  }
+
+  // prepares noise strength kernel
+  if( *NOISE_TOMOGRAPHY == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
+                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
+    // initializes kernel values to zero
+    print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("jacobian_size = %d\n",25*(*num_free_surface_faces));
+  exit_on_cuda_error("prepare_fields_noise_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GRAVITY simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_gravity_device,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+                                             int* GRAVITY,
+                                             realw* minus_deriv_gravity,
+                                             realw* minus_g,
+                                             realw* h_wgll_cube,
+                                             int* ACOUSTIC_SIMULATION,
+                                             realw* rhostore) {
+
+  TRACE("prepare_fields_gravity_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  setConst_wgll_cube(h_wgll_cube,mp);
+
+  mp->gravity = *GRAVITY;
+  if( mp->gravity ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity),
+                                       (mp->NGLOB_AB)*sizeof(realw)),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity,
+                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g),
+                                       (mp->NGLOB_AB)*sizeof(realw)),8002);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g,
+                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003);
+
+
+    if( *ACOUSTIC_SIMULATION == 0 ){
+      // rhostore not allocated yet
+      int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+      // padded array
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),8006);
+      // transfer constant element data with padding
+      for(int i=0; i < mp->NSPEC_AB; i++) {
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
+                                           NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8007);
+      }
+    }
+  }
+
+}
+
+extern "C"
+void FC_FUNC_(prepare_seismogram_fields,
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {
+
+  TRACE("prepare_constants_device");
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5* *nrec_local*sizeof(double)),8100);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3**nrec_local*sizeof(realw)),8101);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3**nrec_local*sizeof(realw)),8101);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3**nrec_local*sizeof(realw)),8101);
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+
+  cudaMallocHost((void**)&mp->h_seismograms_d_it,3**nrec_local*sizeof(realw));
+  cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw));
+  cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw));
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cleanup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
+                                      int* SIMULATION_TYPE,
+                                      int* SAVE_FORWARD,
+                                      int* ACOUSTIC_SIMULATION,
+                                      int* ELASTIC_SIMULATION,
+                                      int* ABSORBING_CONDITIONS,
+                                      int* NOISE_TOMOGRAPHY,
+                                      int* COMPUTE_AND_STORE_STRAIN,
+                                      int* ATTENUATION,
+                                      int* ANISOTROPY,
+                                      int* OCEANS,
+                                      int* APPROXIMATE_HESS_KL) {
+
+TRACE("prepare_cleanup_device");
+
+  // frees allocated memory arrays
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // frees memory on GPU
+  // mesh
+  cudaFree(mp->d_xix);
+  cudaFree(mp->d_xiy);
+  cudaFree(mp->d_xiz);
+  cudaFree(mp->d_etax);
+  cudaFree(mp->d_etay);
+  cudaFree(mp->d_etaz);
+  cudaFree(mp->d_gammax);
+  cudaFree(mp->d_gammay);
+  cudaFree(mp->d_gammaz);
+  cudaFree(mp->d_muv);
+
+  // absorbing boundaries
+  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0 ){
+    cudaFree(mp->d_abs_boundary_ispec);
+    cudaFree(mp->d_abs_boundary_ijk);
+    cudaFree(mp->d_abs_boundary_normal);
+    cudaFree(mp->d_abs_boundary_jacobian2Dw);
+  }
+
+  // interfaces
+  cudaFree(mp->d_nibool_interfaces_ext_mesh);
+  cudaFree(mp->d_ibool_interfaces_ext_mesh);
+
+  // global indexing
+  cudaFree(mp->d_ispec_is_inner);
+  cudaFree(mp->d_ibool);
+
+  // sources
+  if (*SIMULATION_TYPE == 1  || *SIMULATION_TYPE == 3){
+    cudaFree(mp->d_sourcearrays);
+    cudaFree(mp->d_stf_pre_compute);
+  }
+
+  cudaFree(mp->d_islice_selected_source);
+  cudaFree(mp->d_ispec_selected_source);
+
+  // receivers
+  if( mp->nrec_local > 0 ) cudaFree(mp->d_number_receiver_global);
+  cudaFree(mp->d_ispec_selected_rec);
+
+  // ACOUSTIC arrays
+  if( *ACOUSTIC_SIMULATION ){
+    cudaFree(mp->d_potential_acoustic);
+    cudaFree(mp->d_potential_dot_acoustic);
+    cudaFree(mp->d_potential_dot_dot_acoustic);
+    cudaFree(mp->d_send_potential_dot_dot_buffer);
+    cudaFree(mp->d_rmass_acoustic);
+    cudaFree(mp->d_rhostore);
+    cudaFree(mp->d_kappastore);
+    cudaFree(mp->d_phase_ispec_inner_acoustic);
+    cudaFree(mp->d_ispec_is_acoustic);
+
+    if( *NOISE_TOMOGRAPHY == 0 ){
+      cudaFree(mp->d_free_surface_ispec);
+      cudaFree(mp->d_free_surface_ijk);
+    }
+
+    if( *ABSORBING_CONDITIONS ) cudaFree(mp->d_b_absorb_potential);
+
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_potential_acoustic);
+      cudaFree(mp->d_b_potential_dot_acoustic);
+      cudaFree(mp->d_b_potential_dot_dot_acoustic);
+      cudaFree(mp->d_rho_ac_kl);
+      cudaFree(mp->d_kappa_ac_kl);
+      if( *APPROXIMATE_HESS_KL) cudaFree(mp->d_hess_ac_kl);
+    }
+
+
+    if(mp->nrec_local > 0 ){
+      cudaFree(mp->d_station_seismo_potential);
+      free(mp->h_station_seismo_potential);
+    }
+
+  } // ACOUSTIC_SIMULATION
+
+  // ELASTIC arrays
+  if( *ELASTIC_SIMULATION ){
+    cudaFree(mp->d_displ);
+    cudaFree(mp->d_veloc);
+    cudaFree(mp->d_accel);
+    cudaFree(mp->d_send_accel_buffer);
+    cudaFree(mp->d_rmass);
+
+    cudaFree(mp->d_phase_ispec_inner_elastic);
+    cudaFree(mp->d_ispec_is_elastic);
+
+    if( mp->nrec_local > 0 ){
+      cudaFree(mp->d_station_seismo_field);
+      free(mp->h_station_seismo_field);
+    }
+
+    if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
+      cudaFree(mp->d_rho_vp);
+      cudaFree(mp->d_rho_vs);
+
+      if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD ))
+          cudaFree(mp->d_b_absorb_field);
+    }
+
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_displ);
+      cudaFree(mp->d_b_veloc);
+      cudaFree(mp->d_b_accel);
+      cudaFree(mp->d_rho_kl);
+      cudaFree(mp->d_mu_kl);
+      cudaFree(mp->d_kappa_kl);
+      if( *APPROXIMATE_HESS_KL ) cudaFree(mp->d_hess_el_kl);
+    }
+
+    if( *COMPUTE_AND_STORE_STRAIN ){
+      cudaFree(mp->d_epsilondev_xx);
+      cudaFree(mp->d_epsilondev_yy);
+      cudaFree(mp->d_epsilondev_xy);
+      cudaFree(mp->d_epsilondev_xz);
+      cudaFree(mp->d_epsilondev_yz);
+      if( *SIMULATION_TYPE == 3 ){
+        cudaFree(mp->d_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilondev_xx);
+        cudaFree(mp->d_b_epsilondev_yy);
+        cudaFree(mp->d_b_epsilondev_xy);
+        cudaFree(mp->d_b_epsilondev_xz);
+        cudaFree(mp->d_b_epsilondev_yz);
+      }
+    }
+
+    if( *ATTENUATION ){
+      cudaFree(mp->d_factor_common);
+      cudaFree(mp->d_one_minus_sum_beta);
+      cudaFree(mp->d_alphaval);
+      cudaFree(mp->d_betaval);
+      cudaFree(mp->d_gammaval);
+      cudaFree(mp->d_R_xx);
+      cudaFree(mp->d_R_yy);
+      cudaFree(mp->d_R_xy);
+      cudaFree(mp->d_R_xz);
+      cudaFree(mp->d_R_yz);
+      if( *SIMULATION_TYPE == 3){
+        cudaFree(mp->d_b_R_xx);
+        cudaFree(mp->d_b_R_yy);
+        cudaFree(mp->d_b_R_xy);
+        cudaFree(mp->d_b_R_xz);
+        cudaFree(mp->d_b_R_yz);
+        cudaFree(mp->d_b_alphaval);
+        cudaFree(mp->d_b_betaval);
+        cudaFree(mp->d_b_gammaval);
+      }
+    }
+
+    if( *ANISOTROPY ){
+      cudaFree(mp->d_c11store);
+      cudaFree(mp->d_c12store);
+      cudaFree(mp->d_c13store);
+      cudaFree(mp->d_c14store);
+      cudaFree(mp->d_c15store);
+      cudaFree(mp->d_c16store);
+      cudaFree(mp->d_c22store);
+      cudaFree(mp->d_c23store);
+      cudaFree(mp->d_c24store);
+      cudaFree(mp->d_c25store);
+      cudaFree(mp->d_c26store);
+      cudaFree(mp->d_c33store);
+      cudaFree(mp->d_c34store);
+      cudaFree(mp->d_c35store);
+      cudaFree(mp->d_c36store);
+      cudaFree(mp->d_c44store);
+      cudaFree(mp->d_c45store);
+      cudaFree(mp->d_c46store);
+      cudaFree(mp->d_c55store);
+      cudaFree(mp->d_c56store);
+      cudaFree(mp->d_c66store);
+    }
+
+    if( *OCEANS ){
+      if( mp->num_free_surface_faces > 0 ){
+        cudaFree(mp->d_rmass_ocean_load);
+        cudaFree(mp->d_free_surface_normal);
+        cudaFree(mp->d_updated_dof_ocean_load);
+        if( *NOISE_TOMOGRAPHY == 0){
+          cudaFree(mp->d_free_surface_ispec);
+          cudaFree(mp->d_free_surface_ijk);
+        }
+      }
+    }
+  } // ELASTIC_SIMULATION
+
+  // purely adjoint & kernel array
+  if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){
+    if(mp->nadj_rec_local > 0 ){
+      cudaFree(mp->d_adj_sourcearrays);
+      cudaFree(mp->d_pre_computed_irec);
+      free(mp->h_adj_sourcearrays_slice);
+    }
+  }
+
+  // NOISE arrays
+  if( *NOISE_TOMOGRAPHY > 0 ){
+    cudaFree(mp->d_free_surface_ispec);
+    cudaFree(mp->d_free_surface_ijk);
+    cudaFree(mp->d_noise_surface_movie);
+    if( *NOISE_TOMOGRAPHY == 1 ) cudaFree(mp->d_noise_sourcearray);
+    if( *NOISE_TOMOGRAPHY > 1 ){
+      cudaFree(mp->d_normal_x_noise);
+      cudaFree(mp->d_normal_y_noise);
+      cudaFree(mp->d_normal_z_noise);
+      cudaFree(mp->d_mask_noise);
+      cudaFree(mp->d_free_surface_jacobian2Dw);
+    }
+    if( *NOISE_TOMOGRAPHY == 3 ) cudaFree(mp->d_Sigma_kl);
+  }
+
+  // mesh pointer - not needed anymore
+  free(mp);
+}
diff --git a/src/cuda/save_and_compare_cpu_vs_gpu.c b/src/cuda/save_and_compare_cpu_vs_gpu.c
new file mode 100644
index 000000000..082a18f2d
--- /dev/null
+++ b/src/cuda/save_and_compare_cpu_vs_gpu.c
@@ -0,0 +1,318 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <errno.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+
+void save_to_max_surface_file_(float* maxval) {
+  int rank;
+  char filename[BUFSIZ];
+  FILE* fp;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+#else
+  rank = 0;
+#endif
+  sprintf(filename,"maxval_surface_proc_%03d.dat",rank);
+  fp = fopen(filename,"a+");
+  fprintf(fp,"%e\n",*maxval);
+  fclose(fp);
+}
+
+
+void save_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
+  FILE* fp;
+  char filename[BUFSIZ];
+  if(*cpu_or_gpu == 0) {
+    sprintf(filename, "debug_output_cpu_%d.dat",*id);
+  }
+  else {
+    sprintf(filename, "debug_output_gpu_%d.dat",*id);
+  }
+  fp = fopen(filename, "wb");
+  printf("writing vector, vector[0]=%e\n",vector[0]);
+  fwrite(vector, sizeof(float), *size, fp);
+  fclose(fp);
+
+}
+
+void save_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
+  FILE* fp;
+  char filename[BUFSIZ];
+  if(*cpu_or_gpu == 0) {
+    sprintf(filename, "debug_output_cpu_%d.dat",*id);
+  }
+  else {
+    sprintf(filename, "debug_output_gpu_%d.dat",*id);
+  }
+  fp = fopen(filename, "wb");
+  fwrite(vector, sizeof(int), *size, fp);
+  fclose(fp);
+
+}
+
+
+void get_max_from_surface_file_(int* nodes_per_iterationf,int* NSTEP) {
+  int nodes_per_iteration = *nodes_per_iterationf;
+  char filename[BUFSIZ];
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+  sprintf(filename,"/scratch/eiger/rietmann/SPECFEM3D_AIGLE/in_out_files/DATABASES_MPI/proc%06d_surface_movie",procid);
+
+  FILE* fp; int it;
+  printf("Opening %s for analysis\n",filename);
+  fp = fopen(filename,"rb");
+  //char* errorstr;
+  if(fp == 0) {
+    //errorstr = (char*) strerror(errno);
+    printf("FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("file error\n");
+    exit(1);
+  }
+
+  float* vector = (float*)malloc(nodes_per_iteration*sizeof(float));
+  float max_val;
+  int i;
+  for(it=0;it<*NSTEP;it++) {
+    int pos = (sizeof(float)*nodes_per_iteration)*(it);
+    fseek(fp,pos,SEEK_SET);
+    fread(vector,sizeof(float),nodes_per_iteration,fp);
+    for(i=0;i<nodes_per_iteration;i++) {
+      max_val = MAX(max_val,vector[i]);
+    }
+    if(it % 500 == 0) {
+      printf("scanning it=%d\n",it);
+    }
+  }
+  printf("max_val=%e\n",max_val);
+}
+
+void compare_two_vectors_exact_(int* sizef,float* vector1,float* vector2,int* num_errors) {
+
+  int size = *sizef;
+  int i;
+  int error_count = 0;
+
+  for(i=0;i<size;++i) {
+    if(vector1[i] != vector2[i]) {
+      error_count++;
+      if(error_count < 10) {
+  printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
+      }
+    }
+  }
+  printf("**** Error Count: %d ****\n",error_count);
+  *num_errors = error_count;
+}
+
+void compare_two_vectors_(int* sizef,float* vector1,float* vector2,int* num_errors) {
+
+  int size = *sizef;
+  int i;
+  int error_count = 0;
+  for(i=0;i<size;++i) {
+    if(vector1[i] != 0) {
+      if( fabsf(vector1[i]-vector2[i])/vector1[i] > 0.01) {
+  if(fabsf(vector1[i]-vector2[i]) > 1e-20) {
+  error_count++;
+      if(error_count<10) {
+        printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
+      }
+      }
+      }
+    }
+    /* if(vector1[i] != vector2[i]) { */
+    /*   if(fabsf(vector1[i]-vector2[i]) > 1e-25) { */
+    /*  error_count++; */
+    /*  if(error_count<50) { */
+    /*    printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */
+    /*  } */
+    /*   } */
+    /* } */
+  }
+  printf("**** Error Count: %d ****\n",error_count);
+  *num_errors = error_count;
+}
+
+void compare_surface_files_(int* bytes_per_iteration, int* number_of_iterations) {
+
+  char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
+  char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";
+
+  FILE* fp_cpu;
+  fp_cpu = fopen(cpu_file,"rb");
+  //char* errorstr;
+  if(fp_cpu == 0) {
+    //errorstr = (char*) strerror(errno);
+    //printf("CPU FILE ERROR:%s\n",errorstr);
+    printf("CPU FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("cpu file error\n");
+  }
+  FILE* fp_gpu;
+  fp_gpu = fopen(gpu_file,"rb");
+
+  if(fp_gpu == NULL) {
+    //errorstr = (char*) strerror(errno);
+    //printf("GPU FILE ERROR:%s\n",errorstr);
+    printf("GPU FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("gpu file error\n");
+  }
+
+  /* pause_for_debug(); */
+
+  float* gpu_vector = (float*)malloc(*bytes_per_iteration);
+  float* cpu_vector = (float*)malloc(*bytes_per_iteration);
+  int i,it,error_count=0;
+  for(it=0;it<*number_of_iterations;it++) {
+    int pos = (*bytes_per_iteration)*(it);
+
+    fseek(fp_cpu,pos,SEEK_SET);
+    fseek(fp_gpu,pos,SEEK_SET);
+
+    int number_of_nodes = *bytes_per_iteration/sizeof(float);
+    fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu);
+    fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu);
+    int size = number_of_nodes;
+    float gpu_min_val=10;
+    float gpu_max_val=0;
+    float cpu_min_val=10;
+    float cpu_max_val=0;
+    if(it<100) {
+      for(i=0;i<size;i++) {
+  if((fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01)) {
+    if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
+    if(cpu_vector[i] > 1e-30) error_count++;
+  }
+  if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
+  if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
+  if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
+  if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
+      }
+      printf("%d Total Errors\n",error_count);
+      printf("size:%d\n",size);
+      printf("GPU:[min/max]=%e/%e\n",gpu_min_val,gpu_max_val);
+      printf("CPU:[min/max]=%e/%e\n",cpu_min_val,cpu_max_val);
+    }
+  }
+  printf("End of Surface Compare\n");
+  exit(1);
+}
+
+
+void compare_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
+  FILE* fp;
+  char cmp_filename[BUFSIZ];
+  float* compare_vector = (float*)malloc(*size*sizeof(float));
+  if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
+    sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
+  }
+  else {
+    sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
+  }
+  fopen(cmp_filename, "rb");
+  /* read the values */
+  if((fp=fopen(cmp_filename, "rb"))==NULL) {
+    printf("Cannot open comparison file %s.\n",cmp_filename);
+    exit(1);
+  }
+  if(fread(compare_vector, sizeof(float), *size, fp) != *size) {
+    if(feof(fp))
+       printf("Premature end of file.");
+    else
+       printf("File read error.");
+  }
+
+  fclose(fp);
+
+  int i;
+  int error_count=0;
+  for(i=0;i<*size;i++) {
+    if((fabs(vector[i] - compare_vector[i])/vector[i] > 0.0001)) {
+      if(error_count < 30) {
+        printf("ERROR[%d]: %f != %f\n",i,compare_vector[i], vector[i]);
+      }
+      error_count++;
+      /* if(compare_vector[i] > 1e-30) error_count++; */
+    }
+  }
+  printf("%d Total Errors\n",error_count);
+  printf("size:%d\n",*size);
+  /* for(i=0;i<30;i++) { */
+  /*   printf("val[%d]: %g != %g\n",i,compare_vector[i], vector[i]); */
+  /*   /\* printf("error_check[%d]= %g\n",abs(vector[i] - compare_vector[i])/vector[i]); *\/ */
+  /* } */
+}
+
+void compare_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
+  FILE* fp;
+  char cmp_filename[BUFSIZ];
+  int* compare_vector = (int*)malloc(*size*sizeof(int));
+  if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
+    sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
+  }
+  else {
+    sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
+  }
+  fopen(cmp_filename, "rb");
+  /* read the values */
+  if((fp=fopen(cmp_filename, "rb"))==NULL) {
+    printf("Cannot open comparison file %s.\n",cmp_filename);
+    exit(1);
+  }
+  if(fread(compare_vector, sizeof(int), *size, fp) != *size) {
+    if(feof(fp))
+       printf("Premature end of file.");
+    else
+       printf("File read error.");
+  }
+
+  fclose(fp);
+
+  int i;
+  int error_count=0;
+  for(i=0;i<*size;i++) {
+    if((abs(vector[i] - compare_vector[i])/vector[i] > 0.01) && error_count < 30) {
+      printf("ERROR[%d]: %d != %d\n",i,compare_vector[i], vector[i]);
+      error_count++;
+    }
+  }
+  printf("%d Total Errors\n",error_count);
+}
diff --git a/src/cuda/specfem3D_gpu_cuda_method_stubs.c b/src/cuda/specfem3D_gpu_cuda_method_stubs.c
new file mode 100644
index 000000000..d87e083ce
--- /dev/null
+++ b/src/cuda/specfem3D_gpu_cuda_method_stubs.c
@@ -0,0 +1,805 @@
+/*
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+
+typedef float realw;
+
+
+
+//
+// src/cuda/check_fields_cuda.cu
+//
+
+void FC_FUNC_(check_max_norm_displ_gpu,
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_vector,
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+
+void FC_FUNC_(check_max_norm_displ,
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_displ_gpu,
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_accel_gpu,
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_veloc_gpu,
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_displ,
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_accel,
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+
+void FC_FUNC_(check_error_vectors,
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+                                                  long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                                 long* Mesh_pointer_f,
+                                                 int* SIMULATION_TYPE) {}
+
+
+//
+// src/cuda/compute_add_sources_acoustic_cuda.cu
+//
+
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* NSOURCESf,
+                                                 int* SIMULATION_TYPEf,
+                                                 double* h_stf_pre_compute,
+                                                 int* myrankf) {}
+
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
+                                                      int* phase_is_innerf,
+                                                      int* NSOURCESf,
+                                                      int* SIMULATION_TYPEf,
+                                                      double* h_stf_pre_compute,
+                                                      int* myrankf) {}
+
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+
+
+//
+// src/cuda/compute_add_sources_elastic_cuda.cu
+//
+
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* NSOURCESf,
+                                            double* h_stf_pre_compute,
+                                            int* myrankf) {}
+
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {}
+
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
+                                                int* islice_selected_rec) {}
+
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+
+
+//
+// src/cuda/compute_coupling_cuda.cu
+//
+
+void FC_FUNC_(compute_coupling_ac_el_cuda,
+              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {}
+
+void FC_FUNC_(compute_coupling_el_ac_cuda,
+              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {}
+
+
+//
+// src/cuda/compute_forces_acoustic_cuda.cu
+//
+
+void FC_FUNC_(transfer_boun_pot_from_device,
+              TRANSFER_BOUN_POT_FROM_DEVICE)(
+                                              int* size,
+                                              long* Mesh_pointer_f,
+                                              realw* potential_dot_dot_acoustic,
+                                              realw* send_potential_dot_dot_buffer,
+                                              int* num_interfaces_ext_mesh,
+                                              int* max_nibool_interfaces_ext_mesh,
+                                              int* nibool_interfaces_ext_mesh,
+                                              int* ibool_interfaces_ext_mesh,
+                                              int* FORWARD_OR_ADJOINT){}
+
+void FC_FUNC_(transfer_asmbl_pot_to_device,
+              TRANSFER_ASMBL_POT_TO_DEVICE)(
+                                                long* Mesh_pointer,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* buffer_recv_scalar_ext_mesh,
+                                                int* num_interfaces_ext_mesh,
+                                                int* max_nibool_interfaces_ext_mesh,
+                                                int* nibool_interfaces_ext_mesh,
+                                                int* ibool_interfaces_ext_mesh,
+                                                int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(compute_forces_acoustic_cuda,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            int* iphase,
+                                            int* nspec_outer_acoustic,
+                                            int* nspec_inner_acoustic,
+                                            int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                             long* Mesh_pointer,
+                             int* size_F,
+                             int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                                                             long* Mesh_pointer,
+                                                             int* size_F,
+                                                             realw* deltatover2_F,
+                                                             int* SIMULATION_TYPE,
+                                                             realw* b_deltatover2_F) {}
+
+void FC_FUNC_(acoustic_enforce_free_surf_cuda,
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE,
+                                                  int* ABSORB_FREE_SURFACE) {}
+
+
+//
+// src/cuda/compute_forces_elastic_cuda.cu
+//
+
+//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {};}
+
+void FC_FUNC_(transfer_boun_accel_from_device,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+                                                    realw* send_accel_buffer,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT){}
+
+void FC_FUNC_(transfer_boundary_from_device_a,
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
+                                               int* nspec_outer_elastic) {}
+
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {}
+
+//void FC_FUNC_(assemble_accel_on_device,
+//              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+//                                              realw* buffer_recv_vector_ext_mesh,
+//                                              int* num_interfaces_ext_mesh,
+//                                              int* max_nibool_interfaces_ext_mesh,
+//                                              int* nibool_interfaces_ext_mesh,
+//                                              int* ibool_interfaces_ext_mesh,
+//                                              int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
+                                                    realw* buffer_recv_vector_ext_mesh,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(compute_forces_elastic_cuda,
+              COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* iphase,
+                                           int* nspec_outer_elastic,
+                                           int* nspec_inner_elastic,
+                                           int* SIMULATION_TYPE,
+                                           int* COMPUTE_AND_STORE_STRAIN,
+                                           int* ATTENUATION,
+                                           int* ANISOTROPY) {}
+
+void FC_FUNC_(sync_copy_from_device,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+                                     int* iphase,
+                                     realw* send_buffer) {}
+
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               int* size_F,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {}
+
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                             int* size_F,
+                             realw* deltatover2_F,
+                             int* SIMULATION_TYPE_f,
+                             realw* b_deltatover2_F) {}
+
+void FC_FUNC_(elastic_ocean_load_cuda,
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+                                       int* SIMULATION_TYPE) {}
+
+
+//
+// src/cuda/compute_kernels_cuda.cu
+//
+
+void FC_FUNC_(compute_kernels_elastic_cuda,
+              COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+                                            realw* deltat_f) {}
+
+void FC_FUNC_(compute_kernels_strgth_noise_cu,
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {}
+
+void FC_FUNC_(compute_kernels_acoustic_cuda,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
+                                             long* Mesh_pointer,
+                                             realw* deltat_f) {}
+
+void FC_FUNC_(compute_kernels_hess_cuda,
+              COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
+                                         realw* deltat_f,
+                                         int* ELASTIC_SIMULATION,
+                                         int* ACOUSTIC_SIMULATION) {}
+
+
+//
+// src/cuda/compute_stacey_acoustic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_acoustic_cuda,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
+                                    int* SAVE_FORWARDf,
+                                    realw* h_b_absorb_potential) {}
+
+
+//
+// src/cuda/compute_stacey_elastic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_elastic_cuda,
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
+                                           int* SAVE_FORWARDf,
+                                           realw* h_b_absorb_field) {}
+
+
+//
+// src/cuda/it_update_displacement_cuda.cu
+//
+
+void FC_FUNC_(it_update_displacement_cuda,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {}
+
+void FC_FUNC_(it_update_displacement_ac_cuda,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
+                                               int* size_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
+                                               int* SIMULATION_TYPE,
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {}
+
+
+//
+// src/cuda/noise_tomography_cuda.cu
+//
+
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+
+void FC_FUNC_(transfer_surface_to_host,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+                                        realw* h_noise_surface_movie) {}
+
+void FC_FUNC_(noise_read_add_surface_movie_cu,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {}
+
+
+//
+// src/cuda/prepare_mesh_constants_cuda.cu
+//
+
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+
+void FC_FUNC_(show_free_device_memory,
+              SHOW_FREE_DEVICE_MEMORY)() {}
+
+void FC_FUNC_(get_free_device_memory,
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
+ exit(1);
+}
+
+void FC_FUNC_(prepare_constants_device,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* h_NGLLX,
+                                        int* NSPEC_AB, int* NGLOB_AB,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_kappav, realw* h_muv,
+                                        int* h_ibool,
+                                        int* num_interfaces_ext_mesh,
+                                        int* max_nibool_interfaces_ext_mesh,
+                                        int* h_nibool_interfaces_ext_mesh,
+                                        int* h_ibool_interfaces_ext_mesh,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* ABSORBING_CONDITIONS,
+                                        int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
+                                        realw* h_abs_boundary_normal,
+                                        realw* h_abs_boundary_jacobian2Dw,
+                                        int* h_num_abs_boundary_faces,
+                                        int* h_ispec_is_inner,
+                                        int* NSOURCES,
+                                        int* nsources_local_f,
+                                        realw* h_sourcearrays,
+                                        int* h_islice_selected_source,
+                                        int* h_ispec_selected_source,
+                                        int* h_number_receiver_global,
+                                        int* h_ispec_selected_rec,
+                                        int* nrec_f,
+                                        int* nrec_local_f,
+                                        int* SIMULATION_TYPE,
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* nspec_acoustic,int* nspec_elastic,
+                                        int* my_neighbours_ext_mesh,
+                                        int* request_send_vector_ext_mesh,
+                                        int* request_recv_vector_ext_mesh,
+                                        realw* buffer_recv_vector_ext_mesh
+                                        ) {}
+
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
+                                              int* num_phase_ispec_acoustic,
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              realw* b_absorb_potential,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {}
+
+void FC_FUNC_(prepare_fields_acoustic_adj_dev,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE,
+                                              int* APPROXIMATE_HESS_KL) {}
+
+void FC_FUNC_(prepare_fields_elastic_device,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
+                                             int* size,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
+                                             int* num_phase_ispec_elastic,
+                                             int* phase_ispec_inner_elastic,
+                                             int* ispec_is_elastic,
+                                             int* ABSORBING_CONDITIONS,
+                                             realw* h_b_absorb_field,
+                                             int* h_b_reclen_field,
+                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
+                                             int* OCEANS,
+                                             realw* rmass_ocean_load,
+                                             int* NOISE_TOMOGRAPHY,
+                                             realw* free_surface_normal,
+                                             int* free_surface_ispec,
+                                             int* free_surface_ijk,
+                                             int* num_free_surface_faces,
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){}
+
+void FC_FUNC_(prepare_fields_elastic_adj_dev,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                             int* size,
+                                             int* SIMULATION_TYPE,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* APPROXIMATE_HESS_KL){}
+
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {}
+
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,
+                                           int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {}
+
+void FC_FUNC_(prepare_fields_gravity_device,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+                                             int* GRAVITY,
+                                             realw* minus_deriv_gravity,
+                                             realw* minus_g,
+                                             realw* h_wgll_cube,
+                                             int* ACOUSTIC_SIMULATION,
+                                             realw* rhostore) {}
+
+void FC_FUNC_(prepare_seismogram_fields,
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {}
+
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
+                                      int* SIMULATION_TYPE,
+                                      int* SAVE_FORWARD,
+                                      int* ACOUSTIC_SIMULATION,
+                                      int* ELASTIC_SIMULATION,
+                                      int* ABSORBING_CONDITIONS,
+                                      int* NOISE_TOMOGRAPHY,
+                                      int* COMPUTE_AND_STORE_STRAIN,
+                                      int* ATTENUATION,
+                                      int* ANISOTROPY,
+                                      int* OCEANS,
+                                      int* APPROXIMATE_HESS_KL) {}
+
+
+//
+// src/cuda/transfer_fields_cuda.cu
+//
+
+void FC_FUNC_(transfer_fields_el_to_device,
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_fields_el_from_device,
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_to_device,
+              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                           long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_from_device,
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_accel_to_device,
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_accel_from_device,
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_accel_from_device,
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_sigma_from_device,
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_displ_from_device,
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_displ_from_device,
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {}
+
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {}
+
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             int* size_R,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
+                                             int* size_epsilondev) {}
+
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                               int* size_R,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               int* size_epsilondev) {}
+
+void FC_FUNC_(transfer_kernels_el_to_host,
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
+                                                    int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_noise_to_host,
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+                                                          realw* h_Sigma_kl,
+                                                          int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_fields_ac_to_device,
+              TRANSFER_FIELDS_AC_TO_DEVICE)(
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
+                                                  long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_ac_to_device,
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
+                                                    long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_fields_ac_from_device,
+              TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
+                                              realw* potential_acoustic,
+                                              realw* potential_dot_acoustic,
+                                              realw* potential_dot_dot_acoustic,
+                                              long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_ac_from_device,
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
+                                                      long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_dot_dot_from_device,
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_dot_dot_from_device,
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_kernels_ac_to_host,
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_hess_el_tohost,
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+                                              realw* h_hess_kl,
+                                              int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_hess_ac_tohost,
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
+                                             realw* h_hess_ac_kl,
+                                             int* NSPEC_AB) {}
+
+
+//
+// src/cuda/write_seismograms_cuda.cu
+//
+
+void FC_FUNC_(transfer_seismograms_el_from_d,
+              TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPEf,
+                                              realw* seismograms_d,
+                                              realw* seismograms_v,
+                                              realw* seismograms_a,
+                                              int* it) {}
+
+void FC_FUNC_(transfer_station_el_from_device,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                   long* Mesh_pointer_f,int* number_receiver_global,
+                                                   int* ispec_selected_rec,int* ispec_selected_source,
+                                                   int* ibool,int* SIMULATION_TYPEf) {}
+
+void FC_FUNC_(transfer_station_ac_from_device,
+              TRANSFER_STATION_AC_FROM_DEVICE)(
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer_f,
+                                                int* number_receiver_global,
+                                                int* ispec_selected_rec,
+                                                int* ispec_selected_source,
+                                                int* ibool,
+                                                int* SIMULATION_TYPEf) {}
+
diff --git a/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak b/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak
new file mode 100644
index 000000000..d87e083ce
--- /dev/null
+++ b/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak
@@ -0,0 +1,805 @@
+/*
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+
+typedef float realw;
+
+
+
+//
+// src/cuda/check_fields_cuda.cu
+//
+
+void FC_FUNC_(check_max_norm_displ_gpu,
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_vector,
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+
+void FC_FUNC_(check_max_norm_displ,
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_displ_gpu,
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_accel_gpu,
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_veloc_gpu,
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_displ,
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+
+void FC_FUNC_(check_max_norm_b_accel,
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+
+void FC_FUNC_(check_error_vectors,
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+                                                  long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                                 long* Mesh_pointer_f,
+                                                 int* SIMULATION_TYPE) {}
+
+
+//
+// src/cuda/compute_add_sources_acoustic_cuda.cu
+//
+
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* NSOURCESf,
+                                                 int* SIMULATION_TYPEf,
+                                                 double* h_stf_pre_compute,
+                                                 int* myrankf) {}
+
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
+                                                      int* phase_is_innerf,
+                                                      int* NSOURCESf,
+                                                      int* SIMULATION_TYPEf,
+                                                      double* h_stf_pre_compute,
+                                                      int* myrankf) {}
+
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+
+
+//
+// src/cuda/compute_add_sources_elastic_cuda.cu
+//
+
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* NSOURCESf,
+                                            double* h_stf_pre_compute,
+                                            int* myrankf) {}
+
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {}
+
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
+                                                int* islice_selected_rec) {}
+
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+
+
+//
+// src/cuda/compute_coupling_cuda.cu
+//
+
+void FC_FUNC_(compute_coupling_ac_el_cuda,
+              COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {}
+
+void FC_FUNC_(compute_coupling_el_ac_cuda,
+              COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* num_coupling_ac_el_facesf,
+                                           int* SIMULATION_TYPEf) {}
+
+
+//
+// src/cuda/compute_forces_acoustic_cuda.cu
+//
+
+void FC_FUNC_(transfer_boun_pot_from_device,
+              TRANSFER_BOUN_POT_FROM_DEVICE)(
+                                              int* size,
+                                              long* Mesh_pointer_f,
+                                              realw* potential_dot_dot_acoustic,
+                                              realw* send_potential_dot_dot_buffer,
+                                              int* num_interfaces_ext_mesh,
+                                              int* max_nibool_interfaces_ext_mesh,
+                                              int* nibool_interfaces_ext_mesh,
+                                              int* ibool_interfaces_ext_mesh,
+                                              int* FORWARD_OR_ADJOINT){}
+
+void FC_FUNC_(transfer_asmbl_pot_to_device,
+              TRANSFER_ASMBL_POT_TO_DEVICE)(
+                                                long* Mesh_pointer,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* buffer_recv_scalar_ext_mesh,
+                                                int* num_interfaces_ext_mesh,
+                                                int* max_nibool_interfaces_ext_mesh,
+                                                int* nibool_interfaces_ext_mesh,
+                                                int* ibool_interfaces_ext_mesh,
+                                                int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(compute_forces_acoustic_cuda,
+              COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+                                            int* iphase,
+                                            int* nspec_outer_acoustic,
+                                            int* nspec_inner_acoustic,
+                                            int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                             long* Mesh_pointer,
+                             int* size_F,
+                             int* SIMULATION_TYPE) {}
+
+void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
+                                                             long* Mesh_pointer,
+                                                             int* size_F,
+                                                             realw* deltatover2_F,
+                                                             int* SIMULATION_TYPE,
+                                                             realw* b_deltatover2_F) {}
+
+void FC_FUNC_(acoustic_enforce_free_surf_cuda,
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE,
+                                                  int* ABSORB_FREE_SURFACE) {}
+
+
+//
+// src/cuda/compute_forces_elastic_cuda.cu
+//
+
+//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {};}
+
+void FC_FUNC_(transfer_boun_accel_from_device,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+                                                    realw* send_accel_buffer,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT){}
+
+void FC_FUNC_(transfer_boundary_from_device_a,
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
+                                               int* nspec_outer_elastic) {}
+
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {}
+
+//void FC_FUNC_(assemble_accel_on_device,
+//              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+//                                              realw* buffer_recv_vector_ext_mesh,
+//                                              int* num_interfaces_ext_mesh,
+//                                              int* max_nibool_interfaces_ext_mesh,
+//                                              int* nibool_interfaces_ext_mesh,
+//                                              int* ibool_interfaces_ext_mesh,
+//                                              int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
+                                                    realw* buffer_recv_vector_ext_mesh,
+                                                    int* num_interfaces_ext_mesh,
+                                                    int* max_nibool_interfaces_ext_mesh,
+                                                    int* nibool_interfaces_ext_mesh,
+                                                    int* ibool_interfaces_ext_mesh,
+                                                    int* FORWARD_OR_ADJOINT) {}
+
+void FC_FUNC_(compute_forces_elastic_cuda,
+              COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* iphase,
+                                           int* nspec_outer_elastic,
+                                           int* nspec_inner_elastic,
+                                           int* SIMULATION_TYPE,
+                                           int* COMPUTE_AND_STORE_STRAIN,
+                                           int* ATTENUATION,
+                                           int* ANISOTROPY) {}
+
+void FC_FUNC_(sync_copy_from_device,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+                                     int* iphase,
+                                     realw* send_buffer) {}
+
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               int* size_F,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {}
+
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                             int* size_F,
+                             realw* deltatover2_F,
+                             int* SIMULATION_TYPE_f,
+                             realw* b_deltatover2_F) {}
+
+void FC_FUNC_(elastic_ocean_load_cuda,
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+                                       int* SIMULATION_TYPE) {}
+
+
+//
+// src/cuda/compute_kernels_cuda.cu
+//
+
+void FC_FUNC_(compute_kernels_elastic_cuda,
+              COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+                                            realw* deltat_f) {}
+
+void FC_FUNC_(compute_kernels_strgth_noise_cu,
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {}
+
+void FC_FUNC_(compute_kernels_acoustic_cuda,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
+                                             long* Mesh_pointer,
+                                             realw* deltat_f) {}
+
+void FC_FUNC_(compute_kernels_hess_cuda,
+              COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
+                                         realw* deltat_f,
+                                         int* ELASTIC_SIMULATION,
+                                         int* ACOUSTIC_SIMULATION) {}
+
+
+//
+// src/cuda/compute_stacey_acoustic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_acoustic_cuda,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
+                                    int* SAVE_FORWARDf,
+                                    realw* h_b_absorb_potential) {}
+
+
+//
+// src/cuda/compute_stacey_elastic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_elastic_cuda,
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
+                                           int* SAVE_FORWARDf,
+                                           realw* h_b_absorb_field) {}
+
+
+//
+// src/cuda/it_update_displacement_cuda.cu
+//
+
+void FC_FUNC_(it_update_displacement_cuda,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {}
+
+void FC_FUNC_(it_update_displacement_ac_cuda,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
+                                               int* size_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
+                                               int* SIMULATION_TYPE,
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {}
+
+
+//
+// src/cuda/noise_tomography_cuda.cu
+//
+
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+
+void FC_FUNC_(transfer_surface_to_host,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+                                        realw* h_noise_surface_movie) {}
+
+void FC_FUNC_(noise_read_add_surface_movie_cu,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {}
+
+
+//
+// src/cuda/prepare_mesh_constants_cuda.cu
+//
+
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+
+void FC_FUNC_(show_free_device_memory,
+              SHOW_FREE_DEVICE_MEMORY)() {}
+
+void FC_FUNC_(get_free_device_memory,
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
+ exit(1);
+}
+
+void FC_FUNC_(prepare_constants_device,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* h_NGLLX,
+                                        int* NSPEC_AB, int* NGLOB_AB,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_kappav, realw* h_muv,
+                                        int* h_ibool,
+                                        int* num_interfaces_ext_mesh,
+                                        int* max_nibool_interfaces_ext_mesh,
+                                        int* h_nibool_interfaces_ext_mesh,
+                                        int* h_ibool_interfaces_ext_mesh,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* ABSORBING_CONDITIONS,
+                                        int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
+                                        realw* h_abs_boundary_normal,
+                                        realw* h_abs_boundary_jacobian2Dw,
+                                        int* h_num_abs_boundary_faces,
+                                        int* h_ispec_is_inner,
+                                        int* NSOURCES,
+                                        int* nsources_local_f,
+                                        realw* h_sourcearrays,
+                                        int* h_islice_selected_source,
+                                        int* h_ispec_selected_source,
+                                        int* h_number_receiver_global,
+                                        int* h_ispec_selected_rec,
+                                        int* nrec_f,
+                                        int* nrec_local_f,
+                                        int* SIMULATION_TYPE,
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* nspec_acoustic,int* nspec_elastic,
+                                        int* my_neighbours_ext_mesh,
+                                        int* request_send_vector_ext_mesh,
+                                        int* request_recv_vector_ext_mesh,
+                                        realw* buffer_recv_vector_ext_mesh
+                                        ) {}
+
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
+                                              int* num_phase_ispec_acoustic,
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              realw* b_absorb_potential,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {}
+
+void FC_FUNC_(prepare_fields_acoustic_adj_dev,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE,
+                                              int* APPROXIMATE_HESS_KL) {}
+
+void FC_FUNC_(prepare_fields_elastic_device,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
+                                             int* size,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
+                                             int* num_phase_ispec_elastic,
+                                             int* phase_ispec_inner_elastic,
+                                             int* ispec_is_elastic,
+                                             int* ABSORBING_CONDITIONS,
+                                             realw* h_b_absorb_field,
+                                             int* h_b_reclen_field,
+                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
+                                             int* OCEANS,
+                                             realw* rmass_ocean_load,
+                                             int* NOISE_TOMOGRAPHY,
+                                             realw* free_surface_normal,
+                                             int* free_surface_ispec,
+                                             int* free_surface_ijk,
+                                             int* num_free_surface_faces,
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){}
+
+void FC_FUNC_(prepare_fields_elastic_adj_dev,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                             int* size,
+                                             int* SIMULATION_TYPE,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* APPROXIMATE_HESS_KL){}
+
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {}
+
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,
+                                           int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {}
+
+void FC_FUNC_(prepare_fields_gravity_device,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+                                             int* GRAVITY,
+                                             realw* minus_deriv_gravity,
+                                             realw* minus_g,
+                                             realw* h_wgll_cube,
+                                             int* ACOUSTIC_SIMULATION,
+                                             realw* rhostore) {}
+
+void FC_FUNC_(prepare_seismogram_fields,
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {}
+
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
+                                      int* SIMULATION_TYPE,
+                                      int* SAVE_FORWARD,
+                                      int* ACOUSTIC_SIMULATION,
+                                      int* ELASTIC_SIMULATION,
+                                      int* ABSORBING_CONDITIONS,
+                                      int* NOISE_TOMOGRAPHY,
+                                      int* COMPUTE_AND_STORE_STRAIN,
+                                      int* ATTENUATION,
+                                      int* ANISOTROPY,
+                                      int* OCEANS,
+                                      int* APPROXIMATE_HESS_KL) {}
+
+
+//
+// src/cuda/transfer_fields_cuda.cu
+//
+
+void FC_FUNC_(transfer_fields_el_to_device,
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_fields_el_from_device,
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_to_device,
+              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                           long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_from_device,
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_accel_to_device,
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_accel_from_device,
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_accel_from_device,
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_sigma_from_device,
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_displ_from_device,
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_displ_from_device,
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {}
+
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {}
+
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             int* size_R,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
+                                             int* size_epsilondev) {}
+
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                               int* size_R,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               int* size_epsilondev) {}
+
+void FC_FUNC_(transfer_kernels_el_to_host,
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
+                                                    int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_noise_to_host,
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+                                                          realw* h_Sigma_kl,
+                                                          int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_fields_ac_to_device,
+              TRANSFER_FIELDS_AC_TO_DEVICE)(
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
+                                                  long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_ac_to_device,
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
+                                                    long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_fields_ac_from_device,
+              TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
+                                              realw* potential_acoustic,
+                                              realw* potential_dot_acoustic,
+                                              realw* potential_dot_dot_acoustic,
+                                              long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_fields_ac_from_device,
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
+                                                      long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_dot_dot_from_device,
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_b_dot_dot_from_device,
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+
+void FC_FUNC_(transfer_kernels_ac_to_host,
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_hess_el_tohost,
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+                                              realw* h_hess_kl,
+                                              int* NSPEC_AB) {}
+
+void FC_FUNC_(transfer_kernels_hess_ac_tohost,
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
+                                             realw* h_hess_ac_kl,
+                                             int* NSPEC_AB) {}
+
+
+//
+// src/cuda/write_seismograms_cuda.cu
+//
+
+void FC_FUNC_(transfer_seismograms_el_from_d,
+              TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPEf,
+                                              realw* seismograms_d,
+                                              realw* seismograms_v,
+                                              realw* seismograms_a,
+                                              int* it) {}
+
+void FC_FUNC_(transfer_station_el_from_device,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                   long* Mesh_pointer_f,int* number_receiver_global,
+                                                   int* ispec_selected_rec,int* ispec_selected_source,
+                                                   int* ibool,int* SIMULATION_TYPEf) {}
+
+void FC_FUNC_(transfer_station_ac_from_device,
+              TRANSFER_STATION_AC_FROM_DEVICE)(
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer_f,
+                                                int* number_receiver_global,
+                                                int* ispec_selected_rec,
+                                                int* ispec_selected_source,
+                                                int* ibool,
+                                                int* SIMULATION_TYPEf) {}
+
diff --git a/src/cuda/transfer_fields_cuda.cu b/src/cuda/transfer_fields_cuda.cu
new file mode 100644
index 000000000..2e6ca44e2
--- /dev/null
+++ b/src/cuda/transfer_fields_cuda.cu
@@ -0,0 +1,586 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Transfer functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_el_to_device,
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_fields_el_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_el_from_device,
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_el_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_to_device,
+              TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                           long* Mesh_pointer_f) {
+
+  TRACE("transfer_b_fields_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_from_device,
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_b_fields_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_accel_to_device,
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_accel_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_accel_from_device,
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_accel_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_accel_from_device,
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_b_accel_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_sigma_from_device,
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
+
+TRACE("transfer_sigma_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_displ_from_device,
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+
+TRACE("transfer_b_displ_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_displ_from_device,
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+
+TRACE("transfer_displ_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {
+TRACE("transfer_compute_kernel_answers_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {
+TRACE("transfer_compute_kernel_fields_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             int* size_R,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
+                                             int* size_epsilondev) {
+  TRACE("transfer_b_fields_att_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+
+  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_att_to_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                               int* size_R,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               int* size_epsilondev) {
+  TRACE("transfer_fields_att_from_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_att_from_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_el_to_host,
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
+                                                    int* NSPEC_AB) {
+TRACE("transfer_kernels_el_to_host");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40101);
+  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40102);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40103);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for NOISE simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_noise_to_host,
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+                                                          realw* h_Sigma_kl,
+                                                          int* NSPEC_AB) {
+TRACE("transfer_kernels_noise_to_host");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40201);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_ac_to_device,
+              TRANSFER_FIELDS_AC_TO_DEVICE)(
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
+                                                  long* Mesh_pointer_f) {
+TRACE("transfer_fields_ac_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50120);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50130);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_ac_to_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_ac_to_device,
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
+                                                    long* Mesh_pointer_f) {
+TRACE("transfer_b_fields_ac_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51120);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51130);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_ac_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_ac_from_device,
+              TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
+                                              realw* potential_acoustic,
+                                              realw* potential_dot_acoustic,
+                                              realw* potential_dot_dot_acoustic,
+                                              long* Mesh_pointer_f) {
+TRACE("transfer_fields_ac_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52121);
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52131);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_ac_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_ac_from_device,
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
+                                                      long* Mesh_pointer_f) {
+TRACE("transfer_b_fields_ac_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53121);
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53131);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_ac_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_dot_dot_from_device,
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+  TRACE("transfer_dot_dot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_dot_dot_from_device,
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+  TRACE("transfer_b_dot_dot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_ac_to_host,
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {
+
+  TRACE("transfer_kernels_ac_to_host");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int size = *NSPEC_AB*NGLL3;
+
+  // copies kernel values over to CPU host
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),54101);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),54102);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for Hess kernel calculations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_hess_el_tohost,
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+                                              realw* h_hess_kl,
+                                              int* NSPEC_AB) {
+TRACE("transfer_kernels_hess_el_tohost");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),70201);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_hess_ac_tohost,
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
+                                             realw* h_hess_ac_kl,
+                                             int* NSPEC_AB) {
+  TRACE("transfer_kernels_hess_ac_tohost");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),70202);
+}
+
+
diff --git a/src/cuda/write_seismograms_cuda.cu b/src/cuda/write_seismograms_cuda.cu
new file mode 100644
index 000000000..d706905b2
--- /dev/null
+++ b/src/cuda/write_seismograms_cuda.cu
@@ -0,0 +1,537 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+      ! gets global number of that receiver
+    irec = number_receiver_global(irec_local)
+
+    ! gets local receiver interpolators
+    ! (1-D Lagrange interpolators)
+    hxir(:) = hxir_store(irec_local,:)
+    hetar(:) = hetar_store(irec_local,:)
+    hgammar(:) = hgammar_store(irec_local,:)
+
+*/
+
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as
+// needed. If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement.
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+  *num_blocks_x = num_blocks;
+  *num_blocks_y = 1;
+  while(*num_blocks_x > 65535) {
+    *num_blocks_x = (int) ceil(*num_blocks_x*0.5f);
+    *num_blocks_y = *num_blocks_y*2;
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+                             (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                               __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
+               realw* displ, realw* veloc, realw* accel,
+               int* ibool,
+               double* hxir, double* hetar, double* hgammar,
+               realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+               double* nu,
+               int* number_receiver_global,
+               int* ispec_selected_rec) {
+  int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+  int ijk = i+5*(j+5*(k));
+
+  // we do the **d variable reduction in shared memory, because the
+  // atomicAdd() should be faster on the shared memory registers
+  // according to
+  // http://supercomputingblog.com/cuda/cuda-tutorial-4-atomic-operations/
+  __shared__ double sh_dxd[NGLL3];
+  __shared__ double sh_dyd[NGLL3];
+  __shared__ double sh_dzd[NGLL3];
+  __shared__ double sh_vxd[NGLL3];
+  __shared__ double sh_vyd[NGLL3];
+  __shared__ double sh_vzd[NGLL3];
+  __shared__ double sh_axd[NGLL3];
+  __shared__ double sh_ayd[NGLL3];
+  __shared__ double sh_azd[NGLL3];
+
+  if(irec_local < nrec_local) {
+    int irec = number_receiver_global[irec_local]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[ijk+125*ispec]-1;
+    double hlagrange = hxir[irec_local + nrec_local*i]*hetar[irec_local + nrec_local*j]*hgammar[irec_local + nrec_local*k];
+    sh_dxd[ijk] = hlagrange*displ[0+3*iglob];
+    sh_dyd[ijk] = hlagrange*displ[1+3*iglob];
+    sh_dzd[ijk] = hlagrange*displ[2+3*iglob];
+
+    sh_vxd[ijk] = hlagrange*veloc[0+3*iglob];
+    sh_vyd[ijk] = hlagrange*veloc[1+3*iglob];
+    sh_vzd[ijk] = hlagrange*veloc[2+3*iglob];
+
+    sh_axd[ijk] = hlagrange*accel[0+3*iglob];
+    sh_ayd[ijk] = hlagrange*accel[1+3*iglob];
+    sh_azd[ijk] = hlagrange*accel[2+3*iglob];
+
+    // the reduction has to skip the first element (we don't need to
+    // add element 0 to itself) This reduction serializes the code,
+    // but it should be fast enough --- it can be made faster with a
+    // proper reduction algorithm.
+    __syncthreads();
+
+    // if(ijk>0) {
+    // reduction needs to be done atomically to avoid race conditions
+      // atomicAdd(&sh_dxd[0],sh_dxd[ijk]);
+      // atomicAdd(&sh_dyd[0],sh_dyd[ijk]);
+      // atomicAdd(&sh_dzd[0],sh_dzd[ijk]);
+
+      // atomicAdd(&sh_vxd[0],sh_vxd[ijk]);
+      // atomicAdd(&sh_vyd[0],sh_vyd[ijk]);
+      // atomicAdd(&sh_vzd[0],sh_vzd[ijk]);
+
+      // atomicAdd(&sh_axd[0],sh_axd[ijk]);
+      // atomicAdd(&sh_ayd[0],sh_ayd[ijk]);
+      // atomicAdd(&sh_azd[0],sh_azd[ijk]);
+    // }
+    // __syncthreads();
+    if(ijk==0) {
+      // a loop in thread 0 is 4 times faster than atomic operations
+      for(int i=1;i<125;i++) {
+        sh_dxd[0] += sh_dxd[i];
+        sh_dyd[0] += sh_dyd[i];
+        sh_dzd[0] += sh_dzd[i];
+
+        sh_vxd[0] += sh_vxd[i];
+        sh_vyd[0] += sh_vyd[i];
+        sh_vzd[0] += sh_vzd[i];
+
+        sh_axd[0] += sh_axd[i];
+        sh_ayd[0] += sh_ayd[i];
+        sh_azd[0] += sh_azd[i];
+
+      }
+
+      seismograms_d[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_dxd[0] + nu[0+3*(1+3*irec)]*sh_dyd[0] + nu[0+3*(2+3*irec)]*sh_dzd[0];
+      seismograms_d[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_dxd[0] + nu[1+3*(1+3*irec)]*sh_dyd[0] + nu[1+3*(2+3*irec)]*sh_dzd[0];
+      seismograms_d[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_dxd[0] + nu[2+3*(1+3*irec)]*sh_dyd[0] + nu[2+3*(2+3*irec)]*sh_dzd[0];
+
+      seismograms_v[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_vxd[0] + nu[0+3*(1+3*irec)]*sh_vyd[0] + nu[0+3*(2+3*irec)]*sh_vzd[0];
+      seismograms_v[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_vxd[0] + nu[1+3*(1+3*irec)]*sh_vyd[0] + nu[1+3*(2+3*irec)]*sh_vzd[0];
+      seismograms_v[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_vxd[0] + nu[2+3*(1+3*irec)]*sh_vyd[0] + nu[2+3*(2+3*irec)]*sh_vzd[0];
+
+      seismograms_a[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_axd[0] + nu[0+3*(1+3*irec)]*sh_ayd[0] + nu[0+3*(2+3*irec)]*sh_azd[0];
+      seismograms_a[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_axd[0] + nu[1+3*(1+3*irec)]*sh_ayd[0] + nu[1+3*(2+3*irec)]*sh_azd[0];
+      seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0];
+
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_seismograms_el_from_d,
+              TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPEf,
+                                              realw* seismograms_d,
+                                              realw* seismograms_v,
+                                              realw* seismograms_a,
+                                              int* it) {
+
+// transfers seismograms from device to host
+
+  TRACE("transfer_seismograms_el_from_d");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  // double h_debug[125]; for(int i=0;i<125;i++) h_debug[i] = 0;
+  // double* d_debug; cudaMalloc((void**)&d_debug,125*sizeof(double));
+  // cudaMemcpy(d_debug,h_debug,125*sizeof(double),cudaMemcpyHostToDevice);
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
+                      mp->d_displ,mp->d_veloc,mp->d_accel,
+                      mp->d_ibool,
+                      mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+                      mp->d_seismograms_d,
+                      mp->d_seismograms_v,
+                      mp->d_seismograms_a,
+                      mp->d_nu,
+                      mp->d_number_receiver_global,
+                      mp->d_ispec_selected_rec
+                      );
+
+  // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
+
+  cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+  cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+  cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("seismogram Execution Time: %f ms\n",time);
+
+  // if(abs(mp->h_seismograms_d_it[0]) < 1e-25) printf("seismo1_x=%e\n",mp->h_seismograms_d_it[0]);
+  // if(abs(mp->h_seismograms_d_it[1]) < 1e-25) printf("seismo1_y=%e\n",mp->h_seismograms_d_it[1]);
+
+  // if(abs(mp->h_seismograms_d_it[2]) < 1e-25) {
+
+  // printf("%d:seismo1_z=%e\n",*it,mp->h_seismograms_d_it[2]);
+
+  // }
+
+
+  memcpy(&seismograms_d[3**nrec_local*(*it-1)],mp->h_seismograms_d_it,3* *nrec_local*sizeof(realw));
+  memcpy(&seismograms_v[3**nrec_local*(*it-1)],mp->h_seismograms_v_it,3* *nrec_local*sizeof(realw));
+  memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw));
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
+                                                            int* ispec_selected_rec,
+                                                            int* ibool,
+                                                            realw* station_seismo_field,
+                                                            realw* desired_field,
+                                                            int nrec_local) {
+  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  if(blockID<nrec_local) {
+    int irec = number_receiver_global[blockID]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
+                                          int* number_receiver_global,
+                                          int* d_ispec_selected,
+                                          int* h_ispec_selected,
+                                          int* ibool) {
+
+TRACE("transfer_field_from_device");
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int blocksize = NGLL3;
+  int num_blocks_x = mp->nrec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // prepare field transfer array on device
+  transfer_stations_fields_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
+                                                                d_ispec_selected,
+                                                                mp->d_ibool,
+                                                                mp->d_station_seismo_field,
+                                                                d_field,
+                                                                mp->nrec_local);
+
+  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+       (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  int irec_local;
+  for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
+    int irec = number_receiver_global[irec_local] - 1;
+    int ispec = h_ispec_selected[irec] - 1;
+
+    for(int i=0;i<NGLL3;i++) {
+      int iglob = ibool[i+NGLL3*ispec] - 1;
+      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+    }
+
+  }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_station_el_from_device,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                   long* Mesh_pointer_f,int* number_receiver_global,
+                                                   int* ispec_selected_rec,int* ispec_selected_source,
+                                                   int* ibool,int* SIMULATION_TYPEf) {
+TRACE("transfer_station_el_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+  if(SIMULATION_TYPE == 1) {
+    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+  else if(SIMULATION_TYPE == 2) {
+    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+  }
+  else if(SIMULATION_TYPE == 3) {
+    transfer_field_from_device(mp,mp->d_b_displ,b_displ, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_b_veloc,b_veloc, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_b_accel,b_accel, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
+                                                                     int* ispec_selected_rec,
+                                                                     int* ibool,
+                                                                     realw* station_seismo_potential,
+                                                                     realw* desired_potential) {
+
+  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  int nodeID = threadIdx.x + blockID*blockDim.x;
+
+  int irec = number_receiver_global[blockID]-1;
+  int ispec = ispec_selected_rec[irec]-1;
+  int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+  //if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]);
+
+  station_seismo_potential[nodeID] = desired_potential[iglob];
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void transfer_field_acoustic_from_device(Mesh* mp,
+                                         realw* d_potential,
+                                         realw* h_potential,
+                                         int* number_receiver_global,
+                                         int* d_ispec_selected,
+                                         int* h_ispec_selected,
+                                         int* ibool) {
+
+TRACE("transfer_field_acoustic_from_device");
+
+  int irec_local,irec,ispec,iglob,j;
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  // sets up kernel dimensions
+  int blocksize = NGLL3;
+  int num_blocks_x = mp->nrec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // prepare field transfer array on device
+  transfer_stations_fields_acoustic_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+                                                                         d_ispec_selected,
+                                                                         mp->d_ibool,
+                                                                         mp->d_station_seismo_potential,
+                                                                         d_potential);
+
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
+                                     mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500);
+
+  //printf("copy local receivers: %i \n",mp->nrec_local);
+
+  for(irec_local=0; irec_local < mp->nrec_local; irec_local++) {
+    irec = number_receiver_global[irec_local]-1;
+    ispec = h_ispec_selected[irec]-1;
+
+    // copy element values
+    // note: iglob may vary and can be irregularly accessing the h_potential array
+    for(j=0; j < NGLL3; j++){
+      iglob = ibool[j+NGLL3*ispec]-1;
+      h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3];
+    }
+
+    // copy each station element's points to working array
+    // note: this works if iglob values would be all aligned...
+    //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw));
+
+  }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_station_ac_from_device,
+              TRANSFER_STATION_AC_FROM_DEVICE)(
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer_f,
+                                                int* number_receiver_global,
+                                                int* ispec_selected_rec,
+                                                int* ispec_selected_source,
+                                                int* ibool,
+                                                int* SIMULATION_TYPEf) {
+
+TRACE("transfer_station_ac_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+  if(SIMULATION_TYPE == 1) {
+    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+  else if(SIMULATION_TYPE == 2) {
+    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+  }
+  else if(SIMULATION_TYPE == 3) {
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_acoustic,b_potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_station_ac_from_device");
+#endif
+}
+
diff --git a/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
index a5bfe5752..4b4bede11 100644
--- a/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
+++ b/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
@@ -41,7 +41,7 @@ module decompose_mesh_SCOTCH
   integer, dimension(:,:), allocatable  :: elmnts
   integer, dimension(:,:), allocatable  :: mat
   integer, dimension(:), allocatable  :: part
-  
+
   integer :: nnodes
   double precision, dimension(:,:), allocatable  :: nodes_coords
 
@@ -121,7 +121,7 @@ subroutine read_mesh_files
     character(len=256)  :: line
     logical :: use_poroelastic_file
     integer(long) :: nspec_long
-    
+
   ! sets number of nodes per element
     ngnod = esize
 
@@ -158,10 +158,10 @@ subroutine read_mesh_files
       print*,'bit size fortran: ',bit_size(nspec)
       stop 'error number of elements too large'
     endif
-    
+
     ! sets number of elements (integer 4-byte)
     nspec = nspec_long
-    
+
     allocate(elmnts(esize,nspec),stat=ier)
     if( ier /= 0 ) stop 'error allocating array elmnts'
     do ispec = 1, nspec
@@ -683,6 +683,7 @@ subroutine scotch_partitioning
     integer, dimension(:),allocatable  :: num_material
     integer :: ier
 
+    ! starts from 0
     elmnts(:,:) = elmnts(:,:) - 1
 
     ! determines maximum neighbors based on 1 common node
@@ -725,7 +726,7 @@ subroutine scotch_partitioning
     call acoustic_elastic_poro_load(elmnts_load,nspec,count_def_mat,count_undef_mat, &
                                   num_material,mat_prop,undef_mat_prop)
 
-    deallocate(num_material)
+
 
     ! SCOTCH partitioning
 
@@ -810,36 +811,40 @@ subroutine scotch_partitioning
     if (ier /= 0) then
        stop 'ERROR : MAIN : Cannot destroy strat'
     endif
-    
-  ! re-partitioning puts poroelastic-elastic coupled elements into same partition
-  !  integer  :: nfaces_coupled
-  !  integer, dimension(:,:), pointer  :: faces_coupled
+
+    ! re-partitioning puts poroelastic-elastic coupled elements into same partition
+    !  integer  :: nfaces_coupled
+    !  integer, dimension(:,:), pointer  :: faces_coupled
+
+    ! TODO: supposed to rebalance, but currently broken
     call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
-                     count_def_mat, mat(1,:) , mat_prop, &
+                     count_def_mat, num_material , mat_prop, &
                      sup_neighbour, nsize, &
                      nparts, part)
-                     !nparts, part, nfaces_coupled, faces_coupled)
 
-  ! re-partitioning puts moho-surface coupled elements into same partition
+    deallocate(num_material)
+
+    ! re-partitioning puts moho-surface coupled elements into same partition
     call moho_surface_repartitioning (nspec, nnodes, elmnts, &
                      sup_neighbour, nsize, nparts, part, &
                      nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
 
-
-  ! local number of each element for each partition    
+    ! local number of each element for each partition
     call build_glob2loc_elmnts(nspec, part, glob2loc_elmnts,nparts)
 
-  ! local number of each node for each partition
+    ! local number of each node for each partition
     call build_glob2loc_nodes(nspec, nnodes,nsize, nnodes_elmnts, nodes_elmnts, part, &
          glob2loc_nodes_nparts, glob2loc_nodes_parts, glob2loc_nodes, nparts)
 
-  ! mpi interfaces
+    ! mpi interfaces
     ! acoustic/elastic/poroelastic boundaries will be split into different MPI partitions
     call build_interfaces(nspec, sup_neighbour, part, elmnts, &
                              xadj, adjncy, tab_interfaces, &
                              tab_size_interfaces, ninterfaces, &
                              nparts)
 
+
+
     !or: uncomment if you want acoustic/elastic boundaries NOT to be separated into different MPI partitions
     !call build_interfaces_no_ac_el_sep(nspec, sup_neighbour, part, elmnts, &
     !                          xadj, adjncy, tab_interfaces, &
@@ -877,19 +882,23 @@ subroutine write_mesh_databases
        endif
 
        ! gets number of nodes
+
        call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords, &
                                   glob2loc_nodes_nparts, glob2loc_nodes_parts, &
                                   glob2loc_nodes, nnodes, 1)
 
-       ! gets number of spectral elements
        call write_partition_database(IIN_database, ipart, nspec_local, nspec, elmnts, &
                                   glob2loc_elmnts, glob2loc_nodes_nparts, &
                                   glob2loc_nodes_parts, glob2loc_nodes, part, mat, ngnod, 1)
 
+       !debug
+       !print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc
+
        ! writes out node coordinate locations
        !write(IIN_database,*) nnodes_loc
        write(IIN_database) nnodes_loc
 
+
        call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords,&
                                   glob2loc_nodes_nparts, glob2loc_nodes_parts, &
                                   glob2loc_nodes, nnodes, 2)
@@ -956,4 +965,3 @@ end subroutine write_mesh_databases
 !end program pre_meshfem3D
 
 end module decompose_mesh_SCOTCH
-
diff --git a/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
index c76c4ba7c..4bbb81cd4 100644
--- a/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
+++ b/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
@@ -1138,6 +1138,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte
                      !                   local_nodes(1), -1, -1, -1
                      write(IIN_database) local_elmnt, tab_interfaces(k*7+2), &
                                         local_nodes(1), -1, -1, -1
+
                   case (2)
                      ! edge element
                      do l = glob2loc_nodes_nparts(tab_interfaces(k*7+3)), &
@@ -1156,6 +1157,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte
                      !                   local_nodes(1), local_nodes(2), -1, -1
                      write(IIN_database) local_elmnt, tab_interfaces(k*7+2), &
                                         local_nodes(1), local_nodes(2), -1, -1
+
                   case (4)
                      ! face element
                      count_faces = count_faces + 1
@@ -1187,6 +1189,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte
                      !     local_nodes(1), local_nodes(2),local_nodes(3), local_nodes(4)
                      write(IIN_database) local_elmnt, tab_interfaces(k*7+2), &
                           local_nodes(1), local_nodes(2),local_nodes(3), local_nodes(4)
+
                   case default
                      print *, "error in write_interfaces_database!", tab_interfaces(k*7+2), iproc
                   end select
@@ -1382,12 +1385,11 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, &
                         nb_materials, num_material, mat_prop, &
                         sup_neighbour, nsize, &
                         nproc, part)
-                        !nproc, part, nfaces_coupled, faces_coupled)
 
     implicit none
 
-    integer,intent(in) :: nspec
-    integer, intent(in)  :: nnodes, nproc, nb_materials
+    integer, intent(in) :: nspec
+    integer, intent(in) :: nnodes, nproc, nb_materials
     integer, intent(in) :: sup_neighbour
     integer, intent(in) :: nsize
 
@@ -1398,11 +1400,10 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, &
     integer, dimension(0:nspec-1)  :: part
     integer, dimension(0:esize*nspec-1)  :: elmnts
 
+    ! local parameters
     integer  :: nfaces_coupled
-    !integer, intent(out)  :: nfaces_coupled
     integer, dimension(:,:), pointer  :: faces_coupled
 
-
     logical, dimension(nb_materials)  :: is_poroelastic, is_elastic
 
     ! neighbors
@@ -1429,8 +1430,8 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, &
     enddo
 
     ! checks if any poroelastic/elastic elements are set
-    !if( .not. any(is_poroelastic) ) return
-    !if( .not. any(is_elastic) ) return
+    if( .not. any(is_poroelastic) ) return
+    if( .not. any(is_elastic) ) return
 
     ! gets neighbors by 4 common nodes (face)
     allocate(xadj(0:nspec),stat=ier)
@@ -1461,6 +1462,7 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, &
     ! coupled elements
     allocate(faces_coupled(2,nfaces_coupled),stat=ier)
     if( ier /= 0 ) stop 'error allocating array faces_coupled'
+    faces_coupled(:,:) = -1
 
     ! stores elements indices
     nfaces_coupled = 0
@@ -1494,7 +1496,10 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, &
        endif
     enddo
 
- end subroutine poro_elastic_repartitioning
+    deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts)
+    deallocate(faces_coupled)
+
+  end subroutine poro_elastic_repartitioning
 
   !--------------------------------------------------
   ! Repartitioning : two coupled moho surface elements are transfered to the same partition
@@ -1624,6 +1629,7 @@ subroutine moho_surface_repartitioning (nspec, nnodes, elmnts, &
     ! coupled elements
     allocate(faces_coupled(2,nfaces_coupled),stat=ier)
     if( ier /= 0 ) stop 'error allocating array faces_coupled'
+    faces_coupled(:,:) = -1
 
     ! stores elements indices
     nfaces_coupled = 0
@@ -1658,6 +1664,10 @@ subroutine moho_surface_repartitioning (nspec, nnodes, elmnts, &
        endif
     enddo
 
+    deallocate(is_moho,node_is_moho)
+    deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts)
+    deallocate(faces_coupled)
+
  end subroutine moho_surface_repartitioning
 
 
diff --git a/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90
index 90c5bb961..aad5f807b 100644
--- a/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90
+++ b/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90
@@ -31,6 +31,9 @@ program pre_meshfem3D
                                   check_valence, &
                                   scotch_partitioning, &
                                   write_mesh_databases
+
+! daniel: ifort
+!  USE IFPORT,only: getarg
   implicit none
   integer :: i
   character(len=256) :: arg(3)
diff --git a/src/generate_databases/Makefile.in b/src/generate_databases/Makefile.in
index 3a5c6867f..c22375564 100644
--- a/src/generate_databases/Makefile.in
+++ b/src/generate_databases/Makefile.in
@@ -78,14 +78,15 @@ libgendatabases_a_OBJECTS = \
 	$O/detect_surface.o \
 	$O/exit_mpi.o \
 	$O/get_absorbing_boundary.o \
-	$O/get_coupling_surfaces.o \
-	$O/get_model.o \
-	$O/get_MPI.o \
 	$O/get_attenuation_model.o \
 	$O/get_cmt.o \
+	$O/get_coupling_surfaces.o \
 	$O/get_element_face.o \
 	$O/get_global.o \
 	$O/get_jacobian_boundaries.o \
+	$O/get_model.o \
+	$O/get_MPI.o \
+	$O/get_perm_color.o \
 	$O/get_shape2D.o \
 	$O/get_shape3D.o \
 	$O/get_value_parameters.o \
@@ -98,6 +99,7 @@ libgendatabases_a_OBJECTS = \
 	$O/model_aniso.o \
 	$O/model_default.o \
 	$O/model_external_values.o \
+	$O/model_ipati.o \
 	$O/model_gll.o \
 	$O/model_salton_trough.o \
 	$O/model_tomography.o \
@@ -199,9 +201,6 @@ $O/assemble_MPI_scalar.o:  ${SHARED}/constants.h ${SHARED}/assemble_MPI_scalar.f
 $O/parallel.o:  ${SHARED}/constants.h ${SHARED}/parallel.f90
 	${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o ${SHARED}/parallel.f90
 
-###
-###
-###
 
 ###
 ### serial compilation without optimization
@@ -230,6 +229,9 @@ $O/model_default.o:  ${SHARED}/constants.h model_default.f90
 $O/model_external_values.o:  ${SHARED}/constants.h model_external_values.f90
 	${FCCOMPILE_CHECK} -c -o $O/model_external_values.o model_external_values.f90
 
+$O/model_ipati.o:  ${SHARED}/constants.h model_ipati.f90
+	${FCCOMPILE_CHECK} -c -o $O/model_ipati.o model_ipati.f90
+
 $O/model_gll.o:  ${SHARED}/constants.h model_gll.f90
 	${FCCOMPILE_CHECK} -c -o $O/model_gll.o model_gll.f90
 
@@ -351,6 +353,9 @@ $O/get_model.o:  ${SHARED}/constants.h get_model.f90
 $O/get_MPI.o:  ${SHARED}/constants.h get_MPI.f90
 	${FCCOMPILE_CHECK} -c -o $O/get_MPI.o get_MPI.f90
 
+$O/get_perm_color.o:  ${SHARED}/constants.h get_perm_color.f90
+	${FCCOMPILE_CHECK} -c -o $O/get_perm_color.o get_perm_color.f90
+
 $O/create_name_database.o:  ${SHARED}/constants.h ${SHARED}/create_name_database.f90
 	${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${SHARED}/create_name_database.f90
 
diff --git a/src/generate_databases/create_mass_matrices.f90 b/src/generate_databases/create_mass_matrices.f90
index 668f9a9f2..32926ca70 100644
--- a/src/generate_databases/create_mass_matrices.f90
+++ b/src/generate_databases/create_mass_matrices.f90
@@ -160,7 +160,7 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY,
   integer :: ier
 
   real(kind=CUSTOM_REAL) :: xloc,yloc,loc_elevation
-  
+
   ! creates ocean load mass matrix
   if(OCEANS) then
 
@@ -184,24 +184,24 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY,
 
         do igll=1,NGLLSQUARE
           ix_oceans = free_surface_ijk(1,igll,ispec2D)
-          iy_oceans = free_surface_ijk(1,igll,ispec2D)
-          iz_oceans = free_surface_ijk(1,igll,ispec2D)
+          iy_oceans = free_surface_ijk(2,igll,ispec2D)
+          iz_oceans = free_surface_ijk(3,igll,ispec2D)
 
           iglobnum=ibool(ix_oceans,iy_oceans,iz_oceans,ispec_oceans)
 
           ! compute local height of oceans
           if( TOPOGRAPHY ) then
-          
+
             ! takes elevation from topography file
             xloc = xstore_dummy(iglobnum)
             yloc = ystore_dummy(iglobnum)
-            
+
             call get_topo_bathy_elevation(xloc,yloc,loc_elevation, &
                                         itopo_bathy,NX_TOPO,NY_TOPO, &
                                         UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION)
-            
+
             elevation = dble(loc_elevation)
-            
+
           else
 
             ! takes elevation from z-coordinate of mesh point
diff --git a/src/generate_databases/create_regions_mesh.f90 b/src/generate_databases/create_regions_mesh.f90
index 94516c734..695ce8ab3 100644
--- a/src/generate_databases/create_regions_mesh.f90
+++ b/src/generate_databases/create_regions_mesh.f90
@@ -33,7 +33,7 @@ module create_regions_mesh_ext_par
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: ystore_dummy
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: zstore_dummy
   integer :: nglob_dummy
-  
+
 ! Gauss-Lobatto-Legendre points and weights of integration
   double precision, dimension(:), allocatable :: xigll,yigll,zigll,wxgll,wygll,wzgll
 
@@ -148,7 +148,12 @@ module create_regions_mesh_ext_par
 
   logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
 
+  ! mesh coloring
+  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+  integer, dimension(:), allocatable :: num_elem_colors_acoustic
 
+  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+  integer, dimension(:), allocatable :: num_elem_colors_elastic
   end module create_regions_mesh_ext_par
 
 !
@@ -160,7 +165,6 @@ end module create_regions_mesh_ext_par
   subroutine create_regions_mesh()
 
 ! create the different regions of the mesh
-
   use generate_databases_par,only: &
     nspec => NSPEC_AB,nglob => NGLOB_AB, &
     ibool,xstore,ystore,zstore, &
@@ -183,7 +187,7 @@ subroutine create_regions_mesh()
     ATTENUATION,USE_OLSEN_ATTENUATION, &
     UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, &
     NX_TOPO,NY_TOPO,itopo_bathy, &
-    nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho  
+    nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho
   use create_regions_mesh_ext_par
   implicit none
 
@@ -197,26 +201,6 @@ subroutine create_regions_mesh()
 !  integer,dimension(:),allocatable :: itest_flag
 !  integer, dimension(:), allocatable :: elem_flag
 
-! For Piero Basini :
-! integer :: doubling_value_found_for_Piero
-!   double precision :: xmesh,ymesh,zmesh
-!   double precision :: rho,vp,vs
-
-!   integer,dimension(nspec) ::  idoubling
-!   integer :: doubling_value_found_for_Piero
-!   integer, parameter :: NUMBER_OF_STATIONS = 6
-!   double precision, parameter :: RADIUS_TO_EXCLUDE = 250.d0
-!   double precision, dimension(NUMBER_OF_STATIONS) :: utm_x_station,utm_y_station
-
-!   logical :: is_around_a_station
-!   integer :: istation
-
-! ! store bedrock values
-!   integer ::  icornerlat,icornerlong
-!   double precision ::  lat,long,elevation_bedrock
-!   double precision ::  lat_corner,long_corner,ratio_xi,ratio_eta
-!real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: ibedrock
-
 ! initializes arrays
   call sync_all()
   if( myrank == 0) then
@@ -227,7 +211,6 @@ subroutine create_regions_mesh()
                         nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, &
                         nspec2D_bottom,nspec2D_top,ANISOTROPY)
 
-
 ! fills location and weights for Gauss-Lobatto-Legendre points, shape and derivations,
 ! returns jacobianstore,xixstore,...gammazstore
 ! and GLL-point locations in xstore,ystore,zstore
@@ -275,6 +258,18 @@ subroutine create_regions_mesh()
                             nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, &
                             nspec2D_bottom,nspec2D_top)
 
+! sets up up Moho surface
+  NSPEC2D_MOHO = 0
+  if( SAVE_MOHO_MESH ) then
+    call sync_all()
+    if( myrank == 0) then
+      write(IMAIN,*) '  ...setting up Moho surface'
+    endif
+    call crm_setup_moho(myrank,nspec, &
+                      nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
+                      nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+  endif
+
 ! sets material velocities
   call sync_all()
   if( myrank == 0) then
@@ -283,8 +278,7 @@ subroutine create_regions_mesh()
   call get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &
-                        ANISOTROPY,LOCAL_PATH)
-
+                        ANISOTROPY)
 
 ! sets up acoustic-elastic-poroelastic coupling surfaces
   call sync_all()
@@ -297,17 +291,26 @@ subroutine create_regions_mesh()
                         num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
                         my_neighbours_ext_mesh)
 
-! sets up up Moho surface
-  NSPEC2D_MOHO = 0
-  if( SAVE_MOHO_MESH ) then
-    call sync_all()
-    if( myrank == 0) then
-      write(IMAIN,*) '  ...setting up Moho surface'
-    endif
-    call crm_setup_moho(myrank,nspec, &
-                      nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
-                      nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+! locates inner and outer elements
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element inner/outer separation '
+  endif
+  call crm_setup_inner_outer_elemnts(myrank,nspec, &
+                                    num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+                                    nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                    ibool,SAVE_MESH_FILES)
+
+! colors mesh if requested
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element mesh coloring '
   endif
+  call crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
+
+! overwrites material parameters from external binary files
+  call sync_all()
+  call get_model_binaries(myrank,nspec,LOCAL_PATH)
 
 ! creates mass matrix
   call sync_all()
@@ -325,16 +328,6 @@ subroutine create_regions_mesh()
                                       UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, &
                                       NX_TOPO,NY_TOPO,itopo_bathy)
 
-! locates inner and outer elements
-  call sync_all()
-  if( myrank == 0) then
-    write(IMAIN,*) '  ...element inner/outer separation '
-  endif
-  call crm_setup_inner_outer_elemnts(myrank,nspec, &
-                                    num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                                    nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                                    ibool,SAVE_MESH_FILES)
-
 ! saves the binary mesh files
   call sync_all()
   if( myrank == 0) then
@@ -342,44 +335,52 @@ subroutine create_regions_mesh()
   endif
   !call create_name_database(prname,myrank,LOCAL_PATH)
   call save_arrays_solver_ext_mesh(nspec,nglob_dummy, &
-                        xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,&
-                        gammaxstore,gammaystore,gammazstore, &
-                        jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-                        rhostore,kappastore,mustore, &
-                        rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-                        rho_vpI,rho_vpII,rho_vsI, &
-                        rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                        OCEANS,rmass_ocean_load,NGLOB_OCEAN, &
+!                        xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,&
+!                        gammaxstore,gammaystore,gammazstore, &
+!                        jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
+!                        rhostore,kappastore,mustore, &
+!                        rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
+!                        rho_vpI,rho_vpII,rho_vsI, &
+!                        rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
+                        OCEANS, &
+!                        rmass_ocean_load,NGLOB_OCEAN, &
                         ibool, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                        abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
-                        free_surface_normal,free_surface_jacobian2Dw, &
-                        free_surface_ijk,free_surface_ispec, &
-                        num_free_surface_faces, &
-                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
-                        num_coupling_ac_el_faces, &
-                        coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-                        coupling_ac_po_ijk,coupling_ac_po_ispec, &
-                        num_coupling_ac_po_faces, &
-                        coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-                        coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-                        coupling_po_el_ispec,num_coupling_el_po_faces, &
+!                        xstore_dummy,ystore_dummy,zstore_dummy, &
+!                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
+!                        abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
+!                        free_surface_normal,free_surface_jacobian2Dw, &
+!                        free_surface_ijk,free_surface_ispec, &
+!                        num_free_surface_faces, &
+!                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+!                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
+!                        num_coupling_ac_el_faces, &
+!                        coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
+!                        coupling_ac_po_ijk,coupling_ac_po_ispec, &
+!                        num_coupling_ac_po_faces, &
+!                        coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
+!                        coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
+!                        coupling_po_el_ispec,num_coupling_el_po_faces, &
                         num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                         max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-                        prname,SAVE_MESH_FILES, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store, &
-                        c22store,c23store,c24store,c25store,c26store,c33store, &
-                        c34store,c35store,c36store,c44store,c45store,c46store, &
-                        c55store,c56store,c66store, &
-                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-                        nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-                        num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic)
+!                        prname, &
+                        SAVE_MESH_FILES, &
+                        ANISOTROPY &
+!                        NSPEC_ANISO, &
+!                        c11store,c12store,c13store,c14store,c15store,c16store, &
+!                        c22store,c23store,c24store,c25store,c26store,c33store, &
+!                        c34store,c35store,c36store,c44store,c45store,c46store, &
+!                        c55store,c56store,c66store, &
+!                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+!                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
+!                        nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
+!                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+!                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+!                        num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
+!                        num_colors_outer_acoustic,num_colors_inner_acoustic, &
+!                        num_elem_colors_acoustic, &
+!                        num_colors_outer_elastic,num_colors_inner_elastic, &
+!                        num_elem_colors_elastic, &
+                      )
 
 ! saves moho surface
   if( SAVE_MOHO_MESH ) then
@@ -394,12 +395,21 @@ subroutine create_regions_mesh()
 
 ! checks the mesh, stability and resolved period
   call sync_all()
-!chris: check for poro: At the moment cpI & cpII are for eta=0
-  call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,&
+
+  if( POROELASTIC_SIMULATION ) then
+    !chris: check for poro: At the moment cpI & cpII are for eta=0
+    call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,&
                             xstore_dummy,ystore_dummy,zstore_dummy, &
                             -1.0d0, model_speed_max,min_resolved_period, &
                             phistore,tortstore,rhoarraystore,rho_vpI,rho_vpII,rho_vsI, &
                             LOCAL_PATH,SAVE_MESH_FILES )
+  else
+    call check_mesh_resolution(myrank,nspec,nglob_dummy, &
+                              ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
+                              kappastore,mustore,rho_vp,rho_vs, &
+                              -1.0d0,model_speed_max,min_resolved_period, &
+                              LOCAL_PATH,SAVE_MESH_FILES)
+  endif
 
 ! saves binary mesh files for attenuation
   if( ATTENUATION ) then
@@ -1172,6 +1182,8 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, &
   character(len=256) :: filename
   logical,dimension(:),allocatable :: iglob_is_inner
 
+  logical,parameter :: DEBUG = .false.
+
   ! allocates arrays
   allocate(ispec_is_inner(nspec),stat=ier)
   if( ier /= 0 ) stop 'error allocating array ispec_is_inner'
@@ -1205,7 +1217,7 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, &
   ! frees temporary array
   deallocate( iglob_is_inner )
 
-  if( SAVE_MESH_FILES ) then
+  if( SAVE_MESH_FILES .and. DEBUG ) then
     filename = prname(1:len_trim(prname))//'ispec_is_inner'
     call write_VTK_data_elem_l(nspec,nglob_dummy, &
                         xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
@@ -1354,3 +1366,678 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, &
 
   end subroutine crm_setup_inner_outer_elemnts
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
+
+! sets up mesh coloring and permutes elements
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical :: ANISOTROPY,SAVE_MESH_FILES
+
+  ! local parameters
+  integer, dimension(:), allocatable :: perm
+  integer :: ier
+
+  ! user output
+  if(myrank == 0) then
+    write(IMAIN,*) '     use coloring = ',USE_MESH_COLORING_GPU
+  endif
+
+  ! initializes
+  num_colors_outer_acoustic = 0
+  num_colors_inner_acoustic = 0
+  num_colors_outer_elastic = 0
+  num_colors_inner_elastic = 0
+
+  ! mesh coloring
+  if( USE_MESH_COLORING_GPU ) then
+
+    ! creates coloring of elements
+    allocate(perm(nspec),stat=ier)
+    if( ier /= 0 ) stop 'error allocating temporary perm array'
+    perm(:) = 0
+
+    ! acoustic domains
+    if( ACOUSTIC_SIMULATION ) then
+      if( myrank == 0) then
+        write(IMAIN,*) '     acoustic domains: '
+      endif
+      call crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                          ispec_is_acoustic,1, &
+                          num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                          SAVE_MESH_FILES)
+    else
+      ! allocates dummy arrays
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    endif
+
+    ! elastic domains
+    if( ELASTIC_SIMULATION ) then
+      if( myrank == 0) then
+        write(IMAIN,*) '     elastic domains: '
+      endif
+      call crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                          ispec_is_elastic,2, &
+                          num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                          SAVE_MESH_FILES)
+    else
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+    endif
+
+    ! checks: after all domains are done
+    if(minval(perm) /= 1) &
+      call exit_MPI(myrank, 'minval(perm) should be 1')
+    if(maxval(perm) /= max(num_phase_ispec_acoustic,num_phase_ispec_elastic)) &
+      call exit_MPI(myrank, 'maxval(perm) should be max(num_phase_ispec_..)')
+
+    ! sorts array according to permutation
+    call sync_all()
+    if(myrank == 0) then
+      write(IMAIN,*) '     mesh permutation:'
+    endif
+    call crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, &
+                              SAVE_MESH_FILES)
+
+    deallocate(perm)
+
+  else
+
+    ! allocates dummy arrays
+    allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+  endif ! USE_MESH_COLORING_GPU
+
+  end subroutine crm_setup_color_perm
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_color(myrank,nspec,nglob,ibool,perm, &
+                            ispec_is_d,idomain, &
+                            num_phase_ispec_d,phase_ispec_inner_d, &
+                            SAVE_MESH_FILES)
+
+! sets up mesh coloring
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  integer, dimension(nspec) :: perm
+
+  ! wrapper array for ispec is in domain:
+  ! idomain acoustic == 1 or elastic == 2
+  integer :: idomain
+  logical, dimension(nspec) :: ispec_is_d
+  integer :: num_phase_ispec_d
+  integer, dimension(num_phase_ispec_d,2) :: phase_ispec_inner_d
+
+  logical :: SAVE_MESH_FILES
+
+  ! local parameters
+  ! added for color permutation
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements
+  integer, dimension(:), allocatable :: num_of_elems_in_this_color
+  integer, dimension(:), allocatable :: color
+  integer, dimension(:), allocatable :: first_elem_number_in_this_color
+  logical, dimension(:), allocatable :: is_on_a_slice_edge
+
+  integer :: nspec_outer,nspec_inner,nspec_domain
+  integer :: nspec_outer_min_global,nspec_outer_max_global
+  integer :: nb_colors,nb_colors_min,nb_colors_max
+
+  integer :: icolor,ispec,ispec_counter
+  integer :: ispec_inner,ispec_outer
+  integer :: ier
+
+  character(len=2),dimension(2) :: str_domain = (/ "ac", "el" /)
+  character(len=256) :: filename
+
+  logical, parameter :: DEBUG = .false.
+
+  !!!! David Michea: detection of the edges, coloring and permutation separately
+
+  ! implement mesh coloring for GPUs if needed, to create subsets of disconnected elements
+  ! to remove dependencies and the need for atomic operations in the sum of
+  ! elemental contributions in the solver
+
+  ! allocates temporary array with colors
+  allocate(color(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating temporary color array'
+  allocate(first_elem_number_in_this_color(MAX_NUMBER_OF_COLORS + 1),stat=ier)
+  if( ier /= 0 ) stop 'error allocating first_elem_number_in_this_color array'
+
+  ! flags for elements on outer rims
+  ! opposite to what is stored in ispec_is_inner
+  allocate(is_on_a_slice_edge(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating is_on_a_slice_edge array'
+  do ispec = 1,nspec
+    is_on_a_slice_edge(ispec) = .not. ispec_is_inner(ispec)
+  enddo
+
+  ! fast element coloring scheme
+  call get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, &
+                            ibool,perm,color, &
+                            nspec,nglob, &
+                            nb_colors_outer_elements,nb_colors_inner_elements, &
+                            nspec_outer,nspec_inner,nspec_domain, &
+                            first_elem_number_in_this_color, &
+                            myrank)
+
+  ! for the last color, the next color is fictitious and its first (fictitious) element number is nspec + 1
+  first_elem_number_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements + 1) &
+    = nspec_domain + 1
+
+  allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier)
+  if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array'
+
+  num_of_elems_in_this_color(:) = 0
+  do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements
+    num_of_elems_in_this_color(icolor) = first_elem_number_in_this_color(icolor+1) - first_elem_number_in_this_color(icolor)
+  enddo
+
+  ! check that the sum of all the numbers of elements found in each color is equal
+  ! to the total number of elements in the mesh
+  if(sum(num_of_elems_in_this_color) /= nspec_domain) then
+    print *,'error number of elements in this color:',idomain
+    print *,'rank: ',myrank,' nspec = ',nspec_domain
+    print *,'  total number of elements in all the colors of the mesh = ', &
+      sum(num_of_elems_in_this_color)
+    call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh')
+  endif
+
+  ! check that the sum of all the numbers of elements found in each color for the outer elements is equal
+  ! to the total number of outer elements found in the mesh
+  if(sum(num_of_elems_in_this_color(1:nb_colors_outer_elements)) /= nspec_outer) then
+    print *,'error number of outer elements in this color:',idomain
+    print *,'rank: ',myrank,' nspec_outer = ',nspec_outer
+    print*,'nb_colors_outer_elements = ',nb_colors_outer_elements
+    print *,'total number of elements in all the colors of the mesh for outer elements = ', &
+      sum(num_of_elems_in_this_color(1:nb_colors_outer_elements))
+    call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh for outer elements')
+  endif
+
+  ! debug: file output
+  if( SAVE_MESH_FILES .and. DEBUG ) then
+    filename = prname(1:len_trim(prname))//'color_'//str_domain(idomain)
+    call write_VTK_data_elem_i(nspec,nglob, &
+                              xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                              color,filename)
+  endif
+
+  deallocate(first_elem_number_in_this_color)
+  deallocate(is_on_a_slice_edge)
+  deallocate(color)
+
+  ! debug: no mesh coloring, only creates dummy coloring arrays
+  if( DEBUG ) then
+    nb_colors_outer_elements = 0
+    nb_colors_inner_elements = 0
+    ispec_counter = 0
+
+    ! first generate all the outer elements
+    do ispec = 1,nspec
+      if( ispec_is_d(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .false. ) then
+          ispec_counter = ispec_counter + 1
+          perm(ispec) = ispec_counter
+        endif
+      endif
+    enddo
+
+    ! store total number of outer elements
+    nspec_outer = ispec_counter
+
+    ! only single color
+    if(nspec_outer > 0 ) nb_colors_outer_elements = 1
+
+    ! then generate all the inner elements
+    do ispec = 1,nspec
+      if( ispec_is_d(ispec) ) then
+        if( ispec_is_inner(ispec) .eqv. .true. ) then
+          ispec_counter = ispec_counter + 1
+          perm(ispec) = ispec_counter - nspec_outer ! starts again at 1
+        endif
+      endif
+    enddo
+    nspec_inner = ispec_counter - nspec_outer
+
+    ! only single color
+    if(nspec_inner > 0 ) nb_colors_inner_elements = 1
+
+    allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array'
+
+    if( nspec_outer > 0 ) num_of_elems_in_this_color(1) = nspec_outer
+    if( nspec_inner > 0 ) num_of_elems_in_this_color(2) = nspec_inner
+  endif ! debug
+
+  ! debug: saves mesh coloring numbers into files
+  if( DEBUG ) then
+    ! debug file output
+    filename = prname(1:len_trim(prname))//'num_of_elems_in_this_color_'//str_domain(idomain)//'.dat'
+    open(unit=99,file=trim(filename),status='unknown',iostat=ier)
+    if( ier /= 0 ) stop 'error opening num_of_elems_in_this_color file'
+    ! number of colors for outer elements
+    write(99,*) nb_colors_outer_elements
+    ! number of colors for inner elements
+    write(99,*) nb_colors_inner_elements
+    ! number of elements in each color
+    ! outer elements
+    do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements
+      write(99,*) num_of_elems_in_this_color(icolor)
+    enddo
+    close(99)
+  endif
+
+  ! sets up domain coloring arrays
+  select case(idomain)
+  case( 1 )
+    ! acoustic domains
+    num_colors_outer_acoustic = nb_colors_outer_elements
+    num_colors_inner_acoustic = nb_colors_inner_elements
+
+    allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+
+    num_elem_colors_acoustic(:) = num_of_elems_in_this_color(:)
+
+  case( 2 )
+    ! elastic domains
+    num_colors_outer_elastic = nb_colors_outer_elements
+    num_colors_inner_elastic = nb_colors_inner_elements
+
+    allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+    if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+    num_elem_colors_elastic(:) = num_of_elems_in_this_color(:)
+
+  case default
+    stop 'error idomain not recognized'
+  end select
+
+  ! sets up elements for loops in simulations
+  ispec_inner = 0
+  ispec_outer = 0
+  do ispec = 1, nspec
+    ! only elements in this domain
+    if( ispec_is_d(ispec) ) then
+
+      ! sets phase_ispec arrays with ordering of elements
+      if( ispec_is_inner(ispec) .eqv. .false. ) then
+        ! outer elements
+        ispec_outer = perm(ispec)
+
+        ! checks
+        if( ispec_outer < 1 .or. ispec_outer > num_phase_ispec_d ) then
+          print*,'error outer permutation:',idomain
+          print*,'rank:',myrank,'  ispec_inner = ',ispec_outer
+          print*,'num_phase_ispec_d = ',num_phase_ispec_d
+          call exit_MPI(myrank,'error outer acoustic permutation')
+        endif
+
+        phase_ispec_inner_d(ispec_outer,1) = ispec
+
+      else
+        ! inner elements
+        ispec_inner = perm(ispec)
+
+        ! checks
+        if( ispec_inner < 1 .or. ispec_inner > num_phase_ispec_d ) then
+          print*,'error inner permutation:',idomain
+          print*,'rank:',myrank,'  ispec_inner = ',ispec_inner
+          print*,'num_phase_ispec_d = ',num_phase_ispec_d
+          call exit_MPI(myrank,'error inner acoustic permutation')
+        endif
+
+        phase_ispec_inner_d(ispec_inner,2) = ispec
+
+      endif
+    endif
+  enddo
+
+  ! total number of colors
+  nb_colors = nb_colors_inner_elements + nb_colors_outer_elements
+  call min_all_i(nb_colors,nb_colors_min)
+  call max_all_i(nb_colors,nb_colors_max)
+
+  ! user output
+  call min_all_i(nspec_outer,nspec_outer_min_global)
+  call max_all_i(nspec_outer,nspec_outer_max_global)
+  call min_all_i(nspec_outer,nspec_outer_min_global)
+  call max_all_i(nspec_outer,nspec_outer_max_global)
+  if(myrank == 0) then
+    write(IMAIN,*) '       colors min = ',nb_colors_min
+    write(IMAIN,*) '       colors max = ',nb_colors_max
+    write(IMAIN,*) '       outer elements: min = ',nspec_outer_min_global
+    write(IMAIN,*) '       outer elements: max = ',nspec_outer_max_global
+  endif
+
+  ! debug: outputs permutation array as vtk file
+  if( DEBUG ) then
+    filename = prname(1:len_trim(prname))//'perm_'//str_domain(idomain)
+    call write_VTK_data_elem_i(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        perm,filename)
+  endif
+
+  deallocate(num_of_elems_in_this_color)
+
+  end subroutine crm_setup_color
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, &
+                                  SAVE_MESH_FILES)
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer :: myrank,nspec,nglob
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical :: ANISOTROPY
+
+  integer, dimension(nspec),intent(inout) :: perm
+
+  logical :: SAVE_MESH_FILES
+
+  ! local parameters
+  ! added for sorting
+  integer, dimension(:,:,:,:), allocatable :: temp_array_int
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: temp_array_real
+  logical, dimension(:), allocatable :: temp_array_logical_1D
+
+  integer, dimension(:), allocatable :: temp_perm_global
+  logical, dimension(:), allocatable :: mask_global
+
+  integer :: icolor,icounter,ispec,ielem,ier,i
+  integer :: iface,old_ispec,new_ispec
+
+  character(len=256) :: filename
+
+  logical,parameter :: DEBUG = .false.
+
+  ! sorts array according to permutation
+  allocate(temp_perm_global(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error temp_perm_global array'
+
+  ! global ordering
+  temp_perm_global(:) = 0
+  icounter = 0
+
+  ! fills global permutation array
+  ! starts with elastic elements
+  if( ELASTIC_SIMULATION ) then
+    ! first outer elements coloring
+    ! phase element counter
+    ielem = 0
+    do icolor = 1,num_colors_outer_elastic
+      ! loops through elements
+      do i = 1,num_elem_colors_elastic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_elastic(ielem,1) ! 1 <-- first phase, outer elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_elastic(ielem,1) = icounter
+      enddo
+    enddo
+    ! inner elements coloring
+    ielem = 0
+    do icolor = num_colors_outer_elastic+1,num_colors_outer_elastic+num_colors_inner_elastic
+      ! loops through elements
+      do i = 1,num_elem_colors_elastic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_elastic(ielem,2) ! 2 <-- second phase, inner elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_elastic(ielem,2) = icounter
+      enddo
+    enddo
+  endif
+
+  ! continues with acoustic elements
+  if( ACOUSTIC_SIMULATION ) then
+    ! first outer elements coloring
+    ! phase element counter
+    ielem = 0
+    do icolor = 1,num_colors_outer_acoustic
+      ! loops through elements
+      do i = 1,num_elem_colors_acoustic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_acoustic(ielem,1) ! 1 <-- first phase, outer elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_acoustic(ielem,1) = icounter
+      enddo
+    enddo
+    ! inner elements coloring
+    ielem = 0
+    do icolor = num_colors_outer_acoustic+1,num_colors_outer_acoustic+num_colors_inner_acoustic
+      ! loops through elements
+      do i = 1,num_elem_colors_acoustic(icolor)
+        ielem = ielem + 1
+        ispec = phase_ispec_inner_acoustic(ielem,2) ! 2 <-- second phase, inner elements
+        ! reorders elements
+        icounter = icounter + 1
+        temp_perm_global(ispec) = icounter
+        ! resets to new order
+        phase_ispec_inner_acoustic(ielem,2) = icounter
+      enddo
+    enddo
+  endif
+
+  ! checks
+  if( icounter /= nspec ) then
+    print*,'error temp perm: ',icounter,nspec
+    stop 'error temporary global permutation incomplete'
+  endif
+
+  ! checks perm entries
+  if(minval(temp_perm_global) /= 1) call exit_MPI(myrank, 'minval(temp_perm_global) should be 1')
+  if(maxval(temp_perm_global) /= nspec) call exit_MPI(myrank, 'maxval(temp_perm_global) should be nspec')
+
+  ! checks if every element was uniquely set
+  allocate(mask_global(nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating temporary mask_global'
+  mask_global(:) = .false.
+  icounter = 0 ! counts permutations
+  do ispec = 1, nspec
+    new_ispec = temp_perm_global(ispec)
+    ! checks bounds
+    if( new_ispec < 1 .or. new_ispec > nspec ) call exit_MPI(myrank,'error temp_perm_global ispec bounds')
+    ! checks if already set
+    if( mask_global(new_ispec) ) then
+      print*,'error temp_perm_global:',ispec,new_ispec,'element already set'
+      call exit_MPI(myrank,'error global permutation')
+    else
+      mask_global(new_ispec) = .true.
+    endif
+    ! counts permutations
+    if( new_ispec /= ispec ) icounter = icounter + 1
+  enddo
+
+  ! checks number of set elements
+  if( count(mask_global(:)) /= nspec ) then
+    print*,'error temp_perm_global:',count(mask_global(:)),nspec,'permutation incomplete'
+    call exit_MPI(myrank,'error global permutation incomplete')
+  endif
+  deallocate(mask_global)
+
+  ! user output
+  if(myrank == 0) then
+    write(IMAIN,*) '       number of permutations = ',icounter
+  endif
+
+  ! outputs permutation array as vtk file
+  if( SAVE_MESH_FILES .and. DEBUG ) then
+    filename = prname(1:len_trim(prname))//'perm_global'
+    call write_VTK_data_elem_i(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        temp_perm_global,filename)
+  endif
+
+  ! store as new permutation
+  perm(:) = temp_perm_global(:)
+  deallocate(temp_perm_global)
+
+  ! permutes all required mesh arrays according to new ordering
+
+  ! permutation of ibool
+  allocate(temp_array_int(NGLLX,NGLLY,NGLLZ,nspec))
+  call permute_elements_integer(ibool,temp_array_int,perm,nspec)
+  deallocate(temp_array_int)
+
+  ! element domain flags
+  allocate(temp_array_logical_1D(nspec))
+  call permute_elements_logical1D(ispec_is_acoustic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_elastic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_poroelastic,temp_array_logical_1D,perm,nspec)
+  call permute_elements_logical1D(ispec_is_inner,temp_array_logical_1D,perm,nspec)
+  deallocate(temp_array_logical_1D)
+
+  ! mesh arrays
+  allocate(temp_array_real(NGLLX,NGLLY,NGLLZ,nspec))
+  call permute_elements_real(xixstore,temp_array_real,perm,nspec)
+  call permute_elements_real(xiystore,temp_array_real,perm,nspec)
+  call permute_elements_real(xizstore,temp_array_real,perm,nspec)
+  call permute_elements_real(etaxstore,temp_array_real,perm,nspec)
+  call permute_elements_real(etaystore,temp_array_real,perm,nspec)
+  call permute_elements_real(etazstore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammaxstore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammaystore,temp_array_real,perm,nspec)
+  call permute_elements_real(gammazstore,temp_array_real,perm,nspec)
+  call permute_elements_real(jacobianstore,temp_array_real,perm,nspec)
+
+  ! material parameters
+  call permute_elements_real(kappastore,temp_array_real,perm,nspec)
+  call permute_elements_real(mustore,temp_array_real,perm,nspec)
+
+  ! acoustic arrays
+  if( ACOUSTIC_SIMULATION ) then
+    call permute_elements_real(rhostore,temp_array_real,perm,nspec)
+  endif
+
+  ! elastic arrays
+  if( ELASTIC_SIMULATION ) then
+    call permute_elements_real(rho_vp,temp_array_real,perm,nspec)
+    call permute_elements_real(rho_vs,temp_array_real,perm,nspec)
+    if( ANISOTROPY ) then
+      call permute_elements_real(c11store,temp_array_real,perm,nspec)
+      call permute_elements_real(c12store,temp_array_real,perm,nspec)
+      call permute_elements_real(c13store,temp_array_real,perm,nspec)
+      call permute_elements_real(c14store,temp_array_real,perm,nspec)
+      call permute_elements_real(c15store,temp_array_real,perm,nspec)
+      call permute_elements_real(c16store,temp_array_real,perm,nspec)
+      call permute_elements_real(c22store,temp_array_real,perm,nspec)
+      call permute_elements_real(c23store,temp_array_real,perm,nspec)
+      call permute_elements_real(c24store,temp_array_real,perm,nspec)
+      call permute_elements_real(c25store,temp_array_real,perm,nspec)
+      call permute_elements_real(c33store,temp_array_real,perm,nspec)
+      call permute_elements_real(c34store,temp_array_real,perm,nspec)
+      call permute_elements_real(c35store,temp_array_real,perm,nspec)
+      call permute_elements_real(c36store,temp_array_real,perm,nspec)
+      call permute_elements_real(c44store,temp_array_real,perm,nspec)
+      call permute_elements_real(c45store,temp_array_real,perm,nspec)
+      call permute_elements_real(c46store,temp_array_real,perm,nspec)
+      call permute_elements_real(c55store,temp_array_real,perm,nspec)
+      call permute_elements_real(c56store,temp_array_real,perm,nspec)
+      call permute_elements_real(c66store,temp_array_real,perm,nspec)
+    endif
+  endif
+  deallocate(temp_array_real)
+
+  ! poroelastic arrays
+  if( POROELASTIC_SIMULATION ) then
+    stop 'mesh permutation for poroelastic simulations not supported yet'
+  endif
+
+  ! boundary surface
+  if( num_abs_boundary_faces > 0 ) then
+    do iface = 1,num_abs_boundary_faces
+      old_ispec = abs_boundary_ispec(iface)
+      new_ispec = perm(old_ispec)
+      abs_boundary_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! free surface
+  if( num_free_surface_faces > 0 ) then
+    do iface = 1,num_free_surface_faces
+      old_ispec = free_surface_ispec(iface)
+      new_ispec = perm(old_ispec)
+      free_surface_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! coupling surface
+  if( num_coupling_ac_el_faces > 0 ) then
+    do iface = 1,num_coupling_ac_el_faces
+      old_ispec = coupling_ac_el_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_ac_el_ispec(iface) = new_ispec
+    enddo
+  endif
+  if( num_coupling_ac_po_faces > 0 ) then
+    do iface = 1,num_coupling_ac_po_faces
+      old_ispec = coupling_ac_po_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_ac_po_ispec(iface) = new_ispec
+    enddo
+  endif
+  if( num_coupling_el_po_faces > 0 ) then
+    do iface = 1,num_coupling_el_po_faces
+      ! elastic-poroelastic
+      old_ispec = coupling_el_po_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_el_po_ispec(iface) = new_ispec
+      ! poroelastic-elastic
+      old_ispec = coupling_po_el_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_po_el_ispec(iface) = new_ispec
+    enddo
+  endif
+
+  ! moho surface
+  if( NSPEC2D_MOHO > 0 ) then
+    allocate(temp_array_logical_1D(nspec))
+    call permute_elements_logical1D(is_moho_top,temp_array_logical_1D,perm,nspec)
+    call permute_elements_logical1D(is_moho_bot,temp_array_logical_1D,perm,nspec)
+    deallocate(temp_array_logical_1D)
+    do iface = 1,NSPEC2D_MOHO
+      ! top
+      old_ispec = ibelm_moho_top(iface)
+      new_ispec = perm(old_ispec)
+      ibelm_moho_top(iface) = new_ispec
+      ! bottom
+      old_ispec = ibelm_moho_bot(iface)
+      new_ispec = perm(old_ispec)
+      ibelm_moho_bot(iface) = new_ispec
+    enddo
+  endif
+
+  end subroutine crm_setup_permutation
diff --git a/src/generate_databases/generate_databases.f90 b/src/generate_databases/generate_databases.f90
index 28c888033..86c9306f5 100644
--- a/src/generate_databases/generate_databases.f90
+++ b/src/generate_databases/generate_databases.f90
@@ -295,7 +295,7 @@ module generate_databases_par
 ! flag for noise simulation
   integer :: NOISE_TOMOGRAPHY
   integer :: IMODEL
-  
+
   end module generate_databases_par
 
 !
@@ -418,7 +418,7 @@ subroutine gd_read_parameters
     write(IMAIN,*) 'Shape functions defined by NGNOD = ',NGNOD,' control nodes'
     write(IMAIN,*) 'Surface shape functions defined by NGNOD2D = ',NGNOD2D,' control nodes'
     write(IMAIN,*)
-    
+
     write(IMAIN,'(a)',advance='no') ' velocity model: '
     select case(IMODEL)
     case( IMODEL_DEFAULT )
@@ -437,8 +437,10 @@ subroutine gd_read_parameters
     write(IMAIN,'(a)',advance='yes') '  tomo'
     case( IMODEL_USER_EXTERNAL )
     write(IMAIN,'(a)',advance='yes') '  external'
+    case( IMODEL_IPATI )
+    write(IMAIN,'(a)',advance='yes') '  ipati'
     end select
-    
+
     write(IMAIN,*)
   endif
 
@@ -588,8 +590,8 @@ subroutine gd_read_partition_files
   open(unit=IIN,file=prname(1:len_trim(prname))//'Database', &
         status='old',action='read',form='unformatted',iostat=ier)
   if( ier /= 0 ) then
-    write(IMAIN,*) 'error opening file: ',prname(1:len_trim(prname))//'Database'
-    write(IMAIN,*) 'make sure file exists'
+    print*,'rank ',myrank,' error opening file: ',prname(1:len_trim(prname))//'Database'
+    print*,'please make sure file exists'
     call exit_mpi(myrank,'error opening database file')
   endif
   !read(IIN,*) nnodes_ext_mesh
@@ -930,7 +932,8 @@ subroutine gd_setup_mesh
     write(IMAIN,*) 'create regions: '
   endif
   call create_regions_mesh()
-  
+
+! now done inside create_regions_mesh_ext routine...
 ! now done inside create_regions_mesh_ext routine...
 ! Moho boundary parameters, 2-D jacobians and normals
 !  if( SAVE_MOHO_MESH ) then
diff --git a/src/generate_databases/get_coupling_surfaces.f90 b/src/generate_databases/get_coupling_surfaces.f90
index 8cdae42e7..f4ce27ad7 100644
--- a/src/generate_databases/get_coupling_surfaces.f90
+++ b/src/generate_databases/get_coupling_surfaces.f90
@@ -32,6 +32,7 @@ subroutine get_coupling_surfaces(myrank, &
                         my_neighbours_ext_mesh)
 
 ! determines coupling surface for acoustic-elastic domains
+! based on ispec_is_acoustic, ispec_is_elastic and ispec_is_poroelastic arrays
 
   use create_regions_mesh_ext_par
   implicit none
@@ -49,31 +50,106 @@ subroutine get_coupling_surfaces(myrank, &
             ibool_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
 
+  ! local parameters
+  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag
+  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
+  integer :: max_nibool_interfaces_ext_mesh
+  integer :: count_elastic,count_acoustic,count_poroelastic
+  integer :: ispec,i,j,k,iglob,ier,inum
+
   ! initializes number of coupling faces
   num_coupling_ac_el_faces = 0
   num_coupling_ac_po_faces = 0
   num_coupling_el_po_faces = 0
 
-  ! acoustic - elastic domain coupling
-  call get_coupling_surfaces_ac_el(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+  ! sets flags for acoustic / elastic / poroelastic on global points
+  allocate(acoustic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
+  allocate(elastic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array elastic_flag'
+  allocate(poroelastic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
+
+  acoustic_flag(:) = 0
+  elastic_flag(:) = 0
+  poroelastic_flag(:) = 0
+
+  count_acoustic = 0
+  count_elastic = 0
+  count_poroelastic = 0
+
+  do ispec = 1, nspec
+    ! counts elements
+    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
+    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
+    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
+
+    ! sets flags on global points
+    do k = 1, NGLLZ
+      do j = 1, NGLLY
+        do i = 1, NGLLX
+          ! global index
+          iglob = ibool(i,j,k,ispec)
+          ! sets acoustic flag
+          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
+          ! sets elastic flag
+          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
+          ! sets poroelastic flag
+          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
+        enddo
+      enddo
+    enddo
+  enddo
+  call sum_all_i(count_acoustic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total acoustic elements   :',inum
+  endif
+  call sum_all_i(count_elastic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total elastic elements    :',inum
+  endif
+  call sum_all_i(count_poroelastic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total poroelastic elements:',inum
+  endif
+
+  ! collects contributions from different MPI partitions
+  ! sets up MPI communications
+  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
+  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
+  do i = 1, num_interfaces_ext_mesh
+     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
+  enddo
+  ! sums acoustic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
+                        my_neighbours_ext_mesh)
+  ! sums elastic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
                         my_neighbours_ext_mesh)
+  ! sums poroelastic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
+                        my_neighbours_ext_mesh)
+
+
+  ! determines common faces between different domains
+  ! acoustic - elastic domain coupling
+  call get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag)
 
   ! acoustic - poroelastic domain coupling
-  call get_coupling_surfaces_ac_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  call get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag)
 
   ! elastic - poroelastic domain coupling
-  call get_coupling_surfaces_el_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  call get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag)
+
+  ! frees temporary arrays
+  deallocate(acoustic_flag,elastic_flag,poroelastic_flag)
 
   end subroutine get_coupling_surfaces
 
@@ -81,11 +157,7 @@ end subroutine get_coupling_surfaces
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_ac_el(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag)
 
 ! determines coupling surface for acoustic-elastic domains
 
@@ -93,17 +165,12 @@ subroutine get_coupling_surfaces_ac_el(myrank, &
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: elastic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -116,16 +183,9 @@ subroutine get_coupling_surfaces_ac_el(myrank, &
   integer,dimension(:,:,:),allocatable :: tmp_ijk
   integer,dimension(:),allocatable :: tmp_ispec
 
-  integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners
-  integer :: ispec,i,j,k,igll,ier,iglob
-  integer :: inum,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor
-  integer :: count_elastic,count_acoustic
-
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
-  logical, dimension(:), allocatable :: mask_ibool
+  integer,dimension(NGNOD2D) :: iglob_corners_ref
+  integer :: ispec,i,j,igll,ier
+  integer :: inum,iface_ref
 
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
@@ -153,7 +213,7 @@ subroutine get_coupling_surfaces_ac_el(myrank, &
   !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
   !character(len=256):: prname_file
 
-! allocates temporary arrays
+  ! allocates temporary arrays
   allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
   if( ier /= 0 ) stop 'error allocating array tmp_normal'
   allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
@@ -167,184 +227,80 @@ subroutine get_coupling_surfaces_ac_el(myrank, &
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for acoustic / elastic on global points
-  allocate(elastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-  allocate(acoustic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  allocate(mask_ibool(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array mask_ibool'
-  elastic_flag(:) = 0
-  acoustic_flag(:) = 0
-  test_flag(:) = 0
-  count_elastic = 0
-  count_acoustic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets elastic flag
-          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-          ! sets acoustic flag
-          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-          ! sets test flag
-          test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_acoustic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total acoustic elements:',inum
-  endif
-  call sum_all_i(count_elastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total elastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums elastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums acoustic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between acoustic and elastic elements
-  mask_ibool(:) = .false.
   inum = 0
+
+  ! coupling surfaces: takes point of view from acoustic elements, i.e. if element is acoustic
+  !                               and has an elastic neighbor, then this acoustic element is added to
+  !                               the coupling elements and its corresponding coupling surface
+  !                               ( no matter in which MPI partition it is)
+  ! note: we use acoustic elements as reference elements because we will need
+  !          density from acoustic element when coupling pressure in case of gravity
   do ispec=1,nspec
 
-    ! loops over each face
-    do iface_ref= 1, 6
-
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_ref)
-        j = iface_all_corner_ijk(2,icorner,iface_ref)
-        k = iface_all_corner_ijk(3,icorner,iface_ref)
-        ! global reference indices
-        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-
-        ! reference corner coordinates
-        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-      enddo
+    if( ispec_is_acoustic(ispec) ) then
+
+      ! loops over each face
+      do iface_ref= 1, 6
+
+        ! takes indices of corners of reference face
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
 
-      ! checks if face has acoustic side
-      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
         ! checks if face is has an elastic side
         if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
            elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
            elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
            elastic_flag( iglob_corners_ref(4) ) >= 1) then
 
-          ! reference midpoint on face (used to avoid redundant face counting)
-          i = iface_all_midpointijk(1,iface_ref)
-          j = iface_all_midpointijk(2,iface_ref)
-          k = iface_all_midpointijk(3,iface_ref)
-          iglob_midpoint = ibool(i,j,k,ispec)
-
-          ! checks if points on this face are masked already
-          if( .not. mask_ibool(iglob_midpoint) .and. &
-              ( acoustic_flag(iglob_midpoint) >= 1 .and. elastic_flag(iglob_midpoint) >= 1) ) then
-
-            ! gets face GLL points i,j,k indices from element face
-            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-
-            ! takes each element face only once, if it lies on an MPI interface
-            ! note: this is not exactly load balanced
-            !          lowest rank process collects as many faces as possible, second lowest as so forth
-            if( (test_flag(iglob_midpoint) == myrank+1) .or. &
-               (test_flag(iglob_midpoint) > 2*(myrank+1)) ) then
-
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-
-              ! normal convention: points away from acoustic, reference element
-              !                                switch normal direction if necessary
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                    ! makes sure that it always points away from acoustic element,
-                    ! otherwise switch direction
-                    if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-                enddo
-              enddo
-
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
-
-                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-                  tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
-
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
-
-                  ! masks global points ( to avoid redundant counting of faces)
-                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-                  mask_ibool(iglob) = .true.
-                enddo
-              enddo
-            else
-              ! assumes to be already collected by lower rank process, masks face points
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-                  mask_ibool(iglob) = .true.
-                enddo
-              enddo
-            endif ! test_flag
-          endif ! mask_ibool
+          ! gets face GLL points i,j,k indices from element face
+          call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
+
+          ! gets face GLL 2Djacobian, weighted from element face
+          call get_jacobian_boundary_face(myrank,nspec, &
+                    xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                    dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                    ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
+
+          ! normal convention: points away from acoustic, reference element
+          !                                switch normal direction if necessary
+          do j=1,NGLLY
+            do i=1,NGLLX
+                ! directs normals such that they point outwards of element
+                call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                            ibool,nspec,nglob_dummy, &
+                                            xstore_dummy,ystore_dummy,zstore_dummy, &
+                                            normal_face(:,i,j) )
+                ! makes sure that it always points away from acoustic element,
+                ! otherwise switch direction
+                ! note: this should not happen, since we only loop over acoustic elements
+                if( ispec_is_elastic(ispec) ) stop 'error acoustic-elastic coupling surface'
+            enddo
+          enddo
+
+          ! stores informations about this face
+          inum = inum + 1
+          tmp_ispec(inum) = ispec
+          igll = 0
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! adds all gll points on this face
+              igll = igll + 1
+
+              ! do we need to store local i,j,k,ispec info? or only global indices iglob?
+              tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
+
+              ! stores weighted jacobian and normals
+              tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+              tmp_normal(:,igll,inum) = normal_face(:,i,j)
+            enddo
+          enddo
         endif ! elastic_flag
-      endif ! acoustic_flag
-    enddo ! iface_ref
+      enddo ! iface_ref
+    endif ! ispec_is_acoustic
   enddo ! ispec
 
 ! stores completed coupling face informations
@@ -370,8 +326,7 @@ subroutine get_coupling_surfaces_ac_el(myrank, &
 ! user output
   call sum_all_i(num_coupling_ac_el_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     acoustic-elastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     acoustic-elastic coupling    : total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_ac_el
@@ -381,11 +336,7 @@ end subroutine get_coupling_surfaces_ac_el
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_ac_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag)
 
 ! determines coupling surface for acoustic-poroelastic domains
 
@@ -393,17 +344,12 @@ subroutine get_coupling_surfaces_ac_poro(myrank, &
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: acoustic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -417,14 +363,8 @@ subroutine get_coupling_surfaces_ac_poro(myrank, &
   integer,dimension(:),allocatable :: tmp_ispec
 
   integer,dimension(NGNOD2D) :: iglob_corners_ref
-  integer :: ispec,i,j,k,igll,ier,iglob
-  integer :: inum,iface_ref,icorner
-  integer :: count_poroelastic,count_acoustic
-
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: poroelastic_flag,acoustic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
+  integer :: ispec,i,j,igll,ier
+  integer :: inum,iface_ref
 
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
@@ -452,7 +392,7 @@ subroutine get_coupling_surfaces_ac_poro(myrank, &
   !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
   !character(len=256):: prname_file
 
-! allocates temporary arrays
+  ! allocates temporary arrays
   allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
   if( ier /= 0 ) stop 'error allocating array tmp_normal'
   allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
@@ -466,147 +406,71 @@ subroutine get_coupling_surfaces_ac_poro(myrank, &
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for acoustic / poroelastic on global points
-  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-  allocate(acoustic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  poroelastic_flag(:) = 0
-  acoustic_flag(:) = 0
-  test_flag(:) = 0
-  count_poroelastic = 0
-  count_acoustic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets poroelastic flag
-          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-          ! sets acoustic flag
-          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-          ! sets test flag
-          test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_acoustic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total acoustic elements:',inum
-  endif
-  call sum_all_i(count_poroelastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total poroelastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums poroelastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums acoustic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between acoustic and poroelastic elements
   inum = 0
   do ispec=1,nspec
 
-   if(ispec_is_poroelastic(ispec)) then
-
-    ! loops over each face
-    do iface_ref= 1, 6
-
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_ref)
-        j = iface_all_corner_ijk(2,icorner,iface_ref)
-        k = iface_all_corner_ijk(3,icorner,iface_ref)
-        ! global reference indices
-        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-
-        ! reference corner coordinates
-        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-      enddo
-
-      ! checks if face has acoustic side
-      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-
-            ! gets face GLL points i,j,k indices from poroelastic element face
-            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-
-              ! normal convention: points away from acoustic, reference element
-              !                                switch normal direction if necessary
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                    ! reverse the sign, we know we are in a poroelastic element
-                    normal_face(:,i,j) = - normal_face(:,i,j)
-                enddo
-              enddo
-
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
-
-                  ! we need to store local i,j,k,ispec info
-                  tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
-
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
-                enddo
-              enddo
-      endif ! acoustic_flag
-    enddo ! iface_ref
-   endif ! ispec_is_poroelastic
+    if(ispec_is_poroelastic(ispec)) then
+
+      ! loops over each face
+      do iface_ref= 1, 6
+
+        ! takes indices of corners of reference face
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
+
+        ! checks if face has acoustic side
+        if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(4) ) >= 1) then
+
+          ! gets face GLL points i,j,k indices from poroelastic element face
+          call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
+
+          ! gets face GLL 2Djacobian, weighted from element face
+          call get_jacobian_boundary_face(myrank,nspec, &
+                    xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                    dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                    ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
+
+          ! normal convention: points away from acoustic, reference element
+          !                                switch normal direction if necessary
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! directs normals such that they point outwards of element
+              call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                          ibool,nspec,nglob_dummy, &
+                                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                                          normal_face(:,i,j) )
+              ! reverse the sign, we know we are in a poroelastic element
+              normal_face(:,i,j) = - normal_face(:,i,j)
+            enddo
+          enddo
+
+          ! stores informations about this face
+          inum = inum + 1
+          tmp_ispec(inum) = ispec
+          igll = 0
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! adds all gll points on this face
+              igll = igll + 1
+
+              ! we need to store local i,j,k,ispec info
+              tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
+
+              ! stores weighted jacobian and normals
+              tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+              tmp_normal(:,igll,inum) = normal_face(:,i,j)
+            enddo
+          enddo
+        endif ! acoustic_flag
+      enddo ! iface_ref
+    endif ! ispec_is_poroelastic
   enddo ! ispec
 
 ! stores completed coupling face informations
@@ -633,8 +497,7 @@ subroutine get_coupling_surfaces_ac_poro(myrank, &
 ! user output
   call sum_all_i(num_coupling_ac_po_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     acoustic-poroelastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     acoustic-poroelastic coupling: total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_ac_poro
@@ -643,11 +506,7 @@ end subroutine get_coupling_surfaces_ac_poro
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_el_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag)
 
 ! determines coupling surface for elastic-poroelastic domains
 
@@ -655,17 +514,12 @@ subroutine get_coupling_surfaces_el_poro(myrank, &
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: elastic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -679,14 +533,9 @@ subroutine get_coupling_surfaces_el_poro(myrank, &
   integer,dimension(:),allocatable :: tmp_ispec,tmp_ispec_el
 
   integer,dimension(NGNOD2D) :: iglob_corners_ref,iglob_corners_ref_el
-  integer :: ispec,i,j,k,igll,ier,iglob,ispec_el,ispec_ref_el
+  integer :: ispec,i,j,k,igll,ier
+  integer :: ispec_el,ispec_ref_el
   integer :: inum,iface_ref,iface_ref_el,iface_el,icorner
-  integer :: count_poroelastic,count_elastic
-
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: poroelastic_flag,elastic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
 
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
@@ -734,175 +583,100 @@ subroutine get_coupling_surfaces_el_poro(myrank, &
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for elastic / poroelastic on global points
-  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-  allocate(elastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  poroelastic_flag(:) = 0
-  elastic_flag(:) = 0
-  test_flag(:) = 0
-  count_poroelastic = 0
-  count_elastic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets poroelastic flag
-          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-          ! sets elastic flag
-          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-          ! sets test flag
-          test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_elastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total elastic elements:',inum
-  endif
-  call sum_all_i(count_poroelastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total poroelastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums poroelastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums elastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between elastic and poroelastic elements
   inum = 0
   do ispec=1,nspec
 
-   if(ispec_is_poroelastic(ispec)) then
-
-    ! loops over each face
-    do iface_ref= 1, 6
+    if(ispec_is_poroelastic(ispec)) then
 
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_ref)
-        j = iface_all_corner_ijk(2,icorner,iface_ref)
-        k = iface_all_corner_ijk(3,icorner,iface_ref)
-        ! global reference indices
-        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+      ! loops over each face
+      do iface_ref= 1, 6
 
-        ! reference corner coordinates
-        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
+        ! takes indices of corners of reference face
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
 
-      enddo
-
-      ! checks if face has elastic side
-      if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(4) ) >= 1) then
+        ! checks if face has elastic side
+        if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(4) ) >= 1) then
 
-      ! need to find elastic element for coupling
+          ! need to find elastic element for coupling
+          !
+          ! note: this assumes that both, elastic and poroelastic element, are in the same
+          !          partition; check with decomposition that this is valid for this mesh partitioning
           do ispec_el=1,nspec
             if(ispec_is_elastic(ispec_el))then
-     do iface_el=6,1,-1
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_el)
-        j = iface_all_corner_ijk(2,icorner,iface_el)
-        k = iface_all_corner_ijk(3,icorner,iface_el)
-        ! global reference indices
-        iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el)
-
-      enddo
-
-      if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. &
-      (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then
-
-           iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation
-                                   ! but the calcul is ok.
-           ispec_ref_el = ispec_el
-
-            ! gets face GLL points i,j,k indices from poroelastic element face
-            call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY)
-            ! gets face GLL points i,j,k indices from elastic element face
-            call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY)
-
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-
-              ! normal convention: points away from poroelastic, reference element
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of poroelastic element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                enddo
-              enddo
-
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              tmp_ispec_el(inum) = ispec_ref_el
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
-
-                  ! we need to store local i,j,k,ispec info
-                  tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j)
-                  tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1)
-
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
+              do iface_el=6,1,-1
+                ! takes indices of corners of reference face
+                do icorner = 1,NGNOD2D
+                  i = iface_all_corner_ijk(1,icorner,iface_el)
+                  j = iface_all_corner_ijk(2,icorner,iface_el)
+                  k = iface_all_corner_ijk(3,icorner,iface_el)
+                  ! global reference indices
+                  iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el)
                 enddo
-              enddo
-       endif ! if
 
-     enddo ! do iface_ref_el=1,6
-             endif ! if(ispec_is_elastic(ispec_el))then
+                if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. &
+                  (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then
+
+                  iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation
+                                          ! but the calcul is ok.
+                  ispec_ref_el = ispec_el
+
+                  ! gets face GLL points i,j,k indices from poroelastic element face
+                  call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY)
+                  ! gets face GLL points i,j,k indices from elastic element face
+                  call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY)
+
+                  ! gets face GLL 2Djacobian, weighted from element face
+                  call get_jacobian_boundary_face(myrank,nspec, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                          dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                          ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
+
+                  ! normal convention: points away from poroelastic, reference element
+                  do j=1,NGLLY
+                    do i=1,NGLLX
+                      ! directs normals such that they point outwards of poroelastic element
+                      call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                                  ibool,nspec,nglob_dummy, &
+                                                  xstore_dummy,ystore_dummy,zstore_dummy, &
+                                                  normal_face(:,i,j) )
+                    enddo
+                  enddo
+
+                  ! stores informations about this face
+                  inum = inum + 1
+                  tmp_ispec(inum) = ispec
+                  tmp_ispec_el(inum) = ispec_ref_el
+                  igll = 0
+                  do j=1,NGLLY
+                    do i=1,NGLLX
+                      ! adds all gll points on this face
+                      igll = igll + 1
+
+                      ! we need to store local i,j,k,ispec info
+                      tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j)
+                      tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1)
+
+                      ! stores weighted jacobian and normals
+                      tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+                      tmp_normal(:,igll,inum) = normal_face(:,i,j)
+                    enddo
+                  enddo
+                endif ! if
+
+              enddo ! do iface_ref_el=1,6
+            endif ! if(ispec_is_elastic(ispec_el))then
           enddo ! do ispec_el=1,nspec
-      endif ! elastic_flag
-    enddo ! iface_ref
-   endif ! ispec_is_poroelastic
+        endif ! elastic_flag
+      enddo ! iface_ref
+    endif ! ispec_is_poroelastic
   enddo ! ispec
 
 ! stores completed coupling face informations
@@ -935,575 +709,8 @@ subroutine get_coupling_surfaces_el_poro(myrank, &
 ! user output
   call sum_all_i(num_coupling_el_po_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     elastic-poroelastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     elastic-poroelastic coupling : total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_el_poro
 
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-! not working properly yet...
-
-!  subroutine get_coupling_surfaces_comb(myrank, &
-!                        nspec,ibool,NPROC, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-!                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-!                        my_neighbours_ext_mesh)
-!
-!! determines coupling surface for acoustic-elastic-poroelastic domains
-!
-!  use create_regions_mesh_ext_par
-!  implicit none
-!
-!! number of spectral elements in each block
-!  integer :: myrank,nspec,NPROC
-!
-!! arrays with the mesh
-!  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-!
-!! MPI communication
-!  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-!  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-!  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-!            ibool_interfaces_ext_mesh
-!  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-!
-!! local parameters
-!  ! (assumes NGLLX=NGLLY=NGLLZ)
-!  real(kind=CUSTOM_REAL),dimension(NGNOD2D) :: xcoord,ycoord,zcoord
-!  real(kind=CUSTOM_REAL) :: jacobian2Dw_face(NGLLX,NGLLY)
-!  real(kind=CUSTOM_REAL) :: normal_face(NDIM,NGLLX,NGLLY)
-!  real(kind=CUSTOM_REAL),dimension(:,:,:),allocatable :: tmp_normal
-!  real(kind=CUSTOM_REAL),dimension(:,:),allocatable :: tmp_jacobian2Dw
-!  integer :: ijk_face(3,NGLLX,NGLLY)
-!  integer,dimension(:,:,:),allocatable :: tmp_ijk
-!  integer,dimension(:),allocatable :: tmp_ispec
-!
-!  integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners
-!  integer :: ispec,i,j,k,igll,ier,iglob
-!  integer :: inum,inum_ac,inum_el,inum_po,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor
-!  integer :: inum_ac_el,inum_el_po,inum_ac_po
-!  integer :: count_elastic,count_acoustic,count_poroelastic
-!
-!  ! mpi interface communication
-!  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag,test_flag
-!  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-!  integer :: max_nibool_interfaces_ext_mesh
-!  logical, dimension(:), allocatable :: mask_ibool_ac_el,mask_ibool_ac_po,mask_ibool_el_po
-!
-!  ! corners indices of reference cube faces
-!  integer,dimension(3,4),parameter :: iface1_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/))   ! xmin
-!  integer,dimension(3,4),parameter :: iface2_corner_ijk = &
-!             reshape( (/ NGLLX,1,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, NGLLX,1,NGLLZ  /),(/3,4/))   ! xmax
-!  integer,dimension(3,4),parameter :: iface3_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,1,1  /),(/3,4/))   ! ymin
-!  integer,dimension(3,4),parameter :: iface4_corner_ijk = &
-!             reshape( (/ 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ /),(/3,4/))   ! ymax
-!  integer,dimension(3,4),parameter :: iface5_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,1,1 /),(/3,4/))  ! bottom
-!  integer,dimension(3,4),parameter :: iface6_corner_ijk = &
-!             reshape( (/ 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ  /),(/3,4/))   ! top
-!  integer,dimension(3,4,6),parameter :: iface_all_corner_ijk = &
-!             reshape( (/ iface1_corner_ijk,iface2_corner_ijk, &
-!                 iface3_corner_ijk,iface4_corner_ijk, &
-!                 iface5_corner_ijk,iface6_corner_ijk /),(/3,4,6/))   ! all faces
-!  ! midpoint indices for each face (xmin,xmax,ymin,ymax,zmin,zmax)
-!  integer,dimension(3,6),parameter :: iface_all_midpointijk = &
-!             reshape( (/ 1,2,2, NGLLX,2,2, 2,1,2, 2,NGLLY,2, 2,2,1, 2,2,NGLLZ  /),(/3,6/))   ! top
-!
-!
-!  ! test vtk output
-!  !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
-!  !character(len=256):: prname_file
-!
-!  ! initializes number of coupling faces
-!  num_coupling_ac_el_faces = 0
-!  num_coupling_ac_po_faces = 0
-!  num_coupling_el_po_faces = 0
-!
-!  ! allocates temporary arrays
-!  allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_normal'
-!  allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_jacobian2Dw'
-!  allocate(tmp_ijk(3,NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_ijk'
-!  allocate(tmp_ispec(nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_ispec'
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!
-!  ! sets flags for acoustic / elastic /poroelastic on global points
-!  allocate(elastic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-!  allocate(acoustic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-!  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-!  allocate(test_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array test_flag'
-!  allocate(mask_ibool_ac_el(nglob_dummy),stat=ier)
-!  allocate(mask_ibool_ac_po(nglob_dummy),stat=ier)
-!  allocate(mask_ibool_el_po(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array mask_ibool'
-!  elastic_flag(:) = 0
-!  acoustic_flag(:) = 0
-!  poroelastic_flag(:) = 0
-!  test_flag(:) = 0
-!  count_elastic = 0
-!  count_acoustic = 0
-!  count_poroelastic = 0
-!  do ispec = 1, nspec
-!    ! counts elements
-!    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-!    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-!    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-!
-!    ! sets flags on global points
-!    do k = 1, NGLLZ
-!      do j = 1, NGLLY
-!        do i = 1, NGLLX
-!          ! global index
-!          iglob = ibool(i,j,k,ispec)
-!          ! sets elastic flag
-!          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-!          ! sets acoustic flag
-!          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-!          ! sets poroelastic flag
-!          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-!          ! sets test flag
-!          test_flag(iglob) = myrank+1
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  call sum_all_i(count_acoustic,inum_ac)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total acoustic elements    :',inum_ac
-!  endif
-!  call sum_all_i(count_elastic,inum_el)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total elastic elements     :',inum_el
-!  endif
-!  call sum_all_i(count_poroelastic,inum_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total poroelastic elements :',inum_po
-!  endif
-!
-!  ! collects contributions from different MPI partitions
-!  ! sets up MPI communications
-!  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-!  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-!  do i = 1, num_interfaces_ext_mesh
-!     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-!  enddo
-!  ! sums elastic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums acoustic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums poroelastic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums test flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!
-!!----------------------
-!! acoustic-elastic
-!!----------------------
-!  inum_ac_el = 0
-!  !if (inum_el >0 .and. inum_ac >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between acoustic and elastic elements
-!  mask_ibool_ac_el(:) = .false.
-!  do ispec=1,nspec
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has acoustic side
-!      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-!        ! checks if face is has an elastic side
-!        if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!          ! reference midpoint on face (used to avoid redundant face counting)
-!          i = iface_all_midpointijk(1,iface_ref)
-!          j = iface_all_midpointijk(2,iface_ref)
-!          k = iface_all_midpointijk(3,iface_ref)
-!          iglob_midpoint = ibool(i,j,k,ispec)
-!
-!          ! checks if points on this face are masked already
-!          if( .not. mask_ibool_ac_el(iglob_midpoint) ) then
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!            ! takes each element face only once, if it lies on an MPI interface
-!            ! note: this is not exactly load balanced
-!            !          lowest rank process collects as many faces as possible, second lowest as so forth
-!            if( (test_flag(iglob_midpoint) == myrank+1) .or. &
-!               (test_flag(iglob_midpoint) > 2*(myrank+1)) ) then
-!
-!              ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from acoustic, reference element
-!              !                                switch normal direction if necessary
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                    ! makes sure that it always points away from acoustic element,
-!                    ! otherwise switch direction
-!                    if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_ac_el = inum_ac_el + 1
-!              tmp_ispec(inum_ac_el) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_ac_el) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_ac_el) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_ac_el) = normal_face(:,i,j)
-!
-!                  ! masks global points ( to avoid redundant counting of faces)
-!                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-!                  mask_ibool_ac_el(iglob) = .true.
-!                enddo
-!              enddo
-!            else
-!              ! assumes to be already collected by lower rank process, masks face points
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-!                  mask_ibool_ac_el(iglob) = .true.
-!                enddo
-!              enddo
-!            endif ! test_flag
-!          endif ! mask_ibool
-!        endif ! elastic_flag
-!      endif ! acoustic_flag
-!    enddo ! iface_ref
-!  enddo ! ispec
-!
-!  !endif !if (count_elastic >0 .and. count_acoustic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: no need to store material parameters on these coupling points
-!!          for acoustic-elastic interface
-!  num_coupling_ac_el_faces = inum_ac_el
-!  allocate(coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_normal'
-!  allocate(coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_jacobian2Dw'
-!  allocate(coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ijk'
-!  allocate(coupling_ac_el_ispec(num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ispec'
-!  do inum = 1,num_coupling_ac_el_faces
-!    coupling_ac_el_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_ac_el_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_ac_el_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_ac_el_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_ac_el_faces,inum_ac_el)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     acoustic-elastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_ac_el
-!  endif
-!
-!
-!!----------------------
-!! acoustic-poroelastic
-!!----------------------
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!  inum_ac_po = 0
-!  !if (inum_po >0 .and. inum_ac >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between acoustic and poroelastic elements
-!  do ispec=1,nspec
-!
-!   if(ispec_is_poroelastic(ispec)) then
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has acoustic side
-!      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!            ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from acoustic, reference element
-!              !                                switch normal direction if necessary
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                    ! makes sure that it always points away from acoustic element,
-!                    ! otherwise switch direction
-!                    !if( ispec_is_poroelastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-!                    normal_face(:,i,j) = - normal_face(:,i,j)
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_ac_po = inum_ac_po + 1
-!              tmp_ispec(inum_ac_po) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_ac_po) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_ac_po) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_ac_po) = normal_face(:,i,j)
-!
-!                enddo
-!              enddo
-!      endif ! acoustic_flag
-!    enddo ! iface_ref
-!   endif ! ispec_is_poroelastic
-!  enddo ! ispec
-!
-!  !endif !if (count_poroelastic >0 .and. count_acoustic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: for this coupling we need to have access to porous properties. The construction is such
-!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the
-!! acoustic element
-!  num_coupling_ac_po_faces = inum_ac_po
-!  allocate(coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_normal'
-!  allocate(coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_jacobian2Dw'
-!  allocate(coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ijk'
-!  allocate(coupling_ac_po_ispec(num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ispec'
-!  do inum = 1,num_coupling_ac_po_faces
-!    coupling_ac_po_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_ac_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_ac_po_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_ac_po_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_ac_po_faces,inum_ac_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     acoustic-poroelastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_ac_po
-!  endif
-!
-!
-!!----------------------
-!! elastic-poroelastic
-!!----------------------
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!  inum_el_po = 0
-!  !if (inum_el >0 .and. inum_po >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between elastic and poroelastic elements
-!  do ispec=1,nspec
-!
-!   if(ispec_is_poroelastic(ispec)) then
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has elastic side
-!      if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!              ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from poroelastic, reference element
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of poroelastic element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_el_po = inum_el_po + 1
-!              tmp_ispec(inum_el_po) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_el_po) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_el_po) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_el_po) = normal_face(:,i,j)
-!
-!                enddo
-!              enddo
-!        endif ! elastic_flag
-!    enddo ! iface_ref
-!   endif ! ispec_is_poroelastic
-!  enddo ! ispec
-!
-!  !endif !if (count_elastic >0 .and. count_poroelastic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: for this coupling we need to have access to porous properties. The construction is such
-!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the
-!! poroelastic element
-!  num_coupling_el_po_faces = inum_el_po
-!  allocate(coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_normal'
-!  allocate(coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_jacobian2Dw'
-!  allocate(coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_ijk'
-!  allocate(coupling_el_po_ispec(num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_ispec'
-!  do inum = 1,num_coupling_el_po_faces
-!    coupling_el_po_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_el_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_el_po_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_el_po_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_el_po_faces,inum_el_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     elastic-poroelastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_el_po
-!  endif
-!
-!
-!
-!  end subroutine get_coupling_surfaces_comb
-
diff --git a/src/generate_databases/get_global.f90 b/src/generate_databases/get_global.f90
index 752957836..8a5ad614e 100644
--- a/src/generate_databases/get_global.f90
+++ b/src/generate_databases/get_global.f90
@@ -237,7 +237,7 @@ subroutine swap_all(IA,A,B,C,IW,W,ind,n)
     C(i)=W(ind(i))
   enddo
 
-end subroutine swap_all
+  end subroutine swap_all
 
 ! ------------------------------------------------------------------
 
@@ -292,4 +292,4 @@ subroutine get_global_indirect_addressing(nspec,nglob,ibool)
   deallocate(copy_ibool_ori,stat=ier); if(ier /= 0) stop 'error in deallocate'
   deallocate(mask_ibool,stat=ier); if(ier /= 0) stop 'error in deallocate'
 
-end subroutine get_global_indirect_addressing
+  end subroutine get_global_indirect_addressing
diff --git a/src/generate_databases/get_model.f90 b/src/generate_databases/get_model.f90
index 05b171881..91425cf59 100644
--- a/src/generate_databases/get_model.f90
+++ b/src/generate_databases/get_model.f90
@@ -24,15 +24,17 @@
 !
 !=====================================================================
 
+
   subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &
-                        ANISOTROPY,LOCAL_PATH)
+                        ANISOTROPY)
 
   use generate_databases_par,only: IMODEL
   use create_regions_mesh_ext_par
   implicit none
 
+
   ! number of spectral elements in each block
   integer :: myrank,nspec
 
@@ -59,26 +61,34 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         afactor,bfactor,cfactor
 
   integer :: ispec,i,j,k
-  
+
   ! material domain
   integer :: idomain_id
-  
+
   integer :: imaterial_id,imaterial_def
 
   ! gll point location
-  double precision :: xmesh,ymesh,zmesh  
+  double precision :: xmesh,ymesh,zmesh
   integer :: iglob
-  character(len=256) LOCAL_PATH
+
+  ! timing
+  double precision, external :: wtime
+  double precision :: time_start,tCPU
 
   ! initializes element domain flags
   ispec_is_acoustic(:) = .false.
   ispec_is_elastic(:) = .false.
   ispec_is_poroelastic(:) = .false.
 
-  ! prepares tomography model if needed for elements with undefined material definitions
-  if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
-    call model_tomography_broadcast(myrank)    
-  endif
+  !debug
+  !print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh
+
+! prepares tomography model if needed for elements with undefined material definitions
+  ! TODO: Max -- somehow this code is breaking when I try to run
+  ! Piero's PREM
+  ! if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
+  ! call model_tomography_broadcast(myrank)
+  ! endif
 
   ! prepares external model values if needed
   select case( IMODEL )
@@ -92,6 +102,8 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
 ! in case, see file model_interface_bedrock.f90:
 !  call model_bedrock_broadcast(myrank)
 
+  ! get MPI starting time
+  time_start = wtime()
 
   ! material properties on all GLL points: taken from material values defined for
   ! each spectral element in input mesh
@@ -106,7 +118,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
           vp = 0._CUSTOM_REAL
           vs = 0._CUSTOM_REAL
           rho = 0._CUSTOM_REAL
-          
+
           rho_s = 0._CUSTOM_REAL
           kappa_s = 0._CUSTOM_REAL
           rho_f = 0._CUSTOM_REAL
@@ -122,7 +134,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
           kyy = 0._CUSTOM_REAL
           kyz = 0._CUSTOM_REAL
           kzz = 0._CUSTOM_REAL
-          
+
           qmu_atten = 0._CUSTOM_REAL
 
           c11 = 0._CUSTOM_REAL
@@ -154,7 +166,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
           zmesh = zstore_dummy(iglob)
 
           ! material index 1: associated material number
-          ! 1 = acoustic, 2 = elastic, 3 = poroelastic, -1 = undefined tomographic        
+          ! 1 = acoustic, 2 = elastic, 3 = poroelastic, -1 = undefined tomographic
           imaterial_id = mat_ext_mesh(1,ispec)
 
           ! material index 2: associated material definition
@@ -173,12 +185,12 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                                c22,c23,c24,c25,c26,c33, &
                                c34,c35,c36,c44,c45,c46,c55,c56,c66, &
                                ANISOTROPY)
-          
+
 
           ! stores velocity model
 
-          if(idomain_id == IDOMAIN_ACOUSTIC .or. idomain_id == IDOMAIN_ELASTIC) then 
-          
+          if(idomain_id == IDOMAIN_ACOUSTIC .or. idomain_id == IDOMAIN_ELASTIC) then
+
             ! elastic or acoustic material
 
             ! density
@@ -204,8 +216,8 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
             tortstore(i,j,k,ispec) = 1.d0
             !end pll
 
-          else                                         
-            
+          else
+
             ! poroelastic material
 
             ! solid properties
@@ -277,11 +289,11 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
 
           ! stores material domain
           select case( idomain_id )
-          case( IDOMAIN_ACOUSTIC )          
+          case( IDOMAIN_ACOUSTIC )
             ispec_is_acoustic(ispec) = .true.
           case( IDOMAIN_ELASTIC )
             ispec_is_elastic(ispec) = .true.
-          case( IDOMAIN_POROELASTIC )           
+          case( IDOMAIN_POROELASTIC )
             ispec_is_poroelastic(ispec) = .true.
           case default
             stop 'error material domain index'
@@ -290,6 +302,17 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
         enddo
       enddo
     enddo
+
+    ! user output
+    if(myrank == 0 ) then
+      if( mod(ispec,nspec/10) == 0 ) then
+        tCPU = wtime() - time_start
+        ! remaining
+        tCPU = (10.0-ispec/(nspec/10.0))/ispec/(nspec/10.0)*tCPU
+        write(IMAIN,*) "    ",ispec/(nspec/10) * 10," %", &
+                      " time remaining:", tCPU,"s"
+      endif
+    endif
   enddo
 
   ! checks material domains
@@ -318,16 +341,6 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
     endif
   enddo
 
-  ! GLL model
-  ! variables for importing models from files in SPECFEM format, e.g.,  proc000000_vp.bin etc.
-  ! can be used for importing updated model in iterative inversions
-  if( IMODEL == IMODEL_GLL ) then
-    ! note:
-    ! import the model from files in SPECFEM format
-    ! note that those those files should be saved in LOCAL_PATH    
-    call model_gll(myrank,nspec,LOCAL_PATH)
-  endif
-
   end subroutine get_model
 
 !
@@ -356,7 +369,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
   integer, intent(in) :: nundefMat_ext_mesh
   character (len=30), dimension(6,nundefMat_ext_mesh):: undef_mat_prop
 
-  integer, intent(in) :: imaterial_id,imaterial_def  
+  integer, intent(in) :: imaterial_id,imaterial_def
 
   double precision, intent(in) :: xmesh,ymesh,zmesh
 
@@ -374,18 +387,20 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
 
   ! local parameters
   integer :: iflag_aniso
-  
+  integer :: iundef,imaterial_PB
+
   ! use acoustic domains for simulation
   logical,parameter :: USE_PURE_ACOUSTIC_MOD = .false.
 
   ! initializes with default values
+  ! no anisotropy
   iflag_aniso = 0
   idomain_id = IDOMAIN_ELASTIC
-  
+
   ! selects chosen velocity model
   select case( IMODEL )
 
-  case( IMODEL_DEFAULT, IMODEL_GLL )
+  case( IMODEL_DEFAULT,IMODEL_GLL,IMODEL_IPATI )
     ! material values determined by mesh properties
     call model_default(materials_ext_mesh,nmat_ext_mesh, &
                           undef_mat_prop,nundefMat_ext_mesh, &
@@ -395,11 +410,21 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
                           iflag_aniso,qmu_atten,idomain_id, &
                           rho_s,kappa_s,rho_f,kappa_f,eta_f,kappa_fr,mu_fr, &
                           phi,tort,kxx,kxy,kxz,kyy,kyz,kzz)
-        
+
   case( IMODEL_1D_PREM )
     ! 1D model profile from PREM
     call model_1D_prem_iso(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
-                      
+
+  case( IMODEL_1D_PREM_PB )
+    ! 1D model profile from PREM modified by Piero
+    imaterial_PB = abs(imaterial_id)
+    call model_1D_PREM_routine_PB(xmesh,ymesh,zmesh,rho,vp,vs,imaterial_PB)
+    ! attenuation: arbitrary value, see maximum in constants.h
+    qmu_atten = ATTENUATION_COMP_MAXIMUM
+    ! sets acoustic/elastic domain as given in materials properties
+    iundef = - imaterial_id    ! iundef must be positive
+    read(undef_mat_prop(6,iundef),*) idomain_id
+
   case( IMODEL_1D_CASCADIA )
     ! 1D model profile for Cascadia region
     call model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
@@ -411,7 +436,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
   case( IMODEL_SALTON_TROUGH )
     ! gets model values from tomography file
     call model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
-      
+
   case( IMODEL_TOMO )
     ! gets model values from tomography file
     call model_tomography(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
@@ -420,9 +445,9 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
     ! user model from external routine
     ! adds/gets velocity model as specified in model_external_values.f90
     call model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_aniso,idomain_id)
-  
-  case default  
-    stop 'error: model not implemented yet'    
+
+  case default
+    stop 'error: model not implemented yet'
   end select
 
   ! adds anisotropic default model
@@ -430,7 +455,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
     call model_aniso(iflag_aniso,rho,vp,vs, &
                     c11,c12,c13,c14,c15,c16, &
                     c22,c23,c24,c25,c26,c33, &
-                    c34,c35,c36,c44,c45,c46,c55,c56,c66)  
+                    c34,c35,c36,c44,c45,c46,c55,c56,c66)
   endif
 
   ! for pure acoustic simulations (a way of avoiding re-mesh, re-partition etc.)
@@ -439,5 +464,41 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, &
   if( USE_PURE_ACOUSTIC_MOD ) then
     idomain_id = IDOMAIN_ACOUSTIC
   endif
-  
+
   end subroutine get_model_values
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_model_binaries(myrank,nspec,LOCAL_PATH)
+
+! reads in material parameters from external binary files
+
+  use generate_databases_par,only: IMODEL
+  use create_regions_mesh_ext_par
+  implicit none
+
+  ! number of spectral elements in each block
+  integer :: myrank,nspec
+  character(len=256) :: LOCAL_PATH
+
+  ! external GLL models
+  ! variables for importing models from files in SPECFEM format, e.g.,  proc000000_vp.bin etc.
+  ! can be used for importing updated model in iterative inversions
+
+  ! note: we read in these binary files after mesh coloring, since mesh coloring is permuting arrays.
+  !          here, the ordering in **_vp.bin etc. can be permuted as they are outputted when saving mesh files
+
+  select case( IMODEL )
+  case( IMODEL_GLL )
+    ! note:
+    ! import the model from files in SPECFEM format
+    ! note that those those files should be saved in LOCAL_PATH
+    call model_gll(myrank,nspec,LOCAL_PATH)
+  case( IMODEL_IPATI )
+    ! import the model from modified files in SPECFEM format
+    call model_ipati(myrank,nspec,LOCAL_PATH)
+  end select
+
+  end subroutine get_model_binaries
diff --git a/src/generate_databases/get_perm_color.f90 b/src/generate_databases/get_perm_color.f90
new file mode 100644
index 000000000..e0d743bb5
--- /dev/null
+++ b/src/generate_databases/get_perm_color.f90
@@ -0,0 +1,1210 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+! define sets of colors that contain disconnected elements for the CUDA solver.
+! also split the elements into two subsets: inner and outer elements, in order
+! to be able to compute the outer elements first in the solver and then
+! start non-blocking MPI calls and overlap them with the calculation of the inner elements
+! (which works fine because there are always far more inner elements than outer elements)
+
+! note:    these are modified routines to use element domain flags given in ispec_is_d, thus
+!             coloring only acoustic or elastic (or..) elements in one run, then repeat run for other domains.
+!             also, the permutation re-starts at 1 for outer and for inner elements,
+!             making it usable for the phase_ispec_inner_** arrays for acoustic and elastic elements.
+
+  subroutine get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, &
+                                  ibool,perm,color, &
+                                  nspec,nglob, &
+                                  nb_colors_outer_elements,nb_colors_inner_elements, &
+                                  nspec_outer,nspec_inner,nspec_domain, &
+                                  first_elem_number_in_this_color, &
+                                  myrank)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec, nglob
+  logical, dimension(nspec), intent(in) :: is_on_a_slice_edge
+  logical, dimension(nspec), intent(in) :: ispec_is_d
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool
+  integer, dimension(nspec),intent(inout) :: perm
+
+  integer, dimension(nspec),intent(inout) :: color
+  integer, dimension(MAX_NUMBER_OF_COLORS+1),intent(inout) :: first_elem_number_in_this_color
+  integer, intent(out) :: nb_colors_outer_elements,nb_colors_inner_elements
+
+  integer, intent(out) :: nspec_outer,nspec_inner,nspec_domain
+  integer, intent(in) :: myrank
+
+  ! local variables
+  integer :: nb_colors
+
+  ! coloring algorithm w/ Droux
+  call get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, &
+                        myrank, nspec, nglob, &
+                        color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                        nspec_outer,nspec_inner,nspec_domain)
+
+  !debug output
+  if(myrank == 0) then
+    write(IMAIN,*) '     colors:'
+    write(IMAIN,*) '     number of colors for inner elements = ',nb_colors_inner_elements
+    write(IMAIN,*) '     number of colors for outer elements = ',nb_colors_outer_elements
+    write(IMAIN,*) '     total number of colors (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements
+    write(IMAIN,*) '     elements:'
+    write(IMAIN,*) '     number of elements for outer elements  = ',nspec_outer
+    write(IMAIN,*) '     number of elements for inner elements  = ',nspec_inner
+    write(IMAIN,*) '     total number of elements for domain elements  = ',nspec_domain
+  endif
+
+  ! total number of colors used
+  nb_colors = nb_colors_inner_elements+nb_colors_outer_elements
+  first_elem_number_in_this_color(:) = 0
+
+  ! gets element permutation depending on colors
+  call get_final_perm(color,perm,first_elem_number_in_this_color(1:nb_colors), &
+                     nspec,nb_colors,nb_colors_outer_elements, &
+                     ispec_is_d,nspec_domain)
+
+
+  end subroutine get_perm_color_faster
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, &
+                             myrank, nspec, nglob, &
+                             color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                             nspec_outer,nspec_inner,nspec_domain)
+
+  implicit none
+
+  include "constants.h"
+
+  integer nspec,nglob
+  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  integer, dimension(nspec) :: color
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements,myrank
+
+  integer :: nspec_outer,nspec_inner,nspec_domain
+
+  ! local variables
+  integer :: ispec
+  logical, dimension(:), allocatable :: mask_ibool
+  integer :: icolor, nb_already_colored
+  integer :: iglob1,iglob2,iglob3,iglob4,iglob5,iglob6,iglob7,iglob8
+  integer :: ier
+  logical :: conflict_found_need_new_color
+  ! Droux
+  logical :: try_Droux_coloring
+  logical :: fail_safe
+  ! valence
+  integer :: maxval_count_ibool_outer,maxval_count_ibool_inner
+
+  ! display absolute minimum possible number of colors, i.e., maximum valence (for information only)
+  ! beware: this wastes memory (needs an additional array called "count_ibool")
+  logical, parameter :: DISPLAY_MIN_POSSIBLE_COLORS = .false.
+
+  ! user output
+  if( myrank == 0 ) then
+    if( USE_DROUX_OPTIMIZATION ) then
+      write(IMAIN,*) '     fast coloring mesh algorithm w/ Droux optimization'
+    else if( BALANCE_COLORS_SIMPLE_ALGO ) then
+      write(IMAIN,*) '     fast coloring mesh algorithm w/ color balancing'
+    else
+      write(IMAIN,*) '     fast coloring mesh algorithm'
+    endif
+  endif
+
+  ! counts number of elements for inner, outer and total domain
+  nspec_outer = 0
+  nspec_inner = 0
+  nspec_domain = 0
+  do ispec=1,nspec
+    ! domain elements
+    if(ispec_is_d(ispec)) then
+      ! outer/inner elements
+      if(is_on_a_slice_edge(ispec)) then
+        nspec_outer=nspec_outer+1
+      else
+        nspec_inner=nspec_inner+1
+      endif
+      nspec_domain=nspec_domain+1
+    endif
+  enddo
+
+  ! debug
+  !if(myrank == 0) then
+  !  print *
+  !  print *,'----------------------------------'
+  !  print *,'coloring the mesh'
+  !  print *,'----------------------------------'
+  !  print *
+  !endif
+
+  ! Droux optimization
+  try_Droux_coloring = USE_DROUX_OPTIMIZATION
+
+  if(BALANCE_COLORS_SIMPLE_ALGO .and. USE_DROUX_OPTIMIZATION ) then
+    if( myrank == 0 ) then
+      print *,'noticed a problem with mesh coloring options: '
+      print *,'  cannot set both USE_DROUX_OPTIMAL_ALGO and BALANCE_COLORS_SIMPLE_ALGO'
+      print *,'  -> this run will use only BALANCE_COLORS_SIMPLE_ALGO'
+      print *,'please check parameter settings in constants.h...'
+    endif
+    try_Droux_coloring = .false.
+  endif
+
+  ! gives a lower bound for the number of colors needed
+  if(DISPLAY_MIN_POSSIBLE_COLORS .or. try_Droux_coloring) then
+    ! gets maximum values of valence for inner and outer element points
+    call count_mesh_valence(ibool,is_on_a_slice_edge,ispec_is_d, &
+                           myrank, nspec, nglob, &
+                           maxval_count_ibool_outer,maxval_count_ibool_inner)
+  endif
+
+  ! allocates mask
+  allocate(mask_ibool(nglob),stat=ier)
+  if( ier /= 0 ) stop 'error allocating mask_ibool array'
+
+  ! entry point for fail-safe mechanism when Droux 1993 fails
+  999 continue
+
+  ! first set color of all elements to 0,
+  ! to use it as a flag to detect elements not yet colored
+  color(:) = 0
+  icolor = 0
+  nb_already_colored = 0
+
+  ! colors outer elements
+  do while( nb_already_colored < nspec_outer )
+
+    333 continue
+    icolor = icolor + 1
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  analyzing color ',icolor,' - outer elements'
+    !endif
+
+    ! resets flags
+    mask_ibool(:) = .false.
+    conflict_found_need_new_color = .false.
+
+    ! finds un-colored elements
+    do ispec = 1,nspec
+      ! domain elements only
+      if( ispec_is_d(ispec) ) then
+        ! outer elements
+        if( is_on_a_slice_edge(ispec) ) then
+          if(color(ispec) == 0) then
+            ! the eight corners of the current element
+            iglob1=ibool(1,1,1,ispec)
+            iglob2=ibool(NGLLX,1,1,ispec)
+            iglob3=ibool(NGLLX,NGLLY,1,ispec)
+            iglob4=ibool(1,NGLLY,1,ispec)
+            iglob5=ibool(1,1,NGLLZ,ispec)
+            iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+            iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+            iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+            if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+               mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+              ! if element of this color has a common point with another element of that same color
+              ! then we need to create a new color, i.e., increment the color of the current element
+              conflict_found_need_new_color = .true.
+            else
+              color(ispec) = icolor
+              nb_already_colored = nb_already_colored + 1
+              mask_ibool(iglob1) = .true.
+              mask_ibool(iglob2) = .true.
+              mask_ibool(iglob3) = .true.
+              mask_ibool(iglob4) = .true.
+              mask_ibool(iglob5) = .true.
+              mask_ibool(iglob6) = .true.
+              mask_ibool(iglob7) = .true.
+              mask_ibool(iglob8) = .true.
+            endif
+          endif
+        endif
+      endif
+    enddo
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements'
+    !endif
+
+    if(conflict_found_need_new_color) then
+      if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small'
+      goto 333
+    endif
+  enddo
+
+  nb_colors_outer_elements = icolor
+
+  ! colors inner elements
+  do while(nb_already_colored < nspec_domain)
+
+    334 continue
+    icolor = icolor + 1
+
+    ! debug: user output
+    !if(myrank == 0) then
+    !  print *,'  analyzing color ',icolor,' - inner elements'
+    !endif
+
+    ! resets flags
+    mask_ibool(:) = .false.
+    conflict_found_need_new_color = .false.
+
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+        ! inner elements
+        if (.not. is_on_a_slice_edge(ispec)) then
+          if(color(ispec) == 0) then
+            ! the eight corners of the current element
+            iglob1=ibool(1,1,1,ispec)
+            iglob2=ibool(NGLLX,1,1,ispec)
+            iglob3=ibool(NGLLX,NGLLY,1,ispec)
+            iglob4=ibool(1,NGLLY,1,ispec)
+            iglob5=ibool(1,1,NGLLZ,ispec)
+            iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+            iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+            iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+            if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+               mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+              ! if element of this color has a common point with another element of that same color
+              ! then we need to create a new color, i.e., increment the color of the current element
+              conflict_found_need_new_color = .true.
+            else
+              color(ispec) = icolor
+              nb_already_colored = nb_already_colored + 1
+              mask_ibool(iglob1) = .true.
+              mask_ibool(iglob2) = .true.
+              mask_ibool(iglob3) = .true.
+              mask_ibool(iglob4) = .true.
+              mask_ibool(iglob5) = .true.
+              mask_ibool(iglob6) = .true.
+              mask_ibool(iglob7) = .true.
+              mask_ibool(iglob8) = .true.
+            endif
+          endif
+        endif
+      endif
+    enddo
+
+    ! debug user output
+    !if(myrank == 0) then
+    !  print *,'  done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements'
+    !endif
+
+    if(conflict_found_need_new_color) then
+      if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small'
+      goto 334
+    endif
+  enddo
+
+  nb_colors_inner_elements = icolor - nb_colors_outer_elements
+
+  ! Droux optimization:
+  ! added this to create more balanced colors according to JJ Droux (1993)
+  ! note: this might not find an optimial solution.
+  !          we will probably have to try a few times with increasing colors
+  if( try_Droux_coloring ) then
+    ! initializes fail-safe mechanism
+    fail_safe = .false.
+
+    ! tries to find a balanced coloring
+    call balance_colors_Droux(ibool,is_on_a_slice_edge,ispec_is_d, &
+                              myrank, nspec, nglob, &
+                              color,nb_colors_outer_elements,nb_colors_inner_elements, &
+                              nspec_outer,nspec_inner,maxval_count_ibool_inner, &
+                              mask_ibool,fail_safe)
+
+    ! in case it fails go back to simple coloring algorithm
+    if( fail_safe ) then
+      try_Droux_coloring = .false.
+      if(myrank == 0) write(IMAIN,*) '     giving up on Droux 1993 algorithm, calling fail-safe mechanism'
+      goto 999
+    endif
+  endif ! of if(try_Droux_coloring)
+
+  ! balances colors using a simple algorithm (if Droux was not used)
+  if( BALANCE_COLORS_SIMPLE_ALGO ) then
+    call balance_colors_simple(ibool,is_on_a_slice_edge,ispec_is_d, &
+                              myrank, nspec, nglob, &
+                              color,nb_colors_outer_elements,nb_colors_inner_elements, &
+                              nspec_outer,nspec_inner,mask_ibool)
+  endif
+
+  ! checks that all the color sets are independent
+  do icolor = 1,maxval(color)
+    mask_ibool(:) = .false.
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+        if(color(ispec) == icolor ) then
+          ! the eight corners of the current element
+          iglob1=ibool(1,1,1,ispec)
+          iglob2=ibool(NGLLX,1,1,ispec)
+          iglob3=ibool(NGLLX,NGLLY,1,ispec)
+          iglob4=ibool(1,NGLLY,1,ispec)
+          iglob5=ibool(1,1,NGLLZ,ispec)
+          iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+          iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+          iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+          if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. &
+             mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then
+            ! if element of this color has a common point with another element of that same color
+            ! then there is a problem, the color set is not correct
+            print*,'error check color:',icolor
+            stop 'error detected: found a common point inside a color set'
+          else
+            mask_ibool(iglob1) = .true.
+            mask_ibool(iglob2) = .true.
+            mask_ibool(iglob3) = .true.
+            mask_ibool(iglob4) = .true.
+            mask_ibool(iglob5) = .true.
+            mask_ibool(iglob6) = .true.
+            mask_ibool(iglob7) = .true.
+            mask_ibool(iglob8) = .true.
+          endif
+        endif
+      endif
+    enddo
+
+    !debug output
+    !if(myrank == 0) print *,'  color ',icolor,' has disjoint elements only and is therefore OK'
+    !if(myrank == 0) print *,'  color ',icolor,' contains ',count(color == icolor),' elements'
+  enddo
+  ! debug output
+  !if(myrank == 0) then
+  !  print*, '     the ',maxval(color),' color sets are OK'
+  !endif
+
+  deallocate(mask_ibool)
+
+  end subroutine get_color_faster
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine count_mesh_valence(ibool,is_on_a_slice_edge,ispec_is_d, &
+                                myrank, nspec, nglob, &
+                                maxval_count_ibool_outer,maxval_count_ibool_inner)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: nspec,nglob
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+
+  integer :: myrank
+  integer :: maxval_count_ibool_outer,maxval_count_ibool_inner
+
+  ! local parameters
+  integer, dimension(:), allocatable :: count_ibool
+  integer :: ispec
+  integer :: iglob1,iglob2,iglob3,iglob4,iglob5,iglob6,iglob7,iglob8
+  integer :: ier
+
+  ! allocates count array
+  allocate(count_ibool(nglob),stat=ier)
+  if( ier /= 0 ) stop 'error allocating count_ibool array'
+
+  ! valence numbers of the mesh
+  maxval_count_ibool_outer = 0
+  maxval_count_ibool_inner = 0
+
+  ! valence for outer elements
+  count_ibool(:) = 0
+  do ispec = 1,nspec
+    ! domain elements only
+    if(ispec_is_d(ispec)) then
+      ! outer elements
+      if (is_on_a_slice_edge(ispec)) then
+        ! the eight corners of the current element
+        iglob1=ibool(1,1,1,ispec)
+        iglob2=ibool(NGLLX,1,1,ispec)
+        iglob3=ibool(NGLLX,NGLLY,1,ispec)
+        iglob4=ibool(1,NGLLY,1,ispec)
+        iglob5=ibool(1,1,NGLLZ,ispec)
+        iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+        iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+        iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+        count_ibool(iglob1) = count_ibool(iglob1) + 1
+        count_ibool(iglob2) = count_ibool(iglob2) + 1
+        count_ibool(iglob3) = count_ibool(iglob3) + 1
+        count_ibool(iglob4) = count_ibool(iglob4) + 1
+        count_ibool(iglob5) = count_ibool(iglob5) + 1
+        count_ibool(iglob6) = count_ibool(iglob6) + 1
+        count_ibool(iglob7) = count_ibool(iglob7) + 1
+        count_ibool(iglob8) = count_ibool(iglob8) + 1
+      endif
+    endif
+  enddo
+  maxval_count_ibool_outer = maxval(count_ibool)
+
+  ! valence for inner elements
+  count_ibool(:) = 0
+  do ispec = 1,nspec
+    ! domain elements only
+    if(ispec_is_d(ispec)) then
+      ! inner elements
+      if (.not. is_on_a_slice_edge(ispec)) then
+        ! the eight corners of the current element
+        iglob1=ibool(1,1,1,ispec)
+        iglob2=ibool(NGLLX,1,1,ispec)
+        iglob3=ibool(NGLLX,NGLLY,1,ispec)
+        iglob4=ibool(1,NGLLY,1,ispec)
+        iglob5=ibool(1,1,NGLLZ,ispec)
+        iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+        iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+        iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+        count_ibool(iglob1) = count_ibool(iglob1) + 1
+        count_ibool(iglob2) = count_ibool(iglob2) + 1
+        count_ibool(iglob3) = count_ibool(iglob3) + 1
+        count_ibool(iglob4) = count_ibool(iglob4) + 1
+        count_ibool(iglob5) = count_ibool(iglob5) + 1
+        count_ibool(iglob6) = count_ibool(iglob6) + 1
+        count_ibool(iglob7) = count_ibool(iglob7) + 1
+        count_ibool(iglob8) = count_ibool(iglob8) + 1
+      endif
+    endif
+  enddo
+  maxval_count_ibool_inner = maxval(count_ibool)
+
+  ! debug outupt
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     maximum valence (i.e. minimum possible nb of colors) for outer = ',maxval_count_ibool_outer
+    write(IMAIN,*) '     maximum valence (i.e. minimum possible nb of colors) for inner = ',maxval_count_ibool_inner
+  endif
+
+  deallocate(count_ibool)
+
+  end subroutine count_mesh_valence
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine balance_colors_Droux(ibool,is_on_a_slice_edge,ispec_is_d, &
+                                  myrank, nspec, nglob, &
+                                  color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                                  nspec_outer,nspec_inner,maxval_count_ibool_inner, &
+                                  mask_ibool,fail_safe)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: nspec,nglob
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+  logical, dimension(nglob) :: mask_ibool
+
+  integer, dimension(nspec) :: color
+
+  integer :: myrank
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements
+
+  integer :: nspec_outer,nspec_inner
+  integer :: maxval_count_ibool_inner
+
+  logical :: fail_safe
+
+  ! local parameters
+  logical, dimension(:), allocatable :: icolor_conflict_found
+  integer, dimension(:), allocatable :: nb_elems_in_this_color
+  integer :: ispec,ispec2,icolor,ncolors,icolormin,icolormax,icolor_chosen,nb_elems_in_color_chosen
+  integer :: nb_tries_of_Droux_1993,last_ispec_studied
+  integer :: ier
+
+  ! debug outupt
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     balancing colors: Droux algorithm'
+    write(IMAIN,*) '       initial number of outer element colors = ',nb_colors_outer_elements
+    write(IMAIN,*) '       initial number of inner element colors = ',nb_colors_inner_elements
+    write(IMAIN,*) '       initial number of total colors = ',nb_colors_outer_elements + nb_colors_inner_elements
+  endif
+
+  ! initial guess of number of colors needed
+  if( maxval_count_ibool_inner > 0 .and. maxval_count_ibool_inner < nb_colors_inner_elements ) then
+    ! uses maximum valence to estimate number of colors for Droux
+    nb_colors_inner_elements = maxval_count_ibool_inner
+  endif
+
+  !! DK DK do it for inner elements only for now
+  ! Droux optimization run
+  nb_tries_of_Droux_1993 = 1
+
+  ! entry point to re-try Droux
+  765 continue
+
+  ! initial guess of number of colors needed
+  ncolors = nb_colors_outer_elements + nb_colors_inner_elements
+
+  ! debug output
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     Droux optimization: try = ',nb_tries_of_Droux_1993,'colors = ',ncolors
+  endif
+
+  icolormin = nb_colors_outer_elements + 1
+  icolormax = ncolors
+
+  ! allocates temporary arrays
+  allocate(nb_elems_in_this_color(ncolors), &
+          icolor_conflict_found(ncolors),stat=ier)
+  if( ier /= 0 ) stop 'error allocating nb_elems_in_this_color arrays'
+
+  nb_elems_in_this_color(:) = 0
+  mask_ibool(:) = .false.
+  last_ispec_studied = -1
+
+  do ispec = 1,nspec
+    ! domain elements only
+    if(ispec_is_d(ispec)) then
+
+      ! only inner elements
+      if (is_on_a_slice_edge(ispec)) cycle
+
+      ! unmark the eight corners of the previously marked element
+      if(last_ispec_studied > 0) then
+        mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false.
+        mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false.
+        mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false.
+        mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false.
+        mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false.
+        mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false.
+        mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+        mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+      endif
+      icolor_conflict_found(icolormin:icolormax) = .false.
+
+      ! mark the eight corners of the current element
+      mask_ibool(ibool(1,1,1,ispec)) = .true.
+      mask_ibool(ibool(NGLLX,1,1,ispec)) = .true.
+      mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true.
+      mask_ibool(ibool(1,NGLLY,1,ispec)) = .true.
+      mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true.
+      mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true.
+      mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true.
+      mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true.
+      last_ispec_studied = ispec
+
+      if(ispec > 1) then
+        do ispec2 = 1,ispec - 1
+          ! domain elements only
+          if(ispec_is_d(ispec2)) then
+
+            ! only inner elements
+            if (is_on_a_slice_edge(ispec2)) cycle
+
+            ! if conflict already found previously with this color, no need to test again
+            if (icolor_conflict_found(color(ispec2))) cycle
+
+            ! test the eight corners of the current element for a common point with element under study
+            if (mask_ibool(ibool(1,1,1,ispec2)) .or. &
+                mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. &
+                mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. &
+                mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. &
+                mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. &
+                mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. &
+                mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. &
+                mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) &
+              icolor_conflict_found(color(ispec2)) = .true.
+
+          endif ! domain elements
+        enddo
+      endif
+
+      ! check if the Droux 1993 algorithm found a solution
+      if (all(icolor_conflict_found(icolormin:icolormax))) then
+        ! user output
+        !if(myrank == 0) write(IMAIN,*) '     Droux 1993 algorithm did not find any solution for ncolors = ',ncolors
+
+        ! try with one more color
+        if(nb_tries_of_Droux_1993 < MAX_NB_TRIES_OF_DROUX_1993) then
+          nb_colors_inner_elements = nb_colors_inner_elements + 1
+          deallocate(nb_elems_in_this_color)
+          deallocate(icolor_conflict_found)
+          nb_tries_of_Droux_1993 = nb_tries_of_Droux_1993 + 1
+          goto 765
+        else
+          ! fail-safe mechanism: if Droux 1993 still fails after all the tries with one more color,
+          ! then go back to my original simple and fast coloring algorithm
+          fail_safe = .true.
+          return
+        endif
+      endif
+
+      ! loop on all the colors to determine the color with the smallest number
+      ! of elements and for which there is no conflict
+      nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value
+      icolor_chosen = 0
+      do icolor = icolormin,icolormax
+        if (.not. icolor_conflict_found(icolor) .and. nb_elems_in_this_color(icolor) < nb_elems_in_color_chosen) then
+          icolor_chosen = icolor
+          nb_elems_in_color_chosen = nb_elems_in_this_color(icolor)
+        endif
+      enddo
+
+      ! store the color finally chosen
+      color(ispec) = icolor_chosen
+      nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1
+
+    endif ! domain elements
+  enddo
+
+  ! debug output
+  if(myrank == 0) then
+    write(IMAIN,*) '     created a total of ',maxval(color),' colors in this domain' ! 'for all the domain elements of the mesh'
+    if( nb_colors_outer_elements > 0 ) &
+      write(IMAIN,*) '     typical nb of elements per color for outer elements should be ', &
+        nspec_outer / nb_colors_outer_elements
+    if( nb_colors_inner_elements > 0 ) &
+      write(IMAIN,*) '     typical nb of elements per color for inner elements should be ', &
+        nspec_inner / nb_colors_inner_elements
+  endif
+
+
+  end subroutine balance_colors_Droux
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine balance_colors_simple(ibool,is_on_a_slice_edge,ispec_is_d, &
+                                  myrank, nspec, nglob, &
+                                  color, nb_colors_outer_elements, nb_colors_inner_elements, &
+                                  nspec_outer,nspec_inner,mask_ibool)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: nspec,nglob
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d
+  logical, dimension(nglob) :: mask_ibool
+
+  integer, dimension(nspec) :: color
+
+  integer :: myrank
+  integer :: nb_colors_outer_elements,nb_colors_inner_elements
+
+  integer :: nspec_outer,nspec_inner
+
+  ! local parameters
+  logical, dimension(:), allocatable :: icolor_conflict_found
+  integer, dimension(:), allocatable :: nb_elems_in_this_color
+  integer :: ispec,ispec2,icolor,ncolors,icolormin,icolormax,icolor_chosen,nb_elems_in_color_chosen
+  integer :: last_ispec_studied
+  integer :: target_nb_elems_per_color,icolor_target
+  integer :: ier
+
+  ! debug outupt
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     balancing colors: simple algorithm'
+    write(IMAIN,*) '       number of outer element colors = ',nb_colors_outer_elements
+    write(IMAIN,*) '       number of inner element colors = ',nb_colors_inner_elements
+    write(IMAIN,*) '       number of total colors = ',nb_colors_outer_elements + nb_colors_inner_elements
+  endif
+
+  ! balances colors in postprocess if Droux (1993) is not used
+  ncolors = nb_colors_outer_elements + nb_colors_inner_elements
+
+  ! allocates temporary arrays
+  allocate(nb_elems_in_this_color(ncolors), &
+          icolor_conflict_found(ncolors),stat=ier)
+  if( ier /= 0 ) stop 'error allocating nb_elems_in_this_color arrays'
+
+  !! DK DK do it for outer elements
+  icolormin = 1
+  icolormax = nb_colors_outer_elements
+
+  ! ideal value if all colors are perfectly balanced
+  if( nb_colors_outer_elements > 0 ) then
+    target_nb_elems_per_color = nspec_outer / nb_colors_outer_elements + 1
+  else
+    target_nb_elems_per_color = 1
+  endif
+
+  ! print *,'nspec_outer,target_nb_elems_per_color = ',nspec_outer,target_nb_elems_per_color
+
+  ! count the initial number of elements in each color
+  nb_elems_in_this_color(:) = 0
+  do icolor = icolormin,icolormax
+    nb_elems_in_this_color(icolor) = count(color == icolor)
+  enddo
+
+  ! do not balance the last one, because it will be balanced automatically by the others
+  do icolor = icolormin,icolormax-1
+
+    ! if color is already balanced, do nothing
+    ! (this works because in the initial set the number of elements per color decreases when the color number increases)
+    if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) cycle
+
+    mask_ibool(:) = .false.
+    last_ispec_studied = -1
+
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+
+        ! only outer elements
+        if (.not. is_on_a_slice_edge(ispec)) cycle
+
+        ! only elements of this color
+        if (color(ispec) /= icolor) cycle
+
+        ! if color is now balanced because we have moved enough elements then stop searching
+        if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) exit
+
+        ! unmark the eight corners of the previously marked element
+        if(last_ispec_studied > 0) then
+          mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+        endif
+        icolor_conflict_found(icolormin:icolormax) = .false.
+        icolor_conflict_found(icolor) = .true. ! cannot move element to the color it already has
+
+        ! mark the eight corners of the current element
+        mask_ibool(ibool(1,1,1,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,1,1,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true.
+        mask_ibool(ibool(1,NGLLY,1,ispec)) = .true.
+        mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true.
+        last_ispec_studied = ispec
+
+        ! test if we can move this element to another color
+        do ispec2 = 1,nspec
+          ! domain elements only
+          if(ispec_is_d(ispec2)) then
+
+            ! do not test that element itself
+            if (ispec2 == ispec) cycle
+
+            ! only outer elements
+            if (.not. is_on_a_slice_edge(ispec2)) cycle
+
+            ! if conflict already found previously with this color, no need to test again
+            if (icolor_conflict_found(color(ispec2))) cycle
+
+            ! test the eight corners of the current element for a common point with element under study
+            if (mask_ibool(ibool(1,1,1,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. &
+              mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. &
+              mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) &
+                icolor_conflict_found(color(ispec2)) = .true.
+          endif ! domain elements
+        enddo
+
+        ! if color is already above target size for a balanced set, do not move to it
+        do icolor_target = icolormin,icolormax
+          if(nb_elems_in_this_color(icolor_target) >= target_nb_elems_per_color) &
+            icolor_conflict_found(icolor_target) = .true.
+        enddo
+
+        ! if cannot find any other color to move this element to
+        if (all(icolor_conflict_found(icolormin:icolormax))) cycle
+
+        ! loop on all the colors to determine the color with the smallest number of elements
+        ! and for which there is no conflict
+        nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value
+        icolor_chosen = 0
+        do icolor_target = icolormin,icolormax
+          if (.not. icolor_conflict_found(icolor_target) .and. &
+             nb_elems_in_this_color(icolor_target) < nb_elems_in_color_chosen) then
+            icolor_chosen = icolor_target
+            nb_elems_in_color_chosen = nb_elems_in_this_color(icolor_target)
+          endif
+        enddo
+
+        ! move the element to that new color
+        ! remove element from its current color
+        nb_elems_in_this_color(color(ispec)) = nb_elems_in_this_color(color(ispec)) - 1
+        color(ispec) = icolor_chosen
+        ! and add it to the new color
+        nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1
+
+      endif ! domain elements
+    enddo
+
+  enddo ! icolor
+
+!! DK DK do it for inner elements
+
+  icolormin = nb_colors_outer_elements + 1
+  icolormax = ncolors
+
+  ! ideal value if all colors are perfectly balanced
+  if( nb_colors_inner_elements > 0 ) then
+    target_nb_elems_per_color = nspec_inner / nb_colors_inner_elements + 1
+  else
+    target_nb_elems_per_color = 1
+  endif
+  ! print *,'nspec_inner,target_nb_elems_per_color = ',nspec_inner,target_nb_elems_per_color
+
+  ! count the initial number of elements in each color
+  nb_elems_in_this_color(:) = 0
+  do icolor = icolormin,icolormax
+    nb_elems_in_this_color(icolor) = count(color == icolor)
+  enddo
+
+  ! do not balance the last one, because it will be balanced automatically by the others
+  do icolor = icolormin,icolormax-1
+
+    ! if color is already balanced, do nothing
+    ! (this works because in the initial set the number of elements per color decreases when the color number increases)
+    if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) cycle
+
+    mask_ibool(:) = .false.
+    last_ispec_studied = -1
+
+    do ispec = 1,nspec
+      ! domain elements only
+      if(ispec_is_d(ispec)) then
+
+        ! only inner elements
+        if (is_on_a_slice_edge(ispec)) cycle
+
+        ! only elements of this color
+        if (color(ispec) /= icolor) cycle
+
+        ! if color is now balanced because we have moved enough elements then stop searching
+        if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) exit
+
+        ! unmark the eight corners of the previously marked element
+        if(last_ispec_studied > 0) then
+          mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+          mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false.
+        endif
+        icolor_conflict_found(icolormin:icolormax) = .false.
+        icolor_conflict_found(icolor) = .true. ! cannot move element to the color it already has
+
+        ! mark the eight corners of the current element
+        mask_ibool(ibool(1,1,1,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,1,1,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true.
+        mask_ibool(ibool(1,NGLLY,1,ispec)) = .true.
+        mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true.
+        mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true.
+        last_ispec_studied = ispec
+
+        ! test if we can move this element to another color
+        do ispec2 = 1,nspec
+          ! domain elements only
+          if(ispec_is_d(ispec2)) then
+
+            ! do not test that element itself
+            if (ispec2 == ispec) cycle
+
+            ! only inner elements
+            if (is_on_a_slice_edge(ispec2)) cycle
+
+            ! if conflict already found previously with this color, no need to test again
+            if (icolor_conflict_found(color(ispec2))) cycle
+
+            ! test the eight corners of the current element for a common point with element under study
+            if (mask_ibool(ibool(1,1,1,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. &
+              mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. &
+              mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. &
+              mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) &
+                icolor_conflict_found(color(ispec2)) = .true.
+
+          endif ! domain elements
+        enddo
+
+        ! if color is already above target size for a balanced set, do not move to it
+        do icolor_target = icolormin,icolormax
+          if(nb_elems_in_this_color(icolor_target) >= target_nb_elems_per_color) &
+            icolor_conflict_found(icolor_target) = .true.
+        enddo
+
+        ! if cannot find any other color to move this element to
+        if (all(icolor_conflict_found(icolormin:icolormax))) cycle
+
+        ! loops on all the colors to determine the color with the smallest number of elements
+        ! and for which there is no conflict
+        nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value
+        icolor_chosen = 0
+        do icolor_target = icolormin,icolormax
+          if (.not. icolor_conflict_found(icolor_target) .and. &
+            nb_elems_in_this_color(icolor_target) < nb_elems_in_color_chosen) then
+            icolor_chosen = icolor_target
+            nb_elems_in_color_chosen = nb_elems_in_this_color(icolor_target)
+          endif
+        enddo
+
+        ! moves the element to that new color
+        ! remove element from its current color
+        nb_elems_in_this_color(color(ispec)) = nb_elems_in_this_color(color(ispec)) - 1
+        color(ispec) = icolor_chosen
+        ! and add it to the new color
+        nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1
+
+      endif ! domain elements
+    enddo
+
+  enddo ! icolor
+
+  end subroutine balance_colors_simple
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_final_perm(color,perm,first_elem_number_in_this_color, &
+                            nspec,nb_colors,nb_colors_outer_elements, &
+                            ispec_is_d,nspec_domain)
+
+  integer, intent(in) :: nspec,nb_colors
+
+  integer,dimension(nspec), intent(in) :: color
+  integer,dimension(nspec), intent(inout) :: perm
+
+  integer, intent(inout) :: first_elem_number_in_this_color(nb_colors)
+
+  logical,dimension(nspec),intent(in) :: ispec_is_d
+
+  integer,intent(in) :: nb_colors_outer_elements,nspec_domain
+
+  ! local parameters
+  integer :: ispec,icolor,icounter,counter_outer
+
+  ! note: permutations are only valid within each domain
+  !          also, the counters start at 1 for each inner/outer element range
+
+  ! outer elements first ( note: inner / outer order sensitive)
+  icounter = 1
+  do icolor = 1, nb_colors_outer_elements
+    first_elem_number_in_this_color(icolor) = icounter
+    do ispec = 1, nspec
+      ! elements in this domain only
+      if( ispec_is_d(ispec) ) then
+        if(color(ispec) == icolor) then
+          perm(ispec) = icounter
+          icounter = icounter + 1
+        endif
+      endif
+    enddo
+  enddo
+  counter_outer = icounter - 1
+
+  ! inner elements second
+  icounter = 1
+  do icolor = nb_colors_outer_elements+1, nb_colors
+    first_elem_number_in_this_color(icolor) = icounter + counter_outer
+    do ispec = 1, nspec
+      ! elements in this domain only
+      if( ispec_is_d(ispec) ) then
+        ! outer elements
+        if(color(ispec) == icolor) then
+          perm(ispec) = icounter
+          icounter = icounter + 1
+        endif
+      endif
+    enddo
+  enddo
+
+  ! checks
+  if( counter_outer + icounter -1 /= nspec_domain ) then
+    print*,'error: perm: ',nspec_domain,counter_outer,icounter,counter_outer+icounter-1
+    stop 'error get_final_perm: counter incomplete'
+  endif
+
+  end subroutine get_final_perm
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+! PERMUTATIONS
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of real (CUSTOM_REAL) type
+
+  subroutine permute_elements_real(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  real(kind=CUSTOM_REAL), intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_real
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of integer type
+
+  subroutine permute_elements_integer(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  integer, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_integer
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of double precision type
+
+  subroutine permute_elements_dble(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  double precision, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: &
+    array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:,:,:,:) = array_to_permute(:,:,:,:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec)
+  enddo
+
+  end subroutine permute_elements_dble
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! implement permutation of elements for arrays of double precision type
+
+  subroutine permute_elements_logical1D(array_to_permute,temp_array,perm,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer, intent(in) :: nspec
+  integer, intent(in), dimension(nspec) :: perm
+
+  logical, intent(inout), dimension(nspec) :: array_to_permute,temp_array
+
+  integer old_ispec,new_ispec
+
+  ! copy the original array
+  temp_array(:) = array_to_permute(:)
+
+  do old_ispec = 1,nspec
+    new_ispec = perm(old_ispec)
+    array_to_permute(new_ispec) = temp_array(old_ispec)
+  enddo
+
+  end subroutine permute_elements_logical1D
diff --git a/src/generate_databases/memory_eval.f90 b/src/generate_databases/memory_eval.f90
index a0c592728..1942bc415 100644
--- a/src/generate_databases/memory_eval.f90
+++ b/src/generate_databases/memory_eval.f90
@@ -152,10 +152,10 @@ end subroutine memory_eval
 ! compute the approximate amount of static memory needed to run the mesher
 
  subroutine memory_eval_mesher(myrank,nspec,npointot,nnodes_ext_mesh, &
-              nelmnts_ext_mesh,nmat_ext_mesh,num_interfaces_ext_mesh, &
-              max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, &
-              nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top, &
-              static_memory_size_request)
+                              nelmnts_ext_mesh,nmat_ext_mesh,num_interfaces_ext_mesh, &
+                              max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, &
+                              nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top, &
+                              static_memory_size_request)
 
   implicit none
 
@@ -166,7 +166,7 @@ subroutine memory_eval_mesher(myrank,nspec,npointot,nnodes_ext_mesh, &
            max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, &
            nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top
 
-  integer,intent(inout) :: static_memory_size_request
+  double precision,intent(inout) :: static_memory_size_request
 
   ! local parameters
   integer :: static_memory_size
diff --git a/src/generate_databases/model_1d_cascadia.f90 b/src/generate_databases/model_1d_cascadia.f90
index c3fc2d054..92bcec695 100644
--- a/src/generate_databases/model_1d_cascadia.f90
+++ b/src/generate_databases/model_1d_cascadia.f90
@@ -48,7 +48,7 @@ subroutine model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
   ! local parameters
   real(kind=CUSTOM_REAL) :: x,y,z
   real(kind=CUSTOM_REAL) :: depth
-  real(kind=CUSTOM_REAL) :: elevation,distmin  
+  real(kind=CUSTOM_REAL) :: elevation,distmin
 
   ! converts GLL point location to real
   x = xmesh
@@ -61,19 +61,19 @@ subroutine model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
   call get_topo_elevation_free_closest(x,y,elevation,distmin, &
                     nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
                     num_free_surface_faces,free_surface_ispec,free_surface_ijk)
-  
+
   ! depth in Z-direction
-  if( distmin < HUGEVAL ) then  
+  if( distmin < HUGEVAL ) then
     depth = elevation - z
   else
     depth = - z
   endif
 
   ! depth in km
-  depth = depth / 1000.0  
+  depth = depth / 1000.0
 
   ! 1D profile Cascadia
-  
+
   ! super-imposes values
   if( depth < 1.0 ) then
     ! vp in m/s
diff --git a/src/generate_databases/model_1d_prem.f90 b/src/generate_databases/model_1d_prem.f90
index 185c59335..93dacd96d 100644
--- a/src/generate_databases/model_1d_prem.f90
+++ b/src/generate_databases/model_1d_prem.f90
@@ -51,29 +51,29 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte
   implicit none
 
   ! GLL point location
-  double precision, intent(in) :: xmesh,ymesh,zmesh  
-  
+  double precision, intent(in) :: xmesh,ymesh,zmesh
+
   ! material properties
   real(kind=CUSTOM_REAL), intent(inout) :: rho_prem,vp_prem,vs_prem,qmu_atten
-  
+
   ! local parameters
   real(kind=CUSTOM_REAL) :: xloc,yloc,zloc
   real(kind=CUSTOM_REAL) :: depth
-  real(kind=CUSTOM_REAL) :: elevation,distmin  
+  real(kind=CUSTOM_REAL) :: elevation,distmin
   double precision :: x,rho,drhodr,vp,vs,Qkappa,Qmu
   double precision :: &
-      R_EARTH,RICB,RCMB,RTOPDDOUBLEPRIME, &
+      R_EARTH_M,RICB,RCMB,RTOPDDOUBLEPRIME, &
       R771,R600,R670,R400,R220,R80,RMOHO,RMIDDLE_CRUST,ROCEAN
   double precision :: r
 
   ! uses crustal values from other models (like crust2.0) than prem
   ! set to .false. to use PREM crustal values, otherwise will take mantle values up to surface
-  logical,parameter :: CRUSTAL = .false. 
+  logical,parameter :: CRUSTAL = .false.
   ! avoids crustal values, uses Moho values up to the surface
   logical,parameter :: SUPPRESS_CRUSTAL_MESH = .false.
   ! same properties everywhere in PREM crust if we decide to define only one layer in the crust
   logical,parameter :: ONE_CRUST = .false.
-  
+
   ! GLL point location converted to real
   xloc = xmesh
   yloc = ymesh
@@ -85,16 +85,16 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte
   call get_topo_elevation_free_closest(xloc,yloc,elevation,distmin, &
                     nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
                     num_free_surface_faces,free_surface_ispec,free_surface_ijk)
-  
+
   ! depth in Z-direction
-  if( distmin < HUGEVAL ) then  
+  if( distmin < HUGEVAL ) then
     depth = elevation - zloc
   else
     depth = - zloc
   endif
 
   ! PREM layers (in m)
-  R_EARTH = 6371000.d0
+  R_EARTH_M = 6371000.d0
   ROCEAN = 6368000.d0
   RMIDDLE_CRUST = 6356000.d0
   RMOHO = 6346600.d0
@@ -113,10 +113,18 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte
 
   ! normalized radius
   x = r / R_EARTH
-  
+
   ! given a normalized radius x, gives the non-dimensionalized density rho,
   ! speeds vp and vs, and the quality factors Qkappa and Qmu
 
+  ! initializes
+  rho = 0.d0
+  vp = 0.d0
+  vs = 0.d0
+  Qmu = 0.d0
+  Qkappa = 0.d0
+  drhodr = 0.d0
+
   !
   !--- inner core
   !
@@ -271,5 +279,102 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte
   vs_prem=vs*1000.0d0
 
   qmu_atten = Qmu
-  
+
   end subroutine model_1D_prem_iso
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!PB ROUTINE WHICH ASSINGS AT EACH GLL POINT THE VALUES OF v_p v_s AND rho TAKEN FROM THE PREM MODEL
+!PB IT'S CALLED IN get_model.f90(:149) WITHIN THE generate_databases PROCESS
+
+  subroutine model_1D_PREM_routine_PB(xloc,yloc,zloc,ro_prem,vp_prem,vs_prem,idom)
+
+  use generate_databases_par,only: CUSTOM_REAL
+  implicit none
+
+  double precision, intent(in) :: xloc,yloc,zloc
+  real(kind=CUSTOM_REAL),intent(inout) :: ro_prem,vp_prem,vs_prem
+  integer, intent(in)          :: idom
+
+  ! local parameters
+  double precision             :: r0,r,x_prem
+  !double precision             :: ro_prem,vp_prem,vs_prem
+  !character(len=3), intent(in) :: param !rho, vs,vp
+
+  r0=sqrt(xloc**2+yloc**2+zloc**2)
+  !  print*,'xloc,yloc,zloc,r0,idom',xloc,yloc,zloc,r0,idom
+  r=r0/1000.
+
+  x_prem=r/6371.     ! Radius (normalized to x(surface)=1 )
+  !  print*,'x_prem',x_prem
+  IF(idom==1)THEN        ! upper crustal layer
+     !print*,'I am in domain 1'
+     ro_prem=2.6
+     vp_prem=5.8
+     vs_prem=3.2
+  !    print*,'ro,vp,vs form domain 1',ro_prem,vp_prem,vs_prem
+  ELSEIF(idom==2)THEN
+     ro_prem=2.9                       ! lower crustal layer
+     vp_prem=6.8
+     vs_prem=3.9
+  ELSEIF(idom==3)THEN
+     ro_prem=2.691+.6924*x_prem             ! upper mantle
+     vp_prem=4.1875+3.9382*x_prem
+     vs_prem=2.1519+2.3481*x_prem
+  ELSEIF(idom==4)THEN
+     ro_prem=7.1089-3.8045*x_prem
+     vp_prem=20.3926-12.2569*x_prem
+     vs_prem=8.9496-4.4597*x_prem
+  ELSEIF(idom==5)THEN
+     ro_prem=11.2494-8.0298*x_prem
+     vp_prem=39.7027-32.6166*x_prem
+     vs_prem=22.3512-18.5856*x_prem
+  ELSEIF(idom==6)THEN
+     ro_prem=5.3197-1.4836*x_prem
+     vp_prem=19.0957-9.8672*x_prem
+     vs_prem=9.9839-4.9324*x_prem
+  ELSEIF(idom==7)THEN   !lower mantle
+     ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3
+     vp_prem=29.2766-23.6027*x_prem+5.5242*x_prem**2-2.5514*x_prem**3
+     vs_prem=22.3459-17.2473*x_prem-2.0834*x_prem**2+0.9783*x_prem**3
+  ELSEIF(idom==8)THEN
+     ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3
+     vp_prem=24.9520-40.4673*x_prem+51.4832*x_prem**2-26.6419*x_prem**3
+     vs_prem=11.1671-13.7818*x_prem+17.4575*x_prem**2-9.2777*x_prem**3
+  ELSEIF(idom==9)THEN
+     ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3
+     vp_prem=15.3891-5.3181*x_prem+5.5242*x_prem**2-2.5514*x_prem**3
+     vs_prem=6.9254+1.4672*x_prem-2.0834*x_prem**2+.9783*x_prem**3
+  ELSEIF(idom==10)THEN  ! outer core
+     ro_prem=12.5815-1.2638*x_prem-3.6426*x_prem**2-5.5281*x_prem**3
+     vp_prem=11.0487-4.0362*x_prem+4.8023*x_prem**2-13.5732*x_prem**3
+     vs_prem=0.00
+  ELSEIF(idom==11)THEN                        ! inner core
+     ro_prem=13.0885-8.8381*x_prem**2
+     vp_prem=11.2622-6.3640*x_prem**2
+     vs_prem=3.6678-4.4475*x_prem**2
+  ENDIF
+
+  !    print*,'ro,vp,vs passed from routine and not multiplied',ro_prem,vp_prem,vs_prem
+
+  ro_prem=ro_prem*1000
+  vp_prem=vp_prem*1000
+  vs_prem=vs_prem*1000
+
+  !   print*,'ro,vp,vs passed from PREM_ROUTINE multiplied by 1000',ro_prem,vp_prem,vs_prem
+
+  !  if (param=='rho') then
+  !     prem_sub=ro_prem*1000.
+  !  elseif (param=='v_p') then
+  !     prem_sub=vp_prem*1000.
+  !  elseif (param=='v_s') then
+  !     prem_sub=vs_prem*1000.
+  !  else
+  !     write(6,*)'ERROR IN PREM_SUB FUNCTION:',param,'NOT AN OPTION'
+  !     stop
+
+  end subroutine model_1D_PREM_routine_PB
+
diff --git a/src/generate_databases/model_1d_socal.f90 b/src/generate_databases/model_1d_socal.f90
index f9d95cdb4..05e813279 100644
--- a/src/generate_databases/model_1d_socal.f90
+++ b/src/generate_databases/model_1d_socal.f90
@@ -26,13 +26,13 @@
 
 !--------------------------------------------------------------------------------------------------
 !
-! 1D Southern California model 
+! 1D Southern California model
 !
 ! model is the standard model used in southern California:
-!   Kanamori and Hadley (1975), Dreger and Helmberger (1990), Wald-Hutton,Given (1995)   
+!   Kanamori and Hadley (1975), Dreger and Helmberger (1990), Wald-Hutton,Given (1995)
 !
 !--------------------------------------------------------------------------------------------------
-   
+
   subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
 
 ! given a GLL point, returns super-imposed velocity model values
@@ -49,12 +49,12 @@ subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: depth,x,y,z
-  
+
   ! mesh point location
   x = xmesh
   y = ymesh
   z = zmesh
-  
+
   ! depth in m
   depth = -zmesh
 
@@ -88,5 +88,5 @@ subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
 
   ! no attenuation information
   qmu_atten = 0.d0
-  
+
   end subroutine model_1D_socal
diff --git a/src/generate_databases/model_aniso.f90 b/src/generate_databases/model_aniso.f90
index 7c1f7bc8e..736f9297f 100644
--- a/src/generate_databases/model_aniso.f90
+++ b/src/generate_databases/model_aniso.f90
@@ -316,5 +316,28 @@ subroutine model_aniso(iflag_aniso,rho,vp,vs, &
   c56 = - d46
   c66 = d66
 
+! unused: fills values with the isotropic model
+!  c11 = rho*vpv*vpv
+!  c12 = rho*(vpv*vpv-2.*vsv*vsv)
+!  c13 = c12
+!  c14 = 0.d0
+!  c15 = 0.d0
+!  c16 = 0.d0
+!  c22 = c11
+!  c23 = c12
+!  c24 = 0.d0
+!  c25 = 0.d0
+!  c26 = 0.d0
+!  c33 = c11
+!  c34 = 0.d0
+!  c35 = 0.d0
+!  c36 = 0.d0
+!  c44 = rho*vsv*vsv
+!  c45 = 0.d0
+!  c46 = 0.d0
+!  c55 = c44
+!  c56 = 0.d0
+!  c66 = c44
+
   end subroutine model_aniso
 
diff --git a/src/generate_databases/model_default.f90 b/src/generate_databases/model_default.f90
index 85cb89cc5..c6c2e27d6 100644
--- a/src/generate_databases/model_default.f90
+++ b/src/generate_databases/model_default.f90
@@ -49,7 +49,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
   integer, intent(in) :: nundefMat_ext_mesh
   character (len=30), dimension(6,nundefMat_ext_mesh):: undef_mat_prop
 
-  integer, intent(in) :: imaterial_id,imaterial_def  
+  integer, intent(in) :: imaterial_id,imaterial_def
 
   double precision, intent(in) :: xmesh,ymesh,zmesh
 
@@ -60,11 +60,11 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
 
   real(kind=CUSTOM_REAL) :: kappa_s,kappa_f,kappa_fr,mu_fr,rho_s,rho_f,phi,tort,eta_f, &
                            kxx,kxy,kxz,kyy,kyz,kzz
-  
+
   ! local parameters
   integer :: iflag,flag_below,flag_above
   integer :: iundef
-  
+
   ! check if the material is known or unknown
   if( imaterial_id > 0 ) then
     ! gets velocity model as specified by (cubit) mesh files for elastic & acoustic
@@ -74,8 +74,8 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
     idomain_id = materials_ext_mesh(6,imaterial_id)
 
     select case( idomain_id )
-        
-    case( IDOMAIN_ACOUSTIC,IDOMAIN_ELASTIC) 
+
+    case( IDOMAIN_ACOUSTIC,IDOMAIN_ELASTIC)
       ! elastic or acoustic
 
       ! density
@@ -93,7 +93,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
       ! anisotropy
       iflag_aniso = materials_ext_mesh(5,imaterial_id)
 
-    case( IDOMAIN_POROELASTIC ) 
+    case( IDOMAIN_POROELASTIC )
       ! poroelastic
       ! materials_ext_mesh format:
       ! rho_s,kappa_s,rho_f,kappa_f,eta_f,kappa_fr,mu_fr,phi,tort,kxx,kxy,kxz,kyy,kyz,kzz
@@ -120,9 +120,9 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
     case default
       print*,'error: domain id = ',idomain_id,'not recognized'
       stop 'error: domain not recognized'
-      
+
     end select
-    
+
   else if ( imaterial_def == 1 ) then
 
     stop 'material: interface not implemented yet'
@@ -142,7 +142,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
     rho = materials_ext_mesh(1,iflag)
     vp = materials_ext_mesh(2,iflag)
     vs = materials_ext_mesh(3,iflag)
-    qmu_atten = materials_ext_mesh(4,iflag)    
+    qmu_atten = materials_ext_mesh(4,iflag)
     iflag_aniso = materials_ext_mesh(5,iflag)
     idomain_id = materials_ext_mesh(6,iflag)
 
@@ -154,7 +154,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, &
     call model_tomography(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
 
     ! no anisotropy
-    iflag_aniso = 0   
+    iflag_aniso = 0
 
     ! sets acoustic/elastic domain as given in materials properties
     iundef = - imaterial_id    ! iundef must be positive
diff --git a/src/generate_databases/model_external_values.f90 b/src/generate_databases/model_external_values.f90
index 789e7f891..15433822a 100644
--- a/src/generate_databases/model_external_values.f90
+++ b/src/generate_databases/model_external_values.f90
@@ -124,7 +124,7 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani
   use create_regions_mesh_ext_par
   implicit none
 
-  ! GLL point 
+  ! GLL point
   double precision, intent(in) :: xmesh,ymesh,zmesh
 
   ! density, Vp and Vs
@@ -172,14 +172,14 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani
                                   nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
                                   num_free_surface_faces,free_surface_ispec,free_surface_ijk)
 
-                    
+
   ! depth in Z-direction
-  if( distmin < HUGEVAL ) then  
+  if( distmin < HUGEVAL ) then
     depth = elevation - z
   else
     depth = zmin - z
   endif
-  
+
   ! normalizes depth between 0 and 1
   if( abs( zmax - zmin ) > TINYVAL ) depth = depth / (zmax - zmin)
 
@@ -204,6 +204,6 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani
   iflag_aniso = 0
 
   ! elastic material
-  idomain_id = IDOMAIN_ELASTIC  
-  
+  idomain_id = IDOMAIN_ELASTIC
+
   end subroutine model_external_values
diff --git a/src/generate_databases/model_gll.f90 b/src/generate_databases/model_gll.f90
index eb0c537a3..e3622f558 100644
--- a/src/generate_databases/model_gll.f90
+++ b/src/generate_databases/model_gll.f90
@@ -39,14 +39,14 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH)
 
   use create_regions_mesh_ext_par
   implicit none
-  
+
   integer, intent(in) :: myrank,nspec
   character(len=256) :: LOCAL_PATH
-    
-  ! local parameters    
+
+  ! local parameters
   real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read
   integer :: ier
-  character(len=256) :: prname_lp,filename  
+  character(len=256) :: prname_lp,filename
 
   ! processors name
   write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_'
@@ -60,7 +60,11 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH)
   if( ier /= 0 ) stop 'error allocating array rho_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'rho.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading rho.bin file'
+  endif
 
   read(28) rho_read
   close(28)
@@ -70,7 +74,11 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH)
   if( ier /= 0 ) stop 'error allocating array vp_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'vp.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vp.bin file'
+  endif
 
   read(28) vp_read
   close(28)
@@ -80,8 +88,12 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH)
   if( ier /= 0 ) stop 'error allocating array vs_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'vs.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
-  
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vs.bin file'
+  endif
+
   read(28) vs_read
   close(28)
 
diff --git a/src/generate_databases/model_ipati.f90 b/src/generate_databases/model_ipati.f90
new file mode 100644
index 000000000..f5739bd7f
--- /dev/null
+++ b/src/generate_databases/model_ipati.f90
@@ -0,0 +1,113 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+!--------------------------------------------------------------------------------------------------
+!
+! IPATI
+!
+! based on given rho and vp structure for GLL files
+!
+!--------------------------------------------------------------------------------------------------
+
+  subroutine model_ipati(myrank,nspec,LOCAL_PATH)
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer, intent(in) :: myrank,nspec
+  character(len=256) :: LOCAL_PATH
+
+  ! local parameters
+  real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read
+  integer :: ier
+  character(len=256) :: prname_lp,filename
+
+  ! -----------------------------------------------------------------------------
+
+  ! note: vp not vs structure is available (as is often the case in exploration seismology),
+  ! scaling factor
+  real, parameter :: SCALING_FACTOR = 1.0/1.8
+
+  ! -----------------------------------------------------------------------------
+
+  ! user output
+  if (myrank==0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'using external IPATI model from:',trim(LOCAL_PATH)
+    write(IMAIN,*) 'scaling factor: ',SCALING_FACTOR
+    write(IMAIN,*)
+  endif
+
+  ! processors name
+  write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_'
+
+  ! density
+  allocate( rho_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array rho_read'
+
+  filename = prname_lp(1:len_trim(prname_lp))//'rho.bin'
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading rho.bin file'
+  endif
+
+  read(28) rho_read
+  close(28)
+
+  ! vp
+  allocate( vp_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array vp_read'
+
+  filename = prname_lp(1:len_trim(prname_lp))//'vp.bin'
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vp.bin file'
+  endif
+
+  read(28) vp_read
+  close(28)
+
+  ! vs scaled from vp
+  allocate( vs_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array vs_read'
+
+  ! scaling
+  vs_read = vp_read * SCALING_FACTOR
+
+  ! isotropic model parameters
+  rhostore    = rho_read
+  kappastore  = rhostore * ( vp_read * vp_read - FOUR_THIRDS * vs_read * vs_read )
+  mustore     = rhostore * vs_read * vs_read
+  rho_vp = rhostore * vp_read
+  rho_vs = rhostore * vs_read
+
+  ! free memory
+  deallocate( rho_read,vp_read,vs_read)
+
+  end subroutine model_ipati
diff --git a/src/generate_databases/model_salton_trough.f90 b/src/generate_databases/model_salton_trough.f90
index 7e17ff64d..4776887ae 100644
--- a/src/generate_databases/model_salton_trough.f90
+++ b/src/generate_databases/model_salton_trough.f90
@@ -43,7 +43,7 @@ module salton_trough_par
   double precision, parameter :: GOCAD_ST_V_X = 109670.74, GOCAD_ST_V_Y = 71530.72
   double precision, parameter :: GOCAD_ST_W_Z =  7666.334
   double precision, parameter :: GOCAD_ST_NO_DATA_VALUE = -99999
-  
+
   real,dimension(:,:,:),allocatable :: vp_array
 
   end module salton_trough_par
@@ -63,8 +63,8 @@ subroutine model_salton_trough_broadcast(myrank)
 
   ! local parameters
   integer :: ier
-  
-  
+
+
   allocate(vp_array(GOCAD_ST_NU,GOCAD_ST_NV,GOCAD_ST_NW),stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating vp_array for salton')
 
@@ -92,11 +92,11 @@ subroutine read_salton_sea_model()
 
   ! array length
   reclen=(GOCAD_ST_NU * GOCAD_ST_NV * GOCAD_ST_NW) * 4
-  
-  ! file name  
+
+  ! file name
   call get_value_string(SALTON_SEA_MODEL_FILE,'model.SALTON_SEA_MODEL_FILE', &
                        'DATA/st_3D_block_harvard/regrid3_vel_p.bin')
- 
+
   ! reads in file values
   open(11,file=trim(SALTON_SEA_MODEL_FILE), &
         status='old',action='read',form='unformatted',access='direct',recl=reclen,iostat=ios)
@@ -104,10 +104,10 @@ subroutine read_salton_sea_model()
     print *,'error opening file: ',trim(SALTON_SEA_MODEL_FILE),' iostat = ', ios
     call exit_mpi(0,'Error opening file salton trough')
   endif
-  
-  read(11,rec=1,iostat=ios) vp_array  
+
+  read(11,rec=1,iostat=ios) vp_array
   if (ios /= 0) stop 'Error reading vp_array'
-  
+
   close(11)
 
   end subroutine read_salton_sea_model
@@ -126,7 +126,7 @@ subroutine model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
   use create_regions_mesh_ext_par
   implicit none
 
-  ! GLL point 
+  ! GLL point
   double precision, intent(in) :: xmesh,ymesh,zmesh
 
   ! density, Vp and Vs
@@ -139,20 +139,20 @@ subroutine model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten)
   double precision :: uc,vc,wc
   double precision :: vp_st,vs_st,rho_st
 
-  ! GLL point location converted to u,v,w  
+  ! GLL point location converted to u,v,w
   call vx_xyz2uvw(xmesh,ymesh,zmesh,uc,vc,wc)
 
   ! model values
   call vx_xyz_interp(uc,vc,wc,vp_st,vs_st,rho_st)
-  
+
   ! converts to custom real
   vp = vp_st
   vs = vs_st
   rho = rho_st
-  
+
   ! no attenuation info
   qmu_atten = 0.0
-  
+
   end subroutine model_salton_trough
 
 !
@@ -221,7 +221,7 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho)
     v8 = vp_array(i,j+1,k+1)
     vi = vp_array(i+ixi,j+ieta,k+iga)
     !    print *, v1, v2, v3, v4, v5, v6, v7, v8
-    
+
     if ((v1 - GOCAD_ST_NO_DATA_VALUE) > eps .and. &
        (v2 - GOCAD_ST_NO_DATA_VALUE) > eps .and. &
        (v3 - GOCAD_ST_NO_DATA_VALUE) > eps .and. &
@@ -240,7 +240,7 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho)
                 v8 * (1-xi) * eta * ga)
     else if ((vi - GOCAD_ST_NO_DATA_VALUE) > eps) then
       vp = dble(vi)
-      
+
   !    else if ((v1 - GOCAD_ST_NO_DATA_VALUE) > eps) then
   !      vp = dble(v1)
   !    else if ((v2 - GOCAD_ST_NO_DATA_VALUE) > eps) then
@@ -257,21 +257,21 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho)
   !      vp = dble(v7)
   !    else if ((v7 - GOCAD_ST_NO_DATA_VALUE) > eps) then
   !      vp = dble(v8)
-  
+
     else
       vp = GOCAD_ST_NO_DATA_VALUE
     endif
-    
+
     ! depth
     zmesh = wc / (GOCAD_ST_NW - 1) * GOCAD_ST_W_Z + GOCAD_ST_O_Z
-    
+
     ! vs
     if (zmesh > -8500.)  then
       vs = vp / (2 - (0.27*zmesh/(-8500)))
     else
       vs = vp/1.73
     endif
-    
+
     ! density
     if (vp > 2160.) then
       rho = vp/3 + 1280.
diff --git a/src/generate_databases/model_tomography.f90 b/src/generate_databases/model_tomography.f90
index bdb83bb4e..5b4fbd802 100644
--- a/src/generate_databases/model_tomography.f90
+++ b/src/generate_databases/model_tomography.f90
@@ -82,7 +82,7 @@ subroutine model_tomography_broadcast(myrank)
   !if(myrank == 0) call read_external_model()
   ! broadcast the information read on the master to the nodes, e.g.
   !call bcast_all_i(nrecord,1)
-  
+
   !if( myrank /= 0 ) then
   ! allocate( vp_tomography(1:nrecord) ,stat=ier)
   ! if( ier /= 0 ) stop 'error allocating array vp_tomography'
@@ -161,7 +161,6 @@ subroutine read_model_tomography(myrank)
 
   end subroutine read_model_tomography
 
-
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -179,7 +178,7 @@ subroutine model_tomography(xmesh,ymesh,zmesh,rho_final,vp_final,vs_final,qmu_at
   !double precision, intent(in) :: VP_MIN,VS_MIN,RHO_MIN,VP_MAX,VS_MAX,RHO_MAX
 
   double precision, intent(in) :: xmesh,ymesh,zmesh
-  
+
   real(kind=CUSTOM_REAL), intent(out) :: vp_final,vs_final,rho_final,qmu_atten
 
   ! local parameters
@@ -367,6 +366,6 @@ subroutine model_tomography(xmesh,ymesh,zmesh,rho_final,vp_final,vs_final,qmu_at
   if(rho_final > RHO_MAX) rho_final = RHO_MAX
 
   ! attenuation: arbitrary value, see maximum in constants.h
-  qmu_atten = ATTENUATION_COMP_MAXIMUM   
+  qmu_atten = ATTENUATION_COMP_MAXIMUM
 
   end subroutine model_tomography
diff --git a/src/generate_databases/save_arrays_solver.f90 b/src/generate_databases/save_arrays_solver.f90
index 5d4a3fa55..08da5c804 100644
--- a/src/generate_databases/save_arrays_solver.f90
+++ b/src/generate_databases/save_arrays_solver.f90
@@ -28,114 +28,124 @@
 ! for external mesh
 
   subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
-                    xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
-                    gammaxstore,gammaystore,gammazstore, &
-                    jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-                    rhostore,kappastore,mustore, &
-                    rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-                    rho_vpI,rho_vpII,rho_vsI, &
-                    rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                    OCEANS,rmass_ocean_load,NGLOB_OCEAN,&
+!                    xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
+!                    gammaxstore,gammaystore,gammazstore, &
+!                    jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
+!                    rhostore,kappastore,mustore, &
+!                    rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
+!                    rho_vpI,rho_vpII,rho_vsI, &
+!                    rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
+                    OCEANS, &
+!                    rmass_ocean_load,NGLOB_OCEAN,&
                     ibool, &
-                    xstore_dummy,ystore_dummy,zstore_dummy, &
-                    abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                    abs_boundary_ijk,abs_boundary_ispec, &
-                    num_abs_boundary_faces, &
-                    free_surface_normal,free_surface_jacobian2Dw, &
-                    free_surface_ijk,free_surface_ispec, &
-                    num_free_surface_faces, &
-                    coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                    coupling_ac_el_ijk,coupling_ac_el_ispec, &
-                    num_coupling_ac_el_faces, &
-                    coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-                    coupling_ac_po_ijk,coupling_ac_po_ispec, &
-                    num_coupling_ac_po_faces, &
-                    coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-                    coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-                    coupling_po_el_ispec,num_coupling_el_po_faces, &
+!                    xstore_dummy,ystore_dummy,zstore_dummy, &
+!                    abs_boundary_normal,abs_boundary_jacobian2Dw, &
+!                    abs_boundary_ijk,abs_boundary_ispec, &
+!                    num_abs_boundary_faces, &
+!                    free_surface_normal,free_surface_jacobian2Dw, &
+!                    free_surface_ijk,free_surface_ispec, &
+!                    num_free_surface_faces, &
+!                    coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+!                    coupling_ac_el_ijk,coupling_ac_el_ispec, &
+!                    num_coupling_ac_el_faces, &
+!                    coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
+!                    coupling_ac_po_ijk,coupling_ac_po_ispec, &
+!                    num_coupling_ac_po_faces, &
+!                    coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
+!                    coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
+!                    coupling_po_el_ispec,num_coupling_el_po_faces, &
                     num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                     max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-                    prname,SAVE_MESH_FILES, &
-                    ANISOTROPY,NSPEC_ANISO, &
-                    c11store,c12store,c13store,c14store,c15store,c16store, &
-                    c22store,c23store,c24store,c25store,c26store,c33store, &
-                    c34store,c35store,c36store,c44store,c45store,c46store, &
-                    c55store,c56store,c66store, &
-                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-                    nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-                    num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic)
+!                    prname, &
+                    SAVE_MESH_FILES, &
+                    ANISOTROPY &
+!                    NSPEC_ANISO, &
+!                    c11store,c12store,c13store,c14store,c15store,c16store, &
+!                    c22store,c23store,c24store,c25store,c26store,c33store, &
+!                    c34store,c35store,c36store,c44store,c45store,c46store, &
+!                    c55store,c56store,c66store, &
+!                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+!                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
+!                    nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
+!                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+!                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+!                    num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
+!                    num_colors_outer_acoustic,num_colors_inner_acoustic, &
+!                    num_elem_colors_acoustic, &
+!                    num_colors_outer_elastic,num_colors_inner_elastic, &
+!                    num_elem_colors_elastic, &
+              )
+
+  use create_regions_mesh_ext_par
 
   implicit none
 
-  include "constants.h"
+!  include "constants.h"
 
   integer :: nspec,nglob
 
 ! jacobian
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, &
-            etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, &
+!            etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs
 
 ! attenuation
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store
 
 ! material
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore
-  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore
-  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore
-  real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI
-  real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
-            rmass_solid_poroelastic,rmass_fluid_poroelastic
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore
+!  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore
+!  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore
+!  real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI
+!  real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
+!            rmass_solid_poroelastic,rmass_fluid_poroelastic
 ! ocean load
   logical :: OCEANS
-  integer :: NGLOB_OCEAN
-  real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load
+!  integer :: NGLOB_OCEAN
+!  real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load
 
 ! mesh coordinates
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
+!  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
 
 ! absorbing boundary surface
-  integer :: num_abs_boundary_faces
-  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
-  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ispec(num_abs_boundary_faces)
+!  integer :: num_abs_boundary_faces
+!  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
+!  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
+!  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
+!  integer :: abs_boundary_ispec(num_abs_boundary_faces)
 
 ! free surface
-  integer :: num_free_surface_faces
-  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
-  real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ispec(num_free_surface_faces)
+!  integer :: num_free_surface_faces
+!  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
+!  real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces)
+!  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
+!  integer :: free_surface_ispec(num_free_surface_faces)
 
 ! acoustic-elastic coupling surface
-  integer :: num_coupling_ac_el_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
+!  integer :: num_coupling_ac_el_faces
+!  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
+!  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
+!  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
 
 ! acoustic-poroelastic coupling surface
-  integer :: num_coupling_ac_po_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces)
-  integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces)
-  integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces)
+!  integer :: num_coupling_ac_po_faces
+!  real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces)
+!  integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces)
+!  integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces)
 
 ! elastic-poroelastic coupling surface
-  integer :: num_coupling_el_po_faces
-  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
-  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
-  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
+!  integer :: num_coupling_el_po_faces
+!  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
+!  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
 
 ! MPI interfaces
   integer :: num_interfaces_ext_mesh
@@ -146,35 +156,43 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
   integer :: max_nibool_interfaces_ext_mesh
 
 ! file name
-  character(len=256) prname
+!  character(len=256) prname
   logical :: SAVE_MESH_FILES
 
 ! anisotropy
   logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
+!  integer :: NSPEC_ANISO
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+!            c11store,c12store,c13store,c14store,c15store,c16store, &
+!            c22store,c23store,c24store,c25store,c26store,c33store, &
+!            c34store,c35store,c36store,c44store,c45store,c46store, &
+!            c55store,c56store,c66store
 
 ! material domain flags
-  logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
+!  logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
 
 ! inner/outer elements
-  logical,dimension(nspec) :: ispec_is_inner
-  integer :: nspec_inner_acoustic,nspec_outer_acoustic
-  integer :: nspec_inner_elastic,nspec_outer_elastic
-  integer :: nspec_inner_poroelastic,nspec_outer_poroelastic
+!  logical,dimension(nspec) :: ispec_is_inner
+!  integer :: nspec_inner_acoustic,nspec_outer_acoustic
+!  integer :: nspec_inner_elastic,nspec_outer_elastic
+!  integer :: nspec_inner_poroelastic,nspec_outer_poroelastic
 
-  integer :: num_phase_ispec_acoustic
-  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
+!  integer :: num_phase_ispec_acoustic
+!  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
 
-  integer :: num_phase_ispec_elastic
-  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+!  integer :: num_phase_ispec_elastic
+!  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
 
-  integer :: num_phase_ispec_poroelastic
-  integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic
+!  integer :: num_phase_ispec_poroelastic
+!  integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic
+
+  ! mesh coloring
+!  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+!  integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: &
+!    num_elem_colors_acoustic
+!  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+!  integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: &
+!    num_elem_colors_elastic
 
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: v_tmp
@@ -183,13 +201,15 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
   !real(kind=CUSTOM_REAL) :: minimum(1)
   integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
   integer :: ier,i
-  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
+!  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
   character(len=256) :: filename
 
   integer, dimension(:), allocatable :: iglob_tmp
   integer :: j,inum
 
-! saves mesh file proc***_external_mesh.bin
+  logical,parameter :: DEBUG = .false.
+
+  ! saves mesh file proc***_external_mesh.bin
   filename = prname(1:len_trim(prname))//'external_mesh.bin'
   open(unit=IOUT,file=trim(filename),status='unknown',action='write',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening database proc######_external_mesh.bin'
@@ -247,6 +267,8 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
 ! poroelastic
   call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION )
   if( POROELASTIC_SIMULATION ) then
+    stop 'not implemented yet: write rmass_solid_poroelastic .. '
+
     write(IOUT) rmass_solid_poroelastic
     write(IOUT) rmass_fluid_poroelastic
     write(IOUT) rhoarraystore
@@ -370,6 +392,18 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
     if(num_phase_ispec_poroelastic > 0 ) write(IOUT) phase_ispec_inner_poroelastic
   endif
 
+  ! mesh coloring
+  if( USE_MESH_COLORING_GPU ) then
+    if( ACOUSTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_acoustic,num_colors_inner_acoustic
+      write(IOUT) num_elem_colors_acoustic
+    endif
+    if( ELASTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_elastic,num_colors_inner_elastic
+      write(IOUT) num_elem_colors_elastic
+    endif
+  endif
+
   close(IOUT)
 
 
@@ -461,180 +495,181 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
                         xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
                         qmu_attenuation_store,filename)
 
+    deallocate(v_tmp)
+
     ! VTK file output
-    ! acoustic-elastic domains
-    if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then
-      ! saves points on acoustic-elastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_ac_el_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), &
-                                  coupling_ac_el_ijk(2,j,i), &
-                                  coupling_ac_el_ijk(3,j,i), &
-                                  coupling_ac_el_ispec(i) )
+    if( DEBUG ) then
+      ! acoustic-elastic domains
+      if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then
+        ! saves points on acoustic-elastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_ac_el_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), &
+                                    coupling_ac_el_ijk(2,j,i), &
+                                    coupling_ac_el_ijk(3,j,i), &
+                                    coupling_ac_el_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, &
-                        filename)
-
-      ! saves acoustic/elastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_acoustic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_elastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'acoustic_elastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
-
-      deallocate(iglob_tmp,v_tmp_i)
-    endif
-
-    ! saves free surface points
-    if( num_free_surface_faces > 0 ) then
-      ! saves free surface interface points
-      allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_free_surface_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), &
-                                  free_surface_ijk(2,j,i), &
-                                  free_surface_ijk(3,j,i), &
-                                  free_surface_ispec(i) )
+        filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, &
+                          filename)
+
+        ! saves acoustic/elastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_acoustic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_elastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'free_surface'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_free_surface_faces, &
-                        filename)
-
-      deallocate(iglob_tmp)
-    endif
-
-    ! debug: saves 1. MPI interface
-    !if( num_interfaces_ext_mesh >= 1 ) then
-    !  filename = prname(1:len_trim(prname))//'MPI_1_points'
-    !  call write_VTK_data_points(nglob, &
-    !                    xstore_dummy,ystore_dummy,zstore_dummy, &
-    !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
-    !                    nibool_interfaces_ext_mesh(1), &
-    !                    filename)
-    !endif
-
-    ! acoustic-poroelastic domains
-    if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
-      ! saves points on acoustic-poroelastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_ac_po_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), &
-                                  coupling_ac_po_ijk(2,j,i), &
-                                  coupling_ac_po_ijk(3,j,i), &
-                                  coupling_ac_po_ispec(i) )
+        filename = prname(1:len_trim(prname))//'acoustic_elastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
+
+        deallocate(iglob_tmp,v_tmp_i)
+      endif
+
+      ! saves free surface points
+      if( num_free_surface_faces > 0 ) then
+        ! saves free surface interface points
+        allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_free_surface_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), &
+                                    free_surface_ijk(2,j,i), &
+                                    free_surface_ijk(3,j,i), &
+                                    free_surface_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, &
-                        filename)
-
-      ! saves acoustic/poroelastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_acoustic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_poroelastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
-
-    deallocate(v_tmp_i,iglob_tmp)
-    endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION )
-
-    ! elastic-poroelastic domains
-    if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
-      ! saves points on elastic-poroelastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_el_po_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), &
-                                  coupling_el_po_ijk(2,j,i), &
-                                  coupling_el_po_ijk(3,j,i), &
-                                  coupling_el_po_ispec(i) )
+        filename = prname(1:len_trim(prname))//'free_surface'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_free_surface_faces, &
+                          filename)
+
+        deallocate(iglob_tmp)
+      endif
+
+      ! debug: saves 1. MPI interface
+      !if( num_interfaces_ext_mesh >= 1 ) then
+      !  filename = prname(1:len_trim(prname))//'MPI_1_points'
+      !  call write_VTK_data_points(nglob, &
+      !                    xstore_dummy,ystore_dummy,zstore_dummy, &
+      !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
+      !                    nibool_interfaces_ext_mesh(1), &
+      !                    filename)
+      !endif
+
+      ! acoustic-poroelastic domains
+      if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
+        ! saves points on acoustic-poroelastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_ac_po_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), &
+                                    coupling_ac_po_ijk(2,j,i), &
+                                    coupling_ac_po_ijk(3,j,i), &
+                                    coupling_ac_po_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, &
-                        filename)
-
-      ! saves elastic/poroelastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_elastic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_poroelastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
-
-    deallocate(v_tmp_i,iglob_tmp)
-    endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION
-
-    !debug: saves 1. MPI interface
-    !    if( num_interfaces_ext_mesh >= 1 ) then
-    !      filename = prname(1:len_trim(prname))//'MPI_1_points'
-    !      call write_VTK_data_points(nglob, &
-    !                        xstore_dummy,ystore_dummy,zstore_dummy, &
-    !                        ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
-    !                        nibool_interfaces_ext_mesh(1), &
-    !                        filename)
-    !    endif
-    !
-
-    deallocate(v_tmp)
+        filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, &
+                          filename)
+
+        ! saves acoustic/poroelastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_acoustic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_poroelastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
+        enddo
+        filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
+
+      deallocate(v_tmp_i,iglob_tmp)
+      endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION )
+
+      ! elastic-poroelastic domains
+      if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
+        ! saves points on elastic-poroelastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_el_po_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), &
+                                    coupling_el_po_ijk(2,j,i), &
+                                    coupling_el_po_ijk(3,j,i), &
+                                    coupling_el_po_ispec(i) )
+          enddo
+        enddo
+        filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, &
+                          filename)
+
+        ! saves elastic/poroelastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_elastic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_poroelastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
+        enddo
+        filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
+
+      deallocate(v_tmp_i,iglob_tmp)
+      endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION
+
+      !debug: saves 1. MPI interface
+      !if( num_interfaces_ext_mesh >= 1 ) then
+      !  filename = prname(1:len_trim(prname))//'MPI_1_points'
+      !  call write_VTK_data_points(nglob, &
+      !                    xstore_dummy,ystore_dummy,zstore_dummy, &
+      !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
+      !                    nibool_interfaces_ext_mesh(1), &
+      !                    filename)
+      !endif
+    endif ! DEBUG
 
   endif ! SAVE_MESH_FILES
 
diff --git a/src/meshfem3D/check_mesh_quality.f90 b/src/meshfem3D/check_mesh_quality.f90
index 8c262d69f..ad5fcaa6b 100644
--- a/src/meshfem3D/check_mesh_quality.f90
+++ b/src/meshfem3D/check_mesh_quality.f90
@@ -54,7 +54,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
 
   logical :: CREATE_VTK_FILES
   character(len=256) prname
-  
+
   ! local parameters
   integer :: ispec,ispec_min_edge_length,ispec_max_edge_length,ispec_max_skewness, &
        ispec_max_skewness_MPI,skewness_max_rank,NSPEC_ALL_SLICES
@@ -91,6 +91,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
   !logical :: USE_OPENDX
 
   !character(len=256):: line
+  integer,dimension(1) :: tmp_ispec_max_skewness,tmp_ispec_max_skewness_MPI
 
   ! debug: for vtk output
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: tmp1
@@ -123,6 +124,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
 
   ispec_min_edge_length = -1
   ispec_max_edge_length = -1
+  ispec_max_skewness = -1
 
   ! debug: for vtk output
   if( CREATE_VTK_FILES ) then
@@ -145,7 +147,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
      if(equiangle_skewness > equiangle_skewness_max) ispec_max_skewness = ispec
 
      if( CREATE_VTK_FILES ) tmp1(ispec) = equiangle_skewness
-     
+
      ! compute minimum and maximum of quality numbers
      equiangle_skewness_min = min(equiangle_skewness_min,equiangle_skewness)
      edge_aspect_ratio_min = min(edge_aspect_ratio_min,edge_aspect_ratio)
@@ -183,7 +185,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
 
 
   if((myrank == skewness_max_rank) .and. (myrank /= 0)) then
-     call send_i_t(ispec_max_skewness,1,0)
+     tmp_ispec_max_skewness(1) = ispec_max_skewness
+     call send_i_t(tmp_ispec_max_skewness,1,0)
   end if
 
 
@@ -191,7 +194,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
 
 
      if(skewness_max_rank /= myrank) then
-        call recv_i_t(ispec_max_skewness_MPI,1,skewness_max_rank)
+        call recv_i_t(tmp_ispec_max_skewness_MPI,1,skewness_max_rank)
+        ispec_max_skewness_MPI = tmp_ispec_max_skewness_MPI(1)
      else
         ispec_max_skewness_MPI = ispec_max_skewness
      end if
@@ -347,8 +351,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
   end if
 
   ! debug: for vtk output
-  if( CREATE_VTK_FILES ) then    
-    ! vtk file output    
+  if( CREATE_VTK_FILES ) then
+    ! vtk file output
     open(66,file=prname(1:len_trim(prname))//'skewness.vtk',status='unknown')
     write(66,'(a)') '# vtk DataFile Version 3.1'
     write(66,'(a)') 'material model VTK file'
@@ -357,7 +361,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
     write(66, '(a,i12,a)') 'POINTS ', nglob, ' float'
     do ipoin = 1,nglob
       write(66,*) sngl(x(ipoin)),sngl(y(ipoin)),sngl(z(ipoin))
-    enddo    
+    enddo
     write(66,*) ""
 
     ! note: indices for vtk start at 0
@@ -381,8 +385,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, &
       write(66,*) tmp1(ispec)
     enddo
     write(66,*) ""
-    close(66)                               
-                               
+    close(66)
+
     deallocate(tmp1)
   endif
 
diff --git a/src/meshfem3D/compute_parameters.f90 b/src/meshfem3D/compute_parameters.f90
index 1efb4186d..d2349d9c7 100644
--- a/src/meshfem3D/compute_parameters.f90
+++ b/src/meshfem3D/compute_parameters.f90
@@ -315,7 +315,7 @@ subroutine compute_parameters(NER,NEX_XI,NEX_ETA,NPROC_XI,NPROC_ETA, &
      NSPEC2DMAX_XMIN_XMAX = NSPEC2D_B_ETA
      NSPEC2DMAX_YMIN_YMAX = NSPEC2D_B_XI
 
-     !debug 
+     !debug
      !print*,'nspec minmax:',NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_yMAX
 
      ! theoretical number of Gauss-Lobatto points in radial direction
diff --git a/src/meshfem3D/create_regions_mesh.f90 b/src/meshfem3D/create_regions_mesh.f90
index a493f72f2..8433af266 100644
--- a/src/meshfem3D/create_regions_mesh.f90
+++ b/src/meshfem3D/create_regions_mesh.f90
@@ -397,7 +397,7 @@ subroutine create_regions_mesh(xgrid,ygrid,zgrid,ibool, &
     if( ier /= 0 ) stop 'error allocating array nodes_coords'
     nodes_coords(:,:) = 0.0d0
     ibool(:,:,:,:) = 0
-    
+
     do ispec=1,nspec
        ieoff = NGLLCUBE*(ispec-1)
        ilocnum = 0
@@ -413,7 +413,7 @@ subroutine create_regions_mesh(xgrid,ygrid,zgrid,ibool, &
           enddo
        enddo
     enddo
-    
+
     ! checks ibool range
     if(minval(ibool(:,:,:,:)) /= 1 .or. maxval(ibool(:,:,:,:)) /= nglob) then
        print*,'error ibool: maximum value ',maxval(ibool(:,:,:,:)) ,'should be ',nglob
diff --git a/src/meshfem3D/create_visual_files.f90 b/src/meshfem3D/create_visual_files.f90
index fca4e0f13..01e304d99 100644
--- a/src/meshfem3D/create_visual_files.f90
+++ b/src/meshfem3D/create_visual_files.f90
@@ -71,7 +71,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI
              ibool(2,2,2,ispec)
      end do
      close(64)
-     
+
   end if
 
 
@@ -127,7 +127,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI
   end if
 
   if( CREATE_VTK_FILES ) then
-    ! vtk file output    
+    ! vtk file output
     open(66,file=prname(1:len_trim(prname))//'mesh.vtk',status='unknown')
     write(66,'(a)') '# vtk DataFile Version 3.1'
     write(66,'(a)') 'material model VTK file'
@@ -136,7 +136,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI
     write(66, '(a,i12,a)') 'POINTS ', nglob, ' float'
     do ipoin = 1,nglob
       write(66,*) sngl(nodes_coords(ipoin,1)),sngl(nodes_coords(ipoin,2)),sngl(nodes_coords(ipoin,3))
-    enddo    
+    enddo
     write(66,*) ""
 
     ! note: indices for vtk start at 0
@@ -161,7 +161,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI
     enddo
     write(66,*) ""
     close(66)
-  
+
   endif
 
   call sync_all()
diff --git a/src/meshfem3D/meshfem3D.f90 b/src/meshfem3D/meshfem3D.f90
index a98eddaaa..fd88cd702 100644
--- a/src/meshfem3D/meshfem3D.f90
+++ b/src/meshfem3D/meshfem3D.f90
@@ -405,7 +405,7 @@ subroutine meshfem3D
       write(IMAIN,*) 'error: number of MPI processors actually run on: ',sizeprocs
       print*
       print*, 'error meshfem3D: number of processors supposed to run on: ',NPROC
-      print*, 'error meshfem3D: number of MPI processors actually run on: ',sizeprocs      
+      print*, 'error meshfem3D: number of MPI processors actually run on: ',sizeprocs
       print*
     endif
     call exit_MPI(myrank,'wrong number of MPI processes')
diff --git a/src/meshfem3D/store_boundaries.f90 b/src/meshfem3D/store_boundaries.f90
index 80508d785..0eb636698 100644
--- a/src/meshfem3D/store_boundaries.f90
+++ b/src/meshfem3D/store_boundaries.f90
@@ -90,21 +90,21 @@ subroutine store_boundaries(myrank,iboun,nspec,&
     ! on boundary: ymax
     if(iboun(4,ispec)) then
       ispecb4=ispecb4+1
-      if( ispecb4 > NSPEC2DMAX_YMIN_YMAX ) stop 'error NSPEC2DMAX_YMIN_YMAX too small'      
+      if( ispecb4 > NSPEC2DMAX_YMIN_YMAX ) stop 'error NSPEC2DMAX_YMIN_YMAX too small'
       ibelm_ymax(ispecb4)=ispec
     endif
 
     ! on boundary: bottom
     if(iboun(5,ispec)) then
       ispecb5=ispecb5+1
-      if( ispecb5 > NSPEC2D_BOTTOM ) stop 'error NSPEC2D_BOTTOM too small'      
+      if( ispecb5 > NSPEC2D_BOTTOM ) stop 'error NSPEC2D_BOTTOM too small'
       ibelm_bottom(ispecb5)=ispec
     endif
 
     ! on boundary: top
     if(iboun(6,ispec)) then
       ispecb6=ispecb6+1
-      if( ispecb6 > NSPEC2D_TOP ) stop 'error NSPEC2D_TOP too small'      
+      if( ispecb6 > NSPEC2D_TOP ) stop 'error NSPEC2D_TOP too small'
       ibelm_top(ispecb6)=ispec
     endif
 
diff --git a/src/shared/assemble_MPI_scalar.f90 b/src/shared/assemble_MPI_scalar.f90
index 1e1c13cc0..c74035ad4 100644
--- a/src/shared/assemble_MPI_scalar.f90
+++ b/src/shared/assemble_MPI_scalar.f90
@@ -356,4 +356,3 @@ subroutine assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
 
   end subroutine assemble_MPI_scalar_ext_mesh_w
 
-
diff --git a/src/shared/check_mesh_resolution.f90 b/src/shared/check_mesh_resolution.f90
index b435ffe81..ab9a38505 100644
--- a/src/shared/check_mesh_resolution.f90
+++ b/src/shared/check_mesh_resolution.f90
@@ -154,6 +154,7 @@ subroutine check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ystore,zs
       if( SAVE_MESH_FILES ) tmp1(ispec) = cmax
     endif
 
+
     ! suggested timestep
     dt_suggested = COURANT_SUGGESTED * distance_min / max( vpmax,vsmax )
     dt_suggested_glob = min( dt_suggested_glob, dt_suggested)
@@ -406,9 +407,11 @@ subroutine check_mesh_resolution_poro(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ysto
   !real(kind=CUSTOM_REAL),parameter :: NELEM_PER_WAVELENGTH = 1.5
 
   logical :: has_vs_zero,has_vp2_zero
+  real(kind=CUSTOM_REAL),dimension(1) :: tmp_val
 
   ! debug: for vtk output
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: tmp1,tmp2
+
   integer:: ier
   character(len=256) :: filename,prname
 
@@ -680,11 +683,15 @@ subroutine check_mesh_resolution_poro(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ysto
       model_speed_max = vsmax_glob
     endif
   endif
-  call bcast_all_cr(model_speed_max,1)
+  tmp_val(1) = model_speed_max
+  call bcast_all_cr(tmp_val,1)
+  model_speed_max = tmp_val(1)
 
   ! returns minimum period
   if( myrank == 0 ) min_resolved_period = pmax_glob
-  call bcast_all_cr(min_resolved_period,1)
+  tmp_val(1) = min_resolved_period
+  call bcast_all_cr(tmp_val,1)
+  min_resolved_period = tmp_val(1)
 
   ! debug: for vtk output
   if( SAVE_MESH_FILES ) then
diff --git a/src/shared/combine_vol_data.f90 b/src/shared/combine_vol_data.f90
index eeb380328..332bf5847 100644
--- a/src/shared/combine_vol_data.f90
+++ b/src/shared/combine_vol_data.f90
@@ -38,7 +38,7 @@ module vtk
 
     ! maximum number of slices
     integer,parameter :: MAX_NUM_NODES = 600
-    
+
   end module vtk
 
 !
@@ -76,9 +76,9 @@ program combine_paraview_data_ext_mesh
 
   integer :: i, ios, it, ier
   integer :: iproc, proc1, proc2, num_node
-  
+
   integer,dimension(MAX_NUM_NODES) :: node_list
-  
+
   integer :: np, ne, npp, nee, nelement, njunk
 
   character(len=256) :: sline, arg(6), filename, indir, outdir
@@ -382,9 +382,9 @@ subroutine cvd_count_totals_ext_mesh(num_node,node_list,LOCAL_PATH,&
   include 'constants.h'
 
   integer,intent(in) :: num_node
-  integer,dimension(MAX_NUM_NODES),intent(in) :: node_list  
+  integer,dimension(MAX_NUM_NODES),intent(in) :: node_list
   character(len=256),intent(in) :: LOCAL_PATH
-  
+
   integer,intent(out) :: npp,nee
   logical,intent(in) :: HIGH_RESOLUTION_MESH
 
diff --git a/src/shared/compute_arrays_source.f90 b/src/shared/compute_arrays_source.f90
new file mode 100644
index 000000000..ef6729311
--- /dev/null
+++ b/src/shared/compute_arrays_source.f90
@@ -0,0 +1,352 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine compute_arrays_source(ispec_selected_source, &
+             xi_source,eta_source,gamma_source,sourcearray, &
+             Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
+             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+             xigll,yigll,zigll,nspec)
+
+  implicit none
+
+  include "constants.h"
+
+  integer ispec_selected_source
+  integer nspec
+
+  double precision xi_source,eta_source,gamma_source
+  double precision Mxx,Myy,Mzz,Mxy,Mxz,Myz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xix,xiy,xiz,etax,etay,etaz, &
+        gammax,gammay,gammaz
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearray
+
+  double precision xixd,xiyd,xizd,etaxd,etayd,etazd,gammaxd,gammayd,gammazd
+
+! Gauss-Lobatto-Legendre points of integration and weights
+  double precision, dimension(NGLLX) :: xigll
+  double precision, dimension(NGLLY) :: yigll
+  double precision, dimension(NGLLZ) :: zigll
+
+! source arrays
+  double precision, dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrayd
+  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: G11,G12,G13,G21,G22,G23,G31,G32,G33
+  double precision, dimension(NGLLX) :: hxis,hpxis
+  double precision, dimension(NGLLY) :: hetas,hpetas
+  double precision, dimension(NGLLZ) :: hgammas,hpgammas
+
+  integer k,l,m
+
+! calculate G_ij for general source location
+! the source does not necessarily correspond to a Gauss-Lobatto point
+  do m=1,NGLLZ
+    do l=1,NGLLY
+      do k=1,NGLLX
+
+        xixd    = dble(xix(k,l,m,ispec_selected_source))
+        xiyd    = dble(xiy(k,l,m,ispec_selected_source))
+        xizd    = dble(xiz(k,l,m,ispec_selected_source))
+        etaxd   = dble(etax(k,l,m,ispec_selected_source))
+        etayd   = dble(etay(k,l,m,ispec_selected_source))
+        etazd   = dble(etaz(k,l,m,ispec_selected_source))
+        gammaxd = dble(gammax(k,l,m,ispec_selected_source))
+        gammayd = dble(gammay(k,l,m,ispec_selected_source))
+        gammazd = dble(gammaz(k,l,m,ispec_selected_source))
+
+        G11(k,l,m) = Mxx*xixd+Mxy*xiyd+Mxz*xizd
+        G12(k,l,m) = Mxx*etaxd+Mxy*etayd+Mxz*etazd
+        G13(k,l,m) = Mxx*gammaxd+Mxy*gammayd+Mxz*gammazd
+        G21(k,l,m) = Mxy*xixd+Myy*xiyd+Myz*xizd
+        G22(k,l,m) = Mxy*etaxd+Myy*etayd+Myz*etazd
+        G23(k,l,m) = Mxy*gammaxd+Myy*gammayd+Myz*gammazd
+        G31(k,l,m) = Mxz*xixd+Myz*xiyd+Mzz*xizd
+        G32(k,l,m) = Mxz*etaxd+Myz*etayd+Mzz*etazd
+        G33(k,l,m) = Mxz*gammaxd+Myz*gammayd+Mzz*gammazd
+
+      enddo
+    enddo
+  enddo
+
+! compute Lagrange polynomials at the source location
+  call lagrange_any(xi_source,NGLLX,xigll,hxis,hpxis)
+  call lagrange_any(eta_source,NGLLY,yigll,hetas,hpetas)
+  call lagrange_any(gamma_source,NGLLZ,zigll,hgammas,hpgammas)
+
+! calculate source array
+  do m=1,NGLLZ
+    do l=1,NGLLY
+      do k=1,NGLLX
+        call multiply_arrays_source(sourcearrayd,G11,G12,G13,G21,G22,G23, &
+                  G31,G32,G33,hxis,hpxis,hetas,hpetas,hgammas,hpgammas,k,l,m)
+      enddo
+    enddo
+  enddo
+
+! distinguish between single and double precision for reals
+  if(CUSTOM_REAL == SIZE_REAL) then
+    sourcearray(:,:,:,:) = sngl(sourcearrayd(:,:,:,:))
+  else
+    sourcearray(:,:,:,:) = sourcearrayd(:,:,:,:)
+  endif
+
+  end subroutine compute_arrays_source
+
+!=============================================================================
+
+  subroutine compute_arrays_adjoint_source(myrank, adj_source_file, &
+                    xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
+                    xigll,yigll,zigll, &
+                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
+  implicit none
+
+  include 'constants.h'
+
+! input
+  integer myrank, NSTEP, it_sub_adj, NTSTEP_BETWEEN_READ_ADJSRC
+
+  double precision xi_receiver, eta_receiver, gamma_receiver
+
+  character(len=*) adj_source_file
+
+! output
+  real(kind=CUSTOM_REAL),dimension(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ) :: adj_sourcearray
+
+! Gauss-Lobatto-Legendre points of integration and weights
+  double precision, dimension(NGLLX) :: xigll
+  double precision, dimension(NGLLY) :: yigll
+  double precision, dimension(NGLLZ) :: zigll
+
+  double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), &
+        hgammar(NGLLZ), hpgammar(NGLLZ)
+
+  real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM)
+
+  integer icomp, itime, i, j, k, ios, it_start, it_end
+  double precision :: junk
+  ! note: should have same order as orientation in write_seismograms_to_file()
+  character(len=3),dimension(NDIM) :: comp != (/ "BHE", "BHN", "BHZ" /)
+  character(len=256) :: filename
+
+  ! gets channel names
+  do icomp=1,NDIM
+    call write_channel_name(icomp,comp(icomp))
+  enddo
+
+  ! range of the block we need to read
+  it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
+  it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
+
+  !adj_sourcearray(:,:,:,:,:) = 0.
+  adj_src = 0._CUSTOM_REAL
+
+  ! loops over components
+  do icomp = 1, NDIM
+
+    filename = OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH))//'/../SEM/'//trim(adj_source_file)//'.'//comp(icomp)//'.adj'
+    open(unit=IIN,file=trim(filename),status='old',action='read',iostat = ios)
+    ! cycles to next file (this might be more error prone)
+    !if (ios /= 0) cycle
+    ! requires adjoint files to exist (users will have to be more careful in setting up adjoint runs)
+    if (ios /= 0) call exit_MPI(myrank, ' file '//trim(filename)//' does not exist - required for adjoint runs')
+
+    ! reads in adjoint source trace
+    !! skip unused blocks
+    do itime = 1, it_start-1
+      read(IIN,*,iostat=ios) junk, junk
+      if( ios /= 0 ) &
+        call exit_MPI(myrank, &
+          'file '//trim(filename)//' has wrong length, please check with your simulation duration (1111)')
+    enddo
+    !! read the block we need
+    do itime = it_start, it_end
+      read(IIN,*,iostat=ios) junk, adj_src(itime-it_start+1,icomp)
+      !!! used to check whether we read the correct block
+      ! if (icomp==1)      print *, junk, adj_src(itime-it_start+1,icomp)
+      if( ios /= 0 ) &
+        call exit_MPI(myrank, &
+          'file '//trim(filename)//' has wrong length, please check with your simulation duration (2222)')
+    enddo
+    close(IIN)
+
+  enddo
+
+  ! lagrange interpolators for receiver location
+  call lagrange_any(xi_receiver,NGLLX,xigll,hxir,hpxir)
+  call lagrange_any(eta_receiver,NGLLY,yigll,hetar,hpetar)
+  call lagrange_any(gamma_receiver,NGLLZ,zigll,hgammar,hpgammar)
+
+  ! interpolates adjoint source onto GLL points within this element
+  do k = 1, NGLLZ
+    do j = 1, NGLLY
+      do i = 1, NGLLX
+        adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
+      enddo
+    enddo
+  enddo
+
+end subroutine compute_arrays_adjoint_source
+
+
+! =======================================================================
+
+! compute array for acoustic source
+  subroutine compute_arrays_source_acoustic(xi_source,eta_source,gamma_source,&
+                        sourcearray,xigll,yigll,zigll,factor_source)
+
+  implicit none
+
+  include "constants.h"
+
+  double precision :: xi_source,eta_source,gamma_source
+  real(kind=CUSTOM_REAL) :: factor_source
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearray
+
+! Gauss-Lobatto-Legendre points of integration and weights
+  double precision, dimension(NGLLX) :: xigll
+  double precision, dimension(NGLLY) :: yigll
+  double precision, dimension(NGLLZ) :: zigll
+
+! local parameters
+! source arrays
+  double precision, dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrayd
+  double precision, dimension(NGLLX) :: hxis,hpxis
+  double precision, dimension(NGLLY) :: hetas,hpetas
+  double precision, dimension(NGLLZ) :: hgammas,hpgammas
+  integer :: i,j,k
+
+! initializes
+  sourcearray(:,:,:,:) = 0._CUSTOM_REAL
+  sourcearrayd(:,:,:,:) = 0.d0
+
+! computes Lagrange polynomials at the source location
+  call lagrange_any(xi_source,NGLLX,xigll,hxis,hpxis)
+  call lagrange_any(eta_source,NGLLY,yigll,hetas,hpetas)
+  call lagrange_any(gamma_source,NGLLZ,zigll,hgammas,hpgammas)
+
+! calculates source array for interpolated location
+  do k=1,NGLLZ
+    do j=1,NGLLY
+      do i=1,NGLLX
+        ! identical source array components in x,y,z-direction
+        sourcearrayd(:,i,j,k) = hxis(i)*hetas(j)*hgammas(k)*dble(factor_source)
+      enddo
+    enddo
+  enddo
+
+! distinguish between single and double precision for reals
+  if(CUSTOM_REAL == SIZE_REAL) then
+    sourcearray(:,:,:,:) = sngl(sourcearrayd(:,:,:,:))
+  else
+    sourcearray(:,:,:,:) = sourcearrayd(:,:,:,:)
+  endif
+
+  end subroutine compute_arrays_source_acoustic
+
+
+! testing read in adjoint sources block by block
+
+!!!the original version
+!!!
+!!!subroutine compute_arrays_adjoint_source(myrank, adj_source_file, &
+!!!                    xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
+!!!                    xigll,yigll,zigll,NSTEP)
+!!!
+!!!
+!!!  implicit none
+!!!
+!!!  include 'constants.h'
+!!!
+!!!! input
+!!!  integer myrank, NSTEP
+!!!
+!!!  double precision xi_receiver, eta_receiver, gamma_receiver
+!!!
+!!!  character(len=*) adj_source_file
+!!!
+!!!! output
+!!!  real(kind=CUSTOM_REAL),dimension(NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) :: adj_sourcearray
+!!!
+!!!! Gauss-Lobatto-Legendre points of integration and weights
+!!!  double precision, dimension(NGLLX) :: xigll
+!!!  double precision, dimension(NGLLY) :: yigll
+!!!  double precision, dimension(NGLLZ) :: zigll
+!!!
+!!!  double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), &
+!!!        hgammar(NGLLZ), hpgammar(NGLLZ)
+!!!
+!!!  real(kind=CUSTOM_REAL) :: adj_src(NSTEP,NDIM)
+!!!
+!!!  integer icomp, itime, i, j, k, ios
+!!!  double precision :: junk
+!!!  ! note: should have same order as orientation in write_seismograms_to_file()
+!!!  character(len=3),dimension(NDIM) :: comp = (/ "BHE", "BHN", "BHZ" /)
+!!!  character(len=256) :: filename
+!!!
+!!!  !adj_sourcearray(:,:,:,:,:) = 0.
+!!!  adj_src = 0._CUSTOM_REAL
+!!!
+!!!  ! loops over components
+!!!  do icomp = 1, NDIM
+!!!
+!!!    filename = 'SEM/'//trim(adj_source_file) // '.'// comp(icomp) // '.adj'
+!!!    open(unit=IIN,file=trim(filename),status='old',action='read',iostat = ios)
+!!!    if (ios /= 0) cycle ! cycles to next file
+!!!    !if (ios /= 0) call exit_MPI(myrank, ' file '//trim(filename)//'does not exist')
+!!!
+!!!    ! reads in adjoint source trace
+!!!    do itime = 1, NSTEP
+!!!
+!!!      ! things become a bit tricky because of the Newmark time scheme at
+!!!      ! the very beginning of the time loop. however, when we read in the backward/reconstructed
+!!!      ! wavefields at the end of the first time loop, we can use the adjoint source index from 1 to NSTEP
+!!!      ! (and then access it in reverse NSTEP-it+1  down to 1, for it=1,..NSTEP; see compute_add_sources*.f90).
+!!!      read(IIN,*,iostat=ios) junk, adj_src(itime,icomp)
+!!!      if( ios /= 0 ) &
+!!!        call exit_MPI(myrank, &
+!!!          'file '//trim(filename)//' has wrong length, please check with your simulation duration')
+!!!    enddo
+!!!    close(IIN)
+!!!
+!!!  enddo
+!!!
+!!!  ! lagrange interpolators for receiver location
+!!!  call lagrange_any(xi_receiver,NGLLX,xigll,hxir,hpxir)
+!!!  call lagrange_any(eta_receiver,NGLLY,yigll,hetar,hpetar)
+!!!  call lagrange_any(gamma_receiver,NGLLZ,zigll,hgammar,hpgammar)
+!!!
+!!!  ! interpolates adjoint source onto GLL points within this element
+!!!  do k = 1, NGLLZ
+!!!    do j = 1, NGLLY
+!!!      do i = 1, NGLLX
+!!!        adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
+!!!      enddo
+!!!    enddo
+!!!  enddo
+!!!
+!!!end subroutine compute_arrays_adjoint_source
+
diff --git a/src/shared/constants.h.in b/src/shared/constants.h.in
index 058a919d4..c769df374 100644
--- a/src/shared/constants.h.in
+++ b/src/shared/constants.h.in
@@ -70,10 +70,16 @@
 ! for optimized routines by Deville et al. (2002)
   integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY
 
+!!-----------------------------------------------------------
+!!
+!! seismogram and i/o output
+!!
+!!-----------------------------------------------------------
 ! ouput format of seismograms, ASCII or binary
   logical, parameter :: SEISMOGRAMS_BINARY = .false.
+
 ! output format of seismograms, Seismic Unix (binary with 240-byte-headers)
-  logical, parameter :: SU_FORMAT=.false.
+  logical, parameter :: SU_FORMAT = .false.
 
 ! input, output and main MPI I/O files
   integer, parameter :: ISTANDARD_OUTPUT = 6
@@ -101,6 +107,12 @@
 ! ignore variable name field (junk) at the beginning of each input line
   logical, parameter :: IGNORE_JUNK = .true.,DONT_IGNORE_JUNK = .false.
 
+!!-----------------------------------------------------------
+!!
+!! source/receiver setup
+!!
+!!-----------------------------------------------------------
+
 ! flag to print the details of source location
   logical, parameter :: SHOW_DETAILS_LOCATE_SOURCE = .false.
 
@@ -123,6 +135,12 @@
 ! use directory OUTPUT_FILES/ for seismogram output
   logical,parameter :: USE_OUTPUT_FILES_PATH = .true.
 
+!!-----------------------------------------------------------
+!!
+!! absorption and PML
+!!
+!!-----------------------------------------------------------
+
 ! absorb top surface
 ! (defined in mesh as 'free_surface_file')
   logical,parameter :: ABSORB_FREE_SURFACE = .false.
@@ -133,6 +151,12 @@
 ! (user parameters can be specified in PML_init.f90)
   logical,parameter :: ABSORB_USE_PML = .false.
 
+!!-----------------------------------------------------------
+!!
+!! directory structure
+!!
+!!-----------------------------------------------------------
+
 ! paths for inputs and outputs files
   character(len=256), parameter :: IN_DATA_FILES_PATH = '../in_data_files/'
   character(len=256), parameter :: MF_IN_DATA_FILES_PATH = '../in_data_files/meshfem3D_files/'
@@ -173,6 +197,9 @@
   double precision, parameter :: FACTOR_FORCE_SOURCE = 1.d15
   integer, parameter :: COMPONENT_FORCE_SOURCE = 3  ! takes direction in comp E/N/Z = 1/2/3
 
+! set to use a Ricker source time function instead of a gaussian
+  logical, parameter :: USE_RICKER_IPATI = .false.
+
 ! use this t0 as earliest starting time rather than the automatically calculated one
 ! (must be positive and bigger than the automatically one to be effective;
 !  simulation will start at t = - t0)
@@ -190,6 +217,13 @@
 ! that completes an orthonormal.
   logical, parameter :: EXT_MESH_RECV_NORMAL = .false.
 
+
+!!-----------------------------------------------------------
+!!
+!! image outputs
+!!
+!!-----------------------------------------------------------
+
 ! shakemaps and movies can not be generated during the same run. Mutually exclusive.
   logical, parameter :: EXTERNAL_MESH_MOVIE_SURFACE = .false.
   logical, parameter :: EXTERNAL_MESH_CREATE_SHAKEMAP = .false.
@@ -215,6 +249,26 @@
 ! this is an absolute value for normalized coordinates in the Earth
   double precision, parameter :: SMALLVAL_TOL = 1.d-10
 
+!!-----------------------------------------------------------
+!!
+!! mesh optimization
+!!
+!!-----------------------------------------------------------
+
+! add mesh coloring for the GPU + MPI implementation
+  logical, parameter :: USE_MESH_COLORING_GPU = .false.
+  integer, parameter :: MAX_NUMBER_OF_COLORS = 1000
+
+! enhanced coloring:
+!
+! using Droux algorithm
+! try several times with one more color before giving up
+  logical, parameter :: USE_DROUX_OPTIMIZATION = .false.
+  integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
+!
+! postprocess the colors to balance them if Droux (1993) algorithm is not used
+  logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .false.
+
 !------------------------------------------------------
 !----------- do not modify anything below -------------
 !------------------------------------------------------
@@ -316,12 +370,44 @@
 ! flag for projection from latitude/longitude to UTM, and back
   integer, parameter :: ILONGLAT2UTM = 0, IUTM2LONGLAT = 1
 
+!!-----------------------------------------------------------
+!!
+!! OCEANS load approximation
+!!
+!!-----------------------------------------------------------
 ! minimum thickness in meters to include the effect of the oceans
 ! to avoid taking into account spurious oscillations in topography model
   double precision, parameter :: MINIMUM_THICKNESS_3D_OCEANS = 10.d0
 ! density of sea water
   real(kind=CUSTOM_REAL), parameter :: RHO_OCEANS = 1020.0
 
+!!-----------------------------------------------------------
+!!
+!! GRAVITY
+!!
+!!-----------------------------------------------------------
+! gravitational constant
+  double precision, parameter :: GRAV = 6.6723d-11
+! number of layers in PREM
+  integer, parameter :: NR = 640
+! for lookup table for gravity every 100 m in radial direction of Earth model
+  integer, parameter :: NRAD_GRAVITY = 70000
+! R_EARTH is the radius of the bottom of the oceans (radius of Earth in m)
+  double precision, parameter :: R_EARTH = 6371000.d0
+! same radius in km
+  double precision, parameter :: R_EARTH_KM = R_EARTH / 1000.d0
+! radius of the Earth for gravity calculation
+  double precision, parameter :: R_EARTH_GRAVITY = 6371000.d0
+! radius of the ocean floor for gravity calculation
+  double precision, parameter :: ROCEAN_GRAVITY = 6368000.d0
+! average density in the full Earth to normalize equation
+  double precision, parameter :: RHOAV = 5514.3d0
+
+!!-----------------------------------------------------------
+!!
+!! DOMAINS
+!!
+!!-----------------------------------------------------------
 ! material domain ids
   integer, parameter :: IDOMAIN_ACOUSTIC    = 1
   integer, parameter :: IDOMAIN_ELASTIC     = 2
@@ -339,3 +425,5 @@
   integer, parameter :: IMODEL_USER_EXTERNAL    = 6
   integer, parameter :: IMODEL_GLL              = 7
   integer, parameter :: IMODEL_SALTON_TROUGH    = 8
+  integer, parameter :: IMODEL_1D_PREM_PB       = 9
+  integer, parameter :: IMODEL_IPATI            = 10
diff --git a/src/shared/detect_surface.f90 b/src/shared/detect_surface.f90
index a1a4c6629..17c6c80ab 100644
--- a/src/shared/detect_surface.f90
+++ b/src/shared/detect_surface.f90
@@ -703,9 +703,10 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,&
   real(kind=CUSTOM_REAL), dimension(nglob) :: xstore,ystore,zstore
 
 !local parameters
-  real(kind=CUSTOM_REAL) :: mindist
+  real(kind=CUSTOM_REAL) :: mindist,distance
   integer, dimension(:), allocatable :: valence_external_mesh
   integer :: ispec,i,j,k,iglob,ier,count
+  real(kind=CUSTOM_REAL),parameter :: TOLERANCE_DISTANCE = 0.9
 
 ! detecting surface points/elements (based on valence check on NGLL points) for external mesh
   allocate(valence_external_mesh(nglob),stat=ier)
@@ -722,6 +723,7 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,&
                   + (ystore(ibool(1,1,1,:)) - ystore(ibool(2,1,1,:)))**2 &
                   + (zstore(ibool(1,1,1,:)) - zstore(ibool(2,1,1,:)))**2 )
   mindist = sqrt(mindist)
+  distance = TOLERANCE_DISTANCE*mindist
 
 ! sets valence value to one corresponding to process rank  for points on cross-sections
   do ispec = 1, nspec
@@ -732,7 +734,7 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,&
 
           ! chooses points close to cross-section
           if( abs((xstore(iglob)-section_xorg)*section_nx + (ystore(iglob)-section_yorg)*section_ny &
-                 + (zstore(iglob)-section_zorg)*section_nz )  < 0.8*mindist ) then
+                 + (zstore(iglob)-section_zorg)*section_nz )  < distance ) then
             ! sets valence to 1 for points on cross-sections
             valence_external_mesh(iglob) = myrank+1
           endif
diff --git a/src/shared/get_element_face.f90 b/src/shared/get_element_face.f90
index 8c1f07159..2b068d36c 100644
--- a/src/shared/get_element_face.f90
+++ b/src/shared/get_element_face.f90
@@ -24,7 +24,8 @@
 !
 !=====================================================================
 
-subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,&
+
+  subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,&
                               ibool,nspec,nglob, &
                               xstore_dummy,ystore_dummy,zstore_dummy, &
                               iface_id )
@@ -174,13 +175,13 @@ subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,&
 
   endif
 
-end subroutine get_element_face_id
+  end subroutine get_element_face_id
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB )
+  subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB )
 
 ! returns local indices in ijk_face for specified face
 
@@ -368,10 +369,10 @@ subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB )
 end subroutine get_element_face_gll_indices
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, &
+  subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, &
                                 ibool,nspec,nglob, &
                                 xstore_dummy,ystore_dummy,zstore_dummy, &
                                 normal)
@@ -463,13 +464,13 @@ subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, &
   endif
   !print*,'face ',iface,'scalarproduct:',tmp
 
-end subroutine get_element_face_normal
+  end subroutine get_element_face_normal
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, &
+  subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, &
                                 ibool,nspec,nglob, &
                                 xstore_dummy,ystore_dummy,zstore_dummy, &
                                 normal,idirect)
@@ -565,5 +566,50 @@ subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, &
     idirect = 1
   endif
 
-end subroutine get_element_face_normal_idirect
+  end subroutine get_element_face_normal_idirect
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
+
+  implicit none
+
+  include "constants.h"
+
+  integer,intent(in) :: ispec,iface_ref,nspec,nglob
+
+  ! face corner locations
+  real(kind=CUSTOM_REAL),dimension(NGNOD2D),intent(out) :: xcoord,ycoord,zcoord
+  integer,dimension(NGNOD2D),intent(out):: iglob_corners_ref
+
+  ! index array
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  ! global point locations
+  real(kind=CUSTOM_REAL) :: xstore_dummy(nglob),ystore_dummy(nglob),zstore_dummy(nglob)
+
+  ! assumes NGNOD2D == 4
+  integer,dimension(3,4,6) :: iface_all_corner_ijk
+
+  ! local parameters
+  integer :: icorner,i,j,k
+
+  ! loops over corners
+  do icorner = 1,NGNOD2D
+    i = iface_all_corner_ijk(1,icorner,iface_ref)
+    j = iface_all_corner_ijk(2,icorner,iface_ref)
+    k = iface_all_corner_ijk(3,icorner,iface_ref)
+
+    ! global reference indices
+    iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+
+    ! reference corner coordinates
+    xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
+    ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
+    zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
+  enddo
 
+  end subroutine get_element_corners
diff --git a/src/shared/param_reader.c b/src/shared/param_reader.c
index 701f965c3..51896d89d 100644
--- a/src/shared/param_reader.c
+++ b/src/shared/param_reader.c
@@ -142,7 +142,7 @@ FC_FUNC_(param_read,PARAM_READ)(char * string_read, int * string_read_len, char
   /* Regular expression for parsing lines from param file.
    ** Good luck reading this regular expression.  Basically, the lines of
    ** the parameter file should be of the form 'parameter = value',
-   ** optionally followed by a #-delimited comment.  
+   ** optionally followed by a #-delimited comment.
    ** 'value' can be any number of space- or tab-separated words. Blank
    ** lines, lines containing only white space and lines whose first non-
    ** whitespace character is '#' are ignored.  White space is generally
diff --git a/src/shared/read_parameter_file.f90 b/src/shared/read_parameter_file.f90
index 8a6e75508..999beeee0 100644
--- a/src/shared/read_parameter_file.f90
+++ b/src/shared/read_parameter_file.f90
@@ -50,7 +50,7 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, &
   logical ANISOTROPY,SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION,SUPPRESS_UTM_PROJECTION
 
   character(len=256) LOCAL_PATH,CMTSOLUTION
-  
+
 ! local variables
   integer ::ios,icounter,isource,idummy,nproc_eta_old,nproc_xi_old
   double precision :: hdur,minval_hdur
@@ -101,7 +101,7 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, &
   ! define the velocity model
   call read_value_string(MODEL, 'model.MODEL')
   if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file: MODEL'
-  
+
   call read_value_logical(OCEANS, 'model.OCEANS')
   if(err_occurred() /= 0) return
   call read_value_logical(TOPOGRAPHY, 'model.TOPOGRAPHY')
@@ -178,15 +178,15 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, &
   call get_value_string(CMTSOLUTION, 'solver.CMTSOLUTION',&
        IN_DATA_FILES_PATH(1:len_trim(IN_DATA_FILES_PATH))//'CMTSOLUTION')
 
-  open(unit=1,file=CMTSOLUTION,iostat=ios,status='old',action='read')
+  open(unit=21,file=trim(CMTSOLUTION),iostat=ios,status='old',action='read')
   if(ios /= 0) stop 'error opening CMTSOLUTION file'
 
   icounter = 0
   do while(ios == 0)
-    read(1,"(a)",iostat=ios) dummystring
+    read(21,"(a)",iostat=ios) dummystring
     if(ios == 0) icounter = icounter + 1
   enddo
-  close(1)
+  close(21)
 
   if(mod(icounter,NLINES_PER_CMTSOLUTION_SOURCE) /= 0) &
     stop 'total number of lines in CMTSOLUTION file should be a multiple of NLINES_PER_CMTSOLUTION_SOURCE'
@@ -195,27 +195,27 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, &
   if(NSOURCES < 1) stop 'need at least one source in CMTSOLUTION file'
 
   ! compute the minimum value of hdur in CMTSOLUTION file
-  open(unit=1,file=CMTSOLUTION,status='old',action='read')
+  open(unit=21,file=trim(CMTSOLUTION),status='old',action='read')
   minval_hdur = HUGEVAL
   do isource = 1,NSOURCES
 
     ! skip other information
     do idummy = 1,3
-      read(1,"(a)") dummystring
+      read(21,"(a)") dummystring
     enddo
 
     ! read half duration and compute minimum
-    read(1,"(a)") dummystring
+    read(21,"(a)") dummystring
     read(dummystring(15:len_trim(dummystring)),*) hdur
     minval_hdur = min(minval_hdur,hdur)
 
     ! skip other information
     do idummy = 1,9
-      read(1,"(a)") dummystring
+      read(21,"(a)") dummystring
     enddo
 
   enddo
-  close(1)
+  close(21)
 
   ! one cannot use a Heaviside source for the movies
   if((MOVIE_SURFACE .or. MOVIE_VOLUME) .and. sqrt(minval_hdur**2 + HDUR_MOVIE**2) < TINYVAL) &
@@ -242,24 +242,62 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, &
   case( '1d_cascadia')
     IMODEL = IMODEL_1D_CASCADIA
 
-  ! user models  
+  ! user models
+  case( '1d_prem_pb' )
+    IMODEL = IMODEL_1D_PREM_PB
+  case( 'aniso' )
+    IMODEL = IMODEL_DEFAULT
+    ANISOTROPY = .true.
+  case( 'external' )
+    IMODEL = IMODEL_USER_EXTERNAL
+  case( 'ipati' )
+    IMODEL = IMODEL_IPATI
+  case( 'gll' )
+    IMODEL = IMODEL_GLL
   case( 'salton_trough')
     IMODEL = IMODEL_SALTON_TROUGH
   case( 'tomo' )
     IMODEL = IMODEL_TOMO
-  case( 'external' )
-    IMODEL = IMODEL_USER_EXTERNAL
-  case( 'aniso' )
-    IMODEL = IMODEL_DEFAULT
-    ANISOTROPY = .true.
-  case default  
+
+  case default
     print*
     print*,'********** model not recognized: ',trim(MODEL),' **************'
     print*,'********** using model: default',' **************'
     print*
     IMODEL = IMODEL_DEFAULT
   end select
-  
+
+  ! check
+  if( IMODEL == IMODEL_IPATI ) then
+    if( USE_RICKER_IPATI .eqv. .false. ) stop 'please set USE_RICKER_IPATI to true in shared/constants.h and recompile'
+  endif
 
   end subroutine read_parameter_file
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_gpu_mode(GPU_MODE,GRAVITY)
+
+  implicit none
+  include "constants.h"
+
+  logical :: GPU_MODE
+  logical :: GRAVITY
+
+  ! initializes flags
+  GPU_MODE = .false.
+  GRAVITY = .false.
+
+  ! opens file Par_file
+  call open_parameter_file()
+
+  call read_value_logical(GPU_MODE, 'solver.GPU_MODE')
+  call read_value_logical(GRAVITY, 'solver.GRAVITY')
+
+  ! close parameter file
+  call close_parameter_file()
+
+  end subroutine read_gpu_mode
+
diff --git a/src/shared/read_topo_bathy_file.f90 b/src/shared/read_topo_bathy_file.f90
index 4cded288b..f05313209 100644
--- a/src/shared/read_topo_bathy_file.f90
+++ b/src/shared/read_topo_bathy_file.f90
@@ -48,7 +48,7 @@ subroutine read_topo_bathy_file(itopo_bathy,NX_TOPO,NY_TOPO)
     print*,'error opening topography file: ',trim(TOPO_FILE)
     stop 'error opening topography file'
   endif
-  
+
   ! reads in values
   do iy=1,NY_TOPO
     do ix=1,NX_TOPO
@@ -74,7 +74,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, &
   include "constants.h"
 
   real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target
-  
+
   real(kind=CUSTOM_REAL),intent(out) :: target_elevation
 
   integer :: NX_TOPO,NY_TOPO
@@ -87,7 +87,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, &
   double precision :: xval,yval,long,lat
   double precision :: long_corner,lat_corner,ratio_xi,ratio_eta
   integer :: icornerlong,icornerlat
-            
+
   ! get coordinates of current point
   xval = dble(x_target)
   yval = dble(y_target)
@@ -127,7 +127,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, &
         itopo_bathy(icornerlong,icornerlat+1)*(1.-ratio_xi)*ratio_eta
 
   end subroutine get_topo_bathy_elevation
-  
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -143,10 +143,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
   include "constants.h"
 
   real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target
-  
+
   real(kind=CUSTOM_REAL),intent(out) :: target_elevation
   real(kind=CUSTOM_REAL),intent(out) :: target_distmin
-  
+
   integer :: NSPEC_AB,NGLOB_AB
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
@@ -160,7 +160,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
   integer, dimension(3,NGLLSQUARE,num_free_surface_faces) :: free_surface_ijk
 
   ! local parameters
-  real(kind=CUSTOM_REAL),dimension(4) :: elevation_node,dist_node  
+  real(kind=CUSTOM_REAL),dimension(4) :: elevation_node,dist_node
   real(kind=CUSTOM_REAL) :: distmin,dist
 
   integer :: iface,i,j,ispec,iglob,igll,jgll,kgll
@@ -172,26 +172,26 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
   integer,parameter :: MIDX = (NGLLX+1)/2
   integer,parameter :: MIDY = (NGLLY+1)/2
   integer,parameter :: MIDZ = (NGLLZ+1)/2
-  
+
   real(kind=CUSTOM_REAL) :: typical_size
   logical :: located_target
-  
+
   ! initialize
   target_elevation = 0.0_CUSTOM_REAL
   target_distmin = HUGEVAL
-  
-  
+
+
   if(num_free_surface_faces > 0) then
 
     ! computes typical size of elements at the surface (uses first element for estimation)
     if( USE_DISTANCE_CRITERION ) then
-      ispec = free_surface_ispec(1)    
+      ispec = free_surface_ispec(1)
       typical_size =  (xstore(ibool(1,1,1,ispec)) - xstore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 &
                     + (ystore(ibool(1,1,1,ispec)) - ystore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2
       ! use 10 times the distance as a criterion for point detection
       typical_size = 10. * typical_size
     endif
-    
+
     ! flag to check that we located at least one target element
     located_target = .false.
 
@@ -200,7 +200,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
     iselected = 2
     jselected = 2
     iface_selected = 1
-      
+
     ! loops over all free surface faces
     do iface=1,num_free_surface_faces
       ispec = free_surface_ispec(iface)
@@ -208,10 +208,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
       ! exclude elements that are too far from target
       if( USE_DISTANCE_CRITERION ) then
         iglob = ibool(MIDX,MIDY,MIDZ,ispec)
-        dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 
+        dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2
         if( dist > typical_size ) cycle
       endif
-      
+
       ! loop only on points inside the element
       ! exclude edges to ensure this point is not shared with other elements
       do j = 2,NGLLY - 1
@@ -220,13 +220,13 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
           igll = free_surface_ijk(1,(j-1)*NGLLY+i,iface)
           jgll = free_surface_ijk(2,(j-1)*NGLLY+i,iface)
           kgll = free_surface_ijk(3,(j-1)*NGLLY+i,iface)
-          
+
           iglob = ibool(igll,jgll,kgll,ispec)
 
           ! distance (squared) to target
           dist = ( x_target - xstore(iglob) )**2 + &
                  ( y_target - ystore(iglob) )**2
-                 
+
           ! keep this point if it is closer to the receiver
           if(dist < distmin) then
             distmin = dist
@@ -238,8 +238,8 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
             located_target = .true.
           endif
         enddo
-      enddo      
-    end do    
+      enddo
+    end do
 
     ! if we have not located a target element, the point is not in this slice
     ! therefore use first element only for fictitious iterative search
@@ -248,7 +248,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
       jselected = 2
       iface_selected = 1
     endif
-    
+
     !  weighted mean at current point of topography elevation of the four closest nodes
     !  set distance to huge initial value
     distmin = HUGEVAL
@@ -264,23 +264,23 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
             jgll = free_surface_ijk(2,(j-jadjust-1)*NGLLY+i-iadjust,iface_selected)
             kgll = free_surface_ijk(3,(j-jadjust-1)*NGLLY+i-iadjust,iface_selected)
             iglob = ibool(igll,jgll,kgll,ispec)
-      
+
             ! stores node infos
             inode = inode + 1
             elevation_node(inode) = zstore(iglob)
             dist_node(inode) = sqrt( (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 )
           end do
         end do
-        
+
         ! weighted elevation
         dist = sum( dist_node(:) )
         if(dist < distmin) then
-        
+
           ! sets new minimum distance (of all 4 closest nodes)
           distmin = dist
           target_distmin = distmin
-          
-          ! interpolates elevation 
+
+          ! interpolates elevation
           if( dist > TINYVAL ) then
             target_elevation =  (dist_node(1)/dist)*elevation_node(1) + &
                                 (dist_node(2)/dist)*elevation_node(2) + &
@@ -290,10 +290,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis
             stop 'error summed distance to node is zero'
           endif
         endif
-        
-      end do      
+
+      end do
     end do
-    
+
   end if
 
   end subroutine get_topo_elevation_free
@@ -313,10 +313,10 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta
   include "constants.h"
 
   real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target
-  
+
   real(kind=CUSTOM_REAL),intent(out) :: target_elevation
   real(kind=CUSTOM_REAL),intent(out) :: target_distmin
-  
+
   integer :: NSPEC_AB,NGLOB_AB
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
@@ -339,26 +339,26 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta
   integer,parameter :: MIDX = (NGLLX+1)/2
   integer,parameter :: MIDY = (NGLLY+1)/2
   integer,parameter :: MIDZ = (NGLLZ+1)/2
-  
+
   real(kind=CUSTOM_REAL) :: typical_size
   logical :: located_target
-  
+
   ! initialize
   target_elevation = 0.0_CUSTOM_REAL
   target_distmin = HUGEVAL
-  
-  
+
+
   if(num_free_surface_faces > 0) then
 
     ! computes typical size of elements at the surface (uses first element for estimation)
     if( USE_DISTANCE_CRITERION ) then
-      ispec = free_surface_ispec(1)    
+      ispec = free_surface_ispec(1)
       typical_size =  (xstore(ibool(1,1,1,ispec)) - xstore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 &
                     + (ystore(ibool(1,1,1,ispec)) - ystore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2
       ! use 10 times the distance as a criterion for point detection
       typical_size = 10. * typical_size
     endif
-    
+
     ! flag to check that we located at least one target element
     located_target = .false.
 
@@ -372,22 +372,22 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta
       ! excludes elements that are too far from target
       if( USE_DISTANCE_CRITERION ) then
         iglob = ibool(MIDX,MIDY,MIDZ,ispec)
-        dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 
+        dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2
         if( dist > typical_size ) cycle
       endif
-      
+
       ! loop only on points inside the element
       do i = 1,NGLLSQUARE
         igll = free_surface_ijk(1,i,iface)
         jgll = free_surface_ijk(2,i,iface)
         kgll = free_surface_ijk(3,i,iface)
-        
+
         iglob = ibool(igll,jgll,kgll,ispec)
 
         ! distance (squared) to target
         dist = ( x_target - xstore(iglob) )**2 + &
                ( y_target - ystore(iglob) )**2
-               
+
         ! keep this point if it is closer to the receiver
         if(dist < distmin) then
           distmin = dist
@@ -397,8 +397,8 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta
           target_distmin = dist
           located_target = .true.
         endif
-      enddo      
-    end do    
+      enddo
+    end do
 
     ! if we have not located a target element, the point is not in this slice
     ! therefore use first element only for fictitious iterative search
@@ -409,10 +409,10 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta
       ! elevation (given in z - coordinate)
       target_elevation = zstore(iglob)
       target_distmin = ( x_target - xstore(iglob) )**2 + ( y_target - ystore(iglob) )**2
-      located_target = .true.      
+      located_target = .true.
     endif
-        
+
   end if
 
   end subroutine get_topo_elevation_free_closest
-  
\ No newline at end of file
+
diff --git a/src/shared/serial.f90 b/src/shared/serial.f90
index aad0e25a5..97bc42672 100644
--- a/src/shared/serial.f90
+++ b/src/shared/serial.f90
@@ -38,6 +38,8 @@ end subroutine stop_all
 !
 
   double precision function wtime()
+
+  implicit none
   real :: ct
 
   ! note: for simplicity, we take cpu_time which returns the elapsed CPU time in seconds
@@ -45,6 +47,7 @@ double precision function wtime()
   call cpu_time(ct)
 
   wtime = ct
+
   end function wtime
 
 !
@@ -682,11 +685,10 @@ end subroutine recv_i_t
   subroutine send_dp(sendbuf, sendcount, dest, sendtag)
 
   implicit none
-  include "constants.h"
 
   integer dest,sendtag
   integer sendcount
-  real(kind=CUSTOM_REAL),dimension(sendcount):: sendbuf
+  double precision,dimension(sendcount):: sendbuf
 
   stop 'send_dp not implemented for serial code'
 
@@ -697,13 +699,12 @@ end subroutine send_dp
   subroutine recv_dp(recvbuf, recvcount, dest, recvtag)
 
   implicit none
-  include "constants.h"
 
   integer dest,recvtag
   integer recvcount
-  real(kind=CUSTOM_REAL),dimension(recvcount):: recvbuf
+  double precision,dimension(recvcount):: recvbuf
 
-  stop 'recv_dp not implemented for parallel code'
+  stop 'recv_dp not implemented for serial code'
 
   end subroutine recv_dp
 
diff --git a/src/shared/smooth_vol_data.f90 b/src/shared/smooth_vol_data.f90
index 9af47205f..9948b6bd8 100644
--- a/src/shared/smooth_vol_data.f90
+++ b/src/shared/smooth_vol_data.f90
@@ -73,7 +73,7 @@ program smooth_vol_data
   real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: dummy_2
   real(kind=CUSTOM_REAL), dimension(:,:,:),allocatable :: dummy_3
   real(kind=CUSTOM_REAL), dimension(:,:,:,:,:),allocatable :: dummy_5
-  
+
   integer, dimension(:),allocatable :: idummy
   logical, dimension(:),allocatable :: ldummy
   integer, dimension(:,:,:),allocatable :: idummy_3
@@ -262,7 +262,7 @@ program smooth_vol_data
   ! reads mesh file
   !
   ! needs to get point locations, jacobians and MPI neighbours
-    
+
   ! opens external mesh file
   write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'/proc',myrank,'_'//'external_mesh.bin'
   open(unit=27,file=trim(prname_lp),&
@@ -273,7 +273,7 @@ program smooth_vol_data
     call exit_mpi(myrank, 'error reading external mesh file')
   endif
 
-  ! gets number of elements and global points for this partition          
+  ! gets number of elements and global points for this partition
   read(27) NSPEC_AB
   read(27) NGLOB_AB
 
@@ -292,7 +292,7 @@ program smooth_vol_data
   allocate(dummy(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           jacobian(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
   if( ier /= 0 ) stop 'error allocating array dummy and jacobian'
-  
+
   ! needs jacobian
   read(27) dummy ! xix
   read(27) dummy ! xiy
@@ -306,7 +306,7 @@ program smooth_vol_data
   read(27) jacobian
 
   ! now skips all until MPI section can be read in
-  
+
   ! reads in partiton neighbors
   read(27) dummy ! kappastore
   read(27) dummy ! mustore
@@ -331,7 +331,7 @@ program smooth_vol_data
     read(27) dummy_1 ! rmass_acoustic
     read(27) dummy ! rhostore
   endif
-  
+
   ! elastic
   if( ELASTIC_SIMULATION ) then
     read(27) dummy_1 ! rmass
@@ -341,28 +341,28 @@ program smooth_vol_data
     read(27) dummy ! rho_vp
     read(27) dummy ! rho_vs
   endif
-  
+
   ! poroelastic
   if( POROELASTIC_SIMULATION ) then
     read(27) dummy_1 ! rmass_solid_poroelastic
     read(27) dummy_1 ! rmass_fluid_poroelastic
     allocate(dummy_5(2,NGLLX,NGLLY,NGLLZ,NSPEC_AB))
     read(27) dummy_5 ! rhoarraystore
-    deallocate(dummy_5)    
+    deallocate(dummy_5)
     allocate(dummy_5(3,NGLLX,NGLLY,NGLLZ,NSPEC_AB))
     read(27) dummy_5 ! kappaarraystore
     deallocate(dummy_5)
     read(27) dummy ! etastore
-    read(27) dummy ! tortstore    
+    read(27) dummy ! tortstore
     allocate(dummy_5(6,NGLLX,NGLLY,NGLLZ,NSPEC_AB))
     read(27) dummy_5 ! permstore
     deallocate(dummy_5)
     read(27) dummy ! phistore
     read(27) dummy ! rho_vpI
     read(27) dummy ! rho_vpII
-    read(27) dummy ! rho_vsI    
+    read(27) dummy ! rho_vsI
   endif
-  
+
   deallocate(dummy_1)
   deallocate(dummy)
 
@@ -388,7 +388,7 @@ program smooth_vol_data
     read(27) dummy_3 ! abs_boundary_normal
     deallocate( idummy,idummy_3,dummy_2,dummy_3)
   endif
-  
+
   ! free surface
   read(27) idummy_a ! num_free_surface_faces
   if( idummy_a > 0 ) then
@@ -403,7 +403,7 @@ program smooth_vol_data
     read(27) dummy_3  ! free_surface_normal
     deallocate( idummy,idummy_3,dummy_2,dummy_3)
   endif
-  
+
   ! acoustic-elastic coupling surface
   read(27) idummy_a ! num_coupling_ac_el_faces
   if( idummy_a > 0 ) then
@@ -411,7 +411,7 @@ program smooth_vol_data
             idummy_3(3,NGLLSQUARE,idummy_a), &
             dummy_2(NGLLSQUARE,idummy_a), &
             dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array idummy etc.'  
+    if( ier /= 0 ) stop 'error allocating array idummy etc.'
     read(27) idummy   ! coupling_ac_el_ispec
     read(27) idummy_3 ! coupling_ac_el_ijk
     read(27) dummy_2  ! coupling_ac_el_jacobian2Dw
@@ -426,7 +426,7 @@ program smooth_vol_data
             idummy_3(3,NGLLSQUARE,idummy_a), &
             dummy_2(NGLLSQUARE,idummy_a), &
             dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array idummy etc.'  
+    if( ier /= 0 ) stop 'error allocating array idummy etc.'
     read(27) idummy   ! coupling_ac_po_ispec
     read(27) idummy_3 ! coupling_ac_po_ijk
     read(27) dummy_2  ! coupling_ac_po_jacobian2Dw
@@ -441,11 +441,11 @@ program smooth_vol_data
             idummy_3(3,NGLLSQUARE,idummy_a), &
             dummy_2(NGLLSQUARE,idummy_a), &
             dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array idummy etc.'  
+    if( ier /= 0 ) stop 'error allocating array idummy etc.'
     read(27) idummy   ! coupling_el_po_ispec
-    read(27) idummy   ! coupling_po_el_ispec    
+    read(27) idummy   ! coupling_po_el_ispec
     read(27) idummy_3 ! coupling_el_po_ijk
-    read(27) idummy_3 ! coupling_po_el_ijk    
+    read(27) idummy_3 ! coupling_po_el_ijk
     read(27) dummy_2  ! coupling_el_po_jacobian2Dw
     read(27) dummy_3  ! coupling_el_po_normal
     deallocate( idummy,idummy_3,dummy_2,dummy_3)
@@ -461,12 +461,12 @@ program smooth_vol_data
     read(27) my_neighbours_ext_mesh
     ! no more information is needed from external mesh files
   endif
-  
+
   ! we're done reading in mesh arrays
   close(27)
 
 ! ---------------------
-  
+
   ! for smoothing, we use cell centers to find and locate nearby elements
   !
   ! sets the location of the center of the elements and local points
diff --git a/src/shared/sum_kernels.f90 b/src/shared/sum_kernels.f90
index 143f73268..0f761f1f9 100644
--- a/src/shared/sum_kernels.f90
+++ b/src/shared/sum_kernels.f90
@@ -24,7 +24,7 @@
 module sum_par
 
   include 'constants.h'
-  
+
   ! USER PARAMETERS
 
   ! by default, this algorithm uses transverse isotropic (bulk,bulk_betav,bulk_betah,eta) kernels to sum up
@@ -38,13 +38,13 @@ module sum_par
 
   ! 1 permille of maximum for inverting hessian
   real(kind=CUSTOM_REAL),parameter :: THRESHOLD_HESS = 1.e-3
-  
+
   ! sums all hessians before inverting and preconditioning
   logical, parameter :: USE_HESS_SUM = .true.
-  
+
   ! uses source mask to blend out source elements
   logical, parameter :: USE_SOURCE_MASK = .false.
-  
+
   ! maximum number of kernels listed
   integer, parameter :: MAX_NUM_NODES = 1000
 
@@ -53,7 +53,7 @@ module sum_par
 
   ! mesh size
   integer :: NSPEC_AB, NGLOB_AB
-    
+
 end module sum_par
 
 !
@@ -64,7 +64,7 @@ program sum_kernels
 
   use sum_par
   implicit none
-  
+
   include 'mpif.h'
   include 'precision.h'
 
@@ -102,7 +102,7 @@ program sum_kernels
     write(*,*) 'reading kernel list: '
   endif
   call mpi_barrier(MPI_COMM_WORLD,ier)
-  
+
   ! reads in event list
   nker=0
   open(unit = 20, file = trim(kernel_file_list), status = 'old',iostat = ios)
@@ -150,7 +150,7 @@ program sum_kernels
   ! reads mesh file
   !
   ! needs to get array dimensions
-    
+
   ! opens external mesh file
   write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'/proc',myrank,'_'//'external_mesh.bin'
   open(unit=27,file=trim(prname_lp),&
@@ -161,7 +161,7 @@ program sum_kernels
     call exit_mpi(myrank, 'error reading external mesh file')
   endif
 
-  ! gets number of elements and global points for this partition          
+  ! gets number of elements and global points for this partition
   read(27) NSPEC_AB
   read(27) NGLOB_AB
 
@@ -177,7 +177,7 @@ program sum_kernels
 
   ! synchronizes
   call mpi_barrier(MPI_COMM_WORLD,ier)
-  
+
   ! sums up kernels
   if( USE_ISO_KERNELS ) then
 
@@ -260,18 +260,18 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
   allocate(kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           total_kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
-  
-  
-  if( USE_HESS_SUM ) then  
+
+
+  if( USE_HESS_SUM ) then
     allocate( total_hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB) )
-    total_hess(:,:,:,:) = 0.0_CUSTOM_REAL    
+    total_hess(:,:,:,:) = 0.0_CUSTOM_REAL
   endif
-  
-  if( USE_SOURCE_MASK ) then  
+
+  if( USE_SOURCE_MASK ) then
     allocate( mask_source(NGLLX,NGLLY,NGLLZ,NSPEC_AB) )
-    mask_source(:,:,:,:) = 1.0_CUSTOM_REAL      
+    mask_source(:,:,:,:) = 1.0_CUSTOM_REAL
   endif
-  
+
   ! loops over all event kernels
   total_kernel = 0._CUSTOM_REAL
   do iker = 1, nker
@@ -295,7 +295,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
     read(12) kernel
     close(12)
 
-    ! outputs norm of kernel 
+    ! outputs norm of kernel
     norm = sum( kernel * kernel )
     call mpi_reduce(norm,norm_sum,1,CUSTOM_MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD,ier)
     if( myrank == 0 ) then
@@ -326,7 +326,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
 
     ! note: we take absolute values for hessian (as proposed by Yang)
     hess = abs(hess)
-    
+
     ! source mask
     if( USE_SOURCE_MASK ) then
       ! reads in mask
@@ -339,10 +339,10 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
       endif
       read(12) mask_source
       close(12)
-      
+
       ! masks source elements
       kernel = kernel * mask_source
-      
+
     endif
 
     ! precondition
@@ -355,26 +355,26 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
 
       ! inverts hessian
       call invert_hess( myrank,hess )
-      
+
       ! preconditions each event kernel with its hessian
       kernel = kernel * hess
 
     endif
-    
+
     ! sums all kernels from each event
     total_kernel = total_kernel + kernel
-      
+
   enddo
 
   ! preconditions summed kernels with summed hessians
   if( USE_HESS_SUM ) then
-  
+
       ! inverts hessian matrix
       call invert_hess( myrank,total_hess )
-      
+
       ! preconditions kernel
       total_kernel = total_kernel * total_hess
-  
+
   endif
 
   ! stores summed kernels
@@ -396,7 +396,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank)
   deallocate(kernel,hess,total_kernel)
   if( USE_HESS_SUM ) deallocate(total_hess)
   if( USE_SOURCE_MASK ) deallocate(mask_source)
-  
+
 end subroutine sum_kernel_pre
 
 !
@@ -409,7 +409,7 @@ subroutine invert_hess( myrank,hess_matrix )
 ! the approximate hessian is only defined for diagonal elements: like
 ! H_nn = \frac{ \partial^2 \chi }{ \partial \rho_n \partial \rho_n }
 ! on all GLL points, which are indexed (i,j,k,ispec)
-  
+
   use sum_par
   implicit none
 
@@ -423,30 +423,30 @@ subroutine invert_hess( myrank,hess_matrix )
   ! local parameters
   real(kind=CUSTOM_REAL) :: maxh,maxh_all
   integer :: ier
-  
+
   ! maximum value of hessian
   maxh = maxval( abs(hess_matrix) )
 
   ! determines maximum from all slices on master
   call mpi_allreduce(maxh,maxh_all,1,CUSTOM_MPI_TYPE,MPI_MAX,MPI_COMM_WORLD,ier)
-  
+
   ! user output
   if( myrank == 0 ) then
     print*
     print*,'hessian maximum: ',maxh_all
     print*
   endif
-  
-  ! normalizes hessian 
+
+  ! normalizes hessian
   if( maxh_all < 1.e-18 ) then
-    ! hessian is zero, re-initializes 
+    ! hessian is zero, re-initializes
     hess_matrix = 1.0_CUSTOM_REAL
     !call exit_mpi(myrank,'error hessian too small')
   else
     ! since hessian has absolute values, this scales between [0,1]
-    hess_matrix = hess_matrix / maxh_all  
+    hess_matrix = hess_matrix / maxh_all
   endif
-  
+
 
   ! inverts hessian values
   where( abs(hess_matrix(:,:,:,:)) > THRESHOLD_HESS )
@@ -457,5 +457,5 @@ subroutine invert_hess( myrank,hess_matrix )
 
   ! rescales hessian
   !hess_matrix = hess_matrix * maxh_all
-  
+
 end subroutine invert_hess
diff --git a/src/shared/write_VTK_data.f90 b/src/shared/write_VTK_data.f90
index d5691080c..b3bf95d86 100644
--- a/src/shared/write_VTK_data.f90
+++ b/src/shared/write_VTK_data.f90
@@ -38,16 +38,19 @@ subroutine write_VTK_data_elem_i(nspec,nglob, &
 
   integer :: nspec,nglob
 
-! global coordinates
+  ! global coordinates
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
   real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
 
-! element flag array
+  ! element flag array
   integer, dimension(nspec) :: elem_flag
-  integer :: ispec,i
 
-! file name
-  character(len=256) prname_file
+  ! file name
+  character(len=256) :: prname_file
+  !character(len=2), optional, intent(in) :: str_id
+
+  ! local parameters
+  integer :: ispec,i
 
 ! write source and receiver VTK files for Paraview
   !debug
@@ -79,6 +82,11 @@ subroutine write_VTK_data_elem_i(nspec,nglob, &
   write(IOVTK,*) ""
 
   write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
+  !if( present( str_id ) ) then
+  !  write(IOVTK,'(a)') "SCALARS elem_flag_"//str_id//" integer"
+  !else
+  !  write(IOVTK,'(a)') "SCALARS elem_flag integer"
+  !endif
   write(IOVTK,'(a)') "SCALARS elem_flag integer"
   write(IOVTK,'(a)') "LOOKUP_TABLE default"
   do ispec = 1,nspec
diff --git a/src/shared/write_c_binary.c b/src/shared/write_c_binary.c
index 9403dfaf7..f802b60cb 100644
--- a/src/shared/write_c_binary.c
+++ b/src/shared/write_c_binary.c
@@ -78,7 +78,7 @@ FC_FUNC_(write_real,WRITE_REAL)(float *z) {
  Jul 18, 2003
 
  - uses functions fopen/fread/fwrite for binary file I/O
-  
+
  --------------------------------------- */
 
 #define __USE_GNU
@@ -122,13 +122,13 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files
   char * blank;
   FILE *ft;
   int ret;
-  
+
   // checks filesize
   if( *filesize == 0 ){
     perror("Error file size for reading");
     exit(EXIT_FAILURE);
   }
-  
+
   // Trim the file name.
   fncopy = strndup(filename, *length);
   blank = strchr(fncopy, ' ');
@@ -137,8 +137,8 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files
   }
 
 /*
-//daniel: debug checks file size
-// see: 
+//debug checks file size
+// see:
 //https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
   printf("file size: %lld \n",*filesize);
   int fd;
@@ -148,10 +148,10 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files
   if(fd == -1) {
     fprintf(stderr, "Error opening file: %s exiting\n", fncopy);
     exit(-1);
-  }  
-  if( fstat(fd, &stbuf) == 0 ){ 
+  }
+  if( fstat(fd, &stbuf) == 0 ){
     size = stbuf.st_size;
-    printf("file size found is: %lld (Bytes) \n",size);  
+    printf("file size found is: %lld (Bytes) \n",size);
   }
   close(fd);
 */
@@ -161,15 +161,15 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files
   ft = fopen( fncopy, "rb+" ); // read binary file
   if( ft == NULL ) { perror("fopen"); exit(-1); }
 
-  
+
   // sets mode for full buffering
   work_buffer[*fid] = (char *)malloc(MAX_B);
   ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
   if( ret != 0 ){
     perror("Error setting working buffer");
-    exit(EXIT_FAILURE);  
+    exit(EXIT_FAILURE);
   }
-    
+
   // stores file index id fid: from 0 to 8
   fp_abs[*fid] = ft;
 
@@ -188,7 +188,7 @@ void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *file
   char * blank;
   FILE *ft;
   int ret;
-  
+
   // checks filesize
   if( *filesize == 0 ){
     perror("Error file size for writing");
@@ -212,9 +212,9 @@ void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *file
   ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
   if( ret != 0 ){
     perror("Error setting working buffer");
-    exit(EXIT_FAILURE);  
+    exit(EXIT_FAILURE);
   }
-  
+
   // stores file index id fid: from 0 to 8
   fp_abs[*fid] = ft;
 
@@ -242,7 +242,7 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
 
   FILE *ft;
   int itemlen,remlen,donelen,ret;
-  void *buf;
+  char *buf;
 
   // file pointer
   ft = fp_abs[*fid];
@@ -253,7 +253,7 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
   ret = 0;
 
 /*
-//daniel: debug
+//debug
   float dat[*length/4];
   memcpy(dat,buffer,*length);
   printf("buffer length: %d %d\n",*length,*index);
@@ -262,23 +262,23 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
   for(i=0;i< 50;i++){
     printf("buffer: %d %e \n",i,dat[i]);
   }
-  
+
   // positions file pointer (for reverse time access)
   // make sure to use 64-bit arithmetic to avoid overflow for very large files
   long long pos,cur;
-  
-  pos = ((long long)*length) * (*index -1 );  
+
+  pos = ((long long)*length) * (*index -1 );
   cur = ftell(ft);
-  
-  printf("current position: %d %lld %lld \n",*fid,cur,pos);  
+
+  printf("current position: %d %lld %lld \n",*fid,cur,pos);
   ret = fseek(ft, pos , SEEK_SET);
   if ( ret != 0 ) {
     perror("Error fseek");
     exit(EXIT_FAILURE);
   }
  */
-  
-  
+
+
   // writes items of maximum MAX_B to the file
   while (remlen > 0){
 
@@ -295,10 +295,8 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
     }
   }
 
-//daniel: debug
-//  printf("buffer done length: %d %d\n",donelen,*length);
-  
-
+  //debug
+  //  printf("buffer done length: %d %d\n",donelen,*length);
 }
 
 //void
@@ -310,7 +308,7 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
   FILE *ft;
   int ret,itemlen,remlen,donelen;
   long long pos;
-  void *buf;
+  char *buf;
 
   // file pointer
   ft = fp_abs[*fid];
@@ -318,7 +316,7 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
   // positions file pointer (for reverse time access)
   // make sure to use 64-bit arithmetic to avoid overflow for very large files
   pos = ((long long)*length) * (*index -1 );
-  
+
   ret = fseek(ft, pos , SEEK_SET);
   if ( ret != 0 ) {
     perror("Error fseek");
@@ -330,9 +328,9 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
   buf = buffer;
   ret = 0;
 
-  // cleans buffer 
+  // cleans buffer
   //memset( buf,0,remlen);
-  
+
   // reads items of maximum MAX_B to the file
   while (remlen > 0){
 
@@ -355,19 +353,18 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
   }
 
 /*
-//daniel: debug
+// debug
   printf("position: %lld %d %d \n",pos,*length,*index);
   printf("buffer done length: %d %d\n",donelen,*length);
   float dat[*length/4];
   memcpy(dat,buffer,*length);
   printf("return buffer length: %d %d\n",*length,*index);
-  printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));  
+  printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
   int i;
   for(i=0;i< 50;i++){
     printf("return buffer: %d %e \n",i,dat[i]);
   }
 */
-  
 }
 
 
@@ -428,13 +425,13 @@ void open_file_abs_w_map(int *fid, char *filename, int *length, long long *files
   }
 
   /*
-   // daniel: debug check filesize below or above 2 GB
+   // debug check filesize below or above 2 GB
    //            filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
    float s = *filesize / 1024. / 1024. / 1024.;
    if( s > 2.0 ){
    printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s);
    }
-   */  
+   */
 
   // Trim the file name.
   fncopy = strndup(filename, *length);
diff --git a/src/specfem3D/Makefile.in b/src/specfem3D/Makefile.in
index 5ad9cd28e..c0d216071 100644
--- a/src/specfem3D/Makefile.in
+++ b/src/specfem3D/Makefile.in
@@ -29,6 +29,41 @@
 
 # @configure_input@
 
+## example:
+# CUDA_LIBS = -L/apps/eiger/Cuda-4.0/cuda/lib64 -lcuda -lcudart -lcublas
+# CUDA_INC = -I/apps/eiger/Cuda-4.0/cuda/include
+# MPI_INC = -I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include
+##
+#CUDA_LIBS= -L/u/dpeter/install/cuda/lib64 -lcudart -lcublas
+#MPI_INC= -I/usr/local/openmpi/1.4.3/gcc/x86_64/include
+
+# CUDA
+# with configure: ./configure --with-cuda CUDA_LIB=.. CUDA_INC=.. MPI_INC=..
+
+# default cuda libraries
+# runtime library -lcudart needed, others are optional -lcuda -lcublas
+@COND_CUDA_TRUE@CUDA_LIBS = -lcudart   
+@COND_CUDA_FALSE@CUDA_LIBS = 
+
+CUDA_LIB_LOCATION = @CUDA_LIB@
+CUDA_LINK = $(CUDA_LIB_LOCATION) $(CUDA_LIBS)
+CUDA_INC = @CUDA_INC@ -I../../
+MPI_INC = @MPI_INC@
+
+@COND_CUDA_TRUE@NVCC = nvcc
+@COND_CUDA_FALSE@NVCC = @CC@
+
+@COND_CUDA_TRUE@NVCC_FLAGS = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS) -DCUDA -gencode=arch=compute_20,code=sm_20
+@COND_CUDA_FALSE@NVCC_FLAGS = $(MPI_INC) $(COND_MPI_CPPFLAGS)
+
+# OpenMP
+# with configure: ./configure --with-openmp FLAGS_NO_CHECK="-openmp .." OPENMP_LIB=.. 
+@COND_OPENMP_TRUE@OPENMP_LIBS = $(OPENMP_LIB)
+@COND_OPENMP_FALSE@OPENMP_LIBS = 
+@COND_OPENMP_TRUE@COND_OPENMP_FFLAGS = -DOPENMP_MODE
+@COND_OPENMP_FALSE@COND_OPENMP_FFLAGS =
+
+
 FC = @FC@
 FCFLAGS = #@FCFLAGS@
 MPIFC = @MPIFC@
@@ -37,7 +72,10 @@ FLAGS_CHECK = @FLAGS_CHECK@
 FLAGS_NO_CHECK = @FLAGS_NO_CHECK@
 FCFLAGS_f90 = @FCFLAGS_f90@
 
+
 SHARED = ../shared/
+CUDAD = ../cuda/
+
 # E : executables directory
 E = ../../bin
 # O : objects directory
@@ -47,10 +85,10 @@ L = ../../lib
 # Output files directory
 OUTPUT=../../in_out_files/OUTPUT_FILES
 
-FCCOMPILE_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_CHECK) -I$(SHARED)
-FCCOMPILE_NO_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_NO_CHECK) -I$(SHARED)
-MPIFCCOMPILE_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_CHECK) -I$(SHARED)
-MPIFCCOMPILE_NO_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_NO_CHECK) -I$(SHARED)
+FCCOMPILE_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED)
+FCCOMPILE_NO_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_NO_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED)
+MPIFCCOMPILE_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED)
+MPIFCCOMPILE_NO_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_NO_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED)
 @COND_MPI_TRUE@FCLINK = $(MPIFCCOMPILE_NO_CHECK)
 @COND_MPI_FALSE@FCLINK = $(FCCOMPILE_NO_CHECK)
 
@@ -65,46 +103,66 @@ ARFLAGS = cru
 RANLIB = ranlib
 
 libspecfem_a_OBJECTS = \
-	$O/assemble_MPI_scalar.o \
-	$O/check_mesh_resolution.o \
+	$O/assemble_MPI_scalar.shared.o \
+	$O/check_mesh_resolution.shared.o \
 	$O/comp_source_time_function.o \
 	$O/compute_adj_source_frechet.o \
-	$O/compute_arrays_source.o \
-	$O/multiply_arrays_source.o \
-	$O/create_name_database.o \
-	$O/create_serial_name_database.o \
-	$O/define_derivation_matrices.o \
-	$O/detect_surface.o \
-	$O/exit_mpi.o \
-	$O/force_ftz.o \
-	$O/get_attenuation_model.o \
-	$O/get_cmt.o \
-	$O/get_element_face.o \
-	$O/get_jacobian_boundaries.o \
-	$O/get_shape2D.o \
-	$O/get_shape3D.o \
-	$O/get_value_parameters.o \
-	$O/gll_library.o \
-	$O/hex_nodes.o \
-	$O/lagrange_poly.o \
+	$O/compute_arrays_source.shared.o \
+	$O/multiply_arrays_source.shared.o \
+	$O/create_name_database.shared.o \
+	$O/create_serial_name_database.shared.o \
+	$O/define_derivation_matrices.shared.o \
+	$O/detect_surface.shared.o \
+	$O/exit_mpi.shared.o \
+	$O/force_ftz.cc.o \
+	$O/get_attenuation_model.shared.o \
+	$O/get_cmt.shared.o \
+	$O/get_element_face.shared.o \
+	$O/get_jacobian_boundaries.shared.o \
+	$O/get_shape2D.shared.o \
+	$O/get_shape3D.shared.o \
+	$O/get_value_parameters.shared.o \
+	$O/gll_library.shared.o \
+	$O/hex_nodes.shared.o \
+	$O/lagrange_poly.shared.o \
 	$O/locate_receivers.o \
 	$O/locate_source.o \
-	$O/netlib_specfun_erf.o \
-	$O/param_reader.o \
-	$O/prepare_assemble_MPI.o \
-	$O/read_topo_bathy_file.o \
-	$O/read_parameter_file.o \
-	$O/read_value_parameters.o \
-	$O/recompute_jacobian.o \
-	$O/save_header_file.o \
-	$O/sort_array_coordinates.o \
-	$O/utm_geo.o \
-	$O/write_VTK_data.o \
-	$O/write_c_binary.o \
-	$(EMPTY_MACRO)
+	$O/netlib_specfun_erf.shared.o \
+	$O/param_reader.cc.o \
+	$O/prepare_assemble_MPI.shared.o \
+	$O/read_topo_bathy_file.shared.o \
+	$O/read_parameter_file.shared.o \
+	$O/read_value_parameters.shared.o \
+	$O/recompute_jacobian.shared.o \
+	$O/save_header_file.shared.o \
+	$O/sort_array_coordinates.shared.o \
+	$O/utm_geo.shared.o \
+	$O/write_VTK_data.shared.o \
+	$O/write_c_binary.cc.o 
+
+CUDA_OBJECTS = \
+	$O/check_fields_cuda.cuda.o \
+	$O/compute_add_sources_acoustic_cuda.cuda.o \
+	$O/compute_add_sources_elastic_cuda.cuda.o \
+	$O/compute_coupling_cuda.cuda.o \
+	$O/compute_forces_acoustic_cuda.cuda.o \
+	$O/compute_forces_elastic_cuda.cuda.o \
+	$O/compute_kernels_cuda.cuda.o \
+	$O/compute_stacey_acoustic_cuda.cuda.o \
+	$O/compute_stacey_elastic_cuda.cuda.o \
+	$O/it_update_displacement_cuda.cuda.o \
+	$O/noise_tomography_cuda.cuda.o \
+	$O/prepare_mesh_constants_cuda.cuda.o \
+	$O/transfer_fields_cuda.cuda.o \
+	$O/write_seismograms_cuda.cuda.o \
+	$O/save_and_compare_cpu_vs_gpu.cudacc.o 
+
+CUDA_STUBS = \
+	$O/specfem3D_gpu_cuda_method_stubs.cudacc.o
 
 # solver objects - no statically allocated arrays anymore
 SOLVER_ARRAY_OBJECTS = \
+	$O/program_specfem3D.o \
 	$O/specfem3D_par.o \
 	$O/PML_init.o \
 	$O/compute_boundary_kernel.o \
@@ -114,6 +172,7 @@ SOLVER_ARRAY_OBJECTS = \
 	$O/compute_forces_acoustic_PML.o \
 	$O/compute_forces_elastic.o \
 	$O/compute_forces_elastic_Dev.o \
+	$O/compute_forces_elastic_Dev2.o \
 	$O/compute_forces_elastic_noDev.o \
 	$O/compute_forces_poroelastic.o \
 	$O/compute_forces_solid.o \
@@ -149,36 +208,36 @@ SOLVER_ARRAY_OBJECTS = \
 	$O/save_adjoint_kernels.o \
 	$O/specfem3D.o \
 	$O/assemble_MPI_vector.o \
-	$O/noise_tomography.o \
-	$(EMPTY_MACRO)
+	$O/make_gravity.o \
+	$O/noise_tomography.o 
 
 
 MODEL_UPD_OBJECTS = \
 	$O/model_update.o \
 	$O/specfem3D_par.o \
 	$O/initialize_simulation.o \
-	$O/read_parameter_file.o \
-	$O/read_value_parameters.o \
-	$O/get_value_parameters.o \
-	$O/param_reader.o \
-	$O/exit_mpi.o \
+	$O/read_parameter_file.shared.o \
+	$O/read_value_parameters.shared.o \
+	$O/get_value_parameters.shared.o \
+	$O/param_reader.cc.o \
+	$O/exit_mpi.shared.o \
 	$O/read_mesh_databases.o \
-	$O/create_name_database.o \
-	$O/check_mesh_resolution.o \
-	$O/gll_library.o \
-	$O/get_attenuation_model.o \
+	$O/create_name_database.shared.o \
+	$O/check_mesh_resolution.shared.o \
+	$O/gll_library.shared.o \
+	$O/get_attenuation_model.shared.o \
 	$O/save_external_bin_m_up.o \
-	$O/write_VTK_data.o \
+	$O/write_VTK_data.shared.o \
 	$(EMPTY_MACRO)
 
 
 SUM_KERNELS_OBJECTS = \
 	$O/sum_kernels.o \
-	$O/read_parameter_file.o \
-	$O/read_value_parameters.o \
-	$O/get_value_parameters.o \
-	$O/param_reader.o \
-	$O/exit_mpi.o \
+	$O/read_parameter_file.shared.o \
+	$O/read_value_parameters.shared.o \
+	$O/get_value_parameters.shared.o \
+	$O/param_reader.cc.o \
+	$O/exit_mpi.shared.o \
 	$(EMPTY_MACRO)
 
 
@@ -186,11 +245,15 @@ SUM_KERNELS_OBJECTS = \
 @COND_MPI_TRUE@COND_MPI_OBJECTS = $O/parallel.o
 @COND_MPI_FALSE@COND_MPI_OBJECTS = $O/serial.o
 
+# objects toggled between openmp and non-openmp version
+@COND_OPENMP_TRUE@COND_OPENMP_OBJECTS = $O/compute_forces_elastic_Dev_openmp.openmp.o
+@COND_OPENMP_FALSE@COND_OPENMP_OBJECTS =
+
 LIBSPECFEM = $L/libspecfem.a
 
 # objects for the pure Fortran version
-@COND_PYRE_FALSE@XGENERATE_DATABASES_OBJECTS = $O/program_generate_databases.o $(LIBSPECFEM)
-@COND_PYRE_FALSE@XSPECFEM_OBJECTS = $O/program_specfem3D.o $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM)
+@COND_PYRE_FALSE@@COND_CUDA_TRUE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_OBJECTS)
+@COND_PYRE_FALSE@@COND_CUDA_FALSE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_STUBS)
 
 ####
 #### targets
@@ -198,7 +261,6 @@ LIBSPECFEM = $L/libspecfem.a
 
 # default targets for the pure Fortran version
 @COND_PYRE_FALSE@DEFAULT = \
-@COND_PYRE_FALSE@	generate_databases \
 @COND_PYRE_FALSE@	specfem3D \
 @COND_PYRE_FALSE@	combine_vol_data \
 @COND_PYRE_FALSE@	combine_surf_data \
@@ -219,13 +281,11 @@ specfem3D: xspecfem3D
 #### rules for executables
 ####
 
+
 # rules for the pure Fortran version
-@COND_PYRE_FALSE@xgenerate_databases: $(XGENERATE_DATABASES_OBJECTS) $(COND_MPI_OBJECTS)
-@COND_PYRE_FALSE@	${FCLINK} -o ${E}/xgenerate_databases $(XGENERATE_DATABASES_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS)
-@COND_PYRE_FALSE@
 @COND_PYRE_FALSE@# solver also depends on values from mesher
-@COND_PYRE_FALSE@xspecfem3D: $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS)
-@COND_PYRE_FALSE@	${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS)
+@COND_PYRE_FALSE@xspecfem3D: $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(COND_OPENMP_OBJECTS)
+@COND_PYRE_FALSE@	${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) $(COND_OPENMP_OBJECTS) $(OPENMP_LIBS) $(CUDA_LINK)
 @COND_PYRE_FALSE@
 
 convolve_source_timefunction: xconvolve_source_timefunction
@@ -236,30 +296,30 @@ smooth_vol_data: xsmooth_vol_data
 sum_kernels: xsum_kernels
 model_update: xmodel_update
 
-xconvolve_source_timefunction: $O/convolve_source_timefunction.o
-	${FCCOMPILE_CHECK} -o  ${E}/xconvolve_source_timefunction $O/convolve_source_timefunction.o
+xconvolve_source_timefunction: $O/convolve_source_timefunction.shared.o
+	${FCCOMPILE_CHECK} -o  ${E}/xconvolve_source_timefunction $O/convolve_source_timefunction.shared.o
 
-@COND_PYRE_FALSE@xcreate_movie_shakemap_AVS_DX_GMT: $O/create_movie_shakemap_AVS_DX_GMT.o $(LIBSPECFEM) $(OUTPUT)/surface_from_mesher.h
-@COND_PYRE_FALSE@	${FCCOMPILE_CHECK} -o  ${E}/xcreate_movie_shakemap_AVS_DX_GMT $O/create_movie_shakemap_AVS_DX_GMT.o $(LIBSPECFEM) -I$(OUTPUT)
+@COND_PYRE_FALSE@xcreate_movie_shakemap_AVS_DX_GMT: $O/create_movie_shakemap_AVS_DX_GMT.shared.o $(LIBSPECFEM) $(OUTPUT)/surface_from_mesher.h
+@COND_PYRE_FALSE@	${FCCOMPILE_CHECK} -o  ${E}/xcreate_movie_shakemap_AVS_DX_GMT $O/create_movie_shakemap_AVS_DX_GMT.shared.o $(LIBSPECFEM) -I$(OUTPUT)
 
-xcombine_vol_data: $O/combine_vol_data.o $O/write_c_binary.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o
-	${FCCOMPILE_CHECK} -o  ${E}/xcombine_vol_data  $O/combine_vol_data.o $O/write_c_binary.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o
+xcombine_vol_data: $O/combine_vol_data.shared.o $O/write_c_binary.cc.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o
+	${FCCOMPILE_CHECK} -o  ${E}/xcombine_vol_data  $O/combine_vol_data.shared.o $O/write_c_binary.cc.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o
 
-xcombine_surf_data: $O/combine_surf_data.o $O/write_c_binary.o $O/param_reader.o
-	${FCCOMPILE_CHECK} -o  ${E}/xcombine_surf_data  $O/combine_surf_data.o $O/write_c_binary.o $O/param_reader.o
+xcombine_surf_data: $O/combine_surf_data.shared.o $O/write_c_binary.cc.o $O/param_reader.cc.o
+	${FCCOMPILE_CHECK} -o  ${E}/xcombine_surf_data  $O/combine_surf_data.shared.o $O/write_c_binary.cc.o $O/param_reader.cc.o
 
-xsmooth_vol_data: $O/smooth_vol_data.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o $O/gll_library.o $O/exit_mpi.o $O/parallel.o
-	${FCLINK} -o  ${E}/xsmooth_vol_data  $O/smooth_vol_data.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o $O/gll_library.o $O/exit_mpi.o $O/parallel.o $(MPILIBS)
+xsmooth_vol_data: $O/smooth_vol_data.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o $O/gll_library.shared.o $O/exit_mpi.shared.o $(COND_MPI_OBJECTS)
+	${FCLINK} -o  ${E}/xsmooth_vol_data  $O/smooth_vol_data.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o $O/gll_library.shared.o $O/exit_mpi.shared.o $(COND_MPI_OBJECTS) $(MPILIBS)
 
 xsum_kernels: $(SUM_KERNELS_OBJECTS) $(COND_MPI_OBJECTS)
 	${FCLINK} -o  ${E}/xsum_kernels  $(SUM_KERNELS_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) 
 
-xmodel_update: $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS)
-	${FCLINK} -o  ${E}/xmodel_update  $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS)
+xmodel_update: $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(CUDA_STUBS)
+	${FCLINK} -o  ${E}/xmodel_update  $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) $(CUDA_STUBS)
 
 clean:
 	rm -f $O/* *.o *.gnu *.mod $(OUTPUT)/timestamp* $(OUTPUT)/starttime*txt work.pc* \
-        xgenerate_databases xspecfem3D \
+        xspecfem3D \
         xconvolve_source_timefunction \
         xcreate_movie_shakemap_AVS_DX_GMT xcombine_vol_data xcombine_surf_data \
         xsmooth_vol_data xmodel_update xsum_kernels
@@ -281,293 +341,67 @@ $L/libspecfem.a: $(libspecfem_a_OBJECTS)
 ### optimized flags (not dependent on values from mesher anymore)
 ###
 
-$O/specfem3D_par.o: $(SHARED)constants.h specfem3D_par.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/specfem3D_par.o specfem3D_par.f90
-
-$O/specfem3D.o: $(SHARED)constants.h specfem3D.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/specfem3D.o specfem3D.f90
-
-$O/compute_forces_elastic_noDev.o: $(SHARED)constants.h compute_forces_elastic_noDev.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic_noDev.o compute_forces_elastic_noDev.f90
-
-$O/compute_forces_elastic_Dev.o: $(SHARED)constants.h compute_forces_elastic_Dev.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic_Dev.o compute_forces_elastic_Dev.f90
-
-$O/compute_forces_elastic.o: $(SHARED)constants.h compute_forces_elastic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic.o compute_forces_elastic.f90
-
-$O/compute_forces_poroelastic.o: $(SHARED)constants.h compute_forces_poroelastic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_poroelastic.o compute_forces_poroelastic.f90
-
-$O/compute_forces_solid.o: $(SHARED)constants.h compute_forces_solid.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_solid.o compute_forces_solid.f90
-
-$O/compute_forces_fluid.o: $(SHARED)constants.h compute_forces_fluid.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_fluid.o compute_forces_fluid.f90
-
-$O/compute_forces_acoustic.o: $(SHARED)constants.h compute_forces_acoustic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic.o compute_forces_acoustic.f90
-
-$O/compute_forces_acoustic_pot.o: $(SHARED)constants.h compute_forces_acoustic_pot.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic_pot.o compute_forces_acoustic_pot.f90
-
-$O/compute_forces_acoustic_PML.o: $(SHARED)constants.h compute_forces_acoustic_PML.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic_PML.o compute_forces_acoustic_PML.f90
-
-$O/compute_add_sources_acoustic.o: $(SHARED)constants.h compute_add_sources_acoustic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_acoustic.o compute_add_sources_acoustic.f90
-
-$O/compute_add_sources_elastic.o: $(SHARED)constants.h compute_add_sources_elastic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_elastic.o compute_add_sources_elastic.f90
-
-$O/compute_add_sources_poroelastic.o: $(SHARED)constants.h compute_add_sources_poroelastic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_poroelastic.o compute_add_sources_poroelastic.f90
-
-$O/get_poroelastic_velocities.o: $(SHARED)constants.h get_poroelastic_velocities.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/get_poroelastic_velocities.o get_poroelastic_velocities.f90
-
-$O/compute_coupling_acoustic_el.o: $(SHARED)constants.h compute_coupling_acoustic_el.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_acoustic_el.o compute_coupling_acoustic_el.f90
-
-$O/compute_coupling_elastic_ac.o: $(SHARED)constants.h compute_coupling_elastic_ac.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_elastic_ac.o compute_coupling_elastic_ac.f90
-
-$O/compute_coupling_elastic_po.o: $(SHARED)constants.h compute_coupling_elastic_po.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_elastic_po.o compute_coupling_elastic_po.f90
-
-$O/compute_coupling_acoustic_po.o: $(SHARED)constants.h compute_coupling_acoustic_po.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_acoustic_po.o compute_coupling_acoustic_po.f90
-
-$O/compute_coupling_poroelastic_ac.o: $(SHARED)constants.h compute_coupling_poroelastic_ac.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_poroelastic_ac.o compute_coupling_poroelastic_ac.f90
-
-$O/compute_coupling_poroelastic_el.o: $(SHARED)constants.h compute_coupling_poroelastic_el.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_poroelastic_el.o compute_coupling_poroelastic_el.f90
-
-$O/compute_stacey_acoustic.o: $(SHARED)constants.h compute_stacey_acoustic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_acoustic.o compute_stacey_acoustic.f90
-
-$O/compute_stacey_elastic.o: $(SHARED)constants.h compute_stacey_elastic.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_elastic.o compute_stacey_elastic.f90
-
-$O/compute_gradient.o: $(SHARED)constants.h compute_gradient.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_gradient.o compute_gradient.f90
-
-$O/compute_interpolated_dva.o: $(SHARED)constants.h compute_interpolated_dva.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_interpolated_dva.o compute_interpolated_dva.f90
-
-### C compilation
-$O/force_ftz.o: ${SHARED}/force_ftz.c ../../config.h
-	${CC} -c $(CPPFLAGS) $(CFLAGS) -I../.. -o $O/force_ftz.o ${SHARED}/force_ftz.c
-
-$O/initialize_simulation.o: $(SHARED)constants.h initialize_simulation.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/initialize_simulation.o initialize_simulation.f90
-
-$O/read_mesh_databases.o: $(SHARED)constants.h read_mesh_databases.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/read_mesh_databases.o read_mesh_databases.f90
-
-$O/setup_GLL_points.o: $(SHARED)constants.h setup_GLL_points.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/setup_GLL_points.o setup_GLL_points.f90
-
-$O/detect_mesh_surfaces.o: $(SHARED)constants.h detect_mesh_surfaces.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/detect_mesh_surfaces.o detect_mesh_surfaces.f90
-
-$O/setup_movie_meshes.o: $(SHARED)constants.h setup_movie_meshes.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/setup_movie_meshes.o setup_movie_meshes.f90
-
-$O/read_topography_bathymetry.o: $(SHARED)constants.h read_topography_bathymetry.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/read_topography_bathymetry.o read_topography_bathymetry.f90
-
-$O/setup_sources_receivers.o: $(SHARED)constants.h setup_sources_receivers.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/setup_sources_receivers.o setup_sources_receivers.f90
-
-$O/prepare_timerun.o: $(SHARED)constants.h prepare_timerun.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/prepare_timerun.o prepare_timerun.f90
-
-$O/iterate_time.o: $(SHARED)constants.h iterate_time.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/iterate_time.o iterate_time.f90
-
-$O/finalize_simulation.o: $(SHARED)constants.h finalize_simulation.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/finalize_simulation.o finalize_simulation.f90
-
-$O/assemble_MPI_vector.o: $(SHARED)constants.h assemble_MPI_vector.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_vector.o assemble_MPI_vector.f90
+$O/%.o: %.f90 $(SHARED)constants.h
+	${FCCOMPILE_NO_CHECK} -c -o $@ $<
 
-$O/assemble_MPI_scalar.o: $(SHARED)constants.h ${SHARED}/assemble_MPI_scalar.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_scalar.o ${SHARED}/assemble_MPI_scalar.f90
+$O/%.o: %.F90 $(SHARED)constants.h
+	${FCCOMPILE_NO_CHECK} -c -o $@ $<
 
-$O/save_adjoint_kernels.o: $(SHARED)constants.h save_adjoint_kernels.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/save_adjoint_kernels.o save_adjoint_kernels.f90
+$O/%.shared.o: ${SHARED}%.f90 $(SHARED)constants.h
+	${FCCOMPILE_NO_CHECK} -c -o $@ $<
 
-$O/write_movie_output.o: $(SHARED)constants.h write_movie_output.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/write_movie_output.o write_movie_output.f90
-
-$O/create_color_image.o: $(SHARED)constants.h create_color_image.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/create_color_image.o create_color_image.f90
-
-$O/write_seismograms.o: $(SHARED)constants.h write_seismograms.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/write_seismograms.o write_seismograms.f90
-
-$O/write_output_ASCII.o: $(SHARED)constants.h write_output_ASCII.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/write_output_ASCII.o write_output_ASCII.f90
-
-$O/write_output_SU.o: $(SHARED)constants.h write_output_SU.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/write_output_SU.o write_output_SU.f90
-
-$O/noise_tomography.o: $(SHARED)constants.h noise_tomography.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/noise_tomography.o noise_tomography.f90
+$O/%.shared.o: ${SHARED}%.F90 $(SHARED)constants.h
+	${FCCOMPILE_NO_CHECK} -c -o $@ $<
 
 
 ###
-### MPI compilation without optimization
+### OpenMP compilation
 ###
+$O/%.openmp.o: %.f90 $(SHARED)constants.h
+	${FCCOMPILE_NO_CHECK} -c -o $@ $<
 
-$O/parallel.o: $(SHARED)constants.h ${SHARED}/parallel.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o ${SHARED}/parallel.f90
 
 ###
-### serial compilation without optimization
+### CUDA compilation
 ###
+$O/%.cuda.o: ${CUDAD}%.cu ../../config.h $(CUDAD)mesh_constants_cuda.h $(CUDAD)prepare_constants_cuda.h
+	$(NVCC) -c $< -o $@ $(NVCC_FLAGS)
 
-$O/serial.o: $(SHARED)constants.h ${SHARED}/exit_mpi.f90
-	${FCCOMPILE_CHECK} -c -o $O/serial.o ${SHARED}/serial.f90
-
-$O/program_specfem3D.o: program_specfem3D.f90
-	${FCCOMPILE_CHECK} -c -o $O/program_specfem3D.o program_specfem3D.f90
-
-$O/locate_source.o: $(SHARED)constants.h locate_source.f90
-	${FCCOMPILE_CHECK} -c -o $O/locate_source.o locate_source.f90
-
-$O/locate_receivers.o: $(SHARED)constants.h locate_receivers.f90
-	${FCCOMPILE_CHECK} -c -o $O/locate_receivers.o locate_receivers.f90
-
-$O/exit_mpi.o: $(SHARED)constants.h ${SHARED}/exit_mpi.f90
-	${FCCOMPILE_CHECK} -c -o $O/exit_mpi.o ${SHARED}/exit_mpi.f90
-
-$O/convolve_source_timefunction.o: ${SHARED}/convolve_source_timefunction.f90
-	${FCCOMPILE_CHECK} -c -o $O/convolve_source_timefunction.o ${SHARED}/convolve_source_timefunction.f90
-
-$O/save_header_file.o: $(SHARED)constants.h ${SHARED}/save_header_file.f90
-	${FCCOMPILE_CHECK} -c -o $O/save_header_file.o ${SHARED}/save_header_file.f90
-
-$O/read_parameter_file.o: $(SHARED)constants.h ${SHARED}/read_parameter_file.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_parameter_file.o ${SHARED}/read_parameter_file.f90
-
-$O/read_value_parameters.o: $(SHARED)constants.h ${SHARED}/read_value_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_value_parameters.o ${SHARED}/read_value_parameters.f90
-
-$O/get_value_parameters.o: $(SHARED)constants.h ${SHARED}/get_value_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_value_parameters.o ${SHARED}/get_value_parameters.f90
-
-$O/utm_geo.o: $(SHARED)constants.h ${SHARED}/utm_geo.f90
-	${FCCOMPILE_CHECK} -c -o $O/utm_geo.o ${SHARED}/utm_geo.f90
-
-$O/check_mesh_resolution.o: $(SHARED)constants.h ${SHARED}/check_mesh_resolution.f90
-	${FCCOMPILE_CHECK} -c -o $O/check_mesh_resolution.o ${SHARED}/check_mesh_resolution.f90
-
-$O/detect_surface.o: $(SHARED)constants.h ${SHARED}/detect_surface.f90
-	${FCCOMPILE_CHECK} -c -o $O/detect_surface.o ${SHARED}/detect_surface.f90
-
-$O/gll_library.o: $(SHARED)constants.h ${SHARED}/gll_library.f90
-	${FCCOMPILE_CHECK} -c -o $O/gll_library.o ${SHARED}/gll_library.f90
-
-$O/get_jacobian_boundaries.o: $(SHARED)constants.h ${SHARED}/get_jacobian_boundaries.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_jacobian_boundaries.o ${SHARED}/get_jacobian_boundaries.f90
-
-$O/get_flags_boundaries.o: $(SHARED)constants.h ${SHARED}/get_flags_boundaries.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_flags_boundaries.o ${SHARED}/get_flags_boundaries.f90
-
-$O/get_cmt.o: $(SHARED)constants.h ${SHARED}/get_cmt.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_cmt.o  ${SHARED}/get_cmt.f90
-
-$O/create_movie_shakemap_AVS_DX_GMT.o: $(SHARED)constants.h ${SHARED}/create_movie_shakemap_AVS_DX_GMT.f90 $(OUTPUT)/surface_from_mesher.h
-	${FCCOMPILE_CHECK} -c -o $O/create_movie_shakemap_AVS_DX_GMT.o  ${SHARED}/create_movie_shakemap_AVS_DX_GMT.f90 -I$(OUTPUT)
-
-$O/get_element_face.o: $(SHARED)constants.h ${SHARED}/get_element_face.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_element_face.o  ${SHARED}/get_element_face.f90
-
-$O/write_VTK_data.o: $(SHARED)constants.h ${SHARED}/write_VTK_data.f90
-	${FCCOMPILE_CHECK} -c -o $O/write_VTK_data.o ${SHARED}/write_VTK_data.f90
-
-$O/get_shape3D.o: $(SHARED)constants.h ${SHARED}/get_shape3D.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_shape3D.o ${SHARED}/get_shape3D.f90
-
-$O/get_shape2D.o: $(SHARED)constants.h ${SHARED}/get_shape2D.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_shape2D.o ${SHARED}/get_shape2D.f90
-
-$O/hex_nodes.o: $(SHARED)constants.h ${SHARED}/hex_nodes.f90
-	${FCCOMPILE_CHECK} -c -o $O/hex_nodes.o ${SHARED}/hex_nodes.f90
-
-$O/netlib_specfun_erf.o: ${SHARED}/netlib_specfun_erf.f90
-	${FCCOMPILE_CHECK} -c -o $O/netlib_specfun_erf.o ${SHARED}/netlib_specfun_erf.f90
-
-$O/sort_array_coordinates.o: $(SHARED)constants.h ${SHARED}/sort_array_coordinates.f90
-	${FCCOMPILE_CHECK} -c -o $O/sort_array_coordinates.o ${SHARED}/sort_array_coordinates.f90
-
-$O/comp_source_time_function.o: $(SHARED)constants.h comp_source_time_function.f90
-	${FCCOMPILE_CHECK} -c -o $O/comp_source_time_function.o comp_source_time_function.f90
-
-$O/read_topo_bathy_file.o: $(SHARED)constants.h ${SHARED}/read_topo_bathy_file.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_topo_bathy_file.o ${SHARED}/read_topo_bathy_file.f90
-
-$O/lagrange_poly.o: $(SHARED)constants.h ${SHARED}/lagrange_poly.f90
-	${FCCOMPILE_CHECK} -c -o $O/lagrange_poly.o ${SHARED}/lagrange_poly.f90
-
-$O/recompute_jacobian.o: $(SHARED)constants.h ${SHARED}/recompute_jacobian.f90
-	${FCCOMPILE_CHECK} -c -o $O/recompute_jacobian.o ${SHARED}/recompute_jacobian.f90
-
-$O/create_name_database.o: $(SHARED)constants.h ${SHARED}/create_name_database.f90
-	${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${SHARED}/create_name_database.f90
-
-$O/create_serial_name_database.o: $(SHARED)constants.h ${SHARED}/create_serial_name_database.f90
-	${FCCOMPILE_CHECK} -c -o $O/create_serial_name_database.o ${SHARED}/create_serial_name_database.f90
-
-$O/define_derivation_matrices.o: $(SHARED)constants.h ${SHARED}/define_derivation_matrices.f90
-	${FCCOMPILE_CHECK} -c -o $O/define_derivation_matrices.o ${SHARED}/define_derivation_matrices.f90
-
-$O/compute_adj_source_frechet.o: $(SHARED)constants.h compute_adj_source_frechet.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_adj_source_frechet.o compute_adj_source_frechet.f90
-
-$O/compute_arrays_source.o: $(SHARED)constants.h compute_arrays_source.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_arrays_source.o compute_arrays_source.f90
-
-$O/multiply_arrays_source.o:  ${SHARED}/constants.h ${SHARED}/multiply_arrays_source.f90
-	${FCCOMPILE_CHECK} -c -o $O/multiply_arrays_source.o ${SHARED}/multiply_arrays_source.f90
+###
+### C compilation
+###
+force_ftz.o: ${SHARED}force_ftz.c ../../config.h
+	${CC} -c $(CPPFLAGS) $(CFLAGS) -I../.. -o $O/force_ftz.o ${SHARED}force_ftz.c
 
-$O/get_attenuation_model.o: $(SHARED)constants.h ${SHARED}/get_attenuation_model.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_attenuation_model.o ${SHARED}/get_attenuation_model.f90
+$O/%.cc.o: ${SHARED}%.c ../../config.h
+	${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${SHARED}$< -I../../
 
-$O/compute_boundary_kernel.o: $(SHARED)constants.h compute_boundary_kernel.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_boundary_kernel.o compute_boundary_kernel.f90
+$O/%.cudacc.o: ${CUDAD}%.c ../../config.h
+	${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../
 
-$O/compute_kernels.o: $(SHARED)constants.h compute_kernels.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_kernels.o compute_kernels.f90
 
-$O/combine_vol_data.o: $(SHARED)constants.h ${SHARED}/combine_vol_data.f90
-	${FCCOMPILE_CHECK} -c -o $O/combine_vol_data.o ${SHARED}/combine_vol_data.f90
+###
+### MPI compilation without optimization
+###
 
-$O/combine_surf_data.o: $(SHARED)constants.h ${SHARED}/combine_surf_data.f90
-	${FCCOMPILE_CHECK} -c -o $O/combine_surf_data.o ${SHARED}/combine_surf_data.f90
+$O/parallel.o: $(SHARED)constants.h $(SHARED)parallel.f90
+	${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o $(SHARED)parallel.f90
 
-$O/prepare_assemble_MPI.o: $(SHARED)constants.h ${SHARED}/prepare_assemble_MPI.f90
-	${FCCOMPILE_CHECK} -c -o $O/prepare_assemble_MPI.o ${SHARED}/prepare_assemble_MPI.f90
+$O/serial.o: $(SHARED)constants.h $(SHARED)serial.f90
+	${FCCOMPILE_CHECK} -c -o $O/serial.o $(SHARED)serial.f90
+  
+$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90
+	${MPIFCCOMPILE_NO_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90
 
-$O/PML_init.o: $(SHARED)constants.h PML_init.f90
-	${FCCOMPILE_CHECK} -c -o $O/PML_init.o PML_init.f90
 
-##
-## smoothing
-##
 
-$O/smooth_vol_data.o: $(SHARED)constants.h ${SHARED}/smooth_vol_data.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/smooth_vol_data.o ${SHARED}/smooth_vol_data.f90
 
 ##
 ## kernel summation
 ##
 
-$O/sum_kernels.o: $(SHARED)constants.h ${SHARED}/sum_kernels.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/sum_kernels.o ${SHARED}/sum_kernels.f90
+$O/sum_kernels.o: $(SHARED)constants.h $(SHARED)sum_kernels.f90
+	${MPIFCCOMPILE_NO_CHECK} -c -o $O/sum_kernels.o $(SHARED)sum_kernels.f90
 
 ##
 ## model update
@@ -576,16 +410,6 @@ $O/sum_kernels.o: $(SHARED)constants.h ${SHARED}/sum_kernels.f90
 $O/model_update.o: $(SHARED)constants.h model_update.f90
 	${MPIFCCOMPILE_NO_CHECK} -c -o $O/model_update.o model_update.f90
 
-$O/save_external_bin_m_up.o:  ${SHARED}/constants.h save_external_bin_m_up.f90
+$O/save_external_bin_m_up.o:  $(SHARED)constants.h save_external_bin_m_up.f90
 	${FCCOMPILE_CHECK} -c -o $O/save_external_bin_m_up.o save_external_bin_m_up.f90
 
-###
-### C files below
-###
-
-$O/param_reader.o: ${SHARED}/param_reader.c
-	${CC} -c $(CFLAGS) -o $O/param_reader.o ${SHARED}/param_reader.c -I../../
-
-$O/write_c_binary.o: ${SHARED}/write_c_binary.c
-	${CC} -c $(CFLAGS) -o $O/write_c_binary.o ${SHARED}/write_c_binary.c -I../../
-
diff --git a/src/specfem3D/assemble_MPI_vector.f90 b/src/specfem3D/assemble_MPI_vector.f90
index 353c653a1..b8ef9ede5 100644
--- a/src/specfem3D/assemble_MPI_vector.f90
+++ b/src/specfem3D/assemble_MPI_vector.f90
@@ -134,65 +134,65 @@ end subroutine assemble_MPI_vector_ext_mesh
 !
 
   subroutine assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
-            buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
-            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,my_neighbours_ext_mesh, &
-            request_send_vector_ext_mesh,request_recv_vector_ext_mesh &
-            )
+                                           buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                           nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                           my_neighbours_ext_mesh, &
+                                           request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
 
-! sends data
+    ! sends data
 
-  implicit none
+    implicit none
 
-  include "constants.h"
+    include "constants.h"
 
-  integer :: NPROC
-  integer :: NGLOB_AB
+    integer :: NPROC
+    integer :: NGLOB_AB
 
-! array to assemble
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+    ! array to assemble
+    real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
 
-  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+    integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-       buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+    real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+         buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
 
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
-  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+    integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+    integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+    integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
 
-  integer ipoin,iinterface
+    integer ipoin,iinterface
 
-! here we have to assemble all the contributions between partitions using MPI
+    ! here we have to assemble all the contributions between partitions using MPI
 
-! assemble only if more than one partition
-  if(NPROC > 1) then
+    ! assemble only if more than one partition
+    if(NPROC > 1) then
 
-! partition border copy into the buffer
-  do iinterface = 1, num_interfaces_ext_mesh
-    do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-      buffer_send_vector_ext_mesh(:,ipoin,iinterface) = &
-        array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
-    enddo
-  enddo
+       ! partition border copy into the buffer
+       do iinterface = 1, num_interfaces_ext_mesh
+          do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+             buffer_send_vector_ext_mesh(:,ipoin,iinterface) = &
+                  array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
+          enddo
+       enddo
 
-! send messages
-  do iinterface = 1, num_interfaces_ext_mesh
-    call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
-         NDIM*nibool_interfaces_ext_mesh(iinterface), &
-         my_neighbours_ext_mesh(iinterface), &
-         itag, &
-         request_send_vector_ext_mesh(iinterface) &
-         )
-    call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
-         NDIM*nibool_interfaces_ext_mesh(iinterface), &
-         my_neighbours_ext_mesh(iinterface), &
-         itag, &
-         request_recv_vector_ext_mesh(iinterface) &
-         )
-  enddo
+       ! send messages
+       do iinterface = 1, num_interfaces_ext_mesh
+          call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
+               NDIM*nibool_interfaces_ext_mesh(iinterface), &
+               my_neighbours_ext_mesh(iinterface), &
+               itag, &
+               request_send_vector_ext_mesh(iinterface) &
+               )
+          call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
+               NDIM*nibool_interfaces_ext_mesh(iinterface), &
+               my_neighbours_ext_mesh(iinterface), &
+               itag, &
+               request_recv_vector_ext_mesh(iinterface) &
+               )
+       enddo
 
-  endif
+    endif
 
   end subroutine assemble_MPI_vector_ext_mesh_s
 
@@ -256,6 +256,7 @@ subroutine assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
 
   end subroutine assemble_MPI_vector_ext_mesh_w
 
+
 !
 !--------------------------------------------------------------------------------------------------
 !
@@ -405,3 +406,387 @@ subroutine assemble_MPI_vector_poro_w(NPROC,NGLOB_AB,array_val1,array_val2, &
   endif
 
   end subroutine assemble_MPI_vector_poro_w
+
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! with cuda functions...
+
+  subroutine transfer_boundary_to_device(NPROC, Mesh_pointer, &
+                                            buffer_recv_vector_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+                                            request_recv_vector_ext_mesh)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer(kind=8) :: Mesh_pointer
+
+  ! array to assemble
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
+
+  ! local parameters
+  !integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  !integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
+  integer :: iinterface
+
+  ! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+  ! wait for communications completion (recv)
+  !write(IMAIN,*) "sending MPI_wait"
+  do iinterface = 1, num_interfaces_ext_mesh
+    call wait_req(request_recv_vector_ext_mesh(iinterface))
+  enddo
+
+  ! send contributions to GPU
+  call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+                                    num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+  endif
+
+  ! This step is done via previous function transfer_and_assemble...
+  ! do iinterface = 1, num_interfaces_ext_mesh
+  !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+  !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+  !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+  !   enddo
+  ! enddo
+
+  end subroutine transfer_boundary_to_device
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! not used...
+!  subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
+!                                            buffer_recv_vector_ext_mesh, &
+!                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+!                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+!                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+!                                            FORWARD_OR_ADJOINT )
+!
+!  implicit none
+!
+!  include "constants.h"
+!
+!  integer :: NPROC
+!  integer :: NGLOB_AB
+!  integer(kind=8) :: Mesh_pointer
+!  ! array to assemble
+!  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+!
+!  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+!
+!  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+!       buffer_recv_vector_ext_mesh
+!
+!  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+!  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+!  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
+!  !integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
+!
+!  integer :: FORWARD_OR_ADJOINT
+!
+!  ! local parameters
+!  integer :: iinterface
+!
+!  ! here we have to assemble all the contributions between partitions using MPI
+!
+!  ! assemble only if more than one partition
+!  if(NPROC > 1) then
+!
+!     ! adding contributions of neighbours
+!     call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+!          num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+!          nibool_interfaces_ext_mesh,&
+!          ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+!
+!     ! This step is done via previous function transfer_and_assemble...
+!     ! do iinterface = 1, num_interfaces_ext_mesh
+!     !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+!     !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+!     !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+!     !   enddo
+!     ! enddo
+!
+!     ! wait for communications completion (send)
+!     do iinterface = 1, num_interfaces_ext_mesh
+!        call wait_req(request_send_vector_ext_mesh(iinterface))
+!     enddo
+!  endif
+!
+!  end subroutine assemble_MPI_vector_write_cuda_no_transfer
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
+  subroutine assemble_MPI_vector_send_cuda(NPROC, &
+                                          buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                                          num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                          nibool_interfaces_ext_mesh, &
+                                          my_neighbours_ext_mesh, &
+                                          request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+! sends data
+! note: array to assemble already filled into buffer_send_vector_ext_mesh array
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  ! local parameters
+  integer :: iinterface
+
+  ! note: preparation of the contribution between partitions using MPI
+  !          already done in transfer_boun_accel routine
+
+  ! send only if more than one partition
+  if(NPROC > 1) then
+
+     ! send messages
+     do iinterface = 1, num_interfaces_ext_mesh
+        call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_send_vector_ext_mesh(iinterface) &
+             )
+        call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_recv_vector_ext_mesh(iinterface) &
+             )
+     enddo
+
+  endif
+
+  end subroutine assemble_MPI_vector_send_cuda
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
+                                            buffer_recv_vector_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                                            FORWARD_OR_ADJOINT )
+
+! waits for data to receive and assembles
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+  integer(kind=8) :: Mesh_pointer
+! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer :: FORWARD_OR_ADJOINT
+
+  ! local parameters
+  integer :: iinterface
+
+  ! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_vector_ext_mesh(iinterface))
+    enddo
+
+    ! adding contributions of neighbours
+    call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+                                      num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                      nibool_interfaces_ext_mesh,&
+                                      ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+
+    ! This step is done via previous function transfer_and_assemble...
+    ! do iinterface = 1, num_interfaces_ext_mesh
+    !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+    !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+    !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+    !   enddo
+    ! enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_vector_ext_mesh(iinterface))
+    enddo
+
+  endif
+
+  end subroutine assemble_MPI_vector_write_cuda
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine assemble_MPI_scalar_send_cuda(NPROC, &
+                        buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+
+! non-blocking MPI send
+
+  ! sends data
+  ! note: assembling data already filled into buffer_send_scalar_ext_mesh array
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+
+  ! local parameters
+  integer :: iinterface
+
+  ! sends only if more than one partition
+  if(NPROC > 1) then
+
+    ! note: partition border copy into the buffer has already been done
+    !          by routine transfer_boun_pot_from_device()
+
+    ! send messages
+    do iinterface = 1, num_interfaces_ext_mesh
+      ! non-blocking synchronous send request
+      call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_send_scalar_ext_mesh(iinterface) &
+           )
+      ! receive request
+      call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_recv_scalar_ext_mesh(iinterface) &
+           )
+
+    enddo
+
+  endif
+
+  end subroutine assemble_MPI_scalar_send_cuda
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,array_val, &
+                        Mesh_pointer, &
+                        buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
+                        max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                        FORWARD_OR_ADJOINT)
+
+! waits for send/receiver to be completed and assembles contributions
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+  integer(kind=8) :: Mesh_pointer
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  ! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: array_val
+
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_scalar_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+
+  integer FORWARD_OR_ADJOINT
+
+  integer iinterface ! ipoin
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_scalar_ext_mesh(iinterface))
+    enddo
+
+    ! adding contributions of neighbours
+    call transfer_asmbl_pot_to_device(Mesh_pointer, array_val, buffer_recv_scalar_ext_mesh, &
+                num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, nibool_interfaces_ext_mesh,&
+                ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+
+    ! note: adding contributions of neighbours has been done just above for cuda
+    !do iinterface = 1, num_interfaces_ext_mesh
+    !  do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+    !    array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+    !         array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) &
+    !         + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+    !  enddo
+    !enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_scalar_ext_mesh(iinterface))
+    enddo
+
+  endif
+
+  end subroutine assemble_MPI_scalar_write_cuda
diff --git a/src/specfem3D/compute_add_sources_acoustic.f90 b/src/specfem3D/compute_add_sources_acoustic.f90
index 219441554..c7434e404 100644
--- a/src/specfem3D/compute_add_sources_acoustic.f90
+++ b/src/specfem3D/compute_add_sources_acoustic.f90
@@ -35,12 +35,13 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
                                   SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
                                   nrec,islice_selected_rec,ispec_selected_rec, &
                                   nadj_rec_local,adj_sourcearrays,b_potential_dot_dot_acoustic, &
-                                  NTSTEP_BETWEEN_READ_ADJSRC )
+                                  NTSTEP_BETWEEN_READ_ADJSRC, &
+                                  GPU_MODE, Mesh_pointer )
 
   use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
                         xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
                         station_name,network_name,adj_source_file,nrec_local,number_receiver_global, &
-                        pm1_source_encoding
+                        pm1_source_encoding,nsources_local
   implicit none
 
   include "constants.h"
@@ -74,13 +75,16 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
 
 !adjoint simulations
   integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT
+  logical:: GPU_MODE
+  integer(kind=8) :: Mesh_pointer
   integer:: nrec
   integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec
   integer:: nadj_rec_local
   real(kind=CUSTOM_REAL),dimension(NGLOB_ADJOINT):: b_potential_dot_dot_acoustic
   logical :: ibool_read_adj_arrays
   integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC
-  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: adj_sourcearrays
+  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: &
+    adj_sourcearrays
 
 ! local parameters
   double precision :: f0
@@ -89,6 +93,7 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
   real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source
   integer :: isource,iglob,ispec,i,j,k,ier
   integer :: irec_local,irec
+  double precision, dimension(NSOURCES) :: stf_pre_compute
 
 ! adjoint sources in SU format
   integer :: it_start,it_end
@@ -109,101 +114,136 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
   endif
 
 ! forward simulations
-  if (SIMULATION_TYPE == 1) then
-
-    ! adds acoustic sources
-    do isource = 1,NSOURCES
-
-      !   add the source (only if this proc carries the source)
-      if(myrank == islice_selected_source(isource)) then
-
-        ispec = ispec_selected_source(isource)
+  if (SIMULATION_TYPE == 1 .and. nsources_local > 0) then
+
+!way 2
+    if(GPU_MODE) then
+      if( NSOURCES > 0 ) then
+        do isource = 1,NSOURCES
+          if(USE_FORCE_POINT_SOURCE) then
+            ! precomputes source time function factor
+            stf_pre_compute(isource) = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( &
+                  dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+          else
+            if( USE_RICKER_IPATI ) then
+              stf_pre_compute(isource) = comp_source_time_function_rickr( &
+                          dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+            else
+              stf_pre_compute(isource) = comp_source_time_function_gauss( &
+                          dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+            endif
+          endif
+        enddo
+        stf_used_total = stf_used_total + sum(stf_pre_compute(:))
+        ! only implements SIMTYPE=1 and NOISE_TOM=0
+        ! write(*,*) "fortran dt = ", dt
+        ! change dt -> DT
+        call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, &
+                                        NSOURCES, SIMULATION_TYPE, &
+                                        stf_pre_compute, myrank)
+      endif
 
-        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+    else ! .NOT. GPU_MODE
 
-          if( ispec_is_acoustic(ispec) ) then
+      ! adds acoustic sources
+      do isource = 1,NSOURCES
 
-            if(USE_FORCE_POINT_SOURCE) then
+        !   add the source (only if this proc carries the source)
+        if(myrank == islice_selected_source(isource)) then
 
-              ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-              iglob = ibool(nint(xi_source(isource)), &
-                             nint(eta_source(isource)), &
-                             nint(gamma_source(isource)), &
-                             ispec)
+          ispec = ispec_selected_source(isource)
 
-              f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+          if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-              !if (it == 1 .and. myrank == 0) then
-              !  write(IMAIN,*) 'using a source of dominant frequency ',f0
-              !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-              !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-              !endif
+            if( ispec_is_acoustic(ispec) ) then
 
-              ! gaussian source time function
-              !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+              if(USE_FORCE_POINT_SOURCE) then
 
-              ! we use nu_source(:,3) here because we want a source normal to the surface.
-              ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-              stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+                ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+                iglob = ibool(nint(xi_source(isource)), &
+                               nint(eta_source(isource)), &
+                               nint(gamma_source(isource)), &
+                               ispec)
 
-              ! source encoding
-              stf_used = stf_used * pm1_source_encoding(isource)
+                f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-              ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
-              ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
-              ! to add minus the source to Chi_dot_dot to get plus the source in pressure:
+                !if (it == 1 .and. myrank == 0) then
+                !  write(IMAIN,*) 'using a source of dominant frequency ',f0
+                !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                !endif
 
-              ! acoustic source for pressure gets divided by kappa
-              ! source contribution
-              potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
-                                - stf_used / kappastore(nint(xi_source(isource)), &
-                                                        nint(eta_source(isource)), &
-                                                        nint(gamma_source(isource)),ispec)
+                ! we use nu_source(:,3) here because we want a source normal to the surface.
+                ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+                stf_used = FACTOR_FORCE_SOURCE * &
+                           comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
 
-            else
 
-              ! gaussian source time
-              stf = comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                ! source encoding
+                stf_used = stf_used * pm1_source_encoding(isource)
 
-              ! quasi-heaviside
-              !stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
+                ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
+                ! to add minus the source to Chi_dot_dot to get plus the source in pressure:
 
-              ! source encoding
-              stf = stf * pm1_source_encoding(isource)
+                ! acoustic source for pressure gets divided by kappa
+                ! source contribution
+                potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
+                                  - stf_used / kappastore(nint(xi_source(isource)), &
+                                                          nint(eta_source(isource)), &
+                                                          nint(gamma_source(isource)),ispec)
 
-              ! distinguishes between single and double precision for reals
-              if(CUSTOM_REAL == SIZE_REAL) then
-                stf_used = sngl(stf)
               else
-                stf_used = stf
-              endif
 
-              ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
-              ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
-              ! to add minus the source to Chi_dot_dot to get plus the source in pressure
-
-              !     add source array
-              do k=1,NGLLZ
-                do j=1,NGLLY
-                   do i=1,NGLLX
-                      ! adds source contribution
-                      ! note: acoustic source for pressure gets divided by kappa
-                      iglob = ibool(i,j,k,ispec)
-                      potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
-                              - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec)
-                   enddo
+                if( USE_RICKER_IPATI ) then
+                  stf = comp_source_time_function_rickr( &
+                                dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+                else
+                  ! gaussian source time
+                  stf = comp_source_time_function_gauss( &
+                                dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                endif
+
+                ! quasi-heaviside
+                !stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+
+                ! source encoding
+                stf = stf * pm1_source_encoding(isource)
+
+                ! distinguishes between single and double precision for reals
+                if(CUSTOM_REAL == SIZE_REAL) then
+                  stf_used = sngl(stf)
+                else
+                  stf_used = stf
+                endif
+
+                ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
+                ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
+                ! to add minus the source to Chi_dot_dot to get plus the source in pressure
+
+                !     add source array
+                do k=1,NGLLZ
+                  do j=1,NGLLY
+                     do i=1,NGLLX
+                        ! adds source contribution
+                        ! note: acoustic source for pressure gets divided by kappa
+                        iglob = ibool(i,j,k,ispec)
+                        potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
+                                - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec)
+                     enddo
+                  enddo
                 enddo
-              enddo
 
-            endif ! USE_FORCE_POINT_SOURCE
+              endif ! USE_FORCE_POINT_SOURCE
 
-            stf_used_total = stf_used_total + stf_used
+              stf_used_total = stf_used_total + stf_used
 
-          endif ! ispec_is_elastic
-        endif ! ispec_is_inner
-      endif ! myrank
+            endif ! ispec_is_acoustic
+          endif ! ispec_is_inner
+        endif ! myrank
 
-    enddo ! NSOURCES
+      enddo ! NSOURCES
+    endif ! GPU_MODE
   endif
 
 ! NOTE: adjoint sources and backward wavefield timing:
@@ -233,119 +273,137 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
 ! adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
 
-    ! read in adjoint sources block by block (for memory consideration)
-    ! e.g., in exploration experiments, both the number of receivers (nrec) and
-    ! the number of time steps (NSTEP) are huge,
-    ! which may cause problems since we have a large array:
-    !     adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
-
-    ! figure out if we need to read in a chunk of the adjoint source at this timestep
-    it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
-    ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
-
-    ! needs to read in a new chunk/block of the adjoint source
-    ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
-    ! we first do calculations for the boudaries, and then start communication
-    ! with other partitions while we calculate for the inner part
-    ! this must be done carefully, otherwise the adjoint sources may be added twice
-    if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
-
-      ! allocates temporary source array
-      allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
-
-      if (.not. SU_FORMAT) then
-         !!! read ascii adjoint sources
-         irec_local = 0
-         do irec = 1, nrec
-           ! compute source arrays
-           if (myrank == islice_selected_rec(irec)) then
-             irec_local = irec_local + 1
-
-             ! reads in **sta**.**net**.**LH**.adj files
-             adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
-             call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                       xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                       adj_sourcearray, xigll,yigll,zigll, &
-                       it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
-             do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-               adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-             enddo
-
-           endif
-         enddo
-      else
-         !!! read SU adjoint sources
-         ! range of the block we need to read
-         it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
-         it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
-         write(procname,"(i4)") myrank
-         open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
-                            status='old',access='direct',recl=240+4*(NSTEP),iostat = ier)
-         if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
-                                    //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit')
-
-         do irec_local = 1,nrec_local
-           irec = number_receiver_global(irec_local)
-           read(IIN_SU1,rec=irec_local) r4head, adj_temp
-           adj_src(:,1)=adj_temp(it_start:it_end)
-           adj_src(:,2)=0.0  !TRIVIAL
-           adj_src(:,3)=0.0  !TRIVIAL
-           ! lagrange interpolators for receiver location
-           call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
-           call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
-           call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
-           ! interpolates adjoint source onto GLL points within this element
-           do k = 1, NGLLZ
-             do j = 1, NGLLY
-               do i = 1, NGLLX
-                 adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
+    ! adds adjoint source in this partitions
+    if( nadj_rec_local > 0 ) then
+
+      ! read in adjoint sources block by block (for memory consideration)
+      ! e.g., in exploration experiments, both the number of receivers (nrec) and
+      ! the number of time steps (NSTEP) are huge,
+      ! which may cause problems since we have a large array:
+      !     adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
+
+      ! figure out if we need to read in a chunk of the adjoint source at this timestep
+      it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
+      ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+
+      ! needs to read in a new chunk/block of the adjoint source
+      ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
+      ! we first do calculations for the boudaries, and then start communication
+      ! with other partitions while we calculate for the inner part
+      ! this must be done carefully, otherwise the adjoint sources may be added twice
+      if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+
+        ! allocates temporary source array
+        allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
+
+        if (.not. SU_FORMAT) then
+           !!! read ascii adjoint sources
+           irec_local = 0
+           do irec = 1, nrec
+             ! compute source arrays
+             if (myrank == islice_selected_rec(irec)) then
+               irec_local = irec_local + 1
+
+               ! reads in **sta**.**net**.**LH**.adj files
+               adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
+               call compute_arrays_adjoint_source(myrank,adj_source_file, &
+                         xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                         adj_sourcearray, xigll,yigll,zigll, &
+                         it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+               do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+                 adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
                enddo
-             enddo
-           enddo
-           do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-             adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-           enddo
-         enddo
-         close(IIN_SU1)
-      endif !if (.not. SU_FORMAT)
-
-      deallocate(adj_sourcearray)
-
-    endif ! if(ibool_read_adj_arrays)
-
-    if( it < NSTEP ) then
-      ! receivers act as sources
-      irec_local = 0
-      do irec = 1,nrec
-        ! add the source (only if this proc carries the source)
-        if (myrank == islice_selected_rec(irec)) then
-          irec_local = irec_local + 1
 
-          ! adds source array
-          ispec = ispec_selected_rec(irec)
-
-          ! checks if element is in phase_is_inner run
-          if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
-
-            do k = 1,NGLLZ
-              do j = 1,NGLLY
-                do i = 1,NGLLX
-                  iglob = ibool(i,j,k,ispec)
+             endif
+           enddo
+        else
+          !!! read SU adjoint sources
+          ! range of the block we need to read
+          it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
+          it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
+          write(procname,"(i4)") myrank
+          open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
+                            status='old',access='direct',recl=240+4*(NSTEP),iostat = ier)
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+                                    //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit')
 
-                  potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
-                        + adj_sourcearrays(irec_local, &
-                          NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
-                          1,i,j,k)
+          do irec_local = 1,nrec_local
+            irec = number_receiver_global(irec_local)
+            read(IIN_SU1,rec=irec_local) r4head, adj_temp
+            adj_src(:,1)=adj_temp(it_start:it_end)
+            adj_src(:,2)=0.0  !TRIVIAL
+            adj_src(:,3)=0.0  !TRIVIAL
+            ! lagrange interpolators for receiver location
+            call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
+            call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
+            call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
+            ! interpolates adjoint source onto GLL points within this element
+            do k = 1, NGLLZ
+              do j = 1, NGLLY
+                do i = 1, NGLLX
+                  adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
                 enddo
               enddo
             enddo
-
-          endif ! phase_is_inner
-
-        endif
-      enddo ! nrec
-    endif ! it
+            do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+              adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
+            enddo
+          enddo
+          close(IIN_SU1)
+        endif !if (.not. SU_FORMAT)
+
+        deallocate(adj_sourcearray)
+      endif ! if(ibool_read_adj_arrays)
+
+      if( it < NSTEP ) then
+        ! receivers act as sources
+        if( .NOT. GPU_MODE ) then
+          irec_local = 0
+          do irec = 1,nrec
+            ! add the source (only if this proc carries the source)
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
+
+              ! adds source array
+              ispec = ispec_selected_rec(irec)
+              if( ispec_is_acoustic(ispec) ) then
+
+                ! checks if element is in phase_is_inner run
+                if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+                  do k = 1,NGLLZ
+                    do j = 1,NGLLY
+                      do i = 1,NGLLX
+                        iglob = ibool(i,j,k,ispec)
+                        ! beware, for acoustic medium, a pressure source would be taking the negative
+                        ! and divide by Kappa of the fluid;
+                        ! this would have to be done when constructing the adjoint source.
+                        !
+                        ! note: we take the first component of the adj_sourcearrays
+                        !          the idea is to have e.g. a pressure source, where all 3 components would be the same
+                        potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
+                              + adj_sourcearrays(irec_local, &
+                                NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                1,i,j,k)
+                      enddo
+                    enddo
+                  enddo
+                endif ! phase_is_inner
+              endif
+            endif
+          enddo ! nrec
+        else
+          ! on GPU
+          call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+                                             ispec_is_inner,ispec_is_acoustic, &
+                                             ispec_selected_rec,myrank,nrec, &
+                                             NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                             islice_selected_rec,nadj_rec_local, &
+                                             NTSTEP_BETWEEN_READ_ADJSRC)
+
+        endif ! GPU_MODE
+      endif ! it
+    endif ! nadj_rec_local > 0
   endif
 
 ! note:  b_potential() is read in after Newmark time scheme, thus
@@ -353,100 +411,137 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
 !           thus indexing is NSTEP - it , instead of NSTEP - it - 1
 
 ! adjoint simulations
-  if (SIMULATION_TYPE == 3) then
-    ! adds acoustic sources
-    do isource = 1,NSOURCES
-
-      !   add the source (only if this proc carries the source)
-      if(myrank == islice_selected_source(isource)) then
+  if (SIMULATION_TYPE == 3 .and. nsources_local > 0) then
+
+    ! on GPU
+    if(GPU_MODE) then
+      if( NSOURCES > 0 ) then
+        do isource = 1,NSOURCES
+          if(USE_FORCE_POINT_SOURCE) then
+            ! precomputes source time function factors
+            stf_pre_compute(isource) = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( &
+                  dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource))
+          else
+            if( USE_RICKER_IPATI ) then
+              stf_pre_compute(isource) = comp_source_time_function_rickr( &
+                            dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource))
+            else
+              stf_pre_compute(isource) = comp_source_time_function_gauss( &
+                            dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+            endif
+          endif
+        enddo
+        stf_used_total = stf_used_total + sum(stf_pre_compute(:))
 
-        ispec = ispec_selected_source(isource)
+        ! only implements SIMTYPE=3
+        call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, &
+                                           NSOURCES, SIMULATION_TYPE, &
+                                           stf_pre_compute, myrank)
+      endif
 
-        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+    else ! .NOT. GPU_MODE
 
-          if( ispec_is_acoustic(ispec) ) then
+      ! adds acoustic sources
+      do isource = 1,NSOURCES
 
-            if(USE_FORCE_POINT_SOURCE) then
+        !   add the source (only if this proc carries the source)
+        if(myrank == islice_selected_source(isource)) then
 
-              ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-              iglob = ibool(nint(xi_source(isource)), &
-                             nint(eta_source(isource)), &
-                             nint(gamma_source(isource)), &
-                             ispec)
+          ispec = ispec_selected_source(isource)
 
-              f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+          if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-              !if (it == 1 .and. myrank == 0) then
-              !  write(IMAIN,*) 'using a source of dominant frequency ',f0
-              !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-              !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-              !endif
+            if( ispec_is_acoustic(ispec) ) then
 
-              ! gaussian source time function
-              !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+              if(USE_FORCE_POINT_SOURCE) then
 
-              ! we use nu_source(:,3) here because we want a source normal to the surface.
-              ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-              stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
+                ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+                iglob = ibool(nint(xi_source(isource)), &
+                               nint(eta_source(isource)), &
+                               nint(gamma_source(isource)), &
+                               ispec)
 
-              ! source encoding
-              stf_used = stf_used * pm1_source_encoding(isource)
+                f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-              ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
-              ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
-              ! to add minus the source to Chi_dot_dot to get plus the source in pressure:
+                !if (it == 1 .and. myrank == 0) then
+                !  write(IMAIN,*) 'using a source of dominant frequency ',f0
+                !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                !endif
 
-              ! acoustic source for pressure gets divided by kappa
-              ! source contribution
-              b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
-                          - stf_used / kappastore(nint(xi_source(isource)), &
-                                               nint(eta_source(isource)), &
-                                               nint(gamma_source(isource)),ispec)
+                ! gaussian source time function
+                !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
 
-            else
+                ! we use nu_source(:,3) here because we want a source normal to the surface.
+                ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+                stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( &
+                                        dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
 
-              ! gaussian source time
-              stf = comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                ! source encoding
+                stf_used = stf_used * pm1_source_encoding(isource)
 
-              ! quasi-heaviside
-              !stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
+                ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
+                ! to add minus the source to Chi_dot_dot to get plus the source in pressure:
 
-              ! source encoding
-              stf = stf * pm1_source_encoding(isource)
+                ! acoustic source for pressure gets divided by kappa
+                ! source contribution
+                b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
+                            - stf_used / kappastore(nint(xi_source(isource)), &
+                                                 nint(eta_source(isource)), &
+                                                 nint(gamma_source(isource)),ispec)
 
-              ! distinguishes between single and double precision for reals
-              if(CUSTOM_REAL == SIZE_REAL) then
-                stf_used = sngl(stf)
               else
-                stf_used = stf
-              endif
 
-              ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
-              ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
-              ! to add minus the source to Chi_dot_dot to get plus the source in pressure
-
-              !     add source array
-              do k=1,NGLLZ
-                do j=1,NGLLY
-                   do i=1,NGLLX
-                      ! adds source contribution
-                      ! note: acoustic source for pressure gets divided by kappa
-                      iglob = ibool(i,j,k,ispec)
-                      b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
-                              - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec)
-                   enddo
+                if( USE_RICKER_IPATI ) then
+                  stf = comp_source_time_function_rickr( &
+                                  dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource))
+                else
+                  ! gaussian source time
+                  stf = comp_source_time_function_gauss( &
+                                  dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                endif
+
+                ! quasi-heaviside
+                !stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+
+                ! source encoding
+                stf = stf * pm1_source_encoding(isource)
+
+                ! distinguishes between single and double precision for reals
+                if(CUSTOM_REAL == SIZE_REAL) then
+                  stf_used = sngl(stf)
+                else
+                  stf_used = stf
+                endif
+
+                ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid
+                ! the sign is negative because pressure p = - Chi_dot_dot therefore we need
+                ! to add minus the source to Chi_dot_dot to get plus the source in pressure
+
+                !     add source array
+                do k=1,NGLLZ
+                  do j=1,NGLLY
+                     do i=1,NGLLX
+                        ! adds source contribution
+                        ! note: acoustic source for pressure gets divided by kappa
+                        iglob = ibool(i,j,k,ispec)
+                        b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
+                                - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec)
+                     enddo
+                  enddo
                 enddo
-              enddo
 
-            endif ! USE_FORCE_POINT_SOURCE
+              endif ! USE_FORCE_POINT_SOURCE
 
-            stf_used_total = stf_used_total + stf_used
+              stf_used_total = stf_used_total + stf_used
 
-          endif ! ispec_is_elastic
-        endif ! ispec_is_inner
-      endif ! myrank
+            endif ! ispec_is_elastic
+          endif ! ispec_is_inner
+        endif ! myrank
 
-    enddo ! NSOURCES
+      enddo ! NSOURCES
+    endif ! GPU_MODE
   endif
 
   ! master prints out source time function to file
@@ -456,5 +551,4 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou
     if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all
   endif
 
-
-end subroutine compute_add_sources_acoustic
+  end subroutine compute_add_sources_acoustic
diff --git a/src/specfem3D/compute_add_sources_elastic.f90 b/src/specfem3D/compute_add_sources_elastic.f90
index fda038812..7f351409f 100644
--- a/src/specfem3D/compute_add_sources_elastic.f90
+++ b/src/specfem3D/compute_add_sources_elastic.f90
@@ -34,19 +34,23 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
                         ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
                         nrec,islice_selected_rec,ispec_selected_rec, &
                         nadj_rec_local,adj_sourcearrays,b_accel, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY  )
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,GPU_MODE,Mesh_pointer  )
 
   use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
                         xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
                         station_name,network_name,adj_source_file, &
                         LOCAL_PATH,wgllwgll_xy, &
-                        num_free_surface_faces,free_surface_ispec,free_surface_ijk,free_surface_jacobian2Dw, &
+                        num_free_surface_faces,free_surface_ispec, &
+                        free_surface_ijk,free_surface_jacobian2Dw, &
                         noise_sourcearray,irec_master_noise, &
-                        normal_x_noise,normal_y_noise,normal_z_noise,mask_noise,noise_surface_movie, &
-                        nrec_local,number_receiver_global
+                        normal_x_noise,normal_y_noise,normal_z_noise, &
+                        mask_noise,noise_surface_movie, &
+                        nrec_local,number_receiver_global, &
+                        nsources_local
 
   use specfem_par_movie,only: &
-                        store_val_ux_external_mesh,store_val_uy_external_mesh,store_val_uz_external_mesh
+                        store_val_ux_external_mesh,store_val_uy_external_mesh, &
+                        store_val_uz_external_mesh
 
   implicit none
 
@@ -79,21 +83,26 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
 
 !adjoint simulations
   integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT
+  logical:: GPU_MODE
+  integer(kind=8) :: Mesh_pointer
   integer:: nrec
   integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec
   integer:: nadj_rec_local
   real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
   logical :: ibool_read_adj_arrays
   integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY
-  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: adj_sourcearrays
+  real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: &
+    adj_sourcearrays
 
 ! local parameters
   double precision :: f0
   double precision :: stf
   real(kind=CUSTOM_REAL),dimension(:,:,:,:,:),allocatable:: adj_sourcearray
   real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source
-  integer :: isource,iglob,i,j,k,ispec,ier
-  integer :: irec_local,irec
+  ! for GPU_MODE
+  double precision, dimension(NSOURCES) :: stf_pre_compute
+  integer :: isource,iglob,i,j,k,ispec
+  integer :: irec_local,irec, ier
 
 ! adjoint sources in SU format
   integer :: it_start,it_end
@@ -105,83 +114,108 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
   !integer(kind=4) :: i4head(nheader/4)  ! 4-byte-integer
   real(kind=4)    :: r4head(nheader/4)  ! 4-byte-real
   !equivalence (i2head,i4head,r4head)    ! share the same 240-byte-memory
-  double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY),hgammar(NGLLZ), hpgammar(NGLLZ)
+  double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
 
 ! plotting source time function
   if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
-    ! initializes total
-    stf_used_total = 0.0_CUSTOM_REAL
+     ! initializes total
+     stf_used_total = 0.0_CUSTOM_REAL
   endif
 
-! forward simulations
-  if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0) then
+  ! forward simulations
+  if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
+
+    if(GPU_MODE) then
+      do isource = 1,NSOURCES
+        if( USE_RICKER_IPATI ) then
+          stf_pre_compute(isource) = comp_source_time_function_rickr( &
+                                        dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+        else
+          stf_pre_compute(isource) = comp_source_time_function( &
+                                        dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+        endif
+      enddo
+      ! only implements SIMTYPE=1 and NOISE_TOM=0
+      ! write(*,*) "fortran dt = ", dt
+      ! change dt -> DT
+      call compute_add_sources_el_cuda(Mesh_pointer, &
+                                      phase_is_inner,NSOURCES, &
+                                      stf_pre_compute, myrank)
 
-    do isource = 1,NSOURCES
+    else ! .NOT. GPU_MODE
 
-      !   add the source (only if this proc carries the source)
-      if(myrank == islice_selected_source(isource)) then
+      do isource = 1,NSOURCES
 
-        ispec = ispec_selected_source(isource)
+         !   add the source (only if this proc carries the source)
+         if(myrank == islice_selected_source(isource)) then
 
-        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+            ispec = ispec_selected_source(isource)
 
-          if( ispec_is_elastic(ispec) ) then
+            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-            if(USE_FORCE_POINT_SOURCE) then
+               if( ispec_is_elastic(ispec) ) then
 
-              ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-              iglob = ibool(nint(xi_source(isource)), &
-                             nint(eta_source(isource)), &
-                             nint(gamma_source(isource)), &
-                             ispec_selected_source(isource))
+                  if(USE_FORCE_POINT_SOURCE) then
 
-              f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+                    ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+                    iglob = ibool(nint(xi_source(isource)), &
+                          nint(eta_source(isource)), &
+                          nint(gamma_source(isource)), &
+                          ispec_selected_source(isource))
 
-              !if (it == 1 .and. myrank == 0) then
-              !  write(IMAIN,*) 'using a source of dominant frequency ',f0
-              !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-              !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-              !endif
+                    f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-              ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-              stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+                    !if (it == 1 .and. myrank == 0) then
+                    !  write(IMAIN,*) 'using a source of dominant frequency ',f0
+                    !  write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                    !  write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                    !endif
 
-              ! we use a force in a single direction along one of the components:
-              !  x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE
-              ! e.g. nu_source(:,3) here would be a source normal to the surface (z-direction).
-              accel(:,iglob) = accel(:,iglob)  &
-                               + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
+                    ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+                    stf_used = FACTOR_FORCE_SOURCE * &
+                               comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
 
-            else
+                    ! we use a force in a single direction along one of the components:
+                    !  x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE
+                    ! e.g. nu_source(:,3) here would be a source normal to the surface (z-direction).
+                    accel(:,iglob) = accel(:,iglob)  &
+                          + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
 
-               stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                  else
 
-               !     distinguish between single and double precision for reals
-               if(CUSTOM_REAL == SIZE_REAL) then
-                  stf_used = sngl(stf)
-               else
-                  stf_used = stf
-               endif
+                    if( USE_RICKER_IPATI) then
+                      stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+                    else
+                      stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                    endif
 
-               !     add source array
-               do k=1,NGLLZ
-                  do j=1,NGLLY
-                     do i=1,NGLLX
-                        iglob = ibool(i,j,k,ispec)
-                        accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
-                     enddo
-                  enddo
-               enddo
+                    !     distinguish between single and double precision for reals
+                    if(CUSTOM_REAL == SIZE_REAL) then
+                      stf_used = sngl(stf)
+                    else
+                      stf_used = stf
+                    endif
 
-            endif ! USE_FORCE_POINT_SOURCE
+                    !     add source array
+                    do k=1,NGLLZ
+                      do j=1,NGLLY
+                        do i=1,NGLLX
+                          iglob = ibool(i,j,k,ispec)
+                          accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+                        enddo
+                      enddo
+                    enddo
 
-            stf_used_total = stf_used_total + stf_used
+                  endif ! USE_FORCE_POINT_SOURCE
 
-          endif ! ispec_is_elastic
-        endif ! ispec_is_inner
-      endif ! myrank
+                  stf_used_total = stf_used_total + stf_used
 
-    enddo ! NSOURCES
+               endif ! ispec_is_elastic
+            endif ! ispec_is_inner
+         endif ! myrank
+
+      enddo ! NSOURCES
+    endif ! GPU_MODE
   endif ! forward
 
 ! NOTE: adjoint sources and backward wavefield timing:
@@ -211,132 +245,146 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
 ! adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
 
-    ! read in adjoint sources block by block (for memory consideration)
-    ! e.g., in exploration experiments, both the number of receivers (nrec) and
-    ! the number of time steps (NSTEP) are huge,
-    ! which may cause problems since we have a large array:
-    !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
-
-    ! figure out if we need to read in a chunk of the adjoint source at this timestep
-    it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
-    ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
-
-    ! needs to read in a new chunk/block of the adjoint source
-    ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
-    ! we first do calculations for the boudaries, and then start communication
-    ! with other partitions while calculate for the inner part
-    ! this must be done carefully, otherwise the adjoint sources may be added twice
-    if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
-
-      ! allocates temporary source array
-      allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
-
-      if (.not. SU_FORMAT) then
-         !!! read ascii adjoint sources
-         irec_local = 0
-         do irec = 1, nrec
-           ! compute source arrays
-           if (myrank == islice_selected_rec(irec)) then
-             irec_local = irec_local + 1
-             ! reads in **sta**.**net**.**LH**.adj files
-             adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
-             call compute_arrays_adjoint_source(myrank,adj_source_file, &
-                       xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                       adj_sourcearray, xigll,yigll,zigll, &
-                       it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
-             do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-               adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-             enddo
-
-           endif
-         enddo
-      else
-         !!! read SU adjoint sources
-         ! range of the block we need to read
-         it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
-         it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
-         write(procname,"(i4)") myrank
-         ! read adjoint sources
-         open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
+    ! adds adjoint source in this partitions
+    if( nadj_rec_local > 0 ) then
+
+      ! read in adjoint sources block by block (for memory consideration)
+      ! e.g., in exploration experiments, both the number of receivers (nrec) and
+      ! the number of time steps (NSTEP) are huge,
+      ! which may cause problems since we have a large array:
+      !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
+
+      ! figure out if we need to read in a chunk of the adjoint source at this timestep
+      it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
+      ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+
+      ! needs to read in a new chunk/block of the adjoint source
+      ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
+      ! we first do calculations for the boudaries, and then start communication
+      ! with other partitions while calculate for the inner part
+      ! this must be done carefully, otherwise the adjoint sources may be added twice
+      if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+
+        ! allocates temporary source array
+        allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
+
+        if (.not. SU_FORMAT) then
+          !!! read ascii adjoint sources
+          irec_local = 0
+          do irec = 1, nrec
+            ! compute source arrays
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
+              ! reads in **sta**.**net**.**LH**.adj files
+              adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
+              call compute_arrays_adjoint_source(myrank,adj_source_file, &
+                    xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+                    adj_sourcearray, xigll,yigll,zigll, &
+                    it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+              do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+                adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
+              enddo
+            endif
+          enddo
+        else
+          !!! read SU adjoint sources
+          ! range of the block we need to read
+          it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
+          it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
+          write(procname,"(i4)") myrank
+          ! read adjoint sources
+          open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', &
                             status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-         if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
                                     //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit')
-         open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', &
+          open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', &
                             status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-         if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
                                     //'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj does not exit')
-         open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', &
+          open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', &
                             status='old',access='direct',recl=240+4*NSTEP,iostat = ier)
-         if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
+          if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) &
                                     //'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj does not exit')
 
-         do irec_local = 1,nrec_local
-           irec = number_receiver_global(irec_local)
-           read(IIN_SU1,rec=irec_local) r4head, adj_temp
-           adj_src(:,1)=adj_temp(it_start:it_end)
-           read(IIN_SU2,rec=irec_local) r4head, adj_temp
-           adj_src(:,2)=adj_temp(it_start:it_end)
-           read(IIN_SU3,rec=irec_local) r4head, adj_temp
-           adj_src(:,3)=adj_temp(it_start:it_end)
-           ! lagrange interpolators for receiver location
-           call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
-           call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
-           call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
-           ! interpolates adjoint source onto GLL points within this element
-           do k = 1, NGLLZ
-             do j = 1, NGLLY
-               do i = 1, NGLLX
-                 adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
-               enddo
-             enddo
-           enddo
-           do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-             adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
-           enddo
-         enddo
-         close(IIN_SU1)
-         close(IIN_SU2)
-         close(IIN_SU3)
-      endif !if(.not. SU_FORMAT)
-
-      deallocate(adj_sourcearray)
-
-    endif ! if(ibool_read_adj_arrays)
-
-    if( it < NSTEP ) then
-
-      ! receivers act as sources
-      irec_local = 0
-      do irec = 1,nrec
-
-        ! add the source (only if this proc carries the source)
-        if (myrank == islice_selected_rec(irec)) then
-          irec_local = irec_local + 1
-
-          ! checks if element is in phase_is_inner run
-          if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
-
-            ! adds source array
-            do k = 1,NGLLZ
-              do j = 1,NGLLY
-                do i = 1,NGLLX
-                  iglob = ibool(i,j,k,ispec_selected_rec(irec))
-
-                  accel(:,iglob) = accel(:,iglob)  &
-                       + adj_sourcearrays(irec_local, &
-                          NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
-                          :,i,j,k)
+          do irec_local = 1,nrec_local
+            irec = number_receiver_global(irec_local)
+            read(IIN_SU1,rec=irec_local) r4head, adj_temp
+            adj_src(:,1)=adj_temp(it_start:it_end)
+            read(IIN_SU2,rec=irec_local) r4head, adj_temp
+            adj_src(:,2)=adj_temp(it_start:it_end)
+            read(IIN_SU3,rec=irec_local) r4head, adj_temp
+            adj_src(:,3)=adj_temp(it_start:it_end)
+            ! lagrange interpolators for receiver location
+            call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir)
+            call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar)
+            call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar)
+            ! interpolates adjoint source onto GLL points within this element
+            do k = 1, NGLLZ
+              do j = 1, NGLLY
+                do i = 1, NGLLX
+                  adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:)
                 enddo
               enddo
             enddo
+            do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
+              adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
+            enddo
+          enddo
+          close(IIN_SU1)
+          close(IIN_SU2)
+          close(IIN_SU3)
+        endif !if(.not. SU_FORMAT)
 
-          endif ! phase_is_inner
-        endif
-      enddo ! nrec
+        deallocate(adj_sourcearray)
+      endif ! if(ibool_read_adj_arrays)
+
+
+      if( it < NSTEP ) then
+
+        if(.NOT. GPU_MODE) then
+
+          ! receivers act as sources
+          irec_local = 0
+          do irec = 1,nrec
 
-    endif ! it
+            ! add the source (only if this proc carries the source)
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
 
+              ispec = ispec_selected_rec(irec)
+              if( ispec_is_elastic(ispec) ) then
+
+                ! checks if element is in phase_is_inner run
+                if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
+
+                  ! adds source array
+                  do k = 1,NGLLZ
+                    do j = 1,NGLLY
+                      do i = 1,NGLLX
+                        iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+                        accel(:,iglob) = accel(:,iglob)  &
+                                  + adj_sourcearrays(irec_local, &
+                                  NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                  :,i,j,k)
+                      enddo
+                    enddo
+                  enddo
+                endif ! phase_is_inner
+              endif ! ispec_is_elastic
+            endif
+          enddo ! nrec
+        else ! GPU_MODE == .true.
+          call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+                                            ispec_is_inner,ispec_is_elastic, &
+                                            ispec_selected_rec,myrank,nrec, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                            islice_selected_rec,nadj_rec_local, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC)
+        endif ! GPU_MODE
+      endif ! it
+    endif ! nadj_rec_local
   endif !adjoint
 
 ! note:  b_displ() is read in after Newmark time scheme, thus
@@ -344,75 +392,99 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
 !           thus indexing is NSTEP - it , instead of NSTEP - it - 1
 
 ! adjoint simulations
-  if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0) then
+  if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
+
+    if(GPU_MODE) then
+      do isource = 1,NSOURCES
+        if( USE_RICKER_IPATI ) then
+          stf_pre_compute(isource) = comp_source_time_function_rickr( &
+                                      dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource))
+        else
+          stf_pre_compute(isource) = comp_source_time_function( &
+                                      dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+        endif
+      enddo
 
-    ! backward source reconstruction
-    do isource = 1,NSOURCES
+      call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
+                                         NSOURCES,phase_is_inner,myrank)
 
-      ! add the source (only if this proc carries the source)
-      if(myrank == islice_selected_source(isource)) then
+    else ! .NOT. GPU_MODE
 
-        ispec = ispec_selected_source(isource)
+      ! backward source reconstruction
+      do isource = 1,NSOURCES
 
-        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+         ! add the source (only if this proc carries the source)
+         if(myrank == islice_selected_source(isource)) then
 
-          if( ispec_is_elastic(ispec) ) then
+            ispec = ispec_selected_source(isource)
 
-            if(USE_FORCE_POINT_SOURCE) then
+            if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-               ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
-               iglob = ibool(nint(xi_source(isource)), &
-                             nint(eta_source(isource)), &
-                             nint(gamma_source(isource)), &
-                             ispec_selected_source(isource))
+               if( ispec_is_elastic(ispec) ) then
 
-               f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
+                  if(USE_FORCE_POINT_SOURCE) then
 
-               !if (it == 1 .and. myrank == 0) then
-               !   write(IMAIN,*) 'using a source of dominant frequency ',f0
-               !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
-               !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
-               !endif
+                     ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+                     iglob = ibool(nint(xi_source(isource)), &
+                          nint(eta_source(isource)), &
+                          nint(gamma_source(isource)), &
+                          ispec_selected_source(isource))
 
-               ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
-               stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
+                     f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
-               ! e.g. we use nu_source(:,3) here if we want a source normal to the surface.
-               ! note: time step is now at NSTEP-it
-               b_accel(:,iglob) = b_accel(:,iglob)  &
-                                  + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
+                     !if (it == 1 .and. myrank == 0) then
+                     !   write(IMAIN,*) 'using a source of dominant frequency ',f0
+                     !   write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0
+                     !   write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0)
+                     !endif
 
-            else
+                     ! This is the expression of a Ricker; should be changed according maybe to the Par_file.
+                     stf_used = FACTOR_FORCE_SOURCE * &
+                                comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0)
 
-              ! see note above: time step corresponds now to NSTEP-it
-              ! (also compare to it-1 for forward simulation)
-              stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                     ! e.g. we use nu_source(:,3) here if we want a source normal to the surface.
+                     ! note: time step is now at NSTEP-it
+                     b_accel(:,iglob) = b_accel(:,iglob)  &
+                          + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
 
-              ! distinguish between single and double precision for reals
-              if(CUSTOM_REAL == SIZE_REAL) then
-                stf_used = sngl(stf)
-              else
-                stf_used = stf
-              endif
+                  else
 
-              !  add source array
-              do k=1,NGLLZ
-                do j=1,NGLLY
-                  do i=1,NGLLX
-                    iglob = ibool(i,j,k,ispec_selected_source(isource))
-                    b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
-                  enddo
-                enddo
-              enddo
-            endif ! USE_FORCE_POINT_SOURCE
+                     ! see note above: time step corresponds now to NSTEP-it
+                     ! (also compare to it-1 for forward simulation)
+                     if( USE_RICKER_IPATI ) then
+                       stf = comp_source_time_function_rickr( &
+                                      dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource))
+                     else
+                       stf = comp_source_time_function( &
+                                      dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+                     endif
+
+                     ! distinguish between single and double precision for reals
+                     if(CUSTOM_REAL == SIZE_REAL) then
+                        stf_used = sngl(stf)
+                     else
+                        stf_used = stf
+                     endif
+
+                     !  add source array
+                     do k=1,NGLLZ
+                        do j=1,NGLLY
+                           do i=1,NGLLX
+                              iglob = ibool(i,j,k,ispec_selected_source(isource))
+                              b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used
+                           enddo
+                        enddo
+                     enddo
+                  endif ! USE_FORCE_POINT_SOURCE
 
-            stf_used_total = stf_used_total + stf_used
+                  stf_used_total = stf_used_total + stf_used
 
-          endif ! elastic
-        endif ! phase_inner
-      endif ! myrank
+               endif ! elastic
+            endif ! phase_inner
+         endif ! myrank
 
-    enddo ! NSOURCES
+      enddo ! NSOURCES
+    endif ! GPU_MODE
   endif ! adjoint
 
   ! master prints out source time function to file
@@ -432,11 +504,15 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
        ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
        ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
        ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-       call add_source_master_rec_noise(myrank,nrec, &
-                                NSTEP,accel,noise_sourcearray, &
-                                ibool,islice_selected_rec,ispec_selected_rec, &
-                                it,irec_master_noise, &
-                                NSPEC_AB,NGLOB_AB)
+       if(.NOT. GPU_MODE) then
+          call add_source_master_rec_noise(myrank,nrec, &
+               NSTEP,accel,noise_sourcearray, &
+               ibool,islice_selected_rec,ispec_selected_rec, &
+               it,irec_master_noise, &
+               NSPEC_AB,NGLOB_AB)
+       else ! GPU_MODE == .true.
+          call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec)
+       endif
     elseif ( NOISE_TOMOGRAPHY == 2 ) then
        ! second step of noise tomography, i.e., read the surface movie saved at every timestep
        ! use the movie to drive the ensemble forward wavefield
@@ -447,7 +523,8 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
                               NSTEP-it+1, &
                               NSPEC_AB,NGLOB_AB, &
                               num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw)
+                              free_surface_jacobian2Dw,&
+                              Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
         ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
         ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
         ! note the ensemble forward sources are generally distributed on the surface of the earth
@@ -465,7 +542,8 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
                               it, &
                               NSPEC_AB,NGLOB_AB, &
                               num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                              free_surface_jacobian2Dw)
+                              free_surface_jacobian2Dw,&
+                               Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
     endif
   endif
 
diff --git a/src/specfem3D/compute_coupling_elastic_po.f90 b/src/specfem3D/compute_coupling_elastic_po.f90
index b9e4fac83..37f239e31 100644
--- a/src/specfem3D/compute_coupling_elastic_po.f90
+++ b/src/specfem3D/compute_coupling_elastic_po.f90
@@ -224,11 +224,11 @@ subroutine compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,&
                 if (SIMULATION_TYPE == 3) then
                   ! to do
                   stop 'compute_coupling_elastic_po() : adjoint run not implemented yet'
-                  
+
                   ! dummy to avoid compiler warnings
-                  iglob = NGLOB_ADJOINT    
-                  iglob = NSPEC_ADJOINT          
-                
+                  iglob = NGLOB_ADJOINT
+                  iglob = NSPEC_ADJOINT
+
                 endif ! adjoint
     !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
diff --git a/src/specfem3D/compute_coupling_poroelastic_el.f90 b/src/specfem3D/compute_coupling_poroelastic_el.f90
index cc19f10df..f46d3375f 100644
--- a/src/specfem3D/compute_coupling_poroelastic_el.f90
+++ b/src/specfem3D/compute_coupling_poroelastic_el.f90
@@ -360,12 +360,12 @@ subroutine compute_coupling_poroelastic_el(NSPEC_AB,NGLOB_AB,ibool,&
                 if (SIMULATION_TYPE == 3) then
                   ! to do
                   stop 'compute_coupling_poroelastic_el() : adjoint run not implemented yet'
-                  
+
                   ! dummy to avoid compiler warnings
-                  iglob = NGLOB_ADJOINT    
-                  iglob = NSPEC_ADJOINT                          
+                  iglob = NGLOB_ADJOINT
+                  iglob = NSPEC_ADJOINT
                 endif ! adjoint
-                
+
     !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
     !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do
diff --git a/src/specfem3D/compute_forces_acoustic.f90 b/src/specfem3D/compute_forces_acoustic.f90
index 1e2ca39fe..89ebf7e8e 100644
--- a/src/specfem3D/compute_forces_acoustic.f90
+++ b/src/specfem3D/compute_forces_acoustic.f90
@@ -65,21 +65,36 @@ subroutine compute_forces_acoustic()
   logical:: phase_is_inner
 
 
-! enforces free surface (zeroes potentials at free surface)
-  call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, &
+  ! enforces free surface (zeroes potentials at free surface)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, &
                         potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,ispec_is_acoustic)
 
-  ! adjoint simulations
-  if( SIMULATION_TYPE == 3 ) &
-    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, &
+    ! adjoint simulations
+    if( SIMULATION_TYPE == 3 ) &
+      call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, &
                         b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,ispec_is_acoustic)
-
+  else
+    ! on GPU
+    call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE)
+  endif
 
   if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+    ! enforces free surface on PML elements
+
+    ! note:
+    ! PML routines are not implemented as CUDA kernels, we just transfer the fields
+    ! from the GPU to the CPU and vice versa
+
+    ! transfers potentials to the CPU
+    if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
     call PML_acoustic_enforce_free_srfc(NSPEC_AB,NGLOB_AB, &
                         potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
@@ -89,20 +104,26 @@ subroutine compute_forces_acoustic()
                         chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,&
                         chi1_dot_dot,chi2_t_dot_dot,&
                         chi3_dot_dot,chi4_dot_dot)
+
+    ! transfers potentials back to GPU
+    if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                            potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
   endif
 
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
   do iphase=1,2
 
-    !first for points on MPI interfaces
+    !first for points on MPI interfaces, thus outer elements
     if( iphase == 1 ) then
       phase_is_inner = .false.
     else
       phase_is_inner = .true.
     endif
 
-! acoustic pressure term
-    call compute_forces_acoustic_pot( iphase, NSPEC_AB,NGLOB_AB, &
+    ! acoustic pressure term
+    if(.NOT. GPU_MODE) then
+      ! on CPU
+      call compute_forces_acoustic_pot( iphase, NSPEC_AB,NGLOB_AB, &
                         potential_acoustic,potential_dot_dot_acoustic, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                         hprime_xx,hprime_yy,hprime_zz, &
@@ -112,9 +133,9 @@ subroutine compute_forces_acoustic()
                         num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
                         phase_ispec_inner_acoustic )
 
-    ! adjoint simulations
-    if( SIMULATION_TYPE == 3 ) &
-      call compute_forces_acoustic_pot( iphase, NSPEC_ADJOINT,NGLOB_ADJOINT, &
+      ! adjoint simulations
+      if( SIMULATION_TYPE == 3 ) &
+        call compute_forces_acoustic_pot( iphase, NSPEC_ADJOINT,NGLOB_ADJOINT, &
                         b_potential_acoustic,b_potential_dot_dot_acoustic, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                         hprime_xx,hprime_yy,hprime_zz, &
@@ -123,8 +144,19 @@ subroutine compute_forces_acoustic()
                         rhostore,jacobian,ibool, &
                         num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
                         phase_ispec_inner_acoustic )
+    else
+      ! on GPU
+      ! includes code for SIMULATION_TYPE==3
+      call compute_forces_acoustic_cuda(Mesh_pointer, iphase, &
+                                      nspec_outer_acoustic, nspec_inner_acoustic, &
+                                      SIMULATION_TYPE)
+    endif
 
     if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+      ! transfers potentials to CPU
+      if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
       call compute_forces_acoustic_PML(NSPEC_AB,NGLOB_AB, &
                         ibool,ispec_is_inner,phase_is_inner, &
                         rhostore,ispec_is_acoustic,potential_acoustic, &
@@ -145,9 +177,13 @@ subroutine compute_forces_acoustic()
                         num_PML_ispec,PML_ispec,iglob_is_PML_interface,&
                         chi1_dot_dot,chi3_dot_dot,chi4_dot_dot)
 
+      ! transfers potentials back to GPU
+      if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
     endif ! PML
 
-! absorbing boundaries
+    ! absorbing boundaries
     if(ABSORBING_CONDITIONS) then
       if(ABSORB_USE_PML) then
         if( PML_USE_SOMMERFELD ) then
@@ -161,8 +197,13 @@ subroutine compute_forces_acoustic()
                         num_PML_ispec,PML_ispec,ispec_is_PML_inum,&
                         chi1_dot,chi2_t,chi2_t_dot,chi3_dot,chi4_dot,&
                         chi1_dot_dot,chi3_dot_dot,chi4_dot_dot)
+
+          ! transfers potentials back to GPU
+          if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
         endif
       else
+        ! Stacey boundary conditions
         call compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, &
                         potential_dot_dot_acoustic,potential_dot_acoustic, &
                         ibool,ispec_is_inner,phase_is_inner, &
@@ -170,42 +211,50 @@ subroutine compute_forces_acoustic()
                         num_abs_boundary_faces,rhostore,kappastore,ispec_is_acoustic, &
                         SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it,NGLOB_ADJOINT, &
                         b_potential_dot_dot_acoustic,b_reclen_potential, &
-                        b_absorb_potential,b_num_abs_boundary_faces)
+                        b_absorb_potential,b_num_abs_boundary_faces, &
+                        GPU_MODE,Mesh_pointer)
       endif
     endif
 
-! elastic coupling
+    ! elastic coupling
     if(ELASTIC_SIMULATION ) then
       if( num_coupling_ac_el_faces > 0 ) then
-        if( SIMULATION_TYPE == 1 ) then
-          ! forward definition: \bfs=\frac{1}{\rho}\bfnabla\phi
-          call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
-                            ibool,displ,potential_dot_dot_acoustic, &
+        if( .NOT. GPU_MODE ) then
+          if( SIMULATION_TYPE == 1 ) then
+            ! forward definition: \bfs=\frac{1}{\rho}\bfnabla\phi
+            call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
+                              ibool,displ,potential_dot_dot_acoustic, &
+                              num_coupling_ac_el_faces, &
+                              coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                              coupling_ac_el_normal, &
+                              coupling_ac_el_jacobian2Dw, &
+                              ispec_is_inner,phase_is_inner)
+          else
+            ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
+            ! adjoint definition: \partial_t^2 \bfs^\dagger=-\frac{1}{\rho}\bfnabla\phi^\dagger
+            call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
+                              ibool,-accel_adj_coupling,potential_dot_dot_acoustic, &
+                              num_coupling_ac_el_faces, &
+                              coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                              coupling_ac_el_normal, &
+                              coupling_ac_el_jacobian2Dw, &
+                              ispec_is_inner,phase_is_inner)
+          endif
+          ! adjoint/kernel simulations
+          if( SIMULATION_TYPE == 3 ) &
+            call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+                            ibool,b_displ,b_potential_dot_dot_acoustic, &
                             num_coupling_ac_el_faces, &
                             coupling_ac_el_ispec,coupling_ac_el_ijk, &
                             coupling_ac_el_normal, &
                             coupling_ac_el_jacobian2Dw, &
                             ispec_is_inner,phase_is_inner)
+
         else
-          ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
-          ! adjoint definition: \partial_t^2 \bfs^\dagger=-\frac{1}{\rho}\bfnabla\phi^\dagger
-          call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
-                            ibool,-accel_adj_coupling,potential_dot_dot_acoustic, &
-                            num_coupling_ac_el_faces, &
-                            coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                            coupling_ac_el_normal, &
-                            coupling_ac_el_jacobian2Dw, &
-                            ispec_is_inner,phase_is_inner)
-        endif
-        ! adjoint/kernel simulations
-        if( SIMULATION_TYPE == 3 ) &
-          call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                          ibool,b_displ,b_potential_dot_dot_acoustic, &
-                          num_coupling_ac_el_faces, &
-                          coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                          coupling_ac_el_normal, &
-                          coupling_ac_el_jacobian2Dw, &
-                          ispec_is_inner,phase_is_inner)
+          ! on GPU
+          call compute_coupling_ac_el_cuda(Mesh_pointer,phase_is_inner, &
+                                              num_coupling_ac_el_faces,SIMULATION_TYPE)
+        endif ! GPU_MODE
       endif
     endif
 
@@ -213,7 +262,7 @@ subroutine compute_forces_acoustic()
     if(POROELASTIC_SIMULATION )  then
       if( num_coupling_ac_po_faces > 0 ) then
         if( SIMULATION_TYPE == 1 ) then
-      call compute_coupling_acoustic_po(NSPEC_AB,NGLOB_AB, &
+          call compute_coupling_acoustic_po(NSPEC_AB,NGLOB_AB, &
                         ibool,displs_poroelastic,displw_poroelastic, &
                         potential_dot_dot_acoustic, &
                         num_coupling_ac_po_faces, &
@@ -222,14 +271,14 @@ subroutine compute_forces_acoustic()
                         coupling_ac_po_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
         else
-      stop 'not implemented yet'
+          stop 'not implemented yet'
         endif
         if( SIMULATION_TYPE == 3 ) &
-      stop 'not implemented yet'
+          stop 'not implemented yet'
       endif
     endif
 
-! sources
+    ! sources
     call compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acoustic, &
                         ibool,ispec_is_inner,phase_is_inner, &
                         NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
@@ -239,54 +288,127 @@ subroutine compute_forces_acoustic()
                         SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
                         nrec,islice_selected_rec,ispec_selected_rec, &
                         nadj_rec_local,adj_sourcearrays,b_potential_dot_dot_acoustic, &
-                        NTSTEP_BETWEEN_READ_ADJSRC )
+                        NTSTEP_BETWEEN_READ_ADJSRC, &
+                        GPU_MODE, Mesh_pointer)
 
-! assemble all the contributions between slices using MPI
+    ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
       ! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
-      call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+      if(.NOT. GPU_MODE) then
+        call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
                         buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh, &
                         request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+      else
+        ! on GPU
+        call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+                                            potential_dot_dot_acoustic, &
+                                            buffer_send_scalar_ext_mesh, &
+                                            num_interfaces_ext_mesh, &
+                                            max_nibool_interfaces_ext_mesh, &
+                                            nibool_interfaces_ext_mesh, &
+                                            ibool_interfaces_ext_mesh, &
+                                            1) ! <-- 1 == fwd accel
+        call assemble_MPI_scalar_send_cuda(NPROC, &
+                        buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,&
+                        my_neighbours_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+      endif
+
       ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) &
-        call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+      if( SIMULATION_TYPE == 3 ) then
+        if(.NOT. GPU_MODE) then
+          call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
                         b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh, &
                         b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
+        else
+          ! on GPU
+          call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+                                                  b_potential_dot_dot_acoustic, &
+                                                  b_buffer_send_scalar_ext_mesh,&
+                                                  num_interfaces_ext_mesh, &
+                                                  max_nibool_interfaces_ext_mesh, &
+                                                  nibool_interfaces_ext_mesh, &
+                                                  ibool_interfaces_ext_mesh, &
+                                                  3) ! <-- 3 == adjoint b_accel
+
+          call assemble_MPI_scalar_send_cuda(NPROC, &
+                          b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
+                          num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                          nibool_interfaces_ext_mesh,&
+                          my_neighbours_ext_mesh, &
+                          b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
+
+        endif
+      endif
+
     else
+
       ! waits for send/receive requests to be completed and assembles values
-      call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+      if(.NOT. GPU_MODE) then
+        call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
                         buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
                         max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+      else
+        ! on GPU
+        call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+                        Mesh_pointer,&
+                        buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
+                        max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                        1)
+      endif
+
       ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) &
-      call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+      if( SIMULATION_TYPE == 3 ) then
+        if(.NOT. GPU_MODE) then
+          call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
                         b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
                         max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
-
-    endif
-
+        else
+          ! on GPU
+          call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,b_potential_dot_dot_acoustic, &
+                        Mesh_pointer, &
+                        b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
+                        max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        b_request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                        3)
+        endif
+      endif
+    endif !phase_is_inner
 
   enddo
 
-  ! divides pressure with mass matrix
-  potential_dot_dot_acoustic(:) = potential_dot_dot_acoustic(:) * rmass_acoustic(:)
+  if(.NOT. GPU_MODE) then
+    ! divides pressure with mass matrix
+    potential_dot_dot_acoustic(:) = potential_dot_dot_acoustic(:) * rmass_acoustic(:)
 
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-    b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) &
+      b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
+  else
+    ! on GPU
+    call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB,SIMULATION_TYPE)
+  endif
 
 
   if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+    ! note: no need to transfer fields between CPU and GPU;
+    !           PML arrays are all handled on the CPU
+
     ! divides local contributions with mass term
     call PML_acoustic_mass_update(NSPEC_AB,NGLOB_AB,&
                         ispec_is_acoustic,rmass_acoustic,ibool,&
@@ -317,14 +439,24 @@ subroutine compute_forces_acoustic()
 !
 ! corrector:
 !   updates the chi_dot term which requires chi_dot_dot(t+delta)
-  potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:)
+ if( .NOT. GPU_MODE ) then
+    ! corrector
+    potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:)
 
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-    b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) &
+      b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
+  else
+    ! on GPU
+    call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,SIMULATION_TYPE,b_deltatover2)
+  endif
 
   ! updates potential_dot_acoustic and potential_dot_dot_acoustic inside PML region for plotting seismograms/movies
   if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+    ! transfers potentials to CPU
+    if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
     call PML_acoustic_update_potentials(NGLOB_AB,NSPEC_AB, &
                         ibool,ispec_is_acoustic, &
                         potential_dot_acoustic,potential_dot_dot_acoustic,&
@@ -336,29 +468,44 @@ subroutine compute_forces_acoustic()
                         chi1,chi2,chi2_t,chi3,&
                         chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,&
                         chi1_dot_dot,chi3_dot_dot,chi4_dot_dot)
+
+    ! transfers potentials to GPU
+    if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
   endif
 
 ! enforces free surface (zeroes potentials at free surface)
-  call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, &
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, &
                         potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,ispec_is_acoustic)
 
-  if( SIMULATION_TYPE /= 1 ) then
-    potential_acoustic_adj_coupling(:) = potential_acoustic(:) &
+    if( SIMULATION_TYPE /= 1 ) then
+      potential_acoustic_adj_coupling(:) = potential_acoustic(:) &
                             + deltat * potential_dot_acoustic(:) &
                             + deltatsqover2 * potential_dot_dot_acoustic(:)
-  endif
+    endif
 
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) &
-    call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, &
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) &
+      call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, &
                         b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,ispec_is_acoustic)
+  else
+    ! on GPU
+    call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE)
+  endif
 
 
   if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+    ! enforces free surface on PML elements
+    if( GPU_MODE ) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
     call PML_acoustic_enforce_free_srfc(NSPEC_AB,NGLOB_AB, &
                         potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
                         ibool,free_surface_ijk,free_surface_ispec, &
@@ -369,6 +516,9 @@ subroutine compute_forces_acoustic()
                         chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,&
                         chi1_dot_dot,chi2_t_dot_dot,&
                         chi3_dot_dot,chi4_dot_dot)
+
+    if( GPU_MODE ) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
   endif
 
 end subroutine compute_forces_acoustic
diff --git a/src/specfem3D/compute_forces_elastic.f90 b/src/specfem3D/compute_forces_elastic.F90
similarity index 64%
rename from src/specfem3D/compute_forces_elastic.f90
rename to src/specfem3D/compute_forces_elastic.F90
index d76dee796..1ff63ce40 100644
--- a/src/specfem3D/compute_forces_elastic.f90
+++ b/src/specfem3D/compute_forces_elastic.F90
@@ -50,72 +50,107 @@ subroutine compute_forces_elastic()
 
 
 ! elastic term
-    if(USE_DEVILLE_PRODUCTS) then
-      ! uses Deville (2002) optimizations
-      call compute_forces_elastic_Dev_sim1(iphase)
+    if( .NOT. GPU_MODE ) then
+      if(USE_DEVILLE_PRODUCTS) then
+        ! uses Deville (2002) optimizations
+        call compute_forces_elastic_Dev_sim1(iphase)
 
-      ! adjoint simulations: backward/reconstructed wavefield
-      if( SIMULATION_TYPE == 3 ) &
-        call compute_forces_elastic_Dev_sim3(iphase)
+        ! adjoint simulations: backward/reconstructed wavefield
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_forces_elastic_Dev_sim3(iphase)
+
+      else
+        ! no optimizations used
+        call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
+                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                        kappastore,mustore,jacobian,ibool, &
+                        ATTENUATION,&
+                        one_minus_sum_beta,factor_common, &
+                        alphaval,betaval,gammaval,&
+                        NSPEC_ATTENUATION_AB, &
+                        R_xx,R_yy,R_xy,R_xz,R_yz, &
+                        epsilondev_xx,epsilondev_yy,epsilondev_xy,&
+                        epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store,&
+                        c22store,c23store,c24store,c25store,c26store,c33store,&
+                        c34store,c35store,c36store,c44store,c45store,c46store,&
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+                        is_moho_top,is_moho_bot, &
+                        dsdx_top,dsdx_bot, &
+                        ispec2D_moho_top,ispec2D_moho_bot, &
+                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                        phase_ispec_inner_elastic  )
+
+        ! adjoint simulations: backward/reconstructed wavefield
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,&
+                        b_displ,b_accel, &
+                        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                        hprime_xx,hprime_yy,hprime_zz, &
+                        hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
+                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                        kappastore,mustore,jacobian,ibool, &
+                        ATTENUATION,&
+                        one_minus_sum_beta,factor_common, &
+                        b_alphaval,b_betaval,b_gammaval,&
+                        NSPEC_ATTENUATION_AB, &
+                        b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                        b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,&
+                        b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
+                        ANISOTROPY,NSPEC_ANISO, &
+                        c11store,c12store,c13store,c14store,c15store,c16store,&
+                        c22store,c23store,c24store,c25store,c26store,c33store,&
+                        c34store,c35store,c36store,c44store,c45store,c46store,&
+                        c55store,c56store,c66store, &
+                        SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                        NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+                        is_moho_top,is_moho_bot, &
+                        b_dsdx_top,b_dsdx_bot, &
+                        ispec2D_moho_top,ispec2D_moho_bot, &
+                        num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                        phase_ispec_inner_elastic  )
+
+      endif
 
     else
-      ! no optimizations used
-      call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
-                      xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      hprime_xx,hprime_yy,hprime_zz, &
-                      hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
-                      wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                      kappastore,mustore,jacobian,ibool, &
-                      ATTENUATION,&
-                      one_minus_sum_beta,factor_common, &
-                      alphaval,betaval,gammaval,&
-                      NSPEC_ATTENUATION_AB, &
-                      R_xx,R_yy,R_xy,R_xz,R_yz, &
-                      epsilondev_xx,epsilondev_yy,epsilondev_xy,&
-                      epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                      ANISOTROPY,NSPEC_ANISO, &
-                      c11store,c12store,c13store,c14store,c15store,c16store,&
-                      c22store,c23store,c24store,c25store,c26store,c33store,&
-                      c34store,c35store,c36store,c44store,c45store,c46store,&
-                      c55store,c56store,c66store, &
-                      SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                      NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-                      is_moho_top,is_moho_bot, &
-                      dsdx_top,dsdx_bot, &
-                      ispec2D_moho_top,ispec2D_moho_bot, &
-                      num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                      phase_ispec_inner_elastic  )
-
-      ! adjoint simulations: backward/reconstructed wavefield
-      if( SIMULATION_TYPE == 3 ) &
-        call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,&
-                      b_displ,b_accel, &
-                      xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      hprime_xx,hprime_yy,hprime_zz, &
-                      hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,&
-                      wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                      kappastore,mustore,jacobian,ibool, &
-                      ATTENUATION,&
-                      one_minus_sum_beta,factor_common, &
-                      b_alphaval,b_betaval,b_gammaval,&
-                      NSPEC_ATTENUATION_AB, &
-                      b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-                      b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,&
-                      b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, &
-                      ANISOTROPY,NSPEC_ANISO, &
-                      c11store,c12store,c13store,c14store,c15store,c16store,&
-                      c22store,c23store,c24store,c25store,c26store,c33store,&
-                      c34store,c35store,c36store,c44store,c45store,c46store,&
-                      c55store,c56store,c66store, &
-                      SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                      NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-                      is_moho_top,is_moho_bot, &
-                      b_dsdx_top,b_dsdx_bot, &
-                      ispec2D_moho_top,ispec2D_moho_bot, &
-                      num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                      phase_ispec_inner_elastic  )
+      ! on GPU
+      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+      call compute_forces_elastic_cuda(Mesh_pointer, iphase, &
+                                      nspec_outer_elastic, &
+                                      nspec_inner_elastic, &
+                                      SIMULATION_TYPE, &
+                                      COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
+
+      if(phase_is_inner .eqv. .true.) then
+         ! while Inner elements compute "Kernel_2", we wait for MPI to
+         ! finish and transfer the boundary terms to the device
+         ! asynchronously
+
+         !daniel: todo - this avoids calling the fortran vector send from CUDA routine
+         ! wait for asynchronous copy to finish
+         call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh)
+         ! sends mpi buffers
+         call assemble_MPI_vector_send_cuda(NPROC, &
+                  buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+         ! transfers mpi buffers onto GPU
+         call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh,&
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+                  request_recv_vector_ext_mesh)
+      endif ! inner elements
+
+   endif ! GPU_MODE
 
-    endif
 
 ! adds elastic absorbing boundary term to acceleration (Stacey conditions)
     if(ABSORBING_CONDITIONS) &
@@ -127,48 +162,55 @@ subroutine compute_forces_elastic()
                         veloc,rho_vp,rho_vs, &
                         ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
                         NSTEP,it,NGLOB_ADJOINT,b_accel, &
-                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field )
+                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field,&
+                        GPU_MODE,Mesh_pointer)
 
 ! acoustic coupling
     if( ACOUSTIC_SIMULATION ) then
       if( num_coupling_ac_el_faces > 0 ) then
-        if( SIMULATION_TYPE == 1 ) then
-          ! forward definition: pressure=-potential_dot_dot
-          call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
-                            ibool,accel,potential_dot_dot_acoustic, &
-                            num_coupling_ac_el_faces, &
-                            coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                            coupling_ac_el_normal, &
-                            coupling_ac_el_jacobian2Dw, &
-                            ispec_is_inner,phase_is_inner)
-        else
-          ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
-          ! adoint definition: pressure^\dagger=potential^\dagger
-          call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
-                            ibool,accel,-potential_acoustic_adj_coupling, &
-                            num_coupling_ac_el_faces, &
-                            coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                            coupling_ac_el_normal, &
-                            coupling_ac_el_jacobian2Dw, &
-                            ispec_is_inner,phase_is_inner)
-        endif
+        if( .NOT. GPU_MODE ) then
+          if( SIMULATION_TYPE == 1 ) then
+            ! forward definition: pressure=-potential_dot_dot
+            call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
+                        ibool,accel,potential_dot_dot_acoustic, &
+                        num_coupling_ac_el_faces, &
+                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                        coupling_ac_el_normal, &
+                        coupling_ac_el_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+          else
+            ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield
+            ! adoint definition: pressure^\dagger=potential^\dagger
+            call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
+                              ibool,accel,-potential_acoustic_adj_coupling, &
+                              num_coupling_ac_el_faces, &
+                              coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                              coupling_ac_el_normal, &
+                              coupling_ac_el_jacobian2Dw, &
+                              ispec_is_inner,phase_is_inner)
+          endif
 
         ! adjoint simulations
         if( SIMULATION_TYPE == 3 ) &
           call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
-                          ibool,b_accel,b_potential_dot_dot_acoustic, &
-                          num_coupling_ac_el_faces, &
-                          coupling_ac_el_ispec,coupling_ac_el_ijk, &
-                          coupling_ac_el_normal, &
-                          coupling_ac_el_jacobian2Dw, &
-                          ispec_is_inner,phase_is_inner)
-      endif
+                        ibool,b_accel,b_potential_dot_dot_acoustic, &
+                        num_coupling_ac_el_faces, &
+                        coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                        coupling_ac_el_normal, &
+                        coupling_ac_el_jacobian2Dw, &
+                        ispec_is_inner,phase_is_inner)
+
+        else
+          ! on GPU
+          call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, &
+                                          num_coupling_ac_el_faces,SIMULATION_TYPE)
+        endif ! GPU_MODE
+      endif ! num_coupling_ac_el_faces
     endif
 
 
 ! poroelastic coupling
-    !print *,'entering poro counpling'
-    if( POROELASTIC_SIMULATION ) &
+    if( POROELASTIC_SIMULATION ) then
       call compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,&
                         displs_poroelastic,displw_poroelastic,&
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
@@ -188,7 +230,7 @@ subroutine compute_forces_elastic()
                         coupling_el_po_normal, &
                         coupling_el_po_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
-    !print *,'ok poro counpling'
+    endif
 
 ! adds source term (single-force/moment-tensor solution)
     call compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, &
@@ -199,43 +241,83 @@ subroutine compute_forces_elastic()
                         ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, &
                         nrec,islice_selected_rec,ispec_selected_rec, &
                         nadj_rec_local,adj_sourcearrays,b_accel, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY )
+                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+                        GPU_MODE, Mesh_pointer )
 
-! assemble all the contributions between slices using MPI
+    ! assemble all the contributions between slices using MPI
     if( phase_is_inner .eqv. .false. ) then
-      ! sends accel values to corresponding MPI interface neighbors
-      call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
-                        buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-                        my_neighbours_ext_mesh, &
-                        request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
-
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-        call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
-                        b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-                        my_neighbours_ext_mesh, &
-                        b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-      endif !adjoint
+       ! sends accel values to corresponding MPI interface neighbors
+       if(.NOT. GPU_MODE) then
+          call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
+               buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+               num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+               nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+               my_neighbours_ext_mesh, &
+               request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+       else ! GPU_MODE==1
+
+          ! transfers boundary region to host asynchronously. The
+          ! MPI-send is done from within compute_forces_elastic_cuda,
+          ! once the inner element kernels are launched, and the
+          ! memcpy has finished. see compute_forces_elastic_cuda:1655
+          call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+
+       endif ! GPU_MODE
+
+       ! adjoint simulations
+       if( SIMULATION_TYPE == 3 ) then
+          if(.NOT. GPU_MODE) then
+             call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
+                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+          else ! GPU_MODE == 1
+             call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,&
+                       b_buffer_send_vector_ext_mesh,&
+                       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
+                       nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel
+             call assemble_MPI_vector_send_cuda(NPROC, &
+                  b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+                  num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                  nibool_interfaces_ext_mesh,&
+                  my_neighbours_ext_mesh, &
+                  b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+
+          endif ! GPU
+       endif !adjoint
 
     else
       ! waits for send/receive requests to be completed and assembles values
-      call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,accel, &
-                        buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                        max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
-
+      if(.NOT. GPU_MODE) then
+         call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,accel, &
+                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                            max_nibool_interfaces_ext_mesh, &
+                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+      else ! GPU_MODE == 1
+         call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,&
+                            buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                            max_nibool_interfaces_ext_mesh, &
+                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1)
+      endif
       ! adjoint simulations
       if( SIMULATION_TYPE == 3 ) then
-        call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_accel, &
-                        b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
-                        max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+         if(.NOT. GPU_MODE) then
+            call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_accel, &
+                             b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                             max_nibool_interfaces_ext_mesh, &
+                             nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                             b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+         else ! GPU_MODE == 1
+            call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
+                              b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
+                              max_nibool_interfaces_ext_mesh, &
+                              nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                              b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3)
+         endif
       endif !adjoint
 
     endif
@@ -246,28 +328,35 @@ subroutine compute_forces_elastic()
     !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used.
     !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well.
 
-  enddo
-
-! multiplies with inverse of mass matrix (note: rmass has been inverted already)
-  accel(1,:) = accel(1,:)*rmass(:)
-  accel(2,:) = accel(2,:)*rmass(:)
-  accel(3,:) = accel(3,:)*rmass(:)
-
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) then
-    b_accel(1,:) = b_accel(1,:)*rmass(:)
-    b_accel(2,:) = b_accel(2,:)*rmass(:)
-    b_accel(3,:) = b_accel(3,:)*rmass(:)
-  endif !adjoint
-
+ enddo
+
+ ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
+ if(.NOT. GPU_MODE) then
+    accel(1,:) = accel(1,:)*rmass(:)
+    accel(2,:) = accel(2,:)*rmass(:)
+    accel(3,:) = accel(3,:)*rmass(:)
+    ! adjoint simulations
+    if (SIMULATION_TYPE == 3) then
+       b_accel(1,:) = b_accel(1,:)*rmass(:)
+       b_accel(2,:) = b_accel(2,:)*rmass(:)
+       b_accel(3,:) = b_accel(3,:)*rmass(:)
+    endif !adjoint
+ else ! GPU_MODE == 1
+    call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS)
+ endif
 
 ! updates acceleration with ocean load term
   if(OCEANS) then
-    call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
+    if( .NOT. GPU_MODE ) then
+      call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
                         ibool,rmass,rmass_ocean_load,accel, &
                         free_surface_normal,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,SIMULATION_TYPE, &
                         NGLOB_ADJOINT,b_accel)
+    else
+      ! on GPU
+      call elastic_ocean_load_cuda(Mesh_pointer,SIMULATION_TYPE)
+    endif
   endif
 
 ! updates velocities
@@ -286,10 +375,14 @@ subroutine compute_forces_elastic()
 !
 ! corrector:
 !   updates the velocity term which requires a(t+delta)
-  veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
-
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
+  if(.NOT. GPU_MODE) then
+     veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+     ! adjoint simulations
+     if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+  else ! GPU_MODE == 1
+    if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2)
+  endif
 
 
 end subroutine compute_forces_elastic
@@ -422,30 +515,65 @@ subroutine compute_forces_elastic_Dev_sim1(iphase)
   select case(NGLLX)
 
   case (5)
+
+!----------------------------------------------------------------------------------------------
+
+! OpenMP routine flag for testing & benchmarking forward runs only
+! configure additional flag, e.g.: FLAGS_NO_CHECK="-O3 -DOPENMP_MODE -openmp"
+
+!----------------------------------------------------------------------------------------------
+#ifdef OPENMP_MODE
+    call compute_forces_elastic_Dev_openmp(iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
+           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+           kappastore,mustore,jacobian,ibool, &
+           ATTENUATION, &
+           one_minus_sum_beta,factor_common, &
+           alphaval,betaval,gammaval, &
+           NSPEC_ATTENUATION_AB, &
+           R_xx,R_yy,R_xy,R_xz,R_yz, &
+           epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+           epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+           ANISOTROPY,NSPEC_ANISO, &
+           c11store,c12store,c13store,c14store,c15store,c16store,&
+           c22store,c23store,c24store,c25store,c26store,c33store,&
+           c34store,c35store,c36store,c44store,c45store,c46store,&
+           c55store,c56store,c66store, &
+           SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+           NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+           is_moho_top,is_moho_bot, &
+           dsdx_top,dsdx_bot, &
+           ispec2D_moho_top,ispec2D_moho_bot, &
+           num_phase_ispec_elastic,&
+           phase_ispec_inner_elastic,&
+           num_colors_outer_elastic,num_colors_inner_elastic)
+#else
     call compute_forces_elastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
-            xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-            hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-            wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-            kappastore,mustore,jacobian,ibool, &
-            ATTENUATION, &
-            one_minus_sum_beta,factor_common, &
-            alphaval,betaval,gammaval, &
-            NSPEC_ATTENUATION_AB, &
-            R_xx,R_yy,R_xy,R_xz,R_yz, &
-            epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-            epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-            ANISOTROPY,NSPEC_ANISO, &
-            c11store,c12store,c13store,c14store,c15store,c16store,&
-            c22store,c23store,c24store,c25store,c26store,c33store,&
-            c34store,c35store,c36store,c44store,c45store,c46store,&
-            c55store,c56store,c66store, &
-            SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-            NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
-            is_moho_top,is_moho_bot, &
-            dsdx_top,dsdx_bot, &
-            ispec2D_moho_top,ispec2D_moho_bot, &
-            num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-            phase_ispec_inner_elastic )
+             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+             hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+             kappastore,mustore,jacobian,ibool, &
+             ATTENUATION, &
+             one_minus_sum_beta,factor_common, &
+             alphaval,betaval,gammaval, &
+             NSPEC_ATTENUATION_AB, &
+             R_xx,R_yy,R_xy,R_xz,R_yz, &
+             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+             ANISOTROPY,NSPEC_ANISO, &
+             c11store,c12store,c13store,c14store,c15store,c16store,&
+             c22store,c23store,c24store,c25store,c26store,c33store,&
+             c34store,c35store,c36store,c44store,c45store,c46store,&
+             c55store,c56store,c66store, &
+             SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,&
+             is_moho_top,is_moho_bot, &
+             dsdx_top,dsdx_bot, &
+             ispec2D_moho_top,ispec2D_moho_bot, &
+             num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+             phase_ispec_inner_elastic )
+#endif
 
   case (6)
     call compute_forces_elastic_Dev_6p(iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
diff --git a/src/specfem3D/compute_forces_elastic_Dev.f90 b/src/specfem3D/compute_forces_elastic_Dev.f90
index 3ac3a07ce..632ac70c0 100644
--- a/src/specfem3D/compute_forces_elastic_Dev.f90
+++ b/src/specfem3D/compute_forces_elastic_Dev.f90
@@ -24,8 +24,9 @@
 !
 !=====================================================================
 
+! Deville routine for NGLL == 5 (default)
 
-subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
+  subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
                                     displ,accel, &
                                     xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                                     hprime_xx,hprime_xxT, &
@@ -655,3454 +656,5 @@ subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
 
   enddo  ! spectral element loop
 
-end subroutine compute_forces_elastic_Dev_5p
+  end subroutine compute_forces_elastic_Dev_5p
 
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(6,6,6) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points
-  real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points
-  real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points
-
-  equivalence(dummyx_loc,B1_m1_m2_6points)
-  equivalence(dummyy_loc,B2_m1_m2_6points)
-  equivalence(dummyz_loc,B3_m1_m2_6points)
-  equivalence(tempx1,C1_m1_m2_6points)
-  equivalence(tempy1,C2_m1_m2_6points)
-  equivalence(tempz1,C3_m1_m2_6points)
-  equivalence(newtempx1,E1_m1_m2_6points)
-  equivalence(newtempy1,E2_m1_m2_6points)
-  equivalence(newtempz1,E3_m1_m2_6points)
-
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_6points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_6points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_6points)
-  equivalence(tempx3,C1_mxm_m2_m1_6points)
-  equivalence(tempy3,C2_mxm_m2_m1_6points)
-  equivalence(tempz3,C3_mxm_m2_m1_6points)
-  equivalence(newtempx3,E1_mxm_m2_m1_6points)
-  equivalence(newtempy3,E2_mxm_m2_m1_6points)
-  equivalence(newtempz3,E3_mxm_m2_m1_6points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_6points(6,j)
-            C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_6points(6,j)
-            C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_6points(6,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-            C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-            C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j)
-            E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j)
-            E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-            E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-            E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_6p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(7,7,7) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points
-  real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points
-  real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points
-
-  equivalence(dummyx_loc,B1_m1_m2_7points)
-  equivalence(dummyy_loc,B2_m1_m2_7points)
-  equivalence(dummyz_loc,B3_m1_m2_7points)
-  equivalence(tempx1,C1_m1_m2_7points)
-  equivalence(tempy1,C2_m1_m2_7points)
-  equivalence(tempz1,C3_m1_m2_7points)
-  equivalence(newtempx1,E1_m1_m2_7points)
-  equivalence(newtempy1,E2_m1_m2_7points)
-  equivalence(newtempz1,E3_m1_m2_7points)
-
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_7points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_7points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_7points)
-  equivalence(tempx3,C1_mxm_m2_m1_7points)
-  equivalence(tempy3,C2_mxm_m2_m1_7points)
-  equivalence(tempz3,C3_mxm_m2_m1_7points)
-  equivalence(newtempx3,E1_mxm_m2_m1_7points)
-  equivalence(newtempy3,E2_mxm_m2_m1_7points)
-  equivalence(newtempz3,E3_mxm_m2_m1_7points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_7points(7,j)
-            C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_7points(7,j)
-            C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_7points(7,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-            C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-            C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j)
-            E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j)
-            E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-            E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-            E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_7p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(8,8,8) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points
-  real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points
-  real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points
-
-  equivalence(dummyx_loc,B1_m1_m2_8points)
-  equivalence(dummyy_loc,B2_m1_m2_8points)
-  equivalence(dummyz_loc,B3_m1_m2_8points)
-  equivalence(tempx1,C1_m1_m2_8points)
-  equivalence(tempy1,C2_m1_m2_8points)
-  equivalence(tempz1,C3_m1_m2_8points)
-  equivalence(newtempx1,E1_m1_m2_8points)
-  equivalence(newtempy1,E2_m1_m2_8points)
-  equivalence(newtempz1,E3_m1_m2_8points)
-
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_8points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_8points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_8points)
-  equivalence(tempx3,C1_mxm_m2_m1_8points)
-  equivalence(tempy3,C2_mxm_m2_m1_8points)
-  equivalence(tempz3,C3_mxm_m2_m1_8points)
-  equivalence(newtempx3,E1_mxm_m2_m1_8points)
-  equivalence(newtempy3,E2_mxm_m2_m1_8points)
-  equivalence(newtempz3,E3_mxm_m2_m1_8points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_8points(8,j)
-            C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_8points(8,j)
-            C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_8points(8,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-            C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-            C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j)
-            E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j)
-            E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-            E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-            E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_8p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(9,9,9) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points
-  real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points
-  real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points
-
-  equivalence(dummyx_loc,B1_m1_m2_9points)
-  equivalence(dummyy_loc,B2_m1_m2_9points)
-  equivalence(dummyz_loc,B3_m1_m2_9points)
-  equivalence(tempx1,C1_m1_m2_9points)
-  equivalence(tempy1,C2_m1_m2_9points)
-  equivalence(tempz1,C3_m1_m2_9points)
-  equivalence(newtempx1,E1_m1_m2_9points)
-  equivalence(newtempy1,E2_m1_m2_9points)
-  equivalence(newtempz1,E3_m1_m2_9points)
-
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_9points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_9points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_9points)
-  equivalence(tempx3,C1_mxm_m2_m1_9points)
-  equivalence(tempy3,C2_mxm_m2_m1_9points)
-  equivalence(tempz3,C3_mxm_m2_m1_9points)
-  equivalence(newtempx3,E1_mxm_m2_m1_9points)
-  equivalence(newtempy3,E2_mxm_m2_m1_9points)
-  equivalence(newtempz3,E3_mxm_m2_m1_9points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B1_m1_m2_9points(9,j)
-            C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B2_m1_m2_9points(9,j)
-            C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B3_m1_m2_9points(9,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyx_loc(i,9,k)*hprime_xxT(9,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyy_loc(i,9,k)*hprime_xxT(9,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyz_loc(i,9,k)*hprime_xxT(9,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-            C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-            C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
- 
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j)
-            E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j)
-            E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempx2(i,9,k)*hprimewgll_xx(9,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempy2(i,9,k)*hprimewgll_xx(9,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempz2(i,9,k)*hprimewgll_xx(9,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-            E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-            E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_9p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(10,10,10) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points
-  real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points
-  real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points
-
-  equivalence(dummyx_loc,B1_m1_m2_10points)
-  equivalence(dummyy_loc,B2_m1_m2_10points)
-  equivalence(dummyz_loc,B3_m1_m2_10points)
-  equivalence(tempx1,C1_m1_m2_10points)
-  equivalence(tempy1,C2_m1_m2_10points)
-  equivalence(tempz1,C3_m1_m2_10points)
-  equivalence(newtempx1,E1_m1_m2_10points)
-  equivalence(newtempy1,E2_m1_m2_10points)
-  equivalence(newtempz1,E3_m1_m2_10points)
-
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_10points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_10points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_10points)
-  equivalence(tempx3,C1_mxm_m2_m1_10points)
-  equivalence(tempy3,C2_mxm_m2_m1_10points)
-  equivalence(tempz3,C3_mxm_m2_m1_10points)
-  equivalence(newtempx3,E1_mxm_m2_m1_10points)
-  equivalence(newtempy3,E2_mxm_m2_m1_10points)
-  equivalence(newtempz3,E3_mxm_m2_m1_10points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B1_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B1_m1_m2_10points(10,j)
-            C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B2_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B2_m1_m2_10points(10,j)
-            C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B3_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B3_m1_m2_10points(10,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyx_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyx_loc(i,10,k)*hprime_xxT(10,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyy_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyy_loc(i,10,k)*hprime_xxT(10,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyz_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyz_loc(i,10,k)*hprime_xxT(10,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-            C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-            C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j)
-            E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j)
-            E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempx2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempx2(i,10,k)*hprimewgll_xx(10,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempy2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempy2(i,10,k)*hprimewgll_xx(10,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempz2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempz2(i,10,k)*hprimewgll_xx(10,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-            E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-            E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_10p
diff --git a/src/specfem3D/compute_forces_elastic_Dev2.f90 b/src/specfem3D/compute_forces_elastic_Dev2.f90
new file mode 100644
index 000000000..6ee7d610a
--- /dev/null
+++ b/src/specfem3D/compute_forces_elastic_Dev2.f90
@@ -0,0 +1,3477 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+! higher-order Deville routines (NGLL == 6 to NGLL == 10 )
+!
+! note: put these routines into this extra file to avoid reaching internal threshold
+!          for vectorizations when compiling
+
+subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(6,6,6) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points
+  real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points
+  real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points
+
+  equivalence(dummyx_loc,B1_m1_m2_6points)
+  equivalence(dummyy_loc,B2_m1_m2_6points)
+  equivalence(dummyz_loc,B3_m1_m2_6points)
+  equivalence(tempx1,C1_m1_m2_6points)
+  equivalence(tempy1,C2_m1_m2_6points)
+  equivalence(tempz1,C3_m1_m2_6points)
+  equivalence(newtempx1,E1_m1_m2_6points)
+  equivalence(newtempy1,E2_m1_m2_6points)
+  equivalence(newtempz1,E3_m1_m2_6points)
+
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_6points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_6points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_6points)
+  equivalence(tempx3,C1_mxm_m2_m1_6points)
+  equivalence(tempy3,C2_mxm_m2_m1_6points)
+  equivalence(tempz3,C3_mxm_m2_m1_6points)
+  equivalence(newtempx3,E1_mxm_m2_m1_6points)
+  equivalence(newtempy3,E2_mxm_m2_m1_6points)
+  equivalence(newtempz3,E3_mxm_m2_m1_6points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_6points(6,j)
+            C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_6points(6,j)
+            C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_6points(6,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+            C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+            C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j)
+            E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j)
+            E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+            E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+            E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_6p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(7,7,7) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points
+  real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points
+  real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points
+
+  equivalence(dummyx_loc,B1_m1_m2_7points)
+  equivalence(dummyy_loc,B2_m1_m2_7points)
+  equivalence(dummyz_loc,B3_m1_m2_7points)
+  equivalence(tempx1,C1_m1_m2_7points)
+  equivalence(tempy1,C2_m1_m2_7points)
+  equivalence(tempz1,C3_m1_m2_7points)
+  equivalence(newtempx1,E1_m1_m2_7points)
+  equivalence(newtempy1,E2_m1_m2_7points)
+  equivalence(newtempz1,E3_m1_m2_7points)
+
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_7points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_7points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_7points)
+  equivalence(tempx3,C1_mxm_m2_m1_7points)
+  equivalence(tempy3,C2_mxm_m2_m1_7points)
+  equivalence(tempz3,C3_mxm_m2_m1_7points)
+  equivalence(newtempx3,E1_mxm_m2_m1_7points)
+  equivalence(newtempy3,E2_mxm_m2_m1_7points)
+  equivalence(newtempz3,E3_mxm_m2_m1_7points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_7points(7,j)
+            C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_7points(7,j)
+            C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_7points(7,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+            C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+            C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j)
+            E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j)
+            E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+            E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+            E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_7p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(8,8,8) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points
+  real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points
+  real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points
+
+  equivalence(dummyx_loc,B1_m1_m2_8points)
+  equivalence(dummyy_loc,B2_m1_m2_8points)
+  equivalence(dummyz_loc,B3_m1_m2_8points)
+  equivalence(tempx1,C1_m1_m2_8points)
+  equivalence(tempy1,C2_m1_m2_8points)
+  equivalence(tempz1,C3_m1_m2_8points)
+  equivalence(newtempx1,E1_m1_m2_8points)
+  equivalence(newtempy1,E2_m1_m2_8points)
+  equivalence(newtempz1,E3_m1_m2_8points)
+
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_8points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_8points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_8points)
+  equivalence(tempx3,C1_mxm_m2_m1_8points)
+  equivalence(tempy3,C2_mxm_m2_m1_8points)
+  equivalence(tempz3,C3_mxm_m2_m1_8points)
+  equivalence(newtempx3,E1_mxm_m2_m1_8points)
+  equivalence(newtempy3,E2_mxm_m2_m1_8points)
+  equivalence(newtempz3,E3_mxm_m2_m1_8points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_8points(8,j)
+            C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_8points(8,j)
+            C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_8points(8,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+            C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+            C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j)
+            E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j)
+            E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+            E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+            E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_8p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(9,9,9) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points
+  real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points
+  real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points
+
+  equivalence(dummyx_loc,B1_m1_m2_9points)
+  equivalence(dummyy_loc,B2_m1_m2_9points)
+  equivalence(dummyz_loc,B3_m1_m2_9points)
+  equivalence(tempx1,C1_m1_m2_9points)
+  equivalence(tempy1,C2_m1_m2_9points)
+  equivalence(tempz1,C3_m1_m2_9points)
+  equivalence(newtempx1,E1_m1_m2_9points)
+  equivalence(newtempy1,E2_m1_m2_9points)
+  equivalence(newtempz1,E3_m1_m2_9points)
+
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_9points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_9points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_9points)
+  equivalence(tempx3,C1_mxm_m2_m1_9points)
+  equivalence(tempy3,C2_mxm_m2_m1_9points)
+  equivalence(tempz3,C3_mxm_m2_m1_9points)
+  equivalence(newtempx3,E1_mxm_m2_m1_9points)
+  equivalence(newtempy3,E2_mxm_m2_m1_9points)
+  equivalence(newtempz3,E3_mxm_m2_m1_9points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B1_m1_m2_9points(9,j)
+            C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B2_m1_m2_9points(9,j)
+            C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B3_m1_m2_9points(9,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyx_loc(i,9,k)*hprime_xxT(9,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyy_loc(i,9,k)*hprime_xxT(9,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyz_loc(i,9,k)*hprime_xxT(9,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+            C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+            C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j)
+            E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j)
+            E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempx2(i,9,k)*hprimewgll_xx(9,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempy2(i,9,k)*hprimewgll_xx(9,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempz2(i,9,k)*hprimewgll_xx(9,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+            E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+            E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_9p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(10,10,10) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points
+  real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points
+  real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points
+
+  equivalence(dummyx_loc,B1_m1_m2_10points)
+  equivalence(dummyy_loc,B2_m1_m2_10points)
+  equivalence(dummyz_loc,B3_m1_m2_10points)
+  equivalence(tempx1,C1_m1_m2_10points)
+  equivalence(tempy1,C2_m1_m2_10points)
+  equivalence(tempz1,C3_m1_m2_10points)
+  equivalence(newtempx1,E1_m1_m2_10points)
+  equivalence(newtempy1,E2_m1_m2_10points)
+  equivalence(newtempz1,E3_m1_m2_10points)
+
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_10points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_10points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_10points)
+  equivalence(tempx3,C1_mxm_m2_m1_10points)
+  equivalence(tempy3,C2_mxm_m2_m1_10points)
+  equivalence(tempz3,C3_mxm_m2_m1_10points)
+  equivalence(newtempx3,E1_mxm_m2_m1_10points)
+  equivalence(newtempy3,E2_mxm_m2_m1_10points)
+  equivalence(newtempz3,E3_mxm_m2_m1_10points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B1_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B1_m1_m2_10points(10,j)
+            C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B2_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B2_m1_m2_10points(10,j)
+            C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B3_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B3_m1_m2_10points(10,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyx_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyx_loc(i,10,k)*hprime_xxT(10,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyy_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyy_loc(i,10,k)*hprime_xxT(10,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyz_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyz_loc(i,10,k)*hprime_xxT(10,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+            C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+            C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j)
+            E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j)
+            E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempx2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempx2(i,10,k)*hprimewgll_xx(10,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempy2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempy2(i,10,k)*hprimewgll_xx(10,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempz2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempz2(i,10,k)*hprimewgll_xx(10,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+            E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+            E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_10p
diff --git a/src/specfem3D/compute_forces_elastic_Dev_openmp.f90 b/src/specfem3D/compute_forces_elastic_Dev_openmp.f90
new file mode 100644
index 000000000..7a73f63bc
--- /dev/null
+++ b/src/specfem3D/compute_forces_elastic_Dev_openmp.f90
@@ -0,0 +1,786 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            October 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! OpenMP Threaded variant by John Levesque, Max Rietmann and Olaf Schenk
+
+  subroutine compute_forces_elastic_Dev_openmp(iphase ,NSPEC_AB,NGLOB_AB, &
+                             displ,accel, &
+                             xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                             hprime_xx,hprime_xxT, &
+                             hprimewgll_xx,hprimewgll_xxT, &
+                             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                             kappastore,mustore,jacobian,ibool, &
+                             ATTENUATION, &
+                             one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                             NSPEC_ATTENUATION_AB, &
+                             R_xx,R_yy,R_xy,R_xz,R_yz, &
+                             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                             epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                             ANISOTROPY,NSPEC_ANISO, &
+                             c11store,c12store,c13store,c14store,c15store,c16store,&
+                             c22store,c23store,c24store,c25store,c26store,c33store,&
+                             c34store,c35store,c36store,c44store,c45store,c46store,&
+                             c55store,c56store,c66store, &
+                             SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                             NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                             is_moho_top,is_moho_bot, &
+                             dsdx_top,dsdx_bot, &
+                             ispec2D_moho_top,ispec2D_moho_bot, &
+                             num_phase_ispec_elastic,&
+                             phase_ispec_inner_elastic,&
+                             num_colors_outer_elastic,num_colors_inner_elastic)
+
+
+
+  ! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+       N_SLS,SAVE_MOHO_MESH, &
+       ONE_THIRD,FOUR_THIRDS,m1,m2
+
+  ! Trying to pass these variables as subroutine arguments ran into
+  ! problems, so we reference them from their module, making them
+  ! accessible from this subroutine
+  use specfem_par_elastic, only:dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3,&
+       newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3,&
+       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3,num_elem_colors_elastic
+
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+  ! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+
+  ! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+       kappastore,mustore,jacobian
+
+  ! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+  ! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+       R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+  ! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+       c11store,c12store,c13store,c14store,c15store,c16store, &
+       c22store,c23store,c24store,c25store,c26store,c33store, &
+       c34store,c35store,c36store,c44store,c45store,c46store, &
+       c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+  ! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+       dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  integer OMP_get_thread_num
+  integer OMP_GET_MAX_THREADS
+
+  ! timing
+  !double precision omp_get_wtime
+  !double precision start_time
+  !double precision end_time
+  !double precision accumulate_time_start
+  !double precision accumulate_time_stop
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+       c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+  integer thread_id
+  integer NUM_THREADS
+  !integer omp_get_num_threads ! function
+
+  ! coloring additions
+  ! integer, dimension(:), allocatable :: num_elem_colors_elastic
+  integer istart, estart, number_of_colors
+  integer num_colors_outer_elastic, num_colors_inner_elastic
+  integer icolor
+
+  ! write(*,*) "num_elem_colors_elastic(1) = ",num_elem_colors_elastic(1)
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! NUM_THREADS = 1
+  NUM_THREADS = OMP_GET_MAX_THREADS()
+
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+     number_of_colors = num_colors_outer_elastic
+     istart = 1
+  else
+     number_of_colors = num_colors_inner_elastic + num_colors_outer_elastic
+     istart = num_colors_outer_elastic+1
+     ! istart = num_colors_outer_elastic
+  endif
+
+  ! "start" timer
+  ! start_time = omp_get_wtime()
+
+  ! The mesh coloring algorithm provides disjoint sets of elements that
+  ! do not share degrees of freedom which is required for the assembly
+  ! step at the "accel(iglob) += update" step. The coloring is
+  ! implemented, such that the element and node indices are ordered by
+  ! color. This requires then only to iterate through the elements in
+  ! order, stopping to synchronize threads after all the elements in a
+  ! color are finished.
+  estart = 1
+  do icolor = istart, number_of_colors
+
+    !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(&
+    !$OMP R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3,&
+    !$OMP factor_loc,alphaval_loc,betaval_loc,gammaval_loc,&
+    !$OMP Sn,Snp1,&
+    !$OMP templ,&
+    !$OMP xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl,&
+    !$OMP duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,&
+    !$OMP duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,&
+    !$OMP duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,&
+    !$OMP sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,&
+    !$OMP fac1,fac2,fac3,&
+    !$OMP lambdal,mul,lambdalplus2mul,kappal,&
+    !$OMP c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+    !$OMP c33,c34,c35,c36,c44,c45,c46,c55,c56,c66,&
+    !$OMP i_SLS,&
+    !$OMP ispec,iglob,ispec_p,&
+    !$OMP i,j,k,&
+    !$OMP thread_id)
+
+    thread_id = OMP_get_thread_num()+1
+
+    ! we retrive the subset of the total elements determined by the mesh
+    ! coloring. This number changes as we iterate through the colors
+    num_elements = num_elem_colors_elastic(icolor)
+    !$OMP DO
+    do ispec_p = estart,(estart-1)+num_elements
+
+
+       ! returns element id from stored element list
+       ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+       ! adjoint simulations: moho kernel
+       if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+             ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+             ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+       endif ! adjoint
+
+       ! stores displacment values in local array
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k,thread_id) = displ(1,iglob)
+                dummyy_loc(i,j,k,thread_id) = displ(2,iglob)
+                dummyz_loc(i,j,k,thread_id) = displ(3,iglob)
+             enddo
+          enddo
+       enddo
+
+       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+       ! for incompressible fluid flow, Cambridge University Press (2002),
+       ! pages 386 and 389 and Figure 8.3.1
+       ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+       do j=1,m2
+          do i=1,m1
+             tempx1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyx_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyx_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyx_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyx_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyx_loc(5,j,1,thread_id)
+             tempy1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyy_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyy_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyy_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyy_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyy_loc(5,j,1,thread_id)
+             tempz1(i,j,1,thread_id) = &
+                  hprime_xx(i,1)*dummyz_loc(1,j,1,thread_id) + &
+                  hprime_xx(i,2)*dummyz_loc(2,j,1,thread_id) + &
+                  hprime_xx(i,3)*dummyz_loc(3,j,1,thread_id) + &
+                  hprime_xx(i,4)*dummyz_loc(4,j,1,thread_id) + &
+                  hprime_xx(i,5)*dummyz_loc(5,j,1,thread_id)
+          enddo
+       enddo
+
+       !   call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+       !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+       do j=1,m1
+          do i=1,m1
+             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                tempx2(i,j,k,thread_id) = dummyx_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyx_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyx_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyx_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyx_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+                tempy2(i,j,k,thread_id) = dummyy_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyy_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyy_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyy_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyy_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+                tempz2(i,j,k,thread_id) = dummyz_loc(i,1,k,thread_id)*hprime_xxT(1,j) + &
+                     dummyz_loc(i,2,k,thread_id)*hprime_xxT(2,j) + &
+                     dummyz_loc(i,3,k,thread_id)*hprime_xxT(3,j) + &
+                     dummyz_loc(i,4,k,thread_id)*hprime_xxT(4,j) + &
+                     dummyz_loc(i,5,k,thread_id)*hprime_xxT(5,j)
+             enddo
+          enddo
+       enddo
+
+       ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+       do j=1,m1
+          do i=1,m2
+             tempx3(i,1,j,thread_id) = &
+                  dummyx_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyx_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyx_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyx_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyx_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+             tempy3(i,1,j,thread_id) = &
+                  dummyy_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyy_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyy_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyy_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyy_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+             tempz3(i,1,j,thread_id) = &
+                  dummyz_loc(i,1,1,thread_id)*hprime_xxT(1,j) + &
+                  dummyz_loc(i,1,2,thread_id)*hprime_xxT(2,j) + &
+                  dummyz_loc(i,1,3,thread_id)*hprime_xxT(3,j) + &
+                  dummyz_loc(i,1,4,thread_id)*hprime_xxT(4,j) + &
+                  dummyz_loc(i,1,5,thread_id)*hprime_xxT(5,j)
+
+          enddo
+       enddo
+
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+                ! get derivatives of ux, uy and uz with respect to x, y and z
+                xixl = xix(i,j,k,ispec)
+                xiyl = xiy(i,j,k,ispec)
+                xizl = xiz(i,j,k,ispec)
+                etaxl = etax(i,j,k,ispec)
+                etayl = etay(i,j,k,ispec)
+                etazl = etaz(i,j,k,ispec)
+                gammaxl = gammax(i,j,k,ispec)
+                gammayl = gammay(i,j,k,ispec)
+                gammazl = gammaz(i,j,k,ispec)
+                jacobianl = jacobian(i,j,k,ispec)
+
+                duxdxl = xixl*tempx1(i,j,k,thread_id) + etaxl*tempx2(i,j,k,thread_id) + gammaxl*tempx3(i,j,k,thread_id)
+                duxdyl = xiyl*tempx1(i,j,k,thread_id) + etayl*tempx2(i,j,k,thread_id) + gammayl*tempx3(i,j,k,thread_id)
+                duxdzl = xizl*tempx1(i,j,k,thread_id) + etazl*tempx2(i,j,k,thread_id) + gammazl*tempx3(i,j,k,thread_id)
+
+                duydxl = xixl*tempy1(i,j,k,thread_id) + etaxl*tempy2(i,j,k,thread_id) + gammaxl*tempy3(i,j,k,thread_id)
+                duydyl = xiyl*tempy1(i,j,k,thread_id) + etayl*tempy2(i,j,k,thread_id) + gammayl*tempy3(i,j,k,thread_id)
+                duydzl = xizl*tempy1(i,j,k,thread_id) + etazl*tempy2(i,j,k,thread_id) + gammazl*tempy3(i,j,k,thread_id)
+
+                duzdxl = xixl*tempz1(i,j,k,thread_id) + etaxl*tempz2(i,j,k,thread_id) + gammaxl*tempz3(i,j,k,thread_id)
+                duzdyl = xiyl*tempz1(i,j,k,thread_id) + etayl*tempz2(i,j,k,thread_id) + gammayl*tempz3(i,j,k,thread_id)
+                duzdzl = xizl*tempz1(i,j,k,thread_id) + etazl*tempz2(i,j,k,thread_id) + gammazl*tempz3(i,j,k,thread_id)
+
+                ! save strain on the Moho boundary
+                if (SAVE_MOHO_MESH ) then
+                   if (is_moho_top(ispec)) then
+                      dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                      dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                      dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                      dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                      dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                      dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                      dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                      dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                      dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                   else if (is_moho_bot(ispec)) then
+                      dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                      dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                      dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                      dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                      dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                      dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                      dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                      dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                      dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                   endif
+                endif
+
+                ! precompute some sums to save CPU time
+                duxdxl_plus_duydyl = duxdxl + duydyl
+                duxdxl_plus_duzdzl = duxdxl + duzdzl
+                duydyl_plus_duzdzl = duydyl + duzdzl
+                duxdyl_plus_duydxl = duxdyl + duydxl
+                duzdxl_plus_duxdzl = duzdxl + duxdzl
+                duzdyl_plus_duydzl = duzdyl + duydzl
+
+                ! computes deviatoric strain attenuation and/or for kernel calculations
+                if (COMPUTE_AND_STORE_STRAIN) then
+                   templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                   if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                   epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                   epsilondev_yy_loc(i,j,k) = duydyl - templ
+                   epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                   epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                   epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+                endif
+
+                kappal = kappastore(i,j,k,ispec)
+                mul = mustore(i,j,k,ispec)
+
+                ! attenuation
+                if(ATTENUATION) then
+                   ! use unrelaxed parameters if attenuation
+                   mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+                endif
+
+                ! full anisotropic case, stress calculations
+                if(ANISOTROPY) then
+                   c11 = c11store(i,j,k,ispec)
+                   c12 = c12store(i,j,k,ispec)
+                   c13 = c13store(i,j,k,ispec)
+                   c14 = c14store(i,j,k,ispec)
+                   c15 = c15store(i,j,k,ispec)
+                   c16 = c16store(i,j,k,ispec)
+                   c22 = c22store(i,j,k,ispec)
+                   c23 = c23store(i,j,k,ispec)
+                   c24 = c24store(i,j,k,ispec)
+                   c25 = c25store(i,j,k,ispec)
+                   c26 = c26store(i,j,k,ispec)
+                   c33 = c33store(i,j,k,ispec)
+                   c34 = c34store(i,j,k,ispec)
+                   c35 = c35store(i,j,k,ispec)
+                   c36 = c36store(i,j,k,ispec)
+                   c44 = c44store(i,j,k,ispec)
+                   c45 = c45store(i,j,k,ispec)
+                   c46 = c46store(i,j,k,ispec)
+                   c55 = c55store(i,j,k,ispec)
+                   c56 = c56store(i,j,k,ispec)
+                   c66 = c66store(i,j,k,ispec)
+
+                   sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                        c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                   sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                        c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                   sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                        c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                   sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                        c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                   sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                        c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                   sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                        c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+                else
+
+                   ! isotropic case
+                   lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                   lambdal = lambdalplus2mul - 2.*mul
+
+                   ! compute stress sigma
+                   sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                   sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                   sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                   sigma_xy = mul*duxdyl_plus_duydxl
+                   sigma_xz = mul*duzdxl_plus_duxdzl
+                   sigma_yz = mul*duzdyl_plus_duydzl
+
+                endif ! ANISOTROPY
+
+                ! subtract memory variables if attenuation
+                if(ATTENUATION) then
+                   ! way 1
+                   !                do i_sls = 1,N_SLS
+                   !                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+                   !                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+                   !                  sigma_xx = sigma_xx - R_xx_val
+                   !                  sigma_yy = sigma_yy - R_yy_val
+                   !                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+                   !                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                   !                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                   !                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                   !                enddo
+
+                   ! way 2
+                   ! note: this should help compilers to pipeline the code and make better use of the cache;
+                   !          depending on compilers, it can further decrease the computation time by ~ 30%.
+                   !          by default, N_SLS = 3, therefore we take steps of 3
+                   if(imodulo_N_SLS >= 1) then
+                      do i_sls = 1,imodulo_N_SLS
+                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                         sigma_xx = sigma_xx - R_xx_val1
+                         sigma_yy = sigma_yy - R_yy_val1
+                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                      enddo
+                   endif
+
+                   if(N_SLS >= imodulo_N_SLS+1) then
+                      do i_sls = imodulo_N_SLS+1,N_SLS,3
+                         R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                         R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                         sigma_xx = sigma_xx - R_xx_val1
+                         sigma_yy = sigma_yy - R_yy_val1
+                         sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                         R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                         R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                         sigma_xx = sigma_xx - R_xx_val2
+                         sigma_yy = sigma_yy - R_yy_val2
+                         sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                         R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                         R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                         sigma_xx = sigma_xx - R_xx_val3
+                         sigma_yy = sigma_yy - R_yy_val3
+                         sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                         sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                         sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                         sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                      enddo
+                   endif
+
+
+                endif
+
+                ! form dot product with test vector, symmetric form
+                tempx1(i,j,k,thread_id) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+                tempy1(i,j,k,thread_id) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+                tempz1(i,j,k,thread_id) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+                tempx2(i,j,k,thread_id) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+                tempy2(i,j,k,thread_id) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+                tempz2(i,j,k,thread_id) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+                tempx3(i,j,k,thread_id) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+                tempy3(i,j,k,thread_id) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+                tempz3(i,j,k,thread_id) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+             enddo
+          enddo
+       enddo
+
+       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+       ! for incompressible fluid flow, Cambridge University Press (2002),
+       ! pages 386 and 389 and Figure 8.3.1
+       ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+       do j=1,m2
+          do i=1,m1
+             newtempx1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempx1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempx1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempx1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempx1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempx1(5,j,1,thread_id)
+             newtempy1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempy1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempy1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempy1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempy1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempy1(5,j,1,thread_id)
+             newtempz1(i,j,1,thread_id) = &
+                  hprimewgll_xxT(i,1)*tempz1(1,j,1,thread_id) + &
+                  hprimewgll_xxT(i,2)*tempz1(2,j,1,thread_id) + &
+                  hprimewgll_xxT(i,3)*tempz1(3,j,1,thread_id) + &
+                  hprimewgll_xxT(i,4)*tempz1(4,j,1,thread_id) + &
+                  hprimewgll_xxT(i,5)*tempz1(5,j,1,thread_id)
+          enddo
+       enddo
+
+       !   call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+       !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+       do i=1,m1
+          do j=1,m1
+             ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+             do k = 1,NGLLX
+                newtempx2(i,j,k,thread_id) = tempx2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempx2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempx2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempx2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempx2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+                newtempy2(i,j,k,thread_id) = tempy2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempy2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempy2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempy2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempy2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+                newtempz2(i,j,k,thread_id) = tempz2(i,1,k,thread_id)*hprimewgll_xx(1,j) + &
+                     tempz2(i,2,k,thread_id)*hprimewgll_xx(2,j) + &
+                     tempz2(i,3,k,thread_id)*hprimewgll_xx(3,j) + &
+                     tempz2(i,4,k,thread_id)*hprimewgll_xx(4,j) + &
+                     tempz2(i,5,k,thread_id)*hprimewgll_xx(5,j)
+             enddo
+          enddo
+       enddo
+
+       ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+       do j=1,m1
+          do i=1,m2
+             newtempx3(i,1,j,thread_id) = &
+                  tempx3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempx3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempx3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempx3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempx3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+             newtempy3(i,1,j,thread_id) = &
+                  tempy3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempy3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempy3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempy3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempy3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+             newtempz3(i,1,j,thread_id) = &
+                  tempz3(i,1,1,thread_id)*hprimewgll_xx(1,j) + &
+                  tempz3(i,1,2,thread_id)*hprimewgll_xx(2,j) + &
+                  tempz3(i,1,3,thread_id)*hprimewgll_xx(3,j) + &
+                  tempz3(i,1,4,thread_id)*hprimewgll_xx(4,j) + &
+                  tempz3(i,1,5,thread_id)*hprimewgll_xx(5,j)
+          enddo
+       enddo
+
+       do k=1,NGLLZ
+          do j=1,NGLLY
+             do i=1,NGLLX
+
+                fac1 = wgllwgll_yz(j,k)
+                fac2 = wgllwgll_xz(i,k)
+                fac3 = wgllwgll_xy(i,j)
+
+                ! sum contributions from each element to the global mesh using indirect addressing
+                iglob = ibool(i,j,k,ispec)
+
+                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id)&
+                !      - fac1*newtempx1(i,j,k,thread_id) - fac2*newtempx2(i,j,k,thread_id)&
+                !      - fac3*newtempx3(i,j,k,thread_id)
+                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id)&
+                !      - fac1*newtempy1(i,j,k,thread_id) - fac2*newtempy2(i,j,k,thread_id)&
+                !      - fac3*newtempy3(i,j,k,thread_id)
+                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id)&
+                !      - fac1*newtempz1(i,j,k,thread_id) - fac2*newtempz2(i,j,k,thread_id)&
+                !      - fac3*newtempz3(i,j,k,thread_id)
+
+                ! Assembly of shared degrees of freedom fixed through mesh coloring
+                !! !$OMP ATOMIC
+                accel(1,iglob) = accel(1,iglob) &
+                                  - (fac1*newtempx1(i,j,k,thread_id) &
+                                   + fac2*newtempx2(i,j,k,thread_id) &
+                                   + fac3*newtempx3(i,j,k,thread_id))
+                !! !$OMP ATOMIC
+                accel(2,iglob) = accel(2,iglob) &
+                                  - (fac1*newtempy1(i,j,k,thread_id) &
+                                   + fac2*newtempy2(i,j,k,thread_id) &
+                                   + fac3*newtempy3(i,j,k,thread_id))
+                !! !$OMP ATOMIC
+                accel(3,iglob) = accel(3,iglob) &
+                                  - (fac1*newtempz1(i,j,k,thread_id) &
+                                   + fac2*newtempz2(i,j,k,thread_id) &
+                                   + fac3*newtempz3(i,j,k,thread_id))
+
+                ! accel(1,iglob) = accel(1,iglob) - &
+                ! (fac1*newtempx1(i,j,k,thread_id) + fac2*newtempx2(i,j,k,thread_id) + fac3*newtempx3(i,j,k,thread_id))
+                ! accel(2,iglob) = accel(2,iglob) - &
+                ! (fac1*newtempy1(i,j,k,thread_id) + fac2*newtempy2(i,j,k,thread_id) + fac3*newtempy3(i,j,k,thread_id))
+                ! accel(3,iglob) = accel(3,iglob) - &
+                ! (fac1*newtempz1(i,j,k,thread_id) + fac2*newtempz2(i,j,k,thread_id) + fac3*newtempz3(i,j,k,thread_id))
+
+                ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id) - fac1*newtempx1(i,j,k,thread_id) - &
+                !                   fac2*newtempx2(i,j,k,thread_id) - fac3*newtempx3(i,j,k,thread_id)
+                ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id) - fac1*newtempy1(i,j,k,thread_id) - &
+                !                   fac2*newtempy2(i,j,k,thread_id) - fac3*newtempy3(i,j,k,thread_id)
+                ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id) - fac1*newtempz1(i,j,k,thread_id) - &
+                !      fac2*newtempz2(i,j,k,thread_id) - fac3*newtempz3(i,j,k,thread_id)
+
+                !  update memory variables based upon the Runge-Kutta scheme
+                if(ATTENUATION) then
+
+                   ! use Runge-Kutta scheme to march in time
+                   do i_sls = 1,N_SLS
+
+                      factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                      alphaval_loc = alphaval(i_sls)
+                      betaval_loc = betaval(i_sls)
+                      gammaval_loc = gammaval(i_sls)
+
+                      ! term in xx
+                      Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                      R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in yy
+                      Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                      R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in zz not computed since zero trace
+                      ! term in xy
+                      Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                      R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in xz
+                      Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                      R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+                      ! term in yz
+                      Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                      Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                      R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                           betaval_loc * Sn + gammaval_loc * Snp1
+
+                   enddo   ! end of loop on memory variables
+
+                endif  !  end attenuation
+
+             enddo
+          enddo
+       enddo
+
+       ! save deviatoric strain for Runge-Kutta scheme
+       if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+       endif
+
+    enddo  ! spectral element loop
+    !$OMP END DO
+    !$OMP END PARALLEL
+
+    ! The elements are in order of color. First we do color 1 elements,
+    ! then color 2, etc. The ispec has to moved to start at the next
+    ! color.
+    estart = estart + num_elements
+
+  enddo ! loop over colors
+
+
+  ! "stop" timer
+  ! end_time = omp_get_wtime()
+
+  ! write(*,*) "Total Elapsed time: ", (end_time-start_time) , "seconds. (Threads=",NUM_THREADS,")"
+  ! write(*,*) "Accumulate Elapsed time: ", (accumulate_time_stop-accumulate_time_start) , "seconds"
+
+
+  ! These are now allocated at the beginning and never deallocated
+  ! because the program just finishes at the end.
+
+  ! deallocate(dummyx_loc)
+  ! deallocate(dummyy_loc)
+  ! deallocate(dummyz_loc)
+  ! deallocate(newtempx1)
+  ! deallocate(newtempx2)
+  ! deallocate(newtempx3)
+  ! deallocate(newtempy1)
+  ! deallocate(newtempy2)
+  ! deallocate(newtempy3)
+  ! deallocate(newtempz1)
+  ! deallocate(newtempz2)
+  ! deallocate(newtempz3)
+  ! deallocate(tempx1)
+  ! deallocate(tempx2)
+  ! deallocate(tempx3)
+  ! deallocate(tempy1)
+  ! deallocate(tempy2)
+  ! deallocate(tempy3)
+  ! deallocate(tempz1)
+  ! deallocate(tempz2)
+  ! deallocate(tempz3)
+
+  ! accel(:,:) = accel_omp(:,:,1)
+
+  end subroutine compute_forces_elastic_Dev_openmp
+
+
diff --git a/src/specfem3D/compute_gradient.f90 b/src/specfem3D/compute_gradient.f90
index 39dd1b961..3fd2dffe3 100644
--- a/src/specfem3D/compute_gradient.f90
+++ b/src/specfem3D/compute_gradient.f90
@@ -29,14 +29,18 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                         scalar_field, vector_field_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
 
 ! calculates gradient of given acoustic scalar (potential) field on all GLL points in one, single element
 ! note:
 !   displacement s = (rho)^{-1} \del \chi
 !   velocity          v = (rho)^{-1} \del \ddot \chi
 !
+!  in case of gravity:
+!   displacement s = \del \chi
+!   velocity          v = \del \ddot \chi
 ! returns: (1/rho) times gradient vector field (vector_field_element) in specified element
+!             or in gravity case, just gradient vector field
 
   implicit none
   include 'constants.h'
@@ -59,6 +63,8 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy
   real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz
 
+  logical :: GRAVITY
+
 ! local parameters
   real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
   real(kind=CUSTOM_REAL) temp1l,temp2l,temp3l
@@ -100,7 +106,12 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
         gammayl = gammay(i,j,k,ispec)
         gammazl = gammaz(i,j,k,ispec)
 
-        rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec)
+        ! daniel: TODO - check gravity case here
+        if( GRAVITY ) then
+          rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec)
+        else
+          rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec)
+        endif
 
         ! derivatives of acoustic scalar potential field on GLL points
         vector_field_element(1,i,j,k) = (temp1l*xixl + temp2l*etaxl + temp3l*gammaxl) * rho_invl
diff --git a/src/specfem3D/compute_kernels.f90 b/src/specfem3D/compute_kernels.f90
index b7fee1029..c1c7606d3 100644
--- a/src/specfem3D/compute_kernels.f90
+++ b/src/specfem3D/compute_kernels.f90
@@ -32,68 +32,160 @@ subroutine compute_kernels()
   use specfem_par
   use specfem_par_elastic
   use specfem_par_acoustic
+  implicit none
+
+  ! elastic simulations
+  if( ELASTIC_SIMULATION ) then
+    call compute_kernels_el()
+  endif
+
+  ! elastic simulations
+  if( ACOUSTIC_SIMULATION ) then
+    call compute_kernels_ac()
+  endif
+
+  ! computes an approximative hessian for preconditioning kernels
+  if ( APPROXIMATE_HESS_KL ) then
+    call compute_kernels_hessian()
+  endif
+
+  end subroutine compute_kernels
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_kernels_el()
+
+! kernel calculations
+! see e.g. Tromp et al. (2005)
+
+  use specfem_par
+  use specfem_par_elastic
 
   implicit none
   ! local parameters
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_displ_elm,accel_elm
-  real(kind=CUSTOM_REAL) :: kappal,rhol
   integer :: i,j,k,ispec,iglob
   real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc,b_epsilondev_loc
 
-  ! updates kernels
-  do ispec = 1, NSPEC_AB
+  if( .not. GPU_MODE ) then
+    ! updates kernels on CPU
+    do ispec = 1, NSPEC_AB
+
+      ! elastic domains
+      if( ispec_is_elastic(ispec) ) then
+
+         do k = 1, NGLLZ
+            do j = 1, NGLLY
+               do i = 1, NGLLX
+                  iglob = ibool(i,j,k,ispec)
+
+                  ! isotropic kernels
+                  ! note: takes displacement from backward/reconstructed (forward) field b_displ
+                  !          and acceleration from adjoint field accel (containing adjoint sources)
+                  !
+                  ! note: : time integral summation uses deltat
+                  !
+                  ! compare with Tromp et al. (2005), eq. (14), which takes adjoint displacement
+                  ! and forward acceleration, that is the symmetric form of what is calculated here
+                  ! however, this kernel expression is symmetric with regards
+                  ! to interchange adjoint - forward field
+                  rho_kl(i,j,k,ispec) =  rho_kl(i,j,k,ispec) &
+                       + deltat * dot_product(accel(:,iglob), b_displ(:,iglob))
+
+                  ! kernel for shear modulus, see e.g. Tromp et al. (2005), equation (17)
+                  ! note: multiplication with 2*mu(x) will be done after the time loop
+                  epsilondev_loc(1) = epsilondev_xx(i,j,k,ispec)
+                  epsilondev_loc(2) = epsilondev_yy(i,j,k,ispec)
+                  epsilondev_loc(3) = epsilondev_xy(i,j,k,ispec)
+                  epsilondev_loc(4) = epsilondev_xz(i,j,k,ispec)
+                  epsilondev_loc(5) = epsilondev_yz(i,j,k,ispec)
+
+                  b_epsilondev_loc(1) = b_epsilondev_xx(i,j,k,ispec)
+                  b_epsilondev_loc(2) = b_epsilondev_yy(i,j,k,ispec)
+                  b_epsilondev_loc(3) = b_epsilondev_xy(i,j,k,ispec)
+                  b_epsilondev_loc(4) = b_epsilondev_xz(i,j,k,ispec)
+                  b_epsilondev_loc(5) = b_epsilondev_yz(i,j,k,ispec)
+
+                  mu_kl(i,j,k,ispec) =  mu_kl(i,j,k,ispec) &
+                       + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) &
+                       + (epsilondev_loc(1)+epsilondev_loc(2)) * (b_epsilondev_loc(1)+b_epsilondev_loc(2)) &
+                       + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + &
+                       epsilondev_loc(5)*b_epsilondev_loc(5)) )
+
+                  ! kernel for bulk modulus, see e.g. Tromp et al. (2005), equation (18)
+                  ! note: multiplication with kappa(x) will be done after the time loop
+                  kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) &
+                       + deltat * (9 * epsilon_trace_over_3(i,j,k,ispec) &
+                       * b_epsilon_trace_over_3(i,j,k,ispec))
+
+               enddo
+            enddo
+         enddo
+      endif !ispec_is_elastic
+
+    enddo
+
+  else
+    ! updates kernels on GPU
+    call compute_kernels_elastic_cuda(Mesh_pointer,deltat)
+  endif
 
-    ! elastic domains
-    if( ispec_is_elastic(ispec) ) then
 
-      do k = 1, NGLLZ
-        do j = 1, NGLLY
-          do i = 1, NGLLX
-            iglob = ibool(i,j,k,ispec)
+  ! moho kernel
+  if( SAVE_MOHO_MESH ) then
+    if( GPU_MODE ) then
+      call transfer_accel_from_device(NDIM*NGLOB_AB,accel,Mesh_pointer)
+      call transfer_b_displ_from_device(NDIM*NGLOB_AB,b_displ,Mesh_pointer)
+    endif
+    ! updates on CPU
+    call compute_boundary_kernel()
+  endif
 
-            ! isotropic kernels
-            ! note: takes displacement from backward/reconstructed (forward) field b_displ
-            !          and acceleration from adjoint field accel (containing adjoint sources)
-            !
-            ! note: : time integral summation uses deltat
-            !
-            ! compare with Tromp et al. (2005), eq. (14), which takes adjoint displacement
-            ! and forward acceleration, that is the symmetric form of what is calculated here
-            ! however, this kernel expression is symmetric with regards
-            ! to interchange adjoint - forward field
-            rho_kl(i,j,k,ispec) =  rho_kl(i,j,k,ispec) &
-                                + deltat * dot_product(accel(:,iglob), b_displ(:,iglob))
-
-            ! kernel for shear modulus, see e.g. Tromp et al. (2005), equation (17)
-            ! note: multiplication with 2*mu(x) will be done after the time loop
-            epsilondev_loc(1) = epsilondev_xx(i,j,k,ispec)
-            epsilondev_loc(2) = epsilondev_yy(i,j,k,ispec)
-            epsilondev_loc(3) = epsilondev_xy(i,j,k,ispec)
-            epsilondev_loc(4) = epsilondev_xz(i,j,k,ispec)
-            epsilondev_loc(5) = epsilondev_yz(i,j,k,ispec)
-
-            b_epsilondev_loc(1) = b_epsilondev_xx(i,j,k,ispec)
-            b_epsilondev_loc(2) = b_epsilondev_yy(i,j,k,ispec)
-            b_epsilondev_loc(3) = b_epsilondev_xy(i,j,k,ispec)
-            b_epsilondev_loc(4) = b_epsilondev_xz(i,j,k,ispec)
-            b_epsilondev_loc(5) = b_epsilondev_yz(i,j,k,ispec)
-
-            mu_kl(i,j,k,ispec) =  mu_kl(i,j,k,ispec) &
-             + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) &
-             + (epsilondev_loc(1)+epsilondev_loc(2)) * (b_epsilondev_loc(1)+b_epsilondev_loc(2)) &
-             + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + &
-              epsilondev_loc(5)*b_epsilondev_loc(5)) )
-
-            ! kernel for bulk modulus, see e.g. Tromp et al. (2005), equation (18)
-            ! note: multiplication with kappa(x) will be done after the time loop
-            kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) &
-                              + deltat * (9 * epsilon_trace_over_3(i,j,k,ispec) &
-                                          * b_epsilon_trace_over_3(i,j,k,ispec))
+  ! for noise simulations --- source strength kernel
+  if (NOISE_TOMOGRAPHY == 3) then
+    call compute_kernels_strength_noise(NGLLSQUARE*num_free_surface_faces,ibool, &
+                        sigma_kl,displ,deltat,it, &
+                        normal_x_noise,normal_y_noise,normal_z_noise, &
+                        noise_surface_movie, &
+                        NSPEC_AB,NGLOB_AB, &
+                        num_free_surface_faces,free_surface_ispec,free_surface_ijk,&
+                        GPU_MODE,Mesh_pointer)
+  endif
 
-          enddo
-        enddo
-      enddo
-    endif !ispec_is_elastic
+  end subroutine compute_kernels_el
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_kernels_ac()
+
+! kernel calculations
+! see e.g. Tromp et al. (2005)
+
+  use specfem_par
+  use specfem_par_acoustic
+
+  implicit none
+  ! local parameters
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_displ_elm,accel_elm
+  real(kind=CUSTOM_REAL) :: kappal,rhol
+  integer :: i,j,k,ispec,iglob
+
+  ! updates kernels on GPU
+  if(GPU_MODE) then
+
+    ! computes contribution to density and bulk modulus kernel
+    call compute_kernels_acoustic_cuda(Mesh_pointer,deltat)
+
+    ! kernels are done
+    return
+  endif
+
+  ! updates kernels
+  do ispec = 1, NSPEC_AB
 
     ! acoustic domains
     if( ispec_is_acoustic(ispec) ) then
@@ -103,41 +195,20 @@ subroutine compute_kernels()
                       b_potential_acoustic, b_displ_elm,&
                       hprime_xx,hprime_yy,hprime_zz, &
                       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      ibool,rhostore)
+                      ibool,rhostore,GRAVITY)
       ! adjoint fields: acceleration vector
-      !<YANGL
-      !!!! old expression (\bfs\dagger=\frac{1}{\rho}\bfnabla\phi^\dagger)
-      !!!call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-      !!!                potential_dot_dot_acoustic, accel_elm,&
-      !!!                hprime_xx,hprime_yy,hprime_zz, &
-      !!!                xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-      !!!                ibool,rhostore)
       ! new expression (\partial_t^2\bfs^\dagger=-\frac{1}{\rho}\bfnabla\phi^\dagger)
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                       potential_acoustic, accel_elm,&
                       hprime_xx,hprime_yy,hprime_zz, &
                       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      ibool,rhostore)
-      !>YANGL
+                      ibool,rhostore,GRAVITY)
 
       do k = 1, NGLLZ
         do j = 1, NGLLY
           do i = 1, NGLLX
             iglob = ibool(i,j,k,ispec)
 
-            !<YANGL
-            !!!! old expression
-            !!!! density kernel
-            !!!rhol = rhostore(i,j,k,ispec)
-            !!!rho_ac_kl(i,j,k,ispec) =  rho_ac_kl(i,j,k,ispec) &
-            !!!          - deltat * rhol * dot_product(accel_elm(:,i,j,k), b_displ_elm(:,i,j,k))
-
-            !!!! bulk modulus kernel
-            !!!kappal = kappastore(i,j,k,ispec)
-            !!!kappa_ac_kl(i,j,k,ispec) = kappa_ac_kl(i,j,k,ispec) &
-            !!!                      - deltat / kappal  &
-            !!!                      * potential_dot_dot_acoustic(iglob) &
-            !!!                      * b_potential_dot_dot_acoustic(iglob)
             ! new expression
             ! density kernel
             rhol = rhostore(i,j,k,ispec)
@@ -148,10 +219,8 @@ subroutine compute_kernels()
             kappal = 1._CUSTOM_REAL / kappastore(i,j,k,ispec)
             kappa_ac_kl(i,j,k,ispec) = kappa_ac_kl(i,j,k,ispec) &
                                   + deltat * kappal  &
-                                  * potential_acoustic(iglob) * b_potential_dot_dot_acoustic(iglob)
-
-
-            !>YANGL
+                                  * potential_acoustic(iglob) &
+                                  * b_potential_dot_dot_acoustic(iglob)
 
           enddo
         enddo
@@ -160,29 +229,11 @@ subroutine compute_kernels()
 
   enddo
 
-  ! moho kernel
-  if( ELASTIC_SIMULATION  .and. SAVE_MOHO_MESH ) then
-      call compute_boundary_kernel()
-  endif
-
-  ! for noise simulations --- source strength kernel
-  if (NOISE_TOMOGRAPHY == 3)  &
-    call compute_kernels_strength_noise(NGLLSQUARE*num_free_surface_faces,ibool, &
-                        sigma_kl,displ,deltat,it, &
-                        normal_x_noise,normal_y_noise,normal_z_noise, &
-                        noise_surface_movie, &
-                        NSPEC_AB,NGLOB_AB, &
-                        num_free_surface_faces,free_surface_ispec,free_surface_ijk)
+  end subroutine compute_kernels_ac
 
-  ! computes an approximative hessian for preconditioning kernels
-  if ( APPROXIMATE_HESS_KL ) then
-    call compute_kernels_hessian()
-  endif
-
-  end subroutine compute_kernels
-
-
-!-----------------------------------------------------------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
   subroutine compute_kernels_hessian()
 
@@ -195,6 +246,17 @@ subroutine compute_kernels_hessian()
   real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_accel_elm,accel_elm
   integer :: i,j,k,ispec,iglob
 
+  ! updates kernels on GPU
+  if(GPU_MODE) then
+
+    ! computes contribution to density and bulk modulus kernel
+    call compute_kernels_hess_cuda(Mesh_pointer,deltat, &
+                                  ELASTIC_SIMULATION,ACOUSTIC_SIMULATION)
+
+    ! done on GPU
+    return
+  endif
+
   ! loops over all elements
   do ispec = 1, NSPEC_AB
 
@@ -206,14 +268,14 @@ subroutine compute_kernels_hessian()
                       potential_dot_dot_acoustic, accel_elm,&
                       hprime_xx,hprime_yy,hprime_zz, &
                       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      ibool,rhostore)
+                      ibool,rhostore,GRAVITY)
 
       ! adjoint fields: acceleration vector
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                       b_potential_dot_dot_acoustic, b_accel_elm,&
                       hprime_xx,hprime_yy,hprime_zz, &
                       xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                      ibool,rhostore)
+                      ibool,rhostore,GRAVITY)
 
       do k = 1, NGLLZ
         do j = 1, NGLLY
diff --git a/src/specfem3D/compute_stacey_acoustic.f90 b/src/specfem3D/compute_stacey_acoustic.f90
index 14e01a728..422205a92 100644
--- a/src/specfem3D/compute_stacey_acoustic.f90
+++ b/src/specfem3D/compute_stacey_acoustic.f90
@@ -33,7 +33,8 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, &
                             num_abs_boundary_faces,rhostore,kappastore,ispec_is_acoustic,&
                             SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it,NGLOB_ADJOINT, &
                             b_potential_dot_dot_acoustic,b_reclen_potential, &
-                            b_absorb_potential,b_num_abs_boundary_faces)
+                            b_absorb_potential,b_num_abs_boundary_faces, &
+                            GPU_MODE,Mesh_pointer)
 
   implicit none
 
@@ -67,16 +68,23 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, &
   real(kind=CUSTOM_REAL),dimension(NGLLSQUARE,b_num_abs_boundary_faces):: b_absorb_potential
   logical:: SAVE_FORWARD
 
+  ! GPU_MODE variables
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+
 ! local parameters
   real(kind=CUSTOM_REAL) :: rhol,cpl,jacobianw,absorbl
   integer :: ispec,iglob,i,j,k,iface,igll
   !integer:: reclen1,reclen2
 
+  ! checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return
+
   ! adjoint simulations:
   if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
     ! reads in absorbing boundary array when first phase is running
     if( phase_is_inner .eqv. .false. ) then
-      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newark scheme
+      ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
       ! uses fortran routine
       !read(IOABS_AC,rec=NSTEP-it+1) reclen1,b_absorb_potential,reclen2
       !if (reclen1 /= b_reclen_potential .or. reclen1 /= reclen2) &
@@ -87,52 +95,61 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, &
   endif !adjoint
 
   ! absorbs absorbing-boundary surface using Sommerfeld condition (vanishing field in the outer-space)
-  do iface=1,num_abs_boundary_faces
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do iface=1,num_abs_boundary_faces
 
-    ispec = abs_boundary_ispec(iface)
+      ispec = abs_boundary_ispec(iface)
 
-    if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+      if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-      if( ispec_is_acoustic(ispec) ) then
+        if( ispec_is_acoustic(ispec) ) then
 
-        ! reference gll points on boundary face
-        do igll = 1,NGLLSQUARE
+          ! reference gll points on boundary face
+          do igll = 1,NGLLSQUARE
 
-          ! gets local indices for GLL point
-          i = abs_boundary_ijk(1,igll,iface)
-          j = abs_boundary_ijk(2,igll,iface)
-          k = abs_boundary_ijk(3,igll,iface)
+            ! gets local indices for GLL point
+            i = abs_boundary_ijk(1,igll,iface)
+            j = abs_boundary_ijk(2,igll,iface)
+            k = abs_boundary_ijk(3,igll,iface)
 
-          ! gets global index
-          iglob=ibool(i,j,k,ispec)
+            ! gets global index
+            iglob=ibool(i,j,k,ispec)
 
-          ! determines bulk sound speed
-          rhol = rhostore(i,j,k,ispec)
-          cpl = sqrt( kappastore(i,j,k,ispec) / rhol )
+            ! determines bulk sound speed
+            rhol = rhostore(i,j,k,ispec)
+            cpl = sqrt( kappastore(i,j,k,ispec) / rhol )
 
-          ! gets associated, weighted jacobian
-          jacobianw = abs_boundary_jacobian2Dw(igll,iface)
+            ! gets associated, weighted jacobian
+            jacobianw = abs_boundary_jacobian2Dw(igll,iface)
 
-          ! Sommerfeld condition
-          absorbl = potential_dot_acoustic(iglob) * jacobianw / cpl / rhol
-          potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
-                                              - absorbl
+            ! Sommerfeld condition
+            absorbl = potential_dot_acoustic(iglob) * jacobianw / cpl / rhol
+            potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) &
+                                                - absorbl
 
-          ! adjoint simulations
-          if (SIMULATION_TYPE == 3) then
-            b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
-                                                  - b_absorb_potential(igll,iface)
+            ! adjoint simulations
+            if (SIMULATION_TYPE == 3) then
+              b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) &
+                                                    - b_absorb_potential(igll,iface)
 
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-              b_absorb_potential(igll,iface) = absorbl
+            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+                b_absorb_potential(igll,iface) = absorbl
 
-          endif !adjoint
+            endif !adjoint
 
-         enddo
+           enddo
 
-      endif ! ispec_is_acoustic
-    endif ! ispec_is_inner
-  enddo ! num_abs_boundary_faces
+        endif ! ispec_is_acoustic
+      endif ! ispec_is_inner
+    enddo ! num_abs_boundary_faces
+  else
+    ! GPU_MODE == .true.
+    if( num_abs_boundary_faces > 0 ) &
+      call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, &
+                                       SIMULATION_TYPE,SAVE_FORWARD,b_absorb_potential)
+
+  endif
 
   ! adjoint simulations: stores absorbed wavefield part
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
diff --git a/src/specfem3D/compute_stacey_elastic.f90 b/src/specfem3D/compute_stacey_elastic.f90
index 18633e893..c4997d532 100644
--- a/src/specfem3D/compute_stacey_elastic.f90
+++ b/src/specfem3D/compute_stacey_elastic.f90
@@ -36,7 +36,8 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
                         veloc,rho_vp,rho_vs, &
                         ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, &
                         NSTEP,it,NGLOB_ADJOINT,b_accel, &
-                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field)
+                        b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
+                        GPU_MODE,Mesh_pointer)
 
   implicit none
 
@@ -74,11 +75,17 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
   real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel
   logical:: SAVE_FORWARD
 
+  ! GPU_MODE variables
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+
 ! local parameters
   real(kind=CUSTOM_REAL) vx,vy,vz,nx,ny,nz,tx,ty,tz,vn,jacobianw
   integer :: ispec,iglob,i,j,k,iface,igll
   !integer:: reclen1,reclen2
 
+  ! checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return
 
 ! adjoint simulations:
   if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
@@ -95,63 +102,72 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
   endif !adjoint
 
 
-! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
-  do iface=1,num_abs_boundary_faces
+  if(.NOT. GPU_MODE) then
+
+     ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
+     do iface=1,num_abs_boundary_faces
 
-    ispec = abs_boundary_ispec(iface)
+        ispec = abs_boundary_ispec(iface)
 
-    if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+        if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
 
-      if( ispec_is_elastic(ispec) ) then
+           if( ispec_is_elastic(ispec) ) then
 
-        ! reference gll points on boundary face
-        do igll = 1,NGLLSQUARE
+              ! reference gll points on boundary face
+              do igll = 1,NGLLSQUARE
 
-          ! gets local indices for GLL point
-          i = abs_boundary_ijk(1,igll,iface)
-          j = abs_boundary_ijk(2,igll,iface)
-          k = abs_boundary_ijk(3,igll,iface)
+                 ! gets local indices for GLL point
+                 i = abs_boundary_ijk(1,igll,iface)
+                 j = abs_boundary_ijk(2,igll,iface)
+                 k = abs_boundary_ijk(3,igll,iface)
 
-          ! gets velocity
-          iglob=ibool(i,j,k,ispec)
-          vx=veloc(1,iglob)
-          vy=veloc(2,iglob)
-          vz=veloc(3,iglob)
+                 ! gets velocity
+                 iglob=ibool(i,j,k,ispec)
+                 vx=veloc(1,iglob)
+                 vy=veloc(2,iglob)
+                 vz=veloc(3,iglob)
 
-          ! gets associated normal
-          nx = abs_boundary_normal(1,igll,iface)
-          ny = abs_boundary_normal(2,igll,iface)
-          nz = abs_boundary_normal(3,igll,iface)
+                 ! gets associated normal
+                 nx = abs_boundary_normal(1,igll,iface)
+                 ny = abs_boundary_normal(2,igll,iface)
+                 nz = abs_boundary_normal(3,igll,iface)
 
-          ! velocity component in normal direction (normal points out of element)
-          vn = vx*nx + vy*ny + vz*nz
+                 ! velocity component in normal direction (normal points out of element)
+                 vn = vx*nx + vy*ny + vz*nz
 
-          ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it
-          tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx)
-          ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny)
-          tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz)
+                 ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it
+                 tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx)
+                 ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny)
+                 tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz)
 
-          ! gets associated, weighted jacobian
-          jacobianw = abs_boundary_jacobian2Dw(igll,iface)
+                 ! gets associated, weighted jacobian
+                 jacobianw = abs_boundary_jacobian2Dw(igll,iface)
 
-          ! adds stacey term (weak form)
-          accel(1,iglob) = accel(1,iglob) - tx*jacobianw
-          accel(2,iglob) = accel(2,iglob) - ty*jacobianw
-          accel(3,iglob) = accel(3,iglob) - tz*jacobianw
+                 ! adds stacey term (weak form)
+                 accel(1,iglob) = accel(1,iglob) - tx*jacobianw
+                 accel(2,iglob) = accel(2,iglob) - ty*jacobianw
+                 accel(3,iglob) = accel(3,iglob) - tz*jacobianw
 
-          ! adjoint simulations
-          if (SIMULATION_TYPE == 3) then
-            b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-            b_absorb_field(1,igll,iface) = tx*jacobianw
-            b_absorb_field(2,igll,iface) = ty*jacobianw
-            b_absorb_field(3,igll,iface) = tz*jacobianw
-          endif !adjoint
+                 ! adjoint simulations
+                 if (SIMULATION_TYPE == 3) then
+                    b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
+                 else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+                    b_absorb_field(1,igll,iface) = tx*jacobianw
+                    b_absorb_field(2,igll,iface) = ty*jacobianw
+                    b_absorb_field(3,igll,iface) = tz*jacobianw
+                 endif !adjoint
 
-         enddo
-      endif ! ispec_is_elastic
-    endif ! ispec_is_inner
-  enddo
+              enddo
+           endif ! ispec_is_elastic
+        endif ! ispec_is_inner
+     enddo
+
+  else
+    ! GPU_MODE == .true.
+    if( num_abs_boundary_faces > 0 ) &
+      call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, &
+                                      SIMULATION_TYPE,SAVE_FORWARD,b_absorb_field)
+  endif
 
   ! adjoint simulations: stores absorbed wavefield part
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
diff --git a/src/specfem3D/create_color_image.f90 b/src/specfem3D/create_color_image.f90
index 76672681a..45f629df2 100644
--- a/src/specfem3D/create_color_image.f90
+++ b/src/specfem3D/create_color_image.f90
@@ -33,32 +33,32 @@ module image_PNM_GIF_par
   ! USER PARAMETER
 
   ! image data output:
-  !   type = 1 : velocity V_x component
-  !   type = 2 : velocity V_y component
-  !   type = 3 : velocity V_z component
-  !   type = 4 : velocity V norm
-  integer,parameter:: IMAGE_TYPE = 3
+  !   type = 1 : displ/velocity x-component
+  !   type = 2 : displ/velocity y-component
+  !   type = 3 : displ/velocity z-component
+  !   type = 4 : displ/velocity norm
+  integer,parameter:: IMAGE_TYPE = 3 ! 4
 
   ! cross-section surface
   ! cross-section origin point
-  real(kind=CUSTOM_REAL),parameter:: section_xorg = 67000.0
-  real(kind=CUSTOM_REAL),parameter:: section_yorg = 0.0
-  real(kind=CUSTOM_REAL),parameter:: section_zorg = -1000.0
+  real(kind=CUSTOM_REAL),parameter:: section_xorg = 0.0 ! 67000.0
+  real(kind=CUSTOM_REAL),parameter:: section_yorg = 0.0 ! 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_zorg = -100.0 ! 0.0
 
   ! cross-section surface normal
-  real(kind=CUSTOM_REAL),parameter:: section_nx = 0.0
-  real(kind=CUSTOM_REAL),parameter:: section_ny = 0.0
-  real(kind=CUSTOM_REAL),parameter:: section_nz = 1.0
+  real(kind=CUSTOM_REAL),parameter:: section_nx = 0.0 !1.0
+  real(kind=CUSTOM_REAL),parameter:: section_ny = 0.0 !0.0
+  real(kind=CUSTOM_REAL),parameter:: section_nz = 1.0 !0.0
 
   ! cross-section (in-plane) horizontal-direction
-  real(kind=CUSTOM_REAL),parameter:: section_hdirx = 1.0
-  real(kind=CUSTOM_REAL),parameter:: section_hdiry = 0.0
-  real(kind=CUSTOM_REAL),parameter:: section_hdirz = 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_hdirx = 1.0 ! 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_hdiry = 0.0 !1.0
+  real(kind=CUSTOM_REAL),parameter:: section_hdirz = 0.0 ! 0.0
 
   ! cross-section (in-plane) vertical-direction
-  real(kind=CUSTOM_REAL),parameter:: section_vdirx = 0.0
-  real(kind=CUSTOM_REAL),parameter:: section_vdiry = 1.0
-  real(kind=CUSTOM_REAL),parameter:: section_vdirz = 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_vdirx = 0.0 ! 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_vdiry = 1.0 ! 0.0
+  real(kind=CUSTOM_REAL),parameter:: section_vdirz = 0.0 ! 1.0
 
   ! non linear display to enhance small amplitudes in color images
   real(kind=CUSTOM_REAL), parameter :: POWER_DISPLAY_COLOR = 0.30_CUSTOM_REAL
@@ -118,6 +118,7 @@ subroutine write_PNM_GIF_initialize()
   real(kind=CUSTOM_REAL),dimension(:,:),allocatable :: dist_pixel_image,dist_pixel_recv
   real(kind=CUSTOM_REAL):: pixel_midpoint_x,pixel_midpoint_z,x_loc,z_loc,xtmp,ztmp
   real(kind=CUSTOM_REAL):: ratio
+  real(kind=CUSTOM_REAL):: distance_x1,distance_x2,distance_z1,distance_z2
   integer:: npgeo,npgeo_glob
   integer:: i,j,k,iproc,iglob,ispec,ier
   ! data from mesh
@@ -129,6 +130,8 @@ subroutine write_PNM_GIF_initialize()
   !character(len=256) :: vtkfilename
   integer :: zoom_factor = 4
   logical :: zoom
+  integer, dimension(1) :: tmp_pixel_loc
+  integer, dimension(1,0:NPROC-1) :: tmp_pixel_per_proc
 
   ! checks image type
   if(IMAGE_TYPE > 4 .or. IMAGE_TYPE < 1) then
@@ -263,8 +266,17 @@ subroutine write_PNM_GIF_initialize()
   endif
 
   ! create all the pixels
-  size_pixel_horizontal = (xmax_color_image - xmin_color_image) / dble(NX_IMAGE_color)
-  size_pixel_vertical = (zmax_color_image - zmin_color_image) / dble(NZ_IMAGE_color)
+  if( NX_IMAGE_color /= 0 ) then
+    size_pixel_horizontal = (xmax_color_image - xmin_color_image) / dble(NX_IMAGE_color)
+  else
+    size_pixel_horizontal = 0.0
+  endif
+
+  if( NZ_IMAGE_color /= 0 ) then
+    size_pixel_vertical = (zmax_color_image - zmin_color_image) / dble(NZ_IMAGE_color)
+  else
+    size_pixel_vertical = 0.0
+  endif
 
   if (myrank == 0) then
     write(IMAIN,*) '  image points: ',npgeo_glob
@@ -285,6 +297,19 @@ subroutine write_PNM_GIF_initialize()
   iglob_image_color(:,:) = -1
   ispec_image_color(:,:) = 0
   dist_pixel_image(:,:) = HUGEVAL
+
+  if( zoom ) then
+    distance_x1 = zoom_factor*size_pixel_horizontal
+    distance_x2 = (zoom_factor+1)*size_pixel_horizontal
+    distance_z1 = zoom_factor*size_pixel_vertical
+    distance_z2 = (zoom_factor+1)*size_pixel_vertical
+  else
+    distance_x1 = 0.0
+    distance_x2 = 2.0*size_pixel_horizontal
+    distance_z1 = 0.0
+    distance_z2 = 2.0*size_pixel_vertical
+  endif
+
   do j=1,NZ_IMAGE_color
     do i=1,NX_IMAGE_color
       ! calculates midpoint of pixel
@@ -307,15 +332,8 @@ subroutine write_PNM_GIF_initialize()
         z_loc = zcoord(iglob)
 
         ! checks if inside pixel range for larger numbers of points, minimizing computation time
-        if( zoom ) then
-          if( x_loc < xtmp-zoom_factor*size_pixel_horizontal .or. &
-             x_loc > xtmp + (zoom_factor+1)*size_pixel_horizontal ) cycle
-          if( z_loc < ztmp-zoom_factor*size_pixel_vertical .or. &
-             z_loc > ztmp + (zoom_factor+1)*size_pixel_vertical ) cycle
-        else
-          if( x_loc < xtmp .or. x_loc > xtmp + size_pixel_horizontal ) cycle
-          if( z_loc < ztmp .or. z_loc > ztmp + size_pixel_vertical ) cycle
-        endif
+        if( x_loc < xtmp - distance_x1 .or. x_loc > xtmp + distance_x2 ) cycle
+        if( z_loc < ztmp - distance_z1 .or. z_loc > ztmp + distance_z2 ) cycle
 
         ! stores closest iglob
         x_loc = pixel_midpoint_x - x_loc
@@ -382,7 +400,10 @@ subroutine write_PNM_GIF_initialize()
   ! filling array iglob_image_color, containing info on which process owns which pixels.
   allocate(nb_pixel_per_proc(0:NPROC-1),stat=ier)
   if( ier /= 0 ) stop 'error allocating array nb_pixel_per_proc'
-  call gather_all_i(nb_pixel_loc,1,nb_pixel_per_proc,1,NPROC)
+
+  tmp_pixel_loc(1) = nb_pixel_loc
+  call gather_all_i(tmp_pixel_loc,1,tmp_pixel_per_proc,1,NPROC)
+  nb_pixel_per_proc(:) = tmp_pixel_per_proc(1,:)
 
   ! allocates receiving array
   if ( myrank == 0 ) then
@@ -506,7 +527,7 @@ subroutine write_PNM_GIF_create_image
   implicit none
 
   ! local parameters
-  real(kind=CUSTOM_REAL),dimension(NDIM) :: veloc_val
+  real(kind=CUSTOM_REAL),dimension(NDIM) :: val_vector
   real(kind=CUSTOM_REAL):: temp
   integer :: i,j,k,iglob,ispec,iproc
 
@@ -523,15 +544,15 @@ subroutine write_PNM_GIF_create_image
     ispec = ispec_image_color(i,j)
 
     ! gets velocity for point iglob
-    call get_iglob_veloc(iglob,ispec,veloc_val)
+    call get_iglob_veloc(iglob,ispec,val_vector)
 
     ! data type
     if( IMAGE_TYPE == 4 ) then
       ! velocity norm
-      temp = sqrt( veloc_val(1)**2 + veloc_val(2)**2 + veloc_val(3)**2 )
+      temp = sqrt( val_vector(1)**2 + val_vector(2)**2 + val_vector(3)**2 )
     else
       ! velocity component
-      temp = veloc_val(IMAGE_TYPE)
+      temp = val_vector(IMAGE_TYPE)
     endif
 
     ! stores data
@@ -829,53 +850,89 @@ end subroutine get_iglob_vp
 
 !=============================================================
 
-  subroutine get_iglob_veloc(iglob,ispec,veloc_val)
+  subroutine get_iglob_veloc(iglob,ispec,val_vector)
 
   use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM
-  use specfem_par_acoustic,only: ACOUSTIC_SIMULATION,potential_dot_acoustic,&
-                                rhostore,ispec_is_acoustic,b_potential_dot_acoustic
-  use specfem_par_elastic,only: ELASTIC_SIMULATION,veloc,ispec_is_elastic,b_veloc
+  use specfem_par_acoustic,only: ACOUSTIC_SIMULATION,potential_acoustic,potential_dot_acoustic, &
+                                rhostore,ispec_is_acoustic, &
+                                b_potential_acoustic,b_potential_dot_acoustic
+  use specfem_par_elastic,only: ELASTIC_SIMULATION,displ,veloc, &
+                                ispec_is_elastic,b_displ,b_veloc
   use specfem_par,only: NSPEC_AB,NGLOB_AB,hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,SIMULATION_TYPE
+                        ibool,SIMULATION_TYPE,GRAVITY
+  use specfem_par_movie,only:SAVE_DISPLACEMENT
   implicit none
 
   integer,intent(in) :: iglob,ispec
-  real(kind=CUSTOM_REAL),dimension(NDIM),intent(out):: veloc_val
+  real(kind=CUSTOM_REAL),dimension(NDIM),intent(out):: val_vector
 
   ! local parameters
-  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: veloc_element
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: val_element
   integer :: i,j,k
 
   ! returns first element encountered for iglob index
   if( ELASTIC_SIMULATION ) then
     if( ispec_is_elastic(ispec) ) then
-      if( SIMULATION_TYPE == 3 ) then
-        veloc_val(:) = b_veloc(:,iglob)
+      if(SAVE_DISPLACEMENT) then
+        if( SIMULATION_TYPE == 3 ) then
+          ! to display re-constructed wavefield
+          !val_vector(:) = b_displ(:,iglob)
+          ! to display adjoint wavefield
+          val_vector(:) = displ(:,iglob)
+        else
+          val_vector(:) = displ(:,iglob)
+        endif
       else
-        veloc_val(:) = veloc(:,iglob)
+        if( SIMULATION_TYPE == 3 ) then
+          ! to display re-constructed wavefield
+          !val_vector(:) = b_veloc(:,iglob)
+          ! to display adjoint wavefield
+          val_vector(:) = veloc(:,iglob)
+        else
+          val_vector(:) = veloc(:,iglob)
+        endif
       endif
 
       ! returns with this result
       return
     endif
   endif
+
   if( ACOUSTIC_SIMULATION ) then
     if( ispec_is_acoustic(ispec) ) then
-      if( SIMULATION_TYPE == 3 ) then
-        ! velocity vector for backward/reconstructed wavefield
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                          b_potential_dot_acoustic, veloc_element,&
+      if(SAVE_DISPLACEMENT) then
+        if( SIMULATION_TYPE == 3 ) then
+          ! displacement vector from backward potential
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          b_potential_acoustic, val_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
+        else
+          ! displacement vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
+        endif
       else
-        ! velocity vector
-        call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
-                          potential_dot_acoustic, veloc_element,&
+        if( SIMULATION_TYPE == 3 ) then
+          ! velocity vector for backward/reconstructed wavefield
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          b_potential_dot_acoustic, val_element,&
+                          hprime_xx,hprime_yy,hprime_zz, &
+                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                          ibool,rhostore,GRAVITY)
+        else
+          ! velocity vector
+          call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+                          potential_dot_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
+        endif
       endif
 
       ! returns corresponding iglob velocity entry
@@ -883,7 +940,7 @@ subroutine get_iglob_veloc(iglob,ispec,veloc_val)
         do j=1,NGLLY
           do i=1,NGLLX
             if( ibool(i,j,k,ispec) == iglob ) then
-              veloc_val(:) = veloc_element(:,i,j,k)
+              val_vector(:) = val_element(:,i,j,k)
               return
             endif
           enddo
diff --git a/src/specfem3D/finalize_simulation.f90 b/src/specfem3D/finalize_simulation.f90
index 4b5acfc40..a20d9c007 100644
--- a/src/specfem3D/finalize_simulation.f90
+++ b/src/specfem3D/finalize_simulation.f90
@@ -36,10 +36,10 @@ subroutine finalize_simulation()
 
   integer :: irec_local
 
-! save last frame
+  ! save last frame
 
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
-    open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
+     open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
           status='unknown',form='unformatted')
 
     if( ACOUSTIC_SIMULATION ) then
diff --git a/src/specfem3D/initialize_simulation.f90 b/src/specfem3D/initialize_simulation.f90
index f63fd21ab..54d337be3 100644
--- a/src/specfem3D/initialize_simulation.f90
+++ b/src/specfem3D/initialize_simulation.f90
@@ -48,6 +48,9 @@ subroutine initialize_simulation()
                         NTSTEP_BETWEEN_OUTPUT_INFO,SIMULATION_TYPE,SAVE_FORWARD, &
                         NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,IMODEL)
 
+  ! GPU_MODE is in par_file
+  call read_gpu_mode(GPU_MODE,GRAVITY)
+
   ! get the base pathname for output files
   call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH)))
 
@@ -111,9 +114,11 @@ subroutine initialize_simulation()
     write(IMAIN,'(a)',advance='yes') '  tomo'
     case( IMODEL_USER_EXTERNAL )
     write(IMAIN,'(a)',advance='yes') '  external'
+    case( IMODEL_IPATI )
+    write(IMAIN,'(a)',advance='yes') '  ipati'
     end select
-    
-    write(IMAIN,*)    
+
+    write(IMAIN,*)
   endif
 
   ! reads in numbers of spectral elements and points for this process' domain
@@ -167,15 +172,18 @@ subroutine initialize_simulation()
           gammaz(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           jacobian(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
   if( ier /= 0 ) stop 'error allocating arrays for databases'
+
   ! mesh node locations
   allocate(xstore(NGLOB_AB), &
           ystore(NGLOB_AB), &
           zstore(NGLOB_AB),stat=ier)
   if( ier /= 0 ) stop 'error allocating arrays for mesh nodes'
+
   ! material properties
   allocate(kappastore(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           mustore(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
   if( ier /= 0 ) stop 'error allocating arrays for material properties'
+
   ! material flags
   allocate(ispec_is_acoustic(NSPEC_AB), &
           ispec_is_elastic(NSPEC_AB), &
@@ -185,6 +193,9 @@ subroutine initialize_simulation()
   ! initializes adjoint simulations
   call initialize_simulation_adjoint()
 
+  ! initializes GPU cards
+  if( GPU_MODE ) call initialize_GPU()
+
   end subroutine initialize_simulation
 
 !
@@ -214,7 +225,7 @@ subroutine initialize_simulation_check()
       write(IMAIN,*) 'error: number of MPI processors actually run on: ',sizeprocs
       print*
       print*, 'error specfem3D: number of processors supposed to run on: ',NPROC
-      print*, 'error specfem3D: number of MPI processors actually run on: ',sizeprocs      
+      print*, 'error specfem3D: number of MPI processors actually run on: ',sizeprocs
       print*
     endif
     call exit_MPI(myrank,'wrong number of MPI processes')
@@ -233,6 +244,10 @@ subroutine initialize_simulation_check()
       stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ is in [5-10]'
   endif
 
+  ! gravity only on GPU supported
+  if( .not. GPU_MODE .and. GRAVITY ) &
+    stop 'GRAVITY only supported in GPU mode'
+
   ! absorbing surfaces
   if( ABSORBING_CONDITIONS ) then
     ! for arbitrary orientation of elements, which face belongs to xmin,xmax,etc... -
@@ -278,6 +293,7 @@ subroutine initialize_simulation_check()
   endif
 
   end subroutine initialize_simulation_check
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -318,3 +334,55 @@ subroutine initialize_simulation_adjoint()
   endif
 
   end subroutine initialize_simulation_adjoint
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine initialize_GPU()
+
+! initialization for GPU cards
+
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_acoustic
+  use specfem_par_poroelastic
+  implicit none
+  integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
+
+  ! GPU_MODE now defined in Par_file
+  if(myrank == 0 ) then
+    write(IMAIN,*)
+    write(IMAIN,*) "GPU_MODE Active."
+  endif
+
+  ! check for GPU runs
+  if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
+    stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+  if( CUSTOM_REAL /= 4 ) &
+    stop 'GPU mode runs only with CUSTOM_REAL == 4'
+  if( SAVE_MOHO_MESH ) &
+    stop 'GPU mode does not support SAVE_MOHO_MESH yet'
+  if( ATTENUATION ) then
+    if( N_SLS /= 3 ) &
+      stop 'GPU mode does not support N_SLS /= 3 yet'
+  endif
+  if( POROELASTIC_SIMULATION ) then
+    stop 'poroelastic simulations on GPU not supported yet'
+  endif
+
+  ! initializes GPU and outputs info to files for all processes
+  call prepare_cuda_device(myrank,ncuda_devices)
+
+  ! collects min/max of local devices found for statistics
+  call sync_all()
+  call min_all_i(ncuda_devices,ncuda_devices_min)
+  call max_all_i(ncuda_devices,ncuda_devices_max)
+
+  if( myrank == 0 ) then
+    write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min
+    write(IMAIN,*) "                                max =",ncuda_devices_max
+    write(IMAIN,*)
+  endif
+
+  end subroutine initialize_GPU
diff --git a/src/specfem3D/iterate_time.f90 b/src/specfem3D/iterate_time.f90
index bb74fadb2..d0b317748 100644
--- a/src/specfem3D/iterate_time.f90
+++ b/src/specfem3D/iterate_time.f90
@@ -90,8 +90,8 @@ subroutine iterate_time()
      call it_read_forward_arrays()
     endif
 
-    ! write the seismograms with time shift
-    if (nrec_local > 0) then
+    ! write the seismograms with time shift (GPU_MODE transfer included)
+    if (nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
       call write_seismograms()
     endif
 
@@ -111,12 +111,14 @@ subroutine iterate_time()
     endif
 
     ! first step of noise tomography, i.e., save a surface movie at every time step
-    if ( NOISE_TOMOGRAPHY == 1 ) then
-      call noise_save_surface_movie(displ, &
-                              ibool, &
-                              noise_surface_movie,it, &
-                              NSPEC_AB,NGLOB_AB, &
-                              num_free_surface_faces,free_surface_ispec,free_surface_ijk)
+    if ( NOISE_TOMOGRAPHY == 1) then
+      if( num_free_surface_faces > 0) then
+        call noise_save_surface_movie(displ,ibool, &
+                            noise_surface_movie,it, &
+                            NSPEC_AB,NGLOB_AB, &
+                            num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                            Mesh_pointer,GPU_MODE)
+      endif
     endif
 
 !
@@ -124,9 +126,39 @@ subroutine iterate_time()
 !
   enddo   ! end of main time loop
 
+  call it_print_elapsed_time()
+
+  ! Transfer fields from GPU card to host for further analysis
+  if(GPU_MODE) call it_transfer_from_GPU()
+
   end subroutine iterate_time
 
 
+!=====================================================================
+
+  subroutine it_print_elapsed_time()
+    use specfem_par
+    use specfem_par_elastic
+    use specfem_par_acoustic
+    implicit none
+
+    ! local parameters
+    double precision :: tCPU
+    integer :: ihours,iminutes,iseconds,int_tCPU
+
+    if(myrank == 0) then
+       ! elapsed time since beginning of the simulation
+       tCPU = wtime() - time_start
+       int_tCPU = int(tCPU)
+       ihours = int_tCPU / 3600
+       iminutes = (int_tCPU - 3600*ihours) / 60
+       iseconds = int_tCPU - 3600*ihours - 60*iminutes
+       write(IMAIN,*) 'Time-Loop Complete. Timing info:'
+       write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
+       write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+    endif
+  end subroutine it_print_elapsed_time
+
 !=====================================================================
 
   subroutine it_check_stability()
@@ -146,47 +178,104 @@ subroutine it_check_stability()
              ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
              ihours_total,iminutes_total,iseconds_total,int_t_total
 
+  ! maximum of the norm of the displacement
+  real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
+  real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
+  real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
+  real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic
+
+  ! norm of the backward displacement
+  real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
+
+  ! initializes
+  Usolidnorm_all = 0.0_CUSTOM_REAL
+  Usolidnormp_all = 0.0_CUSTOM_REAL
+  Usolidnorms_all = 0.0_CUSTOM_REAL
+  Usolidnormw_all = 0.0_CUSTOM_REAL
+
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 !chris: Rewrite to get norm for each material when coupled simulations
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 ! compute maximum of norm of displacement in each slice
-  if( ELASTIC_SIMULATION ) &
-    Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
-  if( ACOUSTIC_SIMULATION ) &
+  if( ELASTIC_SIMULATION ) then
+    if( GPU_MODE) then
+      ! way 2: just get maximum of field from GPU
+      call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
+    else
+      Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+    endif
+
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
+      call exit_MPI(myrank,'forward simulation became unstable and blew up')
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorm,Usolidnorm_all)
+  endif
+
+  if( ACOUSTIC_SIMULATION ) then
+    if(GPU_MODE) then
+      ! way 2: just get maximum of field from GPU
+      call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1)
+    else
       Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
+    endif
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnormp,Usolidnormp_all)
+  endif
+
   if( POROELASTIC_SIMULATION ) then
     Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + &
                              displs_poroelastic(3,:)**2))
     Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
                              displw_poroelastic(3,:)**2))
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorms,Usolidnorms_all)
+    call max_all_cr(Usolidnormw,Usolidnormw_all)
   endif
 
-! compute the maximum of the maxima for all the slices using an MPI reduction
-  call max_all_cr(Usolidnorm,Usolidnorm_all)
-  call max_all_cr(Usolidnormp,Usolidnormp_all)
-  call max_all_cr(Usolidnorms,Usolidnorms_all)
-  call max_all_cr(Usolidnormw,Usolidnormw_all)
 
-! adjoint simulations
+  ! adjoint simulations
   if( SIMULATION_TYPE == 3 ) then
     if( ELASTIC_SIMULATION ) then
-      b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+      ! way 2
+      if(GPU_MODE) then
+        call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
+      else
+        b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+      endif
     else
       if( ACOUSTIC_SIMULATION ) then
-        b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:)))
+        ! way 2
+        if(GPU_MODE) then
+          call get_norm_acoustic_from_device(b_Usolidnorm,Mesh_pointer,3)
+        else
+          b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:)))
+        endif
       endif
     endif
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) &
+      call exit_MPI(myrank,'backward simulation became unstable and blew up')
+
+    ! compute max of all slices
     call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
-   endif
+  endif
 
-! user output
+  ! user output
   if(myrank == 0) then
 
     write(IMAIN,*) 'Time step # ',it
     write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
 
-! elapsed time since beginning of the simulation
+    ! elapsed time since beginning of the simulation
     tCPU = wtime() - time_start
     int_tCPU = int(tCPU)
     ihours = int_tCPU / 3600
@@ -194,11 +283,13 @@ subroutine it_check_stability()
     iseconds = int_tCPU - 3600*ihours - 60*iminutes
     write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
     write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
-    write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
-    if( ELASTIC_SIMULATION ) &
+    write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it))
+    if( ELASTIC_SIMULATION ) then
       write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
-    if( ACOUSTIC_SIMULATION ) &
+    endif
+    if( ACOUSTIC_SIMULATION ) then
         write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+    endif
     if( POROELASTIC_SIMULATION ) then
         write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
         write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
@@ -207,7 +298,7 @@ subroutine it_check_stability()
     if (SIMULATION_TYPE == 3) write(IMAIN,*) &
            'Max norm U (backward) in all slices = ',b_Usolidnorm_all
 
-! compute estimated remaining simulation time
+    ! compute estimated remaining simulation time
     t_remain = (NSTEP - it) * (tCPU/dble(it))
     int_t_remain = int(t_remain)
     ihours_remain = int_t_remain / 3600
@@ -215,17 +306,17 @@ subroutine it_check_stability()
     iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain
     write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP
     write(IMAIN,*) 'Time steps remaining = ',NSTEP - it
-    write(IMAIN,*) 'Estimated remaining time in seconds = ',t_remain
+    write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain)
     write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
              ihours_remain,iminutes_remain,iseconds_remain
 
-! compute estimated total simulation time
+    ! compute estimated total simulation time
     t_total = t_remain + tCPU
     int_t_total = int(t_total)
     ihours_total = int_t_total / 3600
     iminutes_total = (int_t_total - 3600*ihours_total) / 60
     iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total
-    write(IMAIN,*) 'Estimated total run time in seconds = ',t_total
+    write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total)
     write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
              ihours_total,iminutes_total,iseconds_total
     write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
@@ -238,7 +329,7 @@ subroutine it_check_stability()
     endif
     write(IMAIN,*)
 
-! write time stamp file to give information about progression of simulation
+    ! write time stamp file to give information about progression of simulation
     write(outputname,"('/timestamp',i6.6)") it
     open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
     write(IOUT,*) 'Time step # ',it
@@ -246,10 +337,12 @@ subroutine it_check_stability()
     write(IOUT,*) 'Elapsed time in seconds = ',tCPU
     write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
     write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
-    if( ELASTIC_SIMULATION ) &
+    if( ELASTIC_SIMULATION ) then
       write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
-    if( ACOUSTIC_SIMULATION ) &
+    endif
+    if( ACOUSTIC_SIMULATION ) then
         write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+    endif
     if( POROELASTIC_SIMULATION ) then
         write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
         write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
@@ -259,28 +352,27 @@ subroutine it_check_stability()
            'Max norm U (backward) in all slices = ',b_Usolidnorm_all
     ! estimation
     write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP
-    write(IOUT,*) 'Time steps remaining = ',NSTEP - it    
+    write(IOUT,*) 'Time steps remaining = ',NSTEP - it
     write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain
     write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
-             ihours_remain,iminutes_remain,iseconds_remain    
+             ihours_remain,iminutes_remain,iseconds_remain
     write(IOUT,*) 'Estimated total run time in seconds = ',t_total
     write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
              ihours_total,iminutes_total,iseconds_total
-    write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'           
+    write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
     close(IOUT)
 
-
-! check stability of the code, exit if unstable
-! negative values can occur with some compilers when the unstable value is greater
-! than the greatest possible floating-point number of the machine
-    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0 &
-     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0 &
-     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0 &
-     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0) &
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0 &
+     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0 &
+     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0 &
+     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0) &
         call exit_MPI(myrank,'forward simulation became unstable and blew up')
     ! adjoint simulations
     if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
-      .or. b_Usolidnorm_all < 0)) &
+      .or. b_Usolidnorm_all < 0.0)) &
         call exit_MPI(myrank,'backward simulation became unstable and blew up')
 
   endif ! myrank
@@ -331,15 +423,27 @@ subroutine it_update_displacement_scheme()
 
 ! updates acoustic potentials
   if( ACOUSTIC_SIMULATION ) then
-    potential_acoustic(:) = potential_acoustic(:) &
+
+    if(.NOT. GPU_MODE) then
+      ! on CPU
+      potential_acoustic(:) = potential_acoustic(:) &
                             + deltat * potential_dot_acoustic(:) &
                             + deltatsqover2 * potential_dot_dot_acoustic(:)
-    potential_dot_acoustic(:) = potential_dot_acoustic(:) &
+      potential_dot_acoustic(:) = potential_dot_acoustic(:) &
                                 + deltatover2 * potential_dot_dot_acoustic(:)
-    potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+      potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+    else
+      ! on GPU
+      call it_update_displacement_ac_cuda(Mesh_pointer, NGLOB_AB, &
+                                        deltat, deltatsqover2, deltatover2, &
+                                        SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2)
+    endif
 
     ! time marching potentials
     if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then
+      if( GPU_MODE ) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
       call PML_acoustic_time_march(NSPEC_AB,NGLOB_AB,ibool,&
                         potential_acoustic,potential_dot_acoustic,&
                         deltat,deltatsqover2,deltatover2,&
@@ -352,15 +456,28 @@ subroutine it_update_displacement_scheme()
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh,NPROC,&
                         ispec_is_acoustic)
+
+      if( GPU_MODE ) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
     endif
-  endif
+
+  endif ! ACOUSTIC_SIMULATION
 
 ! updates elastic displacement and velocity
   if( ELASTIC_SIMULATION ) then
-    displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
-    veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
-    if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
-    accel(:,:) = 0._CUSTOM_REAL
+
+    if(.NOT. GPU_MODE) then
+      ! on CPU
+      displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
+      veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+      if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
+      accel(:,:) = 0._CUSTOM_REAL
+    else
+      ! on GPU
+      ! Includes SIM_TYPE 1 & 3 (for noise tomography)
+      call it_update_displacement_cuda(Mesh_pointer, size(displ), deltat, deltatsqover2,&
+             deltatover2, SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2)
+    endif
   endif
 
 ! updates poroelastic displacements and velocities
@@ -379,7 +496,7 @@ subroutine it_update_displacement_scheme()
   endif
 
 ! adjoint simulations
-  if (SIMULATION_TYPE == 3) then
+  if (SIMULATION_TYPE == 3 .and. .NOT. GPU_MODE) then
     ! acoustic backward fields
     if( ACOUSTIC_SIMULATION ) then
       b_potential_acoustic(:) = b_potential_acoustic(:) &
@@ -389,6 +506,7 @@ subroutine it_update_displacement_scheme()
                                   + b_deltatover2 * b_potential_dot_dot_acoustic(:)
       b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
     endif
+
     ! elastic backward fields
     if( ELASTIC_SIMULATION ) then
       b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:)
@@ -450,6 +568,11 @@ subroutine it_read_forward_arrays()
     read(27) b_potential_acoustic
     read(27) b_potential_dot_acoustic
     read(27) b_potential_dot_dot_acoustic
+
+    ! transfers fields onto GPU
+    if(GPU_MODE) &
+      call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
+                          b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
   endif
 
   ! elastic wavefields
@@ -458,18 +581,29 @@ subroutine it_read_forward_arrays()
     read(27) b_veloc
     read(27) b_accel
 
+    ! puts elastic wavefield to GPU
+    if(GPU_MODE) &
+      call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
+
     ! memory variables if attenuation
     if( ATTENUATION ) then
-       read(27) b_R_xx
-       read(27) b_R_yy
-       read(27) b_R_xy
-       read(27) b_R_xz
-       read(27) b_R_yz
-       read(27) b_epsilondev_xx
-       read(27) b_epsilondev_yy
-       read(27) b_epsilondev_xy
-       read(27) b_epsilondev_xz
-       read(27) b_epsilondev_yz
+      read(27) b_R_xx
+      read(27) b_R_yy
+      read(27) b_R_xy
+      read(27) b_R_xz
+      read(27) b_R_yz
+      read(27) b_epsilondev_xx
+      read(27) b_epsilondev_yy
+      read(27) b_epsilondev_xy
+      read(27) b_epsilondev_xz
+      read(27) b_epsilondev_yz
+
+      ! puts elastic attenuation arrays to GPU
+      if(GPU_MODE) &
+          call transfer_b_fields_att_to_device(Mesh_pointer, &
+                                            b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+                                            b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+                                            size(b_epsilondev_xx))
     endif
 
   endif
@@ -514,9 +648,17 @@ subroutine it_store_attenuation_arrays()
       open(unit=27,file=trim(prname_Q)//trim(outputname),status='old',&
             action='read',form='unformatted')
       if( ELASTIC_SIMULATION ) then
+        ! reads arrays from disk files
         read(27) b_displ
         read(27) b_veloc
         read(27) b_accel
+
+        ! puts elastic fields onto GPU
+        if(GPU_MODE) then
+          ! wavefields
+          call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+        endif
+
         read(27) b_R_xx
         read(27) b_R_yy
         read(27) b_R_xy
@@ -527,11 +669,28 @@ subroutine it_store_attenuation_arrays()
         read(27) b_epsilondev_xy
         read(27) b_epsilondev_xz
         read(27) b_epsilondev_yz
+
+        ! puts elastic fields onto GPU
+        if(GPU_MODE) then
+          ! attenuation arrays
+          call transfer_b_fields_att_to_device(Mesh_pointer, &
+                                            b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+                                            b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+                                            size(b_epsilondev_xx))
+        endif
       endif
+
       if( ACOUSTIC_SIMULATION ) then
+        ! reads arrays from disk files
         read(27) b_potential_acoustic
         read(27) b_potential_dot_acoustic
         read(27) b_potential_dot_dot_acoustic
+
+        ! puts acoustic fields onto GPU
+        if(GPU_MODE) &
+          call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
+                              b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+
       endif
       close(27)
     else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. mod(it,NSTEP_Q_SAVE) == 0) then
@@ -540,9 +699,24 @@ subroutine it_store_attenuation_arrays()
       open(unit=27,file=trim(prname_Q)//trim(outputname),status='unknown',&
            action='write',form='unformatted')
       if( ELASTIC_SIMULATION ) then
+        ! gets elastic fields from GPU onto CPU
+        if(GPU_MODE) then
+          call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+        endif
+
+        ! writes to disk file
         write(27) displ
         write(27) veloc
         write(27) accel
+
+        if(GPU_MODE) then
+          ! attenuation arrays
+          call transfer_fields_att_from_device(Mesh_pointer, &
+                                            R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+                                            size(epsilondev_xx))
+        endif
+
         write(27) R_xx
         write(27) R_yy
         write(27) R_xy
@@ -555,6 +729,12 @@ subroutine it_store_attenuation_arrays()
         write(27) epsilondev_yz
       endif
       if( ACOUSTIC_SIMULATION ) then
+       ! gets acoustic fields from GPU onto CPU
+        if(GPU_MODE) &
+          call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
+        ! writes to disk file
         write(27) potential_acoustic
         write(27) potential_dot_acoustic
         write(27) potential_dot_dot_acoustic
@@ -564,3 +744,80 @@ subroutine it_store_attenuation_arrays()
   endif ! it
 
   end subroutine it_store_attenuation_arrays
+
+
+!=====================================================================
+
+  subroutine it_transfer_from_GPU()
+
+! transfers fields on GPU back onto CPU
+
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_acoustic
+
+  implicit none
+
+  ! to store forward wave fields
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+
+    ! acoustic potentials
+    if( ACOUSTIC_SIMULATION ) &
+      call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                            potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+
+    ! elastic wavefield
+    if( ELASTIC_SIMULATION ) then
+      call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+
+      if (ATTENUATION) &
+        call transfer_fields_att_from_device(Mesh_pointer, &
+                                            R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+                                            size(epsilondev_xx))
+
+    endif
+  else if (SIMULATION_TYPE == 3) then
+
+    ! to store kernels
+    ! acoustic domains
+    if( ACOUSTIC_SIMULATION ) then
+      ! only in case needed...
+      !call transfer_b_fields_ac_from_device(NGLOB_AB,b_potential_acoustic, &
+      !                      b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+
+      ! acoustic kernels
+      call transfer_kernels_ac_to_host(Mesh_pointer,rho_ac_kl,kappa_ac_kl,NSPEC_AB)
+    endif
+
+    ! elastic domains
+    if( ELASTIC_SIMULATION ) then
+      ! only in case needed...
+      !call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+
+      ! elastic kernels
+      call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+    endif
+
+    ! specific noise strength kernel
+    if( NOISE_TOMOGRAPHY == 3 ) then
+      call transfer_kernels_noise_to_host(Mesh_pointer,Sigma_kl,NSPEC_AB)
+    endif
+
+    ! approximative hessian for preconditioning kernels
+    if ( APPROXIMATE_HESS_KL ) then
+      if( ELASTIC_SIMULATION ) call transfer_kernels_hess_el_tohost(Mesh_pointer,hess_kl,NSPEC_AB)
+      if( ACOUSTIC_SIMULATION ) call transfer_kernels_hess_ac_tohost(Mesh_pointer,hess_ac_kl,NSPEC_AB)
+    endif
+
+  endif
+
+  ! frees allocated memory on GPU
+  call prepare_cleanup_device(Mesh_pointer, &
+                              SIMULATION_TYPE,SAVE_FORWARD, &
+                              ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
+                              ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
+                              ATTENUATION,ANISOTROPY,OCEANS, &
+                              APPROXIMATE_HESS_KL)
+
+  end subroutine it_transfer_from_GPU
diff --git a/src/specfem3D/locate_receivers.f90 b/src/specfem3D/locate_receivers.f90
index 079afc269..0003a9cc2 100644
--- a/src/specfem3D/locate_receivers.f90
+++ b/src/specfem3D/locate_receivers.f90
@@ -86,17 +86,17 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
   integer ios
 
   double precision,dimension(1) :: altitude_rec,distmin_ele
-  !double precision,dimension(4) :: elevation_node,dist_node  
+  !double precision,dimension(4) :: elevation_node,dist_node
   double precision,dimension(NPROC) :: distmin_ele_all,elevation_all
 
   real(kind=CUSTOM_REAL) :: xloc,yloc,loc_ele,loc_distmin
-  
+
   double precision, allocatable, dimension(:) :: x_target,y_target,z_target
-  
+
   double precision, allocatable, dimension(:) :: horiz_dist
   double precision, allocatable, dimension(:) :: x_found,y_found,z_found
   double precision dist
-  
+
   double precision xi,eta,gamma,dx,dy,dz,dxi,deta,dgamma
   double precision x,y,z
   double precision xix,xiy,xiz
@@ -220,11 +220,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
               status='unknown',action='write',iostat=ios)
         if( ios /= 0 ) &
           call exit_mpi(myrank,'error opening file '//trim(OUTPUT_FILES)//'/output_list_stations.txt')
-          
+
         do irec=1,nrec
           write(IOUT_SU,*) x_found(irec),y_found(irec),z_found(irec)
         enddo
-        
+
         close(IOUT_SU)
         deallocate(x_found,y_found,z_found)
       endif
@@ -286,7 +286,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
 
     read(1,*,iostat=ios) station_name(irec),network_name(irec), &
                           stlat(irec),stlon(irec),stele(irec),stbur(irec)
-                          
+
     if (ios /= 0) call exit_mpi(myrank, 'Error reading station file '//trim(rec_filename))
 
     ! convert station location to UTM
@@ -315,7 +315,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
                                 num_free_surface_faces,free_surface_ispec,free_surface_ijk)
     altitude_rec(1) = loc_ele
     distmin_ele(1) = loc_distmin
-    
+
 !    !   set distance to huge initial value
 !    distmin = HUGEVAL
 !    if(num_free_surface_faces > 0) then
@@ -385,11 +385,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
 !        end do
 !      end do
 !    end if
-    
+
     !  MPI communications to determine the best slice
     call gather_all_dp(distmin_ele,1,distmin_ele_all,1,NPROC)
     call gather_all_dp(altitude_rec,1,elevation_all,1,NPROC)
-    
+
     if(myrank == 0) then
       iproc = minloc(distmin_ele_all)
       altitude_rec(1) = elevation_all(iproc(1))
@@ -499,6 +499,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
       ! end of loop on all the spectral elements in current slice
       enddo
     else
+      ! SeismicUnix format
       ispec_selected_rec(irec) = 0
       ix_initial_guess(irec) = 0
       iy_initial_guess(irec) = 0
@@ -519,7 +520,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
               (y_target(irec)>=ymin_ELE .and. y_target(irec)<=ymax_ELE) .and. &
               (z_target(irec)>=zmin_ELE .and. z_target(irec)<=zmax_ELE) ) then
             ! we find the element (ispec) which "may" contain the receiver (irec)
-            ! so we only need to compute distances 
+            ! so we only need to compute distances
             !(which is expensive because of "dsqrt") within those elements
             ispec_selected_rec(irec) = ispec
             do k = kmin_temp,kmax_temp
@@ -994,11 +995,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, &
               status='unknown',action='write',iostat=ios)
     if( ios /= 0 ) &
       call exit_mpi(myrank,'error opening file '//trim(OUTPUT_FILES)//'/output_list_stations.txt')
-      
+
     do irec=1,nrec
       write(IOUT_SU,*) x_found(irec),y_found(irec),z_found(irec)
     enddo
-    
+
     close(IOUT_SU)
 
     ! stores station infos for later runs
diff --git a/src/specfem3D/locate_source.f90 b/src/specfem3D/locate_source.f90
index db33f5aed..67d7c2649 100644
--- a/src/specfem3D/locate_source.f90
+++ b/src/specfem3D/locate_source.f90
@@ -68,7 +68,7 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z
   integer iprocloop
 
   integer i,j,k,ispec,iglob,isource
-  integer imin,imax,jmin,jmax,kmin,kmax 
+  integer imin,imax,jmin,jmax,kmin,kmax
 !  integer  igll,jgll,kgll,inode,iface,iglob_selected,
 !  integer iselected,jselected,iface_selected,iadjust,jadjust
   integer iproc(1)
@@ -103,12 +103,12 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z
   double precision x_target_source,y_target_source,z_target_source
 
   double precision,dimension(1) :: altitude_source,distmin_ele
-  
+
   double precision,dimension(NPROC) :: distmin_ele_all,elevation_all
 
 !  double precision,dimension(4) :: elevation_node,dist_node
   real(kind=CUSTOM_REAL) :: xloc,yloc,loc_ele,loc_distmin
-  
+
 
   integer islice_selected_source(NSOURCES)
 
@@ -220,9 +220,9 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z
                               num_free_surface_faces,free_surface_ispec,free_surface_ijk)
     altitude_source(1) = loc_ele
     distmin_ele(1) = loc_distmin
-    
 
-    
+
+
 !    ! set distance to huge initial value
 !    distmin = HUGEVAL
 !    if(num_free_surface_faces > 0) then
@@ -295,11 +295,11 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z
 !      end do
 !    end if
 !    distmin_ele(1)= distmin
-    
+
     !  MPI communications to determine the best slice
     call gather_all_dp(distmin_ele,1,distmin_ele_all,1,NPROC)
     call gather_all_dp(altitude_source,1,elevation_all,1,NPROC)
-    
+
     if(myrank == 0) then
       iproc = minloc(distmin_ele_all)
       altitude_source(1) = elevation_all(iproc(1))
diff --git a/src/specfem3D/make_gravity.f90 b/src/specfem3D/make_gravity.f90
new file mode 100644
index 000000000..7dce52175
--- /dev/null
+++ b/src/specfem3D/make_gravity.f90
@@ -0,0 +1,680 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+!
+! United States and French Government Sponsorship Acknowledged.
+
+
+  subroutine make_gravity(nspl,rspl,gspl,gspl2, &
+                          ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, &
+                          R771,RTOPDDOUBLEPRIME,RCMB,RICB)
+
+! creates a spline for the gravity profile in PREM
+! radius and density are non-dimensional
+
+  implicit none
+
+  include "constants.h"
+
+  integer:: nspl
+  double precision:: rspl(NR),gspl(NR),gspl2(NR)
+  double precision ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, &
+                   R771,RTOPDDOUBLEPRIME,RCMB,RICB
+
+  ! local parameters
+  double precision r_icb,r_cmb,r_topddoubleprime,r_771,r_670,r_600
+  double precision r_400,r_220,r_80,r_moho,r_middle_crust,r_ocean,r_0
+  double precision r(NR),rho(NR),g(NR),i_rho
+  double precision s1(NR),s2(NR),s3(NR)
+  double precision yp1,ypn
+  integer i
+
+! PREM
+  ! values in (m)
+  ROCEAN = 6368000.d0
+  RMIDDLE_CRUST = 6356000.d0
+  RMOHO = 6346600.d0 ! PREM moho depth at 24.4 km
+  R80  = 6291000.d0
+  R220 = 6151000.d0
+  R400 = 5971000.d0
+  R600 = 5771000.d0
+  R670 = 5701000.d0
+  R771 = 5600000.d0
+  RTOPDDOUBLEPRIME = 3630000.d0
+  RCMB = 3480000.d0
+  RICB = 1221000.d0
+
+! non-dimensionalize
+  r_icb = RICB/R_EARTH_GRAVITY
+  r_cmb = RCMB/R_EARTH_GRAVITY
+  r_topddoubleprime = RTOPDDOUBLEPRIME/R_EARTH_GRAVITY
+  r_771 = R771/R_EARTH_GRAVITY
+  r_670 = R670/R_EARTH_GRAVITY
+  r_600 = R600/R_EARTH_GRAVITY
+  r_400 = R400/R_EARTH_GRAVITY
+  r_220 = R220/R_EARTH_GRAVITY
+  r_80 = R80/R_EARTH_GRAVITY
+  r_moho = RMOHO/R_EARTH_GRAVITY
+  r_middle_crust = RMIDDLE_CRUST/R_EARTH_GRAVITY
+  r_ocean = ROCEAN_GRAVITY/R_EARTH_GRAVITY
+  r_0 = 1.d0
+
+  do i=1,163
+    r(i) = r_icb*dble(i-1)/dble(162)
+  enddo
+  do i=164,323
+    r(i) = r_icb+(r_cmb-r_icb)*dble(i-164)/dble(159)
+  enddo
+  do i=324,336
+    r(i) = r_cmb+(r_topddoubleprime-r_cmb)*dble(i-324)/dble(12)
+  enddo
+  do i=337,517
+    r(i) = r_topddoubleprime+(r_771-r_topddoubleprime)*dble(i-337)/dble(180)
+  enddo
+  do i=518,530
+    r(i) = r_771+(r_670-r_771)*dble(i-518)/dble(12)
+  enddo
+  do i=531,540
+    r(i) = r_670+(r_600-r_670)*dble(i-531)/dble(9)
+  enddo
+  do i=541,565
+    r(i) = r_600+(r_400-r_600)*dble(i-541)/dble(24)
+  enddo
+  do i=566,590
+    r(i) = r_400+(r_220-r_400)*dble(i-566)/dble(24)
+  enddo
+  do i=591,609
+    r(i) = r_220+(r_80-r_220)*dble(i-591)/dble(18)
+  enddo
+  do i=610,619
+    r(i) = r_80+(r_moho-r_80)*dble(i-610)/dble(9)
+  enddo
+  do i=620,626
+    r(i) = r_moho+(r_middle_crust-r_moho)*dble(i-620)/dble(6)
+  enddo
+  do i=627,633
+    r(i) = r_middle_crust+(r_ocean-r_middle_crust)*dble(i-627)/dble(6)
+  enddo
+  do i=634,NR
+    r(i) = r_ocean+(r_0-r_ocean)*dble(i-634)/dble(6)
+  enddo
+
+! use PREM to get the density profile for ellipticity (fine for other 1D reference models)
+  do i=1,NR
+    call prem_density(r(i),rho(i), &
+                    RICB,RCMB,RTOPDDOUBLEPRIME, &
+                    R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN_GRAVITY)
+  enddo
+
+  g(1)=0.0d0
+  do i=2,NR
+    call intgrl(i_rho,r,1,i,rho,s1,s2,s3)
+    g(i)=4.0d0*i_rho/(r(i)*r(i))
+  enddo
+
+!
+! get ready to spline g
+!
+  nspl=1
+  rspl(1)=r(1)
+  gspl(1)=g(1)
+  do i=2,NR
+    if(r(i)/=r(i-1)) then
+      nspl=nspl+1
+      rspl(nspl)=r(i)
+      gspl(nspl)=g(i)
+    endif
+  enddo
+  yp1=(4.0d0/3.0d0)*rho(1)
+  ypn=4.0d0*rho(NR)-2.0d0*g(NR)/r(NR)
+  call spline_construction(rspl,gspl,nspl,yp1,ypn,gspl2)
+
+  end subroutine make_gravity
+
+
+!--------------------------------------------------------------------------------------------------
+!
+! PREM [Dziewonski and Anderson, 1981].
+!
+! A. M. Dziewonski and D. L. Anderson.
+! Preliminary reference Earth model.
+! Phys. Earth Planet. Inter., 25:297–356, 1981.
+!
+! Isotropic (iso) and transversely isotropic (aniso) version of the
+! spherically symmetric Preliminary Reference Earth Model
+!
+!--------------------------------------------------------------------------------------------------
+
+
+  subroutine model_prem_iso(x,rho,drhodr,vp,vs,Qkappa,Qmu,&
+                          RICB,RCMB,RTOPDDOUBLEPRIME, &
+                          R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN)
+
+  implicit none
+
+  include "constants.h"
+
+! given a normalized radius x, gives the non-dimensionalized density rho,
+! speeds vp and vs, and the quality factors Qkappa and Qmu
+
+  double precision x,rho,drhodr,vp,vs,Qkappa,Qmu,RICB,RCMB,RTOPDDOUBLEPRIME, &
+      R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN
+
+  double precision r,scaleval
+
+  logical,parameter :: ONE_CRUST = .false.
+  logical,parameter :: CRUSTAL = .false.
+
+! compute real physical radius in meters
+  r = x * R_EARTH
+
+!
+!--- inner core
+!
+  if(r >= 0.d0 .and. r <= RICB) then
+    drhodr=-2.0d0*8.8381d0*x
+    rho=13.0885d0-8.8381d0*x*x
+    vp=11.2622d0-6.3640d0*x*x
+    vs=3.6678d0-4.4475d0*x*x
+    Qmu=84.6d0
+    Qkappa=1327.7d0
+!
+!--- outer core
+!
+  else if(r > RICB .and. r <= RCMB) then
+    drhodr=-1.2638d0-2.0d0*3.6426d0*x-3.0d0*5.5281d0*x*x
+    rho=12.5815d0-1.2638d0*x-3.6426d0*x*x-5.5281d0*x*x*x
+    vp=11.0487d0-4.0362d0*x+4.8023d0*x*x-13.5732d0*x*x*x
+    vs=0.0d0
+    Qmu=0.0d0
+    Qkappa=57827.0d0
+!
+!--- D" at the base of the mantle
+!
+  else if(r > RCMB .and. r <= RTOPDDOUBLEPRIME) then
+    drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+    vp=15.3891d0-5.3181d0*x+5.5242d0*x*x-2.5514d0*x*x*x
+    vs=6.9254d0+1.4672d0*x-2.0834d0*x*x+0.9783d0*x*x*x
+    Qmu=312.0d0
+    Qkappa=57827.0d0
+!
+!--- mantle: from top of D" to d670
+!
+  else if(r > RTOPDDOUBLEPRIME .and. r <= R771) then
+    drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+    vp=24.9520d0-40.4673d0*x+51.4832d0*x*x-26.6419d0*x*x*x
+    vs=11.1671d0-13.7818d0*x+17.4575d0*x*x-9.2777d0*x*x*x
+    Qmu=312.0d0
+    Qkappa=57827.0d0
+  else if(r > R771 .and. r <= R670) then
+    drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+    vp=29.2766d0-23.6027d0*x+5.5242d0*x*x-2.5514d0*x*x*x
+    vs=22.3459d0-17.2473d0*x-2.0834d0*x*x+0.9783d0*x*x*x
+    Qmu=312.0d0
+    Qkappa=57827.0d0
+!
+!--- mantle: above d670
+!
+  else if(r > R670 .and. r <= R600) then
+    drhodr=-1.4836d0
+    rho=5.3197d0-1.4836d0*x
+    vp=19.0957d0-9.8672d0*x
+    vs=9.9839d0-4.9324d0*x
+    Qmu=143.0d0
+    Qkappa=57827.0d0
+  else if(r > R600 .and. r <= R400) then
+    drhodr=-8.0298d0
+    rho=11.2494d0-8.0298d0*x
+    vp=39.7027d0-32.6166d0*x
+    vs=22.3512d0-18.5856d0*x
+    Qmu=143.0d0
+    Qkappa=57827.0d0
+  else if(r > R400 .and. r <= R220) then
+    drhodr=-3.8045d0
+    rho=7.1089d0-3.8045d0*x
+    vp=20.3926d0-12.2569d0*x
+    vs=8.9496d0-4.4597d0*x
+    Qmu=143.0d0
+    Qkappa=57827.0d0
+  else if(r > R220 .and. r <= R80) then
+    drhodr=0.6924d0
+    rho=2.6910d0+0.6924d0*x
+    vp=4.1875d0+3.9382d0*x
+    vs=2.1519d0+2.3481d0*x
+    Qmu=80.0d0
+    Qkappa=57827.0d0
+  else
+    if(CRUSTAL) then
+    ! fill with PREM mantle and later add CRUST2.0
+      if(r > R80) then
+        ! density/velocity from mantle just below moho
+        drhodr=0.6924d0
+        rho=2.6910d0+0.6924d0*x
+        vp=4.1875d0+3.9382d0*x
+        vs=2.1519d0+2.3481d0*x
+        ! shear attenuation for R80 to surface
+        Qmu=600.0d0
+        Qkappa=57827.0d0
+      endif
+    else
+    ! use PREM crust
+      if(r > R80 .and. r <= RMOHO) then
+        drhodr=0.6924d0
+        rho=2.6910d0+0.6924d0*x
+        vp=4.1875d0+3.9382d0*x
+        vs=2.1519d0+2.3481d0*x
+        Qmu=600.0d0
+        Qkappa=57827.0d0
+
+      else if(r > RMOHO .and. r <= RMIDDLE_CRUST) then
+        drhodr=0.0d0
+        rho=2.9d0
+        vp=6.8d0
+        vs=3.9d0
+        Qmu=600.0d0
+        Qkappa=57827.0d0
+
+    ! same properties everywhere in PREM crust if we decide to define only one layer in the crust
+        if(ONE_CRUST) then
+          drhodr=0.0d0
+          rho=2.6d0
+          vp=5.8d0
+          vs=3.2d0
+          Qmu=600.0d0
+          Qkappa=57827.0d0
+        endif
+
+      else if(r > RMIDDLE_CRUST .and. r <= ROCEAN) then
+        drhodr=0.0d0
+        rho=2.6d0
+        vp=5.8d0
+        vs=3.2d0
+        Qmu=600.0d0
+        Qkappa=57827.0d0
+    ! for density profile for gravity, we do not check that r <= R_EARTH
+      else if(r > ROCEAN) then
+        drhodr=0.0d0
+        rho=2.6d0
+        vp=5.8d0
+        vs=3.2d0
+        Qmu=600.0d0
+        Qkappa=57827.0d0
+
+      endif
+    endif
+  endif
+
+! non-dimensionalize
+! time scaling (s^{-1}) is done with scaleval
+  scaleval=dsqrt(PI*GRAV*RHOAV)
+  drhodr=drhodr*1000.0d0/RHOAV
+  rho=rho*1000.0d0/RHOAV
+  vp=vp*1000.0d0/(R_EARTH*scaleval)
+  vs=vs*1000.0d0/(R_EARTH*scaleval)
+
+  end subroutine model_prem_iso
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prem_density(x,rho, &
+                         RICB,RCMB,RTOPDDOUBLEPRIME, &
+                         R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN)
+
+  implicit none
+
+  include "constants.h"
+
+  double precision x,rho,RICB,RCMB,RTOPDDOUBLEPRIME, &
+      R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN
+  double precision r
+
+  logical,parameter :: ONE_CRUST = .false.
+
+  ! compute real physical radius in meters
+  r = x * R_EARTH
+
+  ! calculates density according to radius
+  if(r <= RICB) then
+    rho=13.0885d0-8.8381d0*x*x
+  else if(r > RICB .and. r <= RCMB) then
+    rho=12.5815d0-1.2638d0*x-3.6426d0*x*x-5.5281d0*x*x*x
+  else if(r > RCMB .and. r <= RTOPDDOUBLEPRIME) then
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+  else if(r > RTOPDDOUBLEPRIME .and. r <= R771) then
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+  else if(r > R771 .and. r <= R670) then
+    rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x
+  else if(r > R670 .and. r <= R600) then
+    rho=5.3197d0-1.4836d0*x
+  else if(r > R600 .and. r <= R400) then
+    rho=11.2494d0-8.0298d0*x
+  else if(r > R400 .and. r <= R220) then
+    rho=7.1089d0-3.8045d0*x
+  else if(r > R220 .and. r <= R80) then
+    rho=2.6910d0+0.6924d0*x
+  else
+    if(r > R80 .and. r <= RMOHO) then
+      rho=2.6910d0+0.6924d0*x
+    else if(r > RMOHO .and. r <= RMIDDLE_CRUST) then
+      if(ONE_CRUST) then
+        rho=2.6d0
+      else
+        rho=2.9d0
+      endif
+    else if(r > RMIDDLE_CRUST .and. r <= ROCEAN) then
+      rho=2.6d0
+    else if(r > ROCEAN) then
+      rho=2.6d0
+    endif
+  endif
+
+  rho=rho*1000.0d0/RHOAV
+
+  end subroutine prem_density
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine intgrl(sum,r,nir,ner,f,s1,s2,s3)
+
+! Computes the integral of f[i]*r[i]*r[i] from i=nir to i=ner for
+! radii values as in model PREM_an640
+
+  implicit none
+
+! Argument variables
+  integer ner,nir
+  double precision f(640),r(640),s1(640),s2(640)
+  double precision s3(640),sum
+
+! Local variables
+  double precision, parameter :: third = 1.0d0/3.0d0
+  double precision, parameter :: fifth = 1.0d0/5.0d0
+  double precision, parameter :: sixth = 1.0d0/6.0d0
+
+  double precision rji,yprime(640)
+  double precision s1l,s2l,s3l
+
+  integer i,j,n,kdis(28)
+  integer ndis,nir1
+
+  data kdis/163,323,336,517,530,540,565,590,609,619,626,633,16*0/
+
+  ndis = 12
+  n = 640
+
+  call deriv(f,yprime,n,r,ndis,kdis,s1,s2,s3)
+  nir1 = nir + 1
+  sum = 0.0d0
+  do i=nir1,ner
+    j = i-1
+    rji = r(i) - r(j)
+    s1l = s1(j)
+    s2l = s2(j)
+    s3l = s3(j)
+    sum = sum + r(j)*r(j)*rji*(f(j) &
+              + rji*(0.5d0*s1l + rji*(third*s2l + rji*0.25d0*s3l))) &
+              + 2.0d0*r(j)*rji*rji*(0.5d0*f(j) + rji*(third*s1l + rji*(0.25d0*s2l + rji*fifth*s3l))) &
+              + rji*rji*rji*(third*f(j) + rji*(0.25d0*s1l + rji*(fifth*s2l + rji*sixth*s3l)))
+  enddo
+
+  end subroutine intgrl
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine deriv(y,yprime,n,r,ndis,kdis,s1,s2,s3)
+
+  implicit none
+
+! Argument variables
+  integer kdis(28),n,ndis
+  double precision r(n),s1(n),s2(n),s3(n)
+  double precision y(n),yprime(n)
+
+! Local variables
+  integer i,j,j1,j2
+  integer k,nd,ndp
+  double precision a0,b0,b1
+  double precision f(3,1000),h,h2,h2a
+  double precision h2b,h3a,ha,s13
+  double precision s21,s32,yy(3)
+
+  yy(1) = 0.d0
+  yy(2) = 0.d0
+  yy(3) = 0.d0
+
+  ndp=ndis+1
+  do 3 nd=1,ndp
+  if(nd == 1) goto 4
+  if(nd == ndp) goto 5
+  j1=kdis(nd-1)+1
+  j2=kdis(nd)-2
+  goto 6
+    4 j1=1
+  j2=kdis(1)-2
+  goto 6
+    5 j1=kdis(ndis)+1
+  j2=n-2
+    6 if((j2+1-j1)>0) goto 11
+  j2=j2+2
+  yy(1)=(y(j2)-y(j1))/(r(j2)-r(j1))
+  s1(j1)=yy(1)
+  s1(j2)=yy(1)
+  s2(j1)=yy(2)
+  s2(j2)=yy(2)
+  s3(j1)=yy(3)
+  s3(j2)=yy(3)
+  goto 3
+   11 a0=0.0d0
+  if(j1 == 1) goto 7
+  h=r(j1+1)-r(j1)
+  h2=r(j1+2)-r(j1)
+  yy(1)=h*h2*(h2-h)
+  h=h*h
+  h2=h2*h2
+  b0=(y(j1)*(h-h2)+y(j1+1)*h2-y(j1+2)*h)/yy(1)
+  goto 8
+ 7 b0=0.0d0
+ 8 b1=b0
+
+  if(j2 > 1000) stop 'error in subroutine deriv for j2'
+
+  do i=j1,j2
+    h=r(i+1)-r(i)
+    yy(1)=y(i+1)-y(i)
+    h2=h*h
+    ha=h-a0
+    h2a=h-2.0d0*a0
+    h3a=2.0d0*h-3.0d0*a0
+    h2b=h2*b0
+    s1(i)=h2/ha
+    s2(i)=-ha/(h2a*h2)
+    s3(i)=-h*h2a/h3a
+    f(1,i)=(yy(1)-h*b0)/(h*ha)
+    f(2,i)=(h2b-yy(1)*(2.0d0*h-a0))/(h*h2*h2a)
+    f(3,i)=-(h2b-3.0d0*yy(1)*ha)/(h*h3a)
+    a0=s3(i)
+    b0=f(3,i)
+  enddo
+
+  i=j2+1
+  h=r(i+1)-r(i)
+  yy(1)=y(i+1)-y(i)
+  h2=h*h
+  ha=h-a0
+  h2a=h*ha
+  h2b=h2*b0-yy(1)*(2.d0*h-a0)
+  s1(i)=h2/ha
+  f(1,i)=(yy(1)-h*b0)/h2a
+  ha=r(j2)-r(i+1)
+  yy(1)=-h*ha*(ha+h)
+  ha=ha*ha
+  yy(1)=(y(i+1)*(h2-ha)+y(i)*ha-y(j2)*h2)/yy(1)
+  s3(i)=(yy(1)*h2a+h2b)/(h*h2*(h-2.0d0*a0))
+  s13=s1(i)*s3(i)
+  s2(i)=f(1,i)-s13
+
+  do j=j1,j2
+    k=i-1
+    s32=s3(k)*s2(i)
+    s1(i)=f(3,k)-s32
+    s21=s2(k)*s1(i)
+    s3(k)=f(2,k)-s21
+    s13=s1(k)*s3(k)
+    s2(k)=f(1,k)-s13
+    i=k
+  enddo
+
+  s1(i)=b1
+  j2=j2+2
+  s1(j2)=yy(1)
+  s2(j2)=yy(2)
+  s3(j2)=yy(3)
+ 3 continue
+
+  do i=1,n
+    yprime(i)=s1(i)
+  enddo
+
+  end subroutine deriv
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! compute spline coefficients
+
+  subroutine spline_construction(xpoint,ypoint,npoint,tangent_first_point,tangent_last_point,spline_coefficients)
+
+  implicit none
+
+! tangent to the spline imposed at the first and last points
+  double precision, intent(in) :: tangent_first_point,tangent_last_point
+
+! number of input points and coordinates of the input points
+  integer, intent(in) :: npoint
+  double precision, dimension(npoint), intent(in) :: xpoint,ypoint
+
+! spline coefficients output by the routine
+  double precision, dimension(npoint), intent(out) :: spline_coefficients
+
+  integer :: i
+
+  double precision, dimension(:), allocatable :: temporary_array
+
+  allocate(temporary_array(npoint))
+
+  spline_coefficients(1) = - 1.d0 / 2.d0
+
+  temporary_array(1) = (3.d0/(xpoint(2)-xpoint(1)))*((ypoint(2)-ypoint(1))/(xpoint(2)-xpoint(1))-tangent_first_point)
+
+  do i = 2,npoint-1
+
+    spline_coefficients(i) = ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))-1.d0) &
+       / ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*spline_coefficients(i-1)+2.d0)
+
+    temporary_array(i) = (6.d0*((ypoint(i+1)-ypoint(i))/(xpoint(i+1)-xpoint(i)) &
+       - (ypoint(i)-ypoint(i-1))/(xpoint(i)-xpoint(i-1)))/(xpoint(i+1)-xpoint(i-1)) &
+       - (xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*temporary_array(i-1)) &
+       / ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*spline_coefficients(i-1)+2.d0)
+
+  enddo
+
+  spline_coefficients(npoint) = ((3.d0/(xpoint(npoint)-xpoint(npoint-1))) &
+      * (tangent_last_point-(ypoint(npoint)-ypoint(npoint-1))/(xpoint(npoint)-xpoint(npoint-1))) &
+      - 1.d0/2.d0*temporary_array(npoint-1))/(1.d0/2.d0*spline_coefficients(npoint-1)+1.d0)
+
+  do i = npoint-1,1,-1
+    spline_coefficients(i) = spline_coefficients(i)*spline_coefficients(i+1) + temporary_array(i)
+  enddo
+
+  deallocate(temporary_array)
+
+  end subroutine spline_construction
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! evaluate a spline
+
+  subroutine spline_evaluation(xpoint,ypoint,spline_coefficients,npoint,x_evaluate_spline,y_spline_obtained)
+
+  implicit none
+
+! number of input points and coordinates of the input points
+  integer, intent(in) :: npoint
+  double precision, dimension(npoint), intent(in) :: xpoint,ypoint
+
+! spline coefficients to use
+  double precision, dimension(npoint), intent(in) :: spline_coefficients
+
+! abscissa at which we need to evaluate the value of the spline
+  double precision, intent(in):: x_evaluate_spline
+
+! ordinate evaluated by the routine for the spline at this abscissa
+  double precision, intent(out):: y_spline_obtained
+
+  integer :: index_loop,index_lower,index_higher
+
+  double precision :: coef1,coef2
+
+! initialize to the whole interval
+  index_lower = 1
+  index_higher = npoint
+
+! determine the right interval to use, by dichotomy
+  do while (index_higher - index_lower > 1)
+! compute the middle of the interval
+    index_loop = (index_higher + index_lower) / 2
+    if(xpoint(index_loop) > x_evaluate_spline) then
+      index_higher = index_loop
+    else
+      index_lower = index_loop
+    endif
+  enddo
+
+! test that the interval obtained does not have a size of zero
+! (this could happen for instance in the case of duplicates in the input list of points)
+  if(xpoint(index_higher) == xpoint(index_lower)) stop 'incorrect interval found in spline evaluation'
+
+  coef1 = (xpoint(index_higher) - x_evaluate_spline) / (xpoint(index_higher) - xpoint(index_lower))
+  coef2 = (x_evaluate_spline - xpoint(index_lower)) / (xpoint(index_higher) - xpoint(index_lower))
+
+  y_spline_obtained = coef1*ypoint(index_lower) + coef2*ypoint(index_higher) + &
+        ((coef1**3 - coef1)*spline_coefficients(index_lower) + &
+         (coef2**3 - coef2)*spline_coefficients(index_higher))*((xpoint(index_higher) - xpoint(index_lower))**2)/6.d0
+
+  end subroutine spline_evaluation
+
+
diff --git a/src/specfem3D/noise_tomography.f90 b/src/specfem3D/noise_tomography.f90
index 3ea63bbc2..9f50482e6 100644
--- a/src/specfem3D/noise_tomography.f90
+++ b/src/specfem3D/noise_tomography.f90
@@ -43,14 +43,54 @@
 ! =============================================================================================================
 ! =============================================================================================================
 
+module user_noise_distribution
+
+!daniel: TODO -- setting USE_PIERO_DISTRIBUTION = .true. will produce errors
+!            when using with the default example in "example/noise_tomography/"
+!            i left it here so that Max can run his example without changing this every time...
+  logical,parameter :: USE_PIERO_DISTRIBUTION = .true.
+
+contains
+
+! wrapper function
+! this subroutine must be modified by USERS for their own noise distribution
+
+  subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, &
+                  normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                  mask_noise_out)
+  implicit none
+  include "constants.h"
+  ! input parameters
+  real(kind=CUSTOM_REAL) :: xcoord_in,ycoord_in,zcoord_in
+  ! output parameters
+  real(kind=CUSTOM_REAL) :: normal_x_noise_out,normal_y_noise_out,normal_z_noise_out,mask_noise_out
+
+  ! Setup for NOISE_TOMOGRAPHY by Piero Basini
+  if( USE_PIERO_DISTRIBUTION ) then
+    call noise_distribution_dir_non_uni(xcoord_in,ycoord_in,zcoord_in, &
+                                      normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                                      mask_noise_out)
+  else
+    ! DEFAULT routine
+    call noise_distribution_direction_d(xcoord_in,ycoord_in,zcoord_in, &
+                                       normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                                       mask_noise_out)
+  endif
+
+  end subroutine noise_distribution_direction
+
+  !
+  !-----------------------------------------------------------------------------------------------
+  !
+
 ! characterizes noise statistics:
 !     for a given point (xcoord,ycoord,zcoord), specify the noise direction "normal_x/y/z_noise"
 !     and noise distribution "mask_noise"
 !
 ! USERS: need to modify this subroutine for their own noise characteristics
-  subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, &
-                  normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
-                  mask_noise_out)
+  subroutine noise_distribution_direction_d(xcoord_in,ycoord_in,zcoord_in, &
+                                           normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                                           mask_noise_out)
   implicit none
   include "constants.h"
   ! input parameters
@@ -78,7 +118,100 @@ subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, &
   ldummy = ycoord_in
   ldummy = zcoord_in
 
-  end subroutine noise_distribution_direction
+  end subroutine noise_distribution_direction_d
+
+  !
+  !-----------------------------------------------------------------------------------------------
+  !
+
+  subroutine noise_distribution_dir_non_uni(xcoord_in,ycoord_in,zcoord_in, &
+                  normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                  mask_noise_out)
+  implicit none
+  include "constants.h"
+  ! input parameters
+  real(kind=CUSTOM_REAL) :: xcoord_in,ycoord_in,zcoord_in
+  ! output parameters
+  real(kind=CUSTOM_REAL) :: normal_x_noise_out,normal_y_noise_out,normal_z_noise_out,mask_noise_out
+  ! local parameters
+  !PB VARIABLES TO DEFINE THE REGION OF NOISE
+  real(kind=CUSTOM_REAL) :: xcoord,ycoord,zcoord !,xcoord_center,ycoord_center
+  real :: lon,lat,colat,lon_cn,lat_cn,dsigma,d,dmax
+
+  ! coordinates "x/y/zcoord_in" actually contain r theta phi, therefore convert back to x y z
+  ! call rthetaphi_2_xyz(xcoord,ycoord,zcoord, xcoord_in,ycoord_in,zcoord_in)
+  xcoord=xcoord_in
+  ycoord=ycoord_in
+  zcoord=zcoord_in
+
+  !PB NOT UNIF DISTRIBUTION OF NOISE ON THE SURFACE OF A SPHERE
+  !PB lon lat colat ARE IN RADIANS SINCE ARE OBTAINED FROM CARTESIAN COORDINATES
+  !PB lon_cn lat_cn (cn = CENTER OF NOISE REGION) IF NOT, MUST BE CONVERTED IN RADIANS
+  !PB lon_cn lat_cn  ARE INSERTED DIRECTLY HERE FOR SIMPLICITY
+
+  lon_cn = (3.89)*PI/180
+  lat_cn = (45.113)*PI/180
+
+  if (xcoord >= 0) then
+   lon=asin(ycoord/(sqrt(xcoord**2+ycoord**2)))
+  else
+   lon=(PI-(asin(ycoord/(sqrt(xcoord**2+ycoord**2)))))
+  endif
+   colat=atan(sqrt(xcoord**2+ycoord**2)/zcoord)
+   lat=(PI/2)-colat
+
+  !PB CALCULATE THE DISTANCE BETWEEN CENTER OF NOISE REGION AND EACH
+  ! POINT OF THE MODEL'S FREE SURFACE  !PB dsigma IS THE "3D" ANGLE BETWEEN
+  ! THE TWO POINTS, THEN d = R*dsigma
+  dsigma=acos(sin(lon)*sin(lon_cn)+cos(lon)*cos(lon_cn)*cos(lat-lat_cn))
+  d=sqrt(xcoord**2+ycoord**2+zcoord**2)*dsigma
+
+  !PB IF YOU WANT TO USE A NONUNIFORM DISTRIBUTION OF NOISE IN THE EXAMPLE
+  !PROVIDED WITH THE CODE, THEN UNCOMMENT THE FOLLOWING LINES (before definition
+  !of dmax)
+
+  !  xcoord_center = 30000
+  !  ycoord_center = 30000
+  !  d = sqrt((xcoord_center-xcoord)**2+(ycoord_center-ycoord)**2)
+
+  !PB NOTE THAT d IS EXPRESSED IN METERS REMEBER THAT WHEN YOU SET THE PARAMETER dmax
+  !PB dmax IS THE RADIUS OF THE AREA IN WHICH masc_noise_out IS 1 (NOISE IS DEFINED)
+
+  dmax = 300000
+
+  ! NOTE that all coordinates are non-dimensionalized in GLOBAL package!
+  ! USERS are free to choose which set to use,
+  ! either "r theta phi" (xcoord_in,ycoord_in,zcoord_in)
+  ! or     "x y z"       (xcoord,ycoord,zcoord)
+
+
+  !*****************************************************************************************************************
+  !******************************** change your noise characteristics below ****************************************
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! noise direction
+  !!!!! here, the noise is assumed to be vertical (SESAME)
+  normal_x_noise_out = 0.0
+  normal_y_noise_out = 0.0
+  normal_z_noise_out = 1.0
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  noise distribution
+  !!!!! here, the noise is assumed to be uniform
+  !  mask_noise_out = 1.0
+
+  !HERE IS NOT UNIFORM
+  if (d <= dmax) then
+  mask_noise_out = 1.0
+  else
+  mask_noise_out = 0.0
+  endif
+
+  !******************************** change your noise characteristics above ****************************************
+  !*****************************************************************************************************************
+
+  end subroutine noise_distribution_dir_non_uni
+
+
+end module user_noise_distribution
+
+
 
 ! =============================================================================================================
 ! =============================================================================================================
@@ -94,6 +227,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
                                    NSPEC_AB_VAL,NGLOB_AB_VAL, &
                                    num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
                                    ispec_is_acoustic)
+  use user_noise_distribution
   implicit none
   include "constants.h"
   ! input parameters
@@ -116,7 +250,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
 
   logical, dimension(NSPEC_AB_VAL) :: ispec_is_acoustic
 
-!daniel: from global code...
+  !from global code...
   !integer, dimension(NSPEC2D_TOP_VAL) :: ibelm_top ! equals free_surface_ispec
   !integer :: NSPEC2D_TOP_VAL ! equals num_free_surface_faces
   !integer :: nspec_top ! equals num_free_surface_faces
@@ -131,7 +265,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
   character(len=256) :: filename
 
   ! read master receiver ID -- the ID in "STATIONS"
-  filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/irec_master_noise'
+  filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/irec_master_noise'
   open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios)
   if( ios /= 0 ) &
     call exit_MPI(myrank, 'file '//trim(filename)//' does NOT exist! This file contains the ID of the master receiver')
@@ -163,7 +297,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
   ! noise distribution and noise direction
   ipoin = 0
 
-  !daniel: from global code, carefull: ngllz must not be face on top...
+  !from global code, carefull: ngllz must not be face on top...
   !  do ispec2D = 1, nspec_top
   !    ispec = ibelm_top(ispec2D)
   !    k = NGLLZ
@@ -187,11 +321,13 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
 
         ipoin = ipoin + 1
         iglob = ibool(i,j,k,ispec)
-        ! this subroutine must be modified by USERS
+
+        ! this subroutine must be modified by USERS in module user_noise_distribution
         call noise_distribution_direction(xstore(iglob), &
-                  ystore(iglob),zstore(iglob), &
-                  normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
-                  mask_noise_out)
+                                          ystore(iglob),zstore(iglob), &
+                                          normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
+                                          mask_noise_out)
+
         normal_x_noise(ipoin) = normal_x_noise_out
         normal_y_noise(ipoin) = normal_y_noise_out
         normal_z_noise(ipoin) = normal_z_noise_out
@@ -314,22 +450,34 @@ subroutine check_parameters_noise(myrank,NOISE_TOMOGRAPHY,SIMULATION_TYPE,SAVE_F
 
      ! size of single record
      reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP
-     ! total file size
-     filesize = reclen
-     filesize = filesize*NSTEP
-
-     write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
-     if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
-                                      len_trim(trim(LOCAL_PATH)//trim(outputname)), &
-                                      filesize)
-     if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
-                                      len_trim(trim(LOCAL_PATH)//trim(outputname)), &
-                                      filesize)
-     if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
-                                      len_trim(trim(LOCAL_PATH)//trim(outputname)), &
-                                      filesize)
-  endif
 
+     ! only open files if there are surface faces in this paritition
+     if(NSPEC_TOP .gt. 0) then
+
+        ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer
+        if( NSPEC_TOP > 2147483647 / (CUSTOM_REAL * NGLLSQUARE * NDIM) ) then
+           print *,'reclen of noise surface_movie needed exceeds integer 4-byte limit: ',reclen
+           print *,'  ',CUSTOM_REAL, NDIM, NGLLSQUARE, NSPEC_TOP
+           print*,'bit size fortran: ',bit_size(NSPEC_TOP)
+           call exit_MPI(myrank,"error NSPEC_TOP integer limit")
+        endif
+
+        ! total file size
+        filesize = reclen
+        filesize = filesize*NSTEP
+
+        write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
+        if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
+             len_trim(trim(LOCAL_PATH)//trim(outputname)), &
+             filesize)
+        if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+             len_trim(trim(LOCAL_PATH)//trim(outputname)), &
+             filesize)
+        if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+             len_trim(trim(LOCAL_PATH)//trim(outputname)), &
+             filesize)
+     endif
+  endif
   end subroutine check_parameters_noise
 
 ! =============================================================================================================
@@ -364,7 +512,7 @@ subroutine compute_arrays_source_noise(myrank, &
 
   noise_src(:) = 0._CUSTOM_REAL
   ! noise file (source time function)
-  filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/S_squared'
+  filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/S_squared'
   open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios)
   if( ios /= 0 .and. myrank == 0 )  &
     call exit_MPI(myrank, 'file '//trim(filename)//' does NOT exist! This file should have been generated using Matlab scripts')
@@ -379,7 +527,7 @@ subroutine compute_arrays_source_noise(myrank, &
 
 
   ! master receiver component direction, \nu_master
-  filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/nu_master'
+  filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/nu_master'
   open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios)
   if( ios /= 0 .and. myrank == 0 ) &
     call exit_MPI(myrank,&
@@ -483,7 +631,8 @@ subroutine noise_save_surface_movie(displ, &
                     ibool, &
                     noise_surface_movie,it, &
                     NSPEC_AB_VAL,NGLOB_AB_VAL, &
-                    num_free_surface_faces,free_surface_ispec,free_surface_ijk)
+                    num_free_surface_faces,free_surface_ispec,free_surface_ijk,&
+                    Mesh_pointer,GPU_MODE)
   implicit none
   include "constants.h"
   ! input parameters
@@ -505,25 +654,37 @@ subroutine noise_save_surface_movie(displ, &
   ! local parameters
   integer :: ispec,i,j,k,iglob,iface,igll
   real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+
+  ! writes out wavefield at surface
+  if( num_free_surface_faces > 0 ) then
+
+    if(.NOT. GPU_MODE) then
+       ! loops over surface points
+       ! get coordinates of surface mesh and surface displacement
+       do iface = 1, num_free_surface_faces
+
+          ispec = free_surface_ispec(iface)
+
+          do igll = 1, NGLLSQUARE
+             i = free_surface_ijk(1,igll,iface)
+             j = free_surface_ijk(2,igll,iface)
+             k = free_surface_ijk(3,igll,iface)
+
+             iglob = ibool(i,j,k,ispec)
+             noise_surface_movie(:,igll,iface) = displ(:,iglob)
+          enddo
+       enddo
+    ! TODO: Check if transfer_surface_to_hose is compatible with newer version above
+    else ! GPU_MODE == 1
+       call transfer_surface_to_host(Mesh_pointer,noise_surface_movie)
+    endif
 
-  ! loops over surface points
-  ! get coordinates of surface mesh and surface displacement
-  do iface = 1, num_free_surface_faces
-
-    ispec = free_surface_ispec(iface)
-
-    do igll = 1, NGLLSQUARE
-        i = free_surface_ijk(1,igll,iface)
-        j = free_surface_ijk(2,igll,iface)
-        k = free_surface_ijk(3,igll,iface)
-
-        iglob = ibool(i,j,k,ispec)
-        noise_surface_movie(:,igll,iface) = displ(:,iglob)
-    enddo
-  enddo
+    ! save surface motion to disk
+    call write_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
 
-  ! save surface motion to disk
-  call write_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
+  endif
 
   end subroutine noise_save_surface_movie
 
@@ -544,7 +705,8 @@ subroutine noise_read_add_surface_movie(nmovie_points, &
                   it, &
                   NSPEC_AB_VAL,NGLOB_AB_VAL, &
                   num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
-                  free_surface_jacobian2Dw)
+                  free_surface_jacobian2Dw, &
+                  Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY)
   implicit none
   include "constants.h"
   ! input parameters
@@ -572,39 +734,54 @@ subroutine noise_read_add_surface_movie(nmovie_points, &
   real(kind=CUSTOM_REAL) :: eta
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie
 
-  ! read surface movie
-  call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
+  ! GPU_MODE parameters
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+  integer :: NOISE_TOMOGRAPHY
 
-  ! get coordinates of surface mesh and surface displacement
-  ipoin = 0
+  ! reads in ensemble noise sources at surface
+  if( num_free_surface_faces > 0 ) then
 
-  ! loops over surface points
-  ! puts noise distrubution and direction onto the surface points
-  do iface = 1, num_free_surface_faces
+    ! read surface movie
+    call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
 
-    ispec = free_surface_ispec(iface)
+    if(GPU_MODE) then
+       call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,NOISE_TOMOGRAPHY)
+    else ! GPU_MODE==0
 
-    do igll = 1, NGLLSQUARE
-      i = free_surface_ijk(1,igll,iface)
-      j = free_surface_ijk(2,igll,iface)
-      k = free_surface_ijk(3,igll,iface)
-
-      ipoin = ipoin + 1
-      iglob = ibool(i,j,k,ispec)
-
-      eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + &
-            noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + &
-            noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin)
-
-      accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
-                                  * free_surface_jacobian2Dw(igll,iface)
-      accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) &
-                                  * free_surface_jacobian2Dw(igll,iface)
-      accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
-                                  * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
-    enddo
+       ! get coordinates of surface mesh and surface displacement
+       ipoin = 0
 
-  enddo
+       ! loops over surface points
+       ! puts noise distrubution and direction onto the surface points
+       do iface = 1, num_free_surface_faces
+
+          ispec = free_surface_ispec(iface)
+
+          do igll = 1, NGLLSQUARE
+             i = free_surface_ijk(1,igll,iface)
+             j = free_surface_ijk(2,igll,iface)
+             k = free_surface_ijk(3,igll,iface)
+
+             ipoin = ipoin + 1
+             iglob = ibool(i,j,k,ispec)
+
+             eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + &
+                  noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + &
+                  noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin)
+
+             accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
+                  * free_surface_jacobian2Dw(igll,iface)
+             accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) &
+                  * free_surface_jacobian2Dw(igll,iface)
+             accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
+                  * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
+          enddo
+
+       enddo
+    endif ! GPU_MODE
+
+  endif
 
   end subroutine noise_read_add_surface_movie
 
@@ -620,7 +797,8 @@ subroutine compute_kernels_strength_noise(nmovie_points,ibool, &
                           normal_x_noise,normal_y_noise,normal_z_noise, &
                           noise_surface_movie, &
                           NSPEC_AB_VAL,NGLOB_AB_VAL, &
-                          num_free_surface_faces,free_surface_ispec,free_surface_ijk)
+                          num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                          GPU_MODE,Mesh_pointer)
   implicit none
   include "constants.h"
   ! input parameters
@@ -650,39 +828,54 @@ subroutine compute_kernels_strength_noise(nmovie_points,ibool, &
   real(kind=CUSTOM_REAL) :: eta
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie
 
-  ! read surface movie, needed for Sigma_kl
-  call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
+  ! GPU_MODE parameters
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
 
-  ! noise source strength kernel
-  ! to keep similar structure to other kernels, the source strength kernel is saved as a volumetric kernel
-  ! but only updated at the surface, because the noise is generated there
-  ipoin = 0
+  ! updates contribution to noise strength kernel
+  if( num_free_surface_faces > 0 ) then
 
-  ! loops over surface points
-  ! puts noise distrubution and direction onto the surface points
-  do iface = 1, num_free_surface_faces
+    ! read surface movie, needed for Sigma_kl
+    call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
 
-    ispec = free_surface_ispec(iface)
+    if(.NOT. GPU_MODE) then
 
-    do igll = 1, NGLLSQUARE
-      i = free_surface_ijk(1,igll,iface)
-      j = free_surface_ijk(2,igll,iface)
-      k = free_surface_ijk(3,igll,iface)
+       ! noise source strength kernel
+       ! to keep similar structure to other kernels, the source strength kernel is saved as a volumetric kernel
+       ! but only updated at the surface, because the noise is generated there
+       ipoin = 0
 
-      ipoin = ipoin + 1
-      iglob = ibool(i,j,k,ispec)
+       ! loops over surface points
+       ! puts noise distrubution and direction onto the surface points
+       do iface = 1, num_free_surface_faces
 
-      eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + &
-            noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + &
-            noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin)
+          ispec = free_surface_ispec(iface)
 
-      Sigma_kl(i,j,k,ispec) =  Sigma_kl(i,j,k,ispec) &
-         + deltat * eta * ( normal_x_noise(ipoin) * displ(1,iglob) &
-                          + normal_y_noise(ipoin) * displ(2,iglob) &
-                          + normal_z_noise(ipoin) * displ(3,iglob) )
-    enddo
+          do igll = 1, NGLLSQUARE
+             i = free_surface_ijk(1,igll,iface)
+             j = free_surface_ijk(2,igll,iface)
+             k = free_surface_ijk(3,igll,iface)
 
-  enddo
+             ipoin = ipoin + 1
+             iglob = ibool(i,j,k,ispec)
+
+             eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + &
+                  noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + &
+                  noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin)
+
+             Sigma_kl(i,j,k,ispec) =  Sigma_kl(i,j,k,ispec) &
+                  + deltat * eta * ( normal_x_noise(ipoin) * displ(1,iglob) &
+                  + normal_y_noise(ipoin) * displ(2,iglob) &
+                  + normal_z_noise(ipoin) * displ(3,iglob) )
+          enddo
+
+       enddo
+
+    else ! GPU_MODE==1
+       call compute_kernels_strgth_noise_cu(Mesh_pointer,noise_surface_movie,deltat)
+    endif ! GPU_MODE
+
+  endif
 
   end subroutine compute_kernels_strength_noise
 
diff --git a/src/specfem3D/prepare_timerun.f90 b/src/specfem3D/prepare_timerun.F90
similarity index 59%
rename from src/specfem3D/prepare_timerun.f90
rename to src/specfem3D/prepare_timerun.F90
index 7738a3377..ebe21d418 100644
--- a/src/specfem3D/prepare_timerun.f90
+++ b/src/specfem3D/prepare_timerun.F90
@@ -75,6 +75,13 @@ subroutine prepare_timerun()
       write(IMAIN,*) 'no oceans'
     endif
 
+    write(IMAIN,*)
+    if(GRAVITY) then
+      write(IMAIN,*) 'incorporating gravity'
+    else
+      write(IMAIN,*) 'no gravity'
+    endif
+
     write(IMAIN,*)
     if(ACOUSTIC_SIMULATION) then
       write(IMAIN,*) 'incorporating acoustic simulation'
@@ -176,6 +183,9 @@ subroutine prepare_timerun()
   ! prepares attenuation arrays
   call prepare_timerun_attenuation()
 
+  ! prepares gravity arrays
+  call prepare_timerun_gravity()
+
   ! initializes PML arrays
   if( ABSORBING_CONDITIONS  ) then
     if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML )  then
@@ -210,10 +220,11 @@ subroutine prepare_timerun()
     write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
     write(IMAIN,*) 'start time:',sngl(-t0),' seconds'
     write(IMAIN,*)
-    
-    !debug: time estimation
+
+    !daniel debug: time estimation
     ! elastic elements: time per element t_per_element = 1.40789368e-05 s
     ! total time = nspec * nstep * t_per_element
+
   endif
 
   ! prepares ADJOINT simulations
@@ -222,6 +233,14 @@ subroutine prepare_timerun()
   ! prepares noise simulations
   call prepare_timerun_noise()
 
+  ! prepares GPU arrays
+  if(GPU_MODE) call prepare_timerun_GPU()
+
+#ifdef OPENMP_MODE
+  ! prepares arrays for OpenMP
+  call prepare_timerun_OpenMP()
+#endif
+
   end subroutine prepare_timerun
 
 !
@@ -416,6 +435,100 @@ subroutine prepare_timerun_attenuation()
 
   end subroutine prepare_timerun_attenuation
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_gravity()
+
+! precomputes gravity factors
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  implicit none
+
+  ! local parameters
+  double precision RICB,RCMB,RTOPDDOUBLEPRIME, &
+    R80,R220,R400,R600,R670,R771,RMOHO,RMIDDLE_CRUST,ROCEAN
+  double precision :: rspl_gravity(NR),gspl(NR),gspl2(NR)
+  double precision :: radius,g,dg ! radius_km
+  !double precision :: g_cmb_dble,g_icb_dble
+  double precision :: rho,drhodr,vp,vs,Qkappa,Qmu
+  integer :: nspl_gravity !int_radius
+  integer :: i,j,k,iglob,ier
+
+  ! sets up weights needed for integration of gravity
+  do k=1,NGLLZ
+    do j=1,NGLLY
+      do i=1,NGLLX
+        wgll_cube(i,j,k) = sngl( wxgll(i)*wygll(j)*wzgll(k) )
+      enddo
+    enddo
+  enddo
+
+  ! store g, rho and dg/dr=dg using normalized radius in lookup table every 100 m
+  ! get density and velocity from PREM model using dummy doubling flag
+  ! this assumes that the gravity perturbations are small and smooth
+  ! and that we can neglect the 3D model and use PREM every 100 m in all cases
+  ! this is probably a rather reasonable assumption
+  if(GRAVITY) then
+
+    ! allocates gravity arrays
+    allocate( minus_deriv_gravity(NGLOB_AB), &
+             minus_g(NGLOB_AB), stat=ier)
+    if( ier /= 0 ) stop 'error allocating gravity arrays'
+
+    ! sets up spline table
+    call make_gravity(nspl_gravity,rspl_gravity,gspl,gspl2, &
+                          ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, &
+                          R771,RTOPDDOUBLEPRIME,RCMB,RICB)
+
+    ! pre-calculates gravity terms for all global points
+    do iglob = 1,NGLOB_AB
+
+      ! normalized radius ( zstore values given in m, negative values for depth)
+      radius = ( R_EARTH + zstore(iglob) ) / R_EARTH
+      call spline_evaluation(rspl_gravity,gspl,gspl2,nspl_gravity,radius,g)
+
+      ! use PREM density profile to calculate gravity (fine for other 1D models)
+      call model_prem_iso(radius,rho,drhodr,vp,vs,Qkappa,Qmu, &
+                        RICB,RCMB,RTOPDDOUBLEPRIME, &
+                        R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN)
+
+      dg = 4.0d0*rho - 2.0d0*g/radius
+
+      ! re-dimensionalize
+      g = g * R_EARTH*(PI*GRAV*RHOAV) ! in m / s^2 ( should be around 10 m/s^2 )
+      dg = dg * R_EARTH*(PI*GRAV*RHOAV) / R_EARTH ! gradient d/dz g , in 1/s^2
+
+      minus_deriv_gravity(iglob) = - dg
+      minus_g(iglob) = - g ! in negative z-direction
+
+      ! debug
+      !if( iglob == 1 .or. iglob == 1000 .or. iglob == 10000 ) then
+      !  ! re-dimensionalize
+      !  radius = radius * R_EARTH ! in m
+      !  vp = vp * R_EARTH*dsqrt(PI*GRAV*RHOAV)  ! in m / s
+      !  rho = rho  * RHOAV  ! in kg / m^3
+      !  print*,'gravity: radius=',radius,'g=',g,'depth=',radius-R_EARTH
+      !  print*,'vp=',vp,'rho=',rho,'kappa=',(vp**2) * rho
+      !  print*,'minus_g..=',minus_g(iglob)
+      !endif
+    enddo
+
+  else
+
+    ! allocates dummy gravity arrays
+    allocate( minus_deriv_gravity(0), &
+             minus_g(0), stat=ier)
+    if( ier /= 0 ) stop 'error allocating gravity arrays'
+
+  endif
+
+  end subroutine prepare_timerun_gravity
+
 
 !
 !-------------------------------------------------------------------------------------------------
@@ -553,6 +666,14 @@ subroutine prepare_timerun_adjoint()
         ! size of single record
         b_reclen_field = CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces
 
+        ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer
+        if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NDIM * NGLLSQUARE) ) then
+          print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_field
+          print *,'  ',CUSTOM_REAL, NDIM, NGLLSQUARE, num_abs_boundary_faces
+          print*,'bit size fortran: ',bit_size(b_reclen_field)
+          call exit_MPI(myrank,"error b_reclen_field integer limit")
+        endif
+
         ! total file size
         filesize = b_reclen_field
         filesize = filesize*NSTEP
@@ -594,12 +715,20 @@ subroutine prepare_timerun_adjoint()
         ! size of single record
         b_reclen_potential = CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces
 
+        ! check integer size limit: size of b_reclen_potential must fit onto an 4-byte integer
+        if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NGLLSQUARE) ) then
+          print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_potential
+          print *,'  ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces
+          print*,'bit size fortran: ',bit_size(b_reclen_potential)
+          call exit_MPI(myrank,"error b_reclen_potential integer limit")
+        endif
+
         ! total file size (two lines to implicitly convert to 8-byte integers)
         filesize = b_reclen_potential
         filesize = filesize*NSTEP
 
-        ! daniel: debug check size limit
-        !if( NSTEP > 2147483648 / b_reclen_potential ) then
+        ! debug check size limit
+        !if( NSTEP > 2147483647 / b_reclen_potential ) then
         !  print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP
         !  print *,'  ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP
         !  print*,'file size fortran: ',filesize
@@ -633,9 +762,8 @@ subroutine prepare_timerun_adjoint()
 
         endif
       endif
-
     else
-      ! dummy array
+      ! needs dummy array
       b_num_abs_boundary_faces = 1
       if( ELASTIC_SIMULATION ) then
         allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
@@ -646,10 +774,22 @@ subroutine prepare_timerun_adjoint()
         allocate(b_absorb_potential(NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
         if( ier /= 0 ) stop 'error allocating array b_absorb_potential'
       endif
+    endif
+  else ! ABSORBING_CONDITIONS
+    ! needs dummy array
+    b_num_abs_boundary_faces = 1
+    if( ELASTIC_SIMULATION ) then
+      allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array b_absorb_field'
+    endif
 
+    if( ACOUSTIC_SIMULATION ) then
+      allocate(b_absorb_potential(NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
+      if( ier /= 0 ) stop 'error allocating array b_absorb_potential'
     endif
   endif
 
+
   end subroutine prepare_timerun_adjoint
 
 !
@@ -674,7 +814,8 @@ subroutine prepare_timerun_noise()
 
     ! checks if free surface is defined
     if( num_free_surface_faces == 0 ) then
-      stop 'error: noise simulations need a free surface'
+       write(*,*) myrank, " doesn't have a free_surface_face"
+       ! stop 'error: noise simulations need a free surface'
     endif
 
     ! allocates arrays
@@ -719,3 +860,258 @@ subroutine prepare_timerun_noise()
 
   end subroutine prepare_timerun_noise
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_GPU()
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  use specfem_par_movie
+
+  implicit none
+  real :: free_mb,used_mb,total_mb
+
+  ! GPU_MODE now defined in Par_file
+  if(myrank == 0 ) then
+    write(IMAIN,*)
+    write(IMAIN,*) "GPU Preparing Fields and Constants on Device."
+    write(IMAIN,*)
+  endif
+
+  ! prepares general fields on GPU
+  call prepare_constants_device(Mesh_pointer, &
+                                  NGLLX, NSPEC_AB, NGLOB_AB, &
+                                  xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
+                                  kappastore, mustore,ibool, &
+                                  num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                  nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
+                                  hprime_xx, hprime_yy, hprime_zz, &
+                                  hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, &
+                                  wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+                                  ABSORBING_CONDITIONS, &
+                                  abs_boundary_ispec, abs_boundary_ijk, &
+                                  abs_boundary_normal, &
+                                  abs_boundary_jacobian2Dw, &
+                                  num_abs_boundary_faces, &
+                                  ispec_is_inner, &
+                                  NSOURCES, nsources_local, &
+                                  sourcearrays, islice_selected_source, ispec_selected_source, &
+                                  number_receiver_global, ispec_selected_rec, &
+                                  nrec, nrec_local, &
+                                  SIMULATION_TYPE, &
+                                  USE_MESH_COLORING_GPU, &
+                                  nspec_acoustic,nspec_elastic,&
+                                  my_neighbours_ext_mesh,&
+                                  request_send_vector_ext_mesh,&
+                                  request_recv_vector_ext_mesh,&
+                                  buffer_recv_vector_ext_mesh)
+
+
+  ! prepares fields on GPU for acoustic simulations
+  if( ACOUSTIC_SIMULATION ) then
+    call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, &
+                                  num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+                                  ispec_is_acoustic, &
+                                  NOISE_TOMOGRAPHY,num_free_surface_faces, &
+                                  free_surface_ispec,free_surface_ijk, &
+                                  ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
+                                  ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
+                                  coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                                  coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+                                  num_colors_outer_acoustic,num_colors_inner_acoustic, &
+                                  num_elem_colors_acoustic)
+
+    if( SIMULATION_TYPE == 3 ) &
+      call prepare_fields_acoustic_adj_dev(Mesh_pointer, &
+                                  SIMULATION_TYPE, &
+                                  APPROXIMATE_HESS_KL)
+
+  endif
+
+  ! prepares fields on GPU for elastic simulations
+  if( ELASTIC_SIMULATION ) then
+    call prepare_fields_elastic_device(Mesh_pointer, NDIM*NGLOB_AB, &
+                                  rmass,rho_vp,rho_vs, &
+                                  num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+                                  ispec_is_elastic, &
+                                  ABSORBING_CONDITIONS,b_absorb_field,b_reclen_field, &
+                                  SIMULATION_TYPE,SAVE_FORWARD, &
+                                  COMPUTE_AND_STORE_STRAIN, &
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                  epsilondev_xz,epsilondev_yz, &
+                                  ATTENUATION, &
+                                  size(R_xx), &
+                                  R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                  one_minus_sum_beta,factor_common, &
+                                  alphaval,betaval,gammaval, &
+                                  OCEANS,rmass_ocean_load, &
+                                  NOISE_TOMOGRAPHY, &
+                                  free_surface_normal,free_surface_ispec,free_surface_ijk, &
+                                  num_free_surface_faces, &
+                                  ACOUSTIC_SIMULATION, &
+                                  num_colors_outer_elastic,num_colors_inner_elastic, &
+                                  num_elem_colors_elastic, &
+                                  ANISOTROPY, &
+                                  c11store,c12store,c13store,c14store,c15store,c16store, &
+                                  c22store,c23store,c24store,c25store,c26store, &
+                                  c33store,c34store,c35store,c36store, &
+                                  c44store,c45store,c46store,c55store,c56store,c66store)
+
+    if( SIMULATION_TYPE == 3 ) &
+      call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &
+                                  SIMULATION_TYPE, &
+                                  COMPUTE_AND_STORE_STRAIN, &
+                                  epsilon_trace_over_3, &
+                                  b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
+                                  b_epsilondev_xz,b_epsilondev_yz, &
+                                  b_epsilon_trace_over_3, &
+                                  ATTENUATION,size(R_xx), &
+                                  b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                                  b_alphaval,b_betaval,b_gammaval, &
+                                  APPROXIMATE_HESS_KL)
+
+  endif
+
+  ! prepares fields on GPU for poroelastic simulations
+  if( POROELASTIC_SIMULATION ) then
+    stop 'todo poroelastic simulations on GPU'
+  endif
+
+  ! prepares needed receiver array for adjoint runs
+  if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
+    call prepare_sim2_or_3_const_device(Mesh_pointer, &
+                                       islice_selected_rec,size(islice_selected_rec), &
+                                       nadj_rec_local,nrec,myrank)
+
+  ! prepares fields on GPU for noise simulations
+  if ( NOISE_TOMOGRAPHY > 0 ) then
+    ! note: noise tomography is only supported for elastic domains so far.
+
+    ! copies noise  arrays to GPU
+    call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, &
+                                  free_surface_ispec, &
+                                  free_surface_ijk, &
+                                  num_free_surface_faces, &
+                                  SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+                                  NSTEP,noise_sourcearray, &
+                                  normal_x_noise,normal_y_noise,normal_z_noise, &
+                                  mask_noise,free_surface_jacobian2Dw)
+
+  endif ! NOISE_TOMOGRAPHY
+
+  ! prepares gravity arrays
+  if( GRAVITY ) then
+    call prepare_fields_gravity_device(Mesh_pointer,GRAVITY, &
+                                    minus_deriv_gravity,minus_g,wgll_cube,&
+                                    ACOUSTIC_SIMULATION,rhostore)
+  endif
+
+  ! sends initial data to device
+
+  ! puts acoustic initial fields onto GPU
+  if( ACOUSTIC_SIMULATION ) then
+    call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
+                          potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
+    if( SIMULATION_TYPE == 3 ) &
+      call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
+                          b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer)
+  endif
+
+  ! puts elastic initial fields onto GPU
+  if( ELASTIC_SIMULATION ) then
+    ! transfer forward and backward fields to device with initial values
+    call transfer_fields_el_to_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer)
+    if(SIMULATION_TYPE == 3) &
+      call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
+  endif
+
+  ! outputs GPU usage to files for all processes
+  call output_free_device_memory(myrank)
+
+  ! outputs usage for main process
+  if( myrank == 0 ) then
+    call get_free_device_memory(free_mb,used_mb,total_mb)
+    write(IMAIN,*) "GPU usage: free  =",free_mb," MB",nint(free_mb/total_mb*100.0),"%"
+    write(IMAIN,*) "           used  =",used_mb," MB",nint(used_mb/total_mb*100.0),"%"
+    write(IMAIN,*) "           total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%"
+    write(IMAIN,*)
+  endif
+
+  end subroutine prepare_timerun_GPU
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! OpenMP version uses "special" compute_forces_elastic_Dev routine
+! we need to set num_elem_colors_elastic arrays
+
+#ifdef OPENMP_MODE
+  subroutine prepare_timerun_OpenMP()
+
+  use specfem_par
+  use specfem_par_elastic
+  implicit none
+
+  ! local parameters
+  integer :: ier
+  integer :: NUM_THREADS
+  integer :: OMP_GET_MAX_THREADS
+
+  ! OpenMP for elastic simulation only supported yet
+  if( ELASTIC_SIMULATION ) then
+
+    NUM_THREADS = OMP_GET_MAX_THREADS()
+    if( myrank == 0 ) then
+      write(IMAIN,*)
+      write(IMAIN,*) 'Using:',NUM_THREADS, ' OpenMP threads'
+      write(IMAIN,*)
+    endif
+
+    ! allocate cfe_Dev_openmp local arrays for OpenMP version
+    allocate(dummyx_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(dummyy_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(dummyz_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempx1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempx2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempx3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempy1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempy2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempy3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempz1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempz2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(newtempz3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempx1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempx2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempx3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempy1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempy2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempy3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempz1(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempz2(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+    allocate(tempz3(NGLLX,NGLLY,NGLLZ,NUM_THREADS))
+
+    ! set num_elem_colors array in case no mesh coloring is used
+    if( .not. USE_MESH_COLORING_GPU ) then
+      ! deallocate dummy array
+      if( allocated(num_elem_colors_elastic) ) deallocate(num_elem_colors_elastic)
+
+      ! loads with corresonding values
+      num_colors_outer_elastic = 1
+      num_colors_inner_elastic = 1
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+      ! sets to all elements in inner/outer phase
+      num_elem_colors_elastic(1) = nspec_outer_elastic
+      num_elem_colors_elastic(2) = nspec_inner_elastic
+    endif
+
+  endif
+
+  end subroutine prepare_timerun_OpenMP
+#endif
diff --git a/src/specfem3D/read_mesh_databases.f90 b/src/specfem3D/read_mesh_databases.f90
index 8281b24c4..0b97cce17 100644
--- a/src/specfem3D/read_mesh_databases.f90
+++ b/src/specfem3D/read_mesh_databases.f90
@@ -76,6 +76,8 @@ subroutine read_mesh_databases()
   read(27) ispec_is_poroelastic
 
   ! acoustic
+  ! number of acoustic elements in this partition
+  nspec_acoustic = count(ispec_is_acoustic(:))
   ! all processes will have acoustic_simulation set if any flag is .true.
   call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION )
   if( ACOUSTIC_SIMULATION ) then
@@ -101,6 +103,10 @@ subroutine read_mesh_databases()
   endif
 
   ! elastic
+  ! number of elastic elements in this partition
+  nspec_elastic = count(ispec_is_elastic(:))
+
+  ! elastic simulation
   call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION )
   if( ELASTIC_SIMULATION ) then
     ! displacement,velocity,acceleration
@@ -170,7 +176,8 @@ subroutine read_mesh_databases()
     if( ier /= 0 ) stop 'error allocating array one_minus_sum_beta etc.'
 
     ! reads mass matrices
-    read(27) rmass
+    read(27,iostat=ier) rmass
+    if( ier /= 0 ) stop 'error reading in array rmass'
 
     if( OCEANS ) then
       ! ocean mass matrix
@@ -182,10 +189,36 @@ subroutine read_mesh_databases()
       allocate(rmass_ocean_load(1),stat=ier)
       if( ier /= 0 ) stop 'error allocating dummy array rmass_ocean_load'
     endif
-    !pll
-    read(27) rho_vp
-    read(27) rho_vs
 
+    !pll material parameters for stacey conditions
+    read(27,iostat=ier) rho_vp
+    if( ier /= 0 ) stop 'error reading in array rho_vp'
+    read(27,iostat=ier) rho_vs
+    if( ier /= 0 ) stop 'error reading in array rho_vs'
+
+    ! checks if rhostore is available for gravity
+    if( GRAVITY ) then
+
+      if( .not. ACOUSTIC_SIMULATION ) then
+        ! rho array needed for gravity
+        allocate(rhostore(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array rhostore'
+
+        ! extract rho information from mu = rho * vs * vs and rho_vs = rho * vs
+        rhostore = 0.0_CUSTOM_REAL
+        where( mustore > TINYVAL )
+          rhostore = (rho_vs*rho_vs) / mustore
+        endwhere
+
+        ! note: the construct below leads to a segmentation fault (ifort v11.1). not sure why...
+        !          (where statement - standard fortran 95)
+        !where( mustore > TINYVAL )
+        !  rhostore = (rho_vs*rho_vs) / mustore
+        !elsewhere
+        !  rhostore = 0.0_CUSTOM_REAL
+        !endwhere
+      endif
+    endif
   else
     ! no elastic attenuation & anisotropy
     ATTENUATION = .false.
@@ -195,6 +228,9 @@ subroutine read_mesh_databases()
   ! poroelastic
   call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION )
   if( POROELASTIC_SIMULATION ) then
+
+    if( GPU_MODE ) call exit_mpi(myrank,'POROELASTICITY not supported by GPU mode yet...')
+
     ! displacement,velocity,acceleration for the solid (s) & fluid (w) phases
     allocate(displs_poroelastic(NDIM,NGLOB_AB),stat=ier)
     if( ier /= 0 ) stop 'error allocating array displs_poroelastic'
@@ -393,6 +429,42 @@ subroutine read_mesh_databases()
     if(num_phase_ispec_poroelastic > 0 ) read(27) phase_ispec_inner_poroelastic
   endif
 
+! mesh coloring for GPUs
+  if( USE_MESH_COLORING_GPU ) then
+    ! acoustic domain colors
+    if( ACOUSTIC_SIMULATION ) then
+      read(27) num_colors_outer_acoustic,num_colors_inner_acoustic
+
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+
+      read(27) num_elem_colors_acoustic
+    endif
+    ! elastic domain colors
+    if( ELASTIC_SIMULATION ) then
+      read(27) num_colors_outer_elastic,num_colors_inner_elastic
+
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+
+      read(27) num_elem_colors_elastic
+    endif
+  else
+    ! allocates dummy arrays
+    if( ACOUSTIC_SIMULATION ) then
+      num_colors_outer_acoustic = 0
+      num_colors_inner_acoustic = 0
+      allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array'
+    endif
+    if( ELASTIC_SIMULATION ) then
+      num_colors_outer_elastic = 0
+      num_colors_inner_elastic = 0
+      allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier)
+      if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array'
+    endif
+  endif
+
   close(27)
 
   ! outputs total element numbers
diff --git a/src/specfem3D/setup_GLL_points.f90 b/src/specfem3D/setup_GLL_points.f90
index 514522849..31d24757c 100644
--- a/src/specfem3D/setup_GLL_points.f90
+++ b/src/specfem3D/setup_GLL_points.f90
@@ -41,9 +41,9 @@ subroutine setup_GLL_points()
 
 ! set up GLL points, weights and derivation matrices for reference element (between -1,1)
   call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, &
-         hprime_xx,hprime_yy,hprime_zz, &
-         hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz)
+                                hprime_xx,hprime_yy,hprime_zz, &
+                                hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+                                wgllwgll_xy,wgllwgll_xz,wgllwgll_yz)
 
 ! define transpose of derivation matrix
   do j = 1,NGLLY
diff --git a/src/specfem3D/setup_sources_receivers.f90 b/src/specfem3D/setup_sources_receivers.f90
index c265b593e..437c80a4b 100644
--- a/src/specfem3D/setup_sources_receivers.f90
+++ b/src/specfem3D/setup_sources_receivers.f90
@@ -153,7 +153,7 @@ subroutine setup_sources()
   call max_all_all_dp(t0_acoustic,t0)
 
   ! point force sources will start depending on the frequency given by hdur
-  if( USE_FORCE_POINT_SOURCE ) then
+  if( USE_FORCE_POINT_SOURCE .or. USE_RICKER_IPATI ) then
     ! note: point force sources will give the dominant frequency in hdur,
     !          thus the main period is 1/hdur.
     !          also, these sources use a Ricker source time function instead of a gaussian.
@@ -211,6 +211,12 @@ subroutine setup_sources()
     call exit_mpi(myrank,'error negative USER_T0 parameter in constants.h')
   endif
 
+  ! count number of sources located in this slice
+  nsources_local = 0
+  do isource = 1, NSOURCES
+    if(myrank == islice_selected_source(isource)) nsources_local = nsources_local + 1
+  enddo
+
   ! checks if source is in an acoustic element and exactly on the free surface because pressure is zero there
   call setup_sources_check_acoustic()
 
@@ -523,6 +529,7 @@ subroutine setup_sources_precompute_arrays()
   real(kind=CUSTOM_REAL) :: junk
   integer :: isource,ispec
   integer :: irec !,irec_local
+  integer :: i,j,k,iglob
   integer :: icomp,itime,nadj_files_found,nadj_files_found_tot,ier
   character(len=3),dimension(NDIM) :: comp ! = (/ "BHE", "BHN", "BHZ" /)
   character(len=256) :: filename
@@ -577,11 +584,44 @@ subroutine setup_sources_precompute_arrays()
                         sourcearray,xigll,yigll,zigll,factor_source)
         endif
 
+        ! point forces, initializes sourcearray, used for simplified CUDA routines
+        if(USE_FORCE_POINT_SOURCE) then
+          ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*]
+          iglob = ibool(nint(xi_source(isource)), &
+                        nint(eta_source(isource)), &
+                        nint(gamma_source(isource)), &
+                        ispec)
+          ! sets sourcearrays
+          sourcearray(:,:,:,:) = 0.0
+          do k=1,NGLLZ
+            do j=1,NGLLY
+              do i=1,NGLLX
+                if( ibool(i,j,k,ispec) == iglob ) then
+                  ! acoustic source
+                  ! identical source array components in x,y,z-direction
+                  if( ispec_is_acoustic(ispec) ) then
+                    sourcearray(:,i,j,k) = 1.0
+                  endif
+                  ! elastic source
+                  if( ispec_is_elastic(ispec) ) then
+                    sourcearray(:,i,j,k) = nu_source(COMPONENT_FORCE_SOURCE,:,isource)
+                  endif
+                endif
+              enddo
+            enddo
+          enddo
+        endif
+
         ! stores source excitations
         sourcearrays(isource,:,:,:,:) = sourcearray(:,:,:,:)
 
       endif
     enddo
+  else
+    ! SIMULATION_TYPE == 2
+    ! allocate dummy array (needed for subroutine calls)
+    allocate(sourcearrays(0,0,0,0,0),stat=ier)
+    if( ier /= 0 ) stop 'error allocating dummy sourcearrays'
   endif
 
   ! ADJOINT simulations
@@ -679,17 +719,19 @@ subroutine setup_receivers_precompute_intp()
 
   integer :: irec,irec_local,isource,ier
 
-! stores local receivers interpolation factors
+  ! needs to be allocate for subroutine calls (even if nrec_local == 0)
+  allocate(number_receiver_global(nrec_local),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array number_receiver_global'
+
+  ! stores local receivers interpolation factors
   if (nrec_local > 0) then
-  ! allocate Lagrange interpolators for receivers
+    ! allocate Lagrange interpolators for receivers
     allocate(hxir_store(nrec_local,NGLLX), &
             hetar_store(nrec_local,NGLLY), &
             hgammar_store(nrec_local,NGLLZ),stat=ier)
     if( ier /= 0 ) stop 'error allocating array hxir_store etc.'
 
-  ! define local to global receiver numbering mapping
-    allocate(number_receiver_global(nrec_local),stat=ier)
-    if( ier /= 0 ) stop 'error allocating array number_reciever_global'
+    ! define local to global receiver numbering mapping
     irec_local = 0
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
       do irec = 1,nrec
@@ -762,8 +804,21 @@ subroutine setup_sources_receivers_VTKfile()
   double precision :: xil,etal,gammal
   double precision :: xmesh,ymesh,zmesh
   real(kind=CUSTOM_REAL),dimension(NGNOD) :: xelm,yelm,zelm
-  integer :: ia,ispec,isource,irec,ier
-  character(len=256) :: filename,filename_new,system_command
+  integer :: ia,ispec,isource,irec,ier,totalpoints
+
+  !INTEGER(kind=4) :: system_command_status
+  !integer :: ret
+  !integer,external :: system
+
+  character(len=256) :: filename,filename_new,system_command,system_command1,system_command2
+
+  ! determines number of points for vtk file
+  if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    totalpoints = NSOURCES + nrec
+  else
+    ! pure adjoint simulation only needs receivers
+    totalpoints = nrec
+  endif
 
   if (myrank == 0) then
     ! vtk file
@@ -774,63 +829,65 @@ subroutine setup_sources_receivers_VTKfile()
     write(IOVTK,'(a)') 'Source and Receiver VTK file'
     write(IOVTK,'(a)') 'ASCII'
     write(IOVTK,'(a)') 'DATASET POLYDATA'
-    write(IOVTK, '(a,i6,a)') 'POINTS ', NSOURCES+nrec, ' float'
+    write(IOVTK, '(a,i6,a)') 'POINTS ', totalpoints, ' float'
   endif
 
   ! sources
-  do isource=1,NSOURCES
-    ! spectral element id
-    ispec = ispec_selected_source(isource)
+  if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    do isource=1,NSOURCES
+      ! spectral element id
+      ispec = ispec_selected_source(isource)
 
-    ! gets element ancor nodes
-    if( myrank == islice_selected_source(isource) ) then
-      ! find the coordinates of the eight corner nodes of the element
-      call get_shape3D_element_corners(xelm,yelm,zelm,ispec,&
-                      ibool,xstore,ystore,zstore,NSPEC_AB,NGLOB_AB)
+      ! gets element ancor nodes
+      if( myrank == islice_selected_source(isource) ) then
+        ! find the coordinates of the eight corner nodes of the element
+        call get_shape3D_element_corners(xelm,yelm,zelm,ispec,&
+                        ibool,xstore,ystore,zstore,NSPEC_AB,NGLOB_AB)
 
-    endif
-    ! master collects corner locations
-    if( islice_selected_source(isource) /= 0 ) then
-      if( myrank == 0 ) then
-        call recvv_cr(xelm,NGNOD,islice_selected_source(isource),0)
-        call recvv_cr(yelm,NGNOD,islice_selected_source(isource),0)
-        call recvv_cr(zelm,NGNOD,islice_selected_source(isource),0)
-      else if( myrank == islice_selected_source(isource) ) then
-        call sendv_cr(xelm,NGNOD,0,0)
-        call sendv_cr(yelm,NGNOD,0,0)
-        call sendv_cr(zelm,NGNOD,0,0)
       endif
-    endif
-
-    if( myrank == 0 ) then
-      ! get the 3-D shape functions
-      if( USE_FORCE_POINT_SOURCE ) then
-        ! note: we switch xi,eta,gamma range to be [-1,1]
-        ! uses initial guess in xi, eta and gamma
-        xil = xigll(nint(xi_source(isource)))
-        etal = yigll(nint(eta_source(isource)))
-        gammal = zigll(nint(gamma_source(isource)))
-      else
-        xil = xi_source(isource)
-        etal = eta_source(isource)
-        gammal = gamma_source(isource)
+      ! master collects corner locations
+      if( islice_selected_source(isource) /= 0 ) then
+        if( myrank == 0 ) then
+          call recvv_cr(xelm,NGNOD,islice_selected_source(isource),0)
+          call recvv_cr(yelm,NGNOD,islice_selected_source(isource),0)
+          call recvv_cr(zelm,NGNOD,islice_selected_source(isource),0)
+        else if( myrank == islice_selected_source(isource) ) then
+          call sendv_cr(xelm,NGNOD,0,0)
+          call sendv_cr(yelm,NGNOD,0,0)
+          call sendv_cr(zelm,NGNOD,0,0)
+        endif
       endif
-      call get_shape3D_single(myrank,shape3D,xil,etal,gammal)
 
-      ! interpolates source locations
-      xmesh = 0.0
-      ymesh = 0.0
-      zmesh = 0.0
-      do ia=1,NGNOD
-        xmesh = xmesh + shape3D(ia)*xelm(ia)
-        ymesh = ymesh + shape3D(ia)*yelm(ia)
-        zmesh = zmesh + shape3D(ia)*zelm(ia)
-      enddo
+      if( myrank == 0 ) then
+        ! get the 3-D shape functions
+        if( USE_FORCE_POINT_SOURCE ) then
+          ! note: we switch xi,eta,gamma range to be [-1,1]
+          ! uses initial guess in xi, eta and gamma
+          xil = xigll(nint(xi_source(isource)))
+          etal = yigll(nint(eta_source(isource)))
+          gammal = zigll(nint(gamma_source(isource)))
+        else
+          xil = xi_source(isource)
+          etal = eta_source(isource)
+          gammal = gamma_source(isource)
+        endif
+        call get_shape3D_single(myrank,shape3D,xil,etal,gammal)
+
+        ! interpolates source locations
+        xmesh = 0.0
+        ymesh = 0.0
+        zmesh = 0.0
+        do ia=1,NGNOD
+          xmesh = xmesh + shape3D(ia)*xelm(ia)
+          ymesh = ymesh + shape3D(ia)*yelm(ia)
+          zmesh = zmesh + shape3D(ia)*zelm(ia)
+        enddo
 
-      ! writes out to VTK file
-      write(IOVTK,*) xmesh,ymesh,zmesh
-    endif
-  enddo ! NSOURCES
+        ! writes out to VTK file
+        write(IOVTK,*) xmesh,ymesh,zmesh
+      endif
+    enddo ! NSOURCES
+  endif
 
   ! receivers
   do irec=1,nrec
@@ -856,15 +913,9 @@ subroutine setup_sources_receivers_VTKfile()
 
     if( myrank == 0 ) then
       ! get the 3-D shape functions
-      if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-        xil = xi_receiver(irec)
-        etal = eta_receiver(irec)
-        gammal = gamma_receiver(irec)
-      else
-        xil = xi_source(irec)
-        etal = eta_source(irec)
-        gammal = gamma_source(irec)
-      endif
+      xil = xi_receiver(irec)
+      etal = eta_receiver(irec)
+      gammal = gamma_receiver(irec)
       call get_shape3D_single(myrank,shape3D,xil,etal,gammal)
 
       ! interpolates receiver locations
@@ -888,22 +939,50 @@ subroutine setup_sources_receivers_VTKfile()
     close(IOVTK)
 
     ! creates additional receiver and source files
-    ! extracts receiver locations
-    filename = trim(OUTPUT_FILES)//'/sr.vtk'
-    filename_new = trim(OUTPUT_FILES)//'/receiver.vtk'
-    write(system_command, &
-  "('awk ',a1,'{if(NR<5) print $0;if(NR==6)print ',a1,'POINTS',i6,' float',a1,';if(NR>5+',i6,')print $0}',a1,' < ',a,' > ',a)")&
+    if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      ! extracts receiver locations
+      filename = trim(OUTPUT_FILES)//'/sr.vtk'
+      filename_new = trim(OUTPUT_FILES)//'/receiver.vtk'
+
+      ! vtk file for receivers only
+      write(system_command, &
+  "('awk ',a1,'{if(NR<5) print $0;if(NR==5)print ',a1,'POINTS',i6,' float',a1,';if(NR>5+',i6,')print $0}',a1,' < ',a,' > ',a)")&
       "'",'"',nrec,'"',NSOURCES,"'",trim(filename),trim(filename_new)
-    call system(system_command)
 
-    ! extracts source locations
-    filename_new = trim(OUTPUT_FILES)//'/source.vtk'
-    write(system_command, &
-  "('awk ',a1,'{if(NR< 6 + ',i6,') print $0}END{print}',a1,' < ',a,' > ',a)")&
-      "'",NSOURCES,"'",trim(filename),trim(filename_new)
-    call system(system_command)
+!daniel:
+! gfortran
+!      call system(trim(system_command),system_command_status)
+! ifort
+!      ret = system(trim(system_command))
+
+      ! extracts source locations
+      !"('awk ',a1,'{if(NR< 6 + ',i6,') print $0}END{print}',a1,' < ',a,' > ',a)")&
+      filename_new = trim(OUTPUT_FILES)//'/source.vtk'
+
+      write(system_command1, &
+  "('awk ',a1,'{if(NR<5) print $0;if(NR==5)print ',a1,'POINTS',i6,' float',a1,';')") &
+        "'",'"',NSOURCES,'"'
 
+      !daniel
+      !print*,'command 1:',trim(system_command1)
 
+      write(system_command2, &
+  "('if(NR>5 && NR <6+',i6,')print $0}END{print ',a,'}',a1,' < ',a,' > ',a)") &
+        NSOURCES,'" "',"'",trim(filename),trim(filename_new)
+
+      !print*,'command 2:',trim(system_command2)
+
+      system_command = trim(system_command1)//trim(system_command2)
+
+      !print*,'command:',trim(system_command)
+!daniel:
+! gfortran
+!      call system(trim(system_command),system_command_status)
+! ifort
+!      ret = system(trim(system_command))
+
+    endif
   endif
 
+
   end subroutine setup_sources_receivers_VTKfile
diff --git a/src/specfem3D/specfem3D_par.f90 b/src/specfem3D/specfem3D_par.f90
index 9de96fc53..ad549bd8f 100644
--- a/src/specfem3D/specfem3D_par.f90
+++ b/src/specfem3D/specfem3D_par.f90
@@ -42,8 +42,32 @@ module specfem_par
 
   implicit none
 
-! attenuation
-  integer :: NSPEC_ATTENUATION_AB
+! parameters deduced from parameters read from file
+  integer :: NPROC
+  integer :: NSPEC_AB, NGLOB_AB
+
+! mesh parameters
+  integer, dimension(:,:,:,:), allocatable :: ibool
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: xstore,ystore,zstore
+
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian
+
+! material properties
+  ! isotropic
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: kappastore,mustore
+
+! CUDA mesh pointer<->integer wrapper
+  integer(kind=8) :: Mesh_pointer
+
+! Global GPU toggle. Set in Par_file
+  logical :: GPU_MODE
+
+! use integer array to store topography values
+  integer :: NX_TOPO,NY_TOPO
+  !double precision :: ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO
+  !character(len=100) :: topo_file
+  integer, dimension(:,:), allocatable :: itopo_bathy
 
 ! absorbing boundary arrays (for all boundaries) - keeps all infos, allowing for irregular surfaces
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: abs_boundary_normal
@@ -59,16 +83,10 @@ module specfem_par
   integer, dimension(:), allocatable :: free_surface_ispec
   integer :: num_free_surface_faces
 
-! mesh parameters
-  integer, dimension(:,:,:,:), allocatable :: ibool
-  real(kind=CUSTOM_REAL), dimension(:), allocatable :: xstore,ystore,zstore
-
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian
+! attenuation
+  integer :: NSPEC_ATTENUATION_AB
+  character(len=256) prname_Q
 
-! material properties
-  ! isotropic
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: kappastore,mustore
 
 ! additional mass matrix for ocean load
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass_ocean_load
@@ -91,7 +109,7 @@ module specfem_par
   double precision, external :: comp_source_time_function
   double precision :: t0
   real(kind=CUSTOM_REAL) :: stf_used_total
-  integer :: NSOURCES
+  integer :: NSOURCES,nsources_local
   ! source encoding
   ! for acoustic sources: takes +/- 1 sign, depending on sign(Mxx)[ = sign(Myy) = sign(Mzz)
   ! since they have to equal in the acoustic setting]
@@ -101,8 +119,9 @@ module specfem_par
   character(len=256) :: rec_filename,filtered_rec_filename,dummystring
   integer :: nrec,nrec_local,nrec_tot_found
   integer :: nrec_simulation
-  integer, allocatable, dimension(:) :: islice_selected_rec,ispec_selected_rec,number_receiver_global
-  double precision, allocatable, dimension(:) :: xi_receiver,eta_receiver,gamma_receiver
+  integer, dimension(:), allocatable :: islice_selected_rec,ispec_selected_rec
+  integer, dimension(:), allocatable :: number_receiver_global
+  double precision, dimension(:), allocatable :: xi_receiver,eta_receiver,gamma_receiver
   double precision, dimension(:,:), allocatable :: hpxir_store,hpetar_store,hpgammar_store
 
 ! timing information for the stations
@@ -137,17 +156,16 @@ module specfem_par
   double precision, external :: wtime
   double precision :: time_start
 
-! parameters read from parameter file
-  integer :: NPROC_XI,NPROC_ETA
-  integer :: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE
+! parameters
   integer :: SIMULATION_TYPE
+  integer :: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE
   integer :: IMODEL
-  
+
   double precision :: DT
-  double precision :: LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX
 
   logical :: ATTENUATION,USE_OLSEN_ATTENUATION, &
             OCEANS,TOPOGRAPHY,ABSORBING_CONDITIONS,ANISOTROPY
+  logical :: GRAVITY
 
   logical :: SAVE_FORWARD,SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION
 
@@ -155,11 +173,11 @@ module specfem_par
 
   integer :: NTSTEP_BETWEEN_OUTPUT_INFO
 
-  character(len=256) OUTPUT_FILES,LOCAL_PATH,prname,prname_Q
+! parameters read from mesh parameter file
+  integer :: NPROC_XI,NPROC_ETA
+  double precision :: LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX
 
-! parameters deduced from parameters read from file
-  integer :: NPROC
-  integer :: NSPEC_AB, NGLOB_AB
+  character(len=256) OUTPUT_FILES,LOCAL_PATH,prname
 
 ! names of the data files for all the processors in MPI
   character(len=256) outputname
@@ -194,13 +212,6 @@ module specfem_par
 ! MPI partition surfaces
   logical, dimension(:), allocatable :: ispec_is_inner
 
-! maximum of the norm of the displacement
-  real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
-  real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
-  real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
-  real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic
-  integer:: Usolidnorm_index(1)
-
 ! maximum speed in velocity model
   real(kind=CUSTOM_REAL):: model_speed_max
 
@@ -210,6 +221,10 @@ module specfem_par
 !!$  real(kind=CUSTOM_REAL) :: weight, jacobianl
 !!!! NL NL REGOLITH
 
+  ! gravity
+  real(kind=CUSTOM_REAL), dimension(:),allocatable :: minus_deriv_gravity,minus_g
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
+
 
 ! ADJOINT parameters
 
@@ -241,9 +256,6 @@ module specfem_par
   ! adjoint elements
   integer :: NSPEC_ADJOINT, NGLOB_ADJOINT
 
-  ! norm of the backward displacement
-   real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
-
   ! length of reading blocks
   integer :: NTSTEP_BETWEEN_READ_ADJSRC
 
@@ -285,6 +297,12 @@ module specfem_par_elastic
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ,veloc,accel
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: accel_adj_coupling
 
+! variables needed for OpenMP version
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: &
+       dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3,&
+       newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3,&
+       tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
 ! mass matrix
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass
 
@@ -308,8 +326,12 @@ module specfem_par_elastic
   integer, dimension(:,:), allocatable :: phase_ispec_inner_elastic
   integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
 
-  logical :: ELASTIC_SIMULATION
+! mesh coloring
+  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+  integer, dimension(:), allocatable :: num_elem_colors_elastic
+  integer :: nspec_elastic
 
+  logical :: ELASTIC_SIMULATION
 
 ! ADJOINT elastic
 
@@ -389,6 +411,11 @@ module specfem_par_acoustic
   integer, dimension(:,:), allocatable :: phase_ispec_inner_acoustic
   integer :: num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic
 
+! mesh coloring
+  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+  integer, dimension(:), allocatable :: num_elem_colors_acoustic
+  integer :: nspec_acoustic
+
   logical :: ACOUSTIC_SIMULATION
 
 ! ADJOINT acoustic
diff --git a/src/specfem3D/write_movie_output.f90 b/src/specfem3D/write_movie_output.f90
index f33bf475d..c03c31ad0 100644
--- a/src/specfem3D/write_movie_output.f90
+++ b/src/specfem3D/write_movie_output.f90
@@ -30,8 +30,32 @@ subroutine write_movie_output()
 
   use specfem_par
   use specfem_par_movie
+  use specfem_par_elastic
+  use specfem_par_acoustic
   implicit none
 
+  ! gets resulting array values onto CPU
+  if(GPU_MODE .and. &
+    ( &
+      EXTERNAL_MESH_CREATE_SHAKEMAP .or. &
+      CREATE_SHAKEMAP .or. &
+      ( MOVIE_SURFACE .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) .or. &
+      ( MOVIE_VOLUME .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) .or. &
+      ( PNM_GIF_IMAGE .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) &
+     ) ) then
+    ! acoustic domains
+    if( ACOUSTIC_SIMULATION ) then
+      ! transfers whole fields
+      call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+                potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
+    endif
+    ! elastic domains
+    if( ELASTIC_SIMULATION ) then
+      ! transfers whole fields
+      call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+    endif
+  endif
+
   ! shakemap creation
   if (EXTERNAL_MESH_CREATE_SHAKEMAP) then
     call wmo_create_shakemap_em()
@@ -126,19 +150,19 @@ subroutine wmo_create_shakemap_em()
                           potential_acoustic, displ_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       ! velocity vector
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_acoustic, veloc_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       ! accel ?
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_dot_acoustic, accel_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
     endif
 
 
@@ -146,7 +170,7 @@ subroutine wmo_create_shakemap_em()
     if (USE_HIGHRES_FOR_MOVIES) then
       do ipoin = 1, NGLLX*NGLLY
         iglob = faces_surface_ext_mesh(ipoin,ispec2D)
-        
+
         ! saves norm of displacement,velocity and acceleration vector
         if( ispec_is_elastic(ispec) ) then
           ! norm of displacement
@@ -286,7 +310,7 @@ subroutine wmo_get_max_vector(ispec,ispec2D,iglob,ipoin, &
   logical :: is_done
 
   is_done = .false.
-  
+
   ! loops over all gll points from this element
   do k=1,NGLLZ
     do j=1,NGLLY
@@ -374,14 +398,14 @@ subroutine wmo_create_movie_surface_em()
                           potential_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       else
         ! velocity vector
         call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       endif
     endif
 
@@ -613,14 +637,14 @@ subroutine wmo_movie_surface_output_o()
                           potential_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       else
         ! velocity vector
         call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_acoustic, val_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       endif
     endif
 
@@ -770,19 +794,19 @@ subroutine wmo_create_shakemap_o()
                           potential_acoustic, displ_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       ! velocity vector
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_acoustic, veloc_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
       ! accel ?
       call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                           potential_dot_dot_acoustic, accel_element,&
                           hprime_xx,hprime_yy,hprime_zz, &
                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                          ibool,rhostore)
+                          ibool,rhostore,GRAVITY)
     endif
 
     ! save all points for high resolution, or only four corners for low resolution
@@ -940,14 +964,14 @@ subroutine wmo_get_max_vector_o(ispec,iglob,ipoin,displ_element,veloc_element,ac
 
   ! velocity vector
   is_done = .false.
-    
+
   ! loops over all gll points from this element
   do k=1,NGLLZ
     do j=1,NGLLY
       do i=1,NGLLX
         ! checks if global point is found
         if( iglob == ibool(i,j,k,ispec) ) then
-        
+
           ! horizontal displacement
           store_val_ux_external_mesh(ipoin) = max(store_val_ux_external_mesh(ipoin),&
                                         abs(displ_element(1,i,j,k)),abs(displ_element(2,i,j,k)))
@@ -1019,7 +1043,7 @@ subroutine wmo_movie_volume_output()
                         potential_dot_acoustic, veloc_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
       velocity_x(:,:,:,ispec) = veloc_element(1,:,:,:)
       velocity_y(:,:,:,ispec) = veloc_element(2,:,:,:)
       velocity_z(:,:,:,ispec) = veloc_element(3,:,:,:)
diff --git a/src/specfem3D/write_seismograms.f90 b/src/specfem3D/write_seismograms.f90
index 3bd8512d9..b03cdf6da 100644
--- a/src/specfem3D/write_seismograms.f90
+++ b/src/specfem3D/write_seismograms.f90
@@ -45,6 +45,50 @@ subroutine write_seismograms()
   real(kind=CUSTOM_REAL):: stf_deltat
   double precision :: stf
 
+  ! TODO: Test and Fix CUDA seismograms code.
+  logical,parameter :: USE_CUDA_SEISMOGRAMS = .false.
+
+  ! gets resulting array values onto CPU
+  if(GPU_MODE) then
+    if( nrec_local > 0 ) then
+      ! this transfers fields only in elements with stations for efficiency
+      if( ACOUSTIC_SIMULATION ) then
+        ! only copy corresponding elements to CPU host
+        ! timing: Elapsed time: 5.230904e-04
+        call transfer_station_ac_from_device( &
+                        potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
+                        b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+                        Mesh_pointer,number_receiver_global, &
+                        ispec_selected_rec,ispec_selected_source,ibool,SIMULATION_TYPE)
+
+        ! alternative: transfers whole fields
+        ! timing: Elapsed time: 4.138947e-03
+        !call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
+        !          potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
+      endif
+
+      ! this transfers fields only in elements with stations for efficiency
+      if( ELASTIC_SIMULATION ) then
+         if(USE_CUDA_SEISMOGRAMS) then
+            call transfer_seismograms_el_from_d(nrec_local,Mesh_pointer, &
+                                               SIMULATION_TYPE,&
+                                               seismograms_d,seismograms_v,seismograms_a,&
+                                               it)
+         else
+            call transfer_station_el_from_device(displ,veloc,accel, &
+                                                b_displ,b_veloc,b_accel, &
+                                                Mesh_pointer,number_receiver_global, &
+                                                ispec_selected_rec,ispec_selected_source, &
+                                                ibool,SIMULATION_TYPE)
+         endif
+        ! alternative: transfers whole fields
+        !  call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+      endif
+    endif
+  endif
+
+  if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then
+
   do irec_local = 1,nrec_local
 
     ! gets global number of that receiver
@@ -57,7 +101,8 @@ subroutine write_seismograms()
     hgammar(:) = hgammar_store(irec_local,:)
 
     ! forward simulations
-    if (SIMULATION_TYPE == 1)  then
+    select case( SIMULATION_TYPE )
+    case( 1 )
 
       ! receiver's spectral element
       ispec = ispec_selected_rec(irec)
@@ -79,13 +124,13 @@ subroutine write_seismograms()
                         potential_acoustic, displ_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
         ! velocity vector
         call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                         potential_dot_acoustic, veloc_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
 
         ! interpolates displ/veloc/pressure at receiver locations
         call compute_interpolated_dva_ac(displ_element,veloc_element,&
@@ -109,7 +154,7 @@ subroutine write_seismograms()
       endif !poroelastic
 
     !adjoint simulations
-    else if (SIMULATION_TYPE == 2) then
+    case( 2 )
 
       ! adjoint source is placed at receiver
       ispec = ispec_selected_source(irec)
@@ -166,13 +211,13 @@ subroutine write_seismograms()
                         potential_acoustic, displ_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
         ! velocity vector
         call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
                         potential_dot_acoustic, veloc_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
 
         ! interpolates displ/veloc/pressure at receiver locations
         call compute_interpolated_dva_ac(displ_element,veloc_element,&
@@ -185,7 +230,7 @@ subroutine write_seismograms()
       endif ! acoustic
 
     !adjoint simulations
-    else if (SIMULATION_TYPE == 3) then
+    case( 3 )
 
       ispec = ispec_selected_rec(irec)
 
@@ -206,13 +251,13 @@ subroutine write_seismograms()
                         b_potential_acoustic, displ_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
         ! backward fields: velocity vector
         call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
                         b_potential_dot_acoustic, veloc_element,&
                         hprime_xx,hprime_yy,hprime_zz, &
                         xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                        ibool,rhostore)
+                        ibool,rhostore,GRAVITY)
 
         ! backward fields: interpolates displ/veloc/pressure at receiver locations
         call compute_interpolated_dva_ac(displ_element,veloc_element,&
@@ -224,10 +269,10 @@ subroutine write_seismograms()
                         dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
       endif ! acoustic
 
-    endif ! SIMULATION_TYPE
+    end select ! SIMULATION_TYPE
 
-! store North, East and Vertical components
-! distinguish between single and double precision for reals
+    ! store North, East and Vertical components
+    ! distinguish between single and double precision for reals
     if(CUSTOM_REAL == SIZE_REAL) then
       seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
       seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
@@ -243,23 +288,22 @@ subroutine write_seismograms()
 
   enddo ! nrec_local
 
-! write the current or final seismograms
+  endif
+
+  ! write the current or final seismograms
   if((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == NSTEP) .and. (.not.SU_FORMAT)) then
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      call write_seismograms_to_file(myrank,seismograms_d,number_receiver_global,station_name, &
-            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1,SIMULATION_TYPE)
-      call write_seismograms_to_file(myrank,seismograms_v,number_receiver_global,station_name, &
-            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,2,SIMULATION_TYPE)
-      call write_seismograms_to_file(myrank,seismograms_a,number_receiver_global,station_name, &
-            network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,3,SIMULATION_TYPE)
+      call write_seismograms_to_file(seismograms_d,1)
+      call write_seismograms_to_file(seismograms_v,2)
+      call write_seismograms_to_file(seismograms_a,3)
     else
       call write_adj_seismograms_to_file(myrank,seismograms_d,number_receiver_global, &
             nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1)
     endif
   endif
 
-! write ONE binary file for all receivers (nrec_local) within one proc
-! SU format, with 240-byte-header for each trace
+  ! write ONE binary file for all receivers (nrec_local) within one proc
+  ! SU format, with 240-byte-header for each trace
   if ((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it==NSTEP) .and. SU_FORMAT) &
      call write_output_SU()
 
@@ -271,28 +315,19 @@ end subroutine write_seismograms
 
 ! write seismograms to text files
 
-  subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, &
-               station_name,network_name,nrec,nrec_local, &
-               it,DT,NSTEP,t0,LOCAL_PATH,istore,SIMULATION_TYPE)
+  subroutine write_seismograms_to_file(seismograms,istore)
 
-  implicit none
-
-  include "constants.h"
+  use constants
+  use specfem_par,only: &
+          myrank,number_receiver_global,station_name,network_name, &
+          nrec,nrec_local,islice_selected_rec, &
+          it,DT,NSTEP,t0,LOCAL_PATH,SIMULATION_TYPE
 
-  integer :: NSTEP,it
-  integer :: nrec,nrec_local
-  integer :: myrank,istore
-  integer :: SIMULATION_TYPE
+  implicit none
 
-  integer, dimension(nrec_local) :: number_receiver_global
+  integer :: istore
   real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,NSTEP) :: seismograms
 
-  double precision t0,DT
-
-  character(len=256) LOCAL_PATH
-  character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name
-  character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name
-
   ! local parameters
   integer irec,irec_local
   integer irecord
@@ -303,6 +338,8 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: one_seismogram
   integer :: nrec_local_received,NPROCTOT,total_seismos,receiver,sender
   integer :: iproc,ier
+  integer,dimension(1) :: tmp_nrec_local_received,tmp_irec,tmp_nrec_local
+  integer,dimension(:),allocatable:: islice_num_rec_local
 
   ! saves displacement, velocity or acceleration
   if(istore == 1) then
@@ -350,12 +387,27 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,
 
       ! loop on all the slices
       call world_size(NPROCTOT)
+
+      ! counts number of local receivers for each slice
+      allocate(islice_num_rec_local(0:NPROCTOT-1),stat=ier)
+      if( ier /= 0 ) call exit_mpi(myrank,'error allocating islice_num_rec_local')
+      islice_num_rec_local(:) = 0
+      do irec = 1,nrec
+        iproc = islice_selected_rec(irec)
+        islice_num_rec_local(iproc) = islice_num_rec_local(iproc) + 1
+      enddo
+
+      ! loops on all the slices
       do iproc = 0,NPROCTOT-1
 
+        ! communicate only with processes which contain local receivers
+        if( islice_num_rec_local(iproc) == 0 ) cycle
+
         ! receive except from proc 0, which is me and therefore I already have this value
         sender = iproc
         if(iproc /= 0) then
-          call recv_i(nrec_local_received,1,sender,itag)
+          call recv_i(tmp_nrec_local_received,1,sender,itag)
+          nrec_local_received = tmp_nrec_local_received(1)
           if(nrec_local_received < 0) call exit_MPI(myrank,'error while receiving local number of receivers')
         else
           nrec_local_received = nrec_local
@@ -369,7 +421,8 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,
               irec = number_receiver_global(irec_local)
               one_seismogram(:,:) = seismograms(:,irec_local,:)
             else
-              call recv_i(irec,1,sender,itag)
+              call recv_i(tmp_irec,1,sender,itag)
+              irec = tmp_irec(1)
               if(irec < 1 .or. irec > nrec) call exit_MPI(myrank,'error while receiving global receiver number')
 
               call recvv_cr(one_seismogram,NDIM*NSTEP,sender,itag)
@@ -389,6 +442,7 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,
           enddo ! nrec_local_received
         endif ! if(nrec_local_received > 0 )
       enddo ! NPROCTOT-1
+      deallocate(islice_num_rec_local)
 
       write(IMAIN,*) 'Component: .sem'//component
       write(IMAIN,*) '  total number of receivers saved is ',total_seismos,' out of ',nrec
@@ -398,12 +452,14 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,
 
     else  ! on the nodes, send the seismograms to the master
        receiver = 0
-       call send_i(nrec_local,1,receiver,itag)
+       tmp_nrec_local(1) = nrec_local
+       call send_i(tmp_nrec_local,1,receiver,itag)
        if (nrec_local > 0) then
          do irec_local = 1,nrec_local
            ! get global number of that receiver
            irec = number_receiver_global(irec_local)
-           call send_i(irec,1,receiver,itag)
+           tmp_irec(1) = irec
+           call send_i(tmp_irec,1,receiver,itag)
 
            ! sends seismogram of that receiver
            one_seismogram(:,:) = seismograms(:,irec_local,:)
diff --git a/utils/Cluster/pbs/valgrind_go_solver_pbs.bash b/utils/Cluster/pbs/valgrind_go_solver_pbs.bash
new file mode 100755
index 000000000..3776b8e32
--- /dev/null
+++ b/utils/Cluster/pbs/valgrind_go_solver_pbs.bash
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Valgrind, a suite of tools for debugging and profiling
+# http://valgrind.org/
+#
+
+# bash script
+#PBS -S /bin/bash
+
+# job name 
+#PBS -N valgrind_go_solver
+
+# joins output and error information
+#PBS -j oe
+
+# job output file
+#PBS -o in_out_files/OUTPUT_FILES/job.o
+
+###########################################################
+# USER PARAMETERS
+
+# Queue
+#PBS -q tromp
+
+# 150 CPUs ( 18*8+6 ), walltime 15 hour
+#PBS -l nodes=18:ppn=8+1:ppn=6,walltime=15:00:00
+
+# valgrind mpi library 
+PRELOAD_LIB=/my_valgrind_path/valgrind/lib/valgrind/libmpiwrap-x86-linux.so
+
+###########################################################
+
+cd $PBS_O_WORKDIR
+
+# script to run the mesher and the solver
+# read Par_file to get information about the run
+# compute total number of nodes needed
+NPROC=`grep NPROC in_data_files/Par_file | cut -d = -f 2 `
+
+# total number of nodes is the product of the values read
+numnodes=$NPROC
+
+mkdir -p in_out_files/OUTPUT_FILES
+
+# backup files used for this simulation
+cp in_data_files/Par_file in_out_files/OUTPUT_FILES/
+cp in_data_files/STATIONS in_out_files/OUTPUT_FILES/
+cp in_data_files/CMTSOLUTION in_out_files/OUTPUT_FILES/
+
+# obtain job information
+cat $PBS_NODEFILE > in_out_files/OUTPUT_FILES/compute_nodes
+echo "$PBS_JOBID" > in_out_files/OUTPUT_FILES/jobid
+
+echo starting run in current directory $PWD
+cd bin/
+
+echo " "
+echo "run: memory leaks"
+echo " "
+sleep 2
+
+# memory leaks
+LD_PRELOAD=$PRELOAD_LIB mpiexec -np $numnodes valgrind --leak-check=full ./xspecfem3D >& ../in_out_files/OUTPUT_FILES/output.memory-leaks.log
+
+sleep 2
+echo " "
+echo "run: cache misses"
+echo " "
+
+# cache misses
+LD_PRELOAD=$PRELOAD_LIB mpiexec -np $numnodes valgrind --tool=cachegrind ./xspecfem3D >& ../in_out_files/OUTPUT_FILES/output.cache-misses.log
+
+cp cachegrind.out.* ../in_out_files/OUTPUT_FILES/
+
+echo " "
+echo "finished successfully"
+
diff --git a/utils/create_specfem3D_gpu_cuda_method_stubs.pl b/utils/create_specfem3D_gpu_cuda_method_stubs.pl
new file mode 100755
index 000000000..5b0c14f0d
--- /dev/null
+++ b/utils/create_specfem3D_gpu_cuda_method_stubs.pl
@@ -0,0 +1,141 @@
+#!/usr/bin/perl
+
+#
+#  Script to extract the function declarations in cuda files
+#
+#
+# usage: ./ceate_specfem3D_gpu_cuda_method_stubs.pl 
+#             run in directory root SPECFEM3D/
+#
+
+$outfile = "src/cuda/specfem3D_gpu_cuda_method_stubs.c";
+
+
+open(IOUT,"> _____temp_tutu_____");
+
+$header = <<END;
+/* 
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+
+typedef float realw;
+
+END
+
+
+$warning = <<END;
+ fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\\n");
+ exit(1);
+END
+
+print IOUT "$header \n";
+
+$success = 0;
+
+@objects = `ls src/cuda/*.cu`;
+
+foreach $name (@objects) {  
+  chop $name;
+  print "extracting word in file $name ...\n";
+
+  print IOUT "\n//\n// $name\n//\n\n";  
+  
+  # change tabs to white spaces
+  system("expand -2 < $name > _____temp_tutu01_____");  
+  open(IIN,"<_____temp_tutu01_____");
+
+  
+  # open the source file
+  $success = 1;
+  $do_extract = 0;
+  while($line = <IIN>) {
+    chop $line;
+    
+    # suppress trailing white spaces and carriage return
+    $line =~ s/\s*$//;
+    
+    # change the version number and copyright information
+    #    $line =~ s#\(c\) California Institute of Technology and University of Pau, October 2007#\(c\) California Institute of Technology and University of Pau, November 2007#og;
+    #    $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og;
+    
+    if($line =~ /extern "C"/){
+      # new function declaration starts  
+      #print "$line\n";
+      if( $line =~/FC_FUNC/ ){ 
+        # function declaration on same line as extern, ask for line skip
+        print "problem: please add a line break after extern 'C' here:";
+        print "$line\n";
+        $success = 0;
+        close(IIN);  
+        exit;
+      }
+      $do_extract = 1;
+      next;          
+    }
+    
+    # extract section
+    if($do_extract == 1 ){
+      # function declaration
+      if($line =~ /{/){
+        # function declaration ends
+        if( $line =~ /PREPARE_CUDA_DEVICE/ ){
+          # adds warning
+          print IOUT "$line \n$warning\} \n\n";
+        }else{
+          print IOUT "$line\} \n\n";
+        }
+        $do_extract = 0;
+      }else{
+        # write line to the output file
+        print IOUT "$line\n";  
+      }
+      next;
+    }
+  }
+  close(IIN);  
+
+  if( $success == 0 ){ exit; }
+}
+
+close(IOUT);
+system("rm -f _____temp_tutu01_____");
+
+# creates new stubs file if successful
+if( $success == 1 ){
+  print "\n\nsuccessfully extracted declarations \n\n";
+  system("cp -p $outfile $outfile.bak");
+  system("cp -p _____temp_tutu_____ $outfile");
+  print "created new: $outfile \n";
+}
+system("rm -f _____temp_tutu_____");
+
+
diff --git a/utils/readme_cuda_cscs.txt b/utils/readme_cuda_cscs.txt
new file mode 100644
index 000000000..025321244
--- /dev/null
+++ b/utils/readme_cuda_cscs.txt
@@ -0,0 +1,63 @@
+
+GPU_MODE Notes (by Max Rietmann)
+
+src/shared/constants.h:
+
+Change the following settings (for NOISE simulations):
+
+! sources and receivers Z coordinates given directly instead of with depth
+  logical, parameter :: USE_SOURCES_RECVS_Z = .true.
+
+! the seismograms are normal to surface
+! Z record corresponds to the normal, while E and N are two tangent vectors
+! that completes an orthonormal.
+  logical, parameter :: EXT_MESH_RECV_NORMAL = .true.
+
+Settings for Eiger
+./configure --with-cuda FC=mpif90 MPIFC=mpif90 CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include
+./configure --with-cuda FC=mpif90 MPIFC=mpif90 FLAGS_CHECK="-O3 -ffree-line-length-none -fopenmp" FLAGS_NO_CHECK="-Ofast -mfpmath=sse -funroll-loops -ffree-line-length-none -fopenmp" CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include
+
+./configure --with-cuda FC=mpif90 MPIFC=mpif90 FLAGS_CHECK="`echo $ICC_CHECK`" FLAGS_NO_CHECK='`echo $ICC_FCFLAGS`' CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include
+
+NOTE: Need to add below to get nvcc to use the older version of gcc, as it doesn't support gcc4.5 or above.
+--compiler-bindir /usr/bin/gcc-4.3
+
+ CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include
+
+
+Settings for todi.cscs.ch
+export GNU_FCFLAGS="-Ofast -mfpmath=sse -funroll-loops -ffree-line-length-none"
+export GNU_CHECK="-O3 -ffree-line-length-none"
+export DEBUG_FLAGS="-g -fbacktrace"
+export ICC_FCFLAGS="-O3 -fp-model fast=2 -x SSE4.2 -ftz -funroll-loops -unroll5"
+export ICC_FCFLAGS="-O3 -fp-model fast=2 -funroll-loops -unroll5 -msse3 -ftree-vectorize"
+export ICC_CHECK="-O2"
+export CRAY_FCFLAGS="-eF -em -rm -O3,fp3"
+export CRAY_CHECK="-eF -em -rm"
+
+# Cray configure
+./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $CRAY_CHECK`" FLAGS_NO_CHECK='`echo $CRAY_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include
+# GNU configure
+./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $GNU_CHECK`" FLAGS_NO_CHECK='`echo $GNU_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include
+# Intel convifugre
+./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $ICC_CHECK`" FLAGS_NO_CHECK='`echo $ICC_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include
+
+./configure CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $CRAY_CHECK`" FLAGS_NO_CHECK='`echo $CRAY_FCFLAGS`' MPI_INC=-I$MPICH_DIR/include
+
+suggested compiler options:
+intel: -O3 -fp-model fast=2 -x SSE4.2 -ftz -funroll-loops -unroll5
+gnu: -Ofast -mfpmath=sse -funroll-loops
+pgi: 
+####################################
+Notes from Jeff Poznanovic:
+
+Configure and build with:
+
+module load cuda cudatools cudasdk
+module load PrgEnv-cray
+./configure --with-cuda --with-mpi MPIFC=ftn MPICC=cc FC=ftn CC=cc FCFLAGS="-eF -em -rm" CFLAGS="-h list=m" MPI_INC=-I/opt/cray/mpt/default/xt/gemini/mpich2-cray/73/include
+
+ 
+make
+
+
diff --git a/utils/remake_makefiles.sh b/utils/remake_makefiles.sh
new file mode 100755
index 000000000..06529e545
--- /dev/null
+++ b/utils/remake_makefiles.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+rm -r autom4te.cache
+aclocal -I ./m4/
+echo "autoconf!"
+autoconf configure.ac > test_conf
+if [ $? -eq 0 ]
+then
+    echo "configure!"
+    chmod +x ./test_conf
+    # example on guinan, which has "normal" defaults for a workstation or server
+    ./test_conf MPIFC=mpif90 FC=mpif90 CUDA_LIB="-L/usr/local/cuda/lib64/" MPI_INC="-I/usr/include/mpich2/" --with-cuda CUDA_INC="-I/usr/local/cuda/include"
+fi
diff --git a/utils/update_headers_change_word_f90.pl b/utils/update_headers_change_word_f90.pl
new file mode 100755
index 000000000..90159f7ff
--- /dev/null
+++ b/utils/update_headers_change_word_f90.pl
@@ -0,0 +1,114 @@
+#!/usr/bin/perl
+
+#
+#  Script to change the version number in f90 codes
+#
+#  Author : Dimitri Komatitsch, EPS - Harvard University, May 1998
+#
+
+#
+# read all f90 and F90 (and *.h) files in the current directory
+# f90 files are supposed to have the extension "*.f90" or "*.F90" or "*.h"
+#
+
+#
+# known bug : does the changes also in constant strings, and not only
+# in the code (which is dangerous, but really easier to program...)
+#
+
+#
+# usage: ./update_headers_change_word_f90.pl 
+#             run in directory root SPECFEM3D/
+#
+
+
+@objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu`;
+
+$nlines_total = 0;
+$nlines_noblank = 0;
+$nlines_nocomment = 0;
+
+foreach $name (@objects) {
+  chop $name;
+
+  
+  # change tabs to white spaces
+  if( 1 == 1 ){
+    system("expand -2 < $name > _____tutu01_____");
+    $f90name = $name;
+    print STDOUT "Changing word in file $name ...\n";
+
+    open(FILEF77,"<_____tutu01_____");
+    open(FILEF90,">$f90name");
+
+    # open the source file
+    while($line = <FILEF77>) {
+      chop $line;
+
+      # suppress trailing white spaces and carriage return
+      $line =~ s/\s*$//;
+
+      # change the version number and copyright information
+      #    $line =~ s#\(c\) California Institute of Technology and University of Pau, October 2007#\(c\) California Institute of Technology and University of Pau, November 2007#og;
+      #    $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og;
+
+      # write the modified line to the output file
+      print FILEF90 "$line\n";
+
+    }
+
+    close(FILEF77);
+    close(FILEF90);
+  }
+
+  # line count
+  if( 1 == 0 ){
+    print STDOUT "file $name : \n";
+    
+    # counts all lines in file
+    $l = `wc -l $name | awk '{print \$1}'`;
+    chomp $l;
+    print "  lines = $l  \n";
+
+    # without blank lines
+    $lb = 0;
+    # without comments
+    $lc = 0;
+    system("expand -2 < $name > _____tutu01_____");
+    open(FILEF77,"<_____tutu01_____");
+    # open the source file
+    while($line = <FILEF77>) {
+      chop $line;      
+      chomp $line;
+      # remove whitespace at start
+      $line =~ s/^\s+//;
+      # remove whitespace at end
+      $line =~ s/\s+$//;
+      # write the modified line to the output file
+      if( $line ne ""){ $lb = $lb + 1; }
+      if( ($line ne "") && (substr($line,0,1) ne "!") && (substr($line,0,1) ne "/") ){ $lc = $lc + 1; }
+    }
+    close(FILEF77);    
+    print "  lines (no blank) = $lb \n";
+    print "  lines (no comment) = $lc  \n";
+
+    # summations
+    $nlines_total = $nlines_total + $l;
+    $nlines_noblank = $nlines_noblank + $lb;
+    $nlines_nocomment = $nlines_nocomment + $lc;
+    print "  total = $nlines_total \n";
+    print "  total (no blank) = $nlines_noblank \n";
+    print "  total (no comment) = $nlines_nocomment \n";
+  }  
+}
+
+#line count output
+if( 1 == 0 ){
+  print "\ntotal number of lines: \n";
+  print "  lines = $nlines_total \n";
+  print "  lines (no blank) = $nlines_noblank \n";
+  print "  lines (no comment) = $nlines_nocomment \n\n";
+}
+
+system("rm -f _____tutu01_____");
+