diff --git a/src/cuda/check_fields_cuda.cu b/src/cuda/check_fields_cuda.cu index bd11abd61..f06639c29 100644 --- a/src/cuda/check_fields_cuda.cu +++ b/src/cuda/check_fields_cuda.cu @@ -752,8 +752,7 @@ __global__ void get_maximum_kernel(realw* array, int size, realw* d_max){ extern "C" void FC_FUNC_(get_norm_acoustic_from_device, GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, - long* Mesh_pointer_f, - int* SIMULATION_TYPE) { + long* Mesh_pointer_f) { TRACE("get_norm_acoustic_from_device"); //double start_time = get_time(); @@ -816,9 +815,9 @@ TRACE("get_norm_acoustic_from_device"); dim3 grid(num_blocks_x,num_blocks_y); dim3 threads(blocksize,1,1); - if(*SIMULATION_TYPE == 1 ){ + if(mp->simulation_type == 1 ){ get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,size,d_max); -}else if(*SIMULATION_TYPE == 3 ){ +}else if(mp->simulation_type == 3 ){ get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max); } @@ -925,8 +924,7 @@ __global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){ extern "C" void FC_FUNC_(get_norm_elastic_from_device, GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, - long* Mesh_pointer_f, - int* SIMULATION_TYPE) { + long* Mesh_pointer_f) { TRACE("get_norm_elastic_from_device"); //double start_time = get_time(); @@ -957,9 +955,9 @@ void FC_FUNC_(get_norm_elastic_from_device, dim3 grid(num_blocks_x,num_blocks_y); dim3 threads(blocksize,1,1); - if(*SIMULATION_TYPE == 1 ){ + if(mp->simulation_type == 1 ){ get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,size,d_max); -}else if(*SIMULATION_TYPE == 3 ){ +}else if(mp->simulation_type == 3 ){ get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,size,d_max); } diff --git a/src/cuda/compute_add_sources_acoustic_cuda.cu b/src/cuda/compute_add_sources_acoustic_cuda.cu index 14686f87b..756f4eacd 100644 --- a/src/cuda/compute_add_sources_acoustic_cuda.cu +++ b/src/cuda/compute_add_sources_acoustic_cuda.cu @@ -99,7 +99,6 @@ void FC_FUNC_(compute_add_sources_ac_cuda, COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, int* NSOURCESf, - int* SIMULATION_TYPEf, double* h_stf_pre_compute, int* myrankf) { @@ -153,7 +152,6 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda, COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, int* NSOURCESf, - int* SIMULATION_TYPEf, double* h_stf_pre_compute, int* myrankf) { diff --git a/src/cuda/compute_coupling_cuda.cu b/src/cuda/compute_coupling_cuda.cu index ce420d857..fc0c87bef 100644 --- a/src/cuda/compute_coupling_cuda.cu +++ b/src/cuda/compute_coupling_cuda.cu @@ -117,15 +117,13 @@ extern "C" void FC_FUNC_(compute_coupling_ac_el_cuda, COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* num_coupling_ac_el_facesf, - int* SIMULATION_TYPEf) { + int* num_coupling_ac_el_facesf) { TRACE("compute_coupling_ac_el_cuda"); //double start_time = get_time(); Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container int phase_is_inner = *phase_is_innerf; int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf; - int SIMULATION_TYPE = *SIMULATION_TYPEf; // way 1: exact blocksize to match NGLLSQUARE int blocksize = NGLL2; @@ -152,7 +150,7 @@ void FC_FUNC_(compute_coupling_ac_el_cuda, phase_is_inner); // adjoint simulations - if (SIMULATION_TYPE == 3 ){ + if (mp->simulation_type == 3 ){ compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_b_displ, mp->d_b_potential_dot_dot_acoustic, num_coupling_ac_el_faces, @@ -279,15 +277,13 @@ extern "C" void FC_FUNC_(compute_coupling_el_ac_cuda, 
COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* num_coupling_ac_el_facesf, - int* SIMULATION_TYPEf) { + int* num_coupling_ac_el_facesf) { TRACE("compute_coupling_el_ac_cuda"); //double start_time = get_time(); Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container int phase_is_inner = *phase_is_innerf; int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf; - int SIMULATION_TYPE = *SIMULATION_TYPEf; // way 1: exact blocksize to match NGLLSQUARE int blocksize = 25; @@ -319,7 +315,7 @@ void FC_FUNC_(compute_coupling_el_ac_cuda, mp->d_displ); // adjoint simulations - if (SIMULATION_TYPE == 3 ){ + if (mp->simulation_type == 3 ){ compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic, mp->d_b_accel, num_coupling_ac_el_faces, @@ -343,3 +339,136 @@ void FC_FUNC_(compute_coupling_el_ac_cuda, exit_on_cuda_error("compute_coupling_el_ac_cuda"); #endif } + +/* ----------------------------------------------------------------------------------------------- */ + +/* OCEANS load on free surface */ + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void compute_coupling_ocean_cuda_kernel(realw* accel, + realw* rmassx,realw* rmassy,realw* rmassz, + realw* rmass_ocean_load, + int num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + realw* free_surface_normal, + int* ibool, + int* updated_dof_ocean_load) { + // gets spectral element face id + int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1) + int iface = blockIdx.x + gridDim.x*blockIdx.y; + realw nx,ny,nz; + realw force_normal_comp; + + // for all faces on free surface + if( iface < num_free_surface_faces ){ + + int ispec = free_surface_ispec[iface]-1; + + // gets global point index + int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface) + int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; + + //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob); + + // only update this global point once + + // daniel: TODO - there might be better ways to implement a mutex like below, + // and find a workaround to not use the temporary update array. + // atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point + + if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){ + + // get normal + nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface) + ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; + nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; + + // make updated component of right-hand side + // we divide by rmass() which is 1 / M + // we use the total force which includes the Coriolis term above + force_normal_comp = accel[iglob*3]*nx / rmassx[iglob] + accel[iglob*3+1]*ny / rmassy[iglob] + accel[iglob*3+2]*nz / rmassz[iglob]; + + // probably wouldn't need atomicAdd anymore, but just to be sure... 
+ atomicAdd(&accel[iglob*3], + (rmass_ocean_load[iglob] - rmassx[iglob]) * force_normal_comp * nx); + atomicAdd(&accel[iglob*3+1], + (rmass_ocean_load[iglob] - rmassy[iglob]) * force_normal_comp * ny); + atomicAdd(&accel[iglob*3+2], + (rmass_ocean_load[iglob] - rmassz[iglob]) * force_normal_comp * nz); + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_coupling_ocean_cuda, + COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) { + + TRACE("compute_coupling_ocean_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // checks if anything to do + if( mp->num_free_surface_faces == 0 ) return; + + // block sizes: exact blocksize to match NGLLSQUARE + int blocksize = NGLL2; + + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + + // initializes temporary array to zero + print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, + sizeof(int)*mp->NGLOB_AB),88501); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("before kernel compute_coupling_ocean_cuda"); +#endif + + compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, + mp->d_rmassx,mp->d_rmassy,mp->d_rmassz, + mp->d_rmass_ocean_load, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_free_surface_normal, + mp->d_ibool, + mp->d_updated_dof_ocean_load); + // for backward/reconstructed potentials + if(mp->simulation_type == 3) { + // re-initializes array + print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, + sizeof(int)*mp->NGLOB_AB),88502); + + compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, + mp->d_rmassx,mp->d_rmassy,mp->d_rmassz, + mp->d_rmass_ocean_load, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_free_surface_normal, + mp->d_ibool, + mp->d_updated_dof_ocean_load); + + } + + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_coupling_ocean_cuda"); +#endif +} + diff --git a/src/cuda/compute_forces_acoustic_cuda.cu b/src/cuda/compute_forces_acoustic_cuda.cu index 149aef66d..85f2604d6 100644 --- a/src/cuda/compute_forces_acoustic_cuda.cu +++ b/src/cuda/compute_forces_acoustic_cuda.cu @@ -538,7 +538,6 @@ __global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute, /* ----------------------------------------------------------------------------------------------- */ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, - int SIMULATION_TYPE, int* d_ibool, realw* d_xix, realw* d_xiy, @@ -599,7 +598,7 @@ void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, d_kappastore, mp->d_wgll_cube); - if(SIMULATION_TYPE == 3) { + if(mp->simulation_type == 3) { Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute, mp->NGLOB_AB, d_ibool, @@ -647,8 +646,7 @@ void FC_FUNC_(compute_forces_acoustic_cuda, COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f, int* iphase, int* nspec_outer_acoustic, - int* nspec_inner_acoustic, - int* SIMULATION_TYPE) { + int* nspec_inner_acoustic) { TRACE("compute_forces_acoustic_cuda"); //double start_time = get_time(); @@ -700,7 +698,6 @@ void FC_FUNC_(compute_forces_acoustic_cuda, nb_blocks_to_compute = 
mp->h_num_elem_colors_acoustic[icolor]; Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase, - *SIMULATION_TYPE, mp->d_ibool + color_offset_nonpadded, mp->d_xix + color_offset, mp->d_xiy + color_offset, @@ -724,7 +721,6 @@ void FC_FUNC_(compute_forces_acoustic_cuda, // no mesh coloring: uses atomic updates Kernel_2_acoustic(num_elements, mp, *iphase, - *SIMULATION_TYPE, mp->d_ibool, mp->d_xix, mp->d_xiy, @@ -741,129 +737,6 @@ void FC_FUNC_(compute_forces_acoustic_cuda, } } -/* ----------------------------------------------------------------------------------------------- */ - -/* KERNEL 3 */ - -/* ----------------------------------------------------------------------------------------------- */ - - -__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic, - int size, - realw* rmass_acoustic) { - int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; - - /* because of block and grid sizing problems, there is a small */ - /* amount of buffer at the end of the calculation */ - if(id < size) { - // multiplies pressure with the inverse of the mass matrix - potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id]; - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic, - realw* potential_dot_dot_acoustic, - int size, - realw deltatover2, - realw* rmass_acoustic) { - int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; - - /* because of block and grid sizing problems, there is a small */ - /* amount of buffer at the end of the calculation */ - if(id < size) { - // Newmark time scheme: corrector term - potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id]; - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -extern "C" -void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( - long* Mesh_pointer, - int* size_F, - int* SIMULATION_TYPE) { - -TRACE("kernel_3_a_acoustic_cuda"); - - Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper - int size = *size_F; - - int blocksize = BLOCKSIZE_KERNEL3; - int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; - int num_blocks_x = size_padded/blocksize; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); - - kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic, - size, - mp->d_rmass_acoustic); - - if(*SIMULATION_TYPE == 3) { - kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic, - size, - mp->d_rmass_acoustic); - } - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); - exit_on_cuda_error("after kernel 3 a"); -#endif -} - -/* ----------------------------------------------------------------------------------------------- */ - -extern "C" -void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( - long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE, - realw* b_deltatover2_F) { - -TRACE("kernel_3_b_acoustic_cuda"); - - Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper - int size = *size_F; - realw 
deltatover2 = *deltatover2_F; - realw b_deltatover2 = *b_deltatover2_F; - - int blocksize = BLOCKSIZE_KERNEL3; - int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; - int num_blocks_x = size_padded/blocksize; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); - - kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic, - mp->d_potential_dot_dot_acoustic, - size, deltatover2, - mp->d_rmass_acoustic); - - if(*SIMULATION_TYPE == 3) { - kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic, - mp->d_b_potential_dot_dot_acoustic, - size, b_deltatover2, - mp->d_rmass_acoustic); - } - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); - exit_on_cuda_error("after kernel 3 b"); -#endif -} - /* ----------------------------------------------------------------------------------------------- */ @@ -915,8 +788,7 @@ __global__ void enforce_free_surface_cuda_kernel( extern "C" void FC_FUNC_(acoustic_enforce_free_surf_cuda, ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, - int* ABSORB_FREE_SURFACE) { + int* ABSORB_FREE_SURFACE) { TRACE("acoustic_enforce_free_surf_cuda"); @@ -947,7 +819,7 @@ TRACE("acoustic_enforce_free_surf_cuda"); mp->d_ibool, mp->d_ispec_is_acoustic); // for backward/reconstructed potentials - if(*SIMULATION_TYPE == 3) { + if(mp->simulation_type == 3) { enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic, mp->d_b_potential_dot_acoustic, mp->d_b_potential_dot_dot_acoustic, diff --git a/src/cuda/compute_forces_elastic_cuda.cu b/src/cuda/compute_forces_elastic_cuda.cu index bd4d95110..62df527fc 100644 --- a/src/cuda/compute_forces_elastic_cuda.cu +++ b/src/cuda/compute_forces_elastic_cuda.cu @@ -160,7 +160,6 @@ void FC_FUNC_(transfer_boundary_from_device_a, if( mp->size_mpi_buffer > 0 ){ -//daniel: todo - check below with this... int blocksize = BLOCKSIZE_TRANSFER; int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; int num_blocks_x = size_padded/blocksize; @@ -172,20 +171,6 @@ void FC_FUNC_(transfer_boundary_from_device_a, dim3 grid(num_blocks_x,num_blocks_y); dim3 threads(blocksize,1,1); -/* -//daniel: todo - check originally used... 
- int num_blocks_x = *nspec_outer_elastic; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - - int blocksize = NGLL3_PADDED; - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); -*/ - prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer, mp->num_interfaces_ext_mesh, mp->max_nibool_interfaces_ext_mesh, @@ -200,9 +185,6 @@ void FC_FUNC_(transfer_boundary_from_device_a, cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer, mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream); - // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer, - // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw), - // cudaMemcpyDeviceToHost,mp->compute_stream); } } @@ -1265,7 +1247,7 @@ __global__ void Kernel_2_impl(int nb_blocks_to_compute, /* ----------------------------------------------------------------------------------------------- */ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase, - int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE, + int COMPUTE_AND_STORE_STRAIN, int ATTENUATION,int ANISOTROPY, int* d_ibool, realw* d_xix, @@ -1374,7 +1356,7 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase, d_epsilondev_xz, d_epsilondev_yz, d_epsilon_trace_over_3, - SIMULATION_TYPE, + mp->simulation_type, ATTENUATION,mp->NSPEC_AB, d_one_minus_sum_beta, d_factor_common, @@ -1409,7 +1391,7 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase, mp->d_wgll_cube); - if(SIMULATION_TYPE == 3) { + if(mp->simulation_type == 3) { Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute, mp->NGLOB_AB, d_ibool, @@ -1432,7 +1414,7 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase, d_b_epsilondev_xz, d_b_epsilondev_yz, d_b_epsilon_trace_over_3, - SIMULATION_TYPE, + mp->simulation_type, ATTENUATION,mp->NSPEC_AB, d_one_minus_sum_beta, d_factor_common, @@ -1490,7 +1472,6 @@ void FC_FUNC_(compute_forces_elastic_cuda, int* iphase, int* nspec_outer_elastic, int* nspec_inner_elastic, - int* SIMULATION_TYPE, int* COMPUTE_AND_STORE_STRAIN, int* ATTENUATION, int* ANISOTROPY) { @@ -1556,7 +1537,7 @@ void FC_FUNC_(compute_forces_elastic_cuda, //} Kernel_2(nb_blocks_to_compute,mp,*iphase, - *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE, + *COMPUTE_AND_STORE_STRAIN, *ATTENUATION,*ANISOTROPY, mp->d_ibool + color_offset_nonpadded, mp->d_xix + color_offset, @@ -1638,7 +1619,7 @@ void FC_FUNC_(compute_forces_elastic_cuda, // no mesh coloring: uses atomic updates Kernel_2(num_elements,mp,*iphase, - *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE, + *COMPUTE_AND_STORE_STRAIN, *ATTENUATION,*ANISOTROPY, mp->d_ibool, mp->d_xix, @@ -1703,7 +1684,6 @@ void FC_FUNC_(compute_forces_elastic_cuda, /* ----------------------------------------------------------------------------------------------- */ -//daniel: todo - use this instead above call to fortran routine to avoid compilation problems extern "C" void FC_FUNC_(sync_copy_from_device, SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f, @@ -1729,310 +1709,3 @@ void FC_FUNC_(sync_copy_from_device, // memory copy is now finished, so non-blocking MPI send can proceed } -/* ----------------------------------------------------------------------------------------------- */ - -// KERNEL 3 - -/* ----------------------------------------------------------------------------------------------- */ - -__global__ void kernel_3_cuda_device(realw* veloc, - realw* 
accel, int size, - realw deltatover2, - realw* rmass) { - int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; - - /* because of block and grid sizing problems, there is a small */ - /* amount of buffer at the end of the calculation */ - if(id < size) { - realw new_accel = accel[id] * rmass[id / 3]; - veloc[id] += deltatover2 * new_accel; - accel[id] = new_accel; -/* - accel[3*id] = accel[3*id]*rmass[id]; - accel[3*id+1] = accel[3*id+1]*rmass[id]; - accel[3*id+2] = accel[3*id+2]*rmass[id]; - - veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; - veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; - veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; -*/ - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -__global__ void kernel_3_accel_cuda_device(realw* accel, - int size, - realw* rmass) { - int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; - - /* because of block and grid sizing problems, there is a small */ - /* amount of buffer at the end of the calculation */ - if(id < size) { - accel[id] *= rmass[id / 3]; -/* - accel[3*id] = accel[3*id]*rmass[id]; - accel[3*id+1] = accel[3*id+1]*rmass[id]; - accel[3*id+2] = accel[3*id+2]*rmass[id]; -*/ - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -__global__ void kernel_3_veloc_cuda_device(realw* veloc, - realw* accel, - int size, - realw deltatover2) { - int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; - - /* because of block and grid sizing problems, there is a small */ - /* amount of buffer at the end of the calculation */ - if(id < size) { - veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; - veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; - veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -extern "C" -void FC_FUNC_(kernel_3_a_cuda, - KERNEL_3_A_CUDA)(long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE_f, - realw* b_deltatover2_F, - int* OCEANS) { -TRACE("kernel_3_a_cuda"); - - Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper - //int size = *size_F; - int size = *size_F * 3; - int SIMULATION_TYPE = *SIMULATION_TYPE_f; - realw deltatover2 = *deltatover2_F; - realw b_deltatover2 = *b_deltatover2_F; - - int blocksize = BLOCKSIZE_KERNEL3; - int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; - - int num_blocks_x = size_padded/blocksize; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); - - // check whether we can update accel and veloc, or only accel at this point - if( *OCEANS == 0 ){ - // updates both, accel and veloc - kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, - mp->d_accel, - size, deltatover2, mp->d_rmass); - - if(SIMULATION_TYPE == 3) { - kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, - mp->d_b_accel, - size, b_deltatover2,mp->d_rmass); - } - }else{ - // updates only accel - kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, - size, mp->d_rmass); - - if(SIMULATION_TYPE == 3) { - kernel_3_accel_cuda_device<<< grid, 
threads,0,mp->compute_stream>>>(mp->d_b_accel, - size, mp->d_rmass); - } - } - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); - exit_on_cuda_error("after kernel 3 a"); -#endif -} - -/* ----------------------------------------------------------------------------------------------- */ - -extern "C" -void FC_FUNC_(kernel_3_b_cuda, - KERNEL_3_B_CUDA)(long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE_f, - realw* b_deltatover2_F) { - TRACE("kernel_3_b_cuda"); - - Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper - int size = *size_F; - int SIMULATION_TYPE = *SIMULATION_TYPE_f; - realw deltatover2 = *deltatover2_F; - realw b_deltatover2 = *b_deltatover2_F; - - int blocksize = BLOCKSIZE_KERNEL3; - int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; - - int num_blocks_x = size_padded/blocksize; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); - - // updates only veloc at this point - kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, - mp->d_accel, - size,deltatover2); - - if(SIMULATION_TYPE == 3) { - kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, - mp->d_b_accel, - size,b_deltatover2); - } - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); - exit_on_cuda_error("after kernel 3 b"); -#endif -} - - -/* ----------------------------------------------------------------------------------------------- */ - -/* OCEANS load on free surface */ - -/* ----------------------------------------------------------------------------------------------- */ - - -__global__ void elastic_ocean_load_cuda_kernel(realw* accel, - realw* rmass, - realw* rmass_ocean_load, - int num_free_surface_faces, - int* free_surface_ispec, - int* free_surface_ijk, - realw* free_surface_normal, - int* ibool, - int* updated_dof_ocean_load) { - // gets spectral element face id - int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1) - int iface = blockIdx.x + gridDim.x*blockIdx.y; - realw nx,ny,nz; - realw force_normal_comp,additional_term; - - // for all faces on free surface - if( iface < num_free_surface_faces ){ - - int ispec = free_surface_ispec[iface]-1; - - // gets global point index - int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface) - int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; - int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; - - int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; - - //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob); - - // only update this global point once - - // daniel: TODO - there might be better ways to implement a mutex like below, - // and find a workaround to not use the temporary update array. - // atomicExch: returns the old value, i.e. 
0 indicates that we still have to do this point - - if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){ - - // get normal - nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface) - ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; - nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; - - // make updated component of right-hand side - // we divide by rmass() which is 1 / M - // we use the total force which includes the Coriolis term above - force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob]; - - additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp; - - // probably wouldn't need atomicAdd anymore, but just to be sure... - atomicAdd(&accel[iglob*3], + additional_term * nx); - atomicAdd(&accel[iglob*3+1], + additional_term * ny); - atomicAdd(&accel[iglob*3+2], + additional_term * nz); - } - } -} - -/* ----------------------------------------------------------------------------------------------- */ - -extern "C" -void FC_FUNC_(elastic_ocean_load_cuda, - ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, - int* SIMULATION_TYPE) { - - TRACE("elastic_ocean_load_cuda"); - - Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container - - // checks if anything to do - if( mp->num_free_surface_faces == 0 ) return; - - // block sizes: exact blocksize to match NGLLSQUARE - int blocksize = NGLL2; - - int num_blocks_x = mp->num_free_surface_faces; - int num_blocks_y = 1; - while(num_blocks_x > 65535) { - num_blocks_x = (int) ceil(num_blocks_x*0.5f); - num_blocks_y = num_blocks_y*2; - } - - dim3 grid(num_blocks_x,num_blocks_y); - dim3 threads(blocksize,1,1); - - - // initializes temporary array to zero - print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, - sizeof(int)*mp->NGLOB_AB),88501); - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - exit_on_cuda_error("before kernel elastic_ocean_load_cuda"); -#endif - - elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, - mp->d_rmass, - mp->d_rmass_ocean_load, - mp->num_free_surface_faces, - mp->d_free_surface_ispec, - mp->d_free_surface_ijk, - mp->d_free_surface_normal, - mp->d_ibool, - mp->d_updated_dof_ocean_load); - // for backward/reconstructed potentials - if(*SIMULATION_TYPE == 3) { - // re-initializes array - print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, - sizeof(int)*mp->NGLOB_AB),88502); - - elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, - mp->d_rmass, - mp->d_rmass_ocean_load, - mp->num_free_surface_faces, - mp->d_free_surface_ispec, - mp->d_free_surface_ijk, - mp->d_free_surface_normal, - mp->d_ibool, - mp->d_updated_dof_ocean_load); - - } - - -#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING - exit_on_cuda_error("elastic_ocean_load_cuda"); -#endif -} diff --git a/src/cuda/compute_stacey_acoustic_cuda.cu b/src/cuda/compute_stacey_acoustic_cuda.cu index 304da3a55..c983c8d16 100644 --- a/src/cuda/compute_stacey_acoustic_cuda.cu +++ b/src/cuda/compute_stacey_acoustic_cuda.cu @@ -50,7 +50,8 @@ __global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic, int* ispec_is_inner, int* ispec_is_acoustic, int phase_is_inner, - int SIMULATION_TYPE, int SAVE_FORWARD, + int SIMULATION_TYPE, + int SAVE_FORWARD, int num_abs_boundary_faces, realw* b_potential_dot_acoustic, realw* b_potential_dot_dot_acoustic, @@ -121,18 +122,15 @@ __global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic, extern "C" void FC_FUNC_(compute_stacey_acoustic_cuda, - 
COMPUTE_STACEY_ACOUSTIC_CUDA)( - long* Mesh_pointer_f, - int* phase_is_innerf, - int* SIMULATION_TYPEf, - int* SAVE_FORWARDf, - realw* h_b_absorb_potential) { + COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* SAVE_FORWARDf, + realw* h_b_absorb_potential) { TRACE("compute_stacey_acoustic_cuda"); //double start_time = get_time(); Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container int phase_is_inner = *phase_is_innerf; - int SIMULATION_TYPE = *SIMULATION_TYPEf; int SAVE_FORWARD = *SAVE_FORWARDf; // way 1: Elapsed time: 4.385948e-03 @@ -154,7 +152,7 @@ TRACE("compute_stacey_acoustic_cuda"); dim3 threads(blocksize,1,1); // adjoint simulations: reads in absorbing boundary - if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){ + if (mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0 ){ // copies array to GPU print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential, mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700); @@ -171,7 +169,8 @@ TRACE("compute_stacey_acoustic_cuda"); mp->d_ispec_is_inner, mp->d_ispec_is_acoustic, phase_is_inner, - SIMULATION_TYPE,SAVE_FORWARD, + mp->simulation_type, + SAVE_FORWARD, mp->d_num_abs_boundary_faces, mp->d_b_potential_dot_acoustic, mp->d_b_potential_dot_dot_acoustic, @@ -179,7 +178,7 @@ TRACE("compute_stacey_acoustic_cuda"); mp->gravity); // adjoint simulations: stores absorbed wavefield part - if (SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){ + if (mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){ // copies array to CPU print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential, mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701); diff --git a/src/cuda/compute_stacey_elastic_cuda.cu b/src/cuda/compute_stacey_elastic_cuda.cu index a4e7458b7..618c5f9a6 100644 --- a/src/cuda/compute_stacey_elastic_cuda.cu +++ b/src/cuda/compute_stacey_elastic_cuda.cu @@ -132,7 +132,6 @@ extern "C" void FC_FUNC_(compute_stacey_elastic_cuda, COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* SIMULATION_TYPEf, int* SAVE_FORWARDf, realw* h_b_absorb_field) { @@ -144,7 +143,6 @@ TRACE("compute_stacey_elastic_cuda"); if( mp->d_num_abs_boundary_faces == 0 ) return; int phase_is_inner = *phase_is_innerf; - int SIMULATION_TYPE = *SIMULATION_TYPEf; int SAVE_FORWARD = *SAVE_FORWARDf; // way 1 @@ -165,7 +163,7 @@ TRACE("compute_stacey_elastic_cuda"); dim3 grid(num_blocks_x,num_blocks_y); dim3 threads(blocksize,1,1); - if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) { + if(mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0) { // The read is done in fortran print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field, mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700); @@ -187,7 +185,8 @@ TRACE("compute_stacey_elastic_cuda"); mp->d_ispec_is_inner, mp->d_ispec_is_elastic, phase_is_inner, - SIMULATION_TYPE,SAVE_FORWARD, + mp->simulation_type, + SAVE_FORWARD, mp->d_num_abs_boundary_faces, mp->d_b_accel, mp->d_b_absorb_field); @@ -197,10 +196,10 @@ TRACE("compute_stacey_elastic_cuda"); #endif // ! adjoint simulations: stores absorbed wavefield part - // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) & + // if (mp->simulation_type == 1 .and. SAVE_FORWARD .and. 
num_abs_boundary_faces > 0 ) & // write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field - if(SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) { + if(mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) { print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field, mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701); // The write is done in fortran diff --git a/src/cuda/it_update_displacement_cuda.cu b/src/cuda/it_update_displacement_cuda.cu index 08ef470bc..10a3cf4ff 100644 --- a/src/cuda/it_update_displacement_cuda.cu +++ b/src/cuda/it_update_displacement_cuda.cu @@ -68,14 +68,13 @@ __global__ void UpdateDispVeloc_kernel(realw* displ, extern "C" void FC_FUNC_(it_update_displacement_cuda, IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f, - int* size_F, - realw* deltat_F, - realw* deltatsqover2_F, - realw* deltatover2_F, - int* SIMULATION_TYPE, - realw* b_deltat_F, - realw* b_deltatsqover2_F, - realw* b_deltatover2_F) { + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) { TRACE("it_update_displacement_cuda"); @@ -120,7 +119,7 @@ TRACE("it_update_displacement_cuda"); //#endif // kernel for backward fields - if(*SIMULATION_TYPE == 3) { + if(mp->simulation_type == 3) { realw b_deltat = *b_deltat_F; realw b_deltatsqover2 = *b_deltatsqover2_F; realw b_deltatover2 = *b_deltatover2_F; @@ -178,7 +177,6 @@ void FC_FUNC_(it_update_displacement_ac_cuda, realw* deltat_F, realw* deltatsqover2_F, realw* deltatover2_F, - int* SIMULATION_TYPE, realw* b_deltat_F, realw* b_deltatsqover2_F, realw* b_deltatover2_F) { @@ -212,7 +210,7 @@ TRACE("it_update_displacement_ac_cuda"); mp->d_potential_dot_dot_acoustic, size,deltat,deltatsqover2,deltatover2); - if(*SIMULATION_TYPE == 3) { + if(mp->simulation_type == 3) { realw b_deltat = *b_deltat_F; realw b_deltatsqover2 = *b_deltatsqover2_F; realw b_deltatover2 = *b_deltatover2_F; @@ -229,3 +227,310 @@ TRACE("it_update_displacement_ac_cuda"); exit_on_cuda_error("it_update_displacement_ac_cuda"); #endif } + + +/* ----------------------------------------------------------------------------------------------- */ + +// elastic domains + +// KERNEL 3 + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_cuda_device(realw* veloc, + realw* accel, + int size, + realw deltatover2, + realw* rmassx, + realw* rmassy, + realw* rmassz) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + accel[3*id] = accel[3*id]*rmassx[id]; + accel[3*id+1] = accel[3*id+1]*rmassy[id]; + accel[3*id+2] = accel[3*id+2]*rmassz[id]; + + veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; + veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; + veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_accel_cuda_device(realw* accel, + int size, + realw* rmassx, + realw* rmassy, + realw* rmassz) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + 
accel[3*id] = accel[3*id]*rmassx[id]; + accel[3*id+1] = accel[3*id+1]*rmassy[id]; + accel[3*id+2] = accel[3*id+2]*rmassz[id]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_veloc_cuda_device(realw* veloc, + realw* accel, + int size, + realw deltatover2) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; + veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; + veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_a_cuda, + KERNEL_3_A_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F, + int* OCEANS) { +TRACE("kernel_3_a_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + + int size = *size_F; + + realw deltatover2 = *deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // check whether we can update accel and veloc, or only accel at this point + if( *OCEANS == 0 ){ + // updates both, accel and veloc + kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, + mp->d_accel, + size, deltatover2, + mp->d_rmassx,mp->d_rmassy,mp->d_rmassz); + + if(mp->simulation_type == 3) { + kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, + mp->d_b_accel, + size, b_deltatover2, + mp->d_rmassx,mp->d_rmassy,mp->d_rmassz); + } + }else{ + // updates only accel + kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, + size, + mp->d_rmassx, + mp->d_rmassy, + mp->d_rmassz); + + if(mp->simulation_type == 3) { + kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel, + size, + mp->d_rmassx, + mp->d_rmassy, + mp->d_rmassz); + } + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 a"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_b_cuda, + KERNEL_3_B_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F) { + TRACE("kernel_3_b_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + + realw deltatover2 = *deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // updates only veloc at this point + 
kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, + mp->d_accel, + size,deltatover2); + + if(mp->simulation_type == 3) { + kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, + mp->d_b_accel, + size,b_deltatover2); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 b"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// acoustic domains + +// KERNEL 3 + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic, + int size, + realw* rmass_acoustic) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + // multiplies pressure with the inverse of the mass matrix + potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + int size, + realw deltatover2, + realw* rmass_acoustic) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + // Newmark time scheme: corrector term + potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F) { + +TRACE("kernel_3_a_acoustic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic, + size, + mp->d_rmass_acoustic); + + if(mp->simulation_type == 3) { + kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic, + size, + mp->d_rmass_acoustic); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 a"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F) { + +TRACE("kernel_3_b_acoustic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + + realw deltatover2 = 
*deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic, + mp->d_potential_dot_dot_acoustic, + size, deltatover2, + mp->d_rmass_acoustic); + + if(mp->simulation_type == 3) { + kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + size, b_deltatover2, + mp->d_rmass_acoustic); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 b"); +#endif +} + + diff --git a/src/cuda/mesh_constants_cuda.h b/src/cuda/mesh_constants_cuda.h index 4a7fb4c20..86095c7bc 100644 --- a/src/cuda/mesh_constants_cuda.h +++ b/src/cuda/mesh_constants_cuda.h @@ -192,6 +192,13 @@ typedef struct mesh_ { int myrank; int NPROC; + // constants + int simulation_type; + int use_mesh_coloring_gpu; + int absorbing_conditions; + int gravity; + + // ------------------------------------------------------------------ // // GLL points & weights // ------------------------------------------------------------------ // @@ -210,9 +217,6 @@ typedef struct mesh_ { // inner / outer elements int* d_ispec_is_inner; - // mesh coloring - int use_mesh_coloring_gpu; - // pointers to constant memory arrays realw* d_hprime_xx; //realw* d_hprime_yy; // only needed if NGLLX != NGLLY != NGLLZ @@ -285,7 +289,9 @@ typedef struct mesh_ { int num_colors_outer_elastic,num_colors_inner_elastic; int nspec_elastic; - realw* d_rmass; + realw* d_rmassx; + realw* d_rmassy; + realw* d_rmassz; // mpi buffer realw* d_send_accel_buffer; @@ -481,7 +487,6 @@ typedef struct mesh_ { realw* d_coupling_ac_el_jacobian2Dw; // gravity - int gravity; realw* d_minus_deriv_gravity; realw* d_minus_g; diff --git a/src/cuda/prepare_mesh_constants_cuda.cu b/src/cuda/prepare_mesh_constants_cuda.cu index 55c91dc63..ae62644f7 100644 --- a/src/cuda/prepare_mesh_constants_cuda.cu +++ b/src/cuda/prepare_mesh_constants_cuda.cu @@ -154,6 +154,10 @@ TRACE("prepare_constants_device"); mp->NSPEC_AB = *NSPEC_AB; mp->NGLOB_AB = *NGLOB_AB; + // constants + mp->simulation_type = *SIMULATION_TYPE; + mp->absorbing_conditions = *ABSORBING_CONDITIONS; + // sets constant arrays setConst_hprime_xx(h_hprime_xx,mp); // setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ @@ -289,41 +293,54 @@ TRACE("prepare_constants_device"); } // inner elements - print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205); - print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner, - mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206); + copy_todevice_int((void**)&mp->d_ispec_is_inner,h_ispec_is_inner,mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner, + // mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206); // absorbing boundaries mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces; - if( *ABSORBING_CONDITIONS && 
mp->d_num_abs_boundary_faces > 0 ){ - print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec), - (mp->d_num_abs_boundary_faces)*sizeof(int)),1101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec, - (mp->d_num_abs_boundary_faces)*sizeof(int), - cudaMemcpyHostToDevice),1102); - - print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk), - 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103); - print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk, - 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int), - cudaMemcpyHostToDevice),1104); - - print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal), - 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105); - print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal, - 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), - cudaMemcpyHostToDevice),1106); - - print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw), - NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107); - print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw, - NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), - cudaMemcpyHostToDevice),1108); + if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0 ){ + copy_todevice_int((void**)&mp->d_abs_boundary_ispec,h_abs_boundary_ispec,mp->d_num_abs_boundary_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec), +// (mp->d_num_abs_boundary_faces)*sizeof(int)),1101); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec, +// (mp->d_num_abs_boundary_faces)*sizeof(int), +// cudaMemcpyHostToDevice),1102); + + copy_todevice_int((void**)&mp->d_abs_boundary_ijk,h_abs_boundary_ijk, + 3*NGLL2*(mp->d_num_abs_boundary_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk), +// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk, +// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int), +// cudaMemcpyHostToDevice),1104); + + copy_todevice_realw((void**)&mp->d_abs_boundary_normal,h_abs_boundary_normal, + NDIM*NGLL2*(mp->d_num_abs_boundary_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal), +// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal, +// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), +// cudaMemcpyHostToDevice),1106); + + copy_todevice_realw((void**)&mp->d_abs_boundary_jacobian2Dw,h_abs_boundary_jacobian2Dw, + NGLL2*(mp->d_num_abs_boundary_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw), +// NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw, +// NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), + // cudaMemcpyHostToDevice),1108); } // sources mp->nsources_local = *nsources_local_f; - if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){ + if (mp->simulation_type == 1 || mp->simulation_type == 3){ // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2) copy_todevice_realw((void**)&mp->d_sourcearrays,h_sourcearrays,(*NSOURCES)*NDIM*NGLL3); @@ -388,7 +405,6 @@ void FC_FUNC_(prepare_fields_acoustic_device, int* num_free_surface_faces, int* 
free_surface_ispec, int* free_surface_ijk, - int* ABSORBING_CONDITIONS, int* b_reclen_potential, realw* b_absorb_potential, int* ELASTIC_SIMULATION, @@ -406,7 +422,7 @@ void FC_FUNC_(prepare_fields_acoustic_device, Mesh* mp = (Mesh*)(*Mesh_pointer_f); /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */ int size_padded = NGLL3_PADDED * mp->NSPEC_AB; - int size_nonpadded = NGLL3 * mp->NSPEC_AB; +// int size_nonpadded = NGLL3 * mp->NSPEC_AB; int size_glob = mp->NGLOB_AB; // allocates arrays on device (GPU) @@ -421,9 +437,11 @@ void FC_FUNC_(prepare_fields_acoustic_device, } // mass matrix - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005); - print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic, - sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100); + copy_todevice_realw((void**)&mp->d_rmass_acoustic,rmass_acoustic,mp->NGLOB_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic, +// sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100); // padded array print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006); @@ -434,45 +452,63 @@ void FC_FUNC_(prepare_fields_acoustic_device, } // non-padded array - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007); - print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105); + copy_todevice_realw((void**)&mp->d_kappastore,kappastore,NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore, +// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105); // phase elements mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic; - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), - mp->num_phase_ispec_acoustic*2*sizeof(int)),2008); - print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, - mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic), - mp->NSPEC_AB*sizeof(int)),2009); - print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic, - mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102); + copy_todevice_int((void**)&mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, + 2*mp->num_phase_ispec_acoustic); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), +// mp->num_phase_ispec_acoustic*2*sizeof(int)),2008); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, +// mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101); + + copy_todevice_int((void**)&mp->d_ispec_is_acoustic,ispec_is_acoustic,mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic), +// mp->NSPEC_AB*sizeof(int)),2009); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic, +// mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102); // free surface if( *NOISE_TOMOGRAPHY == 0 ){ // allocate surface arrays mp->num_free_surface_faces = *num_free_surface_faces; if( mp->num_free_surface_faces > 0 ){ - 
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), - mp->num_free_surface_faces*sizeof(int)),2201); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, - mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), - 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, - 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204); + + copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), +// mp->num_free_surface_faces*sizeof(int)),2201); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, +// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203); + + + copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204); } } // absorbing boundaries - if( *ABSORBING_CONDITIONS ){ + if( mp->absorbing_conditions ){ mp->d_b_reclen_potential = *b_reclen_potential; - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential, - mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302); + + copy_todevice_realw((void**)&mp->d_b_absorb_potential,b_absorb_potential,mp->d_b_reclen_potential); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential, +// mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302); } @@ -488,25 +524,36 @@ void FC_FUNC_(prepare_fields_acoustic_device, // coupling with elastic parts if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){ - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec), - (*num_coupling_ac_el_faces)*sizeof(int)),2601); - print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec, - (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk), - 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603); - print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk, - 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal), - 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605); - print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal, - 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw), - NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607); - print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw, - NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608); + + 
copy_todevice_int((void**)&mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,(*num_coupling_ac_el_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec), +// (*num_coupling_ac_el_faces)*sizeof(int)),2601); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec, +// (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602); + + copy_todevice_int((void**)&mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,3*NGLL2*(*num_coupling_ac_el_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk), +// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk, +// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604); + + copy_todevice_realw((void**)&mp->d_coupling_ac_el_normal,coupling_ac_el_normal, + 3*NGLL2*(*num_coupling_ac_el_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal), +// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal, +// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606); + + copy_todevice_realw((void**)&mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw, + NGLL2*(*num_coupling_ac_el_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw), +// NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw, +// NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608); } @@ -528,7 +575,6 @@ void FC_FUNC_(prepare_fields_acoustic_device, extern "C" void FC_FUNC_(prepare_fields_acoustic_adj_dev, PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, int* APPROXIMATE_HESS_KL) { TRACE("prepare_fields_acoustic_adj_dev"); @@ -538,7 +584,7 @@ void FC_FUNC_(prepare_fields_acoustic_adj_dev, int size_glob = mp->NGLOB_AB; // kernel simulations - if( *SIMULATION_TYPE != 3 ) return; + if( mp->simulation_type != 3 ) return; // allocates backward/reconstructed arrays on device (GPU) print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014); @@ -579,16 +625,17 @@ extern "C" void FC_FUNC_(prepare_fields_elastic_device, PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f, int* size, - realw* rmass, + realw* rmassx, + realw* rmassy, + realw* rmassz, realw* rho_vp, realw* rho_vs, int* num_phase_ispec_elastic, int* phase_ispec_inner_elastic, int* ispec_is_elastic, - int* ABSORBING_CONDITIONS, realw* h_b_absorb_field, int* h_b_reclen_field, - int* SIMULATION_TYPE,int* SAVE_FORWARD, + int* SAVE_FORWARD, int* COMPUTE_AND_STORE_STRAIN, realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, realw* epsilondev_xz,realw* epsilondev_yz, @@ -636,7 +683,7 @@ TRACE("prepare_fields_elastic_device"); Mesh* mp = (Mesh*)(*Mesh_pointer_f); /* Assuming NGLLX==5. 
Padded is then 128 (5^3+3) */ int size_padded = NGLL3_PADDED * (mp->NSPEC_AB); - int size_nonpadded = NGLL3 * (mp->NSPEC_AB); +// int size_nonpadded = NGLL3 * (mp->NSPEC_AB); print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001); print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002); @@ -663,23 +710,32 @@ TRACE("prepare_fields_elastic_device"); } // mass matrix - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005); - // transfer element data - print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass, - sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010); + copy_todevice_realw((void**)&mp->d_rmassx,rmassx,mp->NGLOB_AB); + copy_todevice_realw((void**)&mp->d_rmassy,rmassy,mp->NGLOB_AB); + copy_todevice_realw((void**)&mp->d_rmassz,rmassz,mp->NGLOB_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005); +// // transfer element data +// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass, +// sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010); // element indices - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009); - print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic, - mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012); + copy_todevice_int((void**)&mp->d_ispec_is_elastic,ispec_is_elastic,mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic, +// mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012); // phase elements mp->num_phase_ispec_elastic = *num_phase_ispec_elastic; - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), - mp->num_phase_ispec_elastic*2*sizeof(int)),4008); - print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic, - mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011); + + copy_todevice_int((void**)&mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,2*mp->num_phase_ispec_elastic); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), +// mp->num_phase_ispec_elastic*2*sizeof(int)),4008); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic, +// mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011); // for seismograms if( mp->nrec_local > 0 ){ @@ -691,24 +747,30 @@ TRACE("prepare_fields_elastic_device"); } // absorbing conditions - if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){ + if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0){ // non-padded arrays - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007); + copy_todevice_realw((void**)&mp->d_rho_vp,rho_vp,NGLL3*mp->NSPEC_AB); + copy_todevice_realw((void**)&mp->d_rho_vs,rho_vs,NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007); // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition - print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013); - 
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp, +// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs, +// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014); // absorb_field array used for file i/o - if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){ + if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD )){ mp->d_b_reclen_field = *h_b_reclen_field; - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field), - mp->d_b_reclen_field),4016); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field, - mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017); + + copy_todevice_realw((void**)&mp->d_b_absorb_field,h_b_absorb_field,mp->d_b_reclen_field); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field), +// mp->d_b_reclen_field),4016); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field, +// mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017); } } @@ -717,83 +779,117 @@ TRACE("prepare_fields_elastic_device"); // strains int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx, - epsilondev_size*sizeof(realw)),4301); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw), - cudaMemcpyHostToDevice),4302); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy, - epsilondev_size*sizeof(realw)),4302); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw), - cudaMemcpyHostToDevice),4303); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy, - epsilondev_size*sizeof(realw)),4304); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw), - cudaMemcpyHostToDevice),4305); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz, - epsilondev_size*sizeof(realw)),4306); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw), - cudaMemcpyHostToDevice),4307); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz, - epsilondev_size*sizeof(realw)),4308); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw), - cudaMemcpyHostToDevice),4309); + copy_todevice_realw((void**)&mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx, +// epsilondev_size*sizeof(realw)),4301); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw), +// cudaMemcpyHostToDevice),4302); + + copy_todevice_realw((void**)&mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy, +// epsilondev_size*sizeof(realw)),4302); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw), +// cudaMemcpyHostToDevice),4303); + + copy_todevice_realw((void**)&mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy, +// epsilondev_size*sizeof(realw)),4304); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw), 
+// cudaMemcpyHostToDevice),4305); + + copy_todevice_realw((void**)&mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz, +// epsilondev_size*sizeof(realw)),4306); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw), +// cudaMemcpyHostToDevice),4307); + + copy_todevice_realw((void**)&mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz, +// epsilondev_size*sizeof(realw)),4308); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw), +// cudaMemcpyHostToDevice),4309); } // attenuation memory variables if( *ATTENUATION ){ // memory arrays - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx), - (*R_size)*sizeof(realw)),4401); - print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),4402); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy), - (*R_size)*sizeof(realw)),4403); - print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),4404); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy), - (*R_size)*sizeof(realw)),4405); - print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),4406); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz), - (*R_size)*sizeof(realw)),4407); - print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),4408); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz), - (*R_size)*sizeof(realw)),4409); - print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),4410); + copy_todevice_realw((void**)&mp->d_R_xx,R_xx,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx), +// (*R_size)*sizeof(realw)),4401); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),4402); + + copy_todevice_realw((void**)&mp->d_R_yy,R_yy,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy), +// (*R_size)*sizeof(realw)),4403); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),4404); + + copy_todevice_realw((void**)&mp->d_R_xy,R_xy,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy), +// (*R_size)*sizeof(realw)),4405); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),4406); + + copy_todevice_realw((void**)&mp->d_R_xz,R_xz,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz), +// (*R_size)*sizeof(realw)),4407); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),4408); + + copy_todevice_realw((void**)&mp->d_R_yz,R_yz,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz), +// (*R_size)*sizeof(realw)),4409); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),4410); // attenuation factors - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta), - NGLL3*mp->NSPEC_AB*sizeof(realw)),4430); - print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431); + 
copy_todevice_realw((void**)&mp->d_one_minus_sum_beta,one_minus_sum_beta,NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta), +// NGLL3*mp->NSPEC_AB*sizeof(realw)),4430); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta, +// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common), - N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432); - print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common, - N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433); + copy_todevice_realw((void**)&mp->d_factor_common,factor_common,N_SLS*NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common), +// N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common, +// N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433); // alpha,beta,gamma factors - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval), - N_SLS*sizeof(realw)),4434); - print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435); + copy_todevice_realw((void**)&mp->d_alphaval,alphaval,N_SLS); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval), +// N_SLS*sizeof(realw)),4434); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval), - N_SLS*sizeof(realw)),4436); - print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437); + copy_todevice_realw((void**)&mp->d_betaval,betaval,N_SLS); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval), - N_SLS*sizeof(realw)),4438); - print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval), +// N_SLS*sizeof(realw)),4436); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437); + + copy_todevice_realw((void**)&mp->d_gammaval,gammaval,N_SLS); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval), +// N_SLS*sizeof(realw)),4438); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439); } @@ -896,29 +992,41 @@ TRACE("prepare_fields_elastic_device"); mp->num_free_surface_faces = *num_free_surface_faces; if( mp->num_free_surface_faces > 0 ){ // mass matrix - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load), - sizeof(realw)*mp->NGLOB_AB),4501); - print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load, - sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502); + copy_todevice_realw((void**)&mp->d_rmass_ocean_load,rmass_ocean_load,mp->NGLOB_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load), +// sizeof(realw)*mp->NGLOB_AB),4501); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load, +// sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502); // surface normal - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal), - 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal, - 
3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504); + copy_todevice_realw((void**)&mp->d_free_surface_normal,free_surface_normal, + 3*NGLL2*(mp->num_free_surface_faces)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal), +// 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal, +// 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504); // temporary global array: used to synchronize updates on global accel array print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load), sizeof(int)*mp->NGLOB_AB),4505); if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){ - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), - mp->num_free_surface_faces*sizeof(int)),4601); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, - mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), - 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, - 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604); + + copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), +// mp->num_free_surface_faces*sizeof(int)),4601); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, +// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603); + + copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604); } } } @@ -941,7 +1049,6 @@ extern "C" void FC_FUNC_(prepare_fields_elastic_adj_dev, PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f, int* size, - int* SIMULATION_TYPE, int* COMPUTE_AND_STORE_STRAIN, realw* epsilon_trace_over_3, realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy, @@ -958,7 +1065,7 @@ void FC_FUNC_(prepare_fields_elastic_adj_dev, Mesh* mp = (Mesh*)(*Mesh_pointer_f); // checks if kernel simulation - if( *SIMULATION_TYPE != 3 ) return; + if( mp->simulation_type != 3 ) return; // kernel simulations // allocates backward/reconstructed arrays on device (GPU) @@ -985,81 +1092,104 @@ void FC_FUNC_(prepare_fields_elastic_adj_dev, int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing // solid pressure - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3), - NGLL3*mp->NSPEC_AB*sizeof(realw)),5310); - print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311); + copy_todevice_realw((void**)&mp->d_epsilon_trace_over_3,epsilon_trace_over_3,NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3), +// NGLL3*mp->NSPEC_AB*sizeof(realw)),5310); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3, +// 
NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311); // backward solid pressure - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3), - NGLL3*mp->NSPEC_AB*sizeof(realw)),5312); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3, - NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313); + + copy_todevice_realw((void**)&mp->d_b_epsilon_trace_over_3,b_epsilon_trace_over_3,NGLL3*mp->NSPEC_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3), +// NGLL3*mp->NSPEC_AB*sizeof(realw)),5312); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3, +// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313); + // prepares backward strains - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx), - epsilondev_size*sizeof(realw)),5321); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy), - epsilondev_size*sizeof(realw)),5322); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy), - epsilondev_size*sizeof(realw)),5323); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz), - epsilondev_size*sizeof(realw)),5324); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz), - epsilondev_size*sizeof(realw)),5325); - - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx, - epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy, - epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy, - epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz, - epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz, - epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330); + + copy_todevice_realw((void**)&mp->d_b_epsilondev_xx,b_epsilondev_xx,epsilondev_size); + copy_todevice_realw((void**)&mp->d_b_epsilondev_yy,b_epsilondev_yy,epsilondev_size); + copy_todevice_realw((void**)&mp->d_b_epsilondev_xy,b_epsilondev_xy,epsilondev_size); + copy_todevice_realw((void**)&mp->d_b_epsilondev_xz,b_epsilondev_xz,epsilondev_size); + copy_todevice_realw((void**)&mp->d_b_epsilondev_yz,b_epsilondev_yz,epsilondev_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx), +// epsilondev_size*sizeof(realw)),5321); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy), +// epsilondev_size*sizeof(realw)),5322); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy), +// epsilondev_size*sizeof(realw)),5323); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz), +// epsilondev_size*sizeof(realw)),5324); +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz), +// epsilondev_size*sizeof(realw)),5325); + +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx, +// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy, +// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy, +// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328); +// 
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz, +// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz, +// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330); } // attenuation memory variables if( *ATTENUATION ){ - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx), - (*R_size)*sizeof(realw)),5421); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),5422); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy), - (*R_size)*sizeof(realw)),5423); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),5424); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy), - (*R_size)*sizeof(realw)),5425); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),5426); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz), - (*R_size)*sizeof(realw)),5427); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),5428); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz), - (*R_size)*sizeof(realw)),5429); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw), - cudaMemcpyHostToDevice),5420); + copy_todevice_realw((void**)&mp->d_b_R_xx,b_R_xx,(*R_size)); + copy_todevice_realw((void**)&mp->d_b_R_yy,b_R_yy,(*R_size)); + copy_todevice_realw((void**)&mp->d_b_R_xy,b_R_xy,(*R_size)); + copy_todevice_realw((void**)&mp->d_b_R_xz,b_R_xz,(*R_size)); + copy_todevice_realw((void**)&mp->d_b_R_yz,b_R_yz,(*R_size)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx), +// (*R_size)*sizeof(realw)),5421); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),5422); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy), +// (*R_size)*sizeof(realw)),5423); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),5424); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy), +// (*R_size)*sizeof(realw)),5425); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),5426); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz), +// (*R_size)*sizeof(realw)),5427); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),5428); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz), +// (*R_size)*sizeof(realw)),5429); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw), +// cudaMemcpyHostToDevice),5420); // alpha,beta,gamma factors for backward fields - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval), - N_SLS*sizeof(realw)),5434); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval), - N_SLS*sizeof(realw)),5436); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437); - - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval), - N_SLS*sizeof(realw)),5438); - print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval, - N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439); + 
copy_todevice_realw((void**)&mp->d_b_alphaval,b_alphaval,N_SLS); + copy_todevice_realw((void**)&mp->d_b_betaval,b_betaval,N_SLS); + copy_todevice_realw((void**)&mp->d_b_gammaval,b_gammaval,N_SLS); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval), +// N_SLS*sizeof(realw)),5434); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval), +// N_SLS*sizeof(realw)),5436); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval), +// N_SLS*sizeof(realw)),5438); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval, +// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439); } if( *APPROXIMATE_HESS_KL ){ @@ -1148,7 +1278,6 @@ void FC_FUNC_(prepare_fields_noise_device, int* free_surface_ispec, int* free_surface_ijk, int* num_free_surface_faces, - int* SIMULATION_TYPE, int* NOISE_TOMOGRAPHY, int* NSTEP, realw* noise_sourcearray, @@ -1165,15 +1294,20 @@ void FC_FUNC_(prepare_fields_noise_device, // free surface mp->num_free_surface_faces = *num_free_surface_faces; - print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, - mp->num_free_surface_faces*sizeof(int)),7001); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, - mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002); + copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces); - print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, - 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, - 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004); +// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, +// mp->num_free_surface_faces*sizeof(int)),7001); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, +// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002); + + copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces); + +// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, +// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004); // alloc storage for the surface buffer to be copied print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie, @@ -1181,37 +1315,48 @@ void FC_FUNC_(prepare_fields_noise_device, // prepares noise source array if( *NOISE_TOMOGRAPHY == 1 ){ - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray, - 3*NGLL3*(*NSTEP)*sizeof(realw)),7101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray, - 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102); + copy_todevice_realw((void**)&mp->d_noise_sourcearray,noise_sourcearray, + 3*NGLL3*(*NSTEP)); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray, +// 3*NGLL3*(*NSTEP)*sizeof(realw)),7101); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray, +// 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102); } // prepares noise directions if( *NOISE_TOMOGRAPHY > 1 ){ 
int nface_size = NGLL2*(*num_free_surface_faces); // allocates memory on GPU - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise, - nface_size*sizeof(realw)),7301); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise, - nface_size*sizeof(realw)),7302); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise, - nface_size*sizeof(realw)),7303); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, - nface_size*sizeof(realw)),7304); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw, - nface_size*sizeof(realw)),7305); + copy_todevice_realw((void**)&mp->d_normal_x_noise,normal_x_noise,nface_size); + copy_todevice_realw((void**)&mp->d_normal_y_noise,normal_y_noise,nface_size); + copy_todevice_realw((void**)&mp->d_normal_z_noise,normal_z_noise,nface_size); + + copy_todevice_realw((void**)&mp->d_mask_noise,mask_noise,nface_size); + copy_todevice_realw((void**)&mp->d_free_surface_jacobian2Dw,free_surface_jacobian2Dw,nface_size); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise, +// nface_size*sizeof(realw)),7301); +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise, +// nface_size*sizeof(realw)),7302); +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise, +// nface_size*sizeof(realw)),7303); + +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, +// nface_size*sizeof(realw)),7304); +// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw, +// nface_size*sizeof(realw)),7305); // transfers data onto GPU - print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, - nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306); - print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, - nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307); - print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, - nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308); - print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise, - nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309); - print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, - nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, +// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, +// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, +// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise, +// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, +// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310); } // prepares noise strength kernel @@ -1254,15 +1399,20 @@ void FC_FUNC_(prepare_fields_gravity_device, mp->gravity = *GRAVITY; if( mp->gravity ){ - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity), - (mp->NGLOB_AB)*sizeof(realw)),8000); - print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity, - (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g), - (mp->NGLOB_AB)*sizeof(realw)),8002); - print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g, - 
(mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003); + copy_todevice_realw((void**)&mp->d_minus_deriv_gravity,minus_deriv_gravity,mp->NGLOB_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity), +// (mp->NGLOB_AB)*sizeof(realw)),8000); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity, +// (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001); + + copy_todevice_realw((void**)&mp->d_minus_g,minus_g,mp->NGLOB_AB); + +// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g), +// (mp->NGLOB_AB)*sizeof(realw)),8002); +// print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g, +// (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003); if( *ACOUSTIC_SIMULATION == 0 ){ @@ -1280,6 +1430,8 @@ void FC_FUNC_(prepare_fields_gravity_device, } +/* ----------------------------------------------------------------------------------------------- */ + extern "C" void FC_FUNC_(prepare_seismogram_fields, PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) { @@ -1287,19 +1439,19 @@ void FC_FUNC_(prepare_seismogram_fields, TRACE("prepare_constants_device"); Mesh* mp = (Mesh*)(*Mesh_pointer); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3* *nrec_local*sizeof(double)),8100); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5* *nrec_local*sizeof(double)),8100); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5* *nrec_local*sizeof(double)),8100); - print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5* *nrec_local*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3*(*nrec_local)*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5*(*nrec_local)*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5*(*nrec_local)*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5*(*nrec_local)*sizeof(double)),8100); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3**nrec_local*sizeof(realw)),8101); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3**nrec_local*sizeof(realw)),8101); - print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3**nrec_local*sizeof(realw)),8101); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3*(*nrec_local)*sizeof(realw)),8101); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3*(*nrec_local)*sizeof(realw)),8101); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3*(*nrec_local)*sizeof(realw)),8101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); - print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3*(*nrec_local)*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5*(*nrec_local)*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5*(*nrec_local)*sizeof(double),cudaMemcpyHostToDevice),8101); + 
print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5*(*nrec_local)*sizeof(double),cudaMemcpyHostToDevice),8101); cudaMallocHost((void**)&mp->h_seismograms_d_it,3**nrec_local*sizeof(realw)); cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw)); @@ -1315,7 +1467,6 @@ void FC_FUNC_(prepare_seismogram_fields, extern "C" void FC_FUNC_(prepare_cleanup_device, PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, int* SAVE_FORWARD, int* ACOUSTIC_SIMULATION, int* ELASTIC_SIMULATION, @@ -1362,7 +1513,7 @@ TRACE("prepare_cleanup_device"); cudaFree(mp->d_ibool); // sources - if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){ + if (mp->simulation_type == 1 || mp->simulation_type == 3){ cudaFree(mp->d_sourcearrays); cudaFree(mp->d_stf_pre_compute); } @@ -1393,7 +1544,7 @@ TRACE("prepare_cleanup_device"); if( *ABSORBING_CONDITIONS ) cudaFree(mp->d_b_absorb_potential); - if( *SIMULATION_TYPE == 3 ) { + if( mp->simulation_type == 3 ) { cudaFree(mp->d_b_potential_acoustic); cudaFree(mp->d_b_potential_dot_acoustic); cudaFree(mp->d_b_potential_dot_dot_acoustic); @@ -1416,7 +1567,10 @@ TRACE("prepare_cleanup_device"); cudaFree(mp->d_veloc); cudaFree(mp->d_accel); cudaFree(mp->d_send_accel_buffer); - cudaFree(mp->d_rmass); + + cudaFree(mp->d_rmassx); + cudaFree(mp->d_rmassy); + cudaFree(mp->d_rmassz); cudaFree(mp->d_phase_ispec_inner_elastic); cudaFree(mp->d_ispec_is_elastic); @@ -1430,11 +1584,11 @@ TRACE("prepare_cleanup_device"); cudaFree(mp->d_rho_vp); cudaFree(mp->d_rho_vs); - if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )) + if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD )) cudaFree(mp->d_b_absorb_field); } - if( *SIMULATION_TYPE == 3 ) { + if( mp->simulation_type == 3 ) { cudaFree(mp->d_b_displ); cudaFree(mp->d_b_veloc); cudaFree(mp->d_b_accel); @@ -1450,7 +1604,7 @@ TRACE("prepare_cleanup_device"); cudaFree(mp->d_epsilondev_xy); cudaFree(mp->d_epsilondev_xz); cudaFree(mp->d_epsilondev_yz); - if( *SIMULATION_TYPE == 3 ){ + if( mp->simulation_type == 3 ){ cudaFree(mp->d_epsilon_trace_over_3); cudaFree(mp->d_b_epsilon_trace_over_3); cudaFree(mp->d_b_epsilondev_xx); @@ -1472,7 +1626,7 @@ TRACE("prepare_cleanup_device"); cudaFree(mp->d_R_xy); cudaFree(mp->d_R_xz); cudaFree(mp->d_R_yz); - if( *SIMULATION_TYPE == 3){ + if( mp->simulation_type == 3){ cudaFree(mp->d_b_R_xx); cudaFree(mp->d_b_R_yy); cudaFree(mp->d_b_R_xy); @@ -1522,7 +1676,7 @@ TRACE("prepare_cleanup_device"); } // ELASTIC_SIMULATION // purely adjoint & kernel array - if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){ + if( mp->simulation_type == 2 || mp->simulation_type == 3 ){ if(mp->nadj_rec_local > 0 ){ cudaFree(mp->d_adj_sourcearrays); cudaFree(mp->d_pre_computed_irec); diff --git a/src/cuda/specfem3D_gpu_cuda_method_stubs.c b/src/cuda/specfem3D_gpu_cuda_method_stubs.c index c96f208d5..c6231321c 100644 --- a/src/cuda/specfem3D_gpu_cuda_method_stubs.c +++ b/src/cuda/specfem3D_gpu_cuda_method_stubs.c @@ -79,13 +79,11 @@ void FC_FUNC_(get_max_accel, void FC_FUNC_(get_norm_acoustic_from_device, GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, - long* Mesh_pointer_f, - int* SIMULATION_TYPE) {} + long* Mesh_pointer_f) {} void FC_FUNC_(get_norm_elastic_from_device, GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, - long* Mesh_pointer_f, - int* SIMULATION_TYPE) {} + long* Mesh_pointer_f) {} // @@ -96,7 +94,6 @@ void FC_FUNC_(compute_add_sources_ac_cuda, COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, int* NSOURCESf, - int* 
SIMULATION_TYPEf, double* h_stf_pre_compute, int* myrankf) {} @@ -104,7 +101,6 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda, COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, int* NSOURCESf, - int* SIMULATION_TYPEf, double* h_stf_pre_compute, int* myrankf) {} @@ -170,14 +166,15 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3, void FC_FUNC_(compute_coupling_ac_el_cuda, COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* num_coupling_ac_el_facesf, - int* SIMULATION_TYPEf) {} + int* num_coupling_ac_el_facesf) {} void FC_FUNC_(compute_coupling_el_ac_cuda, COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* num_coupling_ac_el_facesf, - int* SIMULATION_TYPEf) {} + int* num_coupling_ac_el_facesf) {} + +void FC_FUNC_(compute_coupling_ocean_cuda, + COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) {} // @@ -211,25 +208,11 @@ void FC_FUNC_(compute_forces_acoustic_cuda, COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f, int* iphase, int* nspec_outer_acoustic, - int* nspec_inner_acoustic, - int* SIMULATION_TYPE) {} - -void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( - long* Mesh_pointer, - int* size_F, - int* SIMULATION_TYPE) {} - -void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( - long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE, - realw* b_deltatover2_F) {} + int* nspec_inner_acoustic) {} void FC_FUNC_(acoustic_enforce_free_surf_cuda, ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, - int* ABSORB_FREE_SURFACE) {} + int* ABSORB_FREE_SURFACE) {} // @@ -278,7 +261,6 @@ void FC_FUNC_(compute_forces_elastic_cuda, int* iphase, int* nspec_outer_elastic, int* nspec_inner_elastic, - int* SIMULATION_TYPE, int* COMPUTE_AND_STORE_STRAIN, int* ATTENUATION, int* ANISOTROPY) {} @@ -288,25 +270,6 @@ void FC_FUNC_(sync_copy_from_device, int* iphase, realw* send_buffer) {} -void FC_FUNC_(kernel_3_a_cuda, - KERNEL_3_A_CUDA)(long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE_f, - realw* b_deltatover2_F, - int* OCEANS) {} - -void FC_FUNC_(kernel_3_b_cuda, - KERNEL_3_B_CUDA)(long* Mesh_pointer, - int* size_F, - realw* deltatover2_F, - int* SIMULATION_TYPE_f, - realw* b_deltatover2_F) {} - -void FC_FUNC_(elastic_ocean_load_cuda, - ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, - int* SIMULATION_TYPE) {} - // // src/cuda/compute_kernels_cuda.cu @@ -338,12 +301,10 @@ void FC_FUNC_(compute_kernels_hess_cuda, // void FC_FUNC_(compute_stacey_acoustic_cuda, - COMPUTE_STACEY_ACOUSTIC_CUDA)( - long* Mesh_pointer_f, - int* phase_is_innerf, - int* SIMULATION_TYPEf, - int* SAVE_FORWARDf, - realw* h_b_absorb_potential) {} + COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* SAVE_FORWARDf, + realw* h_b_absorb_potential) {} // @@ -353,7 +314,6 @@ void FC_FUNC_(compute_stacey_acoustic_cuda, void FC_FUNC_(compute_stacey_elastic_cuda, COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, int* phase_is_innerf, - int* SIMULATION_TYPEf, int* SAVE_FORWARDf, realw* h_b_absorb_field) {} @@ -375,14 +335,13 @@ void FC_FUNC_(initialize_cuda_device, void FC_FUNC_(it_update_displacement_cuda, IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f, - int* size_F, - realw* deltat_F, - realw* deltatsqover2_F, - realw* deltatover2_F, - int* SIMULATION_TYPE, - realw* b_deltat_F, - realw* b_deltatsqover2_F, - realw* b_deltatover2_F) {} + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + 
realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) {} void FC_FUNC_(it_update_displacement_ac_cuda, it_update_displacement_ac_cuda)(long* Mesh_pointer_f, @@ -390,11 +349,33 @@ void FC_FUNC_(it_update_displacement_ac_cuda, realw* deltat_F, realw* deltatsqover2_F, realw* deltatover2_F, - int* SIMULATION_TYPE, realw* b_deltat_F, realw* b_deltatsqover2_F, realw* b_deltatover2_F) {} +void FC_FUNC_(kernel_3_a_cuda, + KERNEL_3_A_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F, + int* OCEANS) {} + +void FC_FUNC_(kernel_3_b_cuda, + KERNEL_3_B_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F) {} + +void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F) {} + +void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + realw* b_deltatover2_F) {} + // // src/cuda/noise_tomography_cuda.cu @@ -475,7 +456,6 @@ void FC_FUNC_(prepare_fields_acoustic_device, int* num_free_surface_faces, int* free_surface_ispec, int* free_surface_ijk, - int* ABSORBING_CONDITIONS, int* b_reclen_potential, realw* b_absorb_potential, int* ELASTIC_SIMULATION, @@ -490,22 +470,22 @@ void FC_FUNC_(prepare_fields_acoustic_device, void FC_FUNC_(prepare_fields_acoustic_adj_dev, PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, int* APPROXIMATE_HESS_KL) {} void FC_FUNC_(prepare_fields_elastic_device, PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f, int* size, - realw* rmass, + realw* rmassx, + realw* rmassy, + realw* rmassz, realw* rho_vp, realw* rho_vs, int* num_phase_ispec_elastic, int* phase_ispec_inner_elastic, int* ispec_is_elastic, - int* ABSORBING_CONDITIONS, realw* h_b_absorb_field, int* h_b_reclen_field, - int* SIMULATION_TYPE,int* SAVE_FORWARD, + int* SAVE_FORWARD, int* COMPUTE_AND_STORE_STRAIN, realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, realw* epsilondev_xz,realw* epsilondev_yz, @@ -551,7 +531,6 @@ void FC_FUNC_(prepare_fields_elastic_device, void FC_FUNC_(prepare_fields_elastic_adj_dev, PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f, int* size, - int* SIMULATION_TYPE, int* COMPUTE_AND_STORE_STRAIN, realw* epsilon_trace_over_3, realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy, @@ -578,7 +557,6 @@ void FC_FUNC_(prepare_fields_noise_device, int* free_surface_ispec, int* free_surface_ijk, int* num_free_surface_faces, - int* SIMULATION_TYPE, int* NOISE_TOMOGRAPHY, int* NSTEP, realw* noise_sourcearray, @@ -602,7 +580,6 @@ void FC_FUNC_(prepare_seismogram_fields, void FC_FUNC_(prepare_cleanup_device, PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f, - int* SIMULATION_TYPE, int* SAVE_FORWARD, int* ACOUSTIC_SIMULATION, int* ELASTIC_SIMULATION, @@ -774,7 +751,6 @@ void FC_FUNC_(transfer_kernels_hess_ac_tohost, void FC_FUNC_(transfer_seismograms_el_from_d, TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local, long* Mesh_pointer_f, - int* SIMULATION_TYPEf, realw* seismograms_d, realw* seismograms_v, realw* seismograms_a, @@ -785,7 +761,7 @@ void FC_FUNC_(transfer_station_el_from_device, realw* b_displ, realw* b_veloc, realw* b_accel, long* Mesh_pointer_f,int* number_receiver_global, int* ispec_selected_rec,int* ispec_selected_source, - int* ibool,int* SIMULATION_TYPEf) {} + int* ibool) {} void FC_FUNC_(transfer_station_ac_from_device, TRANSFER_STATION_AC_FROM_DEVICE)( @@ -799,6 +775,5 @@ void FC_FUNC_(transfer_station_ac_from_device, int* 
number_receiver_global, int* ispec_selected_rec, int* ispec_selected_source, - int* ibool, - int* SIMULATION_TYPEf) {} + int* ibool) {} diff --git a/src/cuda/write_seismograms_cuda.cu b/src/cuda/write_seismograms_cuda.cu index 95163d0d3..8bba41905 100644 --- a/src/cuda/write_seismograms_cuda.cu +++ b/src/cuda/write_seismograms_cuda.cu @@ -195,7 +195,6 @@ extern "C" void FC_FUNC_(transfer_seismograms_el_from_d, TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local, long* Mesh_pointer_f, - int* SIMULATION_TYPEf, realw* seismograms_d, realw* seismograms_v, realw* seismograms_a, @@ -345,16 +344,15 @@ void FC_FUNC_(transfer_station_el_from_device, realw* b_displ, realw* b_veloc, realw* b_accel, long* Mesh_pointer_f,int* number_receiver_global, int* ispec_selected_rec,int* ispec_selected_source, - int* ibool,int* SIMULATION_TYPEf) { + int* ibool) { TRACE("transfer_station_el_from_device"); Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + // checks if anything to do if( mp->nrec_local == 0 ) return; - int SIMULATION_TYPE = *SIMULATION_TYPEf; - - if(SIMULATION_TYPE == 1) { + if(mp->simulation_type == 1) { transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global, @@ -362,7 +360,7 @@ TRACE("transfer_station_el_from_device"); transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); } - else if(SIMULATION_TYPE == 2) { + else if(mp->simulation_type == 2) { transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global, mp->d_ispec_selected_source, ispec_selected_source, ibool); transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global, @@ -370,7 +368,7 @@ TRACE("transfer_station_el_from_device"); transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global, mp->d_ispec_selected_source, ispec_selected_source, ibool); } - else if(SIMULATION_TYPE == 3) { + else if(mp->simulation_type == 3) { transfer_field_from_device(mp,mp->d_b_displ,b_displ, number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); transfer_field_from_device(mp,mp->d_b_veloc,b_veloc, number_receiver_global, @@ -420,7 +418,7 @@ TRACE("transfer_field_acoustic_from_device"); int irec_local,irec,ispec,iglob,j; // checks if anything to do - if( mp->nrec_local == 0 ) return; + if( mp->nrec_local < 1 ) return; // sets up kernel dimensions int blocksize = NGLL3; @@ -442,8 +440,12 @@ TRACE("transfer_field_acoustic_from_device"); d_potential); +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("transfer_field_acoustic_from_device kernel"); +#endif + print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential, - mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500); + mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),55000); //printf("copy local receivers: %i \n",mp->nrec_local); @@ -483,19 +485,17 @@ void FC_FUNC_(transfer_station_ac_from_device, int* number_receiver_global, int* ispec_selected_rec, int* ispec_selected_source, - int* ibool, - int* SIMULATION_TYPEf) { + int* ibool) { TRACE("transfer_station_ac_from_device"); //double start_time = get_time(); Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + // checks if anything to do if( mp->nrec_local == 0 ) return; - int SIMULATION_TYPE = *SIMULATION_TYPEf; - - if(SIMULATION_TYPE == 1) { + if(mp->simulation_type == 
1) { transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic, number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); @@ -506,7 +506,7 @@ TRACE("transfer_station_ac_from_device"); number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); } - else if(SIMULATION_TYPE == 2) { + else if(mp->simulation_type == 2) { transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic, number_receiver_global, mp->d_ispec_selected_source, ispec_selected_source, ibool); @@ -517,7 +517,7 @@ TRACE("transfer_station_ac_from_device"); number_receiver_global, mp->d_ispec_selected_source, ispec_selected_source, ibool); } - else if(SIMULATION_TYPE == 3) { + else if(mp->simulation_type == 3) { transfer_field_acoustic_from_device(mp,mp->d_b_potential_acoustic,b_potential_acoustic, number_receiver_global, mp->d_ispec_selected_rec, ispec_selected_rec, ibool); diff --git a/src/generate_databases/create_mass_matrices.f90 b/src/generate_databases/create_mass_matrices.f90 index 47e976a13..c2ff99ccc 100644 --- a/src/generate_databases/create_mass_matrices.f90 +++ b/src/generate_databases/create_mass_matrices.f90 @@ -44,83 +44,125 @@ subroutine create_mass_matrices(nglob,nspec,ibool) integer :: ispec,i,j,k,iglob,ier real(kind=CUSTOM_REAL) :: rho_s, rho_f, rho_bar, phi, tort -! allocates memory - allocate(rmass(nglob),stat=ier); if(ier /= 0) stop 'error in allocate rmass' - allocate(rmass_acoustic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate rmass_acoustic' - allocate(rmass_solid_poroelastic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate' - allocate(rmass_fluid_poroelastic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate' - -! creates mass matrix - rmass(:) = 0._CUSTOM_REAL - rmass_acoustic(:) = 0._CUSTOM_REAL - rmass_solid_poroelastic(:) = 0._CUSTOM_REAL - rmass_fluid_poroelastic(:) = 0._CUSTOM_REAL - - do ispec=1,nspec - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - - weight = wxgll(i)*wygll(j)*wzgll(k) - jacobianl = jacobianstore(i,j,k,ispec) - -! acoustic mass matrix - if( ispec_is_acoustic(ispec) ) then - ! distinguish between single and double precision for reals - if(CUSTOM_REAL == SIZE_REAL) then - rmass_acoustic(iglob) = rmass_acoustic(iglob) + & - sngl( dble(jacobianl) * weight / dble(kappastore(i,j,k,ispec)) ) - else - rmass_acoustic(iglob) = rmass_acoustic(iglob) + & - jacobianl * weight / kappastore(i,j,k,ispec) - endif - endif + ! elastic domains + if( ELASTIC_SIMULATION ) then + ! allocates memory + allocate(rmass(nglob),stat=ier); if(ier /= 0) stop 'error in allocate rmass' + rmass(:) = 0._CUSTOM_REAL + + ! elastic mass matrix + do ispec=1,nspec + if( ispec_is_elastic(ispec) ) then + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + + weight = wxgll(i)*wygll(j)*wzgll(k) + jacobianl = jacobianstore(i,j,k,ispec) + + if(CUSTOM_REAL == SIZE_REAL) then + rmass(iglob) = rmass(iglob) + & + sngl( dble(jacobianl) * weight * dble(rhostore(i,j,k,ispec)) ) + else + rmass(iglob) = rmass(iglob) + & + jacobianl * weight * rhostore(i,j,k,ispec) + endif + enddo + enddo + enddo + endif + enddo ! nspec + endif -! elastic mass matrix - if( ispec_is_elastic(ispec) ) then - if(CUSTOM_REAL == SIZE_REAL) then - rmass(iglob) = rmass(iglob) + & - sngl( dble(jacobianl) * weight * dble(rhostore(i,j,k,ispec)) ) - else - rmass(iglob) = rmass(iglob) + & - jacobianl * weight * rhostore(i,j,k,ispec) - endif - endif + ! acoustic domains + if( ACOUSTIC_SIMULATION) then + ! 
allocates memory + allocate(rmass_acoustic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate rmass_acoustic' + rmass_acoustic(:) = 0._CUSTOM_REAL + + ! acoustic mass matrix + do ispec=1,nspec + if( ispec_is_acoustic(ispec) ) then + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + + weight = wxgll(i)*wygll(j)*wzgll(k) + jacobianl = jacobianstore(i,j,k,ispec) + + ! distinguish between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + rmass_acoustic(iglob) = rmass_acoustic(iglob) + & + sngl( dble(jacobianl) * weight / dble(kappastore(i,j,k,ispec)) ) + else + rmass_acoustic(iglob) = rmass_acoustic(iglob) + & + jacobianl * weight / kappastore(i,j,k,ispec) + endif + enddo + enddo + enddo + endif + enddo ! nspec + endif + + ! poroelastic domains + if( POROELASTIC_SIMULATION) then + ! allocates memory + allocate(rmass_solid_poroelastic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate' + allocate(rmass_fluid_poroelastic(nglob),stat=ier); if(ier /= 0) stop 'error in allocate' + rmass_solid_poroelastic(:) = 0._CUSTOM_REAL + rmass_fluid_poroelastic(:) = 0._CUSTOM_REAL + + ! poroelastic mass matrices + do ispec=1,nspec + if( ispec_is_poroelastic(ispec) ) then + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + + weight = wxgll(i)*wygll(j)*wzgll(k) + jacobianl = jacobianstore(i,j,k,ispec) + + rho_s = rhoarraystore(1,i,j,k,ispec) + rho_f = rhoarraystore(2,i,j,k,ispec) + phi = phistore(i,j,k,ispec) + tort = tortstore(i,j,k,ispec) + rho_bar = (1._CUSTOM_REAL-phi)*rho_s + phi*rho_f + + if(CUSTOM_REAL == SIZE_REAL) then + ! for the solid mass matrix + rmass_solid_poroelastic(iglob) = rmass_solid_poroelastic(iglob) + & + sngl( dble(jacobianl) * weight * dble(rho_bar - phi*rho_f/tort) ) + + ! for the fluid mass matrix + rmass_fluid_poroelastic(iglob) = rmass_fluid_poroelastic(iglob) + & + sngl( dble(jacobianl) * weight * dble(rho_bar*rho_f*tort - & + phi*rho_f*rho_f)/dble(rho_bar*phi) ) + else + rmass_solid_poroelastic(iglob) = rmass_solid_poroelastic(iglob) + & + jacobianl * weight * (rho_bar - phi*rho_f/tort) + + rmass_fluid_poroelastic(iglob) = rmass_fluid_poroelastic(iglob) + & + jacobianl * weight * (rho_bar*rho_f*tort - & + phi*rho_f*rho_f) / (rho_bar*phi) + endif + enddo + enddo + enddo + endif + enddo ! nspec + endif -! poroelastic mass matrices - if( ispec_is_poroelastic(ispec) ) then - - rho_s = rhoarraystore(1,i,j,k,ispec) - rho_f = rhoarraystore(2,i,j,k,ispec) - phi = phistore(i,j,k,ispec) - tort = tortstore(i,j,k,ispec) - rho_bar = (1._CUSTOM_REAL-phi)*rho_s + phi*rho_f - - if(CUSTOM_REAL == SIZE_REAL) then - ! for the solid mass matrix - rmass_solid_poroelastic(iglob) = rmass_solid_poroelastic(iglob) + & - sngl( dble(jacobianl) * weight * dble(rho_bar - phi*rho_f/tort) ) - - ! for the fluid mass matrix - rmass_fluid_poroelastic(iglob) = rmass_fluid_poroelastic(iglob) + & - sngl( dble(jacobianl) * weight * dble(rho_bar*rho_f*tort - & - phi*rho_f*rho_f)/dble(rho_bar*phi) ) - else - rmass_solid_poroelastic(iglob) = rmass_solid_poroelastic(iglob) + & - jacobianl * weight * (rho_bar - phi*rho_f/tort) - - rmass_fluid_poroelastic(iglob) = rmass_fluid_poroelastic(iglob) + & - jacobianl * weight * (rho_bar*rho_f*tort - & - phi*rho_f*rho_f) / (rho_bar*phi) - endif + ! extra C*deltat/2 contribution to the mass matrices on Stacey edges + ! for absorbing condition + call create_mass_matrices_Stacey(nglob,nspec,ibool) - endif + ! 
creates ocean load mass matrix + call create_mass_matrices_ocean_load(nglob,nspec,ibool) - enddo - enddo - enddo - enddo ! nspec end subroutine create_mass_matrices @@ -128,13 +170,16 @@ end subroutine create_mass_matrices !------------------------------------------------------------------------------------------------- ! - subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, & - UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & - NX_TOPO,NY_TOPO,itopo_bathy) + subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool) ! returns precomputed mass matrix in rmass array + use generate_databases_par,only: & + OCEANS,TOPOGRAPHY,UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & + NX_TOPO,NY_TOPO,itopo_bathy,myrank + use create_regions_mesh_ext_par + implicit none ! number of spectral elements in each block @@ -143,14 +188,6 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, ! arrays with the mesh global indices integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool - logical :: OCEANS,TOPOGRAPHY - - ! use integer array to store topography values - integer :: UTM_PROJECTION_ZONE - logical :: SUPPRESS_UTM_PROJECTION - - integer :: NX_TOPO,NY_TOPO - integer, dimension(NX_TOPO,NY_TOPO) :: itopo_bathy ! local parameters double precision :: weight @@ -164,6 +201,10 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, ! creates ocean load mass matrix if(OCEANS) then + if( myrank == 0) then + write(IMAIN,*) ' ...creating ocean load mass matrix ' + endif + ! adding ocean load mass matrix at ocean bottom NGLOB_OCEAN = nglob allocate(rmass_ocean_load(NGLOB_OCEAN),stat=ier) @@ -244,3 +285,144 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, endif end subroutine create_mass_matrices_ocean_load + + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine create_mass_matrices_Stacey(nglob,nspec,ibool) + +! in the case of stacey boundary conditions, add C*deltat/2 contribution to the mass matrix +! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region +! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix + + use generate_databases_par,only: & + DT + + use create_regions_mesh_ext_par + + implicit none + + integer :: nglob + integer :: nspec + integer,dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + ! local parameters + real(kind=CUSTOM_REAL) :: jacobianw + real(kind=CUSTOM_REAL) :: deltat,deltatover2 + real(kind=CUSTOM_REAL) :: tx,ty,tz,sn + real(kind=CUSTOM_REAL) :: nx,ny,nz,vn + integer :: ispec,iglob,i,j,k,iface,igll,ier + + ! checks if anything to do + if( num_abs_boundary_faces > 0 ) then + nglob_xy = nglob + else + nglob_xy = 1 + endif + + ! elastic domains + if( ELASTIC_SIMULATION ) then + allocate( rmassx(nglob_xy), & + rmassy(nglob_xy), & + rmassz(nglob_xy), & + stat=ier) + if(ier /= 0) stop 'error in allocate 21' + rmassx(:) = 0._CUSTOM_REAL + rmassy(:) = 0._CUSTOM_REAL + rmassz(:) = 0._CUSTOM_REAL + endif + + ! acoustic domains + if( ACOUSTIC_SIMULATION ) then + allocate( rmassz_acoustic(nglob_xy), & + stat=ier) + if(ier /= 0) stop 'error in allocate 22' + rmassz_acoustic(:) = 0._CUSTOM_REAL + endif + + ! use the non-dimensional time step to make the mass matrix correction + if(CUSTOM_REAL == SIZE_REAL) then + deltat = sngl(DT) + deltatover2 = sngl(0.5d0*deltat) + else + deltat = DT + deltatover2 = 0.5d0*deltat + endif + + ! 
adds contributions to mass matrix to stabilize stacey conditions + do iface=1,num_abs_boundary_faces + + ispec = abs_boundary_ispec(iface) + + ! elastic elements + if( ispec_is_elastic(ispec)) then + ! reference gll points on boundary face + do igll = 1,NGLLSQUARE + ! gets local indices for GLL point + i = abs_boundary_ijk(1,igll,iface) + j = abs_boundary_ijk(2,igll,iface) + k = abs_boundary_ijk(3,igll,iface) + + ! gets velocity + iglob = ibool(i,j,k,ispec) + + ! gets associated normal + nx = abs_boundary_normal(1,igll,iface) + ny = abs_boundary_normal(2,igll,iface) + nz = abs_boundary_normal(3,igll,iface) + + vn = deltatover2*(nx+ny+nz) + + ! C*deltat/2 contributions + tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(deltatover2-vn*nx) + ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(deltatover2-vn*ny) + tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(deltatover2-vn*nz) + + ! gets associated, weighted jacobian + jacobianw = abs_boundary_jacobian2Dw(igll,iface) + + ! assembles mass matrix on global points + if(CUSTOM_REAL == SIZE_REAL) then + rmassx(iglob) = rmassx(iglob) + sngl(tx*jacobianw) + rmassy(iglob) = rmassy(iglob) + sngl(ty*jacobianw) + rmassz(iglob) = rmassz(iglob) + sngl(tz*jacobianw) + else + rmassx(iglob) = rmassx(iglob) + tx*jacobianw + rmassy(iglob) = rmassy(iglob) + ty*jacobianw + rmassz(iglob) = rmassz(iglob) + tz*jacobianw + endif + enddo + endif ! elastic + + ! acoustic element + if( ispec_is_acoustic(ispec) ) then + + ! reference gll points on boundary face + do igll = 1,NGLLSQUARE + ! gets local indices for GLL point + i = abs_boundary_ijk(1,igll,iface) + j = abs_boundary_ijk(2,igll,iface) + k = abs_boundary_ijk(3,igll,iface) + + ! gets global index + iglob=ibool(i,j,k,ispec) + + ! gets associated, weighted jacobian + jacobianw = abs_boundary_jacobian2Dw(igll,iface) + + ! C * DT/2 contribution + sn = deltatover2/rho_vp(i,j,k,ispec) + + if(CUSTOM_REAL == SIZE_REAL) then + rmassz_acoustic(iglob) = rmassz_acoustic(iglob) + sngl(jacobianw*sn) + else + rmassz_acoustic(iglob) = rmassz_acoustic(iglob) + jacobianw*sn + endif + enddo + endif ! acoustic + enddo + + end subroutine create_mass_matrices_Stacey + diff --git a/src/generate_databases/create_regions_mesh.f90 b/src/generate_databases/create_regions_mesh.f90 index b20426f04..85269e2b3 100644 --- a/src/generate_databases/create_regions_mesh.f90 +++ b/src/generate_databases/create_regions_mesh.f90 @@ -46,11 +46,10 @@ subroutine create_regions_mesh() nodes_ibelm_xmin,nodes_ibelm_xmax,nodes_ibelm_ymin,nodes_ibelm_ymax,& nodes_ibelm_bottom,nodes_ibelm_top, & SAVE_MESH_FILES, & - ANISOTROPY,NPROC,OCEANS,TOPOGRAPHY, & + ANISOTROPY,NPROC,OCEANS, & ATTENUATION,USE_OLSEN_ATTENUATION, & - UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & - NX_TOPO,NY_TOPO,itopo_bathy, & nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho + use create_regions_mesh_ext_par implicit none @@ -176,15 +175,6 @@ subroutine create_regions_mesh() endif call create_mass_matrices(nglob_dummy,nspec,ibool) -! creates ocean load mass matrix - call sync_all() - if( myrank == 0) then - write(IMAIN,*) ' ...creating ocean load mass matrix ' - endif - call create_mass_matrices_ocean_load(nglob_dummy,nspec,ibool,OCEANS,TOPOGRAPHY, & - UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & - NX_TOPO,NY_TOPO,itopo_bathy) - ! saves the binary mesh files call sync_all() if( myrank == 0) then @@ -232,8 +222,7 @@ subroutine create_regions_mesh() ispec_is_elastic,min_resolved_period,prname) endif -! cleanup - if( .not. 
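For a single boundary GLL point, the C*deltat/2 Stacey contributions accumulated in the loop above reduce to the terms sketched below; argument names are illustrative and the snippet stands alone (it does not read DT or the boundary arrays, and keeps double precision only).

  subroutine stacey_mass_terms(deltat,nx,ny,nz,rho_vp_l,rho_vs_l,jacobianw, &
                               dmx,dmy,dmz,dm_ac)
  implicit none
  double precision, intent(in)  :: deltat              ! time step DT
  double precision, intent(in)  :: nx,ny,nz            ! outward boundary normal at the GLL point
  double precision, intent(in)  :: rho_vp_l,rho_vs_l   ! rho*vp and rho*vs at the GLL point
  double precision, intent(in)  :: jacobianw           ! weighted 2D boundary jacobian
  double precision, intent(out) :: dmx,dmy,dmz         ! elastic additions to rmassx/y/z(iglob)
  double precision, intent(out) :: dm_ac               ! acoustic addition to rmassz_acoustic(iglob)
  double precision :: deltatover2,vn
  deltatover2 = 0.5d0*deltat
  vn = deltatover2*(nx+ny+nz)
  ! elastic C*deltat/2 boundary damping terms
  dmx = ( rho_vp_l*vn*nx + rho_vs_l*(deltatover2 - vn*nx) ) * jacobianw
  dmy = ( rho_vp_l*vn*ny + rho_vs_l*(deltatover2 - vn*ny) ) * jacobianw
  dmz = ( rho_vp_l*vn*nz + rho_vs_l*(deltatover2 - vn*nz) ) * jacobianw
  ! acoustic C*deltat/2 boundary damping term
  dm_ac = jacobianw * deltatover2 / rho_vp_l
  end subroutine stacey_mass_terms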
SAVE_MOHO_MESH ) deallocate(xstore_dummy,ystore_dummy,zstore_dummy) + ! cleanup deallocate(xixstore,xiystore,xizstore,& etaxstore,etaystore,etazstore,& gammaxstore,gammaystore,gammazstore) @@ -242,6 +231,22 @@ subroutine create_regions_mesh() deallocate(rho_vpI,rho_vpII,rho_vsI) deallocate(rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore) + if( .not. SAVE_MOHO_MESH ) then + deallocate(xstore_dummy,ystore_dummy,zstore_dummy) + endif + + if( ELASTIC_SIMULATION ) then + deallocate(rmass) + deallocate(rmassx,rmassy,rmassz) + endif + if( ACOUSTIC_SIMULATION) then + deallocate(rmass_acoustic) + deallocate(rmassz_acoustic) + endif + if( POROELASTIC_SIMULATION) then + deallocate(rmass_solid_poroelastic,rmass_fluid_poroelastic) + endif + end subroutine create_regions_mesh ! diff --git a/src/generate_databases/generate_databases.f90 b/src/generate_databases/generate_databases.f90 index 4b4e407e4..8a8906178 100644 --- a/src/generate_databases/generate_databases.f90 +++ b/src/generate_databases/generate_databases.f90 @@ -320,6 +320,8 @@ subroutine gd_read_parameters write(IMAIN,'(a)',advance='yes') ' external' case( IMODEL_IPATI ) write(IMAIN,'(a)',advance='yes') ' ipati' + case( IMODEL_IPATI_WATER ) + write(IMAIN,'(a)',advance='yes') ' ipati_water' end select write(IMAIN,*) diff --git a/src/generate_databases/generate_databases_par.f90 b/src/generate_databases/generate_databases_par.f90 index 2129cc049..af7a7f97f 100644 --- a/src/generate_databases/generate_databases_par.f90 +++ b/src/generate_databases/generate_databases_par.f90 @@ -177,6 +177,11 @@ module create_regions_mesh_ext_par real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass,rmass_acoustic,& rmass_solid_poroelastic,rmass_fluid_poroelastic +! mass matrix contributions + real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmassx,rmassy,rmassz + real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmassz_acoustic + integer :: nglob_xy + ! ocean load integer :: NGLOB_OCEAN real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass_ocean_load diff --git a/src/generate_databases/get_model.f90 b/src/generate_databases/get_model.f90 index 7dade46fd..9d1917aa0 100644 --- a/src/generate_databases/get_model.f90 +++ b/src/generate_databases/get_model.f90 @@ -346,6 +346,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & enddo ! sets simulation domain flags + ! all processes will have e.g. acoustic_simulation flag set if any flag is .true. somewhere call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION ) call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION ) call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION ) @@ -409,7 +410,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & ! selects chosen velocity model select case( IMODEL ) - case( IMODEL_DEFAULT,IMODEL_GLL,IMODEL_IPATI ) + case( IMODEL_DEFAULT,IMODEL_GLL,IMODEL_IPATI,IMODEL_IPATI_WATER ) ! material values determined by mesh properties call model_default(materials_ext_mesh,nmat_ext_mesh, & undef_mat_prop,nundefMat_ext_mesh, & @@ -485,7 +486,9 @@ subroutine get_model_binaries(myrank,nspec,LOCAL_PATH) ! reads in material parameters from external binary files use generate_databases_par,only: IMODEL + use create_regions_mesh_ext_par + implicit none ! number of spectral elements in each block @@ -505,9 +508,15 @@ subroutine get_model_binaries(myrank,nspec,LOCAL_PATH) ! import the model from files in SPECFEM format ! 
note that those those files should be saved in LOCAL_PATH call model_gll(myrank,nspec,LOCAL_PATH) + case( IMODEL_IPATI ) ! import the model from modified files in SPECFEM format call model_ipati(myrank,nspec,LOCAL_PATH) + + case( IMODEL_IPATI_WATER ) + ! import the model from modified files in SPECFEM format + call model_ipati_water(myrank,nspec,LOCAL_PATH) + end select end subroutine get_model_binaries diff --git a/src/generate_databases/memory_eval.f90 b/src/generate_databases/memory_eval.f90 index a1bdfdded..5c1d4d5a2 100644 --- a/src/generate_databases/memory_eval.f90 +++ b/src/generate_databases/memory_eval.f90 @@ -88,7 +88,7 @@ subroutine memory_eval(NSPEC_AB,NGLOB_AB,max_nibool_interfaces_ext_mesh,num_inte memory_size = memory_size + 3.d0*dble(NDIM)*NGLOB_AB*dble(CUSTOM_REAL) ! rmass - memory_size = memory_size + NGLOB_AB*dble(CUSTOM_REAL) + memory_size = memory_size + 3*NGLOB_AB*dble(CUSTOM_REAL) ! rho_vp,rho_vs memory_size = memory_size + 2.d0*dble(NGLLX)*dble(NGLLY)*dble(NGLLZ)*NSPEC_AB*dble(CUSTOM_REAL) diff --git a/src/generate_databases/model_1d_socal.f90 b/src/generate_databases/model_1d_socal.f90 index 4a799d3f9..8d6f95110 100644 --- a/src/generate_databases/model_1d_socal.f90 +++ b/src/generate_databases/model_1d_socal.f90 @@ -37,7 +37,6 @@ subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! given a GLL point, returns super-imposed velocity model values - use generate_databases_par,only: nspec => NSPEC_AB,ibool use create_regions_mesh_ext_par implicit none diff --git a/src/generate_databases/model_ipati.f90 b/src/generate_databases/model_ipati.f90 index 01e145a61..820837267 100644 --- a/src/generate_databases/model_ipati.f90 +++ b/src/generate_databases/model_ipati.f90 @@ -111,3 +111,97 @@ subroutine model_ipati(myrank,nspec,LOCAL_PATH) deallocate( rho_read,vp_read,vs_read) end subroutine model_ipati + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine model_ipati_water(myrank,nspec,LOCAL_PATH) + + use create_regions_mesh_ext_par + implicit none + + integer, intent(in) :: myrank,nspec + character(len=256) :: LOCAL_PATH + + ! local parameters + real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read + integer :: ispec,ier + character(len=256) :: prname_lp,filename + + ! ----------------------------------------------------------------------------- + + ! note: vp not vs structure is available (as is often the case in exploration seismology), + ! scaling factor + real, parameter :: SCALING_FACTOR = 1.0/1.8 + + ! ----------------------------------------------------------------------------- + + ! user output + if (myrank==0) then + write(IMAIN,*) + write(IMAIN,*) 'using external IPATI_WATER model from:',trim(LOCAL_PATH) + write(IMAIN,*) 'scaling factor: ',SCALING_FACTOR + write(IMAIN,*) + endif + + ! processors name + write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_' + + ! density + allocate( rho_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array rho_read' + + filename = prname_lp(1:len_trim(prname_lp))//'rho.bin' + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading rho.bin file' + endif + + read(28) rho_read + close(28) + + ! 
vp + allocate( vp_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array vp_read' + + filename = prname_lp(1:len_trim(prname_lp))//'vp.bin' + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading vp.bin file' + endif + + read(28) vp_read + close(28) + + ! vs scaled from vp + allocate( vs_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array vs_read' + + ! scaling + vs_read = vp_read * SCALING_FACTOR + + ! overwrites only elastic elements + do ispec=1,nspec + ! assumes water layer with acoustic elements are set properly + ! only overwrites elastic elements + if( ispec_is_elastic(ispec)) then + ! isotropic model parameters + rhostore(:,:,:,ispec) = rho_read(:,:,:,ispec) + kappastore(:,:,:,ispec) = rhostore(:,:,:,ispec) * ( vp_read(:,:,:,ispec) * vp_read(:,:,:,ispec) & + - FOUR_THIRDS * vs_read(:,:,:,ispec) * vs_read(:,:,:,ispec) ) + mustore(:,:,:,ispec) = rhostore(:,:,:,ispec) * vs_read(:,:,:,ispec) * vs_read(:,:,:,ispec) + rho_vp(:,:,:,ispec) = rhostore(:,:,:,ispec) * vp_read(:,:,:,ispec) + rho_vs(:,:,:,ispec) = rhostore(:,:,:,ispec) * vs_read(:,:,:,ispec) + endif + enddo + + ! free memory + deallocate( rho_read,vp_read,vs_read) + + end subroutine model_ipati_water + + + diff --git a/src/generate_databases/save_arrays_solver.f90 b/src/generate_databases/save_arrays_solver.f90 index d47fcb892..e75f45745 100644 --- a/src/generate_databases/save_arrays_solver.f90 +++ b/src/generate_databases/save_arrays_solver.f90 @@ -91,30 +91,23 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob,OCEANS,ibool, & write(IOUT) ispec_is_poroelastic ! acoustic -! all processes will have acoustic_simulation set if any flag is .true. somewhere - call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION ) if( ACOUSTIC_SIMULATION ) then write(IOUT) rmass_acoustic write(IOUT) rhostore endif ! elastic - call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION ) if( ELASTIC_SIMULATION ) then write(IOUT) rmass - if( OCEANS) then write(IOUT) rmass_ocean_load endif - !pll Stacey write(IOUT) rho_vp write(IOUT) rho_vs - endif ! poroelastic - call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION ) if( POROELASTIC_SIMULATION ) then write(IOUT) rmass_solid_poroelastic write(IOUT) rmass_fluid_poroelastic @@ -136,6 +129,15 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob,OCEANS,ibool, & write(IOUT) abs_boundary_ijk write(IOUT) abs_boundary_jacobian2Dw write(IOUT) abs_boundary_normal + ! store mass matrix contributions + if(ELASTIC_SIMULATION) then + write(IOUT) rmassx + write(IOUT) rmassy + write(IOUT) rmassz + endif + if(ACOUSTIC_SIMULATION) then + write(IOUT) rmassz_acoustic + endif endif ! free surface diff --git a/src/shared/constants.h.in b/src/shared/constants.h.in index 6d118bbb6..6ee728818 100644 --- a/src/shared/constants.h.in +++ b/src/shared/constants.h.in @@ -428,3 +428,5 @@ integer, parameter :: IMODEL_SALTON_TROUGH = 8 integer, parameter :: IMODEL_1D_PREM_PB = 9 integer, parameter :: IMODEL_IPATI = 10 + integer, parameter :: IMODEL_IPATI_WATER = 11 + diff --git a/src/shared/get_shape2D.f90 b/src/shared/get_shape2D.f90 index f384ca9cf..1a32a80ce 100644 --- a/src/shared/get_shape2D.f90 +++ b/src/shared/get_shape2D.f90 @@ -46,8 +46,6 @@ subroutine get_shape2D(myrank,shape2D,dershape2D,xigll,yigll,NGLLA,NGLLB) ! 
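Per GLL value, the Vp-to-Vs scaling and the isotropic moduli used by model_ipati_water above amount to the small sketch below; names are illustrative and the 4/3 factor is written out so the snippet is self-contained.

  subroutine ipati_water_point(rho,vp,scaling,vs,kappa,mu)
  implicit none
  double precision, intent(in)  :: rho,vp      ! density and P velocity read from the *.bin files
  double precision, intent(in)  :: scaling     ! e.g. 1.d0/1.8d0 (SCALING_FACTOR)
  double precision, intent(out) :: vs,kappa,mu
  ! S velocity derived from the P velocity (no Vs model available)
  vs = vp * scaling
  ! isotropic bulk and shear moduli
  kappa = rho * ( vp*vp - (4.d0/3.d0)*vs*vs )
  mu    = rho * vs*vs
  end subroutine ipati_water_point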
location of the nodes of the 2D quadrilateral elements double precision xi,eta double precision xi_map,eta_map - double precision l1xi,l2xi,l3xi,l1eta,l2eta,l3eta - double precision l1pxi,l2pxi,l3pxi,l1peta,l2peta,l3peta ! for checking the 2D shape functions double precision sumshape,sumdershapexi,sumdershapeeta @@ -94,105 +92,146 @@ subroutine get_shape2D(myrank,shape2D,dershape2D,xigll,yigll,NGLLA,NGLLB) else - ! generate the 2D shape functions and their derivatives (9 nodes) - do i=1,NGLLA + ! note: put further initialization for ngnod2d == 9 into subroutine + ! to avoid compilation errors in case ngnod2d == 4 + call get_shape2D_9(NGNOD2D,NDIM2D,shape2D,dershape2D,xigll,yigll,NGLLA,NGLLB) - xi=xigll(i) + endif - l1xi=HALF*xi*(xi-ONE) - l2xi=ONE-xi**2 - l3xi=HALF*xi*(xi+ONE) +! check the 2D shape functions + do i=1,NGLLA + do j=1,NGLLB - l1pxi=xi-HALF - l2pxi=-TWO*xi - l3pxi=xi+HALF + sumshape=ZERO - do j=1,NGLLB + sumdershapexi=ZERO + sumdershapeeta=ZERO - eta=yigll(j) + do ia=1,NGNOD2D + sumshape=sumshape+shape2D(ia,i,j) - l1eta=HALF*eta*(eta-ONE) - l2eta=ONE-eta**2 - l3eta=HALF*eta*(eta+ONE) + sumdershapexi=sumdershapexi+dershape2D(1,ia,i,j) + sumdershapeeta=sumdershapeeta+dershape2D(2,ia,i,j) + enddo - l1peta=eta-HALF - l2peta=-TWO*eta - l3peta=eta+HALF +! the sum of the shape functions should be 1 + if(abs(sumshape-ONE)>TINYVAL) call exit_MPI(myrank,'error in 2D shape functions') -! corner nodes +! the sum of the derivatives of the shape functions should be 0 + if(abs(sumdershapexi)>TINYVAL) & + call exit_MPI(myrank,'error in xi derivatives of 2D shape function') - shape2D(1,i,j)=l1xi*l1eta - shape2D(2,i,j)=l3xi*l1eta - shape2D(3,i,j)=l3xi*l3eta - shape2D(4,i,j)=l1xi*l3eta + if(abs(sumdershapeeta)>TINYVAL) & + call exit_MPI(myrank,'error in eta derivatives of 2D shape function') - dershape2D(1,1,i,j)=l1pxi*l1eta - dershape2D(1,2,i,j)=l3pxi*l1eta - dershape2D(1,3,i,j)=l3pxi*l3eta - dershape2D(1,4,i,j)=l1pxi*l3eta + enddo + enddo - dershape2D(2,1,i,j)=l1xi*l1peta - dershape2D(2,2,i,j)=l3xi*l1peta - dershape2D(2,3,i,j)=l3xi*l3peta - dershape2D(2,4,i,j)=l1xi*l3peta + end subroutine get_shape2D -! midside nodes +! +!------------------------------------------------------------------------------------------------- +! - shape2D(5,i,j)=l2xi*l1eta - shape2D(6,i,j)=l3xi*l2eta - shape2D(7,i,j)=l2xi*l3eta - shape2D(8,i,j)=l1xi*l2eta + subroutine get_shape2D_9(ngnod2d,ndim2d,shape2D,dershape2D,xigll,yigll,NGLLA,NGLLB) - dershape2D(1,5,i,j)=l2pxi*l1eta - dershape2D(1,6,i,j)=l3pxi*l2eta - dershape2D(1,7,i,j)=l2pxi*l3eta - dershape2D(1,8,i,j)=l1pxi*l2eta + implicit none - dershape2D(2,5,i,j)=l2xi*l1peta - dershape2D(2,6,i,j)=l3xi*l2peta - dershape2D(2,7,i,j)=l2xi*l3peta - dershape2D(2,8,i,j)=l1xi*l2peta +! generic routine that accepts any polynomial degree in each direction -! center node + integer :: ngnod2d,ndim2d + integer :: NGLLA,NGLLB - shape2D(9,i,j)=l2xi*l2eta + double precision xigll(NGLLA) + double precision yigll(NGLLB) - dershape2D(1,9,i,j)=l2pxi*l2eta - dershape2D(2,9,i,j)=l2xi*l2peta +! 2D shape functions and their derivatives + double precision shape2D(NGNOD2D,NGLLA,NGLLB) + double precision dershape2D(NDIM2D,NGNOD2D,NGLLA,NGLLB) - enddo - enddo + integer i,j - endif +! location of the nodes of the 2D quadrilateral elements + double precision xi,eta + double precision l1xi,l2xi,l3xi,l1eta,l2eta,l3eta + double precision l1pxi,l2pxi,l3pxi,l1peta,l2peta,l3peta -! 
check the 2D shape functions + double precision,parameter :: HALF = 0.5d0 + double precision,parameter :: ONE = 1.0d0 + double precision,parameter :: TWO = 2.0d0 + + ! check that the parameter file is correct + if( ngnod2d /= 9) stop 'surface elements should have 9 control nodes' + + ! generate the 2D shape functions and their derivatives (9 nodes) do i=1,NGLLA + + xi=xigll(i) + + l1xi=HALF*xi*(xi-ONE) + l2xi=ONE-xi**2 + l3xi=HALF*xi*(xi+ONE) + + l1pxi=xi-HALF + l2pxi=-TWO*xi + l3pxi=xi+HALF + do j=1,NGLLB - sumshape=ZERO + eta=yigll(j) - sumdershapexi=ZERO - sumdershapeeta=ZERO + l1eta=HALF*eta*(eta-ONE) + l2eta=ONE-eta**2 + l3eta=HALF*eta*(eta+ONE) - do ia=1,NGNOD2D - sumshape=sumshape+shape2D(ia,i,j) + l1peta=eta-HALF + l2peta=-TWO*eta + l3peta=eta+HALF - sumdershapexi=sumdershapexi+dershape2D(1,ia,i,j) - sumdershapeeta=sumdershapeeta+dershape2D(2,ia,i,j) - enddo +! corner nodes -! the sum of the shape functions should be 1 - if(abs(sumshape-ONE)>TINYVAL) call exit_MPI(myrank,'error in 2D shape functions') + shape2D(1,i,j)=l1xi*l1eta + shape2D(2,i,j)=l3xi*l1eta + shape2D(3,i,j)=l3xi*l3eta + shape2D(4,i,j)=l1xi*l3eta -! the sum of the derivatives of the shape functions should be 0 - if(abs(sumdershapexi)>TINYVAL) & - call exit_MPI(myrank,'error in xi derivatives of 2D shape function') + dershape2D(1,1,i,j)=l1pxi*l1eta + dershape2D(1,2,i,j)=l3pxi*l1eta + dershape2D(1,3,i,j)=l3pxi*l3eta + dershape2D(1,4,i,j)=l1pxi*l3eta - if(abs(sumdershapeeta)>TINYVAL) & - call exit_MPI(myrank,'error in eta derivatives of 2D shape function') + dershape2D(2,1,i,j)=l1xi*l1peta + dershape2D(2,2,i,j)=l3xi*l1peta + dershape2D(2,3,i,j)=l3xi*l3peta + dershape2D(2,4,i,j)=l1xi*l3peta + +! midside nodes + + shape2D(5,i,j)=l2xi*l1eta + shape2D(6,i,j)=l3xi*l2eta + shape2D(7,i,j)=l2xi*l3eta + shape2D(8,i,j)=l1xi*l2eta + + dershape2D(1,5,i,j)=l2pxi*l1eta + dershape2D(1,6,i,j)=l3pxi*l2eta + dershape2D(1,7,i,j)=l2pxi*l3eta + dershape2D(1,8,i,j)=l1pxi*l2eta + + dershape2D(2,5,i,j)=l2xi*l1peta + dershape2D(2,6,i,j)=l3xi*l2peta + dershape2D(2,7,i,j)=l2xi*l3peta + dershape2D(2,8,i,j)=l1xi*l2peta + +! center node + + shape2D(9,i,j)=l2xi*l2eta + + dershape2D(1,9,i,j)=l2pxi*l2eta + dershape2D(2,9,i,j)=l2xi*l2peta enddo enddo - end subroutine get_shape2D + end subroutine get_shape2D_9 + diff --git a/src/shared/hex_nodes.f90 b/src/shared/hex_nodes.f90 index ec309b2b9..392eef13d 100644 --- a/src/shared/hex_nodes.f90 +++ b/src/shared/hex_nodes.f90 @@ -72,91 +72,110 @@ subroutine usual_hex_nodes(iaddx,iaddy,iaddz) if(NGNOD == 27) then - ! midside nodes (nodes located in the middle of an edge) + ! note: put further initialization into subroutine to avoid compilation errors + ! in case ngnod == 8 + call usual_hex_nodes_27(NGNOD,iaddx,iaddy,iaddz) - iaddx(9) = 1 - iaddy(9) = 0 - iaddz(9) = 0 + endif - iaddx(10) = 2 - iaddy(10) = 1 - iaddz(10) = 0 + end subroutine usual_hex_nodes - iaddx(11) = 1 - iaddy(11) = 2 - iaddz(11) = 0 +! +!------------------------------------------------------------------------------------------------- +! - iaddx(12) = 0 - iaddy(12) = 1 - iaddz(12) = 0 + subroutine usual_hex_nodes_27(ngnod,iaddx,iaddy,iaddz) - iaddx(13) = 0 - iaddy(13) = 0 - iaddz(13) = 1 + implicit none - iaddx(14) = 2 - iaddy(14) = 0 - iaddz(14) = 1 + integer :: ngnod + integer,dimension(ngnod) :: iaddx,iaddy,iaddz - iaddx(15) = 2 - iaddy(15) = 2 - iaddz(15) = 1 +! define the topology of the hexahedral elements - iaddx(16) = 0 - iaddy(16) = 2 - iaddz(16) = 1 + ! 
midside nodes (nodes located in the middle of an edge) - iaddx(17) = 1 - iaddy(17) = 0 - iaddz(17) = 2 + iaddx(9) = 1 + iaddy(9) = 0 + iaddz(9) = 0 - iaddx(18) = 2 - iaddy(18) = 1 - iaddz(18) = 2 + iaddx(10) = 2 + iaddy(10) = 1 + iaddz(10) = 0 - iaddx(19) = 1 - iaddy(19) = 2 - iaddz(19) = 2 + iaddx(11) = 1 + iaddy(11) = 2 + iaddz(11) = 0 - iaddx(20) = 0 - iaddy(20) = 1 - iaddz(20) = 2 + iaddx(12) = 0 + iaddy(12) = 1 + iaddz(12) = 0 - ! side center nodes (nodes located in the middle of a face) + iaddx(13) = 0 + iaddy(13) = 0 + iaddz(13) = 1 - iaddx(21) = 1 - iaddy(21) = 1 - iaddz(21) = 0 + iaddx(14) = 2 + iaddy(14) = 0 + iaddz(14) = 1 - iaddx(22) = 1 - iaddy(22) = 0 - iaddz(22) = 1 + iaddx(15) = 2 + iaddy(15) = 2 + iaddz(15) = 1 - iaddx(23) = 2 - iaddy(23) = 1 - iaddz(23) = 1 + iaddx(16) = 0 + iaddy(16) = 2 + iaddz(16) = 1 - iaddx(24) = 1 - iaddy(24) = 2 - iaddz(24) = 1 + iaddx(17) = 1 + iaddy(17) = 0 + iaddz(17) = 2 - iaddx(25) = 0 - iaddy(25) = 1 - iaddz(25) = 1 + iaddx(18) = 2 + iaddy(18) = 1 + iaddz(18) = 2 - iaddx(26) = 1 - iaddy(26) = 1 - iaddz(26) = 2 + iaddx(19) = 1 + iaddy(19) = 2 + iaddz(19) = 2 - ! center node (barycenter of the eight corners) + iaddx(20) = 0 + iaddy(20) = 1 + iaddz(20) = 2 - iaddx(27) = 1 - iaddy(27) = 1 - iaddz(27) = 1 + ! side center nodes (nodes located in the middle of a face) - endif + iaddx(21) = 1 + iaddy(21) = 1 + iaddz(21) = 0 - end subroutine usual_hex_nodes + iaddx(22) = 1 + iaddy(22) = 0 + iaddz(22) = 1 + + iaddx(23) = 2 + iaddy(23) = 1 + iaddz(23) = 1 + + iaddx(24) = 1 + iaddy(24) = 2 + iaddz(24) = 1 + + iaddx(25) = 0 + iaddy(25) = 1 + iaddz(25) = 1 + + iaddx(26) = 1 + iaddy(26) = 1 + iaddz(26) = 2 + + ! center node (barycenter of the eight corners) + + iaddx(27) = 1 + iaddy(27) = 1 + iaddz(27) = 1 + + end subroutine usual_hex_nodes_27 ! !------------------------------------------------------------------------------------------------- diff --git a/src/shared/read_parameter_file.f90 b/src/shared/read_parameter_file.f90 index 8efc947ac..87f2f252c 100644 --- a/src/shared/read_parameter_file.f90 +++ b/src/shared/read_parameter_file.f90 @@ -254,6 +254,8 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & IMODEL = IMODEL_USER_EXTERNAL case( 'ipati' ) IMODEL = IMODEL_IPATI + case( 'ipati_water' ) + IMODEL = IMODEL_IPATI_WATER case( 'gll' ) IMODEL = IMODEL_GLL case( 'salton_trough') @@ -270,8 +272,8 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & end select ! check - if( IMODEL == IMODEL_IPATI ) then - if( USE_RICKER_IPATI .eqv. .false. ) stop 'please set USE_RICKER_IPATI to true in shared/constants.h and recompile' + if( IMODEL == IMODEL_IPATI .or. IMODEL == IMODEL_IPATI_WATER ) then + if( USE_RICKER_IPATI .eqv. .false. ) stop 'please set USE_RICKER_IPATI to .true. in shared/constants.h and recompile' endif end subroutine read_parameter_file diff --git a/src/shared/recompute_jacobian.f90 b/src/shared/recompute_jacobian.f90 index b082daf2b..79180a935 100644 --- a/src/shared/recompute_jacobian.f90 +++ b/src/shared/recompute_jacobian.f90 @@ -48,13 +48,6 @@ subroutine recompute_jacobian(xelm,yelm,zelm,xi,eta,gamma,x,y,z, & double precision xgamma,ygamma,zgamma double precision ra1,ra2,rb1,rb2,rc1,rc2 - double precision l1xi,l2xi,l3xi - double precision l1eta,l2eta,l3eta - double precision l1gamma,l2gamma,l3gamma - double precision l1pxi,l2pxi,l3pxi - double precision l1peta,l2peta,l3peta - double precision l1pgamma,l2pgamma,l3pgamma - integer ia ! 
for 8-node element @@ -120,164 +113,9 @@ subroutine recompute_jacobian(xelm,yelm,zelm,xi,eta,gamma,x,y,z, & else -! *** -! *** create the 3D shape functions and the Jacobian for a 27-node element -! *** - - l1xi=HALF*xi*(xi-ONE) - l2xi=ONE-xi**2 - l3xi=HALF*xi*(xi+ONE) - - l1pxi=xi-HALF - l2pxi=-TWO*xi - l3pxi=xi+HALF - - l1eta=HALF*eta*(eta-ONE) - l2eta=ONE-eta**2 - l3eta=HALF*eta*(eta+ONE) - - l1peta=eta-HALF - l2peta=-TWO*eta - l3peta=eta+HALF - - l1gamma=HALF*gamma*(gamma-ONE) - l2gamma=ONE-gamma**2 - l3gamma=HALF*gamma*(gamma+ONE) - - l1pgamma=gamma-HALF - l2pgamma=-TWO*gamma - l3pgamma=gamma+HALF - -! corner nodes - - shape3D(1)=l1xi*l1eta*l1gamma - shape3D(2)=l3xi*l1eta*l1gamma - shape3D(3)=l3xi*l3eta*l1gamma - shape3D(4)=l1xi*l3eta*l1gamma - shape3D(5)=l1xi*l1eta*l3gamma - shape3D(6)=l3xi*l1eta*l3gamma - shape3D(7)=l3xi*l3eta*l3gamma - shape3D(8)=l1xi*l3eta*l3gamma - - dershape3D(1,1)=l1pxi*l1eta*l1gamma - dershape3D(1,2)=l3pxi*l1eta*l1gamma - dershape3D(1,3)=l3pxi*l3eta*l1gamma - dershape3D(1,4)=l1pxi*l3eta*l1gamma - dershape3D(1,5)=l1pxi*l1eta*l3gamma - dershape3D(1,6)=l3pxi*l1eta*l3gamma - dershape3D(1,7)=l3pxi*l3eta*l3gamma - dershape3D(1,8)=l1pxi*l3eta*l3gamma - - dershape3D(2,1)=l1xi*l1peta*l1gamma - dershape3D(2,2)=l3xi*l1peta*l1gamma - dershape3D(2,3)=l3xi*l3peta*l1gamma - dershape3D(2,4)=l1xi*l3peta*l1gamma - dershape3D(2,5)=l1xi*l1peta*l3gamma - dershape3D(2,6)=l3xi*l1peta*l3gamma - dershape3D(2,7)=l3xi*l3peta*l3gamma - dershape3D(2,8)=l1xi*l3peta*l3gamma - - dershape3D(3,1)=l1xi*l1eta*l1pgamma - dershape3D(3,2)=l3xi*l1eta*l1pgamma - dershape3D(3,3)=l3xi*l3eta*l1pgamma - dershape3D(3,4)=l1xi*l3eta*l1pgamma - dershape3D(3,5)=l1xi*l1eta*l3pgamma - dershape3D(3,6)=l3xi*l1eta*l3pgamma - dershape3D(3,7)=l3xi*l3eta*l3pgamma - dershape3D(3,8)=l1xi*l3eta*l3pgamma - -! 
midside nodes - - shape3D(9)=l2xi*l1eta*l1gamma - shape3D(10)=l3xi*l2eta*l1gamma - shape3D(11)=l2xi*l3eta*l1gamma - shape3D(12)=l1xi*l2eta*l1gamma - shape3D(13)=l1xi*l1eta*l2gamma - shape3D(14)=l3xi*l1eta*l2gamma - shape3D(15)=l3xi*l3eta*l2gamma - shape3D(16)=l1xi*l3eta*l2gamma - shape3D(17)=l2xi*l1eta*l3gamma - shape3D(18)=l3xi*l2eta*l3gamma - shape3D(19)=l2xi*l3eta*l3gamma - shape3D(20)=l1xi*l2eta*l3gamma - - dershape3D(1,9)=l2pxi*l1eta*l1gamma - dershape3D(1,10)=l3pxi*l2eta*l1gamma - dershape3D(1,11)=l2pxi*l3eta*l1gamma - dershape3D(1,12)=l1pxi*l2eta*l1gamma - dershape3D(1,13)=l1pxi*l1eta*l2gamma - dershape3D(1,14)=l3pxi*l1eta*l2gamma - dershape3D(1,15)=l3pxi*l3eta*l2gamma - dershape3D(1,16)=l1pxi*l3eta*l2gamma - dershape3D(1,17)=l2pxi*l1eta*l3gamma - dershape3D(1,18)=l3pxi*l2eta*l3gamma - dershape3D(1,19)=l2pxi*l3eta*l3gamma - dershape3D(1,20)=l1pxi*l2eta*l3gamma - - dershape3D(2,9)=l2xi*l1peta*l1gamma - dershape3D(2,10)=l3xi*l2peta*l1gamma - dershape3D(2,11)=l2xi*l3peta*l1gamma - dershape3D(2,12)=l1xi*l2peta*l1gamma - dershape3D(2,13)=l1xi*l1peta*l2gamma - dershape3D(2,14)=l3xi*l1peta*l2gamma - dershape3D(2,15)=l3xi*l3peta*l2gamma - dershape3D(2,16)=l1xi*l3peta*l2gamma - dershape3D(2,17)=l2xi*l1peta*l3gamma - dershape3D(2,18)=l3xi*l2peta*l3gamma - dershape3D(2,19)=l2xi*l3peta*l3gamma - dershape3D(2,20)=l1xi*l2peta*l3gamma - - dershape3D(3,9)=l2xi*l1eta*l1pgamma - dershape3D(3,10)=l3xi*l2eta*l1pgamma - dershape3D(3,11)=l2xi*l3eta*l1pgamma - dershape3D(3,12)=l1xi*l2eta*l1pgamma - dershape3D(3,13)=l1xi*l1eta*l2pgamma - dershape3D(3,14)=l3xi*l1eta*l2pgamma - dershape3D(3,15)=l3xi*l3eta*l2pgamma - dershape3D(3,16)=l1xi*l3eta*l2pgamma - dershape3D(3,17)=l2xi*l1eta*l3pgamma - dershape3D(3,18)=l3xi*l2eta*l3pgamma - dershape3D(3,19)=l2xi*l3eta*l3pgamma - dershape3D(3,20)=l1xi*l2eta*l3pgamma - -! side center nodes - - shape3D(21)=l2xi*l2eta*l1gamma - shape3D(22)=l2xi*l1eta*l2gamma - shape3D(23)=l3xi*l2eta*l2gamma - shape3D(24)=l2xi*l3eta*l2gamma - shape3D(25)=l1xi*l2eta*l2gamma - shape3D(26)=l2xi*l2eta*l3gamma - - dershape3D(1,21)=l2pxi*l2eta*l1gamma - dershape3D(1,22)=l2pxi*l1eta*l2gamma - dershape3D(1,23)=l3pxi*l2eta*l2gamma - dershape3D(1,24)=l2pxi*l3eta*l2gamma - dershape3D(1,25)=l1pxi*l2eta*l2gamma - dershape3D(1,26)=l2pxi*l2eta*l3gamma - - dershape3D(2,21)=l2xi*l2peta*l1gamma - dershape3D(2,22)=l2xi*l1peta*l2gamma - dershape3D(2,23)=l3xi*l2peta*l2gamma - dershape3D(2,24)=l2xi*l3peta*l2gamma - dershape3D(2,25)=l1xi*l2peta*l2gamma - dershape3D(2,26)=l2xi*l2peta*l3gamma - - dershape3D(3,21)=l2xi*l2eta*l1pgamma - dershape3D(3,22)=l2xi*l1eta*l2pgamma - dershape3D(3,23)=l3xi*l2eta*l2pgamma - dershape3D(3,24)=l2xi*l3eta*l2pgamma - dershape3D(3,25)=l1xi*l2eta*l2pgamma - dershape3D(3,26)=l2xi*l2eta*l3pgamma - -! center node - - shape3D(27)=l2xi*l2eta*l2gamma - - dershape3D(1,27)=l2pxi*l2eta*l2gamma - dershape3D(2,27)=l2xi*l2peta*l2gamma - dershape3D(3,27)=l2xi*l2eta*l2pgamma - + ! note: put further setup for ngnod == 27 into subroutine + ! to avoid compilation errors in case ngnod == 8 + call recompute_jacobian_27(NGNOD,NDIM,xi,eta,gamma,shape3D,dershape3D) endif ! compute coordinates and jacobian matrix @@ -327,3 +165,196 @@ subroutine recompute_jacobian(xelm,yelm,zelm,xi,eta,gamma,x,y,z, & end subroutine recompute_jacobian +! +!------------------------------------------------------------------------------------------------- +! + + subroutine recompute_jacobian_27(ngnod,ndim,xi,eta,gamma,shape3D,dershape3D) + + implicit none + + integer :: ngnod,ndim + + double precision :: xi,eta,gamma + +! 
3D shape functions and their derivatives at receiver + double precision,dimension(ngnod) :: shape3D + double precision,dimension(ndim,ngnod) :: dershape3D + + ! local parameters + double precision l1xi,l2xi,l3xi + double precision l1eta,l2eta,l3eta + double precision l1gamma,l2gamma,l3gamma + double precision l1pxi,l2pxi,l3pxi + double precision l1peta,l2peta,l3peta + double precision l1pgamma,l2pgamma,l3pgamma + + double precision, parameter :: HALF = 0.5d0 + double precision, parameter :: ONE = 1.0d0 + double precision, parameter :: TWO = 2.0d0 + +! recompute jacobian for any (xi,eta,gamma) point, not necessarily a GLL point + + +! *** +! *** create the 3D shape functions and the Jacobian for a 27-node element +! *** + + l1xi=HALF*xi*(xi-ONE) + l2xi=ONE-xi**2 + l3xi=HALF*xi*(xi+ONE) + + l1pxi=xi-HALF + l2pxi=-TWO*xi + l3pxi=xi+HALF + + l1eta=HALF*eta*(eta-ONE) + l2eta=ONE-eta**2 + l3eta=HALF*eta*(eta+ONE) + + l1peta=eta-HALF + l2peta=-TWO*eta + l3peta=eta+HALF + + l1gamma=HALF*gamma*(gamma-ONE) + l2gamma=ONE-gamma**2 + l3gamma=HALF*gamma*(gamma+ONE) + + l1pgamma=gamma-HALF + l2pgamma=-TWO*gamma + l3pgamma=gamma+HALF + +! corner nodes + + shape3D(1)=l1xi*l1eta*l1gamma + shape3D(2)=l3xi*l1eta*l1gamma + shape3D(3)=l3xi*l3eta*l1gamma + shape3D(4)=l1xi*l3eta*l1gamma + shape3D(5)=l1xi*l1eta*l3gamma + shape3D(6)=l3xi*l1eta*l3gamma + shape3D(7)=l3xi*l3eta*l3gamma + shape3D(8)=l1xi*l3eta*l3gamma + + dershape3D(1,1)=l1pxi*l1eta*l1gamma + dershape3D(1,2)=l3pxi*l1eta*l1gamma + dershape3D(1,3)=l3pxi*l3eta*l1gamma + dershape3D(1,4)=l1pxi*l3eta*l1gamma + dershape3D(1,5)=l1pxi*l1eta*l3gamma + dershape3D(1,6)=l3pxi*l1eta*l3gamma + dershape3D(1,7)=l3pxi*l3eta*l3gamma + dershape3D(1,8)=l1pxi*l3eta*l3gamma + + dershape3D(2,1)=l1xi*l1peta*l1gamma + dershape3D(2,2)=l3xi*l1peta*l1gamma + dershape3D(2,3)=l3xi*l3peta*l1gamma + dershape3D(2,4)=l1xi*l3peta*l1gamma + dershape3D(2,5)=l1xi*l1peta*l3gamma + dershape3D(2,6)=l3xi*l1peta*l3gamma + dershape3D(2,7)=l3xi*l3peta*l3gamma + dershape3D(2,8)=l1xi*l3peta*l3gamma + + dershape3D(3,1)=l1xi*l1eta*l1pgamma + dershape3D(3,2)=l3xi*l1eta*l1pgamma + dershape3D(3,3)=l3xi*l3eta*l1pgamma + dershape3D(3,4)=l1xi*l3eta*l1pgamma + dershape3D(3,5)=l1xi*l1eta*l3pgamma + dershape3D(3,6)=l3xi*l1eta*l3pgamma + dershape3D(3,7)=l3xi*l3eta*l3pgamma + dershape3D(3,8)=l1xi*l3eta*l3pgamma + +! 
midside nodes + + shape3D(9)=l2xi*l1eta*l1gamma + shape3D(10)=l3xi*l2eta*l1gamma + shape3D(11)=l2xi*l3eta*l1gamma + shape3D(12)=l1xi*l2eta*l1gamma + shape3D(13)=l1xi*l1eta*l2gamma + shape3D(14)=l3xi*l1eta*l2gamma + shape3D(15)=l3xi*l3eta*l2gamma + shape3D(16)=l1xi*l3eta*l2gamma + shape3D(17)=l2xi*l1eta*l3gamma + shape3D(18)=l3xi*l2eta*l3gamma + shape3D(19)=l2xi*l3eta*l3gamma + shape3D(20)=l1xi*l2eta*l3gamma + + dershape3D(1,9)=l2pxi*l1eta*l1gamma + dershape3D(1,10)=l3pxi*l2eta*l1gamma + dershape3D(1,11)=l2pxi*l3eta*l1gamma + dershape3D(1,12)=l1pxi*l2eta*l1gamma + dershape3D(1,13)=l1pxi*l1eta*l2gamma + dershape3D(1,14)=l3pxi*l1eta*l2gamma + dershape3D(1,15)=l3pxi*l3eta*l2gamma + dershape3D(1,16)=l1pxi*l3eta*l2gamma + dershape3D(1,17)=l2pxi*l1eta*l3gamma + dershape3D(1,18)=l3pxi*l2eta*l3gamma + dershape3D(1,19)=l2pxi*l3eta*l3gamma + dershape3D(1,20)=l1pxi*l2eta*l3gamma + + dershape3D(2,9)=l2xi*l1peta*l1gamma + dershape3D(2,10)=l3xi*l2peta*l1gamma + dershape3D(2,11)=l2xi*l3peta*l1gamma + dershape3D(2,12)=l1xi*l2peta*l1gamma + dershape3D(2,13)=l1xi*l1peta*l2gamma + dershape3D(2,14)=l3xi*l1peta*l2gamma + dershape3D(2,15)=l3xi*l3peta*l2gamma + dershape3D(2,16)=l1xi*l3peta*l2gamma + dershape3D(2,17)=l2xi*l1peta*l3gamma + dershape3D(2,18)=l3xi*l2peta*l3gamma + dershape3D(2,19)=l2xi*l3peta*l3gamma + dershape3D(2,20)=l1xi*l2peta*l3gamma + + dershape3D(3,9)=l2xi*l1eta*l1pgamma + dershape3D(3,10)=l3xi*l2eta*l1pgamma + dershape3D(3,11)=l2xi*l3eta*l1pgamma + dershape3D(3,12)=l1xi*l2eta*l1pgamma + dershape3D(3,13)=l1xi*l1eta*l2pgamma + dershape3D(3,14)=l3xi*l1eta*l2pgamma + dershape3D(3,15)=l3xi*l3eta*l2pgamma + dershape3D(3,16)=l1xi*l3eta*l2pgamma + dershape3D(3,17)=l2xi*l1eta*l3pgamma + dershape3D(3,18)=l3xi*l2eta*l3pgamma + dershape3D(3,19)=l2xi*l3eta*l3pgamma + dershape3D(3,20)=l1xi*l2eta*l3pgamma + +! side center nodes + + shape3D(21)=l2xi*l2eta*l1gamma + shape3D(22)=l2xi*l1eta*l2gamma + shape3D(23)=l3xi*l2eta*l2gamma + shape3D(24)=l2xi*l3eta*l2gamma + shape3D(25)=l1xi*l2eta*l2gamma + shape3D(26)=l2xi*l2eta*l3gamma + + dershape3D(1,21)=l2pxi*l2eta*l1gamma + dershape3D(1,22)=l2pxi*l1eta*l2gamma + dershape3D(1,23)=l3pxi*l2eta*l2gamma + dershape3D(1,24)=l2pxi*l3eta*l2gamma + dershape3D(1,25)=l1pxi*l2eta*l2gamma + dershape3D(1,26)=l2pxi*l2eta*l3gamma + + dershape3D(2,21)=l2xi*l2peta*l1gamma + dershape3D(2,22)=l2xi*l1peta*l2gamma + dershape3D(2,23)=l3xi*l2peta*l2gamma + dershape3D(2,24)=l2xi*l3peta*l2gamma + dershape3D(2,25)=l1xi*l2peta*l2gamma + dershape3D(2,26)=l2xi*l2peta*l3gamma + + dershape3D(3,21)=l2xi*l2eta*l1pgamma + dershape3D(3,22)=l2xi*l1eta*l2pgamma + dershape3D(3,23)=l3xi*l2eta*l2pgamma + dershape3D(3,24)=l2xi*l3eta*l2pgamma + dershape3D(3,25)=l1xi*l2eta*l2pgamma + dershape3D(3,26)=l2xi*l2eta*l3pgamma + +! center node + + shape3D(27)=l2xi*l2eta*l2gamma + + dershape3D(1,27)=l2pxi*l2eta*l2gamma + dershape3D(2,27)=l2xi*l2peta*l2gamma + dershape3D(3,27)=l2xi*l2eta*l2pgamma + + + end subroutine recompute_jacobian_27 + + diff --git a/src/specfem3D/PML_init.f90 b/src/specfem3D/PML_init.f90 index af675c089..f7d628546 100644 --- a/src/specfem3D/PML_init.f90 +++ b/src/specfem3D/PML_init.f90 @@ -127,7 +127,6 @@ end subroutine PML_damping_profile_l subroutine PML_initialize() use specfem_par,only: NGLOB_AB,NSPEC_AB,myrank, & - ibool,xstore,ystore,zstore,& model_speed_max,hdur use PML_par use PML_par_acoustic @@ -303,9 +302,9 @@ subroutine PML_set_firstlayer() ! 
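As a quick sanity check in the spirit of the 2D test in get_shape2D, the 27-node shape functions returned by recompute_jacobian_27 should sum to one and their derivatives to zero at any (xi,eta,gamma); the sketch below assumes that routine is linked in and uses an arbitrary test point.

  program check_shape3D_27
  implicit none
  integer, parameter :: NGNOD = 27, NDIM = 3
  double precision :: shape3D(NGNOD), dershape3D(NDIM,NGNOD)
  double precision :: xi,eta,gamma
  double precision, parameter :: TINYVAL = 1.d-9
  ! arbitrary point inside the reference element, not necessarily a GLL point
  xi = 0.3d0; eta = -0.7d0; gamma = 0.1d0
  call recompute_jacobian_27(NGNOD,NDIM,xi,eta,gamma,shape3D,dershape3D)
  ! partition of unity: sum of shape functions equals one
  if( abs(sum(shape3D) - 1.d0) > TINYVAL ) stop 'error in 3D shape functions'
  ! derivatives of a constant: each directional sum vanishes
  if( any( abs(sum(dershape3D,dim=2)) > TINYVAL ) ) stop 'error in 3D shape function derivatives'
  end program check_shape3D_27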
sets ispec occurrences for first element layer in PML region based on absorbing boundary elements use PML_par - use specfem_par,only: NSPEC_AB,NGLOB_AB, & + use specfem_par,only: NSPEC_AB, & abs_boundary_ispec,abs_boundary_normal,num_abs_boundary_faces,& - abs_boundary_ijk,ibool,myrank + abs_boundary_ijk,ibool use constants,only: NDIM,TINYVAL,NGNOD,NGLLX,NGLLY,NGLLZ,NGLLSQUARE implicit none ! local parameters @@ -479,14 +478,13 @@ subroutine PML_determine_interfacePoints() ! finds global interface points of PML region to "regular" domain - use specfem_par,only: ibool,myrank,NGLOB_AB,NSPEC_AB, & + use specfem_par,only: ibool,NGLOB_AB,NSPEC_AB, & num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & my_neighbours_ext_mesh,NPROC use PML_par use PML_par_acoustic use constants,only: NGLLX,NGLLY,NGLLZ - use specfem_par_acoustic,only: ispec_is_acoustic implicit none ! local parameters @@ -848,7 +846,6 @@ subroutine PML_add_layer() use PML_par use specfem_par,only: NSPEC_AB,NGLOB_AB, & - abs_boundary_ispec,num_abs_boundary_faces,& ibool,myrank,& num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & diff --git a/src/specfem3D/compute_add_sources_acoustic.f90 b/src/specfem3D/compute_add_sources_acoustic.f90 index b68cdb002..5e0f55eef 100644 --- a/src/specfem3D/compute_add_sources_acoustic.f90 +++ b/src/specfem3D/compute_add_sources_acoustic.f90 @@ -139,8 +139,7 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou ! write(*,*) "fortran dt = ", dt ! change dt -> DT call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, & - NSOURCES, SIMULATION_TYPE, & - stf_pre_compute, myrank) + NSOURCES,stf_pre_compute, myrank) endif else ! .NOT. GPU_MODE @@ -435,8 +434,7 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou ! only implements SIMTYPE=3 call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, & - NSOURCES, SIMULATION_TYPE, & - stf_pre_compute, myrank) + NSOURCES,stf_pre_compute, myrank) endif else ! .NOT. 
GPU_MODE diff --git a/src/specfem3D/compute_add_sources_elastic.f90 b/src/specfem3D/compute_add_sources_elastic.f90 index 73d9435e9..63ed10d0a 100644 --- a/src/specfem3D/compute_add_sources_elastic.f90 +++ b/src/specfem3D/compute_add_sources_elastic.f90 @@ -39,7 +39,6 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, & xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,& station_name,network_name,adj_source_file, & - LOCAL_PATH,wgllwgll_xy, & num_free_surface_faces,free_surface_ispec, & free_surface_ijk,free_surface_jacobian2Dw, & noise_sourcearray,irec_master_noise, & @@ -48,10 +47,6 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & nrec_local,number_receiver_global, & nsources_local - use specfem_par_movie,only: & - store_val_ux_external_mesh,store_val_uy_external_mesh, & - store_val_uz_external_mesh - implicit none include "constants.h" diff --git a/src/specfem3D/compute_add_sources_poroelastic.f90 b/src/specfem3D/compute_add_sources_poroelastic.f90 index 8d93ef600..e13166243 100644 --- a/src/specfem3D/compute_add_sources_poroelastic.f90 +++ b/src/specfem3D/compute_add_sources_poroelastic.f90 @@ -41,7 +41,7 @@ subroutine compute_add_sources_poroelastic( NSPEC_AB,NGLOB_AB, & use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, & xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,& - station_name,network_name,adj_source_file,nrec_local,number_receiver_global + station_name,network_name,adj_source_file implicit none include "constants.h" diff --git a/src/specfem3D/compute_coupling_elastic_ac.f90 b/src/specfem3D/compute_coupling_elastic_ac.f90 index 43c233a7e..f391bb022 100644 --- a/src/specfem3D/compute_coupling_elastic_ac.f90 +++ b/src/specfem3D/compute_coupling_elastic_ac.f90 @@ -120,3 +120,110 @@ subroutine compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, & end subroutine compute_coupling_elastic_ac +! +!------------------------------------------------------------------------------------------------- +! + + subroutine compute_coupling_ocean(NSPEC_AB,NGLOB_AB, & + ibool,rmassx,rmassy,rmassz, & + rmass_ocean_load,accel, & + free_surface_normal,free_surface_ijk,free_surface_ispec, & + num_free_surface_faces,SIMULATION_TYPE, & + NGLOB_ADJOINT,b_accel) + +! updates acceleration with ocean load term: +! approximates ocean-bottom continuity of pressure & displacement for longer period waves (> ~20s ), +! assuming incompressible fluid column above bathymetry ocean bottom + + implicit none + + include 'constants.h' + + integer :: NSPEC_AB,NGLOB_AB + + real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_AB),intent(inout) :: accel + real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmassx,rmassy,rmassz + real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmass_ocean_load + + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB),intent(in) :: ibool + + ! free surface + integer :: num_free_surface_faces + real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces) + integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces) + integer :: free_surface_ispec(num_free_surface_faces) + + ! adjoint simulations + integer :: SIMULATION_TYPE,NGLOB_ADJOINT + real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel + +! local parameters + real(kind=CUSTOM_REAL) :: nx,ny,nz + real(kind=CUSTOM_REAL) :: force_normal_comp + integer :: i,j,k,ispec,iglob + integer :: igll,iface + logical,dimension(NGLOB_AB) :: updated_dof_ocean_load + ! 
adjoint locals + real(kind=CUSTOM_REAL) :: b_force_normal_comp + + ! initialize the updates + updated_dof_ocean_load(:) = .false. + + ! for surface elements exactly at the top of the model (ocean bottom) + do iface = 1,num_free_surface_faces + + ispec = free_surface_ispec(iface) + do igll = 1, NGLLSQUARE + i = free_surface_ijk(1,igll,iface) + j = free_surface_ijk(2,igll,iface) + k = free_surface_ijk(3,igll,iface) + + ! get global point number + iglob = ibool(i,j,k,ispec) + + ! only update once + if(.not. updated_dof_ocean_load(iglob)) then + + ! get normal + nx = free_surface_normal(1,igll,iface) + ny = free_surface_normal(2,igll,iface) + nz = free_surface_normal(3,igll,iface) + + ! make updated component of right-hand side + ! we divide by rmass() which is 1 / M + ! we use the total force which includes the Coriolis term above + force_normal_comp = accel(1,iglob)*nx / rmassx(iglob) & + + accel(2,iglob)*ny / rmassy(iglob) & + + accel(3,iglob)*nz / rmassz(iglob) + + accel(1,iglob) = accel(1,iglob) & + + (rmass_ocean_load(iglob) - rmassx(iglob)) * force_normal_comp * nx + accel(2,iglob) = accel(2,iglob) & + + (rmass_ocean_load(iglob) - rmassy(iglob)) * force_normal_comp * ny + accel(3,iglob) = accel(3,iglob) & + + (rmass_ocean_load(iglob) - rmassz(iglob)) * force_normal_comp * nz + + ! adjoint simulations + if (SIMULATION_TYPE == 3) then + b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) & + + b_accel(2,iglob)*ny / rmassy(iglob) & + + b_accel(3,iglob)*nz / rmassz(iglob) + + b_accel(1,iglob) = b_accel(1,iglob) & + + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx + b_accel(2,iglob) = b_accel(2,iglob) & + + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny + b_accel(3,iglob) = b_accel(3,iglob) & + + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz + endif !adjoint + + ! done with this point + updated_dof_ocean_load(iglob) = .true. + + endif + + enddo ! igll + enddo ! iface + + end subroutine compute_coupling_ocean + diff --git a/src/specfem3D/compute_forces_acoustic.f90 b/src/specfem3D/compute_forces_acoustic.f90 index e43ca8d00..e5cda1850 100644 --- a/src/specfem3D/compute_forces_acoustic.f90 +++ b/src/specfem3D/compute_forces_acoustic.f90 @@ -81,7 +81,7 @@ subroutine compute_forces_acoustic() num_free_surface_faces,ispec_is_acoustic) else ! on GPU - call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE) + call acoustic_enforce_free_surf_cuda(Mesh_pointer,ABSORB_FREE_SURFACE) endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then @@ -148,8 +148,7 @@ subroutine compute_forces_acoustic() ! on GPU ! includes code for SIMULATION_TYPE==3 call compute_forces_acoustic_cuda(Mesh_pointer, iphase, & - nspec_outer_acoustic, nspec_inner_acoustic, & - SIMULATION_TYPE) + nspec_outer_acoustic, nspec_inner_acoustic) endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then @@ -253,7 +252,7 @@ subroutine compute_forces_acoustic() else ! on GPU call compute_coupling_ac_el_cuda(Mesh_pointer,phase_is_inner, & - num_coupling_ac_el_faces,SIMULATION_TYPE) + num_coupling_ac_el_faces) endif ! GPU_MODE endif endif @@ -401,7 +400,7 @@ subroutine compute_forces_acoustic() b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:) else ! 
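For a single free-surface point, the ocean load correction applied above reduces to the scalar update sketched below; all arguments are scalars, names are illustrative, and the mass matrix entries are passed in already inverted (1/M), as they are stored in the solver.

  subroutine ocean_load_point(nx,ny,nz,rmassx_i,rmassy_i,rmassz_i,rmass_ocean_i,ax,ay,az)
  implicit none
  double precision, intent(in)    :: nx,ny,nz                    ! free-surface normal
  double precision, intent(in)    :: rmassx_i,rmassy_i,rmassz_i  ! inverted mass matrix entries 1/M
  double precision, intent(in)    :: rmass_ocean_i               ! inverted mass entry including ocean load
  double precision, intent(inout) :: ax,ay,az                    ! acceleration components at the point
  double precision :: force_normal_comp
  ! normal component of the total force (accel has already been multiplied by 1/M)
  force_normal_comp = ax*nx/rmassx_i + ay*ny/rmassy_i + az*nz/rmassz_i
  ! replace the normal part of the response by the ocean-loaded one
  ax = ax + (rmass_ocean_i - rmassx_i) * force_normal_comp * nx
  ay = ay + (rmass_ocean_i - rmassy_i) * force_normal_comp * ny
  az = az + (rmass_ocean_i - rmassz_i) * force_normal_comp * nz
  end subroutine ocean_load_point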
on GPU - call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB,SIMULATION_TYPE) + call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB) endif @@ -448,7 +447,7 @@ subroutine compute_forces_acoustic() b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:) else ! on GPU - call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,SIMULATION_TYPE,b_deltatover2) + call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,b_deltatover2) endif ! updates potential_dot_acoustic and potential_dot_dot_acoustic inside PML region for plotting seismograms/movies @@ -497,7 +496,7 @@ subroutine compute_forces_acoustic() num_free_surface_faces,ispec_is_acoustic) else ! on GPU - call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE) + call acoustic_enforce_free_surf_cuda(Mesh_pointer,ABSORB_FREE_SURFACE) endif diff --git a/src/specfem3D/compute_forces_elastic.F90 b/src/specfem3D/compute_forces_elastic.F90 index 53ced024c..6f00373fb 100644 --- a/src/specfem3D/compute_forces_elastic.F90 +++ b/src/specfem3D/compute_forces_elastic.F90 @@ -124,7 +124,6 @@ subroutine compute_forces_elastic() call compute_forces_elastic_cuda(Mesh_pointer, iphase, & nspec_outer_elastic, & nspec_inner_elastic, & - SIMULATION_TYPE, & COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY) if(phase_is_inner .eqv. .true.) then @@ -203,7 +202,7 @@ subroutine compute_forces_elastic() else ! on GPU call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, & - num_coupling_ac_el_faces,SIMULATION_TYPE) + num_coupling_ac_el_faces) endif ! GPU_MODE endif ! num_coupling_ac_el_faces endif @@ -329,30 +328,31 @@ subroutine compute_forces_elastic() ! multiplies with inverse of mass matrix (note: rmass has been inverted already) if(.NOT. GPU_MODE) then - accel(1,:) = accel(1,:)*rmass(:) - accel(2,:) = accel(2,:)*rmass(:) - accel(3,:) = accel(3,:)*rmass(:) + accel(1,:) = accel(1,:)*rmassx(:) + accel(2,:) = accel(2,:)*rmassy(:) + accel(3,:) = accel(3,:)*rmassz(:) ! adjoint simulations if (SIMULATION_TYPE == 3) then - b_accel(1,:) = b_accel(1,:)*rmass(:) - b_accel(2,:) = b_accel(2,:)*rmass(:) - b_accel(3,:) = b_accel(3,:)*rmass(:) + b_accel(1,:) = b_accel(1,:)*rmassx(:) + b_accel(2,:) = b_accel(2,:)*rmassy(:) + b_accel(3,:) = b_accel(3,:)*rmassz(:) endif !adjoint else ! GPU_MODE == 1 - call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS) + call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2,OCEANS) endif ! updates acceleration with ocean load term if(OCEANS) then if( .NOT. GPU_MODE ) then - call elastic_ocean_load(NSPEC_AB,NGLOB_AB, & - ibool,rmass,rmass_ocean_load,accel, & - free_surface_normal,free_surface_ijk,free_surface_ispec, & - num_free_surface_faces,SIMULATION_TYPE, & - NGLOB_ADJOINT,b_accel) + call compute_coupling_ocean(NSPEC_AB,NGLOB_AB, & + ibool,rmassx,rmassy,rmassz, & + rmass_ocean_load,accel, & + free_surface_normal,free_surface_ijk,free_surface_ispec, & + num_free_surface_faces,SIMULATION_TYPE, & + NGLOB_ADJOINT,b_accel) else ! on GPU - call elastic_ocean_load_cuda(Mesh_pointer,SIMULATION_TYPE) + call compute_coupling_ocean_cuda(Mesh_pointer) endif endif @@ -378,115 +378,13 @@ subroutine compute_forces_elastic() ! adjoint simulations if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:) else ! 
GPU_MODE == 1 - if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2) + if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2) endif end subroutine compute_forces_elastic -! -!------------------------------------------------------------------------------------------------- -! - -subroutine elastic_ocean_load(NSPEC_AB,NGLOB_AB, & - ibool,rmass,rmass_ocean_load,accel, & - free_surface_normal,free_surface_ijk,free_surface_ispec, & - num_free_surface_faces,SIMULATION_TYPE, & - NGLOB_ADJOINT,b_accel) - -! updates acceleration with ocean load term: -! approximates ocean-bottom continuity of pressure & displacement for longer period waves (> ~20s ), -! assuming incompressible fluid column above bathymetry ocean bottom - - implicit none - - include 'constants.h' - - integer :: NSPEC_AB,NGLOB_AB - - real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_AB),intent(inout) :: accel - real(kind=CUSTOM_REAL),dimension(NGLOB_AB),intent(in) :: rmass,rmass_ocean_load - - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB),intent(in) :: ibool - - ! free surface - integer :: num_free_surface_faces - real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces) - integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces) - integer :: free_surface_ispec(num_free_surface_faces) - - ! adjoint simulations - integer :: SIMULATION_TYPE,NGLOB_ADJOINT - real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel - -! local parameters - real(kind=CUSTOM_REAL) :: nx,ny,nz - real(kind=CUSTOM_REAL) :: additional_term,force_normal_comp - integer :: i,j,k,ispec,iglob - integer :: igll,iface - logical,dimension(NGLOB_AB) :: updated_dof_ocean_load - ! adjoint locals - real(kind=CUSTOM_REAL) :: b_additional_term,b_force_normal_comp - - ! initialize the updates - updated_dof_ocean_load(:) = .false. - - ! for surface elements exactly at the top of the model (ocean bottom) - do iface = 1,num_free_surface_faces - - ispec = free_surface_ispec(iface) - do igll = 1, NGLLSQUARE - i = free_surface_ijk(1,igll,iface) - j = free_surface_ijk(2,igll,iface) - k = free_surface_ijk(3,igll,iface) - - ! get global point number - iglob = ibool(i,j,k,ispec) - - ! only update once - if(.not. updated_dof_ocean_load(iglob)) then - - ! get normal - nx = free_surface_normal(1,igll,iface) - ny = free_surface_normal(2,igll,iface) - nz = free_surface_normal(3,igll,iface) - - ! make updated component of right-hand side - ! we divide by rmass() which is 1 / M - ! we use the total force which includes the Coriolis term above - force_normal_comp = ( accel(1,iglob)*nx + & - accel(2,iglob)*ny + & - accel(3,iglob)*nz ) / rmass(iglob) - - additional_term = (rmass_ocean_load(iglob) - rmass(iglob)) * force_normal_comp - - accel(1,iglob) = accel(1,iglob) + additional_term * nx - accel(2,iglob) = accel(2,iglob) + additional_term * ny - accel(3,iglob) = accel(3,iglob) + additional_term * nz - - ! adjoint simulations - if (SIMULATION_TYPE == 3) then - b_force_normal_comp = ( b_accel(1,iglob)*nx + & - b_accel(2,iglob)*ny + & - b_accel(3,iglob)*nz) / rmass(iglob) - b_additional_term = (rmass_ocean_load(iglob) - rmass(iglob)) * b_force_normal_comp - - b_accel(1,iglob) = b_accel(1,iglob) + b_additional_term * nx - b_accel(2,iglob) = b_accel(2,iglob) + b_additional_term * ny - b_accel(3,iglob) = b_accel(3,iglob) + b_additional_term * nz - endif !adjoint - - ! done with this point - updated_dof_ocean_load(iglob) = .true. - - endif - - enddo ! igll - enddo ! 
iface - -end subroutine elastic_ocean_load - ! !------------------------------------------------------------------------------------------------- ! diff --git a/src/specfem3D/compute_stacey_acoustic.f90 b/src/specfem3D/compute_stacey_acoustic.f90 index 60aa2ab30..63f44dd1c 100644 --- a/src/specfem3D/compute_stacey_acoustic.f90 +++ b/src/specfem3D/compute_stacey_acoustic.f90 @@ -147,7 +147,7 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, & ! GPU_MODE == .true. if( num_abs_boundary_faces > 0 ) & call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, & - SIMULATION_TYPE,SAVE_FORWARD,b_absorb_potential) + SAVE_FORWARD,b_absorb_potential) endif diff --git a/src/specfem3D/compute_stacey_elastic.f90 b/src/specfem3D/compute_stacey_elastic.f90 index 3a7c1d853..826e83352 100644 --- a/src/specfem3D/compute_stacey_elastic.f90 +++ b/src/specfem3D/compute_stacey_elastic.f90 @@ -166,7 +166,7 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, & ! GPU_MODE == .true. if( num_abs_boundary_faces > 0 ) & call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, & - SIMULATION_TYPE,SAVE_FORWARD,b_absorb_field) + SAVE_FORWARD,b_absorb_field) endif ! adjoint simulations: stores absorbed wavefield part diff --git a/src/specfem3D/create_color_image.f90 b/src/specfem3D/create_color_image.f90 index c4edac58a..961eeb2ad 100644 --- a/src/specfem3D/create_color_image.f90 +++ b/src/specfem3D/create_color_image.f90 @@ -103,7 +103,7 @@ subroutine write_PNM_GIF_initialize() use specfem_par,only: NGLOB_AB,NSPEC_AB,ibool,xstore,ystore,zstore,& num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,my_neighbours_ext_mesh, & - ibool_interfaces_ext_mesh,prname + ibool_interfaces_ext_mesh !,prname use constants,only: HUGEVAL,NGLLX,NGLLY,NGLLZ implicit none ! local parameters @@ -816,7 +816,7 @@ end subroutine write_PNM_GIF_data subroutine get_iglob_vp(iglob,ispec,vp) use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,FOUR_THIRDS - use specfem_par,only: mustore,kappastore,ibool,myrank,NSPEC_AB + use specfem_par,only: mustore,kappastore,ibool,myrank use specfem_par_acoustic,only: ACOUSTIC_SIMULATION,rhostore use specfem_par_elastic,only: ELASTIC_SIMULATION,rho_vp implicit none @@ -857,7 +857,7 @@ subroutine get_iglob_veloc(iglob,ispec,val_vector) rhostore,ispec_is_acoustic, & b_potential_acoustic,b_potential_dot_acoustic use specfem_par_elastic,only: ELASTIC_SIMULATION,displ,veloc, & - ispec_is_elastic,b_displ,b_veloc + ispec_is_elastic ! ,b_displ,b_veloc use specfem_par,only: NSPEC_AB,NGLOB_AB,hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & ibool,SIMULATION_TYPE,GRAVITY diff --git a/src/specfem3D/finalize_simulation.f90 b/src/specfem3D/finalize_simulation.f90 index f18adfef9..e4499d11e 100644 --- a/src/specfem3D/finalize_simulation.f90 +++ b/src/specfem3D/finalize_simulation.f90 @@ -140,6 +140,18 @@ subroutine finalize_simulation() endif endif + ! frees dynamically allocated memory + + ! mass matrices + if( ELASTIC_SIMULATION ) then + deallocate(rmassx) + deallocate(rmassy) + deallocate(rmassz) + endif + if( ACOUSTIC_SIMULATION ) then + deallocate(rmass_acoustic) + endif + ! 
   if(myrank == 0) then
     write(IMAIN,*)
diff --git a/src/specfem3D/initialize_simulation.f90 b/src/specfem3D/initialize_simulation.f90
index d5be225f4..b2a6d8483 100644
--- a/src/specfem3D/initialize_simulation.f90
+++ b/src/specfem3D/initialize_simulation.f90
@@ -116,6 +116,8 @@ subroutine initialize_simulation()
       write(IMAIN,'(a)',advance='yes') ' external'
     case( IMODEL_IPATI )
       write(IMAIN,'(a)',advance='yes') ' ipati'
+    case( IMODEL_IPATI_WATER )
+      write(IMAIN,'(a)',advance='yes') ' ipati_water'
     end select
     write(IMAIN,*)
diff --git a/src/specfem3D/iterate_time.f90 b/src/specfem3D/iterate_time.f90
index 9db2ab79e..12e61cfe4 100644
--- a/src/specfem3D/iterate_time.f90
+++ b/src/specfem3D/iterate_time.f90
@@ -416,8 +416,8 @@ subroutine it_update_displacement_scheme()
     else
       ! on GPU
       call it_update_displacement_ac_cuda(Mesh_pointer, NGLOB_AB, &
-                                          deltat, deltatsqover2, deltatover2, &
-                                          SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2)
+                                          deltat, deltatsqover2, deltatover2, &
+                                          b_deltat, b_deltatsqover2, b_deltatover2)
     endif

   ! time marching potentials
@@ -457,7 +457,7 @@ subroutine it_update_displacement_scheme()
       ! on GPU
       ! Includes SIM_TYPE 1 & 3 (for noise tomography)
       call it_update_displacement_cuda(Mesh_pointer, size(displ), deltat, deltatsqover2,&
-                                       deltatover2, SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2)
+                                       deltatover2, b_deltat, b_deltatsqover2, b_deltatover2)
     endif
   endif

@@ -821,7 +821,7 @@ subroutine it_transfer_from_GPU()

   ! frees allocated memory on GPU
   call prepare_cleanup_device(Mesh_pointer, &
-                              SIMULATION_TYPE,SAVE_FORWARD, &
+                              SAVE_FORWARD, &
                               ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
                               ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
                               ATTENUATION,ANISOTROPY,OCEANS, &
diff --git a/src/specfem3D/model_update.f90 b/src/specfem3D/model_update.f90
index ba7e78771..0723f27cd 100644
--- a/src/specfem3D/model_update.f90
+++ b/src/specfem3D/model_update.f90
@@ -827,47 +827,47 @@ program model_update

   rmass_new(:) = 0._CUSTOM_REAL

+  ! note: this does not update the absorbing boundary contributions to the mass matrix
+
+  ! elastic mass matrix
   do ispec=1,nspec
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-
-          weight = wxgll(i)*wygll(j)*wzgll(k)
-          jacobianl = jacobianstore(i,j,k,ispec)
-
-          if( myrank == 0) then
-            print*, 'weight', weight
-            print*, 'jacobianl', jacobianl
-          endif
+    if( ispec_is_elastic(ispec) ) then
+      do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+            iglob = ibool(i,j,k,ispec)
+
+            weight = wxgll(i)*wygll(j)*wzgll(k)
+            jacobianl = jacobianstore(i,j,k,ispec)
+
+            if( myrank == 0) then
+              print*, 'weight', weight
+              print*, 'jacobianl', jacobianl
+            endif

-! elastic mass matrix
-          if( ispec_is_elastic(ispec) ) then
             if(CUSTOM_REAL == SIZE_REAL) then
               rmass_new(iglob) = rmass_new(iglob) + &
-                      sngl( dble(jacobianl) * weight * dble(rhostore_new(i,j,k,ispec)) )
+                      sngl( dble(jacobianl) * weight * dble(rhostore_new(i,j,k,ispec)) )
             else
-              rmass_new(iglob) = rmass_new(iglob) + &
-                      jacobianl * weight * rhostore_new(i,j,k,ispec)
+              rmass_new(iglob) = rmass_new(iglob) + &
+                      jacobianl * weight * rhostore_new(i,j,k,ispec)
             endif
-          endif
+
           enddo
         enddo
       enddo
-  enddo
+    endif
   enddo ! nspec

   call sync_all()
-
+
   ! dummy allocations, arrays are not needed since the update here only works for elastic models
   allocate(rmass_acoustic_new(NGLOB))
   allocate(rmass_solid_poroelastic_new(NGLOB))
   allocate(rmass_fluid_poroelastic_new(NGLOB))
   rmass_acoustic_new = 0._CUSTOM_REAL
   rmass_solid_poroelastic_new = 0._CUSTOM_REAL
   rmass_fluid_poroelastic_new = 0._CUSTOM_REAL
-
-
+
 !-------- attenuation -------
 ! read the proc*attenuation.vtk for the old model in LOCAL_PATH and store qmu_attenuation_store
@@ -979,8 +979,6 @@ program model_update
                         abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
                         free_surface_normal,free_surface_jacobian2Dw, &
                         free_surface_ijk,free_surface_ispec,num_free_surface_faces, &
-                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                        coupling_ac_el_ijk,coupling_ac_el_ispec,num_coupling_ac_el_faces, &
                         num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                         max_nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         prname_new,SAVE_MESH_FILES,ANISOTROPY,NSPEC_ANISO, &
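The rebuilt elastic mass matrix above is a standard diagonal GLL-quadrature assembly: each global point accumulates w_i*w_j*w_k*|J|*rho over the elements that touch it. A compact, self-contained sketch of that step follows; the argument names mirror the variables used in the hunk above but the interface itself (assemble_mass, is_elastic) is hypothetical and the kinds are simplified.

  ! Hedged sketch of the diagonal mass-matrix assembly performed in model_update (illustrative only).
  subroutine assemble_mass(nspec,nglob,ngllx,nglly,ngllz,ibool,wxgll,wygll,wzgll, &
                           jacobianstore,rhostore_new,is_elastic,rmass_new)
    implicit none
    integer, intent(in) :: nspec,nglob,ngllx,nglly,ngllz
    integer, intent(in) :: ibool(ngllx,nglly,ngllz,nspec)
    double precision, intent(in) :: wxgll(ngllx),wygll(nglly),wzgll(ngllz)
    real, intent(in) :: jacobianstore(ngllx,nglly,ngllz,nspec)
    real, intent(in) :: rhostore_new(ngllx,nglly,ngllz,nspec)
    logical, intent(in) :: is_elastic(nspec)
    real, intent(out) :: rmass_new(nglob)
    integer :: ispec,i,j,k,iglob

    rmass_new(:) = 0.0
    do ispec = 1,nspec
      if( .not. is_elastic(ispec) ) cycle   ! only elastic elements contribute here
      do k = 1,ngllz
        do j = 1,nglly
          do i = 1,ngllx
            iglob = ibool(i,j,k,ispec)
            ! M(iglob) = sum over elements and GLL points of w_i w_j w_k |J| rho
            rmass_new(iglob) = rmass_new(iglob) + &
              real( wxgll(i)*wygll(j)*wzgll(k) * dble(jacobianstore(i,j,k,ispec)) &
                    * dble(rhostore_new(i,j,k,ispec)) )
          enddo
        enddo
      enddo
    enddo
  end subroutine assemble_mass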
diff --git a/src/specfem3D/prepare_timerun.F90 b/src/specfem3D/prepare_timerun.F90
index 00516328f..a1cbdf9a5 100644
--- a/src/specfem3D/prepare_timerun.F90
+++ b/src/specfem3D/prepare_timerun.F90
@@ -219,6 +219,14 @@ subroutine prepare_timerun_mass_matrices()

   ! the mass matrix needs to be assembled with MPI here once and for all
   if(ACOUSTIC_SIMULATION) then
+
+    ! adds contributions
+    if( ABSORBING_CONDITIONS ) then
+      rmass_acoustic(:) = rmass_acoustic(:) + rmassz_acoustic(:)
+      ! not needed anymore
+      deallocate(rmassz_acoustic)
+    endif
+
     call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
                         my_neighbours_ext_mesh)
@@ -231,21 +239,52 @@ subroutine prepare_timerun_mass_matrices()
   endif

   if(ELASTIC_SIMULATION) then
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass, &
+
+    ! switches to three-component mass matrix
+    if( ABSORBING_CONDITIONS ) then
+      ! adds boundary contributions
+      rmassx(:) = rmass(:) + rmassx(:)
+      rmassy(:) = rmass(:) + rmassy(:)
+      rmassz(:) = rmass(:) + rmassz(:)
+    else
+      rmassx(:) = rmass(:)
+      rmassy(:) = rmass(:)
+      rmassz(:) = rmass(:)
+    endif
+    ! not needed anymore
+    deallocate(rmass)
+
+    ! assemble mass matrix
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassx, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassy, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassz, &
                         num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
                         nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         my_neighbours_ext_mesh)

     ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
-    where(rmass <= 0._CUSTOM_REAL) rmass = 1._CUSTOM_REAL
-    rmass(:) = 1._CUSTOM_REAL / rmass(:)
-
+    where(rmassx <= 0._CUSTOM_REAL) rmassx = 1._CUSTOM_REAL
+    where(rmassy <= 0._CUSTOM_REAL) rmassy = 1._CUSTOM_REAL
+    where(rmassz <= 0._CUSTOM_REAL) rmassz = 1._CUSTOM_REAL
+    rmassx(:) = 1._CUSTOM_REAL / rmassx(:)
+    rmassy(:) = 1._CUSTOM_REAL / rmassy(:)
+    rmassz(:) = 1._CUSTOM_REAL / rmassz(:)
+
+    ! ocean load
     if(OCEANS ) then
-      if( minval(rmass_ocean_load(:)) <= 0._CUSTOM_REAL) &
-        call exit_MPI(myrank,'negative ocean load mass matrix term')
-      rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
+      call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_ocean_load, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+      where(rmass_ocean_load <= 0._CUSTOM_REAL) rmass_ocean_load = 1._CUSTOM_REAL
+      rmass_ocean_load(:) = 1._CUSTOM_REAL / rmass_ocean_load(:)
     endif
-
   endif

   if(POROELASTIC_SIMULATION) then
@@ -1077,7 +1116,7 @@ subroutine prepare_timerun_GPU()
                                   ispec_is_acoustic, &
                                   NOISE_TOMOGRAPHY,num_free_surface_faces, &
                                   free_surface_ispec,free_surface_ijk, &
-                                  ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
+                                  b_reclen_potential,b_absorb_potential, &
                                   ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
                                   coupling_ac_el_ispec,coupling_ac_el_ijk, &
                                   coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
@@ -1086,19 +1125,19 @@ subroutine prepare_timerun_GPU()

     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_acoustic_adj_dev(Mesh_pointer, &
-                                  SIMULATION_TYPE, &
-                                  APPROXIMATE_HESS_KL)
+                                  APPROXIMATE_HESS_KL)

   endif

   ! prepares fields on GPU for elastic simulations
   if( ELASTIC_SIMULATION ) then
     call prepare_fields_elastic_device(Mesh_pointer, NDIM*NGLOB_AB, &
-                                  rmass,rho_vp,rho_vs, &
+                                  rmassx,rmassy,rmassz, &
+                                  rho_vp,rho_vs, &
                                   num_phase_ispec_elastic,phase_ispec_inner_elastic, &
                                   ispec_is_elastic, &
-                                  ABSORBING_CONDITIONS,b_absorb_field,b_reclen_field, &
-                                  SIMULATION_TYPE,SAVE_FORWARD, &
+                                  b_absorb_field,b_reclen_field, &
+                                  SAVE_FORWARD, &
                                   COMPUTE_AND_STORE_STRAIN, &
                                   epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                   epsilondev_xz,epsilondev_yz, &
@@ -1122,7 +1161,6 @@ subroutine prepare_timerun_GPU()

     if( SIMULATION_TYPE == 3 ) &
       call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &
-                                  SIMULATION_TYPE, &
                                   COMPUTE_AND_STORE_STRAIN, &
                                   epsilon_trace_over_3, &
                                   b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
@@ -1155,7 +1193,7 @@ subroutine prepare_timerun_GPU()
                                   free_surface_ispec, &
                                   free_surface_ijk, &
                                   num_free_surface_faces, &
-                                  SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+                                  NOISE_TOMOGRAPHY, &
                                   NSTEP,noise_sourcearray, &
                                   normal_x_noise,normal_y_noise,normal_z_noise, &
                                   mask_noise,free_surface_jacobian2Dw)
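The guard-then-invert pattern above (set non-positive entries to 1, then take the reciprocal) is applied identically to rmassx, rmassy, rmassz and rmass_ocean_load after MPI assembly. A minimal sketch of that step, with a hypothetical helper name and plain REAL kind, follows.

  ! Hedged sketch of the guard-and-invert step used for each assembled diagonal mass vector.
  subroutine invert_mass(n,rmass)
    implicit none
    integer, intent(in) :: n
    real, intent(inout) :: rmass(n)

    ! points that received no contribution would otherwise divide by zero,
    ! so give them a fictitious value of 1 before inverting
    where( rmass <= 0.0 ) rmass = 1.0
    rmass(:) = 1.0 / rmass(:)
  end subroutine invert_mass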
diff --git a/src/specfem3D/read_mesh_databases.f90 b/src/specfem3D/read_mesh_databases.f90
index d79d21f70..24b590601 100644
--- a/src/specfem3D/read_mesh_databases.f90
+++ b/src/specfem3D/read_mesh_databases.f90
@@ -95,6 +95,12 @@ subroutine read_mesh_databases()
     ! mass matrix, density
     allocate(rmass_acoustic(NGLOB_AB),stat=ier)
     if( ier /= 0 ) stop 'error allocating array rmass_acoustic'
+
+    ! initializes mass matrix contribution
+    allocate(rmassz_acoustic(NGLOB_AB),stat=ier)
+    if( ier /= 0 ) stop 'error allocating array rmassz_acoustic'
+    rmassz_acoustic(:) = 0._CUSTOM_REAL
+
     allocate(rhostore(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
     if( ier /= 0 ) stop 'error allocating array rhostore'
@@ -120,8 +126,20 @@ subroutine read_mesh_databases()
       allocate(accel_adj_coupling(NDIM,NGLOB_AB),stat=ier)
       if( ier /= 0 ) stop 'error allocating array accel_adj_coupling'
     endif
+
+    ! allocates mass matrix
     allocate(rmass(NGLOB_AB),stat=ier)
     if( ier /= 0 ) stop 'error allocating array rmass'
+    ! initializes mass matrix contributions
+    allocate(rmassx(NGLOB_AB), &
+             rmassy(NGLOB_AB), &
+             rmassz(NGLOB_AB), &
+             stat=ier)
+    if( ier /= 0 ) stop 'error allocating array rmassx,rmassy,rmassz'
+    rmassx(:) = 0._CUSTOM_REAL
+    rmassy(:) = 0._CUSTOM_REAL
+    rmassz(:) = 0._CUSTOM_REAL
+
     allocate(rho_vp(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
     if( ier /= 0 ) stop 'error allocating array rho_vp'
     allocate(rho_vs(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
@@ -312,6 +330,15 @@ subroutine read_mesh_databases()
     read(27) abs_boundary_ijk
     read(27) abs_boundary_jacobian2Dw
     read(27) abs_boundary_normal
+    ! reads mass matrix contributions on boundaries
+    if( ELASTIC_SIMULATION ) then
+      read(27) rmassx
+      read(27) rmassy
+      read(27) rmassz
+    endif
+    if( ACOUSTIC_SIMULATION ) then
+      read(27) rmassz_acoustic
+    endif
   endif

   ! free surface
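The new rmassx/rmassy/rmassz and rmassz_acoustic records are read immediately after the absorbing-boundary arrays, so the reader and the database writer must consume records in exactly the same order. A tiny standalone demonstration of that positional behaviour of Fortran unformatted sequential records follows; it is not SPECFEM3D code, and the file name and variables are made up for illustration.

  ! Minimal demo: records come back in the order they were written.
  program record_order_demo
    implicit none
    integer, parameter :: n = 4
    real :: rmassx(n), rmassy(n), rx(n), ry(n)
    rmassx = 1.0
    rmassy = 2.0
    open(27,file='demo.bin',form='unformatted',status='replace')
    write(27) rmassx          ! first record
    write(27) rmassy          ! second record
    close(27)
    open(27,file='demo.bin',form='unformatted',status='old')
    read(27) rx               ! must be read first
    read(27) ry               ! then second
    close(27)
    print *, rx(1), ry(1)     ! prints 1.0 and 2.0
  end program record_order_demo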
diff --git a/src/specfem3D/save_external_bin_m_up.f90 b/src/specfem3D/save_external_bin_m_up.f90
index 71e2e85f0..6730a2cfa 100644
--- a/src/specfem3D/save_external_bin_m_up.f90
+++ b/src/specfem3D/save_external_bin_m_up.f90
@@ -45,9 +45,6 @@ subroutine save_external_bin_m_up(nspec,nglob, &
                         free_surface_normal,free_surface_jacobian2Dw, &
                         free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces, &
-                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
-                        num_coupling_ac_el_faces, &
                         num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                         max_nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
                         prname,SAVE_MESH_FILES, &
@@ -58,6 +55,24 @@ subroutine save_external_bin_m_up(nspec,nglob, &
                         c55store,c56store,c66store, &
                         ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic)

+  use specfem_par,only: &
+    ispec_is_inner
+
+  use specfem_par_elastic,only: &
+    rmassx,rmassy,rmassz, &
+    nspec_inner_elastic,nspec_outer_elastic,num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+    num_colors_outer_elastic,num_colors_inner_elastic,num_elem_colors_elastic
+
+  use specfem_par_acoustic,only: &
+    rmassz_acoustic,num_coupling_ac_po_faces, &
+    num_coupling_ac_el_faces,coupling_ac_el_ijk,coupling_ac_el_ispec, &
+    nspec_inner_acoustic,nspec_outer_acoustic,num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+    num_colors_outer_acoustic,num_colors_inner_acoustic,num_elem_colors_acoustic
+
+  use specfem_par_poroelastic,only: &
+    num_coupling_el_po_faces, &
+    nspec_inner_poroelastic,nspec_outer_poroelastic,num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic
+
   implicit none

   include "constants.h"
@@ -76,6 +91,7 @@ subroutine save_external_bin_m_up(nspec,nglob, &
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
   real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
                         rmass_solid_poroelastic,rmass_fluid_poroelastic
+  ! ocean load
   logical :: OCEANS
   integer :: NGLOB_OCEAN
@@ -99,13 +115,6 @@ subroutine save_external_bin_m_up(nspec,nglob, &
   integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
   integer :: free_surface_ispec(num_free_surface_faces)

-! acoustic-elastic coupling surface
-  integer :: num_coupling_ac_el_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
-
! MPI interfaces
   integer :: num_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
@@ -192,7 +201,6 @@ subroutine save_external_bin_m_up(nspec,nglob, &
     !pll Stacey
     write(IOUT) rho_vp
     write(IOUT) rho_vs
-
   endif

! poroelastic
@@ -200,39 +208,65 @@ subroutine save_external_bin_m_up(nspec,nglob, &
   if( POROELASTIC_SIMULATION ) then
     write(IOUT) rmass_solid_poroelastic
     write(IOUT) rmass_fluid_poroelastic
+    stop 'model update with poroelastic domains not supported yet'
   endif

! absorbing boundary surface
   write(IOUT) num_abs_boundary_faces
-  write(IOUT) abs_boundary_ispec
-  write(IOUT) abs_boundary_ijk
-  write(IOUT) abs_boundary_jacobian2Dw
-  write(IOUT) abs_boundary_normal
-
+  if(num_abs_boundary_faces > 0 ) then
+    write(IOUT) abs_boundary_ispec
+    write(IOUT) abs_boundary_ijk
+    write(IOUT) abs_boundary_jacobian2Dw
+    write(IOUT) abs_boundary_normal
+    ! store mass matrix contributions
+    if(ELASTIC_SIMULATION) then
+      write(IOUT) rmassx
+      write(IOUT) rmassy
+      write(IOUT) rmassz
+    endif
+    if(ACOUSTIC_SIMULATION) then
+      write(IOUT) rmassz_acoustic
+    endif
+  endif
+
! free surface
   write(IOUT) num_free_surface_faces
-  write(IOUT) free_surface_ispec
-  write(IOUT) free_surface_ijk
-  write(IOUT) free_surface_jacobian2Dw
-  write(IOUT) free_surface_normal
+  if( num_free_surface_faces > 0 ) then
+    write(IOUT) free_surface_ispec
+    write(IOUT) free_surface_ijk
+    write(IOUT) free_surface_jacobian2Dw
+    write(IOUT) free_surface_normal
+  endif

! acoustic-elastic coupling surface
   write(IOUT) num_coupling_ac_el_faces
-  write(IOUT) coupling_ac_el_ispec
-  write(IOUT) coupling_ac_el_ijk
-  write(IOUT) coupling_ac_el_jacobian2Dw
-  write(IOUT) coupling_ac_el_normal
+  if( num_coupling_ac_el_faces > 0 ) then
+    stop 'coupling ac_el not updated yet'
+  endif
+
+! acoustic-poroelastic coupling surface
+  write(IOUT) num_coupling_ac_po_faces
+  if( num_coupling_ac_po_faces > 0 ) then
+    stop 'coupling ac_po not updated yet'
+  endif
+
+! elastic-poroelastic coupling surface
+  write(IOUT) num_coupling_el_po_faces
+  if( num_coupling_el_po_faces > 0 ) then
+    stop 'coupling el_po not updated yet'
+  endif

!MPI interfaces
   write(IOUT) num_interfaces_ext_mesh
-  write(IOUT) max_nibool_interfaces_ext_mesh !magnoni
-  write(IOUT) my_neighbours_ext_mesh
-  write(IOUT) nibool_interfaces_ext_mesh !magnoni
-  write(IOUT) ibool_interfaces_ext_mesh !magnoni
-
+  if( num_interfaces_ext_mesh > 0 ) then
+    write(IOUT) max_nibool_interfaces_ext_mesh
+    write(IOUT) my_neighbours_ext_mesh
+    write(IOUT) nibool_interfaces_ext_mesh
+    write(IOUT) ibool_interfaces_ext_mesh !magnoni
+  endif

! anisotropy
-  if( ANISOTROPY ) then
+  if( ELASTIC_SIMULATION .and. ANISOTROPY ) then
     write(IOUT) c11store
     write(IOUT) c12store
     write(IOUT) c13store
@@ -256,6 +290,39 @@ subroutine save_external_bin_m_up(nspec,nglob, &
     write(IOUT) c66store
   endif

+! inner/outer elements
+  write(IOUT) ispec_is_inner
+
+  if( ACOUSTIC_SIMULATION ) then
+    write(IOUT) nspec_inner_acoustic,nspec_outer_acoustic
+    write(IOUT) num_phase_ispec_acoustic
+    if(num_phase_ispec_acoustic > 0 ) write(IOUT) phase_ispec_inner_acoustic
+  endif
+
+  if( ELASTIC_SIMULATION ) then
+    write(IOUT) nspec_inner_elastic,nspec_outer_elastic
+    write(IOUT) num_phase_ispec_elastic
+    if(num_phase_ispec_elastic > 0 ) write(IOUT) phase_ispec_inner_elastic
+  endif
+
+  if( POROELASTIC_SIMULATION ) then
+    write(IOUT) nspec_inner_poroelastic,nspec_outer_poroelastic
+    write(IOUT) num_phase_ispec_poroelastic
+    if(num_phase_ispec_poroelastic > 0 ) write(IOUT) phase_ispec_inner_poroelastic
+  endif
+
+  ! mesh coloring
+  if( USE_MESH_COLORING_GPU ) then
+    if( ACOUSTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_acoustic,num_colors_inner_acoustic
+      write(IOUT) num_elem_colors_acoustic
+    endif
+    if( ELASTIC_SIMULATION ) then
+      write(IOUT) num_colors_outer_elastic,num_colors_inner_elastic
+      write(IOUT) num_elem_colors_elastic
+    endif
+  endif
+
   close(IOUT)

diff --git a/src/specfem3D/specfem3D_par.f90 b/src/specfem3D/specfem3D_par.f90
index 02b07cae2..113961028 100644
--- a/src/specfem3D/specfem3D_par.f90
+++ b/src/specfem3D/specfem3D_par.f90
@@ -297,6 +297,7 @@ module specfem_par_elastic
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass

   ! Stacey
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmassx,rmassy,rmassz
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_vp,rho_vs

   ! anisotropic
@@ -381,6 +382,7 @@ module specfem_par_acoustic

   ! mass matrix
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass_acoustic
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmassz_acoustic

   ! acoustic-elastic coupling surface
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: coupling_ac_el_normal
diff --git a/src/specfem3D/write_output_SU.f90 b/src/specfem3D/write_output_SU.f90
index 932f06ca5..f9fa24edf 100644
--- a/src/specfem3D/write_output_SU.f90
+++ b/src/specfem3D/write_output_SU.f90
@@ -38,17 +38,12 @@ subroutine write_output_SU()
   character(len=256) procname,final_LOCAL_PATH
   integer :: irec_local,irec

-  ! headers
-  integer,parameter :: nheader=240 ! 240 bytes
-!! DK DK integer(kind=2) :: i2head(nheader/2) ! 2-byte-integer
-  integer(kind=4) :: i4head(nheader/4) ! 4-byte-integer
-!! DK DK equivalence (i2head,i4head) ! share the same 240-byte-memory
-
   double precision, allocatable, dimension(:) :: x_found,y_found,z_found
   double precision :: x_found_source,y_found_source,z_found_source

   real(kind=4),dimension(:),allocatable :: rtmpseis
-  integer :: ier
+  real :: dx
+  integer :: i,ier

   allocate(x_found(nrec),y_found(nrec),z_found(nrec),stat=ier)
   if( ier /= 0 ) stop 'error allocating array x_found y_found z_found'
@@ -83,87 +78,141 @@ subroutine write_output_SU()

   ! write seismograms (dx)
   open(unit=IOUT_SU, file=trim(adjustl(final_LOCAL_PATH))//trim(adjustl(procname))//'_dx_SU' ,&
-       form='unformatted', access='direct', recl=240+4*NSTEP, iostat=ier)
+       status='unknown', access='direct', recl=4, iostat=ier)
+  if( ier /= 0 ) stop 'error opening ***_dx_SU file'

+  ! receiver interval
+  if( nrec > 1) then
+    dx = SNGL(x_found(2)-x_found(1))
+  else
+    dx = 0.0
+  endif
+
   do irec_local = 1,nrec_local
-    irec = number_receiver_global(irec_local)
-    i4head(1) =irec
-    i4head(11) =z_found(irec)
-    i4head(13) =z_found_source
-    i4head(19) =x_found_source !utm_x_source(1)
-    i4head(20) =y_found_source !utm_y_source(1)
-    i4head(21) =x_found(irec) !stutm_x(irec)
-    i4head(22) =y_found(irec) !stutm_y(irec)
-!! DK DK i2head(58) =NSTEP
-!! DK DK i2head(59) =DT*1.0d6
-!! DK DK almost all modern processors are little-endian, thus we do not need an equivalence statement any more
-!! DK DK (some compilers give a warning message for the equivalent statement that we used to have above)
-    i4head(58/2) = NSTEP + 65536*int(DT*1.0d6)
-
-    ! convert to real 4-byte
-    rtmpseis(1:NSTEP) = seismograms_d(1,irec_local,1:NSTEP)
-
-    ! write record
-    write(IOUT_SU,rec=irec_local) i4head, rtmpseis
+    irec = number_receiver_global(irec_local)
+
+    ! write section header
+    call write_SU_header(irec_local,irec,nrec,NSTEP,DT,dx, &
+                         x_found(irec),y_found(irec),z_found(irec),x_found_source,y_found_source,z_found_source)
+
+    ! convert trace to real 4-byte
+    if( CUSTOM_REAL == SIZE_REAL) then
+      rtmpseis(1:NSTEP) = seismograms_d(1,irec_local,1:NSTEP)
+    else
+      rtmpseis(1:NSTEP) = sngl(seismograms_d(1,irec_local,1:NSTEP))
+    endif
+    do i=1,NSTEP
+      write(IOUT_SU,rec=irec_local*60+(irec_local-1)*NSTEP+i) rtmpseis(i)
+    enddo
+
   enddo
   close(IOUT_SU)

   ! write seismograms (dy)
   open(unit=IOUT_SU, file=trim(adjustl(final_LOCAL_PATH))//trim(adjustl(procname))//'_dy_SU' ,&
-       form='unformatted', access='direct', recl=240+4*NSTEP, iostat=ier)
+       status='unknown', access='direct', recl=4, iostat=ier)
+  if( ier /= 0 ) stop 'error opening ***_dy_SU file'

   do irec_local = 1,nrec_local
-    irec = number_receiver_global(irec_local)
-    i4head(1) =irec
-    i4head(11) =z_found(irec)
-    i4head(13) =z_found_source
-    i4head(19) =x_found_source !utm_x_source(1)
-    i4head(20) =y_found_source !utm_y_source(1)
-    i4head(21) =x_found(irec) !stutm_x(irec)
-    i4head(22) =y_found(irec) !stutm_y(irec)
-!! DK DK i2head(58) =NSTEP
-!! DK DK i2head(59) =DT*1.0d6
-!! DK DK almost all modern processors are little-endian, thus we do not need an equivalence statement any more
-!! DK DK (some compilers give a warning message for the equivalent statement that we used to have above)
-    i4head(58/2) = NSTEP + 65536*int(DT*1.0d6)
-
-    ! convert to real 4-byte
-    rtmpseis(1:NSTEP) = seismograms_d(2,irec_local,1:NSTEP)
-
-    ! write record
-    write(IOUT_SU,rec=irec_local) i4head, rtmpseis
+    irec = number_receiver_global(irec_local)
+
+    ! write section header
+    call write_SU_header(irec_local,irec,nrec,NSTEP,DT,dx, &
+                         x_found(irec),y_found(irec),z_found(irec),x_found_source,y_found_source,z_found_source)
+
+    ! convert trace to real 4-byte
+    if( CUSTOM_REAL == SIZE_REAL) then
+      rtmpseis(1:NSTEP) = seismograms_d(2,irec_local,1:NSTEP)
+    else
+      rtmpseis(1:NSTEP) = sngl(seismograms_d(2,irec_local,1:NSTEP))
+    endif
+    do i=1,NSTEP
+      write(IOUT_SU,rec=irec_local*60+(irec_local-1)*NSTEP+i) rtmpseis(i)
+    enddo
   enddo
   close(IOUT_SU)

   ! write seismograms (dz)
   open(unit=IOUT_SU, file=trim(adjustl(final_LOCAL_PATH))//trim(adjustl(procname))//'_dz_SU' ,&
-       form='unformatted', access='direct', recl=240+4*NSTEP, iostat=ier)
+       status='unknown', access='direct', recl=4, iostat=ier)
+  if( ier /= 0 ) stop 'error opening ***_dz_SU file'

   do irec_local = 1,nrec_local
-    irec = number_receiver_global(irec_local)
-    i4head(1) =irec
-    i4head(11) =z_found(irec)
-    i4head(13) =z_found_source
-    i4head(19) =x_found_source !utm_x_source(1)
-    i4head(20) =y_found_source !utm_y_source(1)
-    i4head(21) =x_found(irec) !stutm_x(irec)
-    i4head(22) =y_found(irec) !stutm_y(irec)
-!! DK DK i2head(58) =NSTEP
-!! DK DK i2head(59) =DT*1.0d6
-!! DK DK almost all modern processors are little-endian, thus we do not need an equivalence statement any more
-!! DK DK (some compilers give a warning message for the equivalent statement that we used to have above)
-    i4head(58/2) = NSTEP + 65536*int(DT*1.0d6)
-
-    ! convert to real 4-byte
-    rtmpseis(1:NSTEP) = seismograms_d(3,irec_local,1:NSTEP)
-
-    ! write record
-    write(IOUT_SU,rec=irec_local) i4head, rtmpseis
+    irec = number_receiver_global(irec_local)
+
+    ! write section header
+    call write_SU_header(irec_local,irec,nrec,NSTEP,DT,dx, &
+                         x_found(irec),y_found(irec),z_found(irec),x_found_source,y_found_source,z_found_source)
+
+    ! convert trace to real 4-byte
+    if( CUSTOM_REAL == SIZE_REAL) then
+      rtmpseis(1:NSTEP) = seismograms_d(3,irec_local,1:NSTEP)
+    else
+      rtmpseis(1:NSTEP) = sngl(seismograms_d(3,irec_local,1:NSTEP))
+    endif
+    do i=1,NSTEP
+      write(IOUT_SU,rec=irec_local*60+(irec_local-1)*NSTEP+i) rtmpseis(i)
+    enddo
+
   enddo
   close(IOUT_SU)

 end subroutine write_output_SU

+!
+!------------------------------------------------------------------------------------------------------
+!
+
+  subroutine write_SU_header(irec_local,irec,nrec,NSTEP,DT,dx, &
+                             x_found,y_found,z_found,x_found_source,y_found_source,z_found_source)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: irec_local,irec,nrec
+  integer :: NSTEP
+  double precision :: x_found,y_found,z_found
+  double precision :: x_found_source,y_found_source,z_found_source
+  double precision :: DT
+  real :: dx
+
+  ! local parameters
+  integer(kind=2) :: header2(2)
+
+  ! write SU headers (refer to Seismic Unix for details)
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+1)  irec ! receiver ID
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+10) NINT(x_found-x_found_source) ! offset
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+11) NINT(z_found)
+
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+13) NINT(z_found_source) ! source location
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+19) NINT(x_found_source) ! source location
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+20) NINT(y_found_source) ! source location
+
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+21) NINT(x_found) ! receiver location xr
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+22) NINT(y_found) ! receiver location zr
+
+  if (nrec>1) write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+48) dx ! receiver interval
+
+  ! time steps
+  header2(1)=0 ! dummy
+  header2(2)=NSTEP
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+29) header2
+
+  ! time increment
+  if( NINT(DT*1.0d6) < 65536 ) then
+    header2(1)=NINT(DT*1.0d6) ! deltat (unit: 10^{-6} second)
+  else if( NINT(DT*1.0d3) < 65536 ) then
+    header2(1)=NINT(DT*1.0d3) ! deltat (unit: 10^{-3} second)
+  else
+    header2(1)=NINT(DT) ! deltat (unit: 10^{0} second)
+  endif
+  header2(2)=0 ! dummy
+  write(IOUT_SU,rec=(irec_local-1)*60+(irec_local-1)*NSTEP+30) header2
+
+  end subroutine write_SU_header
+
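The direct-access addressing used above assumes each trace occupies a 240-byte SU header (60 four-byte words) followed by NSTEP four-byte samples, with recl=4. A short sketch of that record arithmetic, using a hypothetical helper module rather than anything in the patch, may help when reading the offsets back:

  ! Hedged sketch of the SU record layout implied by the writes above (illustrative only).
  module su_layout
    implicit none
  contains
    ! record number of header word iword (1..60) of local trace irec_local
    integer function su_header_rec(irec_local,nstep,iword)
      integer, intent(in) :: irec_local,nstep,iword
      su_header_rec = (irec_local-1)*60 + (irec_local-1)*nstep + iword
    end function su_header_rec
    ! record number of sample it (1..nstep) of local trace irec_local
    integer function su_sample_rec(irec_local,nstep,it)
      integer, intent(in) :: irec_local,nstep,it
      su_sample_rec = irec_local*60 + (irec_local-1)*nstep + it
    end function su_sample_rec
  end module su_layout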
diff --git a/src/specfem3D/write_seismograms.f90 b/src/specfem3D/write_seismograms.f90
index d9767719a..5040410a9 100644
--- a/src/specfem3D/write_seismograms.f90
+++ b/src/specfem3D/write_seismograms.f90
@@ -55,11 +55,10 @@ subroutine write_seismograms()
       if( ACOUSTIC_SIMULATION ) then
         ! only copy corresponding elements to CPU host
         ! timing: Elapsed time: 5.230904e-04
-        call transfer_station_ac_from_device( &
-                        potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
+        call transfer_station_ac_from_device(potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, &
                         b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
                         Mesh_pointer,number_receiver_global, &
-                        ispec_selected_rec,ispec_selected_source,ibool,SIMULATION_TYPE)
+                        ispec_selected_rec,ispec_selected_source,ibool)

         ! alternative: transfers whole fields
         ! timing: Elapsed time: 4.138947e-03
@@ -71,7 +70,6 @@ subroutine write_seismograms()
       if( ELASTIC_SIMULATION ) then
         if(USE_CUDA_SEISMOGRAMS) then
           call transfer_seismograms_el_from_d(nrec_local,Mesh_pointer, &
-                                              SIMULATION_TYPE,&
                                               seismograms_d,seismograms_v,seismograms_a,&
                                               it)
         else
@@ -79,7 +77,7 @@ subroutine write_seismograms()
                                               b_displ,b_veloc,b_accel, &
                                               Mesh_pointer,number_receiver_global, &
                                               ispec_selected_rec,ispec_selected_source, &
-                                              ibool,SIMULATION_TYPE)
+                                              ibool)
         endif
         ! alternative: transfers whole fields
         ! call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
diff --git a/utils/update_headers_change_word_f90.pl b/utils/update_headers_change_word_f90.pl
index d5eb2da77..b473c2e1c 100755
--- a/utils/update_headers_change_word_f90.pl
+++ b/utils/update_headers_change_word_f90.pl
@@ -16,7 +16,7 @@
 # in the code (which is dangerous, but really easier to program...)
 #

-@objects = `ls ../src/*/*.f90 ../src/*/*.F90 ../src/*/*.c ../src/*/*.cu ../src/*/*.h.in ../src/*/*.h`;
+@objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.c src/*/*.cu src/*/*.h.in src/*/*.h`;

 foreach $name (@objects) {
   chop $name;