From 68c7a7720fcee072fbb6cb93ec3d1580563274b0 Mon Sep 17 00:00:00 2001
From: Daniel Peter <daniel.peter22@gmail.com>
Date: Thu, 25 May 2023 22:29:45 +0200
Subject: [PATCH] adds LTS to solver

---
 setup/constants.h.in                          |   9 +-
 src/decompose_mesh/lts_helper.F90             |  34 +-
 src/decompose_mesh/lts_setup_elements.F90     |  38 +-
 .../part_decompose_mesh_hdf5.F90              |   2 +-
 .../write_mesh_databases_hdf5.F90             |   2 +-
 .../lts_generate_databases.F90                |  70 +-
 src/inverse_problem_for_model/rules.mk        |   4 +
 .../specfem_interface_mod.F90                 |   7 +-
 src/shared/check_mesh_resolution.f90          |  10 +-
 src/shared/read_parameter_file.F90            |  91 ++
 src/shared/write_VTK_data.f90                 |  96 ++
 src/specfem3D/assemble_MPI_vector.f90         |  12 +-
 .../compute_add_sources_acoustic.f90          |  20 +-
 .../compute_add_sources_viscoelastic.F90      | 100 ++-
 src/specfem3D/compute_element_att_memory.f90  | 128 ++-
 ...ompute_forces_acoustic_calling_routine.f90 |  10 +-
 ...ute_forces_poroelastic_calling_routine.f90 |  40 +-
 src/specfem3D/compute_forces_viscoelastic.F90 |  67 +-
 ...te_forces_viscoelastic_calling_routine.F90 |  26 +-
 src/specfem3D/fault_solver_common.f90         | 242 ++++-
 src/specfem3D/fault_solver_dynamic.f90        | 843 ++++++++++++++++--
 src/specfem3D/fault_solver_kinematic.f90      | 297 +++++-
 src/specfem3D/finalize_simulation.f90         |  15 +
 src/specfem3D/iterate_time.F90                |  11 +
 src/specfem3D/iterate_time_undoatt.F90        |   2 +
 src/specfem3D/locate_source.F90               |  69 +-
 src/specfem3D/lts_setup.F90                   | 582 ++++++++----
 src/specfem3D/noise_tomography.f90            |   6 +-
 src/specfem3D/read_mesh_databases_hdf5.F90    |   2 +-
 src/specfem3D/rules.mk                        |   9 +
 src/specfem3D/setup_sources_receivers.f90     |   8 +-
 src/specfem3D/specfem3D_par.F90               |  54 +-
 src/specfem3D/write_movie_output_HDF5.F90     |   5 +-
 tests/decompose_mesh/test_partitioning.f90    |   1 +
 .../decompose_mesh/test_partitioning.makefile |   6 +-
 tests/decompose_mesh/test_read.makefile       |   6 +-
 tests/decompose_mesh/test_valence.makefile    |   6 +-
 37 files changed, 2360 insertions(+), 570 deletions(-)

diff --git a/setup/constants.h.in b/setup/constants.h.in
index 3fafa8aee..ab1e216cf 100644
--- a/setup/constants.h.in
+++ b/setup/constants.h.in
@@ -525,12 +525,17 @@
 ! adds overlap elements
   logical,parameter :: LTS_OVERLAP_REGION   = .false.
 
-! uses only 1 single p-level (for debugging)
+! uses only 1 single p-level
   logical,parameter :: LTS_SINGLE_P_LEVEL = .false.
-! uses only 2 p-levels (for debugging)
+! uses only 2 p-levels
   logical,parameter :: LTS_TWO_P_LEVEL = .false.
 
 ! decreases final coarsest time step for stability
+! note: LTS must decrease time step for stability depending on p depth;
+!       to avoid this decrease, one can try to overlap the fine region by one/two elements.
+!       by default, we add an LTS_SAFETY_MARGIN to the suggested LTS time steps and LTS element binning;
+!       where the coarsest time step is calculated as a multiple of the user DT input in Par_file.
+!       here, we can further decrease this coarsest step for LTS simulations to add additional stability.
   logical,parameter :: LTS_DECREASE_DT = .false.
 ! time step margin to use for decrease
   double precision,parameter :: LTS_STABILITY_MARGIN_DT = 0.6d0
diff --git a/src/decompose_mesh/lts_helper.F90 b/src/decompose_mesh/lts_helper.F90
index 8398e0a9b..cb13c2e79 100644
--- a/src/decompose_mesh/lts_helper.F90
+++ b/src/decompose_mesh/lts_helper.F90
@@ -37,38 +37,18 @@ subroutine lts_add_overlap_elements(nglob,nspec,iglob_p_refine,ispec_p_refine)
   integer,dimension(nspec),intent(inout) :: ispec_p_refine
 
   ! local parameters
-  integer :: ispec,iglob,i,p
+  integer :: ispec,iglob,i,p,icycle
+  ! number of cycles to add overlap by one element layer
+  integer, parameter :: NUMBER_OF_OVERLAP_CYCLES = 1
 
   ! user output
   print *,'  adding overlap region'
+  print *,'    number of overlap cycles = ', NUMBER_OF_OVERLAP_CYCLES
   print *
 
   ! adds overlap by one element
-  ! 1. overlap
-  do ispec = 1,NSPEC
-    ! sets refinement on all points
-    ! this enlarges the finer region by one element
-    p = ispec_p_refine(ispec)
-    do i = 1,NGNOD
-      iglob = elmnts(i,ispec)
-      if ( p > iglob_p_refine(iglob) ) iglob_p_refine(iglob) = p
-    enddo
-  enddo
-  ! re-sets p refinement for element due to overlap addition above
-  ! (adds also elements touching global points with higher refinement)
-  ispec_p_refine(:) = 0
-  do ispec = 1,NSPEC
-    do i = 1,NGNOD
-      iglob = elmnts(i,ispec)
-      p = iglob_p_refine(iglob)
-      if ( p > ispec_p_refine(ispec)) ispec_p_refine(ispec) = p
-    enddo
-  enddo
-  ! checks that all elements have a valid p-value assigned
-  if ( minval(ispec_p_refine(:)) == 0 ) stop 'error ispec_p_refine has zero entry'
-
-  ! 2. overlap
-  if ( .false. ) then
+  do icycle = 1,NUMBER_OF_OVERLAP_CYCLES
+    ! flags all shared nodes according to the highest element p-level
     do ispec = 1,NSPEC
       ! sets refinement on all points
       ! this enlarges the finer region by one element
@@ -90,7 +70,7 @@ subroutine lts_add_overlap_elements(nglob,nspec,iglob_p_refine,ispec_p_refine)
     enddo
     ! checks that all elements have a valid p-value assigned
     if ( minval(ispec_p_refine(:)) == 0 ) stop 'error ispec_p_refine has zero entry'
-  endif
+  enddo
 
   end subroutine lts_add_overlap_elements
 
diff --git a/src/decompose_mesh/lts_setup_elements.F90 b/src/decompose_mesh/lts_setup_elements.F90
index 15406916d..66c5fc8a3 100644
--- a/src/decompose_mesh/lts_setup_elements.F90
+++ b/src/decompose_mesh/lts_setup_elements.F90
@@ -50,8 +50,8 @@ subroutine lts_setup_elements()
 
   ! time steps
   double precision :: dtmin,dtmax
-  double precision :: time_step
   double precision :: deltat_lts
+  double precision :: dt_suggested
 
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: ispec_time_step
 
@@ -155,35 +155,39 @@ subroutine lts_setup_elements()
     distance_min = elemsize * percent_GLL(NGLLX)
 
     ! estimated time step based on CFL condition
-    time_step = COURANT_SUGGESTED * distance_min / max( vp,vs )
+    dt_suggested = COURANT_SUGGESTED * distance_min / max( vp,vs )
+
+    ! cut at a significant number of digits (2 digits)
+    ! example: 0.0734815 -> lpow = (2 - (-1) = 3 -> 0.0730
+    call get_timestep_limit_significant_digit_dp(dt_suggested)
 
     ! stores value
-    ispec_time_step(ispec) = time_step
+    ispec_time_step(ispec) = dt_suggested
 
     ! determines coarsest time step
-    if (dtmax < time_step) then
-      dtmax = time_step
+    if (dtmax < dt_suggested) then
+      dtmax = dt_suggested
     endif
     ! determines finest time step
-    if (dtmin > time_step) then
-      dtmin = time_step
+    if (dtmin > dt_suggested) then
+      dtmin = dt_suggested
     endif
   enddo
-  print *,'  estimated time step min   = ',sngl(dtmin),' seconds'
-  print *,'  estimated time step max   = ',sngl(dtmax),' seconds'
-  print *,'  estimated time step ratio = ',sngl(dtmax/dtmin)
+  print *,'  estimated time step min   = ',dtmin,' seconds'
+  print *,'  estimated time step max   = ',dtmax,' seconds'
+  print *,'  estimated time step ratio = ',dtmax/dtmin
   print *
 
   ! users sets DT = .. in Par_file, we use this as a threshold limit for the minimum time step size
   ! (useful in case CFL is underestimating size)
   if (dtmin > DT) then
-    print *,'  DT time step size set by Par_file: DT = ',sngl(DT),' limits coarsest time step size'
+    print *,'  DT time step size set by Par_file: DT = ',DT,' limits coarsest time step size'
     if (dtmin / DT >= 2.0) then
-      print *,'  Please set DT to higher value    :      ',sngl(dtmin),' otherwise LTS will not be optimal'
+      print *,'  Please set DT to higher value    :      ',dtmin,' otherwise LTS will not be optimal'
     endif
     print *
   endif
-  print *,'  suggested minimum DT time step = ',sngl(dtmin)
+  print *,'  suggested minimum DT time step = ',dtmin
 
   ! we will use now the smallest time step estimate and put elements
   ! into bins with increasing time step size (power of 2 of smallest time step)
@@ -193,17 +197,17 @@ subroutine lts_setup_elements()
   do ispec = 1,nspec
 
     ! gets estimated time step based on CFL condition
-    time_step = ispec_time_step(ispec)
+    dt_suggested = ispec_time_step(ispec)
 
     ! uses a 5% safety margin for binning
-    time_step = ( 1.d0 - LTS_SAFETY_MARGIN ) * time_step
+    dt_suggested = ( 1.d0 - LTS_SAFETY_MARGIN ) * dt_suggested
 
     ! local time step level compared to global time step size
-    p = floor( time_step / dtmin )
+    p = floor( dt_suggested / dtmin )
     if (p < 1) p = 1
 
     ! debug
-    !print *,'ispec',ispec,'time step=',time_step,dtmin,'     p refinement = ',p
+    !print *,'ispec',ispec,'time step=',dt_suggested,dtmin,'     p refinement = ',p
 
     ! debug
     ! uniform distribution for testing
diff --git a/src/decompose_mesh/part_decompose_mesh_hdf5.F90 b/src/decompose_mesh/part_decompose_mesh_hdf5.F90
index 908e16375..f87a3d215 100644
--- a/src/decompose_mesh/part_decompose_mesh_hdf5.F90
+++ b/src/decompose_mesh/part_decompose_mesh_hdf5.F90
@@ -1042,7 +1042,7 @@ subroutine write_moho_surface_database_h5(iproc, nspec, &
     endif
   enddo
 
-  !#TODO: check if this works... creates a single group per process
+  !#TODO: HDF5 moho check if this works... creates a single group per process
 
   ! create group name for the database of the current proc
   write(gname_proc,"('proc',i6.6,'_')") iproc
diff --git a/src/decompose_mesh/write_mesh_databases_hdf5.F90 b/src/decompose_mesh/write_mesh_databases_hdf5.F90
index 3a63ff6ac..ee9408c38 100644
--- a/src/decompose_mesh/write_mesh_databases_hdf5.F90
+++ b/src/decompose_mesh/write_mesh_databases_hdf5.F90
@@ -240,7 +240,7 @@ subroutine write_mesh_databases_hdf5()
 
   if (COUPLE_WITH_INJECTION_TECHNIQUE .or. MESH_A_CHUNK_OF_THE_EARTH) close(124)
 
-  ! #TODO: hdf5 support for fault simulation not implemented yet
+  ! #TODO: HDF5 support for fault simulation not implemented yet
   ! writes fault database
   if (ANY_FAULT) then
     do ipart = 0, nparts-1
diff --git a/src/generate_databases/lts_generate_databases.F90 b/src/generate_databases/lts_generate_databases.F90
index c15467638..753b310c1 100644
--- a/src/generate_databases/lts_generate_databases.F90
+++ b/src/generate_databases/lts_generate_databases.F90
@@ -98,7 +98,7 @@ subroutine lts_generate_databases()
   ! time steps
   double precision :: dtmin,dtmax
   double precision :: dtmin_glob,dtmax_glob
-  double precision :: time_step
+  double precision :: dt_suggested
 
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: ispec_time_step
 
@@ -215,16 +215,20 @@ subroutine lts_generate_databases()
     !                            NSPEC_AB,NGLOB_AB,ibool,xstore,ystore,zstore)
 
     ! estimated time step based on CFL condition
-    time_step = COURANT_SUGGESTED * distance_min / max( vpmax,vsmax )
+    dt_suggested = COURANT_SUGGESTED * distance_min / max( vpmax,vsmax )
+
+    ! cut at a significant number of digits (2 digits)
+    ! example: 0.0734815 -> lpow = (2 - (-1) = 3 -> 0.0730
+    call get_timestep_limit_significant_digit_dp(dt_suggested)
 
     ! stores value
-    ispec_time_step(ispec) = time_step
+    ispec_time_step(ispec) = dt_suggested
 
     ! determines coarsest time step
-    if (dtmax < time_step) dtmax = time_step
+    if (dtmax < dt_suggested) dtmax = dt_suggested
 
     ! determines finest time step
-    if (dtmin > time_step) dtmin = time_step
+    if (dtmin > dt_suggested) dtmin = dt_suggested
   enddo
 
   ! gets min/max values over all processes
@@ -258,7 +262,7 @@ subroutine lts_generate_databases()
 
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) '     used minimum DT time step = ',dtmin
+    write(IMAIN,*) '     suggested minimum DT time step = ',dtmin
     call flush_IMAIN()
   endif
 
@@ -270,22 +274,22 @@ subroutine lts_generate_databases()
   do ispec = 1,NSPEC_AB
 
     ! gets estimated time step based on CFL condition
-    time_step = ispec_time_step(ispec)
+    dt_suggested = ispec_time_step(ispec)
 
     ! uses a user-defined safety margin for binning
-    time_step = ( 1.d0 - LTS_SAFETY_MARGIN ) * time_step
+    dt_suggested = ( 1.d0 - LTS_SAFETY_MARGIN ) * dt_suggested
 
     ! absorbing conditions need slightly smaller time steps
     ! if (ABSORBING_CONDITIONS) then
     !   ! uses a 60% safety margin for binning
-    !   time_step = ( 1.d0 - 0.3d0 ) * time_step
+    !   dt_suggested = ( 1.d0 - 0.3d0 ) * dt_suggested
     ! endif
 
     ! local time step level compared to global time step size
-    p = floor( time_step / dtmin )
+    p = floor( dt_suggested / dtmin )
     if (p < 1) p = 1
     ! debug
-    !print *,'ispec',ispec,'time step=',time_step,dtmin,'     p refinement = ',p
+    !print *,'ispec',ispec,'time step=',dt_suggested,dtmin,'     p refinement = ',p
 
     ! debug
     ! uniform distribution for testing
@@ -352,14 +356,14 @@ subroutine lts_generate_databases()
 
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) '     final lts time step       = ',deltat_lts
+    write(IMAIN,*) '     suggested global coarsest time step       = ',deltat_lts
     write(IMAIN,*)
     call flush_IMAIN()
   endif
   call synchronize_all()
 
   ! re-orders p' -> p = 1 / p'
-  ! note: 1 == coarsest, p_max == smallest such that local time step == dt / p
+  ! note: 1 == coarsest, p_max == finest level such that local time step == dt / p
   do iglob = 1,NGLOB_AB
     p = iglob_p_refine(iglob)
     iglob_p_refine(iglob) = p_max / p
@@ -1251,47 +1255,21 @@ subroutine lts_add_overlap_elements_databases(iglob_p_refine,ispec_p_refine)
   integer,dimension(NSPEC_AB),intent(inout) :: ispec_p_refine
 
   ! local parameters
-  integer :: ispec,iglob,i,j,k,p
+  integer :: ispec,iglob,i,j,k,p,icycle
+  ! number of cycles to add overlap by one element layer
+  integer, parameter :: NUMBER_OF_OVERLAP_CYCLES = 1
 
   ! user output
   if (myrank == 0) then
     write(IMAIN,*) '     adding overlap region'
+    write(IMAIN,*) '       number of overlap cycles = ', NUMBER_OF_OVERLAP_CYCLES
     write(IMAIN,*)
     call flush_IMAIN()
   endif
 
   ! adds overlap by one element
-  ! 1. overlap
-  do ispec = 1,NSPEC_AB
-    ! sets refinement on all points
-    ! this enlarges the finer region by one element
-    p = ispec_p_refine(ispec)
-    do k = 1,NGLLZ
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-          if (p > iglob_p_refine(iglob)) iglob_p_refine(iglob) = p
-        enddo
-      enddo
-    enddo
-  enddo
-  ! re-sets p refinement for element due to overlap addition above
-  ! (adds also elements touching global points with higher refinement)
-  ispec_p_refine(:) = 0
-  do ispec = 1,NSPEC_AB
-    do k = 1,NGLLZ
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-          p = iglob_p_refine(iglob)
-          if (p > ispec_p_refine(ispec)) ispec_p_refine(ispec) = p
-        enddo
-      enddo
-    enddo
-  enddo
-
-  ! 2. overlap
-  if (.false.) then
+  do icycle = 1,NUMBER_OF_OVERLAP_CYCLES
+    ! flags all shared nodes according to the highest element p-level
     do ispec = 1,NSPEC_AB
       ! sets refinement on all points
       ! this enlarges the finer region by one element
@@ -1319,7 +1297,7 @@ subroutine lts_add_overlap_elements_databases(iglob_p_refine,ispec_p_refine)
         enddo
       enddo
     enddo
-  endif
+  enddo
 
   end subroutine lts_add_overlap_elements_databases
 
diff --git a/src/inverse_problem_for_model/rules.mk b/src/inverse_problem_for_model/rules.mk
index a77b5f5a1..c6de580b1 100644
--- a/src/inverse_problem_for_model/rules.mk
+++ b/src/inverse_problem_for_model/rules.mk
@@ -163,6 +163,10 @@ inverse_problem_for_model_OBJECTS += \
 	$O/locate_point.spec.o \
 	$O/locate_receivers.spec.o \
 	$O/locate_source.spec.o \
+	$O/lts_assemble_MPI_vector.spec.o \
+	$O/lts_global_step.spec.o \
+	$O/lts_iterate_time.spec.o \
+	$O/lts_newmark_update.spec.o \
 	$O/lts_setup.spec.o \
 	$O/make_gravity.spec.o \
 	$O/noise_tomography.spec.o \
diff --git a/src/inverse_problem_for_model/specfem_interface/specfem_interface_mod.F90 b/src/inverse_problem_for_model/specfem_interface/specfem_interface_mod.F90
index 77b1fdd08..d6f61fb36 100644
--- a/src/inverse_problem_for_model/specfem_interface/specfem_interface_mod.F90
+++ b/src/inverse_problem_for_model/specfem_interface/specfem_interface_mod.F90
@@ -1570,7 +1570,12 @@ subroutine  CheckModelSuitabilityForModeling(ModelIsSuitable)
      ! suggested timestep
      vel_max = max( vpmax,vsmax )
      dt_suggested = COURANT_SUGGESTED * distance_min / vel_max
-     dt_suggested_glob = min( dt_suggested_glob, dt_suggested)
+
+     ! cut at a significant number of digits (2 digits)
+     ! example: 0.0734815 -> lpow = (2 - (-1) = 3 -> 0.0730
+     call get_timestep_limit_significant_digit(dt_suggested)
+
+     dt_suggested_glob = min(dt_suggested_glob, dt_suggested)
 
   enddo
 
diff --git a/src/shared/check_mesh_resolution.f90 b/src/shared/check_mesh_resolution.f90
index 7328dbd7c..52443608f 100644
--- a/src/shared/check_mesh_resolution.f90
+++ b/src/shared/check_mesh_resolution.f90
@@ -294,7 +294,13 @@ subroutine check_mesh_resolution(NSPEC_AB,NGLOB_AB,ibool,xstore,ystore,zstore, &
 
     ! suggested timestep
     dt_suggested = COURANT_SUGGESTED * distance_min / vel_max
-    dt_suggested_glob = min( dt_suggested_glob, dt_suggested)
+
+    ! cut at a significant number of digits (2 digits)
+    ! example: 0.0734815 -> lpow = (2 - (-1) = 3 -> 0.0730
+    call get_timestep_limit_significant_digit(dt_suggested)
+
+    ! global suggested timestep
+    dt_suggested_glob = min(dt_suggested_glob, dt_suggested)
 
     ! debug: for vtk output
     if (SAVE_MESH_FILES) tmp2(ispec) = pmax
@@ -736,7 +742,7 @@ end subroutine check_mesh_resolution
 !
 !    ! suggested timestep
 !    dt_suggested = COURANT_SUGGESTED * distance_min / max( vpmax,vp2max,vsmax )
-!    dt_suggested_glob = min( dt_suggested_glob, dt_suggested)
+!    dt_suggested_glob = min(dt_suggested_glob, dt_suggested)
 !
 !    ! debug: for vtk output
 !    if (SAVE_MESH_FILES) tmp2(ispec) = pmax
diff --git a/src/shared/read_parameter_file.F90 b/src/shared/read_parameter_file.F90
index c93452ee6..3762d3526 100644
--- a/src/shared/read_parameter_file.F90
+++ b/src/shared/read_parameter_file.F90
@@ -1438,3 +1438,94 @@ subroutine broadcast_computed_parameters()
 
   end subroutine broadcast_computed_parameters
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_timestep_limit_significant_digit(time_step)
+
+  ! cut at a significant number of digits (e.g., 2 digits) using 1/2 rounding
+  ! example: 0.0734815 -> 0.0730
+  !      and 0.0737777 -> 0.0735
+  !
+  ! also works with different magnitudes of time step sizes (0.118, 0.00523, ..). always cut of after 2 significant digits:
+  ! example: 0.118749999 -> 0.115
+
+  use constants, only: CUSTOM_REAL
+
+  implicit none
+
+  real(kind=CUSTOM_REAL),intent(inout) :: time_step  ! CUSTOM_REAL
+
+  ! rounding
+  integer :: lpow,ival
+  double precision :: fac_pow,dt_cut
+
+  ! initializes
+  dt_cut = time_step
+
+  ! cut at a significant number of digits (2 digits)
+  ! example: 0.0734815 -> lpow = (2 - (-1) = 3
+  lpow = int(2.d0 - log10(dt_cut))
+
+  ! example: -> factor 10**3
+  fac_pow = 10.d0**(lpow)
+
+  ! example: -> 73
+  ival = int(fac_pow * dt_cut)
+
+  ! adds .5-digit (in case): 73.0 -> 0.073
+  if ( (fac_pow * dt_cut - ival) >= 0.5 ) then
+    dt_cut = (dble(ival) + 0.5d0) / fac_pow
+  else
+    dt_cut = dble(ival) / fac_pow
+  endif
+
+  time_step = real(dt_cut,kind=CUSTOM_REAL)
+
+  end subroutine get_timestep_limit_significant_digit
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_timestep_limit_significant_digit_dp(time_step)
+
+  ! cut at a significant number of digits (e.g., 2 digits) using 1/2 rounding
+  ! example: 0.0734815 -> 0.0730
+  !      and 0.0737777 -> 0.0735
+  !
+  ! also works with different magnitudes of time step sizes (0.118, 0.00523, ..). always cut of after 2 significant digits:
+  ! example: 0.118749999 -> 0.115
+
+  implicit none
+
+  double precision,intent(inout) :: time_step      ! double precision
+
+  ! rounding
+  integer :: lpow,ival
+  double precision :: fac_pow,dt_cut
+
+  ! initializes
+  dt_cut = time_step
+
+  ! cut at a significant number of digits (2 digits)
+  ! example: 0.0734815 -> lpow = (2 - (-1) = 3
+  lpow = int(2.d0 - log10(dt_cut))
+
+  ! example: -> factor 10**3
+  fac_pow = 10.d0**(lpow)
+
+  ! example: -> 73
+  ival = int(fac_pow * dt_cut)
+
+  ! adds .5-digit (in case): 73.0 -> 0.073
+  if ( (fac_pow * dt_cut - ival) >= 0.5 ) then
+    dt_cut = (dble(ival) + 0.5d0) / fac_pow
+  else
+    dt_cut = dble(ival) / fac_pow
+  endif
+
+  time_step = dt_cut
+
+  end subroutine get_timestep_limit_significant_digit_dp
diff --git a/src/shared/write_VTK_data.f90 b/src/shared/write_VTK_data.f90
index 87d3ee8b4..8c78a6b4b 100644
--- a/src/shared/write_VTK_data.f90
+++ b/src/shared/write_VTK_data.f90
@@ -1536,9 +1536,105 @@ end subroutine write_VTK_movie_data_elemental
 !
 !  end subroutine write_VTK_movie_data_binary
 !
+
+!
+!-------------------------------------------------------------------------------------------------
 !
+
+  subroutine write_VTK_wavefield(nspec,nglob,xstore,ystore,zstore,ibool, &
+                                 field,prname_file)
+
+  use constants, only: CUSTOM_REAL,MAX_STRING_LEN,IOUT_VTK,NDIM,NGLLX,NGLLY,NGLLZ
+
+  implicit none
+
+  integer,intent(in) :: nspec,nglob
+
+  ! global coordinates
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec),intent(in) :: ibool
+  real(kind=CUSTOM_REAL), dimension(nglob),intent(in) :: xstore,ystore,zstore
+
+  ! GLL data values array
+  real(kind=CUSTOM_REAL), dimension(NDIM,nglob),intent(in) :: field
+
+  ! file name
+  character(len=MAX_STRING_LEN),intent(in) :: prname_file
+
+  ! local parameters
+  integer :: ispec,i
+
+  open(IOUT_VTK,file=prname_file(1:len_trim(prname_file))//'.vtk',status='unknown')
+  write(IOUT_VTK,'(a)') '# vtk DataFile Version 3.1'
+  write(IOUT_VTK,'(a)') 'material model VTK file'
+  write(IOUT_VTK,'(a)') 'ASCII'
+  write(IOUT_VTK,'(a)') 'DATASET UNSTRUCTURED_GRID'
+  write(IOUT_VTK, '(a,i12,a)') 'POINTS ', nglob, ' float'
+  do i=1,nglob
+    write(IOUT_VTK,'(3e18.6)') xstore(i),ystore(i),zstore(i)
+  enddo
+  write(IOUT_VTK,*) ""
+
+  ! note: indices for vtk start at 0
+  write(IOUT_VTK,'(a,i12,i12)') "CELLS ",nspec,nspec*9
+  do ispec=1,nspec
+    write(IOUT_VTK,'(9i12)') 8, &
+                             ibool(1,1,1,ispec)-1,ibool(NGLLX,1,1,ispec)-1, &
+                             ibool(NGLLX,NGLLY,1,ispec)-1,ibool(1,NGLLY,1,ispec)-1, &
+                             ibool(1,1,NGLLZ,ispec)-1,ibool(NGLLX,1,NGLLZ,ispec)-1, &
+                             ibool(NGLLX,NGLLY,NGLLZ,ispec)-1,ibool(1,NGLLY,NGLLZ,ispec)-1
+  enddo
+  write(IOUT_VTK,*) ""
+
+  ! type: hexahedrons
+  write(IOUT_VTK,'(a,i12)') "CELL_TYPES ",nspec
+  write(IOUT_VTK,'(6i12)') (12,ispec=1,nspec)
+  write(IOUT_VTK,*) ""
+
+  write(IOUT_VTK,'(a,i12)') "POINT_DATA ",nglob
+
+  ! single wavefield components
+  if (.true.) then
+    write(IOUT_VTK,'(a)') "SCALARS field_x float"
+    write(IOUT_VTK,'(a)') "LOOKUP_TABLE default"
+    do i = 1,nglob
+      write(IOUT_VTK,*) field(1,i)
+    enddo
+    write(IOUT_VTK,*) ""
+
+    write(IOUT_VTK,'(a)') "SCALARS field_y float"
+    write(IOUT_VTK,'(a)') "LOOKUP_TABLE default"
+    do i = 1,nglob
+      write(IOUT_VTK,*) field(2,i)
+    enddo
+    write(IOUT_VTK,*) ""
+
+    write(IOUT_VTK,'(a)') "SCALARS field_z float"
+    write(IOUT_VTK,'(a)') "LOOKUP_TABLE default"
+    do i = 1,nglob
+      write(IOUT_VTK,*) field(3,i)
+    enddo
+    write(IOUT_VTK,*) ""
+  endif
+
+  ! vector wavefield
+  if (.true.) then
+    write(IOUT_VTK,'(a)') "VECTORS field_vector float"
+    do i = 1,nglob
+      write(IOUT_VTK,*) field(1,i),field(2,i),field(3,i)
+    enddo
+    write(IOUT_VTK,*) ""
+  endif
+
+  close(IOUT_VTK)
+
+  end subroutine write_VTK_wavefield
+
+
 !-------------------------------------------------------------------------------------------------
 !
+! VTU binary formats
+!
+!-------------------------------------------------------------------------------------------------
 
   subroutine write_VTU_movie_data(ne,np,total_dat_xyz,total_dat_con,total_dat,mesh_file,var_name)
 
diff --git a/src/specfem3D/assemble_MPI_vector.f90 b/src/specfem3D/assemble_MPI_vector.f90
index 6d92558ac..0281e0637 100644
--- a/src/specfem3D/assemble_MPI_vector.f90
+++ b/src/specfem3D/assemble_MPI_vector.f90
@@ -249,7 +249,7 @@ subroutine assemble_MPI_vector_async_send(NPROC,NGLOB_AB,array_val, &
 
 ! sends data
 
-  use constants, only: NDIM,CUSTOM_REAL,itag
+  use constants, only: NDIM,CUSTOM_REAL,itag,ASSEMBLE_MPI_OFF
 
   implicit none
 
@@ -269,10 +269,13 @@ subroutine assemble_MPI_vector_async_send(NPROC,NGLOB_AB,array_val, &
   integer, dimension(num_interfaces_ext_mesh),intent(inout) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
 
   ! local parameters
-  integer ipoin,iinterface
+  integer :: ipoin,iinterface
 
   ! here we have to assemble all the contributions between partitions using MPI
 
+  ! debug: no mpi
+  if (ASSEMBLE_MPI_OFF) return
+
   ! assemble only if more than one partition
   if (NPROC > 1) then
 
@@ -315,7 +318,7 @@ subroutine assemble_MPI_vector_async_recv(NPROC,NGLOB_AB,array_val, &
 
 ! waits for send/receiver to be completed and assembles contributions
 
-  use constants, only: NDIM,CUSTOM_REAL
+  use constants, only: NDIM,CUSTOM_REAL,ASSEMBLE_MPI_OFF
   use specfem_par, only: FAULT_SIMULATION
 
   implicit none
@@ -341,6 +344,9 @@ subroutine assemble_MPI_vector_async_recv(NPROC,NGLOB_AB,array_val, &
 
   ! here we have to assemble all the contributions between partitions using MPI
 
+  ! debug: no mpi
+  if (ASSEMBLE_MPI_OFF) return
+
   ! assemble only if more than one partition
   if (NPROC == 1) return
 
diff --git a/src/specfem3D/compute_add_sources_acoustic.f90 b/src/specfem3D/compute_add_sources_acoustic.f90
index e96cdafee..c9e1de25b 100644
--- a/src/specfem3D/compute_add_sources_acoustic.f90
+++ b/src/specfem3D/compute_add_sources_acoustic.f90
@@ -27,7 +27,7 @@
 
 ! for acoustic solver
 
-  subroutine compute_add_sources_acoustic()
+  subroutine compute_add_sources_acoustic(potential_dot_dot_acoustic)
 
   use constants
   use specfem_par, only: station_name,network_name, &
@@ -35,11 +35,11 @@ subroutine compute_add_sources_acoustic()
                          hxir_adjstore,hetar_adjstore,hgammar_adjstore,source_adjoint,number_adjsources_global,nadj_rec_local, &
                          USE_BINARY_FOR_SEISMOGRAMS, &
                          ibool,NSOURCES,myrank,it,ispec_selected_source,islice_selected_source, &
-                         sourcearrays,kappastore,SIMULATION_TYPE,NSTEP, &
+                         sourcearrays,kappastore,SIMULATION_TYPE,NSTEP,NGLOB_AB, &
                          ispec_selected_rec, &
                          nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC, INVERSE_FWI_FULL_PROBLEM
 
-  use specfem_par_acoustic, only: potential_dot_dot_acoustic,ispec_is_acoustic
+  use specfem_par_acoustic, only: ispec_is_acoustic
 
   ! coupling
   use shared_parameters, only: COUPLE_WITH_INJECTION_TECHNIQUE
@@ -49,7 +49,9 @@ subroutine compute_add_sources_acoustic()
 
   implicit none
 
-! local parameters
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB), intent(inout) :: potential_dot_dot_acoustic
+
+  ! local parameters
   real(kind=CUSTOM_REAL) :: stf_used,hlagrange
   logical :: ibool_read_adj_arrays
   double precision :: stf,time_source_dble,time_t
@@ -246,18 +248,18 @@ end subroutine compute_add_sources_acoustic
 
 ! for acoustic solver for back propagation wave field
 
-  subroutine compute_add_sources_acoustic_backward()
+  subroutine compute_add_sources_acoustic_backward(b_potential_dot_dot_acoustic)
 
   use constants
   use specfem_par, only: nsources_local,tshift_src,DT,t0,USE_LDDRK,istage, &
                          ibool,NSOURCES,myrank,it,islice_selected_source,ispec_selected_source, &
-                         sourcearrays,kappastore,SIMULATION_TYPE,NSTEP
+                         sourcearrays,kappastore,SIMULATION_TYPE,NSTEP,NGLOB_AB
 
   ! undo_att
   use specfem_par, only: UNDO_ATTENUATION_AND_OR_PML,NSUBSET_ITERATIONS,NT_DUMP_ATTENUATION, &
                          iteration_on_subset,it_of_this_subset
 
-  use specfem_par_acoustic, only: ispec_is_acoustic,b_potential_dot_dot_acoustic
+  use specfem_par_acoustic, only: ispec_is_acoustic
 
   ! coupling
   use shared_parameters, only: COUPLE_WITH_INJECTION_TECHNIQUE
@@ -267,7 +269,9 @@ subroutine compute_add_sources_acoustic_backward()
 
   implicit none
 
-! local parameters
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB), intent(inout) :: b_potential_dot_dot_acoustic
+
+  ! local parameters
   real(kind=CUSTOM_REAL) :: stf_used
 
   double precision :: stf,time_source_dble,time_t
diff --git a/src/specfem3D/compute_add_sources_viscoelastic.F90 b/src/specfem3D/compute_add_sources_viscoelastic.F90
index 10655e5f5..4311eaebf 100644
--- a/src/specfem3D/compute_add_sources_viscoelastic.F90
+++ b/src/specfem3D/compute_add_sources_viscoelastic.F90
@@ -27,21 +27,25 @@
 
 ! for elastic solver
 
-  subroutine compute_add_sources_viscoelastic()
+  subroutine compute_add_sources_viscoelastic(accel)
 
   use constants
-  use specfem_par, only: station_name,network_name,num_free_surface_faces,free_surface_ispec, &
-                        free_surface_ijk,free_surface_jacobian2Dw, &
-                        nsources_local,tshift_src,dt,t0,SU_FORMAT, &
-                        USE_LDDRK,istage, &
-                        USE_BINARY_FOR_SEISMOGRAMS,NSPEC_AB,NGLOB_AB,ibool,NSOURCES,myrank,it,islice_selected_source, &
-                        ispec_selected_source,sourcearrays,SIMULATION_TYPE,NSTEP,READ_ADJSRC_ASDF, &
-                        nrec,islice_selected_rec,ispec_selected_rec, &
-                        NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
-                        hxir_adjstore,hetar_adjstore,hgammar_adjstore,source_adjoint,number_adjsources_global,nadj_rec_local, &
-                        INVERSE_FWI_FULL_PROBLEM
 
-  use specfem_par_elastic, only: accel,ispec_is_elastic
+  use shared_parameters, only: DT, &
+    SIMULATION_TYPE,NOISE_TOMOGRAPHY,INVERSE_FWI_FULL_PROBLEM, &
+    USE_LDDRK,LTS_MODE, &
+    SU_FORMAT,USE_BINARY_FOR_SEISMOGRAMS, &
+    NSTEP,READ_ADJSRC_ASDF,NTSTEP_BETWEEN_READ_ADJSRC
+
+  use specfem_par, only: station_name,network_name, &
+    NSPEC_AB,NGLOB_AB,ibool, &
+    tshift_src,t0,istage,it, &
+    num_free_surface_faces,free_surface_ispec,free_surface_ijk,free_surface_jacobian2Dw, &
+    nsources_local,NSOURCES,islice_selected_source,ispec_selected_source,sourcearrays, &
+    nrec,islice_selected_rec,ispec_selected_rec, &
+    nadj_rec_local,hxir_adjstore,hetar_adjstore,hgammar_adjstore,source_adjoint,number_adjsources_global
+
+  use specfem_par_elastic, only: ispec_is_elastic
 
   ! noise
   use specfem_par_noise, only: noise_sourcearray,irec_main_noise, &
@@ -53,9 +57,14 @@ subroutine compute_add_sources_viscoelastic()
   ! faults
   use specfem_par, only: FAULT_SIMULATION
 
+  ! LTS
+  use specfem_par_lts, only: current_lts_time, current_lts_elem
+
   implicit none
 
-! local parameters
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB), intent(inout) :: accel
+
+  ! local parameters
   real(kind=CUSTOM_REAL) :: stf_used,hlagrange
   logical :: ibool_read_adj_arrays
   double precision :: stf,time_source_dble,time_t
@@ -74,6 +83,9 @@ subroutine compute_add_sources_viscoelastic()
     !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
     !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
     time_t = dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0
+  else if (LTS_MODE) then
+    ! current local time
+    time_t = current_lts_time
   else
     time_t = dble(it-1)*DT - t0
   endif
@@ -102,6 +114,14 @@ subroutine compute_add_sources_viscoelastic()
         ispec = ispec_selected_source(isource)
 
         if (ispec_is_elastic(ispec)) then
+          ! LTS
+          if (LTS_MODE) then
+            ! checks if source lies in this p-level LTS element
+            if (.not. current_lts_elem(ispec)) cycle
+            ! debug
+            !if (it==1) print *,'debug: lts add source time',time_t,it,isource
+          endif
+
           ! current time
           time_source_dble = time_t - tshift_src(isource)
 
@@ -271,7 +291,7 @@ end subroutine compute_add_sources_viscoelastic
 !=====================================================================
 ! for elastic solver
 
-  subroutine compute_add_sources_viscoelastic_backward()
+  subroutine compute_add_sources_viscoelastic_backward(b_accel)
 
   use constants
   use specfem_par, only: num_free_surface_faces,free_surface_ispec, &
@@ -282,7 +302,7 @@ subroutine compute_add_sources_viscoelastic_backward()
                         NSOURCES,myrank,it,islice_selected_source,ispec_selected_source, &
                         sourcearrays,SIMULATION_TYPE,NSTEP,NOISE_TOMOGRAPHY
 
-  use specfem_par_elastic, only: b_accel,ispec_is_elastic
+  use specfem_par_elastic, only: ispec_is_elastic
 
   ! noise
   use specfem_par_noise, only: normal_x_noise,normal_y_noise,normal_z_noise, &
@@ -300,6 +320,8 @@ subroutine compute_add_sources_viscoelastic_backward()
 
   implicit none
 
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB), intent(inout) :: b_accel
+
   ! local parameters
   real(kind=CUSTOM_REAL) stf_used
 
@@ -447,17 +469,21 @@ end subroutine compute_add_sources_viscoelastic_backward
   subroutine compute_add_sources_viscoelastic_GPU()
 
   use constants
+
+  use shared_parameters, only: DT, &
+    SIMULATION_TYPE,NOISE_TOMOGRAPHY,INVERSE_FWI_FULL_PROBLEM, &
+    USE_LDDRK,LTS_MODE,GPU_MODE,UNDO_ATTENUATION_AND_OR_PML, &
+    SU_FORMAT,USE_BINARY_FOR_SEISMOGRAMS, &
+    NSTEP,NTSTEP_BETWEEN_READ_ADJSRC
+
   use specfem_par, only: station_name,network_name, &
-                        num_free_surface_faces, &
-                        nsources_local,tshift_src,dt,t0,SU_FORMAT, &
-                        USE_LDDRK,istage,USE_BINARY_FOR_SEISMOGRAMS, &
-                        UNDO_ATTENUATION_AND_OR_PML, &
-                        NSOURCES,it,SIMULATION_TYPE,NSTEP, &
-                        nrec,islice_selected_rec, &
-                        nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
-                        Mesh_pointer, &
-                        source_adjoint,nadj_rec_local,number_adjsources_global, &
-                        INVERSE_FWI_FULL_PROBLEM,GPU_MODE
+    num_free_surface_faces, &
+    tshift_src,t0, &
+    istage,it, &
+    NSOURCES,nsources_local,ispec_selected_source, &
+    nrec,islice_selected_rec, &
+    nadj_rec_local,source_adjoint,nadj_rec_local,number_adjsources_global, &
+    Mesh_pointer
 
   use specfem_par_noise, only: irec_main_noise,noise_surface_movie
 
@@ -467,6 +493,9 @@ subroutine compute_add_sources_viscoelastic_GPU()
   ! faults
   use specfem_par, only: FAULT_SIMULATION
 
+  ! LTS
+  use specfem_par_lts, only: current_lts_time,current_lts_elem
+
   implicit none
 
   ! local parameters
@@ -502,10 +531,19 @@ subroutine compute_add_sources_viscoelastic_GPU()
         !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
         !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
         time_t = dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0
+      else if (LTS_MODE) then
+        ! current LTS time
+        time_t = current_lts_time
       else
         time_t = dble(it-1)*DT - t0
       endif
 
+      ! LTS
+      if (LTS_MODE) then
+        ! checks if anything to do
+        if (NSOURCES == 1 .and. (.not. current_lts_elem(ispec_selected_source(1)))) return
+      endif
+
       do isource = 1,NSOURCES
         ! current time
         time_source_dble = time_t - tshift_src(isource)
@@ -513,10 +551,22 @@ subroutine compute_add_sources_viscoelastic_GPU()
         ! determines source time function value
         stf = get_stf_viscoelastic(time_source_dble,isource,it)
 
+        ! LTS
+        if (LTS_MODE) then
+          ! only sources in elements contribute which are in current p-level
+          if (.not. current_lts_elem(ispec_selected_source(isource))) stf = 0.d0
+        endif
+
         ! stores precomputed source time function factor
         stf_pre_compute(isource) = stf
       enddo
 
+      ! LTS
+      if (LTS_MODE) then
+        ! checks if anything to do
+        if (maxval(stf_pre_compute(:)) == 0.d0) return
+      endif
+
       ! only implements SIMTYPE=1 and NOISE_TOM=0
       ! write(*,*) "Fortran dt = ", dt
       ! change dt -> DT
diff --git a/src/specfem3D/compute_element_att_memory.f90 b/src/specfem3D/compute_element_att_memory.f90
index e15627213..ca0a7720a 100644
--- a/src/specfem3D/compute_element_att_memory.f90
+++ b/src/specfem3D/compute_element_att_memory.f90
@@ -25,7 +25,7 @@
 !
 !=====================================================================
 
-subroutine compute_element_att_memory_second_order_rk(ispec,alphaval,betaval,gammaval,NSPEC_AB,kappastore,mustore, &
+  subroutine compute_element_att_memory_second_order_rk(ispec,alphaval,betaval,gammaval,NSPEC_AB,kappastore,mustore, &
                           NSPEC_ATTENUATION_AB, &
                           factor_common_kappa, &
                           R_trace,epsilondev_trace,epsilondev_trace_loc, &
@@ -35,6 +35,10 @@ subroutine compute_element_att_memory_second_order_rk(ispec,alphaval,betaval,gam
 
   use constants, only: CUSTOM_REAL,N_SLS,NGLLX,NGLLY,NGLLZ
 
+  ! LTS
+  use shared_parameters, only: LTS_MODE
+  use specfem_par_lts, only: lts_type_compute_pelem
+
   implicit none
 
   integer,intent(in) :: ispec,NSPEC_AB,NSPEC_ATTENUATION_AB,NSPEC_STRAIN_ONLY
@@ -53,65 +57,109 @@ subroutine compute_element_att_memory_second_order_rk(ispec,alphaval,betaval,gam
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(in) :: epsilondev_trace_loc, epsilondev_xx_loc, &
             epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
 
-! local parameters
+  ! local parameters
   integer :: i,j,k
   real(kind=CUSTOM_REAL) :: Sn,Snp1
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: factor_loc
 
-  do k = 1,NGLLZ
-    do j = 1,NGLLY
-      do i = 1,NGLLX
+  ! LTS
+  if (LTS_MODE .and. .not. lts_type_compute_pelem) then
+    ! note: we first call the compute_forces routine for p-elements only, then in a second call for boundary-elements.
+    !       to update the memory variables, we will in the first compute them as by default, but in the second call
+    !       only add to the existing values.
+
+    ! boundary-element call
+    do k = 1,NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          ! bulk attenuation
+          ! term in trace
+          factor_loc(:) = kappastore(i,j,k,ispec) * factor_common_kappa(:,i,j,k,ispec)
+          ! adds only gamma term
+          Snp1   = epsilondev_trace_loc(i,j,k)
+          R_trace(:,i,j,k,ispec) = R_trace(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+
+          ! shear attenuation
+          ! term in xx yy zz xy xz yz
+          factor_loc(:) = mustore(i,j,k,ispec) * factor_common(:,i,j,k,ispec)
+          ! adds only gamma term
+          ! term in xx
+          Snp1   = epsilondev_xx_loc(i,j,k)
+          R_xx(:,i,j,k,ispec) = R_xx(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+          ! term in yy
+          Snp1   = epsilondev_yy_loc(i,j,k)
+          R_yy(:,i,j,k,ispec) = R_yy(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+          ! term in zz not computed since zero trace
+          ! term in xy
+          Snp1   = epsilondev_xy_loc(i,j,k)
+          R_xy(:,i,j,k,ispec) = R_xy(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+          ! term in xz
+          Snp1   = epsilondev_xz_loc(i,j,k)
+          R_xz(:,i,j,k,ispec) = R_xz(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+          ! term in yz
+          Snp1   = epsilondev_yz_loc(i,j,k)
+          R_yz(:,i,j,k,ispec) = R_yz(:,i,j,k,ispec) + factor_loc(:) * gammaval(:) * Snp1
+        enddo
+      enddo
+    enddo
 
-        ! bulk attenuation
-        ! term in trace
-        factor_loc(:) = kappastore(i,j,k,ispec)* factor_common_kappa(:,i,j,k,ispec)
+  else
+    ! default
+    do k = 1,NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
 
-        Sn   = epsilondev_trace(i,j,k,ispec)
-        Snp1   = epsilondev_trace_loc(i,j,k)
-        R_trace(:,i,j,k,ispec) = alphaval(:) * R_trace(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! bulk attenuation
+          ! term in trace
+          factor_loc(:) = kappastore(i,j,k,ispec)* factor_common_kappa(:,i,j,k,ispec)
 
-        ! shear attenuation
-        ! term in xx yy zz xy xz yz
-        factor_loc(:) = mustore(i,j,k,ispec) * factor_common(:,i,j,k,ispec)
+          Sn   = epsilondev_trace(i,j,k,ispec)
+          Snp1   = epsilondev_trace_loc(i,j,k)
+          R_trace(:,i,j,k,ispec) = alphaval(:) * R_trace(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
 
-        ! term in xx
-        Sn   = epsilondev_xx(i,j,k,ispec)
-        Snp1   = epsilondev_xx_loc(i,j,k)
-        R_xx(:,i,j,k,ispec) = alphaval(:) * R_xx(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! shear attenuation
+          ! term in xx yy zz xy xz yz
+          factor_loc(:) = mustore(i,j,k,ispec) * factor_common(:,i,j,k,ispec)
 
-        ! term in yy
-        Sn   = epsilondev_yy(i,j,k,ispec)
-        Snp1   = epsilondev_yy_loc(i,j,k)
-        R_yy(:,i,j,k,ispec) = alphaval(:) * R_yy(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! term in xx
+          Sn   = epsilondev_xx(i,j,k,ispec)
+          Snp1   = epsilondev_xx_loc(i,j,k)
+          R_xx(:,i,j,k,ispec) = alphaval(:) * R_xx(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
 
-        ! term in zz not computed since zero trace
+          ! term in yy
+          Sn   = epsilondev_yy(i,j,k,ispec)
+          Snp1   = epsilondev_yy_loc(i,j,k)
+          R_yy(:,i,j,k,ispec) = alphaval(:) * R_yy(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
 
-        ! term in xy
-        Sn   = epsilondev_xy(i,j,k,ispec)
-        Snp1   = epsilondev_xy_loc(i,j,k)
-        R_xy(:,i,j,k,ispec) = alphaval(:) * R_xy(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! term in zz not computed since zero trace
 
-        ! term in xz
-        Sn   = epsilondev_xz(i,j,k,ispec)
-        Snp1   = epsilondev_xz_loc(i,j,k)
-        R_xz(:,i,j,k,ispec) = alphaval(:) * R_xz(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! term in xy
+          Sn   = epsilondev_xy(i,j,k,ispec)
+          Snp1   = epsilondev_xy_loc(i,j,k)
+          R_xy(:,i,j,k,ispec) = alphaval(:) * R_xy(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
 
-        ! term in yz
-        Sn   = epsilondev_yz(i,j,k,ispec)
-        Snp1   = epsilondev_yz_loc(i,j,k)
-        R_yz(:,i,j,k,ispec) = alphaval(:) * R_yz(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+          ! term in xz
+          Sn   = epsilondev_xz(i,j,k,ispec)
+          Snp1   = epsilondev_xz_loc(i,j,k)
+          R_xz(:,i,j,k,ispec) = alphaval(:) * R_xz(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
 
+          ! term in yz
+          Sn   = epsilondev_yz(i,j,k,ispec)
+          Snp1   = epsilondev_yz_loc(i,j,k)
+          R_yz(:,i,j,k,ispec) = alphaval(:) * R_yz(:,i,j,k,ispec) + factor_loc(:) * (betaval(:) * Sn + gammaval(:) * Snp1)
+
+        enddo
       enddo
     enddo
-  enddo
+  endif
 
-end subroutine compute_element_att_memory_second_order_rk
+  end subroutine compute_element_att_memory_second_order_rk
 
 !
 !--------------------------------------------------------------------------------------------
 !
 
-subroutine compute_element_att_memory_lddrk(ispec,deltat,NSPEC_AB,kappastore,mustore, &
+  subroutine compute_element_att_memory_lddrk(ispec,deltat,NSPEC_AB,kappastore,mustore, &
                           NSPEC_ATTENUATION_AB, &
                           factor_common_kappa, &
                           R_trace,epsilondev_trace_loc, &
@@ -143,7 +191,7 @@ subroutine compute_element_att_memory_lddrk(ispec,deltat,NSPEC_AB,kappastore,mus
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(in) :: epsilondev_trace_loc, epsilondev_xx_loc, &
             epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
 
-! local parameters
+  ! local parameters
   integer :: i,j,k
   real(kind=CUSTOM_REAL) :: Snp1
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: factor_loc
@@ -199,4 +247,4 @@ subroutine compute_element_att_memory_lddrk(ispec,deltat,NSPEC_AB,kappastore,mus
     enddo
   enddo
 
-end subroutine compute_element_att_memory_lddrk
+  end subroutine compute_element_att_memory_lddrk
diff --git a/src/specfem3D/compute_forces_acoustic_calling_routine.f90 b/src/specfem3D/compute_forces_acoustic_calling_routine.f90
index 7c2d65bb7..198e83479 100644
--- a/src/specfem3D/compute_forces_acoustic_calling_routine.f90
+++ b/src/specfem3D/compute_forces_acoustic_calling_routine.f90
@@ -213,7 +213,7 @@ subroutine compute_forces_acoustic_forward_calling()
       !
       if (.not. GPU_MODE) then
         ! on CPU
-        call compute_add_sources_acoustic()
+        call compute_add_sources_acoustic(potential_dot_dot_acoustic)
       else
         ! on GPU
         call compute_add_sources_acoustic_GPU()
@@ -331,9 +331,11 @@ subroutine compute_forces_acoustic_forward_calling()
 !   updates the chi_dot term which requires chi_dot_dot(t+delta)
     ! corrector
     if (USE_LDDRK) then
+      ! LDDRK
       call update_potential_dot_acoustic_lddrk()
     else
-      potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:)
+      ! Newmark time scheme
+      call update_potential_dot_acoustic()
     endif
   else
     ! on GPU
@@ -514,7 +516,7 @@ subroutine compute_forces_acoustic_backward_calling()
       !
       if (.not. GPU_MODE) then
         ! on CPU
-        call compute_add_sources_acoustic_backward()
+        call compute_add_sources_acoustic_backward(b_potential_dot_dot_acoustic)
       else
         ! on GPU
         call compute_add_sources_acoustic_backward_GPU()
@@ -597,7 +599,7 @@ subroutine compute_forces_acoustic_backward_calling()
       stop 'LDDRK scheme for backward propagation not implemented yet'
     else
       ! adjoint simulations
-      b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
+      call update_potential_dot_acoustic_backward()
     endif
   else
     ! on GPU
diff --git a/src/specfem3D/compute_forces_poroelastic_calling_routine.f90 b/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
index 3f2802773..0f5ad3960 100644
--- a/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
+++ b/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
@@ -204,24 +204,24 @@ subroutine compute_forces_poroelastic_calling()
 
   ! multiplies with inverse of mass matrix (note: rmass has been inverted already)
   ! solid phase
-  accels_poroelastic(1,:) = accels_poroelastic(1,:)*rmass_solid_poroelastic(:)
-  accels_poroelastic(2,:) = accels_poroelastic(2,:)*rmass_solid_poroelastic(:)
-  accels_poroelastic(3,:) = accels_poroelastic(3,:)*rmass_solid_poroelastic(:)
+  accels_poroelastic(1,:) = accels_poroelastic(1,:) * rmass_solid_poroelastic(:)
+  accels_poroelastic(2,:) = accels_poroelastic(2,:) * rmass_solid_poroelastic(:)
+  accels_poroelastic(3,:) = accels_poroelastic(3,:) * rmass_solid_poroelastic(:)
   ! fluid phase
-  accelw_poroelastic(1,:) = accelw_poroelastic(1,:)*rmass_fluid_poroelastic(:)
-  accelw_poroelastic(2,:) = accelw_poroelastic(2,:)*rmass_fluid_poroelastic(:)
-  accelw_poroelastic(3,:) = accelw_poroelastic(3,:)*rmass_fluid_poroelastic(:)
+  accelw_poroelastic(1,:) = accelw_poroelastic(1,:) * rmass_fluid_poroelastic(:)
+  accelw_poroelastic(2,:) = accelw_poroelastic(2,:) * rmass_fluid_poroelastic(:)
+  accelw_poroelastic(3,:) = accelw_poroelastic(3,:) * rmass_fluid_poroelastic(:)
 
   ! adjoint simulations
   if (SIMULATION_TYPE == 3) then
     ! solid
-    b_accels_poroelastic(1,:) = b_accels_poroelastic(1,:)*rmass_solid_poroelastic(:)
-    b_accels_poroelastic(2,:) = b_accels_poroelastic(2,:)*rmass_solid_poroelastic(:)
-    b_accels_poroelastic(3,:) = b_accels_poroelastic(3,:)*rmass_solid_poroelastic(:)
+    b_accels_poroelastic(1,:) = b_accels_poroelastic(1,:) * rmass_solid_poroelastic(:)
+    b_accels_poroelastic(2,:) = b_accels_poroelastic(2,:) * rmass_solid_poroelastic(:)
+    b_accels_poroelastic(3,:) = b_accels_poroelastic(3,:) * rmass_solid_poroelastic(:)
     ! fluid
-    b_accelw_poroelastic(1,:) = b_accelw_poroelastic(1,:)*rmass_fluid_poroelastic(:)
-    b_accelw_poroelastic(2,:) = b_accelw_poroelastic(2,:)*rmass_fluid_poroelastic(:)
-    b_accelw_poroelastic(3,:) = b_accelw_poroelastic(3,:)*rmass_fluid_poroelastic(:)
+    b_accelw_poroelastic(1,:) = b_accelw_poroelastic(1,:) * rmass_fluid_poroelastic(:)
+    b_accelw_poroelastic(2,:) = b_accelw_poroelastic(2,:) * rmass_fluid_poroelastic(:)
+    b_accelw_poroelastic(3,:) = b_accelw_poroelastic(3,:) * rmass_fluid_poroelastic(:)
   endif
 
 ! updates velocities
@@ -240,19 +240,7 @@ subroutine compute_forces_poroelastic_calling()
 !
 ! corrector:
 !   updates the velocity term which requires a(t+delta)
-
-  ! solid phase
-  velocs_poroelastic(:,:) = velocs_poroelastic(:,:) + deltatover2*accels_poroelastic(:,:)
-  ! fluid phase
-  velocw_poroelastic(:,:) = velocw_poroelastic(:,:) + deltatover2*accelw_poroelastic(:,:)
-
-  ! adjoint simulations
-  if (SIMULATION_TYPE == 3) then
-    ! solid phase
-    b_velocs_poroelastic(:,:) = b_velocs_poroelastic(:,:) + b_deltatover2*b_accels_poroelastic(:,:)
-    ! fluid phase
-    b_velocw_poroelastic(:,:) = b_velocw_poroelastic(:,:) + b_deltatover2*b_accelw_poroelastic(:,:)
-  endif
+  call update_veloc_poroelastic_forward_and_backward()
 
   ! elastic coupling
   if (ELASTIC_SIMULATION) &
@@ -347,6 +335,7 @@ subroutine compute_continuity_disp_po_el(NSPEC_AB,NGLOB_AB,ibool, &
         velocs_poroelastic(1,iglob) = velocs_poroelastic(1,iglob) - deltatover2*accels_poroelastic(1,iglob)
         velocs_poroelastic(2,iglob) = velocs_poroelastic(2,iglob) - deltatover2*accels_poroelastic(2,iglob)
         velocs_poroelastic(3,iglob) = velocs_poroelastic(3,iglob) - deltatover2*accels_poroelastic(3,iglob)
+
         accels_poroelastic(1,iglob) = accels_poroelastic(1,iglob) / rmass_solid_poroelastic(iglob)
         accels_poroelastic(2,iglob) = accels_poroelastic(2,iglob) / rmass_solid_poroelastic(iglob)
         accels_poroelastic(3,iglob) = accels_poroelastic(3,iglob) / rmass_solid_poroelastic(iglob)
@@ -379,6 +368,7 @@ subroutine compute_continuity_disp_po_el(NSPEC_AB,NGLOB_AB,ibool, &
         accelw_poroelastic(1,iglob) = 0.d0
         accelw_poroelastic(2,iglob) = 0.d0
         accelw_poroelastic(3,iglob) = 0.d0
+
         velocw_poroelastic(1,iglob) = 0.d0
         velocw_poroelastic(2,iglob) = 0.d0
         velocw_poroelastic(3,iglob) = 0.d0
diff --git a/src/specfem3D/compute_forces_viscoelastic.F90 b/src/specfem3D/compute_forces_viscoelastic.F90
index b3c7277ab..cacedd4d4 100644
--- a/src/specfem3D/compute_forces_viscoelastic.F90
+++ b/src/specfem3D/compute_forces_viscoelastic.F90
@@ -43,10 +43,13 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
   use constants, only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM,N_SLS,ONE_THIRD,FOUR_THIRDS, &
     m1,m2
 
+  use shared_parameters, only: SIMULATION_TYPE, &
+    USE_LDDRK,LTS_MODE,SAVE_MOHO_MESH, &
+    ATTENUATION,ANISOTROPY
+
   use fault_solver_common, only: Kelvin_Voigt_eta,USE_KELVIN_VOIGT_DAMPING
 
-  use specfem_par, only: SAVE_MOHO_MESH,USE_LDDRK, &
-                         xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
+  use specfem_par, only: xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
                          gammaxstore,gammaystore,gammazstore,jacobianstore, &
                          NSPEC_AB,NGLOB_AB, &
                          hprime_xx,hprime_xxT, &
@@ -55,10 +58,9 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
                          hprimewgll_xx,hprimewgll_xxT, &
                          hprimewgll_yy,hprimewgll_zz, &
                          kappastore,mustore,ibool, &
-                         ATTENUATION, &
                          NSPEC_ATTENUATION_AB,NSPEC_ATTENUATION_AB_LDDRK, &
-                         ANISOTROPY,SIMULATION_TYPE, &
-                         NSPEC_ADJOINT,is_moho_top,is_moho_bot, &
+                         NSPEC_ADJOINT, &
+                         is_moho_top,is_moho_bot, &
                          irregular_element_number,xix_regular,jacobian_regular
 
   use specfem_par, only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
@@ -73,8 +75,12 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
                                  ispec2D_moho_top,ispec2D_moho_bot, &
                                  nspec_inner_elastic,nspec_outer_elastic,phase_ispec_inner_elastic
 
+  ! PML
   use pml_par, only: is_CPML,NSPEC_CPML
 
+  ! LTS
+  use specfem_par_lts, only: lts_type_compute_pelem,current_lts_elem,current_lts_boundary_elem
+
 #ifdef FORCE_VECTORIZATION
   use constants, only: NGLLCUBE
 #endif
@@ -199,7 +205,8 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
 !$OMP epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,epsilondev_trace,epsilon_trace_over_3, &
 !$OMP USE_LDDRK,R_xx_lddrk,R_yy_lddrk,R_xy_lddrk,R_xz_lddrk,R_yz_lddrk,R_trace_lddrk, &
 !$OMP NSPEC_AB,NSPEC_ATTENUATION_AB,NSPEC_ATTENUATION_AB_LDDRK,NSPEC_STRAIN_ONLY, &
-!$OMP SAVE_MOHO_MESH,dsdx_top,dsdx_bot,ispec2D_moho_top,ispec2D_moho_bot,is_moho_top,is_moho_bot &
+!$OMP SAVE_MOHO_MESH,dsdx_top,dsdx_bot,ispec2D_moho_top,ispec2D_moho_bot,is_moho_top,is_moho_bot, &
+!$OMP LTS_MODE,lts_type_compute_pelem,current_lts_elem,current_lts_boundary_elem &
 !$OMP ) &
 !$OMP PRIVATE( &
 !$OMP ispec_p,ispec,ispec_irreg,i,j,k,l,iglob,ispec2D, &
@@ -241,6 +248,22 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
     ispec = phase_ispec_inner_elastic(ispec_p,iphase)
 
     ! selects element contribution
+
+    ! LTS
+    if (LTS_MODE) then
+      if (lts_type_compute_pelem) then
+        ! p-element call
+        ! if not in this p-level, skip
+        if (.not. current_lts_elem(ispec)) cycle
+        ! only work on elements _strictly_ in this level (all 125 nodes in P)
+        if (current_lts_boundary_elem(ispec)) cycle
+      else
+        ! boundary-element call
+        ! only for boundary elements
+        if (.not. current_lts_boundary_elem(ispec)) cycle
+      endif
+    endif
+
     ! PML elements will be computed later
     if (is_CPML(ispec) .and. .not. backward_simulation) cycle
 
@@ -788,13 +811,31 @@ subroutine compute_forces_viscoelastic(iphase,deltat, &
 
     ! save deviatoric strain for Runge-Kutta scheme
     if (COMPUTE_AND_STORE_STRAIN) then
-      if (ATTENUATION .and. .not. is_CPML(ispec)) epsilondev_trace(:,:,:,ispec) = epsilondev_trace_loc(:,:,:)
-      ! already stored epsilon_trace_over_3(:,:,:,ispec) above for SIMULATION_TYPE == 3
-      epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-      epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-      epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-      epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-      epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+      ! LTS
+      if (LTS_MODE .and. .not. lts_type_compute_pelem) then
+        ! note: we first call this compute_forces routine for p-elements only, then in a second call for boundary-elements.
+        !       to store strains, we will in the first call re-set the strain as by default, but in the second call
+        !       only add to the existing values.
+        ! boundary-element call
+        ! adds contributions
+        if (ATTENUATION .and. .not. is_CPML(ispec)) &
+          epsilondev_trace(:,:,:,ispec) = epsilondev_trace(:,:,:,ispec) + epsilondev_trace_loc(:,:,:)
+        ! already stored epsilon_trace_over_3(:,:,:,ispec) above for SIMULATION_TYPE == 3
+        epsilondev_xx(:,:,:,ispec) = epsilondev_xx(:,:,:,ispec) + epsilondev_xx_loc(:,:,:)
+        epsilondev_yy(:,:,:,ispec) = epsilondev_yy(:,:,:,ispec) + epsilondev_yy_loc(:,:,:)
+        epsilondev_xy(:,:,:,ispec) = epsilondev_xy(:,:,:,ispec) + epsilondev_xy_loc(:,:,:)
+        epsilondev_xz(:,:,:,ispec) = epsilondev_xz(:,:,:,ispec) + epsilondev_xz_loc(:,:,:)
+        epsilondev_yz(:,:,:,ispec) = epsilondev_yz(:,:,:,ispec) + epsilondev_yz_loc(:,:,:)
+      else
+        ! default
+        if (ATTENUATION .and. .not. is_CPML(ispec)) epsilondev_trace(:,:,:,ispec) = epsilondev_trace_loc(:,:,:)
+        ! already stored epsilon_trace_over_3(:,:,:,ispec) above for SIMULATION_TYPE == 3
+        epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+        epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+        epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+        epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+        epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+      endif
     endif
 
   enddo  ! spectral element loop
diff --git a/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 b/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
index 1d7af5d1c..674bd7c83 100644
--- a/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
+++ b/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
@@ -261,7 +261,7 @@ subroutine compute_forces_viscoelastic_calling()
       ! adds source term (single-force/moment-tensor solution)
       if (.not. GPU_MODE) then
         ! on CPU
-        call compute_add_sources_viscoelastic()
+        call compute_add_sources_viscoelastic(accel)
       else
         ! on GPU
         call compute_add_sources_viscoelastic_GPU()
@@ -362,9 +362,9 @@ subroutine compute_forces_viscoelastic_calling()
   if (.not. GPU_MODE) then
     ! on CPU
     do iglob = 1,NGLOB_AB
-      accel(1,iglob) = accel(1,iglob)*rmassx(iglob)
-      accel(2,iglob) = accel(2,iglob)*rmassy(iglob)
-      accel(3,iglob) = accel(3,iglob)*rmassz(iglob)
+      accel(1,iglob) = accel(1,iglob) * rmassx(iglob)
+      accel(2,iglob) = accel(2,iglob) * rmassy(iglob)
+      accel(3,iglob) = accel(3,iglob) * rmassz(iglob)
     enddo
   else
     ! on GPU
@@ -435,7 +435,7 @@ subroutine compute_forces_viscoelastic_calling()
     if (USE_LDDRK) then
       call update_veloc_elastic_lddrk()
     else
-      veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+      call update_veloc_elastic()
     endif
   else
     ! on GPU
@@ -591,7 +591,7 @@ subroutine compute_forces_viscoelastic_backward_calling()
       !       to avoid calling the same routine twice and to check if the source element is an inner/outer element
       if (.not. GPU_MODE) then
         ! on CPU
-        call compute_add_sources_viscoelastic_backward()
+        call compute_add_sources_viscoelastic_backward(b_accel)
       else
         ! on GPU
         call compute_add_sources_viscoelastic_backward_GPU()
@@ -651,9 +651,9 @@ subroutine compute_forces_viscoelastic_backward_calling()
   if (.not. GPU_MODE) then
     ! on CPU
     ! adjoint simulations
-    b_accel(1,:) = b_accel(1,:)*rmassx(:)
-    b_accel(2,:) = b_accel(2,:)*rmassy(:)
-    b_accel(3,:) = b_accel(3,:)*rmassz(:)
+    b_accel(1,:) = b_accel(1,:) * rmassx(:)
+    b_accel(2,:) = b_accel(2,:) * rmassy(:)
+    b_accel(3,:) = b_accel(3,:) * rmassz(:)
   else
     ! on GPU
     call kernel_3_a_cuda(Mesh_pointer,deltatover2,b_deltatover2,APPROXIMATE_OCEAN_LOAD,3) ! 3 == backward
@@ -695,8 +695,12 @@ subroutine compute_forces_viscoelastic_backward_calling()
 !   updates the velocity term which requires a(t+delta)
   if (.not. GPU_MODE) then
     ! on CPU
-    ! adjoint simulations
-    b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+    if (USE_LDDRK) then
+      stop 'USE_LDDRK not implemented for backward viscoelastic simulations'
+    else
+      ! adjoint simulations
+      call update_veloc_elastic_backward()
+    endif
   else
     ! on GPU
     ! only call in case of ocean load, otherwise already done in kernel 3 a
diff --git a/src/specfem3D/fault_solver_common.f90 b/src/specfem3D/fault_solver_common.f90
index bf9831634..6d9e9e94c 100644
--- a/src/specfem3D/fault_solver_common.f90
+++ b/src/specfem3D/fault_solver_common.f90
@@ -105,6 +105,11 @@ module fault_solver_common
     real(kind=CUSTOM_REAL) :: kin_dt
     integer :: kin_it
     real(kind=CUSTOM_REAL), dimension(:,:), pointer :: v_kin_t1,v_kin_t2
+
+    ! LTS
+    logical,dimension(:),pointer :: fault_use_p_level => null()
+    integer,dimension(:,:),pointer :: fault_ibool_local_p => null()
+    integer,dimension(:),pointer :: fault_num_p_local => null()
   end type bc_dynandkinflt_type
 
   ! fault array
@@ -126,17 +131,23 @@ module fault_solver_common
 
 !---------------------------------------------------------------------
 
-  subroutine initialize_fault (bc,IIN_BIN)
+  subroutine initialize_fault(bc,IIN_BIN)
+
+  use constants, only: PARALLEL_FAULT,NDIM,NGLLSQUARE,myrank
 
-  use constants, only: PARALLEL_FAULT,NDIM,NGLLSQUARE
+  use shared_parameters, only: NPROC,DT,LTS_MODE
 
-  use specfem_par, only: NPROC,DT,NGLOB_AB, &
+  use specfem_par, only: NGLOB_AB, &
     num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
     nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
     my_neighbors_ext_mesh
 
   use specfem_par_elastic, only: rmassx
 
+  ! LTS
+  use specfem_par_lts, only: iglob_p_refine,num_p_level,p_level,p_lookup
+  use specfem_par, only: xstore,ystore,zstore
+
   implicit none
 !! DK DK use type(bc_dynandkinflt_type) instead of class(fault_type) for compatibility with some current compilers
   type(bc_dynandkinflt_type), intent(inout) :: bc
@@ -146,12 +157,34 @@ subroutine initialize_fault (bc,IIN_BIN)
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: jacobian2Dw
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: normal
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: nxyz
+  real(kind=CUSTOM_REAL) :: deltat
   integer, dimension(:,:), allocatable :: ibool1
   integer :: ij,k,e,ier
+  ! LTS
+  integer :: i,p,ilevel
+  integer :: p_min,p_max
+  integer :: p_min_glob,p_max_glob
+
+  ! explicit initialization
+  bc%nspec = 0
+  bc%nglob = 0
 
   ! number of elements and global points on fault
   read(IIN_BIN) bc%nspec,bc%nglob
 
+  ! makes sure nglob value is also zero in case no fault elements are in this partition
+  if (bc%nspec <= 0) then
+    ! checks
+    if (bc%nglob > 0) print *,'Warning: rank',myrank,'has no fault elements but a non-zero nglob:',bc%nspec,bc%nglob
+    ! re-sets nspec/nglob
+    bc%nspec = 0
+    bc%nglob = 0
+  endif
+
+  ! time step on fault
+  ! sets simulation time step size
+  bc%dt = real(DT,kind=CUSTOM_REAL)
+
   if (bc%nspec > 0) then
     ! checks nglob
     if (bc%nglob < 1) then
@@ -221,9 +254,6 @@ subroutine initialize_fault (bc,IIN_BIN)
 
     ! done reading faults_db.bin file, no more records in read(IIN_BIN).. from here on
 
-    ! sets simulation time step size
-    bc%dt = real(DT,kind=CUSTOM_REAL)
-
     ! normal vector
     allocate(nxyz(NDIM,bc%nglob),stat=ier)
     if (ier /= 0) call exit_MPI_without_rank('error allocating array 2170')
@@ -234,6 +264,7 @@ subroutine initialize_fault (bc,IIN_BIN)
     do e = 1,bc%nspec
       do ij = 1,NGLLSQUARE
         k = ibool1(ij,e)
+        if (k < 1 .or. k > bc%nglob) call exit_MPI(myrank,'Error: fault ibool1 value out of bounds')
         nxyz(:,k) = nxyz(:,k) + normal(:,ij,e)
         bc%B(k) = bc%B(k) + jacobian2Dw(ij,e)
       enddo
@@ -250,6 +281,12 @@ subroutine initialize_fault (bc,IIN_BIN)
              bc%R(NDIM,NDIM,1))
   endif
 
+  ! checks fault iglob values
+  do i = 1,bc%nglob
+    if (bc%ibulk1(i) < 1 .or. bc%ibulk1(i) > NGLOB_AB) call exit_MPI(myrank,'Error ibulk1 has invalid iglob value')
+    if (bc%ibulk2(i) < 1 .or. bc%ibulk2(i) > NGLOB_AB) call exit_MPI(myrank,'Error ibulk2 has invalid iglob value')
+  enddo
+
   ! fault parallelization across multiple MPI processes
   if (PARALLEL_FAULT) then
     ! assembles fault boundary matrix B
@@ -277,6 +314,74 @@ subroutine initialize_fault (bc,IIN_BIN)
     if (bc%nspec > 0) nxyz(:,:) = tmp_vec(:,bc%ibulk1(:))
   endif
 
+  ! LTS
+  if (LTS_MODE) then
+    ! checks
+    if (size(p_level(:)) /= num_p_level) call exit_MPI(myrank,'Error num_p_level in initialize_fault')
+
+    ! uses flags to determine whether there are p-level nodes on fault surfaces
+    allocate(bc%fault_use_p_level(num_p_level), &
+             bc%fault_num_p_local(num_p_level), &
+             bc%fault_ibool_local_p(bc%nglob,num_p_level),stat=ier)
+    if (ier /= 0) stop 'Error allocating arrays fault_ibool_local_p,..'
+
+    bc%fault_use_p_level(:) = .false.
+    bc%fault_num_p_local(:) = 0
+    bc%fault_ibool_local_p(:,:) = 0
+
+    p_min = 1000000000
+    p_max = 0
+    do i = 1,bc%nglob
+      ! p-value on 1. split node
+      p = iglob_p_refine(bc%ibulk1(i))
+
+      ! opposite split nodes must lie in same p-level region
+      ! LTS check
+      if (p /= iglob_p_refine(bc%ibulk2(i))) then
+        print *,'Error fault p-values: rank ',myrank
+        print *,'  fault ibulk1: iglob = ',bc%ibulk1(i),'p = ',iglob_p_refine(bc%ibulk1(i))
+        print *,'                x/y/z: ',xstore(bc%ibulk1(i)),ystore(bc%ibulk1(i)),zstore(bc%ibulk1(i))
+        print *,'  fault ibulk2: iglob = ',bc%ibulk2(i),'p = ',iglob_p_refine(bc%ibulk2(i))
+        print *,'                x/y/z: ',xstore(bc%ibulk2(i)),ystore(bc%ibulk2(i)),zstore(bc%ibulk2(i))
+        print *,'fault split nodes MUST belong to the same p-level, please check your mesh...'
+        call exit_MPI(myrank,'error fault p-values on split nodes')
+      endif
+      ! statistics
+      if (p < p_min) p_min = p
+      if (p > p_max) p_max = p
+
+      ilevel = p_lookup(p)
+
+      ! sets flag if fault has p-nodes
+      bc%fault_use_p_level(ilevel) = .true.
+
+      ! counts number of p-nodes
+      bc%fault_num_p_local(ilevel) = bc%fault_num_p_local(ilevel) + 1
+
+      ! sets up local array with p-nodes
+      bc%fault_ibool_local_p(bc%fault_num_p_local(ilevel),ilevel) = i
+    enddo
+
+    ! gather info
+    call min_all_all_i(p_min,p_min_glob)
+    call max_all_all_i(p_max,p_max_glob)
+    p_min = p_min_glob
+    p_max = p_max_glob
+
+    ! LTS sets fault time step to minimum LTS time step for fault nodes
+    if (p_max_glob <= 0) then
+      call exit_MPI(myrank,'Error p_max_glob invalid')
+    endif
+
+    ! user output
+    if (myrank == 0) then
+      write(IMAIN,*) "    LTS fault p-values min/max          :",p_min_glob,p_max_glob
+      write(IMAIN,*) "    LTS minimum local time step on fault:",bc%dt / p_max_glob
+      call flush_IMAIN()
+    endif
+    call synchronize_all()
+  endif ! LTS_MODE
+
   if (bc%nspec > 0) then
     ! normalizes fault normal vector
     call normalize_3d_vector(nxyz)
@@ -284,7 +389,7 @@ subroutine initialize_fault (bc,IIN_BIN)
     call compute_R(bc%R,bc%nglob,nxyz)
 
     ! inverse mass matrix
-    !SURENDRA : WARNING! Assuming rmassx=rmassy=rmassz
+    ! WARNING: Assuming rmassx=rmassy=rmassz
     ! Needed in dA_Free = -K2*d2/M2 + K1*d1/M1
     bc%invM1(:) = rmassx(bc%ibulk1(:))
     bc%invM2(:) = rmassx(bc%ibulk2(:))
@@ -294,8 +399,19 @@ subroutine initialize_fault (bc,IIN_BIN)
     !   Z = 1/( B1/M1 + B2/M2 ) / (0.5*dt)
     ! T_stick = Z*Vfree traction as if the fault was stuck (no displ discontinuity)
     ! NOTE: same Bi on both sides, see note above
-
-    bc%Z(:) = 1.0_CUSTOM_REAL/(0.5_CUSTOM_REAL * bc%dt * bc%B(:) *( bc%invM1(:) + bc%invM2(:) ))
+    if (LTS_MODE) then
+      do i = 1,bc%nglob
+        ! p-value on 1. split node (should be the same as on opposite node)
+        p = iglob_p_refine(bc%ibulk1(i))
+        if (p <= 0) call exit_MPI(myrank,'Error p-value invalid in fault impedance setup')
+        ! local time step
+        deltat = bc%dt / p
+        bc%Z(i) = 1.0_CUSTOM_REAL/(0.5_CUSTOM_REAL * deltat * bc%B(i) *( bc%invM1(i) + bc%invM2(i) ))
+      enddo
+    else
+      deltat = bc%dt
+      bc%Z(:) = 1.0_CUSTOM_REAL/(0.5_CUSTOM_REAL * deltat * bc%B(:) *( bc%invM1(:) + bc%invM2(:) ))
+    endif
 
     ! WARNING: In non-split nodes at fault edges M is assembled across the fault.
     ! hence invM1+invM2=2/(M1+M2) instead of 1/M1+1/M2
@@ -408,9 +524,9 @@ function get_weighted_jump (bc,f) result(da)
   real(kind=CUSTOM_REAL) :: da(3,bc%nglob)
 
   ! difference between side 2 and side 1 of fault nodes. M-1 * F
-  da(1,:) = bc%invM2(:)*f(1,bc%ibulk2(:)) - bc%invM1(:)*f(1,bc%ibulk1(:))
-  da(2,:) = bc%invM2(:)*f(2,bc%ibulk2(:)) - bc%invM1(:)*f(2,bc%ibulk1(:))
-  da(3,:) = bc%invM2(:)*f(3,bc%ibulk2(:)) - bc%invM1(:)*f(3,bc%ibulk1(:))
+  da(1,:) = bc%invM2(:) * f(1,bc%ibulk2(:)) - bc%invM1(:) * f(1,bc%ibulk1(:))
+  da(2,:) = bc%invM2(:) * f(2,bc%ibulk2(:)) - bc%invM1(:) * f(2,bc%ibulk1(:))
+  da(3,:) = bc%invM2(:) * f(3,bc%ibulk2(:)) - bc%invM1(:) * f(3,bc%ibulk1(:))
 
   ! NOTE: In non-split nodes at fault edges M and f are assembled across the fault.
   ! Hence, f1=f2, invM1=invM2=1/(M1+M2) instead of invMi=1/Mi, and da=0.
@@ -455,16 +571,42 @@ subroutine add_BT(bc,MxA,T)
   real(kind=CUSTOM_REAL), intent(inout) :: MxA(:,:)
   real(kind=CUSTOM_REAL), dimension(3,bc%nglob), intent(in) :: T
 
-  MxA(1,bc%ibulk1(:)) = MxA(1,bc%ibulk1(:)) + bc%B(:)*T(1,:)
-  MxA(2,bc%ibulk1(:)) = MxA(2,bc%ibulk1(:)) + bc%B(:)*T(2,:)
-  MxA(3,bc%ibulk1(:)) = MxA(3,bc%ibulk1(:)) + bc%B(:)*T(3,:)
+  MxA(1,bc%ibulk1(:)) = MxA(1,bc%ibulk1(:)) + bc%B(:) * T(1,:)
+  MxA(2,bc%ibulk1(:)) = MxA(2,bc%ibulk1(:)) + bc%B(:) * T(2,:)
+  MxA(3,bc%ibulk1(:)) = MxA(3,bc%ibulk1(:)) + bc%B(:) * T(3,:)
 
-  MxA(1,bc%ibulk2(:)) = MxA(1,bc%ibulk2(:)) - bc%B(:)*T(1,:)
-  MxA(2,bc%ibulk2(:)) = MxA(2,bc%ibulk2(:)) - bc%B(:)*T(2,:)
-  MxA(3,bc%ibulk2(:)) = MxA(3,bc%ibulk2(:)) - bc%B(:)*T(3,:)
+  MxA(1,bc%ibulk2(:)) = MxA(1,bc%ibulk2(:)) - bc%B(:) * T(1,:)
+  MxA(2,bc%ibulk2(:)) = MxA(2,bc%ibulk2(:)) - bc%B(:) * T(2,:)
+  MxA(3,bc%ibulk2(:)) = MxA(3,bc%ibulk2(:)) - bc%B(:) * T(3,:)
 
   end subroutine add_BT
 
+!----------------------------------------------------------------------
+
+  subroutine add_BT_single(bc,MxA,T,i)
+
+! LTS version for a single fault point
+
+  implicit none
+  type(bc_dynandkinflt_type), intent(in) :: bc
+  real(kind=CUSTOM_REAL), intent(inout) :: MxA(:,:)
+  real(kind=CUSTOM_REAL), dimension(3,bc%nglob), intent(in) :: T
+  integer, intent(in) :: i
+
+  ! local parameters
+  integer :: iglob1,iglob2
+
+  iglob1 = bc%ibulk1(i)
+  MxA(1,iglob1) = MxA(1,iglob1) + bc%B(i) * T(1,i)
+  MxA(2,iglob1) = MxA(2,iglob1) + bc%B(i) * T(2,i)
+  MxA(3,iglob1) = MxA(3,iglob1) + bc%B(i) * T(3,i)
+
+  iglob2 = bc%ibulk2(i)
+  MxA(1,iglob2) = MxA(1,iglob2) - bc%B(i) * T(1,i)
+  MxA(2,iglob2) = MxA(2,iglob2) - bc%B(i) * T(2,i)
+  MxA(3,iglob2) = MxA(3,iglob2) - bc%B(i) * T(3,i)
+
+end subroutine add_BT_single
 
 !===============================================================
 ! dataT outputs
@@ -487,17 +629,18 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
   integer, dimension(:,:), allocatable :: iglob_all
   integer, dimension(:), allocatable :: iproc,iglob_tmp,glob_indx
   real(kind=CUSTOM_REAL) :: xtarget,ytarget,ztarget,dist,distkeep
-  integer :: i, iglob , IIN, ier, jflt, np, k
+  integer :: i, iglob, ier, jflt, np, k
   character(len=MAX_STRING_LEN) :: tmpname
   character(len=MAX_STRING_LEN), dimension(:), allocatable :: name_tmp
   integer :: ipoin, ipoin_local, npoin_local
 
+  ! WARNING: not safe, should check that unit is not aleady opened
+  integer, parameter :: IIN_FAULT_STATIONS = 251
+
   !  1. read fault output coordinates from user file,
   !  2. define iglob: the fault global index of the node nearest to user
   !     requested coordinate
 
-  IIN = 251 ! WARNING: not safe, should check that unit is not aleady opened
-
   ! initializes dataT
   dataT%ndat = ndat
   dataT%nt = NT
@@ -505,7 +648,8 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
   dataT%element_size = 0.0
 
   ! count the number of output points on the current fault (#iflt)
-  open(IIN,file=IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS',status='old',action='read',iostat=ier)
+  open(IIN_FAULT_STATIONS,file=IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS', &
+       status='old',action='read',iostat=ier)
   if (ier /= 0) then
     print *,'Error opening file ',IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS'
     if (myrank == 0) write(IMAIN,*) 'Fatal error opening FAULT_STATIONS file. Abort.'
@@ -513,18 +657,18 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
   endif
 
   ! number of points
-  read(IIN,*) np
+  read(IIN_FAULT_STATIONS,*) np
 
   ! counts fault points on specified fault
   dataT%npoin = 0
   do i = 1,np
     ! format : #x  #y  #z  #name  #fault_id
     ! example: -4500.0  0.0  0.0  faultst-045dp000 1
-    read(IIN,*) xtarget,ytarget,ztarget,tmpname,jflt
+    read(IIN_FAULT_STATIONS,*) xtarget,ytarget,ztarget,tmpname,jflt
     ! only points on this fault
     if (jflt == iflt) dataT%npoin = dataT%npoin + 1
   enddo
-  close(IIN)
+  close(IIN_FAULT_STATIONS)
 
   ! allocates fault point arrays
   if (dataT%npoin > 0) then
@@ -552,26 +696,29 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
   if (dataT%npoin == 0) return
 
   ! opens in fault stations
-  open(IIN,file=IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS',status='old',action='read',iostat=ier)
+  open(IIN_FAULT_STATIONS,file=IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS', &
+       status='old',action='read',iostat=ier)
   if (ier /= 0) then
     print *,'Error opening file ',IN_DATA_FILES(1:len_trim(IN_DATA_FILES))//'FAULT_STATIONS'
     stop 'Error opening file FAULT_STATIONS'
   endif
 
   ! number of points
-  read(IIN,*) np
+  read(IIN_FAULT_STATIONS,*) np
 
   ! reads in fault point positions
   k = 0
   do i = 1,np
     ! format : #x  #y  #z  #name  #fault_id
     ! example: -4500.0  0.0  0.0  faultst-045dp000 1
-    read(IIN,*) xtarget,ytarget,ztarget,tmpname,jflt
+    read(IIN_FAULT_STATIONS,*) xtarget,ytarget,ztarget,tmpname,jflt
 
     ! only points on this fault
     if (jflt /= iflt) cycle
 
     k = k+1
+    if (k < 1 .or. k > dataT%npoin) stop 'Error bounds in init_dataT'
+
     dataT%name(k) = tmpname
 
     ! search nearest node
@@ -588,7 +735,7 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
     enddo
     dist_loc(k) = distkeep
   enddo
-  close(IIN)
+  close(IIN_FAULT_STATIONS)
 
   if (PARALLEL_FAULT) then
     ! For each output point, find the processor that contains the nearest node
@@ -614,7 +761,7 @@ subroutine init_dataT(dataT,coord,nglob,NT,DT,ndat,iflt)
     call bcast_all_i(dataT%iglob,dataT%npoin)
 
     ! Number of output points contained in the current processor
-    npoin_local = count( iproc == myrank )
+    npoin_local = count( iproc(:) == myrank )
 
     if (npoin_local > 0) then
       ! Make a list of output points contained in the current processor
@@ -736,6 +883,9 @@ subroutine SCEC_write_dataT(dataT)
   integer, dimension(8) :: time_values
   integer, parameter :: IOUT_SC = 121 !WARNING: not very robust. Could instead look for an available ID
 
+  ! checks if anything to do
+  if (dataT%npoin == 0) return
+
   call date_and_time(VALUES=time_values)
 
   ! element size in m
@@ -782,6 +932,33 @@ subroutine SCEC_write_dataT(dataT)
 
   end subroutine SCEC_write_dataT
 
+!------------------------------------------------------------------------
+
+  subroutine free_fault_memory(bc)
+
+  ! LTS
+  use specfem_par, only: LTS_MODE
+
+  implicit none
+  type(bc_dynandkinflt_type), intent(inout) :: bc
+
+  if (bc%nspec > 0) then
+    deallocate(bc%ibulk1, &
+               bc%ibulk2, &
+               bc%R, &
+               bc%invM1, &
+               bc%invM2, &
+               bc%B, &
+               bc%Z)
+  endif
+  deallocate(bc%coord)
+
+  if (LTS_MODE) then
+    deallocate(bc%fault_use_p_level,bc%fault_num_p_local,bc%fault_ibool_local_p)
+  endif
+
+  end subroutine free_fault_memory
+
 !---------------------------------------------------------------------
 
   subroutine fault_check_mesh_resolution()
@@ -908,9 +1085,14 @@ subroutine fault_check_mesh_resolution()
   if (myrank == 0) then
     ! average distance between GLL points within this element
     avg_distance = elemsize_max_glob / ( NGLLX - 1)  ! since NGLLX = NGLLY = NGLLZ
+
     ! rough estimate of time step
     dt_suggested = COURANT_SUGGESTED * avg_distance / vpmax_glob
 
+    ! cut at a significant number of digits (2 digits)
+    ! example: 0.0734815 -> lpow = (2 - (-1) = 3 -> 0.0730
+    call get_timestep_limit_significant_digit(dt_suggested)
+
     ! critical time step estimation
     ! see: utils/critical_timestep.m
     NGLL = max(NGLLX,NGLLY,NGLLZ)
diff --git a/src/specfem3D/fault_solver_dynamic.f90 b/src/specfem3D/fault_solver_dynamic.f90
index 79e10d3a5..1e5c8c7ad 100644
--- a/src/specfem3D/fault_solver_dynamic.f90
+++ b/src/specfem3D/fault_solver_dynamic.f90
@@ -78,12 +78,11 @@ module fault_solver_dynamic
   integer, save :: NT_RECORD_LENGTH = 500     ! default 500, i.e., every 500 time step synchronizes dataT%dat records.
                                               ! will adapt in case needed to NSNAP number of time steps for snapshots
 
-  public :: BC_DYNFLT_init, BC_DYNFLT_set3d_all, &
+  public :: BC_DYNFLT_init, BC_DYNFLT_free, BC_DYNFLT_set3d_all, &
             SIMULATION_TYPE_DYN, NT_RECORD_LENGTH, &
             fault_transfer_data_GPU, fault_rsf_swf_init_GPU, fault_output_synchronize_GPU, &
             fault_check_mesh_resolution
 
-
 contains
 
 !=====================================================================
@@ -94,7 +93,7 @@ module fault_solver_dynamic
   subroutine BC_DYNFLT_init(prname)
 
   use specfem_par, only: nt => NSTEP,DTglobal => DT
-  use constants, only: IMAIN,myrank,IIN_PAR,IIN_BIN
+  use constants, only: IMAIN,myrank,IIN_PAR,IIN_BIN,PARALLEL_FAULT
 
   implicit none
 
@@ -168,6 +167,12 @@ subroutine BC_DYNFLT_init(prname)
   if (myrank == 0) then
     write(IMAIN,*) '  incorporating dynamic rupture simulation'
     write(IMAIN,*) '  found ', nbfaults, ' fault(s) in file DATA/Par_file_faults'
+    if (PARALLEL_FAULT) then
+      write(IMAIN,*) "  using parallel partitions"
+    else
+      write(IMAIN,*) "  using single   partition"
+    endif
+    call flush_IMAIN()
   endif
 
   ! sets dynamic rupture flag
@@ -183,6 +188,13 @@ subroutine BC_DYNFLT_init(prname)
   read(IIN_PAR,*) V_HEALING
   read(IIN_PAR,*) V_RUPT
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "  number of iterations per output  : ",NTOUT
+    write(IMAIN,*) "  number of iterations per snapshot: ",NSNAP
+    call flush_IMAIN()
+  endif
+
   ! from binary fault file
   read(IIN_BIN) nbfaults_bin ! should be the same as in IIN_PAR
 
@@ -222,6 +234,12 @@ subroutine BC_DYNFLT_init(prname)
   close(IIN_BIN)
   close(IIN_PAR)
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "  reading in Kelvin_voigt_eta arrays"
+    call flush_IMAIN()
+  endif
+
   ! reads Kelvin-Voigt parameters
   filename = prname(1:len_trim(prname))//'Kelvin_voigt_eta.bin'
   open(unit=IIN_BIN,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
@@ -399,14 +417,29 @@ subroutine init_one_fault(bc,IIN_BIN,IIN_PAR,dt_real,NT,iflt,myrank)
 
   real(kind=CUSTOM_REAL) :: S1,S2,S3,Sigma(6)
   integer :: n1,n2,n3,ier,recordlength
+  integer :: nspec_all,nglob_all
   logical :: LOAD_STRESSDROP = .false.
 
   NAMELIST / INIT_STRESS / S1,S2,S3,n1,n2,n3
   NAMELIST /STRESS_TENSOR / Sigma
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "  initializing fault",iflt,":"
+    call flush_IMAIN()
+  endif
+
   ! reads in fault_db binary file and initializes fault arrays
   call initialize_fault(bc,IIN_BIN)
 
+  ! user output
+  call sum_all_i(bc%nspec,nspec_all)
+  call sum_all_i(bc%nglob,nglob_all)
+  if (myrank == 0) then
+    write(IMAIN,*) "    total number of elements = ",nspec_all," global points = ",nglob_all
+    call flush_IMAIN()
+  endif
+
   ! sets up initial fault state
   if (bc%nspec > 0) then
     allocate(bc%T(3,bc%nglob),stat=ier)
@@ -528,6 +561,12 @@ subroutine init_one_fault(bc,IIN_BIN,IIN_PAR,dt_real,NT,iflt,myrank)
 
   call init_dataXZ(bc%dataXZ,bc)
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "    writing out fault snapshot: at t=0"
+    call flush_IMAIN()
+  endif
+
   ! output a fault snapshot at t=0
   if (PARALLEL_FAULT) then
     call gather_dataXZ(bc)
@@ -894,7 +933,10 @@ end function heaviside
 ! NOTE: On non-split nodes at fault edges, dD=dV=dA=0
 ! and the net contribution of B*T is =0
 !
-  subroutine bc_dynflt_set3d_all(F,V,D)
+  subroutine bc_dynflt_set3d_all(F,V,D,p_value,ilevel)
+
+  ! LTS
+  use specfem_par, only: LTS_MODE
 
   implicit none
 
@@ -905,6 +947,8 @@ subroutine bc_dynflt_set3d_all(F,V,D)
 
   real(kind=CUSTOM_REAL), dimension(:,:), intent(in) :: V,D
   real(kind=CUSTOM_REAL), dimension(:,:), intent(inout) :: F
+  ! LTS
+  integer,intent(in),optional :: p_value,ilevel
 
   ! local parameters
   integer :: iflt
@@ -916,7 +960,11 @@ subroutine bc_dynflt_set3d_all(F,V,D)
   do iflt = 1,Nfaults
     ! note: this routine should be called by all processes, regardless if they contain no fault elements,
     !       for managing MPI calls and file outputs
-    call BC_DYNFLT_set3d(faults(iflt),F,V,D,iflt)
+    if (LTS_MODE) then
+      call BC_DYNFLT_set3d_lts(faults(iflt),F,V,D,iflt,p_value,ilevel)
+    else
+      call BC_DYNFLT_set3d(faults(iflt),F,V,D,iflt)
+    endif
   enddo
 
   end subroutine bc_dynflt_set3d_all
@@ -944,8 +992,9 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
   real(kind=CUSTOM_REAL), dimension(bc%nglob) :: Tstick,Tnew
   real(kind=CUSTOM_REAL), dimension(bc%nglob) :: strength, theta_old, theta_new, dc
   real(kind=CUSTOM_REAL), dimension(bc%nglob) :: Vf_old, Vf_new, TxExt, tmp_Vf
-  real(kind=CUSTOM_REAL) :: half_dt,TLoad,DTau0,GLoad,timeval
-  integer :: i,ipoin,iglob
+  real(kind=CUSTOM_REAL) :: TLoad,DTau0,GLoad
+  real(kind=CUSTOM_REAL) :: deltat,half_dt,timeval
+  integer :: i,ipoin,iglob,ipass
   real(kind=CUSTOM_REAL) :: nuc_x, nuc_y, nuc_z, nuc_r, nuc_t0, nuc_v, dist, tw_r, coh_size
 
 ! note: this implementation follows the description in:
@@ -965,10 +1014,18 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
 !          fault opening, fault healing, time-weakening, smooth loading,
 !          and TPV16 benchmark specific friction handling
 
+  ! global time step
+  deltat = bc%dt
+
+  ! half time step
+  half_dt = 0.5_CUSTOM_REAL * deltat
+
+  ! time will never be zero. it starts from 1
+  timeval = it * deltat
+
   ! for parallel faults
   if (bc%nspec > 0) then
 
-    half_dt = 0.5_CUSTOM_REAL * bc%dt
     Vf_old(:) = sqrt(bc%V(1,:)*bc%V(1,:) + bc%V(2,:)*bc%V(2,:))
 
     ! get predicted values
@@ -998,7 +1055,9 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
 
     ! Solve for normal stress (negative is compressive)
     ! Opening implies free stress
-    if (bc%allow_opening) T(3,:) = min(T(3,:),0.0_CUSTOM_REAL)
+    if (bc%allow_opening) then
+      T(3,:) = min(T(3,:),0.0_CUSTOM_REAL)
+    endif
 
     ! smooth loading within nucleation patch
     !WARNING : ad hoc for SCEC benchmark TPV10x
@@ -1008,9 +1067,6 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
       TLoad = 1.0_CUSTOM_REAL   ! T_ini
       DTau0 = 1.0_CUSTOM_REAL   ! \Delta \tau_0
 
-      ! time will never be zero. it starts from 1
-      timeval = it*bc%dt
-
       ! function G(t)
       ! see Kaneko (2008), equation (B3):
       !   if 0 < t < T_ini
@@ -1046,7 +1102,6 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
 
       ! combined with time-weakening for nucleation
       if (TWF) then
-        timeval = it*bc%dt
         nuc_x   = bc%twf%nuc_x
         nuc_y   = bc%twf%nuc_y
         nuc_z   = bc%twf%nuc_z
@@ -1070,12 +1125,11 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
       ! TPV16 benchmark
       if (TPV16) then
         ! fixes friction coefficient to be dynamic friction coefficient when rupture time is over
-        where (bc%swf%T(:) <= it*bc%dt) bc%mu(:) = bc%swf%mud(:)
+        where (bc%swf%T(:) <= timeval) bc%mu(:) = bc%swf%mud(:)
       endif
 
       ! updates fault strength
       strength(:) = -bc%mu(:) * min(T(3,:),0.0_CUSTOM_REAL) + bc%swf%C(:)
-
       ! solves for shear stress
       Tnew(:) = min(Tstick(:),strength(:))
 
@@ -1087,25 +1141,31 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
       ! see: Kaneko (2008), explanation of steps 4 and 6 in section 2.3,
       !      and compare against the alternative first-order expansion given by equation (24)
 
-      ! first pass
-      theta_old(:) = bc%rsf%theta(:)
-      call rsf_update_state(Vf_old,bc%dt,bc%rsf)
-
-      do i = 1,bc%nglob
-        Vf_new(i) = rtsafe(0.0_CUSTOM_REAL,Vf_old(i)+5.0_CUSTOM_REAL,1e-5_CUSTOM_REAL,Tstick(i),-T(3,i),bc%Z(i),bc%rsf%f0(i), &
-                           bc%rsf%V0(i),bc%rsf%a(i),bc%rsf%b(i),bc%rsf%L(i),bc%rsf%theta(i),bc%rsf%StateLaw)
-      enddo
-
-      ! second pass
-      bc%rsf%theta(:) = theta_old(:)
-      tmp_Vf(:) = 0.5_CUSTOM_REAL*(Vf_old(:) + Vf_new(:))
-      call rsf_update_state(tmp_Vf,bc%dt,bc%rsf)
-
-      do i = 1,bc%nglob
-        Vf_new(i) = rtsafe(0.0_CUSTOM_REAL,Vf_old(i)+5.0_CUSTOM_REAL,1e-5_CUSTOM_REAL,Tstick(i),-T(3,i),bc%Z(i),bc%rsf%f0(i), &
-                           bc%rsf%V0(i),bc%rsf%a(i),bc%rsf%b(i),bc%rsf%L(i),bc%rsf%theta(i),bc%rsf%StateLaw)
-      enddo
+      do ipass = 1,2
+        ! updates state
+        select case(ipass)
+        case (1)
+          ! first pass
+          theta_old(:) = bc%rsf%theta(:)
+          call rsf_update_state(Vf_old,deltat,bc%rsf)
+        case (2)
+          ! second pass
+          bc%rsf%theta(:) = theta_old(:)
+          tmp_Vf(:) = 0.5_CUSTOM_REAL*(Vf_old(:) + Vf_new(:))
+          call rsf_update_state(tmp_Vf,deltat,bc%rsf)
+        case default
+          stop 'Error invalid pass in rate and state friction update'
+        end select
+
+        ! updates Vf_new
+        do i = 1,bc%nglob
+          Vf_new(i) = rtsafe(0.0_CUSTOM_REAL,Vf_old(i)+5.0_CUSTOM_REAL,1e-5_CUSTOM_REAL, &
+                             Tstick(i),-T(3,i),bc%Z(i),bc%rsf%f0(i), &
+                             bc%rsf%V0(i),bc%rsf%a(i),bc%rsf%b(i),bc%rsf%L(i),bc%rsf%theta(i),bc%rsf%StateLaw)
+        enddo
+      enddo ! two passes
 
+      ! updates new traction
       Tnew(:) = Tstick(:) - bc%Z(:) * Vf_new(:)
     endif
 
@@ -1120,7 +1180,9 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
     ! Subtract initial stress (to have relative stress on the fault)
     T(:,:) = T(:,:) - bc%T0(:,:)
 
-    if (RATE_AND_STATE) T(1,:) = T(1,:) - TxExt(:)
+    if (RATE_AND_STATE) then
+      T(1,:) = T(1,:) - TxExt(:)
+    endif
     !JPA: this eliminates the effect of TxExt on the equations of motion. Why is it needed?
 
     ! Update slip acceleration da=da_free-T/(0.5*dt*Z)
@@ -1130,7 +1192,7 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
 
     ! Update slip and slip rate, in fault frame
     bc%D(:,:) = dD(:,:)
-    bc%V(:,:) = dV(:,:) + half_dt*dA(:,:)
+    bc%V(:,:) = dV(:,:) + half_dt * dA(:,:)
 
     ! Rotate tractions back to (x,y,z) frame
     T(:,:) = rotate(bc,T,-1)
@@ -1149,22 +1211,572 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
     endif
 
     call store_dataXZ(bc%dataXZ, strength, theta_old, theta_new, dc, &
-                      Vf_old, Vf_new, it*bc%dt,bc%dt)
+                      Vf_old, Vf_new, timeval, deltat)
 
     call store_dataT(bc%dataT,bc%D,bc%V,bc%T,it)
 
     if (RATE_AND_STATE) then
       ! adds storage of state
-      do ipoin = 1,bc%dataT%npoin
-        iglob = bc%dataT%iglob(ipoin)
-        if (bc%rsf%StateLaw == 1) then
-          ! ageing law
+      if (bc%rsf%StateLaw == 1) then
+        ! ageing law
+        do ipoin = 1,bc%dataT%npoin
+          iglob = bc%dataT%iglob(ipoin)
           bc%dataT%dat(8,ipoin,it) = log10(theta_new(iglob))
-        else
-          ! slip law
+        enddo
+      else
+        ! slip law
+        do ipoin = 1,bc%dataT%npoin
+          iglob = bc%dataT%iglob(ipoin)
           bc%dataT%dat(8,ipoin,it) = theta_new(iglob)
+        enddo
+      endif
+    endif
+
+    !-- outputs --
+    ! write dataT every NTOUT time step or at the end of simulation
+    if (mod(it,NTOUT) == 0 .or. it == NSTEP) call SCEC_write_dataT(bc%dataT)
+  endif
+
+  ! note: this stage of the routine must be reached by all processes,
+  !       otherwise the MPI gather calls won't succeed and the run gets stuck.
+
+  ! write dataXZ every NSNAP time step
+  if (mod(it,NSNAP) == 0) then
+    if (PARALLEL_FAULT) then
+      ! collects data from all processes
+      call gather_dataXZ(bc)
+      ! main process writes output file
+      if (myrank == 0) call write_dataXZ(bc%dataXZ_all,it,iflt)
+    else
+      ! fault in single slice
+      if (bc%nspec > 0) call write_dataXZ(bc%dataXZ,it,iflt)
+    endif
+  endif
+
+  ! final output
+  if (it == NSTEP) then
+    if (.not. PARALLEL_FAULT) then
+      call SCEC_Write_RuptureTime(bc%dataXZ,iflt)
+    else
+      if (myrank == 0) call SCEC_Write_RuptureTime(bc%dataXZ_all,iflt)
+    endif
+  endif
+
+  end subroutine BC_DYNFLT_set3d
+
+!---------------------------------------------------------------------
+
+  subroutine BC_DYNFLT_set3d_lts(bc,MxA,V,D,iflt,p_value,ilevel)
+
+! LTS version of BC_DYNFLT_set3d to calculate only for fault nodes on a given p-level
+
+  use constants, only: PARALLEL_FAULT
+  use specfem_par, only: it,NSTEP,myrank
+
+  ! LTS
+  use specfem_par, only: LTS_MODE
+  use specfem_par_lts, only: num_p_level,p_level,lts_current_m
+
+  implicit none
+
+  ! fault
+  type(bc_dynandkinflt_type), intent(inout) :: bc
+  ! force/accel
+  real(kind=CUSTOM_REAL), intent(inout) :: MxA(:,:)
+  ! velocity,displacement
+  real(kind=CUSTOM_REAL), intent(in) :: V(:,:),D(:,:)
+  ! fault id
+  integer, intent(in) :: iflt
+  ! LTS
+  integer,intent(in),optional :: p_value,ilevel
+
+  ! local parameters
+  real(kind=CUSTOM_REAL), dimension(3,bc%nglob) :: T,dD,dV,dA
+  real(kind=CUSTOM_REAL), dimension(bc%nglob) :: Tstick,Tnew
+  real(kind=CUSTOM_REAL), dimension(bc%nglob) :: strength, theta_old, theta_new, dc
+  real(kind=CUSTOM_REAL), dimension(bc%nglob) :: Vf_old, Vf_new, TxExt, tmp_Vf
+  real(kind=CUSTOM_REAL) :: TLoad,DTau0,GLoad
+  real(kind=CUSTOM_REAL) :: deltat,half_dt,timeval
+  integer :: i,ipoin,iglob,ipass
+  real(kind=CUSTOM_REAL) :: nuc_x, nuc_y, nuc_z, nuc_r, nuc_t0, nuc_v, dist, tw_r, coh_size
+  ! LTS
+  integer :: inum,num_p_local,iilevel
+  ! only update p-level nodes (when called for each ilevel separately) set this to .true., otherwise
+  ! update all nodes (when called once on for last ilevel)
+  logical, parameter :: UPDATE_LTS_ONLY_P = .false.
+
+! note: this implementation follows the description in:
+!       - rate and state friction:
+!          Kaneko, Y., N. Lapusta, J.-P. Ampuero (2008)
+!          Spectral element modeling of spontaneous earthquake rupture on rate and state faults: Effect of
+!          velocity-strengthening friction at shallow depths,
+!          JGR, 113, B09317, doi:10.1029/2007JB005553
+!
+!       - slip weakening friction:
+!          Galvez, P., J.-P. Ampuero, L.A. Dalguer, S.N. Somala, T. Nissen-Meyer (2014)
+!          Dynamic earthquake rupture modelled with an unstructured 3-D spectral element method applied to
+!          the 2011 M9 Tohoku earthquake,
+!          Geophys. J. Int., 198, 1222-1240.
+!
+!       More features have been added, including:
+!          fault opening, fault healing, time-weakening, smooth loading,
+!          and TPV16 benchmark specific friction handling
+
+  ! safety check
+  if (.not. LTS_MODE) return
+
+  ! LTS checks if fault has any p-nodes to compute
+  ! fault flag: only compute fault elements if fault elements in this p-level
+  if (UPDATE_LTS_ONLY_P) then
+    if (.not. bc%fault_use_p_level(ilevel)) return
+  endif
+
+  ! local time step
+  deltat = bc%dt / p_value
+
+  ! half time step
+  half_dt = 0.5_CUSTOM_REAL * deltat
+
+  ! time will never be zero. it starts from 1
+  ! calculate time for this level
+  timeval = it * bc%dt     ! coarse level
+  ! adds fine level increments
+  do iilevel = ilevel,(num_p_level-1)
+    timeval = timeval + dble(lts_current_m(iilevel)-1) * bc%dt/dble(p_level(iilevel))
+  enddo
+
+  ! for parallel faults
+  if (bc%nspec > 0) then
+
+    ! sets up local p-nodes on fault surface
+    num_p_local = bc%fault_num_p_local(ilevel)
+
+    ! LTS: note that T,dD,.. are all temporary arrays which we don't have to split into p-values necessarily
+    !      however, computing them on p-nodes only would make it faster probably
+    !      for now, we only take care to update arrays stored by fault arrays bc%.. on p-nodes only
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  ! LTS on global p-nodes only
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    Vf_old(i) = sqrt( bc%V(1,i)*bc%V(1,i) + bc%V(2,i)*bc%V(2,i) )
+    !  enddo
+    !else
+    Vf_old(:) = sqrt(bc%V(1,:)*bc%V(1,:) + bc%V(2,:)*bc%V(2,:))
+
+    ! get predicted values
+    dD(:,:) = get_jump(bc,D) ! dD_predictor
+    dV(:,:) = get_jump(bc,V) ! dV_predictor
+    dA(:,:) = get_weighted_jump(bc,MxA) ! dA_free
+
+    ! rotate to fault frame (tangent,normal)
+    ! component 3 is normal to the fault
+    dD(:,:) = rotate(bc,dD,1)
+    dV(:,:) = rotate(bc,dV,1)
+    dA(:,:) = rotate(bc,dA,1)
+
+    ! T_stick "stick traction"
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  ! LTS on global p-nodes only
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    T(:,i) = bc%Z(i) * ( dV(:,i) + half_dt*dA(:,i) )
+    !  enddo
+    !else
+    T(1,:) = bc%Z(:) * ( dV(1,:) + half_dt*dA(1,:) )
+    T(2,:) = bc%Z(:) * ( dV(2,:) + half_dt*dA(2,:) )
+    T(3,:) = bc%Z(:) * ( dV(3,:) + half_dt*dA(3,:) )
+
+    !Warning : dirty particular free surface condition z = 0.
+    !  where (bc%zcoord(:) > - SMALLVAL) T(2,:) = 0
+    ! do k = 1,bc%nglob
+    !   if (abs(bc%zcoord(k)-0.e0_CUSTOM_REAL) < SMALLVAL) T(2,k) = 0.e0_CUSTOM_REAL
+    ! enddo
+
+    ! add initial stress
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  ! LTS on global p-nodes only
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    T(:,i) = T(:,i) + bc%T0(:,i)
+    !  enddo
+    !else
+    T(:,:) = T(:,:) + bc%T0(:,:)
+
+    ! Solve for normal stress (negative is compressive)
+    ! Opening implies free stress
+    if (bc%allow_opening) then
+      ! LTS - todo optimization
+      !if (USE_OPTIMIZATION) then
+      !  ! LTS on global p-nodes only
+      !  do inum = 1,num_p_local
+      !    i = bc%fault_ibool_local_p(inum,ilevel)
+      !    T(3,i) = min(T(3,i),0.e0_CUSTOM_REAL)
+      !  enddo
+      !else
+      T(3,:) = min(T(3,:),0.0_CUSTOM_REAL)
+    endif
+
+    ! smooth loading within nucleation patch
+    !WARNING : ad hoc for SCEC benchmark TPV10x
+    if (RATE_AND_STATE) then
+      ! see: Kaneko (2008), appendix B "Rupture Initiation Procedure"
+
+      ! LTS - todo optimization
+      !if (USE_OPTIMIZATION) then
+      !  TLoad = 1.0_CUSTOM_REAL
+      !  DTau0 = 1.0_CUSTOM_REAL
+      !  if (timeval <= TLoad) then
+      !    GLoad = exp( (timeval-TLoad)*(timeval-Tload) / (timeval*(timeval-2.0_CUSTOM_REAL*TLoad)) )
+      !  else
+      !    GLoad = 1.0_CUSTOM_REAL
+      !  endif
+      !  ! LTS on p-nodes only
+      !  do inum = 1,num_p_local
+      !    i = bc%fault_ibool_local_p(inum,ilevel)
+      !    TxExt(i) = DTau0 * bc%Fload(i) * GLoad
+      !    T(1,i) = T(1,i) + TxExt(i)
+      !  enddo
+      !else
+      TxExt(:) = 0.0_CUSTOM_REAL
+      TLoad = 1.0_CUSTOM_REAL   ! T_ini
+      DTau0 = 1.0_CUSTOM_REAL   ! \Delta \tau_0
+
+      ! function G(t)
+      ! see Kaneko (2008), equation (B3):
+      !   if 0 < t < T_ini
+      !     G(t) = exp( (t - T_ini)^2 / (t^2 - 2 t T_ini)
+      !   else
+      !     G(t) = 1
+      if (timeval <= TLoad) then
+        GLoad = exp( (timeval-TLoad)*(timeval-Tload) / (timeval*(timeval-2.0_CUSTOM_REAL*TLoad)) )
+      else
+        GLoad = 1.0_CUSTOM_REAL
+      endif
+      ! Kaneko (2008), equation (B1): \Delta \tau = \Delta \tau_0 * F(x,z) * G(t)
+      ! the geometry and values of function F(x,z) have been pre-computed for a case specific nucleation patch
+      TxExt(:) = DTau0 * bc%Fload(:) * GLoad
+
+      ! adds horizontal shear traction perturbation
+      T(1,:) = T(1,:) + TxExt(:)
+    endif
+
+    ! norm of shear fault traction
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  ! LTS on p-nodes only
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    tStick(i) = sqrt( T(1,i)*T(1,i) + T(2,i)*T(2,i) )
+    !  enddo
+    !else
+    Tstick(:) = sqrt( T(1,:)*T(1,:) + T(2,:)*T(2,:))
+
+    if (.not. RATE_AND_STATE) then
+      ! slip weakening friction
+      if (UPDATE_LTS_ONLY_P) then
+        ! update slip state variable
+        ! WARNING: during opening the friction state variable should not evolve
+        ! LTS on p-nodes only
+        do inum = 1,num_p_local
+          i = bc%fault_ibool_local_p(inum,ilevel)
+          theta_old(i) = bc%swf%theta(i)
+        enddo
+        call swf_update_state_lts(bc%D,dD,bc%V,bc%swf,p_value,bc)  ! LTS version
+
+        ! update friction coefficient (using slip weakening friction law)
+        ! LTS on p-nodes only
+        do inum = 1,num_p_local
+          i = bc%fault_ibool_local_p(inum,ilevel)
+          bc%MU(i) = swf_mu_lts(bc%swf,i)  ! LTS version
+        enddo
+      else
+        ! update slip state variable
+        ! WARNING: during opening the friction state variable should not evolve
+        theta_old(:) = bc%swf%theta(:)
+        call swf_update_state(bc%D,dD,bc%V,bc%swf)
+
+        ! update friction coefficient (using slip weakening friction law)
+        bc%mu(:) = swf_mu(bc%swf)
+      endif
+
+      ! combined with time-weakening for nucleation
+      if (TWF) then
+        nuc_x   = bc%twf%nuc_x
+        nuc_y   = bc%twf%nuc_y
+        nuc_z   = bc%twf%nuc_z
+        nuc_r   = bc%twf%nuc_r
+        nuc_t0  = bc%twf%nuc_t0
+        nuc_v   = bc%twf%nuc_v
+        do i = 1,bc%nglob
+            dist = ((bc%coord(1,i)-nuc_x)**2 + (bc%coord(2,i)-nuc_y)**2 + (bc%coord(3,i)-nuc_z)**2)**0.5
+            if (dist <= nuc_r) then
+                tw_r     = timeval * nuc_v
+                coh_size = nuc_t0  * nuc_v
+                if (dist <= tw_r - coh_size) then
+                    bc%mu(i) = min(bc%mu(i), bc%swf%mud(i))
+                else if (dist > tw_r - coh_size .and. dist <= tw_r ) then
+                    bc%mu(i) = min(bc%mu(i), bc%swf%mud(i) + (dist-(tw_r-coh_size))/coh_size*(bc%swf%mus(i)-bc%swf%mud(i)))
+                endif
+            endif
+        enddo
+      endif
+
+      ! TPV16 benchmark
+      if (TPV16) then
+        ! fixes friction coefficient to be dynamic friction coefficient when rupture time is over
+        if (UPDATE_LTS_ONLY_P) then
+          ! LTS on p-nodes only
+          do inum = 1,num_p_local
+            i = bc%fault_ibool_local_p(inum,ilevel)
+            if (bc%swf%T(i) <= timeval) bc%mu(i) = bc%swf%mud(i)
+          enddo
+        else
+          where (bc%swf%T(:) <= timeval) bc%mu(:) = bc%swf%mud(:)
+        endif
+      endif
+
+      ! Update strength & shear stress
+      if (UPDATE_LTS_ONLY_P) then
+        ! LTS on p-nodes only
+        do inum = 1,num_p_local
+          i = bc%fault_ibool_local_p(inum,ilevel)
+          ! Update strength
+          strength(i) = -bc%MU(i) * min(T(3,i),0.0_CUSTOM_REAL) + bc%swf%C(i)
+          ! solves for shear stress
+          Tnew(i) = min(Tstick(i),strength(i))
+        enddo
+      else
+        ! updates fault strength
+        strength(:) = -bc%mu(:) * min(T(3,:),0.0_CUSTOM_REAL) + bc%swf%C(:)
+        ! solves for shear stress
+        Tnew(:) = min(Tstick(:),strength(:))
+      endif
+    else
+      ! rate and state friction
+
+      !JPA the solver below can be refactored into a loop with two passes
+      !
+      ! see: Kaneko (2008), explanation of steps 4 and 6 in section 2.3,
+      !      and compare against the alternative first-order expansion given by equation (24)
+
+      do ipass = 1,2
+        ! updates state
+        select case(ipass)
+        case (1)
+          ! first pass
+          if (UPDATE_LTS_ONLY_P) then
+            ! LTS on p-nodes only
+            do inum = 1,num_p_local
+              i = bc%fault_ibool_local_p(inum,ilevel)
+              theta_old(i) = bc%rsf%theta(i)
+              call rsf_update_state_lts(Vf_old,deltat,bc%rsf,i)
+            enddo
+          else
+            theta_old(:) = bc%rsf%theta(:)
+            call rsf_update_state(Vf_old,deltat,bc%rsf)
+          endif
+        case (2)
+          ! second pass
+          if (UPDATE_LTS_ONLY_P) then
+            ! LTS on p-nodes only
+            do inum = 1,num_p_local
+              i = bc%fault_ibool_local_p(inum,ilevel)
+              bc%rsf%theta(i) = theta_old(i)
+              tmp_Vf(:) = 0.5_CUSTOM_REAL*(Vf_old(:) + Vf_new(:))
+              call rsf_update_state_lts(tmp_Vf,deltat,bc%rsf,i)
+            enddo
+          else
+            bc%rsf%theta(:) = theta_old(:)
+            tmp_Vf(:) = 0.5_CUSTOM_REAL*(Vf_old(:) + Vf_new(:))
+            call rsf_update_state(tmp_Vf,deltat,bc%rsf)
+          endif
+        case default
+          stop 'Error invalid pass in rate and state friction update'
+        end select
+
+        ! updates Vf_new
+        if (UPDATE_LTS_ONLY_P) then
+          ! LTS on p-nodes only
+          do inum = 1,num_p_local
+            i = bc%fault_ibool_local_p(inum,ilevel)
+            Vf_new(i) = rtsafe(0.0_CUSTOM_REAL,Vf_old(i)+5.0_CUSTOM_REAL,1e-5_CUSTOM_REAL, &
+                               Tstick(i),-T(3,i),bc%Z(i),bc%rsf%f0(i), &
+                               bc%rsf%V0(i),bc%rsf%a(i),bc%rsf%b(i),bc%rsf%L(i),bc%rsf%theta(i),bc%rsf%StateLaw)
+          enddo
+        else
+          do i = 1,bc%nglob
+            Vf_new(i) = rtsafe(0.0_CUSTOM_REAL,Vf_old(i)+5.0_CUSTOM_REAL,1e-5_CUSTOM_REAL, &
+                               Tstick(i),-T(3,i),bc%Z(i),bc%rsf%f0(i), &
+                               bc%rsf%V0(i),bc%rsf%a(i),bc%rsf%b(i),bc%rsf%L(i),bc%rsf%theta(i),bc%rsf%StateLaw)
+          enddo
         endif
+      enddo ! two passes
+
+      ! updates new stress
+      if (UPDATE_LTS_ONLY_P) then
+        ! LTS on p-nodes only
+        do inum = 1,num_p_local
+          i = bc%fault_ibool_local_p(inum,ilevel)
+          Tnew(i) = Tstick(i) - bc%Z(i) * Vf_new(i)
+        enddo
+      else
+        ! updates new traction
+        Tnew(:) = Tstick(:) - bc%Z(:) * Vf_new(:)
+      endif
+
+    endif
+
+    if (UPDATE_LTS_ONLY_P) then
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        Tstick(i) = max(Tstick(i),1.0_CUSTOM_REAL) ! to avoid division by zero
+        T(1,i) = Tnew(i) * T(1,i)/Tstick(i)
+        T(2,i) = Tnew(i) * T(2,i)/Tstick(i)
       enddo
+
+      ! Save total tractions
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        bc%T(:,i) = T(:,i)
+      enddo
+    else
+      Tstick(:) = max(Tstick(:),1.0_CUSTOM_REAL) ! to avoid division by zero
+
+      T(1,:) = Tnew(:) * T(1,:)/Tstick(:)
+      T(2,:) = Tnew(:) * T(2,:)/Tstick(:)
+
+      ! Save total tractions
+      bc%T(:,:) = T(:,:)
+    endif
+
+    ! Subtract initial stress (to have relative stress on the fault)
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    T(:,i) = T(:,i) - bc%T0(:,i)
+    !  enddo
+    !else
+    T(:,:) = T(:,:) - bc%T0(:,:)
+
+    if (RATE_AND_STATE) then
+      ! LTS - todo optimization
+      !if (USE_OPTIMIZATION) then
+      !  do inum = 1,num_p_local
+      !    i = bc%fault_ibool_local_p(inum,ilevel)
+      !    T(1,i) = T(1,i) - TxExt(i)
+      !  enddo
+      !else
+      T(1,:) = T(1,:) - TxExt(:)
+    endif
+    !JPA: this eliminates the effect of TxExt on the equations of motion. Why is it needed?
+
+    ! Update slip acceleration da=da_free-T/(0.5*dt*Z)
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    dA(:,i) = dA(:,i) - T(:,i)/(bc%Z(i) * half_dt)
+    !  enddo
+    !else
+    dA(1,:) = dA(1,:) - T(1,:)/(bc%Z(:) * half_dt)
+    dA(2,:) = dA(2,:) - T(2,:)/(bc%Z(:) * half_dt)
+    dA(3,:) = dA(3,:) - T(3,:)/(bc%Z(:) * half_dt)
+
+    ! Update slip and slip rate, in fault frame
+    if (UPDATE_LTS_ONLY_P) then
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        bc%D(:,i) = dD(:,i)
+        bc%V(:,i) = dV(:,i) + half_dt*dA(:,i)
+      enddo
+    else
+      bc%D(:,:) = dD(:,:)
+      bc%V(:,:) = dV(:,:) + half_dt * dA(:,:)
+    endif
+
+    ! Rotate tractions back to (x,y,z) frame
+    T(:,:) = rotate(bc,T,-1)
+
+    ! Add boundary term B*T to M*a
+    if (UPDATE_LTS_ONLY_P) then
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        call add_BT_single(bc,MxA,T,i)  ! LTS version
+      enddo
+    else
+      call add_BT(bc,MxA,T)
+    endif
+
+    !-- intermediate storage of outputs --
+    ! LTS - todo optimization
+    !if (USE_OPTIMIZATION) then
+    !  do inum = 1,num_p_local
+    !    i = bc%fault_ibool_local_p(inum,ilevel)
+    !    Vf_new(i) = sqrt(bc%V(1,i)*bc%V(1,i) + bc%V(2,i)*bc%V(2,i))
+    !    if (.not. RATE_AND_STATE) then
+    !      theta_new(i) = bc%swf%theta(i)
+    !      dc(i) = bc%swf%dc(i)
+    !    else
+    !      theta_new(i) = bc%rsf%theta(i)
+    !      dc(i) = bc%rsf%L(i)
+    !    endif
+    !  enddo
+    !else
+    Vf_new = sqrt(bc%V(1,:)*bc%V(1,:) + bc%V(2,:)*bc%V(2,:))
+    if (.not. RATE_AND_STATE) then
+      theta_new(:) = bc%swf%theta(:)
+      dc(:) = bc%swf%Dc(:)
+    else
+      theta_new(:) = bc%rsf%theta(:)
+      dc(:) = bc%rsf%L(:)
+    endif
+
+    call store_dataXZ(bc%dataXZ, strength, theta_old, theta_new, dc, &
+                      Vf_old, Vf_new, timeval, deltat)
+
+    call store_dataT(bc%dataT,bc%D,bc%V,bc%T,it)
+
+    if (RATE_AND_STATE) then
+      ! adds storage of state
+      if (bc%rsf%StateLaw == 1) then
+        ! ageing law
+        if (UPDATE_LTS_ONLY_P) then
+          ! LTS on p-nodes only
+          do inum = 1,num_p_local
+            ipoin = bc%fault_ibool_local_p(inum,ilevel)
+            iglob = bc%dataT%iglob(ipoin)
+            bc%dataT%dat(8,ipoin,it) = log10(theta_new(iglob))
+          enddo
+        else
+          do ipoin = 1,bc%dataT%npoin
+            iglob = bc%dataT%iglob(ipoin)
+            bc%dataT%dat(8,ipoin,it) = log10(theta_new(iglob))
+          enddo
+        endif
+      else
+        ! slip law
+        if (UPDATE_LTS_ONLY_P) then
+          ! LTS on p-nodes only
+          do inum = 1,num_p_local
+            ipoin = bc%fault_ibool_local_p(inum,ilevel)
+            iglob = bc%dataT%iglob(ipoin)
+            bc%dataT%dat(8,ipoin,it) = theta_new(iglob)
+          enddo
+        else
+          do ipoin = 1,bc%dataT%npoin
+            iglob = bc%dataT%iglob(ipoin)
+            bc%dataT%dat(8,ipoin,it) = theta_new(iglob)
+          enddo
+        endif
+      endif
     endif
 
     !-- outputs --
@@ -1197,7 +1809,7 @@ subroutine BC_DYNFLT_set3d(bc,MxA,V,D,iflt)
     endif
   endif
 
-  end subroutine BC_DYNFLT_set3d
+  end subroutine BC_DYNFLT_set3d_lts
 
 !===============================================================
 
@@ -1339,6 +1951,45 @@ subroutine swf_update_state(dold,dnew,vold,f)
 
   end subroutine swf_update_state
 
+!---------------------------------------------------------------------
+
+  subroutine swf_update_state_lts(dold,dnew,vold,f,p_value,bc)
+
+  use specfem_par_lts, only: iglob_p_refine
+  implicit none
+
+  real(kind=CUSTOM_REAL), dimension(:,:), intent(in) :: vold,dold,dnew
+  type(swf_type), intent(inout) :: f
+
+  integer,intent(in) :: p_value
+  type(bc_dynandkinflt_type), intent(in) :: bc
+
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: vnorm
+  integer :: i,iglob
+
+  ! fault state variable theta (magnitude of accumulated slip on fault)
+  ! accumulates fault slip
+  do i = 1,bc%nglob
+    iglob = bc%ibulk1(i)
+    if ( iglob_p_refine(iglob) == p_value ) then
+      f%theta(i) = f%theta(i) + sqrt( (dold(1,i)-dnew(1,i))**2 + (dold(2,i)-dnew(2,i))**2 )
+    endif
+  enddo
+
+  ! fault healing
+  if (f%healing) then
+    do i = 1,bc%nglob
+      iglob = bc%ibulk1(i)
+      if (iglob_p_refine(iglob) == p_value) then
+        vnorm = sqrt(vold(1,i)**2 + vold(2,i)**2)
+        if (vnorm < V_HEALING) f%theta(i) = 0.0_CUSTOM_REAL
+      endif
+    enddo
+  endif
+
+  end subroutine swf_update_state_lts
+
 !---------------------------------------------------------------------
 
   function swf_mu(f) result(mu)
@@ -1349,19 +2000,42 @@ function swf_mu(f) result(mu)
   real(kind=CUSTOM_REAL) :: mu(size(f%theta))
 
   if (f%kind == 1) then
-  ! slip weakening law
-  !
-  ! for example: Galvez, 2014, eq. (8)
-  !              also Ida, 1973; Palmer & Rice 1973; Andrews 1976; ..
-      mu(:) = f%mus(:) - (f%mus(:)-f%mud(:)) * min(f%theta(:)/f%Dc(:), 1.0_CUSTOM_REAL)
+    ! slip weakening law
+    !
+    ! for example: Galvez, 2014, eq. (8)
+    !              also Ida, 1973; Palmer & Rice 1973; Andrews 1976; ..
+    mu(:) = f%mus(:) - (f%mus(:)-f%mud(:)) * min(f%theta(:)/f%Dc(:), 1.0_CUSTOM_REAL)
   else
-  !-- exponential slip weakening:
-      mu(:) = f%mud(:) - (f%mud(:)-f%mus(:)) * exp(-f%theta(:)/f%Dc(:))
+    !-- exponential slip weakening:
+    mu(:) = f%mud(:) - (f%mud(:)-f%mus(:)) * exp(-f%theta(:)/f%Dc(:))
   endif
 
   end function swf_mu
 
 
+!---------------------------------------------------------------------
+
+  function swf_mu_lts(f,i) result(mu)
+
+  implicit none
+
+  type(swf_type), intent(in) :: f
+  integer, intent(in) :: i
+  real(kind=CUSTOM_REAL) :: mu
+
+  if (f%kind == 1) then
+    ! slip weakening law
+    !
+    ! for example: Galvez, 2014, eq. (8)
+    !              also Ida, 1973; Palmer & Rice 1973; Andrews 1976; ..
+    mu = f%mus(i) - (f%mus(i)-f%mud(i)) * min(f%theta(i)/f%Dc(i), 1.0_CUSTOM_REAL)
+  else
+    !-- exponential slip weakening:
+    mu = f%mud(i) - (f%mud(i)-f%mus(i)) * exp(-f%theta(i)/f%Dc(i))
+  endif
+
+  end function swf_mu_lts
+
 !=====================================================================
 
   subroutine fault_rsf_swf_init_GPU()
@@ -1888,6 +2562,56 @@ subroutine rsf_update_state(V,dt_real,f)
 
   end subroutine rsf_update_state
 
+!---------------------------------------------------------------------
+
+  subroutine rsf_update_state_lts(V,dt_real,f,i)
+
+  use constants, only: ONE,HALF,TWO
+  implicit none
+  real(kind=CUSTOM_REAL), dimension(:), intent(in) :: V
+  type(rsf_type), intent(inout) :: f
+  real(kind=CUSTOM_REAL), intent(in) :: dt_real
+
+  ! LTS
+  integer,intent(in) :: i
+
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: vDtL
+  real(kind=CUSTOM_REAL) :: f_ss,theta_ss,f_LV
+
+  ! LTS on p-nodes only
+  vDtL = V(i) * dt_real / f%L(i)
+
+  ! note: assumes that vDTL is strictly positive ( >= 0), since V >= 0, dt > 0 and L > 0
+  if (vDtL < 0.0_CUSTOM_REAL) stop 'Invalid negative factor found in rate and state friction law'
+
+  ! state update
+  if (f%StateLaw == 1) then
+    ! ageing law
+    if (vDtL > 1.e-5_CUSTOM_REAL) then
+      f%theta(i) = f%theta(i) * exp(-vDtL) + f%L(i)/V(i) * (ONE - exp(-vDtL))
+    else
+      f%theta(i) = f%theta(i) * exp(-vDtL) + dt_real * ( ONE - HALF*vDtL )
+    endif
+  else
+    ! slip law
+    if (RSF_SLIP_LAW_TYPE == 1) then
+      ! default, strong rate-weakening:
+      if (V(i) /= 0.0_CUSTOM_REAL) then
+        f_LV = f%f0(i) - (f%b(i) - f%a(i))*log(V(i)/f%V0(i))
+        f_ss = f%fw(i) + (f_LV - f%fw(i))/(ONE + (V(i)/f%Vw(i))**8)**0.125
+        theta_ss = f%a(i) * log( TWO*f%V0(i)/V(i) * sinh(f_ss/f%a(i)) )
+        f%theta(i) = theta_ss + (f%theta(i) - theta_ss) * exp(-vDtL)
+      else
+        f%theta(i) = f%theta(i)
+      endif
+    else
+      ! Kaneko (2008) slip law:
+      f%theta(i) = f%L(i)/V(i) * (f%theta(i)*V(i)/f%L(i))**(exp(-vDtL))
+    endif
+  endif
+
+  end subroutine rsf_update_state_lts
 
 !===============================================================
 ! OUTPUTS
@@ -1905,7 +2629,8 @@ subroutine SCEC_Write_RuptureTime(dataXZ,iflt)
   integer :: i
   character(len=MAX_STRING_LEN) :: filename
   integer, dimension(8) :: time_values
-  integer, parameter :: IOUT_RUP = 121 !WARNING: not very robust. Could instead look for an available ID
+  ! WARNING: not very robust. Could instead look for an available ID
+  integer, parameter :: IOUT_RUP = 121
 
   call date_and_time(VALUES=time_values)
 
@@ -2563,5 +3288,21 @@ subroutine fault_output_synchronize_GPU(it)
 
   end subroutine fault_output_synchronize_GPU
 
+!---------------------------------------------------------------
+
+  subroutine BC_DYNFLT_free()
+
+  implicit none
+  integer :: iflt
+
+  do iflt = 1,size(faults)
+    call free_fault_memory(faults(iflt))
+  enddo
+
+  deallocate(faults)
+  if (allocated(Kelvin_Voigt_eta)) deallocate(Kelvin_Voigt_eta)
+
+  end subroutine BC_DYNFLT_free
+
 end module fault_solver_dynamic
 
diff --git a/src/specfem3D/fault_solver_kinematic.f90 b/src/specfem3D/fault_solver_kinematic.f90
index 0b7c24fe8..8c2e4c86a 100644
--- a/src/specfem3D/fault_solver_kinematic.f90
+++ b/src/specfem3D/fault_solver_kinematic.f90
@@ -43,7 +43,8 @@ module fault_solver_kinematic
 
   logical, save :: SIMULATION_TYPE_KIN = .false.
 
-  public :: BC_KINFLT_init, BC_KINFLT_set_all, SIMULATION_TYPE_KIN
+  public :: BC_KINFLT_init, BC_KINFLT_free, BC_KINFLT_set_all, &
+            SIMULATION_TYPE_KIN
 
 contains
 
@@ -56,7 +57,7 @@ subroutine BC_KINFLT_init(prname)
 
   use specfem_par, only: nt => NSTEP, DTglobal => DT
 
-  use constants, only: myrank,IIN_PAR,IIN_BIN
+  use constants, only: myrank,IIN_PAR,IIN_BIN,PARALLEL_FAULT
 
   implicit none
 
@@ -126,6 +127,12 @@ subroutine BC_KINFLT_init(prname)
   if (myrank == 0) then
     write(IMAIN,*) '  incorporating kinematic rupture simulation'
     write(IMAIN,*) '  found ', nbfaults, ' fault(s) in file DATA/Par_file_faults'
+    if (PARALLEL_FAULT) then
+      write(IMAIN,*) "    using parallel partitions"
+    else
+      write(IMAIN,*) "    using single   partition"
+    endif
+    call flush_IMAIN()
   endif
 
   ! sets kinematic rupture flag
@@ -217,6 +224,7 @@ end subroutine BC_KINFLT_init
 
   subroutine init_one_fault(bc,IIN_BIN,IIN_PAR,dt_real,NT,iflt)
 
+  use constants, only: myrank
   implicit none
 
   type(bc_dynandkinflt_type), intent(inout) :: bc
@@ -230,6 +238,12 @@ subroutine init_one_fault(bc,IIN_BIN,IIN_PAR,dt_real,NT,iflt)
 
   NAMELIST / KINPAR / kindt
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "    initializing fault",iflt
+    call flush_IMAIN()
+  endif
+
   ! reads in fault_db binary file and initializes fault arrays
   call initialize_fault(bc,IIN_BIN)
 
@@ -273,17 +287,28 @@ subroutine init_one_fault(bc,IIN_BIN,IIN_PAR,dt_real,NT,iflt)
 
   call init_dataXZ(bc%dataXZ,bc)
 
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) "    fault initialized successfully"
+    call flush_IMAIN()
+  endif
+
   end subroutine init_one_fault
 
 
 !=====================================================================
 ! adds boundary term Bt to Force array for each fault.
 !
-  subroutine BC_KINFLT_set_all(F,Vel,Dis)
+  subroutine BC_KINFLT_set_all(F,Vel,Dis,p_value,ilevel)
+
+  ! LTS
+  use specfem_par, only: LTS_MODE
 
   implicit none
   real(kind=CUSTOM_REAL), dimension(:,:), intent(in) :: Vel,Dis
   real(kind=CUSTOM_REAL), dimension(:,:), intent(inout) :: F
+  ! LTS
+  integer,intent(in),optional :: p_value,ilevel
 
   ! local parameters
   integer :: iflt
@@ -295,7 +320,11 @@ subroutine BC_KINFLT_set_all(F,Vel,Dis)
   do iflt = 1,Nfaults
     ! note: this routine should be called by all processes, regardless if they contain no fault elements,
     !       for managing MPI calls and file outputs
-    call BC_KINFLT_set_single(faults(iflt),F,Vel,Dis,iflt)
+    if (LTS_MODE) then
+      call BC_KINFLT_set_single_lts(faults(iflt),F,Vel,Dis,iflt,p_value,ilevel)
+    else
+      call BC_KINFLT_set_single(faults(iflt),F,Vel,Dis,iflt)
+    endif
   enddo
 
   end subroutine BC_KINFLT_set_all
@@ -316,31 +345,38 @@ subroutine BC_KINFLT_set_single(bc,MxA,V,D,iflt)
   type(bc_dynandkinflt_type), intent(inout) :: bc
   real(kind=CUSTOM_REAL), intent(in) :: V(:,:),D(:,:)
   integer,intent(in) :: iflt
+
+  ! local parameters
   integer :: it_kin,itime
   real(kind=CUSTOM_REAL), dimension(3,bc%nglob) :: T
   real(kind=CUSTOM_REAL), dimension(3,bc%nglob) :: dD,dV,dA,dV_free
   real(kind=CUSTOM_REAL) :: t1,t2
-  real(kind=CUSTOM_REAL) :: half_dt,timeval
+  real(kind=CUSTOM_REAL) :: deltat,half_dt,timeval
 
-  if (bc%nspec > 0) then !Surendra : for parallel faults
+  ! global time step
+  deltat = bc%dt
+
+  ! half time step
+  half_dt = 0.5_CUSTOM_REAL * deltat
 
-    half_dt = 0.5e0_CUSTOM_REAL*bc%dt
+  ! time will never be zero. it starts from 1
+  timeval = it * deltat
+
+  if (bc%nspec > 0) then !Surendra : for parallel faults
 
     ! get predicted values
-    dD = get_jump(bc,D) ! dD_predictor
-    dV = get_jump(bc,V) ! dV_predictor
-    dA = get_weighted_jump(bc,MxA) ! dA_free
+    dD(:,:) = get_jump(bc,D) ! dD_predictor
+    dV(:,:) = get_jump(bc,V) ! dV_predictor
+    dA(:,:) = get_weighted_jump(bc,MxA) ! dA_free
 
     ! rotate to fault frame (tangent,normal)
     ! component 3 is normal to the fault
-    dD = rotate(bc,dD,1)
-    dV = rotate(bc,dV,1)
-    dA = rotate(bc,dA,1)
+    dD(:,:) = rotate(bc,dD,1)
+    dV(:,:) = rotate(bc,dV,1)
+    dA(:,:) = rotate(bc,dA,1)
 
-    ! Time marching
-    timeval = it*bc%dt
     ! Slip_rate step "it_kin"
-    it_kin = bc%kin_it*nint(bc%kin_dt/bc%dt)
+    it_kin = bc%kin_it * nint(bc%kin_dt/bc%dt)
     ! (nint : Fortran round (nearest whole number) ,
     !  if nint(a)=0.5 then "a" get upper bound )
 
@@ -348,27 +384,24 @@ subroutine BC_KINFLT_set_single(bc,MxA,V,D,iflt)
     ! This is done in case bc%kin_dt
     ! if (it_kin == it) it_kin=it_kin+1 !
 
-
     !NOTE : it and it_kin is being used due to integers are exact numbers.
     if (it > it_kin) then
-
       print *, 'it :', it
       print *, 'it_kin :', it_kin
 
       bc%kin_it = bc%kin_it +1
       bc%v_kin_t1 = bc%v_kin_t2
       print *, 'loading v_kin_t2'
-      !Temporal : just for snapshots file names kin_dt=0.1 , dt=0.0001
-      !snapshot(100=itime).. : itime=kin_it*(kin_dt/dt)
-      itime = bc%kin_it*nint(bc%kin_dt/bc%dt)
+      ! Temporal : just for snapshots file names kin_dt=0.1 , dt=0.0001
+      ! snapshot(100=itime).. : itime=kin_it*(kin_dt/dt)
+      itime = bc%kin_it * nint(bc%kin_dt/bc%dt)
       call load_vslip_snapshots(bc%dataXZ,itime,iflt,myrank)
-!     loading slip rates
-      bc%v_kin_t2(1,:)=bc%dataXZ%v1
-      bc%v_kin_t2(2,:)=bc%dataXZ%v2
+      ! loading slip rates
+      bc%v_kin_t2(1,:) = bc%dataXZ%v1
+      bc%v_kin_t2(2,:) = bc%dataXZ%v2
 
       !linear interpolation in time between t1 and t2
       !REMARK , bc%kin_dt is the delta "t" between two snapshots.
-
     endif
 
     t1 = (bc%kin_it-1) * bc%kin_dt
@@ -382,9 +415,9 @@ subroutine BC_KINFLT_set_single(bc,MxA,V,D,iflt)
     bc%V(2,:) = ( (t2 - timeval)*bc%v_kin_t1(2,:) + (timeval - t1)*bc%v_kin_t2(2,:) )/ bc%kin_dt
 
     !dV_free = dV_predictor + (dt/2)*dA_free
-    dV_free(1,:) = dV(1,:) + half_dt*dA(1,:)
-    dV_free(2,:) = dV(2,:) + half_dt*dA(2,:)
-    dV_free(3,:) = dV(3,:) + half_dt*dA(3,:)
+    dV_free(1,:) = dV(1,:) + half_dt * dA(1,:)
+    dV_free(2,:) = dV(2,:) + half_dt * dA(2,:)
+    dV_free(3,:) = dV(3,:) + half_dt * dA(3,:)
 
     ! T = Z*( dV_free - V) , V known apriori as input.
     ! CONVENTION : T(ibulk1) = T = -T(ibulk2)
@@ -393,13 +426,13 @@ subroutine BC_KINFLT_set_single(bc,MxA,V,D,iflt)
     T(3,:) = bc%Z * ( dV_free(3,:) )
 
     ! Save tractions
-    bc%T = T
+    bc%T(:,:) = T(:,:)
 
     ! Update slip in fault frame
-    bc%D = dD
+    bc%D(:,:) = dD(:,:)
 
     ! Rotate tractions back to (x,y,z) frame
-    T = rotate(bc,T,-1)
+    T(:,:) = rotate(bc,T,-1)
 
     ! Add boundary term B*T to M*a
     call add_BT(bc,MxA,T)
@@ -417,6 +450,194 @@ subroutine BC_KINFLT_set_single(bc,MxA,V,D,iflt)
 
   end subroutine BC_KINFLT_set_single
 
+!---------------------------------------------------------------------
+
+  subroutine BC_KINFLT_set_single_lts(bc,MxA,V,D,iflt,p_value,ilevel)
+
+! LTS version of BC_KINFLT_set_single to calculate only for fault nodes on a given p-level
+
+  use specfem_par, only: it,NSTEP,myrank
+
+  ! LTS
+  use specfem_par, only: LTS_MODE
+  use specfem_par_lts, only: num_p_level,p_level,lts_current_m
+
+  implicit none
+  real(kind=CUSTOM_REAL), intent(inout) :: MxA(:,:)
+  type(bc_dynandkinflt_type), intent(inout) :: bc
+  real(kind=CUSTOM_REAL), intent(in) :: V(:,:),D(:,:)
+  integer,intent(in) :: iflt
+  ! LTS
+  integer,intent(in),optional :: p_value,ilevel
+
+  ! local parameters
+  integer :: it_kin,itime
+  real(kind=CUSTOM_REAL), dimension(3,bc%nglob) :: T
+  real(kind=CUSTOM_REAL), dimension(3,bc%nglob) :: dD,dV,dA,dV_free
+  real(kind=CUSTOM_REAL) :: t1,t2
+  real(kind=CUSTOM_REAL) :: deltat,half_dt,timeval
+  ! LTS
+  integer :: i,iglob1,iglob2,iilevel,inum,num_p_local
+  ! only update p-level nodes (when called for each ilevel separately) set this to .true., otherwise
+  ! update all nodes (when called once on for last ilevel)
+  logical, parameter :: UPDATE_LTS_ONLY_P = .false.
+
+  ! safety check
+  if (.not. LTS_MODE) return
+
+  ! LTS checks if fault has any p-nodes to compute
+  ! fault flag: only compute fault elements if fault elements in this p-level
+  if (UPDATE_LTS_ONLY_P) then
+    if (.not. bc%fault_use_p_level(ilevel)) return
+  endif
+
+  ! local time step
+  deltat = bc%dt / p_value
+
+  ! half time step
+  half_dt = 0.5_CUSTOM_REAL * deltat
+
+  ! time will never be zero. it starts from 1
+  ! calculate time for this level
+  timeval = it * bc%dt     ! coarse level
+  ! adds fine level increments
+  do iilevel = ilevel,(num_p_level-1)
+    timeval = timeval + dble(lts_current_m(iilevel)-1) * bc%dt/dble(p_level(iilevel))
+  enddo
+
+  if (bc%nspec > 0) then !Surendra : for parallel faults
+
+    ! sets up local p-nodes on fault surface
+    num_p_local = bc%fault_num_p_local(ilevel)
+
+    ! get predicted values
+    dD(:,:) = get_jump(bc,D) ! dD_predictor
+    dV(:,:) = get_jump(bc,V) ! dV_predictor
+    dA(:,:) = get_weighted_jump(bc,MxA) ! dA_free
+
+    ! rotate to fault frame (tangent,normal)
+    ! component 3 is normal to the fault
+    dD(:,:) = rotate(bc,dD,1)
+    dV(:,:) = rotate(bc,dV,1)
+    dA(:,:) = rotate(bc,dA,1)
+
+    ! Slip_rate step "it_kin"
+    it_kin = bc%kin_it * nint(bc%kin_dt/bc%dt)
+    ! (nint : Fortran round (nearest whole number) ,
+    !  if nint(a)=0.5 then "a" get upper bound )
+
+    ! Loading the next slip_rate one ahead it.
+    ! This is done in case bc%kin_dt
+    ! if (it_kin == it) it_kin=it_kin+1 !
+
+    !NOTE : it and it_kin is being used due to integers are exact numbers.
+    if (it > it_kin) then
+      print *, 'it :', it
+      print *, 'it_kin :', it_kin
+
+      bc%kin_it = bc%kin_it +1
+      bc%v_kin_t1 = bc%v_kin_t2
+      print *, 'loading v_kin_t2'
+      ! Temporal : just for snapshots file names kin_dt=0.1 , dt=0.0001
+      ! snapshot(100=itime).. : itime=kin_it*(kin_dt/dt)
+      itime = bc%kin_it*nint(bc%kin_dt/bc%dt)
+      call load_vslip_snapshots(bc%dataXZ,itime,iflt,myrank)
+      ! loading slip rates
+      bc%v_kin_t2(1,:) = bc%dataXZ%v1
+      bc%v_kin_t2(2,:) = bc%dataXZ%v2
+
+      !linear interpolation in time between t1 and t2
+      !REMARK , bc%kin_dt is the delta "t" between two snapshots.
+    endif
+
+    t1 = (bc%kin_it-1) * bc%kin_dt
+    t2 = bc%kin_it * bc%kin_dt
+
+    ! Kinematic velocity_rate
+    ! bc%V : Imposed a priori and read from slip rate snapshots (from time reversal)
+    !        Linear interpolation between consecutive kinematic time steps.
+    !        V will be given at each time step.
+    if (UPDATE_LTS_ONLY_P) then
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        bc%V(1,i) = ( (t2 - timeval)*bc%v_kin_t1(1,i) + (timeval - t1)*bc%v_kin_t2(1,i) )/ bc%kin_dt
+        bc%V(2,i) = ( (t2 - timeval)*bc%v_kin_t1(2,i) + (timeval - t1)*bc%v_kin_t2(2,i) )/ bc%kin_dt
+      enddo
+    else
+      bc%V(1,:) = ( (t2 - timeval)*bc%v_kin_t1(1,:) + (timeval - t1)*bc%v_kin_t2(1,:) )/ bc%kin_dt
+      bc%V(2,:) = ( (t2 - timeval)*bc%v_kin_t1(2,:) + (timeval - t1)*bc%v_kin_t2(2,:) )/ bc%kin_dt
+    endif
+
+    !dV_free = dV_predictor + (dt/2)*dA_free
+    dV_free(1,:) = dV(1,:) + half_dt * dA(1,:)
+    dV_free(2,:) = dV(2,:) + half_dt * dA(2,:)
+    dV_free(3,:) = dV(3,:) + half_dt * dA(3,:)
+
+    ! T = Z*( dV_free - V) , V known apriori as input.
+    ! CONVENTION : T(ibulk1) = T = -T(ibulk2)
+    T(1,:) = bc%Z * ( dV_free(1,:) - bc%V(1,:) )
+    T(2,:) = bc%Z * ( dV_free(2,:) - bc%V(2,:) )
+    T(3,:) = bc%Z * ( dV_free(3,:) )
+
+    if (UPDATE_LTS_ONLY_P) then
+      ! Save tractions
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        bc%T(:,i) = T(:,i)
+      enddo
+
+      ! Update slip in fault frame
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        bc%D(:,i) = dD(:,i)
+      enddo
+    else
+      ! Save tractions
+      bc%T(:,:) = T(:,:)
+
+      ! Update slip in fault frame
+      bc%D(:,:) = dD(:,:)
+    endif
+
+    ! Rotate tractions back to (x,y,z) frame
+    T(:,:) = rotate(bc,T,-1)
+
+    ! Add boundary term B*T to M*a
+    if (UPDATE_LTS_ONLY_P) then
+      ! LTS on p-nodes only
+      do inum = 1,num_p_local
+        i = bc%fault_ibool_local_p(inum,ilevel)
+        iglob1 = bc%ibulk1(i)
+        MxA(1,iglob1) = MxA(1,iglob1) + bc%B(i) * T(1,i)
+        MxA(2,iglob1) = MxA(2,iglob1) + bc%B(i) * T(2,i)
+        MxA(3,iglob1) = MxA(3,iglob1) + bc%B(i) * T(3,i)
+
+        iglob2 = bc%ibulk2(i)
+        MxA(1,iglob2) = MxA(1,iglob2) - bc%B(i) * T(1,i)
+        MxA(2,iglob2) = MxA(2,iglob2) - bc%B(i) * T(2,i)
+        MxA(3,iglob2) = MxA(3,iglob2) - bc%B(i) * T(3,i)
+      enddo
+    else
+      call add_BT(bc,MxA,T)
+    endif
+
+    !-- intermediate storage of outputs --
+    call store_dataT(bc%dataT,bc%D,bc%V,bc%T,it)
+
+    !-- OUTPUTS --
+    ! write dataT every NTOUT time steps or at the end of simulation
+    if (mod(it,NTOUT) == 0 .or. it == NSTEP) call SCEC_write_dataT(bc%dataT)
+    ! write dataXZ every NSNAP time steps
+    if (mod(it,NSNAP) == 0) call write_dataXZ(bc%dataXZ,it,iflt)
+
+  endif
+
+  end subroutine BC_KINFLT_set_single_lts
+
+
 !===============================================================
 
   subroutine init_dataXZ(dataXZ,bc)
@@ -540,5 +761,19 @@ subroutine load_vslip_snapshots(dataXZ,itime,iflt,myrank)
 
   end subroutine load_vslip_snapshots
 
+!---------------------------------------------------------------
+
+  subroutine BC_KINFLT_free()
+
+  implicit none
+  integer :: iflt
+
+  do iflt = 1,size(faults)
+    call free_fault_memory(faults(iflt))
+  enddo
+
+  deallocate(faults)
+
+  end subroutine BC_KINFLT_free
 
 end module fault_solver_kinematic
diff --git a/src/specfem3D/finalize_simulation.f90 b/src/specfem3D/finalize_simulation.f90
index 038930564..62a9da55d 100644
--- a/src/specfem3D/finalize_simulation.f90
+++ b/src/specfem3D/finalize_simulation.f90
@@ -134,6 +134,9 @@ subroutine finalize_simulation_cleanup()
   use specfem_par_poroelastic
   use specfem_par_lts
 
+  use fault_solver_dynamic, only: SIMULATION_TYPE_DYN,BC_DYNFLT_free
+  use fault_solver_kinematic, only: SIMULATION_TYPE_KIN,BC_KINFLT_free
+
   implicit none
 
   ! from here on, no gpu data is needed anymore
@@ -153,6 +156,7 @@ subroutine finalize_simulation_cleanup()
   if (ACOUSTIC_SIMULATION) then
     deallocate(rmass_acoustic)
   endif
+
   ! boundary surfaces
   deallocate(ibelm_xmin)
   deallocate(ibelm_xmax)
@@ -160,6 +164,7 @@ subroutine finalize_simulation_cleanup()
   deallocate(ibelm_ymax)
   deallocate(ibelm_bottom)
   deallocate(ibelm_top)
+
   ! sources
   if (NSOURCES > 0) then
     deallocate(islice_selected_source,ispec_selected_source)
@@ -170,6 +175,7 @@ subroutine finalize_simulation_cleanup()
     deallocate(nu_source)
     deallocate(user_source_time_function)
   endif
+
   ! receivers
   if (nrec > 0) then
     deallocate(islice_selected_rec,ispec_selected_rec)
@@ -177,6 +183,7 @@ subroutine finalize_simulation_cleanup()
     deallocate(station_name,network_name)
     deallocate(nu_rec)
   endif
+
   if (allocated(pm1_source_encoding)) deallocate(pm1_source_encoding)
   if (allocated(sourcearrays)) deallocate(sourcearrays)
   if (allocated(source_adjoint)) deallocate(source_adjoint)
@@ -185,6 +192,7 @@ subroutine finalize_simulation_cleanup()
   if (allocated(number_receiver_global)) deallocate(number_receiver_global)
   if (allocated(hxir_store)) deallocate(hxir_store,hetar_store,hgammar_store)
   if (allocated(hpxir_store)) deallocate(hpxir_store,hpetar_store,hpgammar_store)
+
   ! adjoint sources
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
     if (nadj_rec_local > 0) then
@@ -197,11 +205,14 @@ subroutine finalize_simulation_cleanup()
       endif
     endif
   endif
+
   ! seismograms
   if (allocated(seismograms_d)) deallocate(seismograms_d,seismograms_v,seismograms_a,seismograms_p)
   if (allocated(seismograms_eps)) deallocate(seismograms_eps)
+
   ! moment tensor derivatives
   if (allocated(Mxx_der)) deallocate(Mxx_der,Myy_der,Mzz_der,Mxy_der,Mxz_der,Myz_der,sloc_der)
+
   ! mesh
   deallocate(ibool)
   deallocate(irregular_element_number)
@@ -211,6 +222,10 @@ subroutine finalize_simulation_cleanup()
   deallocate(kappastore,mustore,rhostore)
   deallocate(ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic)
 
+  ! faults
+  if (SIMULATION_TYPE_DYN) call BC_DYNFLT_free()
+  if (SIMULATION_TYPE_KIN) call BC_KINFLT_free()
+
   ! LTS
   if (LTS_MODE) then
     deallocate(p_level,p_level_loops,p_level_steps)
diff --git a/src/specfem3D/iterate_time.F90 b/src/specfem3D/iterate_time.F90
index 512286def..5f16e06c0 100644
--- a/src/specfem3D/iterate_time.F90
+++ b/src/specfem3D/iterate_time.F90
@@ -207,6 +207,14 @@ subroutine iterate_time()
   ! get MPI starting
   time_start = wtime()
 
+  ! LTS
+  if (LTS_MODE) then
+    ! LTS steps through its own time iterations - for now
+    call lts_iterate_time()
+    ! all done, continue after time loop
+    goto 777
+  endif
+
   ! time loop
   do it = it_begin,it_end
 
@@ -321,6 +329,9 @@ subroutine iterate_time()
   !
   enddo   ! end of main time loop
 
+! goto point for LTS to finish time loop
+777 continue
+
   ! close the huge file that contains a dump of all the time steps to disk
   if (EXACT_UNDOING_TO_DISK) close(IFILE_FOR_EXACT_UNDOING)
 
diff --git a/src/specfem3D/iterate_time_undoatt.F90 b/src/specfem3D/iterate_time_undoatt.F90
index 3ac61326b..5bef002fc 100644
--- a/src/specfem3D/iterate_time_undoatt.F90
+++ b/src/specfem3D/iterate_time_undoatt.F90
@@ -69,6 +69,8 @@ subroutine iterate_time_undoatt()
     call exit_MPI(myrank,'for undo_attenuation, POROELASTIC kernel simulation is not supported yet') ! not working yet...
   if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY /= 0) &
     call exit_MPI(myrank,'for undo_attenuation, noise kernel simulation is not supported yet') ! not working yet...
+  if (LTS_MODE) &
+    call exit_MPI(myrank,'LTS mode for undo_attenuation is not supported yet')
 
   ! hdf5 i/o server
   if (HDF5_IO_NODES > 0) then
diff --git a/src/specfem3D/locate_source.F90 b/src/specfem3D/locate_source.F90
index bcef26c6e..44a86da5a 100644
--- a/src/specfem3D/locate_source.F90
+++ b/src/specfem3D/locate_source.F90
@@ -37,7 +37,7 @@ subroutine locate_source()
       USE_EXTERNAL_SOURCE_FILE, &
       USE_TRICK_FOR_BETTER_PRESSURE,COUPLE_WITH_INJECTION_TECHNIQUE, &
       myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ystore,zstore,DT, &
-      NSOURCES,HAS_FINITE_FAULT_SOURCE
+      NSOURCES,HAS_FINITE_FAULT_SOURCE,LTS_MODE
 
   ! sources arrays
   use specfem_par, only: Mxx,Myy,Mzz,Mxy,Mxz,Myz,hdur, &
@@ -49,6 +49,9 @@ subroutine locate_source()
   ! PML
   use pml_par, only: is_CPML
 
+  ! LTS
+  use specfem_par_lts, only: p_elem,ispec_p_refine,p_lookup,num_p_level
+
   implicit none
 
   ! local parameters
@@ -93,6 +96,12 @@ subroutine locate_source()
   integer :: ispec,ier
   logical :: is_done_sources
 
+  ! LTS
+  integer :: p_spec,ilevel
+  integer, dimension(:), allocatable :: source_p_value,source_p_ilevel
+  integer, dimension(:,:), allocatable :: source_p_elem
+  integer, dimension(:), allocatable :: sendbufv,recvbufv
+
   ! timer MPI
   double precision :: tstart,tCPU
   double precision, external :: wtime
@@ -308,6 +317,56 @@ subroutine locate_source()
   enddo
   call any_all_1Darray_l(is_CPML_source,is_CPML_source_all,NSOURCES)
 
+  ! LTS
+  if (LTS_MODE) then
+    allocate(source_p_value(NSOURCES),source_p_ilevel(NSOURCES), &
+             source_p_elem(num_p_level,NSOURCES), &
+             sendbufv(num_p_level),recvbufv(num_p_level), stat=ier)
+    if (ier /= 0) call exit_MPI(myrank,'Error allocating source_p arrays')
+    source_p_value(:) = 0
+    source_p_ilevel(:) = 0
+    source_p_elem(:,:) = 0
+    ! collects p-level info of source element
+    do isource = 1,NSOURCES
+      if (myrank == islice_selected_source(isource)) then
+        ispec = ispec_selected_source(isource)
+        ! p_value
+        p_spec = ispec_p_refine(ispec)
+        ilevel = p_lookup(p_spec)
+        source_p_value(isource) = p_spec
+        source_p_ilevel(isource) = ilevel
+        ! here use integer values for P-elem since the send routine sendrecv_all_i() is only implemented for integers
+        ! explicit if-case to avoid a ternary operator like: s = (p_elem == .true.)? 1:0
+        do ilevel = 1,num_p_level
+          if (p_elem(ispec,ilevel) .eqv. .true.) then
+            source_p_elem(ilevel,isource) = 1
+          else
+            source_p_elem(ilevel,isource) = 0
+          endif
+        enddo
+      endif
+      ! send to main process if needed
+      if (islice_selected_source(isource) /= 0) then
+        if (myrank == 0) then
+          ! main process gets p-value
+          call recv_singlei(source_p_value(isource),islice_selected_source(isource),itag)
+          ! ilevel-value
+          call recv_singlei(source_p_ilevel(isource),islice_selected_source(isource),itag)
+          ! p_elem as integer-value
+          call recv_i(recvbufv,num_p_level,islice_selected_source(isource),itag)
+          source_p_elem(:,isource) = recvbufv(:)
+        else if (myrank == islice_selected_source(isource)) then
+          ! slice with sources sents its values
+          call send_singlei(source_p_value(isource),0,itag)
+          ! ilevel-value
+          call send_singlei(source_p_ilevel(isource),0,itag)
+          ! p_elem as integer-value
+          sendbufv(:) = source_p_elem(:,isource)
+          call send_i(sendbufv,num_p_level,0,itag)
+        endif
+      endif
+    enddo
+  endif
 
   ! sets new utm coordinates for best locations
   utm_x_source(:) = x_found(:)
@@ -362,6 +421,13 @@ subroutine locate_source()
           write(IMAIN,*) '                 in unknown domain'
         endif
         write(IMAIN,*)
+        ! LTS info
+        if (LTS_MODE) then
+          write(IMAIN,*) '                 in LTS ilevel  : ',source_p_ilevel(isource)
+          write(IMAIN,*) '                        p-value : ',source_p_value(isource)
+          write(IMAIN,*) '                        p-elem  : ',source_p_elem(:,isource)
+          write(IMAIN,*)
+        endif
 
         ! source location (reference element)
         if (USE_FORCE_POINT_SOURCE) then
@@ -630,6 +696,7 @@ subroutine locate_source()
   deallocate(x_found,y_found,z_found,elevation,final_distance)
   deallocate(x_target,y_target,z_target,idomain)
   deallocate(is_CPML_source,is_CPML_source_all)
+  if (LTS_MODE) deallocate(source_p_value,source_p_ilevel,source_p_elem,sendbufv,recvbufv)
 
   end subroutine locate_source
 
diff --git a/src/specfem3D/lts_setup.F90 b/src/specfem3D/lts_setup.F90
index ccffc4fd9..a723e7636 100644
--- a/src/specfem3D/lts_setup.F90
+++ b/src/specfem3D/lts_setup.F90
@@ -33,6 +33,12 @@
 ! Newmark local time stepping on high-performance computing architectures,
 ! Journal of Comp. Physics, 334, p. 308-326.
 ! https://doi.org/10.1016/j.jcp.2016.11.012
+!
+! Rietmann, M., B. Ucar, D. Peter, O. Schenk, M. Grote, 2015.
+! Load-balanced local time stepping for large-scale wave propagation,
+! in: Parallel Distributed Processing Symposium (IPDPS), IEEE International, May 2015.
+! https://doi.org/10.1109/IPDPS.2015.10
+
 
 ! LTS setup routines
 !
@@ -42,10 +48,11 @@ subroutine lts_setup()
 
   use constants, only: NDIM,CUSTOM_REAL,VERYSMALLVAL,FIX_UNDERFLOW_PROBLEM,IMAIN,itag,myrank
 
-  use shared_parameters, only: DT,NSTEP,NPROC
+  use shared_parameters, only: DT,NSTEP,NPROC,SIMULATION_TYPE,LTS_MODE,GPU_MODE,USE_LDDRK, &
+    SAVE_SEISMOGRAMS_ACCELERATION,CREATE_SHAKEMAP
 
   use specfem_par, only: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION, &
-    deltat,NGLOB_AB,NSPEC_AB
+    deltat,NGLOB_AB
 
   ! MPI interfaces
   use specfem_par, only: num_interfaces_ext_mesh,ibool_interfaces_ext_mesh,nibool_interfaces_ext_mesh, &
@@ -62,25 +69,32 @@ subroutine lts_setup()
 
   ! local parameters
   integer :: ilevel,p,m,step
-  integer :: p_min,p_max,p_min_glob,p_max_glob
-  integer :: ipoin, iinterface, iglob, ier
+  integer :: p_min,p_max,p_min_glob,p_max_glob,p_node
+  integer :: i, ipoin, iinterface, iglob, ier
   real(kind=CUSTOM_REAL) :: duration
   integer, dimension(:), allocatable :: num_boundary_p, num_p_glob
   ! check
   integer, dimension(:,:), allocatable ::nibool_interface_p_refine_all
   integer, dimension(:), allocatable :: sendbuf,recvbuf
-  integer :: i,inum,ispec
-  integer,dimension(:),allocatable :: num_ispec_level
-  double precision :: total_work,total_work_lts,total_work_lts_b
+  double precision :: lts_speedup,lts_speedup_with_boundary
   double precision :: deltat_lts
+  double precision :: memory_size
+  integer :: is,ie,is0,ie0
   ! debugging
-  integer :: is,ie
   integer :: iproc
 
+  ! checks if anything to do
+  if (.not. LTS_MODE) return
+
+  ! safety checks
   ! checks simulation domain types
-  if (ACOUSTIC_SIMULATION) call exit_MPI(myrank,'Error LTS routines only implemented for purely ELASTIC simulations...')
-  if (POROELASTIC_SIMULATION) call exit_MPI(myrank,'Error LTS routines only implemented for purely ELASTIC simulations...')
-  if (.not. ELASTIC_SIMULATION) call exit_MPI(myrank,'Error LTS routines only implemented for purely ELASTIC simulations...')
+  if (ACOUSTIC_SIMULATION) call exit_MPI(myrank,'LTS routines only implemented for purely ELASTIC simulations')
+  if (POROELASTIC_SIMULATION) call exit_MPI(myrank,'LTS routines only implemented for purely ELASTIC simulations')
+  if (.not. ELASTIC_SIMULATION) call exit_MPI(myrank,'LTS routines only implemented for purely ELASTIC simulations')
+
+  if (SIMULATION_TYPE /= 1) call exit_MPI(myrank,'LTS routines only implemented for forward simulations (SIMULATION_TYPE == 1)')
+  if (USE_LDDRK) call exit_MPI(myrank,'LTS routines only implemented for Newark time scheme (USE_LDDRK == .false.)')
+  if (GPU_MODE) call exit_MPI(myrank,'LTS routines only implemented for CPU-only runs (GPU_MODE == .false.)')
 
   ! user output
   if (myrank == 0) then
@@ -104,6 +118,35 @@ subroutine lts_setup()
   endif
   call synchronize_all()
 
+  ! safety check
+  if (num_p_level == 0) stop 'Error LTS mode needs at least a single p-level'
+
+  ! we assume contiguous range of p-level nodes
+  ! checks iglob range
+  if (p_level_iglob_start(1) /= 1) call exit_mpi(myrank,"ASSERT: p-level iglob should start at 1")
+  if (p_level_iglob_end(num_p_level) /= NGLOB_AB) call exit_mpi(myrank,"ASSERT: coarsest p-level must have all iglobs!")
+  ! checks levels
+  do ilevel = 1,num_p_level
+    ! start index of current p-level
+    is = p_level_iglob_start(ilevel)
+
+    ! end index of current p-level
+    ie = p_level_iglob_end(ilevel)
+
+    ! start index of finest level
+    is0 = p_level_iglob_start(1)
+
+    ! end index of next finer p-level
+    if (ilevel > 1) then
+      ie0 = p_level_iglob_end(ilevel-1)
+    endif
+
+    ! checks
+    if (ie < is) stop 'Error lts newmark update: end index of current level invalid'
+    if (ie < is0) stop 'Error lts newmark update: start/end index invalid'
+    if (ilevel > 1 .and. ie0 < is0) stop 'Error lts newmark update: end index of finer level invalid'
+  enddo
+
   ! desired initial duration
   duration = sngl(NSTEP * DT)
 
@@ -121,104 +164,27 @@ subroutine lts_setup()
   deltat = deltat_lts
 
   ! counts number of local time step evaluations
-  it_local = 0
+  lts_it_local = 0
   do step = 1,num_p_level_steps
     ilevel = p_level_steps(step)
     p = p_level(ilevel)
     do m = 1,p_level_loops(ilevel)
-      it_local = it_local + 1
+      lts_it_local = lts_it_local + 1
     enddo
   enddo
-  NSTEP_LOCAL = it_local
+  NSTEP_LOCAL = lts_it_local
+
+  ! gets theoretical speed-up values (valid only for all elements belonging to same domain type)
+  call lts_get_theoretical_speedup(lts_speedup,lts_speedup_with_boundary)
 
   ! a map for p-level -> ilevel
   allocate(p_level_ilevel_map(p_level(1)),stat=ier)
   if (ier /= 0) call exit_MPI(myrank,'Error allocating working LTS fields p_level_ilevel_map')
-
   p_level_ilevel_map(:) = 0
   do ilevel = 1,num_p_level
     p_level_ilevel_map(p_level(ilevel)) = ilevel
   enddo
 
-  ! counts number of elements for each level
-  allocate(num_ispec_level(num_p_level),stat=ier)
-  if (ier /= 0) call exit_MPI(myrank,'Error allocating array num_ispec_level')
-  num_ispec_level(:) = 0
-  do i = 1,num_p_level
-    p = p_level(i)
-    num_ispec_level(i) = count(ispec_p_refine(:) == p)
-  enddo
-
-  ! note: some partitions might not contain any element within a certain p-level
-  ! collect on master
-  if (NPROC > 1) then
-    do ilevel = 1,num_p_level
-      call sum_all_all_i(num_ispec_level(ilevel),inum)
-      num_ispec_level(ilevel) = inum
-    enddo
-  endif
-  if (minval(num_ispec_level(:)) == 0) call exit_MPI(myrank,'Error p level without element found')
-
-  ! user output
-  if (myrank == 0) then
-    write(IMAIN,*) '  p-level number of elements: ',num_ispec_level(:)
-    call flush_IMAIN()
-  endif
-
-  ! theoretical speed-up values (valid only for all elements belonging to same domain type)
-  total_work = 0.0
-  total_work_lts = 0.0
-  if (myrank == 0) then
-    ! "pure" speed-up, without taking account of additional coarse/fine boundary contributions
-    do ilevel = 1,num_p_level
-      p = p_level(ilevel)
-      total_work_lts = total_work_lts + num_ispec_level(ilevel) * p
-    enddo
-    ! work without lts
-    total_work = sum( num_ispec_level(:)) * maxval(p_level(:))
-  endif
-
-  ! counts boundary elements
-  num_ispec_level(:) = 0
-  do ilevel = 1,num_p_level
-    do ispec = 1,NSPEC_AB
-      if (boundary_elem(ispec,ilevel)) then
-        num_ispec_level(ilevel) = num_ispec_level(ilevel) + 1
-      endif
-    enddo
-  enddo
-
-  ! note: some partitions might not contain any element within a certain p-level
-  ! collect on master
-  if (NPROC > 1 ) then
-    do ilevel = 1,num_p_level
-      call sum_all_all_i(num_ispec_level(ilevel),inum)
-      num_ispec_level(ilevel) = inum
-    enddo
-  endif
-  ! work with additional coarse/fine boundary contributions
-  total_work_lts_b = total_work_lts
-  if (myrank == 0) then
-    do ilevel = 1,num_p_level
-      p = p_level(ilevel)
-      total_work_lts_b = total_work_lts_b + num_ispec_level(ilevel) * p
-    enddo
-  endif
-
-  ! user output
-  call synchronize_all()
-  if (myrank == 0) then
-    write(IMAIN,*) '  p-level boundary elements : ',num_ispec_level(:)
-    write(IMAIN,*)
-    write(IMAIN,*) '  theoretical speed-up value: ',sngl(total_work / total_work_lts),'(without boundary contributions)'
-    write(IMAIN,*) '  theoretical speed-up value: ',sngl(total_work / total_work_lts_b),'(with boundary contributions)'
-    write(IMAIN,*)
-    call flush_IMAIN()
-  endif
-  call synchronize_all()
-
-  deallocate(num_ispec_level)
-
   ! p_lookup maps p -> ilevel
   allocate(p_lookup(maxval(p_level)),stat=ier)
   if (ier /= 0) call exit_MPI(myrank,'Error allocating p_lookup')
@@ -250,17 +216,17 @@ subroutine lts_setup()
     do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
       ! gets p value for this interface points
       iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
-      p = iglob_p_refine(iglob)
+      p_node = iglob_p_refine(iglob)
 
       ! debug
-      ! if (myrank == 1) print *, "p=", p
+      ! if (myrank == 1) print *, "p_node=", p_node
 
       ! for each interface, stores for each ilevel its corresponding p value if p-nodes on this interface where found
-      interface_p_refine_all(iinterface,p_lookup(p)) = p
-      num_boundary_p(p_lookup(p)) = num_boundary_p(p_lookup(p)) + 1
+      interface_p_refine_all(iinterface,p_lookup(p_node)) = p_node
+      num_boundary_p(p_lookup(p_node)) = num_boundary_p(p_lookup(p_node)) + 1
 
       ! counts p-nodes on interface
-      nibool_interface_p_refine_all(iinterface,p_lookup(p)) = nibool_interface_p_refine_all(iinterface,p_lookup(p)) + 1
+      nibool_interface_p_refine_all(iinterface,p_lookup(p_node)) = nibool_interface_p_refine_all(iinterface,p_lookup(p_node)) + 1
     enddo
     ! debug
     !print *,'rank ',myrank,'interface neighbor',my_neighbors_ext_mesh(iinterface)
@@ -277,8 +243,8 @@ subroutine lts_setup()
   !  call synchronize_all()
   !  if (myrank == iproc) then
   !    print *,'rank',myrank,' num boundary p:'
-  !    do ip = 1,num_p_level
-  !      print *,'  level ',ip,' total number of p-nodes on MPI boundary = ',num_boundary_p(ip)
+  !    do ilevel = 1,num_p_level
+  !      print *,'  level ',ilevel,' total number of p-nodes on MPI boundary = ',num_boundary_p(ilevel)
   !    enddo
   !  endif
   !enddo
@@ -314,11 +280,12 @@ subroutine lts_setup()
   endif
 
   !debug
+  ! num_p_glob(:) = 0
   ! do iglob = 1,NGLOB_AB
-  !   p = iglob_p_refine(iglob)
-  !   num_p_glob(p_lookup(p)) = num_p_glob(p_lookup(p)) + 1
+  !   p_node = iglob_p_refine(iglob)
+  !   num_p_glob(p_lookup(p_node)) = num_p_glob(p_lookup(p_node)) + 1
   ! enddo
-  ! if (myrank==0) print *, "Rank ,p,#dof"
+  ! if (myrank==0) print *, "Rank, p, #dof"
   ! do ilevel = 1,num_p_level
   !   if (num_boundary_p(ilevel) > 0) then
   !     print *, myrank, ",", p_level(ilevel), ",", num_p_glob(ilevel)
@@ -336,10 +303,10 @@ subroutine lts_setup()
       ! loops over all global points
       do iglob = 1,NGLOB_AB
         ! gets p value for this node
-        p = iglob_p_refine(iglob)
+        p_node = iglob_p_refine(iglob)
 
         ! gets ilevel index for this p-value
-        ilevel = p_lookup(p)
+        ilevel = p_lookup(p_node)
 
         ! start/end index for this p-level
         is = p_level_iglob_start(ilevel)
@@ -364,10 +331,10 @@ subroutine lts_setup()
             call exit_MPI(myrank,'Error iglob invalid in LTS level')
           endif
           ! checks associated p-level
-          p = iglob_p_refine(iglob)
-          i = p_lookup(p)
+          p_node = iglob_p_refine(iglob)
+          i = p_lookup(p_node)
           if (i /= ilevel) then
-            print *,'Error rank:',myrank,'has iglob value ',iglob,'in level',i,'range:',is,ie,'p-value:',p,'ilevel:',ilevel
+            print *,'Error rank:',myrank,'has iglob value ',iglob,'in level',i,'range:',is,ie,'p-value:',p_node,'ilevel:',ilevel
             call exit_MPI(myrank,'Error iglob level invalid in LTS level')
           endif
         enddo
@@ -388,10 +355,18 @@ subroutine lts_setup()
   if (ier /= 0) stop 'Error allocating lts_current_m array'
   lts_current_m(:) = 0
 
+  ! initializes current pointers
+  current_lts_elem => p_elem(:,1)                 ! current p-level elements
+  current_lts_boundary_elem => boundary_elem(:,1) ! current boundary elements
+
+  ! current p-level time
+  current_lts_time = 0.d0
+
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) '  array size for working fields displ_p,veloc_p: ', &
-      dble(NDIM)*dble(NGLOB_AB)*dble(num_p_level)*dble(CUSTOM_REAL)/ 1024./1024.,'MB'
+    ! wavefield sizes displ_p,veloc_p
+    memory_size = dble(NDIM) * dble(NGLOB_AB) * dble(num_p_level) * dble(CUSTOM_REAL)
+    write(IMAIN,*) '  array size for working fields displ_p, veloc_p: ', sngl(memory_size) / 1024./ 1024.,'MB'
     write(IMAIN,*)
     call flush_IMAIN()
   endif
@@ -406,23 +381,44 @@ subroutine lts_setup()
   ! put negligible initial value to avoid very slow underflow trapping
   if (FIX_UNDERFLOW_PROBLEM) displ_p(:,:,:) = VERYSMALLVAL
 
-  !#TODO: LTS reset DT and NSTEP
-  ! re-sets NSTEP
-  !NSTEP = ceiling( duration / deltat_lts )
+  ! collected acceleration
+  ! only needed for seismograms and shakemaps
+  if (SAVE_SEISMOGRAMS_ACCELERATION .or. CREATE_SHAKEMAP) then
+    allocate(accel_collected(NDIM,NGLOB_AB),stat=ier)
+    if (ier /= 0) call exit_MPI(myrank,'Error allocating working LTS fields displ_p,veloc_p')
+    accel_collected(:,:) = 0.0_CUSTOM_REAL
+  endif
 
-  ! re-sets global time-step to be coarsest level
-  !DT = deltat_lts
+  ! user output
+  call synchronize_all()
+  if (myrank == 0) then
+    write(IMAIN,*) '  original time step                 : ',sngl(DT)
+    write(IMAIN,*) '           number of time steps      : ',NSTEP
+    write(IMAIN,*) '           total duration            : ',sngl(NSTEP * DT)
+    write(IMAIN,*)
+    call flush_IMAIN()
+  endif
+
+  ! re-sets simulation NSTEP
+  NSTEP = ceiling( duration / deltat_lts )
+
+  ! re-sets simulation global time step to be coarsest-level time step
+  DT = deltat_lts
 
   ! user output
   call synchronize_all()
   if (myrank == 0) then
-    write(IMAIN,*) '       final lts time step: ',sngl(DT)
-    write(IMAIN,*) '  new number of time steps: ',NSTEP
-    write(IMAIN,*) '          local time steps: ',NSTEP_LOCAL
-    write(IMAIN,*) '              new duration: ',sngl(NSTEP * DT)
+    write(IMAIN,*) '  new LTS  time step                 : ',sngl(DT)
+    write(IMAIN,*) '           number of time steps      : ',NSTEP
+    write(IMAIN,*) '           total duration            : ',sngl(NSTEP * DT)
+    write(IMAIN,*) '           number of local time steps: ',NSTEP_LOCAL
+    write(IMAIN,*)
+    write(IMAIN,*) '  theoretical speed-up value: ',sngl(lts_speedup),'(without boundary contributions)'
+    write(IMAIN,*) '  theoretical speed-up value: ',sngl(lts_speedup_with_boundary),'(with boundary contributions)'
     write(IMAIN,*)
     call flush_IMAIN()
   endif
+  call synchronize_all()
 
   ! user output
   call synchronize_all()
@@ -537,6 +533,122 @@ subroutine lts_read_databases()
 
   end subroutine lts_read_databases
 
+!
+!--------------------------------------------------------------------------------------------
+!
+
+  subroutine lts_get_theoretical_speedup(lts_speedup,lts_speedup_with_boundary)
+
+  use constants, only: IMAIN,myrank
+  use shared_parameters, only: NPROC
+  use specfem_par, only: NSPEC_AB
+
+  use specfem_par_lts
+
+  implicit none
+
+  double precision, intent(out) :: lts_speedup,lts_speedup_with_boundary
+
+  ! local parameters
+  double precision :: total_work,total_work_lts,total_work_lts_b
+  integer,dimension(:),allocatable :: num_ispec_level
+  integer :: ispec,p,ilevel,inum,ier
+
+  ! theoretical speed-up values (valid only for all elements belonging to same domain type)
+  total_work = 0.0
+  total_work_lts = 0.0
+
+  ! counts number of elements for each level
+  allocate(num_ispec_level(num_p_level),stat=ier)
+  if (ier /= 0) call exit_MPI(myrank,'Error allocating array num_ispec_level')
+  num_ispec_level(:) = 0
+
+  do ilevel = 1,num_p_level
+    p = p_level(ilevel)
+    num_ispec_level(ilevel) = count(ispec_p_refine(:) == p)
+  enddo
+
+  ! note: some partitions might not contain any element within a certain p-level
+  ! collect on master
+  if (NPROC > 1) then
+    do ilevel = 1,num_p_level
+      call sum_all_all_i(num_ispec_level(ilevel),inum)
+      num_ispec_level(ilevel) = inum
+    enddo
+  endif
+  if (minval(num_ispec_level(:)) == 0) call exit_MPI(myrank,'Error p level without element found')
+
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) '  p-level number of elements: ',num_ispec_level(:)
+    call flush_IMAIN()
+  endif
+
+  ! computes total work
+  if (myrank == 0) then
+    ! "pure" speed-up, without taking account of additional coarse/fine boundary contributions
+    do ilevel = 1,num_p_level
+      p = p_level(ilevel)
+      total_work_lts = total_work_lts + num_ispec_level(ilevel) * p
+    enddo
+    ! work without lts
+    total_work = sum( num_ispec_level(:)) * maxval(p_level(:))
+  endif
+
+  ! counts boundary elements
+  num_ispec_level(:) = 0
+  do ilevel = 1,num_p_level
+    do ispec = 1,NSPEC_AB
+      if (boundary_elem(ispec,ilevel)) then
+        num_ispec_level(ilevel) = num_ispec_level(ilevel) + 1
+      endif
+    enddo
+  enddo
+
+  ! note: some partitions might not contain any element within a certain p-level
+  ! collect on master
+  if (NPROC > 1 ) then
+    do ilevel = 1,num_p_level
+      call sum_all_all_i(num_ispec_level(ilevel),inum)
+      num_ispec_level(ilevel) = inum
+    enddo
+  endif
+
+  ! user output
+  if (myrank == 0) then
+    write(IMAIN,*) '  p-level boundary elements : ',num_ispec_level(:)
+    write(IMAIN,*)
+    call flush_IMAIN()
+  endif
+
+  ! work with additional coarse/fine boundary contributions
+  total_work_lts_b = total_work_lts
+  if (myrank == 0) then
+    do ilevel = 1,num_p_level
+      p = p_level(ilevel)
+      total_work_lts_b = total_work_lts_b + num_ispec_level(ilevel) * p
+    enddo
+  endif
+
+  ! speedup factors
+  if (total_work_lts /= 0.d0) then
+    lts_speedup = total_work / total_work_lts
+  else
+    lts_speedup = 0.d0
+  endif
+
+  ! speedup factor w/ boundary work
+  if (total_work_lts_b /= 0.d0) then
+    lts_speedup_with_boundary = total_work / total_work_lts_b
+  else
+    lts_speedup_with_boundary = 0.d0
+  endif
+
+  ! free temporary array
+  deallocate(num_ispec_level)
+
+  end subroutine lts_get_theoretical_speedup
+
 !
 !--------------------------------------------------------------------------------------------
 !
@@ -557,10 +669,13 @@ subroutine lts_setup_level_boundary()
 
   use specfem_par_lts, only: &
     p_level_ilevel_map, num_p_level, boundary_elem, iglob_p_refine, &
-    boundary_ispec, ispec_counter, ibool_from, ibool_counter, ilevel_from, &
+    num_p_level_boundary_ispec,num_p_level_boundary_nodes, p_level_boundary_node, &
+    p_level_boundary_ispec, &
+    p_level_boundary_ilevel_from, &
     num_p_level_coarser_to_update, p_level_coarser_to_update, &
-    num_interface_p_refine_ibool, interface_p_refine_ibool, p_level_iglob_end, &
-    p_level,p_level_m_loops
+    num_interface_p_refine_ibool, interface_p_refine_ibool, &
+    p_level_iglob_start, p_level_iglob_end, &
+    p_level, p_level_m_loops, current_lts_boundary_elem
 
   implicit none
 
@@ -573,12 +688,10 @@ subroutine lts_setup_level_boundary()
   integer :: ilevel, iphase, ispec_p, ispec, num_elements, iglob
   integer :: jlevel, iglob_n, ipoin, iinterface, ipoin_n, ier, ipoin_old
   integer :: p,m
-
-  logical, dimension(:), allocatable :: duplicate_check
-
-  integer :: i,j,k,icounter
+  integer :: i,j,k,icounter,inum_poin,inum_spec
   integer :: coarser_counter, coarser_counter_new, num_ipoin_coarser_neighbor
-
+  integer :: is,ie
+  integer :: p_node
   integer, dimension(:), allocatable :: interface_p_refine_ipoin
   integer, dimension(:), allocatable :: ipoin_coarser
   integer, dimension(:), allocatable :: recv_interface_p_refine_ipoin
@@ -590,6 +703,8 @@ subroutine lts_setup_level_boundary()
   integer, dimension(:,:), allocatable :: tmp_p_level_coarser_nodes
   integer, dimension(:), allocatable :: num_p_level_coarser_nodes
 
+  logical, dimension(:), allocatable :: mask_ibool
+
   ! timing
   logical, parameter :: DEBUG_TIMING = .false.
   double precision, external :: wtime
@@ -604,7 +719,6 @@ subroutine lts_setup_level_boundary()
   ! user output
   if (myrank == 0) then
     write(IMAIN,*) '  p-level boundary setup:'
-    write(IMAIN,*)
     call flush_IMAIN()
   endif
   call synchronize_all()
@@ -612,29 +726,30 @@ subroutine lts_setup_level_boundary()
   ! get MPI starting time
   if (DEBUG_TIMING) time_start = wtime()
 
-  ! temporary arrays
-  allocate(boundary_ispec(NSPEC_AB,2,num_p_level),stat=ier)
-  if (ier /= 0) stop 'Error allocating arrays boundary_ispec,...'
-  boundary_ispec(:,:,:) = 0
+  ! boundary arrays
+  allocate(p_level_boundary_ispec(NSPEC_AB,2,num_p_level),stat=ier)
+  if (ier /= 0) stop 'Error allocating arrays p_level_boundary_ispec,...'
+  p_level_boundary_ispec(:,:,:) = 0
 
-  allocate(ibool_from(NGLOB_AB,2,num_p_level),stat=ier)
-  if (ier /= 0) stop 'Error allocating arrays ibool_from,...'
-  ibool_from(:,:,:) = 0
+  allocate(p_level_boundary_node(NGLOB_AB,2,num_p_level),stat=ier)
+  if (ier /= 0) stop 'Error allocating arrays p_level_boundary_node,...'
+  p_level_boundary_node(:,:,:) = 0
 
-  allocate(ilevel_from(NGLOB_AB,2,num_p_level),stat=ier)
-  if (ier /= 0) stop 'Error allocating arrays ilevel_from,...'
-  ilevel_from(:,:,:) = 0
+  allocate(p_level_boundary_ilevel_from(NGLOB_AB,2,num_p_level),stat=ier)
+  if (ier /= 0) stop 'Error allocating arrays p_level_boundary_ilevel_from,...'
+  p_level_boundary_ilevel_from(:,:,:) = 0
 
-  allocate(duplicate_check(NGLOB_AB),stat=ier)
-  if (ier /= 0) stop 'Error allocating arrays duplicate_check,...'
-  duplicate_check(:) = .true.
+  ! counters
+  allocate(num_p_level_boundary_nodes(2,num_p_level), &
+           num_p_level_boundary_ispec(2,num_p_level),stat=ier)
+  if (ier /= 0) stop 'Error allocating arrays num_p_level_boundary_nodes,...'
+  num_p_level_boundary_nodes(:,:) = 0
+  num_p_level_boundary_ispec(:,:) = 0
 
-  ! counters needed for GPU mode
-  allocate(ibool_counter(2,num_p_level), &
-           ispec_counter(2,num_p_level),stat=ier)
-  if (ier /= 0) stop 'Error allocating arrays ibool_counter,...'
-  ibool_counter(:,:) = 0
-  ispec_counter(:,:) = 0
+  ! temporary arrays
+  allocate(mask_ibool(NGLOB_AB),stat=ier)
+  if (ier /= 0) stop 'Error allocating arrays mask_ibool'
+  mask_ibool(:) = .false.
 
   ! temporary arrays for nodes in coarser p-levels
   allocate(p_level_coarser_nodes(NGLOB_AB,num_p_level),stat=ier)
@@ -647,7 +762,7 @@ subroutine lts_setup_level_boundary()
 
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) "  determining coarser p-level nodes"
+    write(IMAIN,*) "    determining coarser p-level nodes"
     call flush_IMAIN()
   endif
   call synchronize_all()
@@ -658,10 +773,13 @@ subroutine lts_setup_level_boundary()
 
   do ilevel = 1,num_p_level
 
+    ! current boundary elements
+    current_lts_boundary_elem => boundary_elem(:,ilevel)
+
     do iphase = 1,2
 
-      ibool_counter(iphase,ilevel) = 0
-      ispec_counter(iphase,ilevel) = 0
+      num_p_level_boundary_nodes(iphase,ilevel) = 0
+      num_p_level_boundary_ispec(iphase,ilevel) = 0
 
       ! choses inner/outer elements
       if (iphase == 1) then
@@ -670,29 +788,47 @@ subroutine lts_setup_level_boundary()
         num_elements = nspec_inner_elastic
       endif
 
+      ! counters
+      inum_spec = 0
+      inum_poin = 0
+      mask_ibool(:) = .false.
+
       do ispec_p = 1,num_elements
 
         ispec = phase_ispec_inner_elastic(ispec_p,iphase)
 
         ! only elements belonging to a p-level boundary
-        if (boundary_elem(ispec,ilevel) .eqv. .true.) then
-          ispec_counter(iphase,ilevel) = ispec_counter(iphase,ilevel) + 1
-          boundary_ispec(ispec_counter(iphase,ilevel),iphase,ilevel) = ispec
+        if (current_lts_boundary_elem(ispec)) then
+          ! boundary elements
+          inum_spec = inum_spec + 1
+
+          num_p_level_boundary_ispec(iphase,ilevel) = inum_spec
+          p_level_boundary_ispec(inum_spec,iphase,ilevel) = ispec
 
           do k = 1,NGLLZ
             do j = 1,NGLLY
               do i = 1,NGLLX
                 iglob = ibool(i,j,k,ispec)
-                p = iglob_p_refine(iglob)
 
-                ibool_counter(iphase,ilevel) = ibool_counter(iphase,ilevel) + 1
-                ibool_from(ibool_counter(iphase,ilevel),iphase,ilevel) = iglob
-                ilevel_from(ibool_counter(iphase,ilevel),iphase,ilevel) = p_level_ilevel_map(p)
+                ! associated p-level value
+                p_node = iglob_p_refine(iglob)
+
+                ! add global point only once
+                if (.not. mask_ibool(iglob)) then
+                  mask_ibool(iglob) = .true.
+                  ! counter
+                  inum_poin = inum_poin + 1
 
-                ! adds node
-                if (p_level_ilevel_map(p) /= ilevel) then
-                  num_p_level_coarser_nodes(ilevel) = num_p_level_coarser_nodes(ilevel) + 1
-                  p_level_coarser_nodes(num_p_level_coarser_nodes(ilevel),ilevel) = iglob
+                  num_p_level_boundary_nodes(iphase,ilevel) = inum_poin
+                  p_level_boundary_node(inum_poin,iphase,ilevel) = iglob
+
+                  p_level_boundary_ilevel_from(inum_poin,iphase,ilevel) = p_level_ilevel_map(p_node)
+
+                  ! adds node to coarser nodes
+                  if (p_level_ilevel_map(p_node) /= ilevel) then
+                    num_p_level_coarser_nodes(ilevel) = num_p_level_coarser_nodes(ilevel) + 1
+                    p_level_coarser_nodes(num_p_level_coarser_nodes(ilevel),ilevel) = iglob
+                  endif
                 endif
               enddo
             enddo
@@ -704,7 +840,7 @@ subroutine lts_setup_level_boundary()
 
   ! gets maximum number of coarser nodes
   if (myrank == 0) then
-    write(IMAIN,*) "  maximum coarser nodes: ",maxval(num_p_level_coarser_nodes(:))
+    write(IMAIN,*) "    maximum coarser nodes: ",maxval(num_p_level_coarser_nodes(:))
     call flush_IMAIN()
   endif
   call synchronize_all()
@@ -736,15 +872,15 @@ subroutine lts_setup_level_boundary()
   ! loops over finer p-levels
   do ilevel = 1,(num_p_level-1)
     ! checks if nodes have been taken multiple times from coarser p-boundary already
-    duplicate_check(:) = .true.
+    mask_ibool(:) = .false.
     ! loops over all levels up to current one
     do jlevel = 1,ilevel
       do iglob_n = 1,num_p_level_coarser_nodes(jlevel)
         ! checks if node belongs to a coarser level
         iglob = tmp_p_level_coarser_nodes(iglob_n,jlevel)
-        p = iglob_p_refine(iglob)
+        p_node = iglob_p_refine(iglob)
 
-        if (p_level_ilevel_map(p) > ilevel .and. duplicate_check(iglob)) then
+        if (p_level_ilevel_map(p_node) > ilevel .and. (.not. mask_ibool(iglob))) then
           ! adds node
           icounter = num_p_level_coarser_to_update(ilevel)
           icounter = icounter + 1
@@ -752,7 +888,7 @@ subroutine lts_setup_level_boundary()
           num_p_level_coarser_to_update(ilevel) = icounter
           p_level_coarser_to_update(icounter,ilevel) = iglob
 
-          duplicate_check(iglob) = .false.
+          mask_ibool(iglob) = .true.
         endif
       enddo
     enddo
@@ -761,13 +897,13 @@ subroutine lts_setup_level_boundary()
   ! frees temporary arrays
   deallocate(tmp_p_level_coarser_nodes)
   deallocate(num_p_level_coarser_nodes)
-  deallocate(duplicate_check)
+  deallocate(mask_ibool)
 
   ! user output
   if (DEBUG_TIMING) then
     if (myrank == 0 ) then
       tCPU = wtime() - time_start
-      write(IMAIN,*) "   time in seconds = ", tCPU
+      write(IMAIN,*) "    time in seconds = ", tCPU
       call flush_IMAIN()
     endif
     time_start = wtime()
@@ -775,7 +911,7 @@ subroutine lts_setup_level_boundary()
 
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) "  building MPI arrays"
+    write(IMAIN,*) "    building MPI arrays"
     call flush_IMAIN()
   endif
   call synchronize_all()
@@ -801,14 +937,14 @@ subroutine lts_setup_level_boundary()
   if (TEST_ERROR) then
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "  testing LTS arrays"
+      write(IMAIN,*) "      testing LTS initial MPI arrays"
       call flush_IMAIN()
     endif
     call synchronize_all()
 
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "    checking initial interface locations"
+      write(IMAIN,*) "      checking initial interface locations"
       call flush_IMAIN()
     endif
 
@@ -876,7 +1012,7 @@ subroutine lts_setup_level_boundary()
 
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "  test result okay"
+      write(IMAIN,*) "      test result okay"
       call flush_IMAIN()
     endif
     call synchronize_all()
@@ -893,7 +1029,7 @@ subroutine lts_setup_level_boundary()
   do ilevel = 1,num_p_level-1
     ! user output
     if (myrank == 0 ) then
-      write(IMAIN,*) "  MPI interfaces for finer p-levels: level = ",ilevel," p-value = ",p_level(ilevel)
+      write(IMAIN,*) "    MPI interfaces for finer p-levels: level = ",ilevel," p-value = ",p_level(ilevel)
       call flush_IMAIN()
     endif
     if (DEBUG_TIMING) time_start = wtime()
@@ -1041,7 +1177,7 @@ subroutine lts_setup_level_boundary()
                          " out of ", num_interfaces_ext_mesh,"interfaces"
           if (DEBUG_TIMING) then
             tCPU = wtime() - time_start
-            write(IMAIN,*) "  time in seconds = ", tCPU,"s"
+            write(IMAIN,*) "    time in seconds = ", tCPU,"s"
           endif
           ! flushes file buffer for main output file (IMAIN)
           call flush_IMAIN()
@@ -1055,18 +1191,34 @@ subroutine lts_setup_level_boundary()
   ! frees temporary array
   deallocate(interface_p_refine_ipoin)
 
+  ! checks p_level_coarser_to_update
+  do ilevel = 1,num_p_level
+    ! gets start index of finest level and end index of current p-level
+    is = p_level_iglob_start(1)
+    ie = p_level_iglob_end(ilevel)
+    if (ilevel < num_p_level) then
+      ! considers contributions from coarser to finer p-levels
+      do iglob_n = 1,num_p_level_coarser_to_update(ilevel)
+        iglob = p_level_coarser_to_update(iglob_n,ilevel)
+        ! checks
+        if (iglob < ie) call exit_mpi(myrank,"ASSERT: coarser iglob should start in next coarser level")
+        if (iglob > NGLOB_AB) call exit_mpi(myrank,"ASSERT: coarser iglob index is out of bounds!")
+      enddo
+    endif
+  enddo
+
   ! tests MPI interface arrays
   if (TEST_ERROR) then
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "  testing LTS arrays"
+      write(IMAIN,*) "      testing LTS MPI arrays"
       call flush_IMAIN()
     endif
     call synchronize_all()
 
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "    checking iglob index range"
+      write(IMAIN,*) "      checking iglob index range"
       call flush_IMAIN()
     endif
 
@@ -1095,7 +1247,7 @@ subroutine lts_setup_level_boundary()
 
     ! user output
     if (myrank == 0) then
-      write(IMAIN,*) "    checking interface locations"
+      write(IMAIN,*) "      checking interface locations"
       call flush_IMAIN()
     endif
     call synchronize_all()
@@ -1162,11 +1314,11 @@ subroutine lts_setup_level_boundary()
 
     ! user output
     if (myrank == 0) then
+      write(IMAIN,*) "      test result okay"
       if (DEBUG_TIMING) then
         tCPU = wtime() - time_start
-        write(IMAIN,*) "   time in seconds = ", tCPU
+        write(IMAIN,*) "      time in seconds = ", tCPU
       endif
-      write(IMAIN,*) "  test result okay"
       call flush_IMAIN()
     endif
     call synchronize_all()
@@ -1193,6 +1345,51 @@ subroutine lts_setup_level_boundary()
     p_level_m_loops(m) = p_level(m)/p_level(m+1)
   enddo
 
+  ! checks boundary node p-values
+  do ilevel = 1,num_p_level
+    ! current boundary elements
+    current_lts_boundary_elem => boundary_elem(:,ilevel)
+
+    ! p (dt/p) refinement number in this specified p-level
+    p = p_level(ilevel)
+
+    ! inner/outer elements
+    do iphase = 1,2
+      ! choses inner/outer elements
+      if (iphase == 1) then
+        num_elements = nspec_outer_elastic
+      else
+        num_elements = nspec_inner_elastic
+      endif
+
+      do ispec_p = 1,num_elements
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! only elements belonging to p-level boundary
+        if (.not. current_lts_boundary_elem(ispec)) cycle
+
+        ! checks element nodes
+        do k = 1,NGLLZ
+          do j = 1,NGLLY
+            do i = 1,NGLLX
+              iglob = ibool(i,j,k,ispec)
+              ! checks if node belongs to this or coarser p-level
+              p_node = iglob_p_refine(iglob)
+              if (p_node > p) then
+                ! coarser p-levels have smaller p values, finer p-levels have larger p values,
+                ! such that local time step delta_lts==DT/p
+                print *,'Error: boundary node p value is invalid: ',p_node,' on level',ilevel,'p',p
+                print *,'       iglob ',iglob,'i/j/k/ispec',i,j,k,ispec
+                call exit_mpi(myrank,"Error: invalid p-value node; Assert(boundary nodes are this level or coarser only)")
+              endif
+            enddo
+          enddo
+        enddo
+      enddo ! ispec
+    enddo ! iphase
+  enddo ! ilevel
+
   ! computes true cost of MPI in terms of total elements sent
   mpi_cost = 0
   call lts_mpi_cost(num_p_level,mpi_cost)
@@ -1206,9 +1403,9 @@ subroutine lts_setup_level_boundary()
 
   ! user output
   if (myrank == 0) then
-    write(IMAIN,*) "  Communication cost: sum(mpi_cost)=",sum(mpi_cost_gather), &
-                   " avg(mpi_cost)=",sngl(sum(mpi_cost_gather)/dble(NPROC))
-    write(IMAIN,*) "  p-level boundary setup done"
+    write(IMAIN,*) "    Communication cost: sum(mpi_cost)          = ",sum(mpi_cost_gather)
+    write(IMAIN,*) "                        avg(mpi_cost per proc) = ",sngl(sum(mpi_cost_gather)/dble(NPROC))
+    write(IMAIN,*) "    p-level boundary setup done"
     write(IMAIN,*)
     call flush_IMAIN()
   endif
@@ -1217,9 +1414,6 @@ subroutine lts_setup_level_boundary()
   ! frees temporary arrays
   deallocate(mpi_cost_gather)
   deallocate(ipoin_coarser)
-  deallocate(boundary_ispec)
-  deallocate(ibool_from)
-  deallocate(ilevel_from)
 
   contains
 
@@ -1372,17 +1566,10 @@ subroutine lts_prepare_mass_matrices()
       rmassxyz_mod(2,:) = rmass(:) + rmassy(:)
       rmassxyz_mod(3,:) = rmass(:) + rmassz(:)
 
-      !#TODO: LTS reset mass matrix in Stacey case
       ! re-sets mass matrices without contributions
-      !rmassx(:) = rmass(:)
-      !rmassy(:) = rmass(:)
-      !rmassz(:) = rmass(:)
-      ! no LTS case - same result as original prepare_timerun_mass_matrices() routine
-      ! where final rmassx,.. contains both contributions from rmass + initial rmassx, ..
-      ! to be taken out and replaces with above outcommented lines when LTS is fully implemented:
-      rmassx(:) = rmass(:)  + rmassx(:)
-      rmassy(:) = rmass(:)  + rmassy(:)
-      rmassz(:) = rmass(:)  + rmassz(:)
+      rmassx(:) = rmass(:)
+      rmassy(:) = rmass(:)
+      rmassz(:) = rmass(:)
 
     else
       ! no absorbing boundary contributions
@@ -1648,7 +1835,8 @@ subroutine lts_prepare_gpu()
 !  call transfer_element_list_to_device(Mesh_pointer,element_list,num_element_list)
 !
 !  ! transfer ibool and ilevel from CPU to GPU
-!  call transfer_boundary_element_list_to_device(Mesh_pointer,ibool_from,ilevel_from,boundary_ispec)
+!  call transfer_boundary_element_list_to_device(Mesh_pointer,p_level_boundary_node, &
+!                                                p_level_boundary_ilevel_from,p_level_boundary_ispec)
 !
 !  ! sends list for coarser nodes to GPU
 !  call setup_r_boundaries_time_stepping(Mesh_pointer,p_level_coarser_to_update)
diff --git a/src/specfem3D/noise_tomography.f90 b/src/specfem3D/noise_tomography.f90
index eb2206e20..0c803d9a1 100644
--- a/src/specfem3D/noise_tomography.f90
+++ b/src/specfem3D/noise_tomography.f90
@@ -709,9 +709,9 @@ end subroutine compute_arrays_source_noise
 ! step 1: calculate the "ensemble forward source"
 ! add noise spectrum to the location of main receiver
   subroutine add_source_main_rec_noise(nrec,NSTEP,accel,noise_sourcearray, &
-                                         ibool,islice_selected_rec,ispec_selected_rec, &
-                                         it,irec_main_noise, &
-                                         nspec,nglob)
+                                       ibool,islice_selected_rec,ispec_selected_rec, &
+                                       it,irec_main_noise, &
+                                       nspec,nglob)
 
   use constants, only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM,myrank
 
diff --git a/src/specfem3D/read_mesh_databases_hdf5.F90 b/src/specfem3D/read_mesh_databases_hdf5.F90
index 939a29414..0f16f3622 100644
--- a/src/specfem3D/read_mesh_databases_hdf5.F90
+++ b/src/specfem3D/read_mesh_databases_hdf5.F90
@@ -1729,7 +1729,7 @@ subroutine read_mesh_databases_moho_hdf5()
 
   implicit none
 
-  !#TODO: hdf5 moho version not implemented yet
+  !#TODO: HDF5 moho version not implemented yet
   stop 'HDF5_ENABLED version of routine read_mesh_databases_moho_hdf5() not implemented yet'
 
   end subroutine read_mesh_databases_moho_hdf5
diff --git a/src/specfem3D/rules.mk b/src/specfem3D/rules.mk
index 3270a48a2..76595154a 100644
--- a/src/specfem3D/rules.mk
+++ b/src/specfem3D/rules.mk
@@ -98,6 +98,10 @@ specfem3D_OBJECTS = \
 	$O/locate_point.spec.o \
 	$O/locate_receivers.spec.o \
 	$O/locate_source.spec.o \
+	$O/lts_assemble_MPI_vector.spec.o \
+	$O/lts_global_step.spec.o \
+	$O/lts_iterate_time.spec.o \
+	$O/lts_newmark_update.spec.o \
 	$O/lts_setup.spec.o \
 	$O/make_gravity.spec.o \
 	$O/noise_tomography.spec.o \
@@ -336,6 +340,7 @@ $O/fault_solver_dynamic.spec.o: $O/fault_solver_common.spec.o
 $O/fault_solver_kinematic.spec.o: $O/fault_solver_common.spec.o
 $O/compute_forces_viscoelastic.spec.o: $O/fault_solver_dynamic.spec.o
 $O/compute_forces_viscoelastic_calling_routine.spec.o: $O/fault_solver_dynamic.spec.o $O/fault_solver_kinematic.spec.o
+$O/finalize_simulation.spec.o: $O/fault_solver_dynamic.spec.o $O/fault_solver_kinematic.spec.o
 
 $O/prepare_timerun.spec.o: $O/fault_solver_dynamic.spec.o $O/fault_solver_kinematic.spec.o
 $O/prepare_gpu.spec.o: $O/fault_solver_dynamic.spec.o $O/fault_solver_kinematic.spec.o
@@ -368,6 +373,10 @@ $O/write_movie_output.spec.o: $O/hdf5_io_server.spec_hdf5.o
 $O/write_movie_output_HDF5.spec_hdf5.o: $O/hdf5_io_server.spec_hdf5.o
 $O/write_output_HDF5.spec_hdf5.o: $O/hdf5_io_server.spec_hdf5.o
 
+## LTS
+$O/lts_iterate_time.spec.o: $O/gravity_perturbation.spec.o $O/hdf5_io_server.spec_hdf5.o
+$O/lts_global_step.spec.o: $O/fault_solver_dynamic.spec.o $O/fault_solver_kinematic.spec.o
+
 ####
 #### rule to build each .o file below
 ####
diff --git a/src/specfem3D/setup_sources_receivers.f90 b/src/specfem3D/setup_sources_receivers.f90
index 81e30e458..6769ca8cb 100644
--- a/src/specfem3D/setup_sources_receivers.f90
+++ b/src/specfem3D/setup_sources_receivers.f90
@@ -372,8 +372,10 @@ subroutine get_run_number_of_the_source()
       write(IMAIN,*)
       ! warning
       if (NSOURCES /= NB_RUNS_ACOUSTIC_GPU) then
+        write(IMAIN,*) '  ***'
         write(IMAIN,*) '  *** WARNING: number of sources ',NSOURCES, &
                        ' does not match number of runs ',NB_RUNS_ACOUSTIC_GPU,' ***'
+        write(IMAIN,*) '  ***'
         write(IMAIN,*)
       endif
       call flush_IMAIN()
@@ -1655,7 +1657,11 @@ subroutine setup_receivers_precompute_intp()
     enddo
     ! warning
     if (has_receiver_in_elastic_domain) then
-      print *, "Warning: Pressure seismogram output for receivers in elastic domains is valid only for non-attenuation case"
+      write(IMAIN,*) '  ***'
+      write(IMAIN,*) '  *** Warning: Pressure seismogram output for receivers in elastic domains&
+                                     &is valid only for non-attenuation case ***'
+      write(IMAIN,*) '  ***'
+      call flush_IMAIN()
     endif
   endif
 
diff --git a/src/specfem3D/specfem3D_par.F90 b/src/specfem3D/specfem3D_par.F90
index b12c3dcfe..d39425574 100644
--- a/src/specfem3D/specfem3D_par.F90
+++ b/src/specfem3D/specfem3D_par.F90
@@ -782,12 +782,14 @@ module specfem_par_lts
 
   implicit none
 
+  ! current lts time
+  double precision :: current_lts_time
+
   ! suggested coarsest time step for LTS (largest multiple p of smallest time step)
   double precision :: deltat_lts_suggested
 
   ! LTS intermediate arrays, one NGLOB_AB*3 per level
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: displ_p,veloc_p,accel_p
-  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ_ref,veloc_ref,accel_ref
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: displ_p,veloc_p
 
   ! p-refinement level arrays
   integer :: num_p_level
@@ -805,33 +807,43 @@ module specfem_par_lts
   integer, dimension(:), allocatable :: ispec_p_refine
   integer, dimension(:,:), allocatable :: interface_p_refine_all
 
+  ! lts call type indicator for compute forces: true == p-element call, false == boundary-element call
+  logical :: lts_type_compute_pelem
+
   ! p_elem(ispec,p_level) = (p == ispec_p_refine(ispec,p))
-  logical, dimension(:,:), allocatable :: p_elem
+  logical, dimension(:,:), allocatable, target :: p_elem
+
+  ! element in current p-level (points to p_elem(:,ilevel)
+  logical, dimension(:), pointer :: current_lts_elem => null()
 
   ! boundary_elem = (p_elem(ispec,p_level) == .true.) .and. (some element nodes are in different level)
   ! Note: p-levels are fine-greedy. Finer levels take an element
-  ! for themselves when sharing element-boundary-nodes.
-  logical, dimension(:,:), allocatable :: boundary_elem
+  !       for themselves when sharing element-boundary-nodes.
+  logical, dimension(:,:), allocatable, target :: boundary_elem
+
+  ! element in current boundary_elem (points to boundary_elem(:,ilevel)
+  logical, dimension(:), pointer :: current_lts_boundary_elem => null()
 
   ! dofs are grouped by p-level for efficiency of time-stepping vector additions.
   integer, dimension(:), allocatable :: p_level_iglob_start
   integer, dimension(:), allocatable :: p_level_iglob_end
-  ! Q-R; Coarse region minus halo from fine region
-  integer, dimension(:), allocatable :: p_level_iglob_inner_end
 
-  ! boundary counters/maps for GPU_MODE
-  integer, dimension(:,:,:), allocatable :: boundary_ispec
-  integer, dimension(:,:,:), allocatable :: ibool_from
-  integer, dimension(:,:,:), allocatable :: ilevel_from
-  integer, dimension(:,:), allocatable :: ibool_counter
-  integer, dimension(:,:), allocatable :: ispec_counter
+  ! Q-R; Coarse region minus halo from fine region
+  !integer, dimension(:), allocatable :: p_level_iglob_inner_end
 
-  integer :: it_local,NSTEP_LOCAL
+  integer :: lts_it_local
+  integer :: NSTEP_LOCAL
 
   ! boundary element nodes -- used to update the degrees of freedom on a p-level boundary
   ! equivalent to R and R*
-  integer, dimension(:), allocatable :: num_p_level_boundary_nodes
-  integer, dimension(:,:), allocatable :: p_level_boundary_node
+  ! boundary counters/maps
+  integer, dimension(:,:), allocatable :: num_p_level_boundary_nodes
+  integer, dimension(:,:), allocatable :: num_p_level_boundary_ispec
+
+  integer, dimension(:,:,:), allocatable :: p_level_boundary_ispec
+  integer, dimension(:,:,:), allocatable :: p_level_boundary_node
+  integer, dimension(:,:,:), allocatable :: p_level_boundary_ilevel_from
+
   integer, dimension(:), allocatable :: p_level_ilevel_map
 
   integer, dimension(:), allocatable :: p_level_m_loops
@@ -848,9 +860,13 @@ module specfem_par_lts
   integer, dimension(:,:), allocatable :: interface_p_refine_boundary
   integer :: max_nibool_interfaces_boundary
 
-  ! adaptive error
-  real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: accel_adap_old
-  real(kind=CUSTOM_REAL) :: relative_error,error_norm
+  ! reference solution wavefields for debugging
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ_ref,veloc_ref,accel_ref
+  ! global step reference element flags
+  logical,dimension(:), allocatable, target :: p_elem_ref,boundary_elem_ref
+
+  ! collected acceleration wavefield
+  real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: accel_collected
 
   ! for stacey absorbing boundary conditions
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: cmassxyz, rmassxyz
diff --git a/src/specfem3D/write_movie_output_HDF5.F90 b/src/specfem3D/write_movie_output_HDF5.F90
index 87548c0ef..a2ac4db43 100644
--- a/src/specfem3D/write_movie_output_HDF5.F90
+++ b/src/specfem3D/write_movie_output_HDF5.F90
@@ -725,10 +725,7 @@ subroutine write_xdmf_surface_body(it_io, num_nodes)
   ! checks if anything do, only main process writes out xdmf file
   if (myrank /= 0) return
 
-  !#TODO: hdf5 i/o server
-  ! redefinition for no ioserver case
-  !if (fname_xdmf_surf == '' ) fname_xdmf_surf = trim(OUTPUT_FILES)//"/movie_surface.xmf"
-
+  ! append data section to xdmf file for surface movie
   fname_xdmf_surf = trim(OUTPUT_FILES)//"/movie_surface.xmf"
   fname_h5_data_surf_xdmf = "./movie_surface.h5"    ! relative to movie_surface.xmf file
   ! this seems to point to a wrong directory:
diff --git a/tests/decompose_mesh/test_partitioning.f90 b/tests/decompose_mesh/test_partitioning.f90
index 1cdc97012..434609825 100644
--- a/tests/decompose_mesh/test_partitioning.f90
+++ b/tests/decompose_mesh/test_partitioning.f90
@@ -6,6 +6,7 @@ program test_partitioning
 
   use constants, only: NDIM
   use decompose_mesh_par
+  use fault_scotch, only: ANY_FAULT
 
   implicit none
   integer :: i,ispec,inode,ier
diff --git a/tests/decompose_mesh/test_partitioning.makefile b/tests/decompose_mesh/test_partitioning.makefile
index 11f134b9c..1fce64755 100644
--- a/tests/decompose_mesh/test_partitioning.makefile
+++ b/tests/decompose_mesh/test_partitioning.makefile
@@ -22,11 +22,13 @@ OBJECTS = \
 	$O/wrap_patoh.o \
 	$O/wrap_metis.o \
 	$O/param_reader.cc.o \
-	$O/exit_mpi.shared.o \
-	$O/shared_par.shared_module.o \
   $O/count_number_of_sources.shared.o \
+	$O/exit_mpi.shared.o \
+	$O/heap_sort.shared.o \
 	$O/read_parameter_file.shared.o \
 	$O/read_value_parameters.shared.o \
+	$O/search_kdtree.shared.o \
+	$O/shared_par.shared_module.o \
 	$O/sort_array_coordinates.shared.o \
 	$O/write_VTK_data.shared.o \
 	$(EMPTY_MACRO)
diff --git a/tests/decompose_mesh/test_read.makefile b/tests/decompose_mesh/test_read.makefile
index 25a402f7a..b667a465e 100644
--- a/tests/decompose_mesh/test_read.makefile
+++ b/tests/decompose_mesh/test_read.makefile
@@ -23,10 +23,12 @@ OBJECTS = \
 	$O/param_reader.cc.o \
   $O/count_number_of_sources.shared.o \
 	$O/exit_mpi.shared.o \
-	$O/shared_par.shared_module.o \
+	$O/heap_sort.shared.o \
 	$O/read_parameter_file.shared.o \
 	$O/read_value_parameters.shared.o \
-	$O/sort_array_coordinates.shared.o \
+	$O/search_kdtree.shared.o \
+	$O/shared_par.shared_module.o \
+  $O/sort_array_coordinates.shared.o \
 	$O/write_VTK_data.shared.o \
 	$(EMPTY_MACRO)
 
diff --git a/tests/decompose_mesh/test_valence.makefile b/tests/decompose_mesh/test_valence.makefile
index 5051a5d1b..3a03ad267 100644
--- a/tests/decompose_mesh/test_valence.makefile
+++ b/tests/decompose_mesh/test_valence.makefile
@@ -22,11 +22,13 @@ OBJECTS = \
 	$O/wrap_patoh.o \
 	$O/wrap_metis.o \
 	$O/param_reader.cc.o \
-	$O/exit_mpi.shared.o \
-	$O/shared_par.shared_module.o \
   $O/count_number_of_sources.shared.o \
+	$O/exit_mpi.shared.o \
+	$O/heap_sort.shared.o \
 	$O/read_parameter_file.shared.o \
 	$O/read_value_parameters.shared.o \
+	$O/search_kdtree.shared.o \
+	$O/shared_par.shared_module.o \
 	$O/sort_array_coordinates.shared.o \
 	$O/write_VTK_data.shared.o \
 	$(EMPTY_MACRO)