From b774e3c0bc9c7fca1d79709341638d4ff83add77 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Fri, 6 Jul 2012 13:58:19 +0000 Subject: [PATCH] adds GPU functionality to SPECFEM3D solver (requires to rerun ./configure); merges GPU branch modifications to main trunk; adds src/cuda directory & files; updates Par_files; renames compute_forces_elastic.F90 and prepare_timerun.F90 --- AUTHORS | 1 + config.h.in | 6 + configure | 194 +- configure.ac | 35 +- in_data_files/Par_file | 3 + src/cuda/check_fields_cuda.cu | 566 +++ src/cuda/compute_add_sources_acoustic_cuda.cu | 370 ++ src/cuda/compute_add_sources_elastic_cuda.cu | 422 ++ src/cuda/compute_coupling_cuda.cu | 345 ++ src/cuda/compute_forces_acoustic_cuda.cu | 966 +++++ src/cuda/compute_forces_elastic_cuda.cu | 2255 +++++++++++ src/cuda/compute_kernels_cuda.cu | 646 +++ src/cuda/compute_stacey_acoustic_cuda.cu | 192 + src/cuda/compute_stacey_elastic_cuda.cu | 214 + src/cuda/it_update_displacement_cuda.cu | 228 ++ src/cuda/mesh_constants_cuda.h | 467 +++ src/cuda/noise_tomography_cuda.cu | 305 ++ src/cuda/prepare_constants_cuda.h | 99 + src/cuda/prepare_mesh_constants_cuda.cu | 2037 ++++++++++ src/cuda/save_and_compare_cpu_vs_gpu.c | 318 ++ src/cuda/specfem3D_gpu_cuda_method_stubs.c | 805 ++++ .../specfem3D_gpu_cuda_method_stubs.c.bak | 805 ++++ src/cuda/transfer_fields_cuda.cu | 586 +++ src/cuda/write_seismograms_cuda.cu | 537 +++ .../decompose_mesh_SCOTCH.f90 | 44 +- .../part_decompose_mesh_SCOTCH.f90 | 26 +- .../program_decompose_mesh_SCOTCH.f90 | 3 + src/generate_databases/Makefile.in | 17 +- .../create_mass_matrices.f90 | 14 +- .../create_regions_mesh.f90 | 855 +++- src/generate_databases/generate_databases.f90 | 15 +- .../get_coupling_surfaces.f90 | 1419 ++----- src/generate_databases/get_global.f90 | 4 +- src/generate_databases/get_model.f90 | 143 +- src/generate_databases/get_perm_color.f90 | 1210 ++++++ src/generate_databases/memory_eval.f90 | 10 +- src/generate_databases/model_1d_cascadia.f90 | 10 +- src/generate_databases/model_1d_prem.f90 | 129 +- src/generate_databases/model_1d_socal.f90 | 12 +- src/generate_databases/model_aniso.f90 | 23 + src/generate_databases/model_default.f90 | 20 +- .../model_external_values.f90 | 12 +- src/generate_databases/model_gll.f90 | 28 +- src/generate_databases/model_ipati.f90 | 113 + .../model_salton_trough.f90 | 40 +- src/generate_databases/model_tomography.f90 | 7 +- src/generate_databases/save_arrays_solver.f90 | 571 +-- src/meshfem3D/check_mesh_quality.f90 | 22 +- src/meshfem3D/compute_parameters.f90 | 2 +- src/meshfem3D/create_regions_mesh.f90 | 4 +- src/meshfem3D/create_visual_files.f90 | 8 +- src/meshfem3D/meshfem3D.f90 | 2 +- src/meshfem3D/store_boundaries.f90 | 6 +- src/shared/assemble_MPI_scalar.f90 | 1 - src/shared/check_mesh_resolution.f90 | 11 +- src/shared/combine_vol_data.f90 | 10 +- src/shared/compute_arrays_source.f90 | 352 ++ src/shared/constants.h.in | 90 +- src/shared/detect_surface.f90 | 6 +- src/shared/get_element_face.f90 | 66 +- src/shared/param_reader.c | 2 +- src/shared/read_parameter_file.f90 | 74 +- src/shared/read_topo_bathy_file.f90 | 92 +- src/shared/serial.f90 | 11 +- src/shared/smooth_vol_data.f90 | 40 +- src/shared/sum_kernels.f90 | 76 +- src/shared/write_VTK_data.f90 | 18 +- src/shared/write_c_binary.c | 69 +- src/specfem3D/Makefile.in | 518 +-- src/specfem3D/assemble_MPI_vector.f90 | 475 ++- .../compute_add_sources_acoustic.f90 | 602 +-- src/specfem3D/compute_add_sources_elastic.f90 | 542 +-- src/specfem3D/compute_coupling_elastic_po.f90 | 8 +- .../compute_coupling_poroelastic_el.f90 | 8 +- src/specfem3D/compute_forces_acoustic.f90 | 290 +- ...elastic.f90 => compute_forces_elastic.F90} | 458 ++- src/specfem3D/compute_forces_elastic_Dev.f90 | 3454 +--------------- src/specfem3D/compute_forces_elastic_Dev2.f90 | 3477 +++++++++++++++++ .../compute_forces_elastic_Dev_openmp.f90 | 786 ++++ src/specfem3D/compute_gradient.f90 | 15 +- src/specfem3D/compute_kernels.f90 | 268 +- src/specfem3D/compute_stacey_acoustic.f90 | 85 +- src/specfem3D/compute_stacey_elastic.f90 | 106 +- src/specfem3D/create_color_image.f90 | 163 +- src/specfem3D/finalize_simulation.f90 | 4 +- src/specfem3D/initialize_simulation.f90 | 74 +- src/specfem3D/iterate_time.f90 | 383 +- src/specfem3D/locate_receivers.f90 | 27 +- src/specfem3D/locate_source.f90 | 14 +- src/specfem3D/make_gravity.f90 | 680 ++++ src/specfem3D/noise_tomography.f90 | 395 +- ...repare_timerun.f90 => prepare_timerun.F90} | 410 +- src/specfem3D/read_mesh_databases.f90 | 80 +- src/specfem3D/setup_GLL_points.f90 | 6 +- src/specfem3D/setup_sources_receivers.f90 | 233 +- src/specfem3D/specfem3D_par.f90 | 95 +- src/specfem3D/write_movie_output.f90 | 54 +- src/specfem3D/write_seismograms.f90 | 140 +- utils/Cluster/pbs/valgrind_go_solver_pbs.bash | 77 + .../create_specfem3D_gpu_cuda_method_stubs.pl | 141 + utils/readme_cuda_cscs.txt | 63 + utils/remake_makefiles.sh | 12 + utils/update_headers_change_word_f90.pl | 114 + 103 files changed, 25331 insertions(+), 7015 deletions(-) create mode 100644 src/cuda/check_fields_cuda.cu create mode 100644 src/cuda/compute_add_sources_acoustic_cuda.cu create mode 100644 src/cuda/compute_add_sources_elastic_cuda.cu create mode 100644 src/cuda/compute_coupling_cuda.cu create mode 100644 src/cuda/compute_forces_acoustic_cuda.cu create mode 100644 src/cuda/compute_forces_elastic_cuda.cu create mode 100644 src/cuda/compute_kernels_cuda.cu create mode 100644 src/cuda/compute_stacey_acoustic_cuda.cu create mode 100644 src/cuda/compute_stacey_elastic_cuda.cu create mode 100644 src/cuda/it_update_displacement_cuda.cu create mode 100644 src/cuda/mesh_constants_cuda.h create mode 100644 src/cuda/noise_tomography_cuda.cu create mode 100644 src/cuda/prepare_constants_cuda.h create mode 100644 src/cuda/prepare_mesh_constants_cuda.cu create mode 100644 src/cuda/save_and_compare_cpu_vs_gpu.c create mode 100644 src/cuda/specfem3D_gpu_cuda_method_stubs.c create mode 100644 src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak create mode 100644 src/cuda/transfer_fields_cuda.cu create mode 100644 src/cuda/write_seismograms_cuda.cu create mode 100644 src/generate_databases/get_perm_color.f90 create mode 100644 src/generate_databases/model_ipati.f90 create mode 100644 src/shared/compute_arrays_source.f90 rename src/specfem3D/{compute_forces_elastic.f90 => compute_forces_elastic.F90} (64%) create mode 100644 src/specfem3D/compute_forces_elastic_Dev2.f90 create mode 100644 src/specfem3D/compute_forces_elastic_Dev_openmp.f90 create mode 100644 src/specfem3D/make_gravity.f90 rename src/specfem3D/{prepare_timerun.f90 => prepare_timerun.F90} (59%) create mode 100755 utils/Cluster/pbs/valgrind_go_solver_pbs.bash create mode 100755 utils/create_specfem3D_gpu_cuda_method_stubs.pl create mode 100644 utils/readme_cuda_cscs.txt create mode 100755 utils/remake_makefiles.sh create mode 100755 utils/update_headers_change_word_f90.pl diff --git a/AUTHORS b/AUTHORS index cf2dc7332..0938d8293 100644 --- a/AUTHORS +++ b/AUTHORS @@ -28,6 +28,7 @@ David Michéa, Hom Nath Gharti, Tarje Nissen-Meyer, Daniel Peter, +Max Rietmann, Brian Savage, Bernhard Schuberth, Anne Sieminski, diff --git a/config.h.in b/config.h.in index 26a366c62..4d074a31d 100644 --- a/config.h.in +++ b/config.h.in @@ -85,3 +85,9 @@ /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a `char[]'. */ #undef YYTEXT_POINTER + + +/* Uncomment and define to select optimized file i/o for regional simulations */ +/* map fails when output files are > 4GB, which is often the case for GPU simulations */ +// #define USE_MAP_FUNCTION + diff --git a/configure b/configure index a9af4633e..6a581d7c6 100755 --- a/configure +++ b/configure @@ -604,11 +604,16 @@ ac_includes_default="\ ac_subst_vars='LTLIBOBJS LIBOBJS +NVCC PYTHON_EGG_LDFLAGS PYTHON_EGG_CPPFLAGS PYTHON_EGG_CFLAGS PYTHONPATH LOCAL_PATH_IS_ALSO_GLOBAL +OPENMP_LIB +MPI_INC +CUDA_INC +CUDA_LIB FLAGS_NO_CHECK FLAGS_CHECK MPICC @@ -673,6 +678,10 @@ PYTHON_EXEC_PREFIX PYTHON_PREFIX PYTHON_VERSION PYTHON +COND_OPENMP_FALSE +COND_OPENMP_TRUE +COND_CUDA_FALSE +COND_CUDA_TRUE CUSTOM_MPI_TYPE CUSTOM_REAL COND_MPI_FALSE @@ -723,6 +732,8 @@ enable_option_checking with_pyre with_mpi enable_double_precision +with_cuda +with_openmp with_scotch_dir with_scotch_includedir with_scotch_libdir @@ -749,6 +760,10 @@ MPILIBS MPICC FLAGS_CHECK FLAGS_NO_CHECK +CUDA_LIB +CUDA_INC +MPI_INC +OPENMP_LIB LOCAL_PATH_IS_ALSO_GLOBAL PYTHON PYTHONPATH' @@ -1375,6 +1390,8 @@ Optional Packages: --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-pyre build Pyrized version [default=no] --with-mpi build parallel version [default=yes] + --with-cuda build CUDA GPU enabled version [default=no] + --with-openmp build OpenMP enabled version [default=no] --with-scotch-dir define the root path to Scotch (e.g. /opt/scotch/) --with-scotch-includedir define the path to the Scotch headers (e.g. @@ -1413,6 +1430,11 @@ Some influential environment variables: FLAGS_NO_CHECK Fortran compiler flags for creating fast, production-run code for critical subroutines + CUDA_LIB Location of CUDA library libcudart + CUDA_INC Location of CUDA include files + MPI_INC Location of MPI include mpi.h, which is needed by nvcc when + compiling cuda files + OPENMP_LIB Location of extra OpenMP libraries LOCAL_PATH_IS_ALSO_GLOBAL files on a local path on each node are also seen as global with same path [default=true] @@ -2279,6 +2301,40 @@ fi +# Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; want_cuda="$withval" +else + want_cuda=no +fi + + if test "$want_cuda" = yes; then + COND_CUDA_TRUE= + COND_CUDA_FALSE='#' +else + COND_CUDA_TRUE='#' + COND_CUDA_FALSE= +fi + + + +# Check whether --with-openmp was given. +if test "${with_openmp+set}" = set; then : + withval=$with_openmp; want_openmp="$withval" +else + want_openmp=no +fi + + if test "$want_openmp" = yes; then + COND_OPENMP_TRUE= + COND_OPENMP_FALSE='#' +else + COND_OPENMP_TRUE='#' + COND_OPENMP_FALSE= +fi + + + # Checks for programs. if test "$want_pyre" = yes; then @@ -6390,6 +6446,9 @@ LDFLAGS=${ac_save_ldflags} + + + if test x"$MPIFC" = x; then MPIFC=mpif90 fi @@ -6398,6 +6457,7 @@ if test x"$MPICC" = x; then fi + if test x"$LOCAL_PATH_IS_ALSO_GLOBAL" = x; then LOCAL_PATH_IS_ALSO_GLOBAL=true fi @@ -6708,9 +6768,131 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu fi +if test "$want_cuda" = yes; then + + # Extract the first word of "nvcc", so it can be a program name with args. +set dummy nvcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_NVCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $NVCC in + [\\/]* | ?:[\\/]*) + ac_cv_path_NVCC="$NVCC" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_path_NVCC="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +NVCC=$ac_cv_path_NVCC +if test -n "$NVCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $NVCC" >&5 +$as_echo "$NVCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi -# Checks for library functions. + if test -z "$NVCC" ; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot find 'nvcc' program. +See \`config.log' for more details" "$LINENO" 5; } + NVCC=`echo "Error: nvcc is not installed." ; false` + fi + +fi + +if test "$want_openmp" = yes; then + +ac_ext=${ac_fc_srcext-f} +ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5' +ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_fc_compiler_gnu + +cit_fc_save_fc=$FC +cit_fc_save_fcflags=$FCFLAGS +FC=$FC +FCFLAGS="$FCFLAGS $FLAGS_NO_CHECK" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP directives work" >&5 +$as_echo_n "checking whether OpenMP directives work... " >&6; } + +#AC_COMPILE_IFELSE(_CIT_FC_TRIVIAL_OPENMP_PROGRAM, [ +# AC_MSG_RESULT(yes) +#], [ +# AC_MSG_RESULT(no) +# AC_MSG_FAILURE([cannot compile a trivial OpenMP program using $FC]) +#]) + +cat > conftest.$ac_ext <<_ACEOF + + program main + + implicit none + integer OMP_get_thread_num + integer OMP_GET_MAX_THREADS + integer NUM_THREADS + integer thread_id + + NUM_THREADS = OMP_GET_MAX_THREADS() + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(thread_id) + thread_id = OMP_get_thread_num()+1 + !$OMP END PARALLEL + + end + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +else + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot link a trivial OpenMP program using $FC with flags: $FLAGS_NO_CHECK +See \`config.log' for more details" "$LINENO" 5; } + +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + +FC=$cit_fc_save_fc +FCFLAGS=$cit_fc_save_fcflags + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +fi + + +# Checks for library functions. ac_config_files="$ac_config_files Makefile src/meshfem3D/Makefile src/meshfem3D/constants.h src/meshfem3D/precision.h src/decompose_mesh_SCOTCH/Makefile src/specfem3D/Makefile src/generate_databases/Makefile src/shared/constants.h src/shared/precision.h src/decompose_mesh_SCOTCH/scotch_5.1.11/src/Makefile.inc src/check_mesh_quality_CUBIT_Abaqus/Makefile" @@ -6831,6 +7013,14 @@ if test -z "${COND_MPI_TRUE}" && test -z "${COND_MPI_FALSE}"; then as_fn_error $? "conditional \"COND_MPI\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${COND_CUDA_TRUE}" && test -z "${COND_CUDA_FALSE}"; then + as_fn_error $? "conditional \"COND_CUDA\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi +if test -z "${COND_OPENMP_TRUE}" && test -z "${COND_OPENMP_FALSE}"; then + as_fn_error $? "conditional \"COND_OPENMP\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 @@ -8017,4 +8207,6 @@ $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi +#daniel: adding custom lines to config.h.in through autoheader + diff --git a/configure.ac b/configure.ac index 3dd651ac6..f1057fc99 100644 --- a/configure.ac +++ b/configure.ac @@ -45,6 +45,20 @@ fi AC_SUBST([CUSTOM_REAL]) AC_SUBST([CUSTOM_MPI_TYPE]) +AC_ARG_WITH([cuda], + [AC_HELP_STRING([--with-cuda], + [build CUDA GPU enabled version @<:@default=no@:>@])], + [want_cuda="$withval"], + [want_cuda=no]) +AM_CONDITIONAL([COND_CUDA], [test "$want_cuda" = yes]) + +AC_ARG_WITH([openmp], + [AC_HELP_STRING([--with-openmp], + [build OpenMP enabled version @<:@default=no@:>@])], + [want_openmp="$withval"], + [want_openmp=no]) +AM_CONDITIONAL([COND_OPENMP], [test "$want_openmp" = yes]) + # Checks for programs. @@ -172,7 +186,7 @@ else AC_MSG_ERROR([No suitable yacc or bison found]) fi ACX_PTHREAD(AC_MSG_RESULT([pthread found]), AC_MSG_ERROR([pthread not found])) - + #daniel: scotch bundle # uses bundled scotch: current version 5.1.11 USE_BUNDLED_SCOTCH=1 @@ -339,12 +353,16 @@ AC_ARG_VAR(MPILIBS, [extra libraries for linking MPI programs]) AC_ARG_VAR(MPICC, [MPI C compiler command]) AC_ARG_VAR(FLAGS_CHECK, [Fortran compiler flags for non-critical subroutines]) AC_ARG_VAR(FLAGS_NO_CHECK, [Fortran compiler flags for creating fast, production-run code for critical subroutines]) +AC_ARG_VAR(CUDA_LIB,[Location of CUDA library libcudart]) +AC_ARG_VAR(CUDA_INC,[Location of CUDA include files]) +AC_ARG_VAR(MPI_INC,[Location of MPI include mpi.h, which is needed by nvcc when compiling cuda files]) if test x"$MPIFC" = x; then MPIFC=mpif90 fi if test x"$MPICC" = x; then MPICC=mpicc fi +AC_ARG_VAR(OPENMP_LIB,[Location of extra OpenMP libraries]) AC_ARG_VAR(LOCAL_PATH_IS_ALSO_GLOBAL, [files on a local path on each node are also seen as global with same path @<:@default=true@:>@]) if test x"$LOCAL_PATH_IS_ALSO_GLOBAL" = x; then @@ -415,9 +433,16 @@ if test "$want_pyre" = yes; then CIT_FC_MAIN fi +if test "$want_cuda" = yes; then + CIT_CUDA_COMPILER +fi + +if test "$want_openmp" = yes; then + CIT_FC_OPENMP_MODULE([$FC],[$FLAGS_NO_CHECK]) +fi -# Checks for library functions. +# Checks for library functions. AC_CONFIG_FILES([ Makefile @@ -434,5 +459,11 @@ AC_CONFIG_FILES([ ]) AC_OUTPUT +#daniel: adding custom lines to config.h.in through autoheader +AH_BOTTOM([ +/* Uncomment and define to select optimized file i/o for regional simulations */ +/* map fails when output files are > 4GB, which is often the case for GPU simulations */ +// #define USE_MAP_FUNCTION +]) dnl end of configure.ac diff --git a/in_data_files/Par_file b/in_data_files/Par_file index 93996c51e..3c68407d0 100644 --- a/in_data_files/Par_file +++ b/in_data_files/Par_file @@ -33,6 +33,7 @@ TOPOGRAPHY = .false. ATTENUATION = .false. USE_OLSEN_ATTENUATION = .false. ANISOTROPY = .false. +GRAVITY = .false. # absorbing boundary conditions for a regional simulation ABSORBING_CONDITIONS = .false. @@ -65,4 +66,6 @@ NTSTEP_BETWEEN_READ_ADJSRC = 0 # print source time function PRINT_SOURCE_TIME_FUNCTION = .false. +# set to true to use GPUs +GPU_MODE = .false. diff --git a/src/cuda/check_fields_cuda.cu b/src/cuda/check_fields_cuda.cu new file mode 100644 index 000000000..ba747d09c --- /dev/null +++ b/src/cuda/check_fields_cuda.cu @@ -0,0 +1,566 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#ifdef WITH_MPI +#include +#endif + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +#include "prepare_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// Check functions + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_displ_gpu, + CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) { + +TRACE("check_max_norm_displ_gpu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost); + realw maxnorm=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(displ[i])); + } + printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_vector, + CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) { + +TRACE("check_max_norm_vector"); + + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + realw maxnorm=0; + int maxloc; + for(int i=0;i<*size;i++) { + if(maxnormd_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost); + + realw maxnorm=0; + realw maxnorm_accel=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(b_displ[i])); + maxnorm_accel = MAX(maxnorm,fabsf(b_accel[i])); + } + free(b_accel); + printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm); + printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_b_accel_gpu, + CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) { + +TRACE("check_max_norm_b_accel_gpu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost); + + realw maxnorm=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(b_accel[i])); + } + printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_b_veloc_gpu, + CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) { + +TRACE("check_max_norm_b_veloc_gpu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost); + + realw maxnorm=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(b_veloc[i])); + } + printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_b_displ, + CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) { + +TRACE("check_max_norm_b_displ"); + + realw maxnorm=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(b_displ[i])); + } + printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_max_norm_b_accel, + CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) { + +TRACE("check_max_norm_b_accel"); + + realw maxnorm=0; + + for(int i=0;i<*size;i++) { + maxnorm = MAX(maxnorm,fabsf(b_accel[i])); + } + printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(check_error_vectors, + CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) { + +TRACE("check_error_vectors"); + + int size = *sizef; + + double diff2 = 0; + double sum = 0; + double temp; + double maxerr=0; + int maxerrorloc; + + for(int i=0;imaxerrorloc-5;i--) { + printf("[%d]: %e vs. %e\n",i,vector1[i],vector2[i]); + } + } + +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// Auxiliary functions + +/* ----------------------------------------------------------------------------------------------- */ + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(get_max_accel, + GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) { + +TRACE("get_max_accel"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + int size = *sizef; + int it = *itf; + realw* accel_cpy = (realw*)malloc(size*sizeof(realw)); + cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost); + realw maxval=0; + for(int i=0;i 0 ){ + max = abs(array[0]); + for( int i=1; i < size; i++){ + if( abs(array[i]) > max ) max = abs(array[i]); + } + } + *d_max = max; + */ + + // reduction example: + __shared__ realw sdata[256] ; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + // loads absolute values into shared memory + sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s){ + // summation: + //sdata[tid] += sdata[tid + s]; + // maximum: + if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s]; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) d_max[blockIdx.x] = sdata[0]; + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(get_norm_acoustic_from_device, + GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) { + +TRACE("get_norm_acoustic_from_device"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + realw max; + realw *d_max; + + max = 0; + + /* way 1 : timing Elapsed time: 8.464813e-03 + realw* h_array; + h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw)); + + print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic, + sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131); + + // finds maximum value in array + max = h_array[0]; + for( int i=1; i < mp->NGLOB_AB; i++){ + if( abs(h_array[i]) > max ) max = abs(h_array[i]); + } + free(h_array); + */ + + /* way 2: timing Elapsed time: 8.818102e-02 + // launch simple kernel + cudaMalloc((void**)&d_max,sizeof(realw)); + + dim3 grid(1,1); + dim3 threads(1,1,1); + + get_maximum_kernel<<>>(mp->d_potential_dot_dot_acoustic, + mp->NGLOB_AB, + d_max); + print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222); + + cudaFree(d_max); + */ + + // way 2 b: timing Elapsed time: 1.236916e-03 + // launch simple reduction kernel + realw* h_max; + int blocksize = 256; + + int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize); + //printf("num_blocks_x %i \n",num_blocks_x); + + h_max = (realw*) calloc(num_blocks_x,sizeof(realw)); + cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw)); + + dim3 grid(num_blocks_x,1); + dim3 threads(blocksize,1,1); + + if(*SIMULATION_TYPE == 1 ){ + get_maximum_kernel<<>>(mp->d_potential_dot_dot_acoustic, + mp->NGLOB_AB, + d_max); + } + + if(*SIMULATION_TYPE == 3 ){ + get_maximum_kernel<<>>(mp->d_b_potential_dot_dot_acoustic, + mp->NGLOB_AB, + d_max); + } + + print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222); + + // determines max for all blocks + max = h_max[0]; + for(int i=1;iNGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr); + status= cublasGetError(); + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n"); + exit(1); + } + + print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]), + sizeof(realw), cudaMemcpyDeviceToHost),222); + + printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max); + + // Shutdown + status = cublasShutdown(); + if (status != CUBLAS_STATUS_SUCCESS) { + fprintf (stderr, "!!!! shutdown error (A)\n"); + exit(1); + } + + */ + + // return result + *norm = max; + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("after get_norm_acoustic_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// ELASTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){ + + // reduction example: + __shared__ realw sdata[256] ; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + // loads values into shared memory: assume array is a vector array + sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3] + + array[i*3+1]*array[i*3+1] + + array[i*3+2]*array[i*3+2]) : 0.0 ; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s){ + // summation: + //sdata[tid] += sdata[tid + s]; + // maximum: + if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s]; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) d_max[blockIdx.x] = sdata[0]; + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(get_norm_elastic_from_device, + GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) { + + TRACE("get_norm_elastic_from_device"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + realw max; + realw *d_max; + + max = 0; + + // launch simple reduction kernel + realw* h_max; + int blocksize = 256; + + int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize); + //printf("num_blocks_x %i \n",num_blocks_x); + + h_max = (realw*) calloc(num_blocks_x,sizeof(realw)); + cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw)); + + dim3 grid(num_blocks_x,1); + dim3 threads(blocksize,1,1); + + if(*SIMULATION_TYPE == 1 ){ + get_maximum_vector_kernel<<>>(mp->d_displ, + mp->NGLOB_AB, + d_max); + } + + if(*SIMULATION_TYPE == 3 ){ + get_maximum_vector_kernel<<>>(mp->d_b_displ, + mp->NGLOB_AB, + d_max); + } + + print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222); + + // determines max for all blocks + max = h_max[0]; + for(int i=1;i +#include +#include + +#include +#include +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +// #include "epik_user.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// acoustic sources + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic, + int* ibool, + int* ispec_is_inner, + int phase_is_inner, + realw* sourcearrays, + double* stf_pre_compute, + int myrank, + int* islice_selected_source, + int* ispec_selected_source, + int* ispec_is_acoustic, + realw* kappastore, + int NSOURCES) { + int i = threadIdx.x; + int j = threadIdx.y; + int k = threadIdx.z; + + int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx + + int ispec; + int iglob; + realw stf; + realw kappal; + + if( isource < NSOURCES ){ + + if(myrank == islice_selected_source[isource]) { + + ispec = ispec_selected_source[isource]-1; + + if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) { + + stf = (realw) stf_pre_compute[isource]; + iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)]; + + atomicAdd(&potential_dot_dot_acoustic[iglob], + -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal); + + // potential_dot_dot_acoustic[iglob] += + // -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal; + } + } + } +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_add_sources_ac_cuda, + COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) { + +TRACE("compute_add_sources_ac_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // check if anything to do + if( mp->nsources_local == 0 ) return; + + int phase_is_inner = *phase_is_innerf; + int NSOURCES = *NSOURCESf; + int myrank = *myrankf; + + int num_blocks_x = NSOURCES; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + // copies pre-computed source time factors onto GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute, + NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18); + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(5,5,5); + + compute_add_sources_acoustic_kernel<<>>(mp->d_potential_dot_dot_acoustic, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner, + mp->d_sourcearrays, + mp->d_stf_pre_compute, + myrank, + mp->d_islice_selected_source, + mp->d_ispec_selected_source, + mp->d_ispec_is_acoustic, + mp->d_kappastore, + NSOURCES); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_add_sources_ac_cuda"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_add_sources_ac_s3_cuda, + COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) { + +TRACE("compute_add_sources_ac_s3_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // check if anything to do + if( mp->nsources_local == 0 ) return; + + int phase_is_inner = *phase_is_innerf; + int NSOURCES = *NSOURCESf; + int myrank = *myrankf; + + int num_blocks_x = NSOURCES; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + // copies source time factors onto GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute, + NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18); + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(5,5,5); + + compute_add_sources_acoustic_kernel<<>>(mp->d_b_potential_dot_dot_acoustic, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner, + mp->d_sourcearrays, + mp->d_stf_pre_compute, + myrank, + mp->d_islice_selected_source, + mp->d_ispec_selected_source, + mp->d_ispec_is_acoustic, + mp->d_kappastore, + NSOURCES); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_add_sources_ac_s3_cuda"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// acoustic adjoint sources + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic, + int nrec, + realw* adj_sourcearrays, + int* ibool, + int* ispec_is_inner, + int* ispec_is_acoustic, + int* ispec_selected_rec, + int phase_is_inner, + int* pre_computed_irec, + int nadj_rec_local, + realw* kappastore) { + + int irec_local = blockIdx.x + gridDim.x*blockIdx.y; + + // because of grid shape, irec_local can be too big + if(irec_local < nadj_rec_local) { + + int irec = pre_computed_irec[irec_local]; + + int ispec = ispec_selected_rec[irec]-1; + if( ispec_is_acoustic[ispec] ){ + + // checks if element is in phase_is_inner run + if(ispec_is_inner[ispec] == phase_is_inner) { + int i = threadIdx.x; + int j = threadIdx.y; + int k = threadIdx.z; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)]; + + //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5, + // pre_computed_irec_local_index[irec], + // pre_computed_index, + // 0, + // i,j,k)]/kappal; + + // beware, for acoustic medium, a pressure source would be taking the negative + // and divide by Kappa of the fluid; + // this would have to be done when constructing the adjoint source. + // + // note: we take the first component of the adj_sourcearrays + // the idea is to have e.g. a pressure source, where all 3 components would be the same + realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,irec_local)]; // / kappal + + atomicAdd(&potential_dot_dot_acoustic[iglob],stf); + + //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5, + // pre_computed_irec_local_index[irec],pre_computed_index-1, + // 0,i,j,k)] // / kappal + // ); + } + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + + +extern "C" +void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda, + ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_acoustic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) { + +TRACE("add_sources_ac_sim_2_or_3_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + // checks + if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n"); + + // make sure grid dimension is less than 65535 in x dimension + int num_blocks_x = mp->nadj_rec_local; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y,1); + dim3 threads(5,5,5); + + // build slice of adj_sourcearrays because full array is *very* large. + // note: this extracts array values for local adjoint sources at given time step "time_index" + // from large adj_sourcearrays array into h_adj_sourcearrays_slice + int ispec,i,j,k; + int irec_local = 0; + for(int irec = 0; irec < *nrec; irec++) { + if(*myrank == h_islice_selected_rec[irec]) { + irec_local++; + + // takes only acoustic sources + ispec = h_ispec_selected_rec[irec]-1; + if( h_ispec_is_acoustic[ispec] ){ + + if( h_ispec_is_inner[ispec] == *phase_is_inner) { + for(k=0;k<5;k++) { + for(j=0;j<5;j++) { + for(i=0;i<5;i++) { + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)] + = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1,(*time_index)-1, + 0,i,j,k)]; + + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)] + = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1,(*time_index)-1, + 1,i,j,k)]; + + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)] + = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1,(*time_index)-1, + 2,i,j,k)]; + } + } + } + } // phase_is_inner + } // h_ispec_is_acoustic + } + } + // check all local sources were added + if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n"); + + // copies extracted array values onto GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice, + (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099); + + // launches cuda kernel for acoustic adjoint sources + add_sources_ac_SIM_TYPE_2_OR_3_kernel<<>>(mp->d_potential_dot_dot_acoustic, + *nrec, + mp->d_adj_sourcearrays, + mp->d_ibool, + mp->d_ispec_is_inner, + mp->d_ispec_is_acoustic, + mp->d_ispec_selected_rec, + *phase_is_inner, + mp->d_pre_computed_irec, + mp->nadj_rec_local, + mp->d_kappastore); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel"); +#endif +} diff --git a/src/cuda/compute_add_sources_elastic_cuda.cu b/src/cuda/compute_add_sources_elastic_cuda.cu new file mode 100644 index 000000000..056b6a07c --- /dev/null +++ b/src/cuda/compute_add_sources_elastic_cuda.cu @@ -0,0 +1,422 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +// #include "epik_user.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// elastic domain sources + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_add_sources_kernel(realw* accel, + int* ibool, + int* ispec_is_inner, + int phase_is_inner, + realw* sourcearrays, + double* stf_pre_compute, + int myrank, + int* islice_selected_source, + int* ispec_selected_source, + int* ispec_is_elastic, + int NSOURCES) { + int i = threadIdx.x; + int j = threadIdx.y; + int k = threadIdx.z; + + int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx + int ispec; + int iglob; + realw stf; + + if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block. + + if(myrank == islice_selected_source[isource]) { + + ispec = ispec_selected_source[isource]-1; + + if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) { + + stf = (realw) stf_pre_compute[isource]; + iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + atomicAdd(&accel[iglob*3], + sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf); + atomicAdd(&accel[iglob*3+1], + sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf); + atomicAdd(&accel[iglob*3+2], + sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf); + } + } + } + +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_add_sources_el_cuda, + COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + double* h_stf_pre_compute, + int* myrankf) { + +TRACE("compute_add_sources_el_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // check if anything to do + if( mp->nsources_local == 0 ) return; + + int phase_is_inner = *phase_is_innerf; + int NSOURCES = *NSOURCESf; + int myrank = *myrankf; + + int num_blocks_x = NSOURCES; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + //double* d_stf_pre_compute; + print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute, + NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18); + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(5,5,5); + + compute_add_sources_kernel<<compute_stream>>>(mp->d_accel, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner, + mp->d_sourcearrays, + mp->d_stf_pre_compute, + myrank, + mp->d_islice_selected_source, + mp->d_ispec_selected_source, + mp->d_ispec_is_elastic, + NSOURCES); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_add_sources_kernel"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_add_sources_el_s3_cuda, + COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer, + double* h_stf_pre_compute, + int* NSOURCESf, + int* phase_is_inner, + int* myrank) { + TRACE("compute_add_sources_el_s3_cuda"); + // EPIK_TRACER("compute_add_sources_el_s3_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + int NSOURCES = *NSOURCESf; + + print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute, + NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_add_sources_el_s3_cuda"); +#endif + + int num_blocks_x = NSOURCES; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(5,5,5); + + compute_add_sources_kernel<<compute_stream>>>(mp->d_b_accel,mp->d_ibool, + mp->d_ispec_is_inner, *phase_is_inner, + mp->d_sourcearrays, + mp->d_stf_pre_compute, + *myrank, + mp->d_islice_selected_source,mp->d_ispec_selected_source, + mp->d_ispec_is_elastic, + NSOURCES); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_add_sources_el_s3_cuda"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// NOISE sources + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, + int* ispec_selected_rec, + int irec_master_noise, + realw* accel, + realw* noise_sourcearray, + int it) { + int tx = threadIdx.x; + int iglob = ibool[tx + NGLL3*(ispec_selected_rec[irec_master_noise-1]-1)]-1; + + // not sure if we need atomic operations but just in case... + // accel[3*iglob] += noise_sourcearray[3*tx + 3*125*it]; + // accel[1+3*iglob] += noise_sourcearray[1+3*tx + 3*125*it]; + // accel[2+3*iglob] += noise_sourcearray[2+3*tx + 3*125*it]; + + atomicAdd(&accel[iglob*3],noise_sourcearray[3*tx + 3*NGLL3*it]); + atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*NGLL3*it]); + atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*NGLL3*it]); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(add_source_master_rec_noise_cu, + ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f, + int* myrank_f, + int* it_f, + int* irec_master_noise_f, + int* islice_selected_rec) { + +TRACE("add_source_master_rec_noise_cu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + int it = *it_f-1; // -1 for Fortran -> C indexing differences + int irec_master_noise = *irec_master_noise_f; + int myrank = *myrank_f; + + dim3 grid(1,1,1); + dim3 threads(NGLL3,1,1); + + if(myrank == islice_selected_rec[irec_master_noise-1]) { + add_source_master_rec_noise_cuda_kernel<<compute_stream>>>(mp->d_ibool, + mp->d_ispec_selected_rec, + irec_master_noise, + mp->d_accel, + mp->d_noise_sourcearray, + it); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel"); +#endif + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +// ADJOINT sources + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel, + int nrec, + realw* adj_sourcearrays, + int* ibool, + int* ispec_is_inner, + int* ispec_is_elastic, + int* ispec_selected_rec, + int phase_is_inner, + int* pre_computed_irec, + int nadj_rec_local) { + + int irec_local = blockIdx.x + gridDim.x*blockIdx.y; + + if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block. + + int irec = pre_computed_irec[irec_local]; + + int ispec = ispec_selected_rec[irec]-1; + if( ispec_is_elastic[ispec] ){ + + if(ispec_is_inner[ispec] == phase_is_inner) { + int i = threadIdx.x; + int j = threadIdx.y; + int k = threadIdx.z; + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + // atomic operations are absolutely necessary for correctness! + atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(5,5,5,3, + i,j,k, + 0, + irec_local)]); + + atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3, + i,j,k, + 1, + irec_local)]); + + atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3, + i,j,k, + 2, + irec_local)]); + } + } // ispec_is_elastic + } + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(add_sources_el_sim_type_2_or_3, + ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_elastic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) { + +TRACE("add_sources_el_sim_type_2_or_3"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + // checks + if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n"); + + // make sure grid dimension is less than 65535 in x dimension + int num_blocks_x = mp->nadj_rec_local; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y,1); + dim3 threads(5,5,5); + + // build slice of adj_sourcearrays because full array is *very* large. + // note: this extracts array values for local adjoint sources at given time step "time_index" + // from large adj_sourcearrays array into h_adj_sourcearrays_slice + int ispec,i,j,k; + int irec_local = 0; + for(int irec = 0; irec < *nrec; irec++) { + if(*myrank == h_islice_selected_rec[irec]) { + irec_local++; + + // takes only elastic sources + ispec = h_ispec_selected_rec[irec]-1; + if( h_ispec_is_elastic[ispec] ){ + + if( h_ispec_is_inner[ispec] == *phase_is_inner) { + for(k=0;k<5;k++) { + for(j=0;j<5;j++) { + for(i=0;i<5;i++) { + + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3, + i,j,k,0, + irec_local-1)] + = h_adj_sourcearrays[INDEX6(*nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1, + *time_index-1, + 0,i,j,k)]; + + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3, + i,j,k,1, + irec_local-1)] + = h_adj_sourcearrays[INDEX6(*nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1, + *time_index-1, + 1,i,j,k)]; + + mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3, + i,j,k,2, + irec_local-1)] + = h_adj_sourcearrays[INDEX6(*nadj_rec_local, + *NTSTEP_BETWEEN_READ_ADJSRC, + 3,5,5, + irec_local-1, + *time_index-1, + 2,i,j,k)]; + } + } + } + } // phase_is_inner + } // h_ispec_is_elastic + } + } + // check all local sources were added + if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n"); + + // copies extracted array values onto GPU + cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice, + (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice); + + + // the irec_local variable needs to be precomputed (as + // h_pre_comp..), because normally it is in the loop updating accel, + // and due to how it's incremented, it cannot be parallelized + + add_sources_el_SIM_TYPE_2_OR_3_kernel<<compute_stream>>>(mp->d_accel, + *nrec, + mp->d_adj_sourcearrays, + mp->d_ibool, + mp->d_ispec_is_inner, + mp->d_ispec_is_elastic, + mp->d_ispec_selected_rec, + *phase_is_inner, + mp->d_pre_computed_irec, + mp->nadj_rec_local); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel"); +#endif +} + diff --git a/src/cuda/compute_coupling_cuda.cu b/src/cuda/compute_coupling_cuda.cu new file mode 100644 index 000000000..bdf7c798e --- /dev/null +++ b/src/cuda/compute_coupling_cuda.cu @@ -0,0 +1,345 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// ACOUSTIC - ELASTIC coupling + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_coupling_acoustic_el_kernel(realw* displ, + realw* potential_dot_dot_acoustic, + int num_coupling_ac_el_faces, + int* coupling_ac_el_ispec, + int* coupling_ac_el_ijk, + realw* coupling_ac_el_normal, + realw* coupling_ac_el_jacobian2Dw, + int* ibool, + int* ispec_is_inner, + int phase_is_inner) { + + int igll = threadIdx.x; + int iface = blockIdx.x + gridDim.x*blockIdx.y; + + int i,j,k,iglob,ispec; + realw displ_x,displ_y,displ_z,displ_n; + realw nx,ny,nz; + realw jacobianw; + + if( iface < num_coupling_ac_el_faces){ + + // don't compute points outside NGLLSQUARE==NGLL2==25 + // way 2: no further check needed since blocksize = 25 + // if(igll C indexing + ispec = coupling_ac_el_ispec[iface] - 1; + + if(ispec_is_inner[ispec] == phase_is_inner ) { + + i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; + j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; + + // elastic displacement on global point + displ_x = displ[iglob*3] ; // (1,iglob) + displ_y = displ[iglob*3+1] ; // (2,iglob) + displ_z = displ[iglob*3+2] ; // (3,iglob) + + // gets associated normal on GLL point + nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface) + ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface) + nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface) + + // calculates displacement component along normal + // (normal points outwards of acoustic element) + displ_n = displ_x*nx + displ_y*ny + displ_z*nz; + + // gets associated, weighted jacobian + jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)]; + + // continuity of pressure and normal displacement on global point + + // note: Newmark time scheme together with definition of scalar potential: + // pressure = - chi_dot_dot + // requires that this coupling term uses the updated displacement at time step [t+delta_t], + // which is done at the very beginning of the time loop + // (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...) + // it also means you have to calculate and update this here first before + // calculating the coupling on the elastic side for the acceleration... + atomicAdd(&potential_dot_dot_acoustic[iglob],+ jacobianw*displ_n); + + } + // } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_coupling_ac_el_cuda, + COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) { + TRACE("compute_coupling_ac_el_cuda"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + int phase_is_inner = *phase_is_innerf; + int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf; + int SIMULATION_TYPE = *SIMULATION_TYPEf; + + // way 1: exact blocksize to match NGLLSQUARE + int blocksize = NGLL2; + int num_blocks_x = num_coupling_ac_el_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // launches GPU kernel + compute_coupling_acoustic_el_kernel<<>>(mp->d_displ, + mp->d_potential_dot_dot_acoustic, + num_coupling_ac_el_faces, + mp->d_coupling_ac_el_ispec, + mp->d_coupling_ac_el_ijk, + mp->d_coupling_ac_el_normal, + mp->d_coupling_ac_el_jacobian2Dw, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner); + + // adjoint simulations + if (SIMULATION_TYPE == 3 ){ + compute_coupling_acoustic_el_kernel<<>>(mp->d_b_displ, + mp->d_b_potential_dot_dot_acoustic, + num_coupling_ac_el_faces, + mp->d_coupling_ac_el_ispec, + mp->d_coupling_ac_el_ijk, + mp->d_coupling_ac_el_normal, + mp->d_coupling_ac_el_jacobian2Dw, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner); + + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("compute_coupling_acoustic_el_kernel"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// ELASTIC - ACOUSTIC coupling + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_coupling_elastic_ac_kernel(realw* potential_dot_dot_acoustic, + realw* accel, + int num_coupling_ac_el_faces, + int* coupling_ac_el_ispec, + int* coupling_ac_el_ijk, + realw* coupling_ac_el_normal, + realw* coupling_ac_el_jacobian2Dw, + int* ibool, + int* ispec_is_inner, + int phase_is_inner, + int gravity, + realw* minus_g, + realw* rhostore, + realw* displ) { + + int igll = threadIdx.x; + int iface = blockIdx.x + gridDim.x*blockIdx.y; + + int i,j,k,iglob,ispec; + realw pressure; + realw nx,ny,nz; + realw jacobianw; + realw rhol; + + if( iface < num_coupling_ac_el_faces){ + + // don't compute points outside NGLLSQUARE==NGLL2==25 + // way 2: no further check needed since blocksize = 25 + // if(igll C indexing + ispec = coupling_ac_el_ispec[iface] - 1; + + if(ispec_is_inner[ispec] == phase_is_inner ) { + + i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; + j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; + + // gets associated normal on GLL point + // note: normal points away from acoustic element + nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface) + ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface) + nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface) + + // gets associated, weighted jacobian + jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)]; + + // acoustic pressure on global point + if( gravity ){ + // takes density (from acoustic? element) + rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]; + + // note: uses potential chi such that displacement s = grad(chi), + // pressure becomes: p = - kappa ( div( s ) ) = rho ( - dot_dot_chi + g * s ) + // g only acting in negative z-direction + + // daniel: TODO - check gravity and coupling would be displ * nz correct? + pressure = rhol*( - potential_dot_dot_acoustic[iglob] + + minus_g[iglob] * displ[iglob*3+2] ); + + //daniel: TODO - check gravity and coupling + //pressure = - potential_dot_dot_acoustic[iglob] ; + //if( iface == 128 && igll == 5 ){ + // printf("coupling acoustic: %f %f \n",potential_dot_dot_acoustic[iglob], + // minus_g[iglob] * displ[iglob*3+2]); + //} + + }else{ + // no gravity: uses potential chi such that displacement s = 1/rho grad(chi) + // pressure p = - kappa ( div( s ) ) then becomes: p = - dot_dot_chi + // ( multiplied with factor 1/kappa due to setup of equation of motion ) + pressure = - potential_dot_dot_acoustic[iglob]; + } + + // continuity of displacement and pressure on global point + // + // note: Newmark time scheme together with definition of scalar potential: + // pressure = - chi_dot_dot + // requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e. + // pressure at time step [t + delta_t] + // (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...) + // it means you have to calculate and update the acoustic pressure first before + // calculating this term... + atomicAdd(&accel[iglob*3],+ jacobianw*nx*pressure); + atomicAdd(&accel[iglob*3+1],+ jacobianw*ny*pressure); + atomicAdd(&accel[iglob*3+2],+ jacobianw*nz*pressure); + } + // } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_coupling_el_ac_cuda, + COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) { + TRACE("compute_coupling_el_ac_cuda"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + int phase_is_inner = *phase_is_innerf; + int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf; + int SIMULATION_TYPE = *SIMULATION_TYPEf; + + // way 1: exact blocksize to match NGLLSQUARE + int blocksize = 25; + + int num_blocks_x = num_coupling_ac_el_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // launches GPU kernel + compute_coupling_elastic_ac_kernel<<>>(mp->d_potential_dot_dot_acoustic, + mp->d_accel, + num_coupling_ac_el_faces, + mp->d_coupling_ac_el_ispec, + mp->d_coupling_ac_el_ijk, + mp->d_coupling_ac_el_normal, + mp->d_coupling_ac_el_jacobian2Dw, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner, + mp->gravity, + mp->d_minus_g, + mp->d_rhostore, + mp->d_displ); + + // adjoint simulations + if (SIMULATION_TYPE == 3 ){ + compute_coupling_elastic_ac_kernel<<>>(mp->d_b_potential_dot_dot_acoustic, + mp->d_b_accel, + num_coupling_ac_el_faces, + mp->d_coupling_ac_el_ispec, + mp->d_coupling_ac_el_ijk, + mp->d_coupling_ac_el_normal, + mp->d_coupling_ac_el_jacobian2Dw, + mp->d_ibool, + mp->d_ispec_is_inner, + phase_is_inner, + mp->gravity, + mp->d_minus_g, + mp->d_rhostore, + mp->d_b_displ); + + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("compute_coupling_el_ac_cuda"); +#endif +} diff --git a/src/cuda/compute_forces_acoustic_cuda.cu b/src/cuda/compute_forces_acoustic_cuda.cu new file mode 100644 index 000000000..e07cf712a --- /dev/null +++ b/src/cuda/compute_forces_acoustic_cuda.cu @@ -0,0 +1,966 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// prepares a device array with with all inter-element edge-nodes -- this +// is followed by a memcpy and MPI operations +__global__ void prepare_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic, + realw* d_send_potential_dot_dot_buffer, + int num_interfaces_ext_mesh, + int max_nibool_interfaces_ext_mesh, + int* d_nibool_interfaces_ext_mesh, + int* d_ibool_interfaces_ext_mesh) { + + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + int iinterface=0; + + for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) { + if(idmax_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + if(*FORWARD_OR_ADJOINT == 1) { + prepare_boundary_potential_on_device<<>>(mp->d_potential_dot_dot_acoustic, + mp->d_send_potential_dot_dot_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + else if(*FORWARD_OR_ADJOINT == 3) { + prepare_boundary_potential_on_device<<>>(mp->d_b_potential_dot_dot_acoustic, + mp->d_send_potential_dot_dot_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + + print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer, + (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000); + + // finish timing of kernel+memcpy + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("boundary xfer d->h Time: %f ms\n",time); +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_boundary_kernel"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void assemble_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic, + realw* d_send_potential_dot_dot_buffer, + int num_interfaces_ext_mesh, + int max_nibool_interfaces_ext_mesh, + int* d_nibool_interfaces_ext_mesh, + int* d_ibool_interfaces_ext_mesh) { + + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + int iinterface=0; + + for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) { + if(idd_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh, + (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice); + + // assembles on GPU + int blocksize = BLOCKSIZE_TRANSFER; + int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + if(*FORWARD_OR_ADJOINT == 1) { + //assemble forward field + assemble_boundary_potential_on_device<<>>(mp->d_potential_dot_dot_acoustic, + mp->d_send_potential_dot_dot_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + else if(*FORWARD_OR_ADJOINT == 3) { + //assemble reconstructed/backward field + assemble_boundary_potential_on_device<<>>(mp->d_b_potential_dot_dot_acoustic, + mp->d_send_potential_dot_dot_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time); + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("transfer_asmbl_pot_to_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +/* KERNEL 2 */ + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute, + int NGLOB, int* d_ibool, + int* d_phase_ispec_inner_acoustic, + int num_phase_ispec_acoustic, + int d_iphase, + int use_mesh_coloring_gpu, + realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic, + realw* d_xix, realw* d_xiy, realw* d_xiz, + realw* d_etax, realw* d_etay, realw* d_etaz, + realw* d_gammax, realw* d_gammay, realw* d_gammaz, + realw* hprime_xx, realw* hprime_yy, realw* hprime_zz, + realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz, + realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz, + realw* d_rhostore, + int gravity, + realw* minus_g, + realw* d_kappastore, + realw* wgll_cube){ + + int bx = blockIdx.y*gridDim.x+blockIdx.x; + int tx = threadIdx.x; + + //const int NGLL3 = NGLL3; + const int NGLL3_ALIGN = NGLL3_PADDED; + + int K = (tx/NGLL2); + int J = ((tx-K*NGLL2)/NGLLX); + int I = (tx-K*NGLL2-J*NGLLX); + + int active,offset; + int iglob = 0; + int working_element; + reald temp1l,temp2l,temp3l; + reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl; + reald dpotentialdxl,dpotentialdyl,dpotentialdzl; + reald fac1,fac2,fac3; + reald rho_invl,kappa_invl; + reald sum_terms; + reald gravity_term; + +#ifndef MANUALLY_UNROLLED_LOOPS + int l; + int offset1,offset2,offset3; + realw hp1,hp2,hp3; +#endif + + __shared__ reald s_dummy_loc[NGLL3]; + + __shared__ reald s_temp1[NGLL3]; + __shared__ reald s_temp2[NGLL3]; + __shared__ reald s_temp3[NGLL3]; + +// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads, +// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses + active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0; + +// copy from global memory to shared memory +// each thread writes one of the NGLL^3 = 125 data points + if (active) { + +#ifdef USE_MESH_COLORING_GPU + working_element = bx; +#else + //mesh coloring + if( use_mesh_coloring_gpu ){ + working_element = bx; + }else{ + // iphase-1 and working_element-1 for Fortran->C array conventions + working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1; + } +#endif + + // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1; + iglob = d_ibool[working_element*NGLL3 + tx]-1; + +#ifdef USE_TEXTURES + s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob); +#else + // changing iglob indexing to match fortran row changes fast style + s_dummy_loc[tx] = d_potential_acoustic[iglob]; +#endif + } + +// synchronize all the threads (one thread for each of the NGLL grid points of the +// current spectral element) because we need the whole element to be ready in order +// to be able to compute the matrix products along cut planes of the 3D element below + __syncthreads(); + +#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS + + if (active) { + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING +// if(iglob == 0 )printf("kernel 2: iglob %i hprime_xx %f %f %f \n",iglob,hprime_xx[0],hprime_xx[1],hprime_xx[2]); +#endif + + +#ifndef MANUALLY_UNROLLED_LOOPS + + temp1l = 0.f; + temp2l = 0.f; + temp3l = 0.f; + + for (l=0;l 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2; + dim3 grid_2(num_blocks_x,num_blocks_y); + + + // Cuda timing + // cudaEvent_t start, stop; + // realw time; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); + // cudaEventRecord( start, 0 ); + + Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute, + mp->NGLOB_AB, + d_ibool, + mp->d_phase_ispec_inner_acoustic, + mp->num_phase_ispec_acoustic, + d_iphase, + mp->use_mesh_coloring_gpu, + mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic, + d_xix, d_xiy, d_xiz, + d_etax, d_etay, d_etaz, + d_gammax, d_gammay, d_gammaz, + mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz, + mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz, + mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz, + d_rhostore, + mp->gravity, + mp->d_minus_g, + d_kappastore, + mp->d_wgll_cube); + + if(SIMULATION_TYPE == 3) { + Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute, + mp->NGLOB_AB, + d_ibool, + mp->d_phase_ispec_inner_acoustic, + mp->num_phase_ispec_acoustic, + d_iphase, + mp->use_mesh_coloring_gpu, + mp->d_b_potential_acoustic, mp->d_b_potential_dot_dot_acoustic, + d_xix, d_xiy, d_xiz, + d_etax, d_etay, d_etaz, + d_gammax, d_gammay, d_gammaz, + mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz, + mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz, + mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz, + d_rhostore, + mp->gravity, + mp->d_minus_g, + d_kappastore, + mp->d_wgll_cube); + } + + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("Kernel2 Execution Time: %f ms\n",time); + + /* cudaThreadSynchronize(); */ + /* TRACE("Kernel 2 finished"); */ +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute); + exit_on_cuda_error("kernel Kernel_2"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// main compute_forces_acoustic CUDA routine + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_forces_acoustic_cuda, + COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_acoustic, + int* nspec_inner_acoustic, + int* SIMULATION_TYPE) { + + TRACE("compute_forces_acoustic_cuda"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + int num_elements; + + if( *iphase == 1 ) + num_elements = *nspec_outer_acoustic; + else + num_elements = *nspec_inner_acoustic; + + if( num_elements == 0 ) return; + + // mesh coloring + if( mp->use_mesh_coloring_gpu ){ + + // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements + // and followed by acoustic ones. + // acoustic elements also start with outer than inner element ordering + + int nb_colors,nb_blocks_to_compute; + int istart; + int color_offset,color_offset_nonpadded; + + // sets up color loop + if( *iphase == 1 ){ + // outer elements + nb_colors = mp->num_colors_outer_acoustic; + istart = 0; + + // array offsets (acoustic elements start after elastic ones) + color_offset = mp->nspec_elastic * NGLL3_PADDED; + color_offset_nonpadded = mp->nspec_elastic * NGLL3; + }else{ + // inner element colors (start after outer elements) + nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic; + istart = mp->num_colors_outer_acoustic; + + // array offsets (inner elements start after outer ones) + color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED; + color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3; + } + + // loops over colors + for(int icolor = istart; icolor < nb_colors; icolor++){ + + nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor]; + + Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase, + *SIMULATION_TYPE, + mp->d_ibool + color_offset_nonpadded, + mp->d_xix + color_offset, + mp->d_xiy + color_offset, + mp->d_xiz + color_offset, + mp->d_etax + color_offset, + mp->d_etay + color_offset, + mp->d_etaz + color_offset, + mp->d_gammax + color_offset, + mp->d_gammay + color_offset, + mp->d_gammaz + color_offset, + mp->d_rhostore + color_offset, + mp->d_kappastore + color_offset_nonpadded); + + // for padded and aligned arrays + color_offset += nb_blocks_to_compute * NGLL3_PADDED; + // for no-aligned arrays + color_offset_nonpadded += nb_blocks_to_compute * NGLL3; + } + + }else{ + + // no mesh coloring: uses atomic updates + Kernel_2_acoustic(num_elements, mp, *iphase, + *SIMULATION_TYPE, + mp->d_ibool, + mp->d_xix, + mp->d_xiy, + mp->d_xiz, + mp->d_etax, + mp->d_etay, + mp->d_etaz, + mp->d_gammax, + mp->d_gammay, + mp->d_gammaz, + mp->d_rhostore, + mp->d_kappastore); + + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +/* KERNEL 3 */ + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic, + int size, + realw* rmass_acoustic) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + // multiplies pressure with the inverse of the mass matrix + potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + int size, + realw deltatover2, + realw* rmass_acoustic) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + // Newmark time scheme: corrector term + potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + int* SIMULATION_TYPE) { + +TRACE("kernel_3_a_acoustic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic, + size, + mp->d_rmass_acoustic); + + if(*SIMULATION_TYPE == 3) { + kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic, + size, + mp->d_rmass_acoustic); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 a"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltatover2_F) { + +TRACE("kernel_3_b_acoustic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + realw deltatover2 = *deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic, + mp->d_potential_dot_dot_acoustic, + size, deltatover2, + mp->d_rmass_acoustic); + + if(*SIMULATION_TYPE == 3) { + kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + size, b_deltatover2, + mp->d_rmass_acoustic); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 b"); +#endif +} + + + +/* ----------------------------------------------------------------------------------------------- */ + +/* KERNEL for enforce free surface */ + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void enforce_free_surface_cuda_kernel( + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + int num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + int* ibool, + int* ispec_is_acoustic) { + // gets spectral element face id + int iface = blockIdx.x + gridDim.x*blockIdx.y; + + // for all faces on free surface + if( iface < num_free_surface_faces ){ + + int ispec = free_surface_ispec[iface]-1; + + // checks if element is in acoustic domain + if( ispec_is_acoustic[ispec] ){ + + // gets global point index + int igll = threadIdx.x + threadIdx.y*blockDim.x; + + int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface) + int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; + + // sets potentials to zero at free surface + potential_acoustic[iglob] = 0; + potential_dot_acoustic[iglob] = 0; + potential_dot_dot_acoustic[iglob] = 0; + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(acoustic_enforce_free_surf_cuda, + ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* ABSORB_FREE_SURFACE) { + +TRACE("acoustic_enforce_free_surf_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // checks if anything to do + if( *ABSORB_FREE_SURFACE == 0 ){ + + // does not absorb free surface, thus we enforce the potential to be zero at surface + + // block sizes + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y,1); + dim3 threads(NGLL2,1,1); + + // sets potentials to zero at free surface + enforce_free_surface_cuda_kernel<<>>(mp->d_potential_acoustic, + mp->d_potential_dot_acoustic, + mp->d_potential_dot_dot_acoustic, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_ibool, + mp->d_ispec_is_acoustic); + // for backward/reconstructed potentials + if(*SIMULATION_TYPE == 3) { + enforce_free_surface_cuda_kernel<<>>(mp->d_b_potential_acoustic, + mp->d_b_potential_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_ibool, + mp->d_ispec_is_acoustic); + + } + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("enforce_free_surface_cuda"); +#endif +} + diff --git a/src/cuda/compute_forces_elastic_cuda.cu b/src/cuda/compute_forces_elastic_cuda.cu new file mode 100644 index 000000000..6c3e753e5 --- /dev/null +++ b/src/cuda/compute_forces_elastic_cuda.cu @@ -0,0 +1,2255 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +// #include "epik_user.h" + +// cuda constant arrays +__device__ realw d_hprime_xx[NGLL2]; + +// only needed if NGLLX != NGLLY != NGLLZ +// __device__ realw d_hprime_yy[NGLL2]; +// __device__ realw d_hprime_zz[NGLL2]; +__device__ realw d_hprimewgll_xx[NGLL2]; +__device__ realw d_hprimewgll_yy[NGLL2]; +__device__ realw d_hprimewgll_zz[NGLL2]; +__device__ realw d_wgllwgll_xy[NGLL2]; +__device__ realw d_wgllwgll_xz[NGLL2]; +__device__ realw d_wgllwgll_yz[NGLL2]; + +__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case + +//daniel: todo - check if necessary... +// prototype for the fortran function to do non-blocking mpi send +//extern "C" +//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {}; + + +/* ----------------------------------------------------------------------------------------------- */ + +// prepares a device array with with all inter-element edge-nodes -- this +// is followed by a memcpy and MPI operations + +__global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer, + int num_interfaces_ext_mesh, + int max_nibool_interfaces_ext_mesh, + int* d_nibool_interfaces_ext_mesh, + int* d_ibool_interfaces_ext_mesh) { + + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + //int iinterface=0; + + for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) { + if(idmax_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + //timing for memory xfer + // cudaEvent_t start, stop; + // realw time; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); + // cudaEventRecord( start, 0 ); + if(*FORWARD_OR_ADJOINT == 1) { + prepare_boundary_accel_on_device<<compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + else if(*FORWARD_OR_ADJOINT == 3) { + prepare_boundary_accel_on_device<<compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + + + cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer, + 3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw), + cudaMemcpyDeviceToHost); + + // finish timing of kernel+memcpy + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("boundary xfer d->h Time: %f ms\n",time); +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("transfer_boun_accel_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_boundary_from_device_a, + TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer, + int* nspec_outer_elastic) { + +// asynchronous transfer from device to host + + TRACE("transfer_boundary_from_device_a"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + +//daniel: todo - check below with this... + int blocksize = BLOCKSIZE_TRANSFER; + int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + +/* +//daniel: todo - check originally used... + int num_blocks_x = *nspec_outer_elastic; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + int blocksize = NGLL3_PADDED; + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); +*/ + + prepare_boundary_accel_on_device<<compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + // wait until kernel is finished before starting async memcpy +#if CUDA_VERSION >= 4000 + cudaDeviceSynchronize(); +#else + cudaThreadSynchronize(); +#endif + + cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer, + 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw), + cudaMemcpyDeviceToHost,mp->copy_stream); + // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer, + // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw), + // cudaMemcpyDeviceToHost,mp->compute_stream); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer, + int num_interfaces_ext_mesh, + int max_nibool_interfaces_ext_mesh, + int* d_nibool_interfaces_ext_mesh, + int* d_ibool_interfaces_ext_mesh) { + + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + //int bx = blockIdx.y*gridDim.x+blockIdx.x; + //int tx = threadIdx.x; + //int iinterface=0; + + for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) { + if(id < d_nibool_interfaces_ext_mesh[iinterface]) { + + // for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms) + // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] += + // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]; + // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1] += + // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]; + // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2] += + // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]; + + + atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)], + d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]); + atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1], + d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]); + atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2], + d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]); + } + } + // ! This step is done via previous function transfer_and_assemble... + // ! do iinterface = 1, num_interfaces_ext_mesh + // ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) + // ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = & + // ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface) + // ! enddo + // ! enddo +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_boundary_to_device_a, + TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh) { + +// asynchronous transfer from host to device + + TRACE("transfer_boundary_to_device_a"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw)); + + // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh, + // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), + // cudaMemcpyHostToDevice,mp->compute_stream); + //printf("xfer to device\n"); + cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh, + 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), + cudaMemcpyHostToDevice,mp->copy_stream); + + + + +} + +/* ----------------------------------------------------------------------------------------------- */ + +//daniel: not used ... +// +//extern "C" +//void FC_FUNC_(assemble_accel_on_device, +// ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel, +// realw* buffer_recv_vector_ext_mesh, +// int* num_interfaces_ext_mesh, +// int* max_nibool_interfaces_ext_mesh, +// int* nibool_interfaces_ext_mesh, +// int* ibool_interfaces_ext_mesh, +// int* FORWARD_OR_ADJOINT) { +// TRACE("assemble_accel_on_device"); +// +// Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container +// +// int blocksize = BLOCKSIZE_TRANSFER; +// int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; +// int num_blocks_x = size_padded/blocksize; +// int num_blocks_y = 1; +// while(num_blocks_x > 65535) { +// num_blocks_x = (int) ceil(num_blocks_x*0.5f); +// num_blocks_y = num_blocks_y*2; +// } +// +// //double start_time = get_time(); +// dim3 grid(num_blocks_x,num_blocks_y); +// dim3 threads(blocksize,1,1); +// // cudaEvent_t start, stop; +// // realw time; +// // cudaEventCreate(&start); +// // cudaEventCreate(&stop); +// // cudaEventRecord( start, 0 ); +// +// +// // *************************************************************************** +// // Wait until previous copy stream finishes. We assemble while other compute kernels execute. +// cudaStreamSynchronize(mp->copy_stream); +// +// // Assembling on the copy_stream breaks the solution and it "blows up" +// if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel +// assemble_boundary_accel_on_device<<compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer, +// mp->num_interfaces_ext_mesh, +// mp->max_nibool_interfaces_ext_mesh, +// mp->d_nibool_interfaces_ext_mesh, +// mp->d_ibool_interfaces_ext_mesh); +// } +// else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel +// assemble_boundary_accel_on_device<<copy_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer, +// mp->num_interfaces_ext_mesh, +// mp->max_nibool_interfaces_ext_mesh, +// mp->d_nibool_interfaces_ext_mesh, +// mp->d_ibool_interfaces_ext_mesh); +// } +// +// // cudaEventRecord( stop, 0 ); +// // cudaEventSynchronize( stop ); +// // cudaEventElapsedTime( &time, start, stop ); +// // cudaEventDestroy( start ); +// // cudaEventDestroy( stop ); +// // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time); +//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING +// //double end_time = get_time(); +// //printf("Elapsed time: %e\n",end_time-start_time); +// exit_on_cuda_error("transfer_asmbl_accel_to_device"); +//#endif +//} + +/* ----------------------------------------------------------------------------------------------- */ + +// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel +extern "C" +void FC_FUNC_(transfer_asmbl_accel_to_device, + TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT) { +TRACE("transfer_asmbl_accel_to_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously? + if(*FORWARD_OR_ADJOINT == 1 ){ + // Wait until previous copy stream finishes. We assemble while other compute kernels execute. + cudaStreamSynchronize(mp->copy_stream); + } + else if(*FORWARD_OR_ADJOINT == 3 ){ + cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh, + 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), + cudaMemcpyHostToDevice); + } + + int blocksize = BLOCKSIZE_TRANSFER; + int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize; + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + //double start_time = get_time(); + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + // cudaEvent_t start, stop; + // realw time; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); + // cudaEventRecord( start, 0 ); + if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel + assemble_boundary_accel_on_device<<compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel + assemble_boundary_accel_on_device<<compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer, + mp->num_interfaces_ext_mesh, + mp->max_nibool_interfaces_ext_mesh, + mp->d_nibool_interfaces_ext_mesh, + mp->d_ibool_interfaces_ext_mesh); + } + + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time); +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("transfer_asmbl_accel_to_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// KERNEL 2 + +/* ----------------------------------------------------------------------------------------------- */ + +/* ----------------------------------------------------------------------------------------------- */ + +//__global__ void Kernel_test(realw* d_debug_output,int* d_phase_ispec_inner_elastic, +// int num_phase_ispec_elastic, int d_iphase, int* d_ibool) { +// int bx = blockIdx.x; +// int tx = threadIdx.x; +// int working_element; +// //int ispec; +// //int NGLL3_ALIGN = 128; +// if(tx==0 && bx==0) { +// +// d_debug_output[0] = 420.0; +// +// d_debug_output[2] = num_phase_ispec_elastic; +// d_debug_output[3] = d_iphase; +// working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1; +// d_debug_output[4] = working_element; +// d_debug_output[5] = d_phase_ispec_inner_elastic[0]; +// /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */ +// } +// /* d_debug_output[1+tx+128*bx] = 69.0; */ +// +//} + +/* ----------------------------------------------------------------------------------------------- */ + +// updates stress + +__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC, + realw* R_xx, + realw* R_yy, + realw* R_xy, + realw* R_xz, + realw* R_yz, + reald* sigma_xx, + reald* sigma_yy, + reald* sigma_zz, + reald* sigma_xy, + reald* sigma_xz, + reald* sigma_yz) { + + int i_sls,offset_sls; + reald R_xx_val,R_yy_val; + + for(i_sls = 0; i_sls < N_SLS; i_sls++){ + // index + offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls); + + R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls) + R_yy_val = R_yy[offset_sls]; + + *sigma_xx = *sigma_xx - R_xx_val; + *sigma_yy = *sigma_yy - R_yy_val; + *sigma_zz = *sigma_zz + R_xx_val + R_yy_val; + *sigma_xy = *sigma_xy - R_xy[offset_sls]; + *sigma_xz = *sigma_xz - R_xz[offset_sls]; + *sigma_yz = *sigma_yz - R_yz[offset_sls]; + } + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +// updates R_memory + +__device__ void compute_element_att_memory(int tx,int working_element,int NSPEC, + realw* d_muv, + realw* factor_common, + realw* alphaval,realw* betaval,realw* gammaval, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, + realw* epsilondev_xz,realw* epsilondev_yz, + reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc, + reald epsilondev_xz_loc,reald epsilondev_yz_loc + ){ + + int i_sls; + int ijk_ispec; + int offset_sls,offset_align,offset_common; + reald mul; + reald alphaval_loc,betaval_loc,gammaval_loc; + reald factor_loc,Sn,Snp1; + + // indices + offset_align = tx + NGLL3_PADDED * working_element; + ijk_ispec = tx + NGLL3 * working_element; + + mul = d_muv[offset_align]; + + // use Runge-Kutta scheme to march in time + for(i_sls = 0; i_sls < N_SLS; i_sls++){ + + // indices + offset_common = i_sls + N_SLS*(tx + NGLL3*working_element); // (i_sls,i,j,k,ispec) + offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls); // (i,j,k,ispec,i_sls) + + factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval[i_sls]; // (i_sls) + betaval_loc = betaval[i_sls]; + gammaval_loc = gammaval[i_sls]; + + // term in xx + Sn = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc; //(i,j,k) + + //R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + + // betaval_loc * Sn + gammaval_loc * Snp1; + + R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] + + betaval_loc * Sn + gammaval_loc * Snp1; + + // term in yy + Sn = factor_loc * epsilondev_yy[ijk_ispec]; + Snp1 = factor_loc * epsilondev_yy_loc; + R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] + + betaval_loc * Sn + gammaval_loc * Snp1; + // term in zz not computed since zero trace + // term in xy + Sn = factor_loc * epsilondev_xy[ijk_ispec]; + Snp1 = factor_loc * epsilondev_xy_loc; + R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] + + betaval_loc * Sn + gammaval_loc * Snp1; + // term in xz + Sn = factor_loc * epsilondev_xz[ijk_ispec]; + Snp1 = factor_loc * epsilondev_xz_loc; + R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] + + betaval_loc * Sn + gammaval_loc * Snp1; + // term in yz + Sn = factor_loc * epsilondev_yz[ijk_ispec]; + Snp1 = factor_loc * epsilondev_yz_loc; + R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] + + betaval_loc * Sn + gammaval_loc * Snp1; + } + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +// pre-computes gravity term + +__device__ void compute_element_gravity(int tx,int working_element, + int* d_ibool, + realw* d_minus_g, + realw* d_minus_deriv_gravity, + realw* d_rhostore, + realw* wgll_cube, + reald jacobianl, + reald* s_dummyx_loc, + reald* s_dummyy_loc, + reald* s_dummyz_loc, + reald* sigma_xx, + reald* sigma_yy, + reald* sigma_xz, + reald* sigma_yz, + reald* rho_s_H1, + reald* rho_s_H2, + reald* rho_s_H3){ + + int iglob; + reald minus_g,minus_dg; + reald rhol; + reald gzl; // gxl,gyl, + reald sx_l,sy_l,sz_l; + reald Hxxl,Hyyl,Hzzl; //,Hxyl,Hxzl,Hyzl; + reald factor; + + // compute non-symmetric terms for gravity + + // get g, rho and dg/dr=dg + iglob = d_ibool[working_element*NGLL3 + tx]-1; + + minus_g = d_minus_g[iglob]; + minus_dg = d_minus_deriv_gravity[iglob]; + + // Cartesian components of the gravitational acceleration + //gxl = 0.f; + //gyl = 0.f; + gzl = minus_g; + + // Cartesian components of gradient of gravitational acceleration + // H = grad g + // assumes g only acts in negative z-direction + Hxxl = 0.f; + Hyyl = 0.f; + Hzzl = minus_dg; + //Hxyl = 0.f; + //Hxzl = 0.f; + //Hyzl = 0.f; + + rhol = d_rhostore[working_element*NGLL3_PADDED + tx]; + + // get displacement and multiply by density to compute G tensor + // G = rho [ sg - (s * g) I ] + sx_l = rhol * s_dummyx_loc[tx]; // d_displ[iglob*3]; + sy_l = rhol * s_dummyy_loc[tx]; // d_displ[iglob*3 + 1]; + sz_l = rhol * s_dummyz_loc[tx]; // d_displ[iglob*3 + 2]; + + // compute G tensor from s . g and add to sigma (not symmetric) + //sigma_xx += sy_l*gyl + sz_l*gzl; + *sigma_xx += sz_l*gzl; + //sigma_yy += sx_l*gxl + sz_l*gzl; + *sigma_yy += sz_l*gzl; + //sigma_zz += sx_l*gxl + sy_l*gyl; + + //sigma_xy -= sx_l*gyl; + //sigma_yx -= sy_l*gxl; + + *sigma_xz -= sx_l*gzl; + //sigma_zx -= sz_l*gxl; + + *sigma_yz -= sy_l*gzl; + //sigma_zy -= sz_l*gyl; + + // precompute vector + factor = jacobianl * wgll_cube[tx]; + + //rho_s_H1 = fac1 * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl); + //rho_s_H2 = fac1 * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl); + //rho_s_H3 = fac1 * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl); + + // only non-zero z-direction + *rho_s_H1 = factor * sx_l * Hxxl ; // 0.f; + *rho_s_H2 = factor * sy_l * Hyyl ; // 0.f; + *rho_s_H3 = factor * sz_l * Hzzl ; + + // debug + //*rho_s_H1 = 0.f; + //*rho_s_H2 = 0.f; + //*rho_s_H3 = 0.f ; + +} + +/* ----------------------------------------------------------------------------------------------- */ + +// double precision temporary variables leads to 10% performance +// decrease in Kernel_2_impl (not very much..) +//typedef realw reald; +#ifdef USE_TEXTURES_FIELDS +texture d_displ_tex; +texture d_accel_tex; +#endif + +#ifdef USE_TEXTURES_CONSTANTS +texture d_hprime_xx_tex; +#endif + +__global__ void Kernel_2_impl(int nb_blocks_to_compute, + int NGLOB, + int* d_ibool, + int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, + int d_iphase, + int use_mesh_coloring_gpu, + realw* d_displ, realw* d_accel, + realw* d_xix, realw* d_xiy, realw* d_xiz, + realw* d_etax, realw* d_etay, realw* d_etaz, + realw* d_gammax, realw* d_gammay, realw* d_gammaz, + realw* d_kappav, realw* d_muv, + int COMPUTE_AND_STORE_STRAIN, + realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, + realw* epsilondev_xz,realw* epsilondev_yz, + realw* epsilon_trace_over_3, + int SIMULATION_TYPE, + int ATTENUATION, + int NSPEC, + realw* one_minus_sum_beta,realw* factor_common, + realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz, + realw* alphaval,realw* betaval,realw* gammaval, + int ANISOTROPY, + realw* d_c11store, + realw* d_c12store, + realw* d_c13store, + realw* d_c14store, + realw* d_c15store, + realw* d_c16store, + realw* d_c22store, + realw* d_c23store, + realw* d_c24store, + realw* d_c25store, + realw* d_c26store, + realw* d_c33store, + realw* d_c34store, + realw* d_c35store, + realw* d_c36store, + realw* d_c44store, + realw* d_c45store, + realw* d_c46store, + realw* d_c55store, + realw* d_c56store, + realw* d_c66store, + int gravity, + realw* d_minus_g, + realw* d_minus_deriv_gravity, + realw* d_rhostore, + realw* wgll_cube){ + + /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/ + int bx = blockIdx.y*gridDim.x+blockIdx.x; + /* int bx = blockIdx.x; */ + int tx = threadIdx.x; + + //const int NGLLX = 5; + // const int NGLL2 = 25; + //const int NGLL3 = NGLL3; + const int NGLL3_ALIGN = NGLL3_PADDED; + + int K = (tx/NGLL2); + int J = ((tx-K*NGLL2)/NGLLX); + int I = (tx-K*NGLL2-J*NGLLX); + + int active,offset; + int iglob = 0; + int working_element; + + reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l; + reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl; + reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl; + reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl; + reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl; + reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal; + reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz; + reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc; + reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66; + reald sum_terms1,sum_terms2,sum_terms3; + + // gravity variables + reald sigma_yx,sigma_zx,sigma_zy; + reald rho_s_H1,rho_s_H2,rho_s_H3; + +#ifndef MANUALLY_UNROLLED_LOOPS + int l; + realw hp1,hp2,hp3; +#endif + + __shared__ reald s_dummyx_loc[NGLL3]; + __shared__ reald s_dummyy_loc[NGLL3]; + __shared__ reald s_dummyz_loc[NGLL3]; + + __shared__ reald s_tempx1[NGLL3]; + __shared__ reald s_tempx2[NGLL3]; + __shared__ reald s_tempx3[NGLL3]; + __shared__ reald s_tempy1[NGLL3]; + __shared__ reald s_tempy2[NGLL3]; + __shared__ reald s_tempy3[NGLL3]; + __shared__ reald s_tempz1[NGLL3]; + __shared__ reald s_tempz2[NGLL3]; + __shared__ reald s_tempz3[NGLL3]; + + __shared__ reald sh_hprime_xx[NGLL2]; + +// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads, +// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses + active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0; + +// copy from global memory to shared memory +// each thread writes one of the NGLL^3 = 125 data points + if (active) { + +#ifdef USE_MESH_COLORING_GPU + working_element = bx; +#else + //mesh coloring + if( use_mesh_coloring_gpu ){ + working_element = bx; + }else{ + // iphase-1 and working_element-1 for Fortran->C array conventions + working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1; + } +#endif + + // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1; + iglob = d_ibool[working_element*NGLL3 + tx]-1; + +#ifdef USE_TEXTURES_FIELDS + s_dummyx_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3); + s_dummyy_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 1); + s_dummyz_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 2); +#else + // changing iglob indexing to match fortran row changes fast style + s_dummyx_loc[tx] = d_displ[iglob*3]; + s_dummyy_loc[tx] = d_displ[iglob*3 + 1]; + s_dummyz_loc[tx] = d_displ[iglob*3 + 2]; +#endif + } + + if (tx < NGLL2) { + #ifdef USE_TEXTURES_CONSTANTS + sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx); + #else + sh_hprime_xx[tx] = d_hprime_xx[tx]; + #endif + } + +// synchronize all the threads (one thread for each of the NGLL grid points of the +// current spectral element) because we need the whole element to be ready in order +// to be able to compute the matrix products along cut planes of the 3D element below + __syncthreads(); + + if (active) { + +#ifndef MANUALLY_UNROLLED_LOOPS + + tempx1l = 0.f; + tempx2l = 0.f; + tempx3l = 0.f; + + tempy1l = 0.f; + tempy2l = 0.f; + tempy3l = 0.f; + + tempz1l = 0.f; + tempz2l = 0.f; + tempz3l = 0.f; + + for (l=0;l 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + int blocksize = NGLL3_PADDED; + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // Cuda timing + // cudaEvent_t start, stop; + // realw time; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); + // cudaEventRecord( start, 0 ); + + Kernel_2_impl<<compute_stream>>>(nb_blocks_to_compute, + mp->NGLOB_AB, + d_ibool, + mp->d_phase_ispec_inner_elastic, + mp->num_phase_ispec_elastic, + d_iphase, + mp->use_mesh_coloring_gpu, + mp->d_displ, mp->d_accel, + d_xix, d_xiy, d_xiz, + d_etax, d_etay, d_etaz, + d_gammax, d_gammay, d_gammaz, + d_kappav, d_muv, + COMPUTE_AND_STORE_STRAIN, + d_epsilondev_xx, + d_epsilondev_yy, + d_epsilondev_xy, + d_epsilondev_xz, + d_epsilondev_yz, + d_epsilon_trace_over_3, + SIMULATION_TYPE, + ATTENUATION,mp->NSPEC_AB, + d_one_minus_sum_beta, + d_factor_common, + d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz, + mp->d_alphaval,mp->d_betaval,mp->d_gammaval, + ANISOTROPY, + d_c11store, + d_c12store, + d_c13store, + d_c14store, + d_c15store, + d_c16store, + d_c22store, + d_c23store, + d_c24store, + d_c25store, + d_c26store, + d_c33store, + d_c34store, + d_c35store, + d_c36store, + d_c44store, + d_c45store, + d_c46store, + d_c55store, + d_c56store, + d_c66store, + mp->gravity, + mp->d_minus_g, + mp->d_minus_deriv_gravity, + d_rhostore, + mp->d_wgll_cube); + + + if(SIMULATION_TYPE == 3) { + Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute, + mp->NGLOB_AB, + d_ibool, + mp->d_phase_ispec_inner_elastic, + mp->num_phase_ispec_elastic, + d_iphase, + mp->use_mesh_coloring_gpu, + mp->d_b_displ, mp->d_b_accel, + d_xix, d_xiy, d_xiz, + d_etax, d_etay, d_etaz, + d_gammax, d_gammay, d_gammaz, + d_kappav, d_muv, + COMPUTE_AND_STORE_STRAIN, + d_b_epsilondev_xx, + d_b_epsilondev_yy, + d_b_epsilondev_xy, + d_b_epsilondev_xz, + d_b_epsilondev_yz, + d_b_epsilon_trace_over_3, + SIMULATION_TYPE, + ATTENUATION,mp->NSPEC_AB, + d_one_minus_sum_beta, + d_factor_common, + d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz, + mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval, + ANISOTROPY, + d_c11store, + d_c12store, + d_c13store, + d_c14store, + d_c15store, + d_c16store, + d_c22store, + d_c23store, + d_c24store, + d_c25store, + d_c26store, + d_c33store, + d_c34store, + d_c35store, + d_c36store, + d_c44store, + d_c45store, + d_c46store, + d_c55store, + d_c56store, + d_c66store, + mp->gravity, + mp->d_minus_g, + mp->d_minus_deriv_gravity, + d_rhostore, + mp->d_wgll_cube); + } + + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("Kernel2 Execution Time: %f ms\n",time); + + /* cudaThreadSynchronize(); */ + /* LOG("Kernel 2 finished"); */ +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("Kernel_2_impl "); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + + +extern "C" +void FC_FUNC_(compute_forces_elastic_cuda, + COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_elastic, + int* nspec_inner_elastic, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY) { + + TRACE("compute_forces_elastic_cuda"); + // EPIK_TRACER("compute_forces_elastic_cuda"); + //printf("Running compute_forces\n"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + int num_elements; + + if( *iphase == 1 ) + num_elements = *nspec_outer_elastic; + else + num_elements = *nspec_inner_elastic; + + // checks if anything to do + if( num_elements == 0 ) return; + + // mesh coloring + if( mp->use_mesh_coloring_gpu ){ + + // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements + // and followed by acoustic ones. + // elastic elements also start with outer than inner element ordering + + int nb_colors,nb_blocks_to_compute; + int istart; + int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2; + + // sets up color loop + if( *iphase == 1 ){ + // outer elements + nb_colors = mp->num_colors_outer_elastic; + istart = 0; + + // array offsets + color_offset = 0; + color_offset_nonpadded = 0; + color_offset_nonpadded_att2 = 0; + }else{ + // inner elements (start after outer elements) + nb_colors = mp->num_colors_outer_elastic + mp->num_colors_inner_elastic; + istart = mp->num_colors_outer_elastic; + + // array offsets + color_offset = (*nspec_outer_elastic) * NGLL3_PADDED; + color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3; + color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS; + } + + // loops over colors + for(int icolor = istart; icolor < nb_colors; icolor++){ + + nb_blocks_to_compute = mp->h_num_elem_colors_elastic[icolor]; + + // checks + //if( nb_blocks_to_compute <= 0 ){ + // printf("error number of elastic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor); + // exit(EXIT_FAILURE); + //} + + Kernel_2(nb_blocks_to_compute,mp,*iphase, + *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE, + *ATTENUATION,*ANISOTROPY, + mp->d_ibool + color_offset_nonpadded, + mp->d_xix + color_offset, + mp->d_xiy + color_offset, + mp->d_xiz + color_offset, + mp->d_etax + color_offset, + mp->d_etay + color_offset, + mp->d_etaz + color_offset, + mp->d_gammax + color_offset, + mp->d_gammay + color_offset, + mp->d_gammaz + color_offset, + mp->d_kappav + color_offset, + mp->d_muv + color_offset, + mp->d_epsilondev_xx + color_offset_nonpadded, + mp->d_epsilondev_yy + color_offset_nonpadded, + mp->d_epsilondev_xy + color_offset_nonpadded, + mp->d_epsilondev_xz + color_offset_nonpadded, + mp->d_epsilondev_yz + color_offset_nonpadded, + mp->d_epsilon_trace_over_3 + color_offset_nonpadded, + mp->d_one_minus_sum_beta + color_offset_nonpadded, + mp->d_factor_common + color_offset_nonpadded_att2, + mp->d_R_xx + color_offset_nonpadded, + mp->d_R_yy + color_offset_nonpadded, + mp->d_R_xy + color_offset_nonpadded, + mp->d_R_xz + color_offset_nonpadded, + mp->d_R_yz + color_offset_nonpadded, + mp->d_b_epsilondev_xx + color_offset_nonpadded, + mp->d_b_epsilondev_yy + color_offset_nonpadded, + mp->d_b_epsilondev_xy + color_offset_nonpadded, + mp->d_b_epsilondev_xz + color_offset_nonpadded, + mp->d_b_epsilondev_yz + color_offset_nonpadded, + mp->d_b_epsilon_trace_over_3 + color_offset_nonpadded, + mp->d_b_R_xx + color_offset_nonpadded, + mp->d_b_R_yy + color_offset_nonpadded, + mp->d_b_R_xy + color_offset_nonpadded, + mp->d_b_R_xz + color_offset_nonpadded, + mp->d_b_R_yz + color_offset_nonpadded, + mp->d_c11store + color_offset, + mp->d_c12store + color_offset, + mp->d_c13store + color_offset, + mp->d_c14store + color_offset, + mp->d_c15store + color_offset, + mp->d_c16store + color_offset, + mp->d_c22store + color_offset, + mp->d_c23store + color_offset, + mp->d_c24store + color_offset, + mp->d_c25store + color_offset, + mp->d_c26store + color_offset, + mp->d_c33store + color_offset, + mp->d_c34store + color_offset, + mp->d_c35store + color_offset, + mp->d_c36store + color_offset, + mp->d_c44store + color_offset, + mp->d_c45store + color_offset, + mp->d_c46store + color_offset, + mp->d_c55store + color_offset, + mp->d_c56store + color_offset, + mp->d_c66store + color_offset, + mp->d_rhostore + color_offset); + + // for padded and aligned arrays + color_offset += nb_blocks_to_compute * NGLL3_PADDED; + // for no-aligned arrays + color_offset_nonpadded += nb_blocks_to_compute * NGLL3; + // for factor_common array + color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS; + } + + }else{ + + // no mesh coloring: uses atomic updates + + Kernel_2(num_elements,mp,*iphase, + *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE, + *ATTENUATION,*ANISOTROPY, + mp->d_ibool, + mp->d_xix, + mp->d_xiy, + mp->d_xiz, + mp->d_etax, + mp->d_etay, + mp->d_etaz, + mp->d_gammax, + mp->d_gammay, + mp->d_gammaz, + mp->d_kappav, + mp->d_muv, + mp->d_epsilondev_xx, + mp->d_epsilondev_yy, + mp->d_epsilondev_xy, + mp->d_epsilondev_xz, + mp->d_epsilondev_yz, + mp->d_epsilon_trace_over_3, + mp->d_one_minus_sum_beta, + mp->d_factor_common, + mp->d_R_xx, + mp->d_R_yy, + mp->d_R_xy, + mp->d_R_xz, + mp->d_R_yz, + mp->d_b_epsilondev_xx, + mp->d_b_epsilondev_yy, + mp->d_b_epsilondev_xy, + mp->d_b_epsilondev_xz, + mp->d_b_epsilondev_yz, + mp->d_b_epsilon_trace_over_3, + mp->d_b_R_xx, + mp->d_b_R_yy, + mp->d_b_R_xy, + mp->d_b_R_xz, + mp->d_b_R_yz, + mp->d_c11store, + mp->d_c12store, + mp->d_c13store, + mp->d_c14store, + mp->d_c15store, + mp->d_c16store, + mp->d_c22store, + mp->d_c23store, + mp->d_c24store, + mp->d_c25store, + mp->d_c26store, + mp->d_c33store, + mp->d_c34store, + mp->d_c35store, + mp->d_c36store, + mp->d_c44store, + mp->d_c45store, + mp->d_c46store, + mp->d_c55store, + mp->d_c56store, + mp->d_c66store, + mp->d_rhostore); + } + + //daniel: todo - check with routine sync_copy_from_device below... +// // Wait until async-memcpy of outer elements is finished and start MPI. +// if(*iphase==2) { +// cudaStreamSynchronize(mp->copy_stream); +// +// // There have been problems using the pinned-memory with MPI, so +// // we copy the buffer into a non-pinned region. +// memcpy(mp->send_buffer,mp->h_send_accel_buffer, +// mp->size_mpi_send_buffer*sizeof(float)); +// +// // memory copy is now finished, so non-blocking MPI send can proceed +// // MPI based halo exchange +// +// assemble_mpi_vector_send_cuda_(&(mp->NPROCS), +// mp->send_buffer, /* "regular" memory */ +// // mp->h_send_accel_buffer, /* pinned memory **CRASH** */ +// mp->buffer_recv_vector_ext_mesh, +// &mp->num_interfaces_ext_mesh, +// &mp->max_nibool_interfaces_ext_mesh, +// mp->nibool_interfaces_ext_mesh, +// mp->my_neighbours_ext_mesh, +// mp->request_send_vector_ext_mesh, +// mp->request_recv_vector_ext_mesh); +// +// // Decided to keep launching kernels and to wait for MPI & do memcpy while other kernels launch. +// // cudaDeviceSynchronize(); +// } + +} + +/* ----------------------------------------------------------------------------------------------- */ + +//daniel: todo - use this instead above call to fortran routine to avoid compilation problems +extern "C" +void FC_FUNC_(sync_copy_from_device, + SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f, + int* iphase, + realw* send_buffer) { + + TRACE("sync_copy_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + // Wait until async-memcpy of outer elements is finished and start MPI. + if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); } + + //if(*iphase==2) { + + // waits for asynchronous copy to finish + cudaStreamSynchronize(mp->copy_stream); + + // There have been problems using the pinned-memory with MPI, so + // we copy the buffer into a non-pinned region. + memcpy(send_buffer,mp->h_send_accel_buffer, + mp->size_mpi_send_buffer*sizeof(float)); + + // memory copy is now finished, so non-blocking MPI send can proceed + + //} +} + +/* ----------------------------------------------------------------------------------------------- */ + +// KERNEL 3 + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_cuda_device(realw* veloc, + realw* accel, int size, + realw deltatover2, + realw* rmass) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + realw new_accel = accel[id] * rmass[id / 3]; + veloc[id] += deltatover2 * new_accel; + accel[id] = new_accel; +/* + accel[3*id] = accel[3*id]*rmass[id]; + accel[3*id+1] = accel[3*id+1]*rmass[id]; + accel[3*id+2] = accel[3*id+2]*rmass[id]; + + veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; + veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; + veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; +*/ + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_accel_cuda_device(realw* accel, + int size, + realw* rmass) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + accel[id] *= rmass[id / 3]; +/* + accel[3*id] = accel[3*id]*rmass[id]; + accel[3*id+1] = accel[3*id+1]*rmass[id]; + accel[3*id+2] = accel[3*id+2]*rmass[id]; +*/ + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void kernel_3_veloc_cuda_device(realw* veloc, + realw* accel, + int size, + realw deltatover2) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id]; + veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1]; + veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_a_cuda, + KERNEL_3_A_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F, + int* OCEANS) { +TRACE("kernel_3_a_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + //int size = *size_F; + int size = *size_F * 3; + int SIMULATION_TYPE = *SIMULATION_TYPE_f; + realw deltatover2 = *deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // check whether we can update accel and veloc, or only accel at this point + if( *OCEANS == 0 ){ + // updates both, accel and veloc + kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass); + + if(SIMULATION_TYPE == 3) { + kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass); + } + }else{ + // updates only accel + kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, size, mp->d_rmass); + + if(SIMULATION_TYPE == 3) { + kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel, size, mp->d_rmass); + } + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 a"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(kernel_3_b_cuda, + KERNEL_3_B_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F) { + TRACE("kernel_3_b_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper + int size = *size_F; + int SIMULATION_TYPE = *SIMULATION_TYPE_f; + realw deltatover2 = *deltatover2_F; + realw b_deltatover2 = *b_deltatover2_F; + + int blocksize = BLOCKSIZE_KERNEL3; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // updates only veloc at this point + kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,mp->d_accel,size,deltatover2); + + if(SIMULATION_TYPE == 3) { + kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("after kernel 3 b"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +/* OCEANS load on free surface */ + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void elastic_ocean_load_cuda_kernel(realw* accel, + realw* rmass, + realw* rmass_ocean_load, + int num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + realw* free_surface_normal, + int* ibool, + int* updated_dof_ocean_load) { + // gets spectral element face id + int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1) + int iface = blockIdx.x + gridDim.x*blockIdx.y; + realw nx,ny,nz; + realw force_normal_comp,additional_term; + + // for all faces on free surface + if( iface < num_free_surface_faces ){ + + int ispec = free_surface_ispec[iface]-1; + + // gets global point index + int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface) + int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1; + + //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob); + + // only update this global point once + + // daniel: TODO - there might be better ways to implement a mutex like below, + // and find a workaround to not use the temporary update array. + // atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point + + if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){ + + // get normal + nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface) + ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; + nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; + + // make updated component of right-hand side + // we divide by rmass() which is 1 / M + // we use the total force which includes the Coriolis term above + force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob]; + + additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp; + + // probably wouldn't need atomicAdd anymore, but just to be sure... + atomicAdd(&accel[iglob*3], + additional_term * nx); + atomicAdd(&accel[iglob*3+1], + additional_term * ny); + atomicAdd(&accel[iglob*3+2], + additional_term * nz); + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(elastic_ocean_load_cuda, + ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE) { + + TRACE("elastic_ocean_load_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // checks if anything to do + if( mp->num_free_surface_faces == 0 ) return; + + // block sizes: exact blocksize to match NGLLSQUARE + int blocksize = NGLL2; + + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + + // initializes temporary array to zero + print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, + sizeof(int)*mp->NGLOB_AB),88501); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("before kernel elastic_ocean_load_cuda"); +#endif + + elastic_ocean_load_cuda_kernel<<compute_stream>>>(mp->d_accel, + mp->d_rmass, + mp->d_rmass_ocean_load, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_free_surface_normal, + mp->d_ibool, + mp->d_updated_dof_ocean_load); + // for backward/reconstructed potentials + if(*SIMULATION_TYPE == 3) { + // re-initializes array + print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0, + sizeof(int)*mp->NGLOB_AB),88502); + + elastic_ocean_load_cuda_kernel<<compute_stream>>>(mp->d_b_accel, + mp->d_rmass, + mp->d_rmass_ocean_load, + mp->num_free_surface_faces, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_free_surface_normal, + mp->d_ibool, + mp->d_updated_dof_ocean_load); + + } + + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("elastic_ocean_load_cuda"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +/* note: + constant arrays when used in compute_forces_acoustic_cuda.cu routines stay zero, + constant declaration and cudaMemcpyToSymbol would have to be in the same file... + + extern keyword doesn't work for __constant__ declarations. + + also: + cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams)); + .. + and compile with -arch=sm_20 + + see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way + doesn't seem to work. + + we could keep arrays separated for acoustic and elastic routines... + + for now, we store pointers with cudaGetSymbolAddress() function calls. + + */ + + +// constant arrays + +void setConst_hprime_xx(realw* array,Mesh* mp) +{ + + cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n"); + exit(1); + } + + err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err)); + exit(1); + } +} + +// void setConst_hprime_yy(realw* array,Mesh* mp) +// { + +// cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw)); +// if (err != cudaSuccess) +// { +// fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err)); +// fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n"); +// exit(1); +// } + +// err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy"); +// if(err != cudaSuccess) { +// fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err)); +// exit(1); +// } +// } + +// void setConst_hprime_zz(realw* array,Mesh* mp) +// { + +// cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw)); +// if (err != cudaSuccess) +// { +// fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err)); +// fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n"); +// exit(1); +// } + +// err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz"); +// if(err != cudaSuccess) { +// fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err)); +// exit(1); +// } +// } + + +void setConst_hprimewgll_xx(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err)); + exit(1); + } + + err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err)); + exit(1); + } +} + +void setConst_hprimewgll_yy(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err)); + exit(1); + } + + err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err)); + exit(1); + } +} + +void setConst_hprimewgll_zz(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err)); + exit(1); + } + + err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err)); + exit(1); + } +} + +void setConst_wgllwgll_xy(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err)); + exit(1); + } + //mp->d_wgllwgll_xy = d_wgllwgll_xy; + err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err)); + exit(1); + } + +} + +void setConst_wgllwgll_xz(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err)); + exit(1); + } + //mp->d_wgllwgll_xz = d_wgllwgll_xz; + err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err)); + exit(1); + } + +} + +void setConst_wgllwgll_yz(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err)); + exit(1); + } + //mp->d_wgllwgll_yz = d_wgllwgll_yz; + err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err)); + exit(1); + } + +} + +void setConst_wgll_cube(realw* array,Mesh* mp) +{ + cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err)); + exit(1); + } + //mp->d_wgll_cube = d_wgll_cube; + err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube"); + if(err != cudaSuccess) { + fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err)); + exit(1); + } + +} diff --git a/src/cuda/compute_kernels_cuda.cu b/src/cuda/compute_kernels_cuda.cu new file mode 100644 index 000000000..2cb339185 --- /dev/null +++ b/src/cuda/compute_kernels_cuda.cu @@ -0,0 +1,646 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// ELASTIC SIMULATIONS + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_kernels_cudakernel(int* ispec_is_elastic, + int* ibool, + realw* accel, + realw* b_displ, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + realw* rho_kl, + realw deltat, + realw* mu_kl, + realw* kappa_kl, + realw* epsilon_trace_over_3, + realw* b_epsilon_trace_over_3, + int NSPEC_AB) { + + int ispec = blockIdx.x + blockIdx.y*gridDim.x; + + // handles case when there is 1 extra block (due to rectangular grid) + if(ispec < NSPEC_AB) { + + // elastic elements only + if( ispec_is_elastic[ispec] ) { + + int ijk = threadIdx.x; + int ijk_ispec = ijk + NGLL3*ispec; + int iglob = ibool[ijk_ispec] - 1 ; + + // isotropic kernels: + // density kernel + rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+ + accel[3*iglob+1]*b_displ[3*iglob+1]+ + accel[3*iglob+2]*b_displ[3*iglob+2]); + + + // shear modulus kernel + mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+ + epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+ + (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])* + (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+ + 2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+ + epsilondev_xz[ijk_ispec]*b_epsilondev_xz[ijk_ispec]+ + epsilondev_yz[ijk_ispec]*b_epsilondev_yz[ijk_ispec])); + + // bulk modulus kernel + kappa_kl[ijk_ispec] += deltat*(9*epsilon_trace_over_3[ijk_ispec]* + b_epsilon_trace_over_3[ijk_ispec]); + + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_kernels_elastic_cuda, + COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer, + realw* deltat_f) { +TRACE("compute_kernels_elastic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ + realw deltat = *deltat_f; + + int num_blocks_x = mp->NSPEC_AB; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + compute_kernels_cudakernel<<>>(mp->d_ispec_is_elastic,mp->d_ibool, + mp->d_accel, mp->d_b_displ, + mp->d_epsilondev_xx, + mp->d_epsilondev_yy, + mp->d_epsilondev_xy, + mp->d_epsilondev_xz, + mp->d_epsilondev_yz, + mp->d_b_epsilondev_xx, + mp->d_b_epsilondev_yy, + mp->d_b_epsilondev_xy, + mp->d_b_epsilondev_xz, + mp->d_b_epsilondev_yz, + mp->d_rho_kl, + deltat, + mp->d_mu_kl, + mp->d_kappa_kl, + mp->d_epsilon_trace_over_3, + mp->d_b_epsilon_trace_over_3, + mp->NSPEC_AB); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_kernels_elastic_cuda"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// NOISE SIMULATIONS + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void compute_kernels_strength_noise_cuda_kernel(realw* displ, + int* free_surface_ispec, + int* free_surface_ijk, + int* ibool, + realw* noise_surface_movie, + realw* normal_x_noise, + realw* normal_y_noise, + realw* normal_z_noise, + realw* Sigma_kl, + realw deltat, + int num_free_surface_faces) { + int iface = blockIdx.x + blockIdx.y*gridDim.x; + + if(iface < num_free_surface_faces) { + + int ispec = free_surface_ispec[iface]-1; + int igll = threadIdx.x; + int ipoin = igll + NGLL2*iface; + int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ; + int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1; + int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ; + + realw eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+ + noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+ + noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]); + + Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+ + normal_y_noise[ipoin]*displ[1+3*iglob]+ + normal_z_noise[ipoin]*displ[2+3*iglob]); + } + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_kernels_strgth_noise_cu, + COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, + realw* h_noise_surface_movie, + realw* deltat) { + +TRACE("compute_kernels_strgth_noise_cu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie, + 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice); + + + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(NGLL2,1,1); + + compute_kernels_strength_noise_cuda_kernel<<>>(mp->d_displ, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->d_ibool, + mp->d_noise_surface_movie, + mp->d_normal_x_noise, + mp->d_normal_y_noise, + mp->d_normal_z_noise, + mp->d_Sigma_kl,*deltat, + mp->num_free_surface_faces); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel"); +#endif +} + + + +/* ----------------------------------------------------------------------------------------------- */ + +// ACOUSTIC SIMULATIONS + +/* ----------------------------------------------------------------------------------------------- */ + + +__device__ void compute_gradient_kernel(int ijk, + int ispec, + realw* scalar_field, + realw* vector_field_element, + realw* hprime_xx, + realw* hprime_yy, + realw* hprime_zz, + realw* d_xix, + realw* d_xiy, + realw* d_xiz, + realw* d_etax, + realw* d_etay, + realw* d_etaz, + realw* d_gammax, + realw* d_gammay, + realw* d_gammaz, + realw rhol, + int gravity) { + + realw temp1l,temp2l,temp3l; + realw hp1,hp2,hp3; + realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl; + realw rho_invl; + int l,offset,offset1,offset2,offset3; + + //const int NGLLX = 5; + const int NGLL3_ALIGN = NGLL3_PADDED; + + int K = (ijk/NGLL2); + int J = ((ijk-K*NGLL2)/NGLLX); + int I = (ijk-K*NGLL2-J*NGLLX); + + // derivative along x + temp1l = 0.f; + for( l=0; lNSPEC_AB; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + compute_kernels_acoustic_kernel<<>>(mp->d_ispec_is_acoustic, + mp->d_ibool, + mp->d_rhostore, + mp->d_kappastore, + mp->d_hprime_xx, + mp->d_hprime_yy, + mp->d_hprime_zz, + mp->d_xix, + mp->d_xiy, + mp->d_xiz, + mp->d_etax, + mp->d_etay, + mp->d_etaz, + mp->d_gammax, + mp->d_gammay, + mp->d_gammaz, + mp->d_potential_dot_dot_acoustic, + mp->d_b_potential_acoustic, + mp->d_b_potential_dot_dot_acoustic, + mp->d_rho_ac_kl, + mp->d_kappa_ac_kl, + deltat, + mp->NSPEC_AB, + mp->gravity); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_kernels_acoustic_kernel"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// preconditioner (approximate Hessian kernel) + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic, + int* ibool, + realw* accel, + realw* b_accel, + realw* hess_kl, + realw deltat, + int NSPEC_AB) { + + int ispec = blockIdx.x + blockIdx.y*gridDim.x; + + // handles case when there is 1 extra block (due to rectangular grid) + if(ispec < NSPEC_AB) { + + // elastic elements only + if( ispec_is_elastic[ispec] ) { + + int ijk = threadIdx.x; + int ijk_ispec = ijk + NGLL3*ispec; + int iglob = ibool[ijk_ispec] - 1 ; + + // approximate hessian + hess_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_accel[3*iglob]+ + accel[3*iglob+1]*b_accel[3*iglob+1]+ + accel[3*iglob+2]*b_accel[3*iglob+2]); + } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic, + int* ibool, + realw* potential_dot_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + realw* rhostore, + realw* hprime_xx, + realw* hprime_yy, + realw* hprime_zz, + realw* d_xix, + realw* d_xiy, + realw* d_xiz, + realw* d_etax, + realw* d_etay, + realw* d_etaz, + realw* d_gammax, + realw* d_gammay, + realw* d_gammaz, + realw* hess_kl, + realw deltat, + int NSPEC_AB, + int gravity) { + + int ispec = blockIdx.x + blockIdx.y*gridDim.x; + + // handles case when there is 1 extra block (due to rectangular grid) + if(ispec < NSPEC_AB) { + + // acoustic elements only + if( ispec_is_acoustic[ispec] ){ + + // local and global indices + int ijk = threadIdx.x; + int ijk_ispec = ijk + NGLL3*ispec; + int iglob = ibool[ijk_ispec] - 1 ; + + int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec; + + realw accel_elm[3]; + realw b_accel_elm[3]; + realw rhol; + + // shared memory between all threads within this block + __shared__ realw scalar_field_accel[NGLL3]; + __shared__ realw scalar_field_b_accel[NGLL3]; + + // copy field values + scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob]; + scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob]; + __syncthreads(); + + // gets material parameter + rhol = rhostore[ijk_ispec_padded]; + + // acceleration vector + compute_gradient_kernel(ijk,ispec, + scalar_field_accel,accel_elm, + hprime_xx,hprime_yy,hprime_zz, + d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz, + rhol,gravity); + + // acceleration vector from backward field + compute_gradient_kernel(ijk,ispec, + scalar_field_b_accel,b_accel_elm, + hprime_xx,hprime_yy,hprime_zz, + d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz, + rhol,gravity); + // approximates hessian + hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] + + accel_elm[1]*b_accel_elm[1] + + accel_elm[2]*b_accel_elm[2]); + + } // ispec_is_acoustic + + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_kernels_hess_cuda, + COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer, + realw* deltat_f, + int* ELASTIC_SIMULATION, + int* ACOUSTIC_SIMULATION) { + TRACE("compute_kernels_hess_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ + realw deltat = *deltat_f; + + int num_blocks_x = mp->NSPEC_AB; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + if( *ELASTIC_SIMULATION ) { + compute_kernels_hess_el_cudakernel<<>>(mp->d_ispec_is_elastic, + mp->d_ibool, + mp->d_accel, + mp->d_b_accel, + mp->d_hess_el_kl, + deltat, + mp->NSPEC_AB); + } + + if( *ACOUSTIC_SIMULATION ) { + compute_kernels_hess_ac_cudakernel<<>>(mp->d_ispec_is_acoustic, + mp->d_ibool, + mp->d_potential_dot_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + mp->d_rhostore, + mp->d_hprime_xx, + mp->d_hprime_yy, + mp->d_hprime_zz, + mp->d_xix, + mp->d_xiy, + mp->d_xiz, + mp->d_etax, + mp->d_etay, + mp->d_etaz, + mp->d_gammax, + mp->d_gammay, + mp->d_gammaz, + mp->d_hess_ac_kl, + deltat, + mp->NSPEC_AB, + mp->gravity); + } + + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_kernels_hess_cuda"); +#endif +} + diff --git a/src/cuda/compute_stacey_acoustic_cuda.cu b/src/cuda/compute_stacey_acoustic_cuda.cu new file mode 100644 index 000000000..0174923ec --- /dev/null +++ b/src/cuda/compute_stacey_acoustic_cuda.cu @@ -0,0 +1,192 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + int* abs_boundary_ispec, + int* abs_boundary_ijk, + realw* abs_boundary_jacobian2Dw, + int* ibool, + realw* rhostore, + realw* kappastore, + int* ispec_is_inner, + int* ispec_is_acoustic, + int phase_is_inner, + int SIMULATION_TYPE, int SAVE_FORWARD, + int num_abs_boundary_faces, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + realw* b_absorb_potential, + int gravity) { + + int igll = threadIdx.x; + int iface = blockIdx.x + gridDim.x*blockIdx.y; + + int i,j,k,iglob,ispec; + realw rhol,kappal,cpl; + realw jacobianw; + realw vel; + + // don't compute points outside NGLLSQUARE==NGLL2==25 + // way 2: no further check needed since blocksize = 25 + if( iface < num_abs_boundary_faces){ + + // if(igll C indexing + ispec = abs_boundary_ispec[iface]-1; + + if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) { + + i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1; + j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1; + k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1; + iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + // determines bulk sound speed + rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]; + + kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)]; + + cpl = sqrt( kappal / rhol ); + + // velocity + if( gravity ){ + // daniel: TODO - check gravity and stacey condition here... + // uses a potential definition of: s = grad(chi) + vel = potential_dot_acoustic[iglob] / rhol ; + }else{ + // uses a potential definition of: s = 1/rho grad(chi) + vel = potential_dot_acoustic[iglob] / rhol; + } + + // gets associated, weighted jacobian + jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)]; + + // Sommerfeld condition + atomicAdd(&potential_dot_dot_acoustic[iglob],-vel*jacobianw/cpl); + + // adjoint simulations + if( SIMULATION_TYPE == 3 ){ + // Sommerfeld condition + atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]); + }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){ + // saves boundary values + b_absorb_potential[INDEX2(NGLL2,igll,iface)] = vel*jacobianw/cpl; + } + } +// } + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(compute_stacey_acoustic_cuda, + COMPUTE_STACEY_ACOUSTIC_CUDA)( + long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_potential) { +TRACE("compute_stacey_acoustic_cuda"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + int phase_is_inner = *phase_is_innerf; + int SIMULATION_TYPE = *SIMULATION_TYPEf; + int SAVE_FORWARD = *SAVE_FORWARDf; + + // way 1: Elapsed time: 4.385948e-03 + // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel + // int blocksize = 32; + + // way 2: Elapsed time: 4.379034e-03 + // > NGLLSQUARE==NGLL2==25, no further check inside kernel + int blocksize = NGLL2; + + int num_blocks_x = mp->d_num_abs_boundary_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // adjoint simulations: reads in absorbing boundary + if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){ + // copies array to GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential, + mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700); + } + + compute_stacey_acoustic_kernel<<>>(mp->d_potential_dot_acoustic, + mp->d_potential_dot_dot_acoustic, + mp->d_abs_boundary_ispec, + mp->d_abs_boundary_ijk, + mp->d_abs_boundary_jacobian2Dw, + mp->d_ibool, + mp->d_rhostore, + mp->d_kappastore, + mp->d_ispec_is_inner, + mp->d_ispec_is_acoustic, + phase_is_inner, + SIMULATION_TYPE,SAVE_FORWARD, + mp->d_num_abs_boundary_faces, + mp->d_b_potential_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + mp->d_b_absorb_potential, + mp->gravity); + + // adjoint simulations: stores absorbed wavefield part + if (SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){ + // copies array to CPU + print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential, + mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_stacey_acoustic_kernel"); +#endif +} + diff --git a/src/cuda/compute_stacey_elastic_cuda.cu b/src/cuda/compute_stacey_elastic_cuda.cu new file mode 100644 index 000000000..a00776495 --- /dev/null +++ b/src/cuda/compute_stacey_elastic_cuda.cu @@ -0,0 +1,214 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_stacey_elastic_kernel(realw* veloc, + realw* accel, + int* abs_boundary_ispec, + int* abs_boundary_ijk, + realw* abs_boundary_normal, + realw* abs_boundary_jacobian2Dw, + int* ibool, + realw* rho_vp, + realw* rho_vs, + int* ispec_is_inner, + int* ispec_is_elastic, + int phase_is_inner, + int SIMULATION_TYPE, + int SAVE_FORWARD, + int num_abs_boundary_faces, + realw* b_accel, + realw* b_absorb_field) { + + int igll = threadIdx.x; // tx + int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx + + int i,j,k,iglob,ispec; + realw vx,vy,vz,vn; + realw nx,ny,nz; + realw rho_vp_temp,rho_vs_temp; + realw tx,ty,tz; + realw jacobianw; + + // don't compute points outside NGLLSQUARE==NGLL2==25 + // way 2: no further check needed since blocksize = 25 + if( iface < num_abs_boundary_faces){ + + //if(igll < NGLL2 && iface < num_abs_boundary_faces) { + + // "-1" from index values to convert from Fortran-> C indexing + ispec = abs_boundary_ispec[iface]-1; + + if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) { + + i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1; + j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1; + k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1; + iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1; + + // gets associated velocity + + vx = veloc[iglob*3+0]; + vy = veloc[iglob*3+1]; + vz = veloc[iglob*3+2]; + + // gets associated normal + nx = abs_boundary_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; + ny = abs_boundary_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; + nz = abs_boundary_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; + + // // velocity component in normal direction (normal points out of element) + vn = vx*nx + vy*ny + vz*nz; + + rho_vp_temp = rho_vp[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]; + rho_vs_temp = rho_vs[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]; + + tx = rho_vp_temp*vn*nx + rho_vs_temp*(vx-vn*nx); + ty = rho_vp_temp*vn*ny + rho_vs_temp*(vy-vn*ny); + tz = rho_vp_temp*vn*nz + rho_vs_temp*(vz-vn*nz); + + jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)]; + + atomicAdd(&accel[iglob*3],-tx*jacobianw); + atomicAdd(&accel[iglob*3+1],-ty*jacobianw); + atomicAdd(&accel[iglob*3+2],-tz*jacobianw); + + if(SIMULATION_TYPE == 3) { + atomicAdd(&b_accel[iglob*3 ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]); + atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]); + atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]); + } + else if(SAVE_FORWARD && SIMULATION_TYPE == 1) { + b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw; + b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw; + b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw; + } // SIMULATION_TYPE + } + } // num_abs_boundary_faces + +} + +/* ----------------------------------------------------------------------------------------------- */ + + +extern "C" +void FC_FUNC_(compute_stacey_elastic_cuda, + COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_field) { + +TRACE("compute_stacey_elastic_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + // check + if( mp->d_num_abs_boundary_faces == 0 ) return; + + int phase_is_inner = *phase_is_innerf; + int SIMULATION_TYPE = *SIMULATION_TYPEf; + int SAVE_FORWARD = *SAVE_FORWARDf; + + // way 1 + // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel + //int blocksize = 32; + + // way 2: seems sligthly faster + // > NGLLSQUARE==NGLL2==25, no further check inside kernel + int blocksize = NGLL2; + + int num_blocks_x = mp->d_num_abs_boundary_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) { + // The read is done in fortran + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field, + mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel"); +#endif + + compute_stacey_elastic_kernel<<>>(mp->d_veloc, + mp->d_accel, + mp->d_abs_boundary_ispec, + mp->d_abs_boundary_ijk, + mp->d_abs_boundary_normal, + mp->d_abs_boundary_jacobian2Dw, + mp->d_ibool, + mp->d_rho_vp, + mp->d_rho_vs, + mp->d_ispec_is_inner, + mp->d_ispec_is_elastic, + phase_is_inner, + SIMULATION_TYPE,SAVE_FORWARD, + mp->d_num_abs_boundary_faces, + mp->d_b_accel, + mp->d_b_absorb_field); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("compute_stacey_elastic_kernel"); +#endif + + // ! adjoint simulations: stores absorbed wavefield part + // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) & + // write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field + + if(SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) { + print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field, + mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701); + // The write is done in fortran + // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after compute_stacey_elastic after cudamemcpy"); +#endif +} + diff --git a/src/cuda/it_update_displacement_cuda.cu b/src/cuda/it_update_displacement_cuda.cu new file mode 100644 index 000000000..3c0b3ff94 --- /dev/null +++ b/src/cuda/it_update_displacement_cuda.cu @@ -0,0 +1,228 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +#define CUBLAS_ERROR(s,n) if (s != CUBLAS_STATUS_SUCCESS) { \ +fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \ +exit(EXIT_FAILURE); } + +/* ----------------------------------------------------------------------------------------------- */ + +// elastic wavefield + +/* ----------------------------------------------------------------------------------------------- */ + + +__global__ void UpdateDispVeloc_kernel(realw* displ, + realw* veloc, + realw* accel, + int size, + realw deltat, + realw deltatsqover2, + realw deltatover2) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id]; + veloc[id] = veloc[id] + deltatover2*accel[id]; + accel[id] = 0; // can do this using memset...not sure if faster + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(it_update_displacement_cuda, + IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) { + +TRACE("it_update_displacement_cuda"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + //int i,device; + + int size = *size_F; + realw deltat = *deltat_F; + realw deltatsqover2 = *deltatsqover2_F; + realw deltatover2 = *deltatover2_F; + realw b_deltat = *b_deltat_F; + realw b_deltatsqover2 = *b_deltatsqover2_F; + realw b_deltatover2 = *b_deltatover2_F; + //cublasStatus status; + + int blocksize = BLOCKSIZE_KERNEL1; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + +//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING +// exit_on_cuda_error("Before UpdateDispVeloc_kernel"); +//#endif + + //launch kernel + UpdateDispVeloc_kernel<<compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel, + size,deltat,deltatsqover2,deltatover2); + + //cudaThreadSynchronize(); +//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING +// //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); +// // sync and check to catch errors from previous async operations +// exit_on_cuda_error("UpdateDispVeloc_kernel"); +//#endif + + // kernel for backward fields + if(*SIMULATION_TYPE == 3) { + + UpdateDispVeloc_kernel<<compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel, + size,b_deltat,b_deltatsqover2,b_deltatover2); + +//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING +// //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); +// exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel"); +//#endif + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("it_update_displacement_cuda"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// acoustic wavefield + +// KERNEL 1 +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void UpdatePotential_kernel(realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + int size, + realw deltat, + realw deltatsqover2, + realw deltatover2) { + int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x; + + /* because of block and grid sizing problems, there is a small */ + /* amount of buffer at the end of the calculation */ + if(id < size) { + potential_acoustic[id] = potential_acoustic[id] + + deltat*potential_dot_acoustic[id] + + deltatsqover2*potential_dot_dot_acoustic[id]; + + potential_dot_acoustic[id] = potential_dot_acoustic[id] + + deltatover2*potential_dot_dot_acoustic[id]; + + potential_dot_dot_acoustic[id] = 0; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(it_update_displacement_ac_cuda, + it_update_displacement_ac_cuda)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) { +TRACE("it_update_displacement_ac_cuda"); + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + //int i,device; + int size = *size_F; + realw deltat = *deltat_F; + realw deltatsqover2 = *deltatsqover2_F; + realw deltatover2 = *deltatover2_F; + realw b_deltat = *b_deltat_F; + realw b_deltatsqover2 = *b_deltatsqover2_F; + realw b_deltatover2 = *b_deltatover2_F; + //cublasStatus status; + + int blocksize = BLOCKSIZE_KERNEL1; + int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize; + + int num_blocks_x = size_padded/blocksize; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + //launch kernel + UpdatePotential_kernel<<compute_stream>>>(mp->d_potential_acoustic, + mp->d_potential_dot_acoustic, + mp->d_potential_dot_dot_acoustic, + size,deltat,deltatsqover2,deltatover2); + + if(*SIMULATION_TYPE == 3) { + UpdatePotential_kernel<<compute_stream>>>(mp->d_b_potential_acoustic, + mp->d_b_potential_dot_acoustic, + mp->d_b_potential_dot_dot_acoustic, + size,b_deltat,b_deltatsqover2,b_deltatover2); + } + + //cudaThreadSynchronize(); +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y); + exit_on_cuda_error("it_update_displacement_ac_cuda"); +#endif +} diff --git a/src/cuda/mesh_constants_cuda.h b/src/cuda/mesh_constants_cuda.h new file mode 100644 index 000000000..032504321 --- /dev/null +++ b/src/cuda/mesh_constants_cuda.h @@ -0,0 +1,467 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +/* trivia + +- for most working arrays we use now "realw" instead of "float" type declarations to make it easier to switch + between a real or double precision simulation + (matching CUSTOM_REAL == 4 or 8 in fortran routines). + +- instead of boolean "logical" declared in fortran routines, in C (or Cuda-C) we have to use "int" variables. + ifort / gfortran caveat: + to check whether it is true or false, do not check for == 1 to test for true values since ifort just uses + non-zero values for true (e.g. can be -1 for true). however, false will be always == 0. + thus, rather use: if( var ) {...} for testing if true instead of if( var == 1){...} (alternative: one could use if( var != 0 ){...} + +*/ + +#ifndef GPU_MESH_ +#define GPU_MESH_ + +#include +#include + +/* ----------------------------------------------------------------------------------------------- */ + +// for debugging and benchmarking + +/* ----------------------------------------------------------------------------------------------- */ + +#define DEBUG 0 +#if DEBUG == 1 +#define TRACE(x) printf("%s\n",x); +#else +#define TRACE(x) // printf("%s\n",x); +#endif + +#define MAXDEBUG 0 +#if MAXDEBUG == 1 +#define LOG(x) printf("%s\n",x) +#define PRINT5(var,offset) for(;print_count<5;print_count++) printf("var(%d)=%2.20f\n",print_count,var[offset+print_count]); +#define PRINT10(var) if(print_count<10) { printf("var=%1.20e\n",var); print_count++; } +#define PRINT10i(var) if(print_count<10) { printf("var=%d\n",var); print_count++; } +#else +#define LOG(x) // printf("%s\n",x); +#define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]); +#endif + +// error checking after cuda function calls +/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */ + +#define MAX(x,y) (((x) < (y)) ? (y) : (x)) + +double get_time(); + +void print_CUDA_error_if_any(cudaError_t err, int num); + +void pause_for_debugger(int pause); + +void exit_on_cuda_error(char* kernel_name); + +void exit_on_error(char* info); + +void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y); + +/* ----------------------------------------------------------------------------------------------- */ + +// cuda constant arrays + +/* ----------------------------------------------------------------------------------------------- */ + +// dimensions +#define NDIM 3 + +// Gauss-Lobatto-Legendre +#define NGLLX 5 +#define NGLL2 25 +#define NGLL3 125 // no padding: requires same size as in fortran for NGLLX * NGLLY * NGLLZ + +// padding: 128 == 2**7 might improve on older graphics cards w/ coalescent memory accesses: +#define NGLL3_PADDED 128 +// no padding: 125 == 5*5*5 to avoid allocation of extra memory +//#define NGLL3_PADDED 125 + +// number of standard linear solids +#define N_SLS 3 + +//typedef float real; // type of variables passed into function +typedef float realw; // type of "working" variables + +// double precision temporary variables leads to 10% performance +// decrease in Kernel_2_impl (not very much..) +typedef float reald; + +// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h: +// leads up to ~ 5% performance increase +//#define USE_MESH_COLORING_GPU + +// Texture memory usage: +// requires CUDA version >= 4.0, see check below +// Use textures for d_displ and d_accel -- 10% performance boost +#define USE_TEXTURES_FIELDS +// +// Using texture memory for the hprime-style constants is slower on +// Fermi generation hardware, but *may* be faster on Kepler +// generation. +// Use textures for hprime_xx +//#define USE_TEXTURES_CONSTANTS + +// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize() +#if CUDA_VERSION < 4000 +#undef USE_TEXTURES_FIELDS +#undef USE_TEXTURES_CONSTANTS +#endif + + +// (optional) unrolling loops +// leads up to ~1% performance increase +//#define MANUALLY_UNROLLED_LOOPS + +// cuda kernel block size for updating displacements/potential (newmark time scheme) +// current hardware: 128 is slightly faster than 256 ( ~ 4%) +#define BLOCKSIZE_KERNEL1 128 +#define BLOCKSIZE_KERNEL3 128 +#define BLOCKSIZE_TRANSFER 256 + +/* ----------------------------------------------------------------------------------------------- */ + +// indexing + +#define INDEX2(xsize,x,y) x + (y)*xsize + +#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z) +//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize + +#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i)) +//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + +#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j)))) +//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize + +#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k)))) + +#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED +//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED + +/* ----------------------------------------------------------------------------------------------- */ + +// mesh pointer wrapper structure + +/* ----------------------------------------------------------------------------------------------- */ + +typedef struct mesh_ { + + // mesh resolution + int NSPEC_AB; + int NGLOB_AB; + + int myrank; + int NPROCS; + + // interpolators + realw* d_xix; realw* d_xiy; realw* d_xiz; + realw* d_etax; realw* d_etay; realw* d_etaz; + realw* d_gammax; realw* d_gammay; realw* d_gammaz; + + // model parameters + realw* d_kappav; realw* d_muv; + + // global indexing + int* d_ibool; + + // inner / outer elements + int* d_ispec_is_inner; + + // mesh coloring + int use_mesh_coloring_gpu; + + // pointers to constant memory arrays + realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz; + realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz; + realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz; + realw* d_wgll_cube; + + // A buffer for mpi-send/recv, which is duplicated in fortran but is + // allocated with pinned memory to facilitate asynchronus device <-> + // host memory transfers + float* h_send_accel_buffer; + float* h_send_b_accel_buffer; + float* send_buffer; + float* h_recv_accel_buffer; + float* h_recv_b_accel_buffer; + float* recv_buffer; + int size_mpi_send_buffer; + int size_mpi_recv_buffer; + + + + // buffers and constants for the MPI-send required for async-memcpy + // + non-blocking MPI + float* buffer_recv_vector_ext_mesh; + int num_interfaces_ext_mesh; + int max_nibool_interfaces_ext_mesh; + int* nibool_interfaces_ext_mesh; + int* my_neighbours_ext_mesh; + int* request_send_vector_ext_mesh; + int* request_recv_vector_ext_mesh; + + + // overlapped memcpy streams + cudaStream_t compute_stream; + cudaStream_t copy_stream; + cudaStream_t b_copy_stream; + + // ------------------------------------------------------------------ // + // elastic wavefield parameters + // ------------------------------------------------------------------ // + + // displacement, velocity, acceleration + realw* d_displ; realw* d_veloc; realw* d_accel; + // backward/reconstructed elastic wavefield + realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel; + + // Texture references for fast non-coalesced scattered access + const textureReference* d_displ_tex_ref_ptr; + const textureReference* d_accel_tex_ref_ptr; + + // elastic elements + int* d_ispec_is_elastic; + + // elastic domain parameters + int* d_phase_ispec_inner_elastic; + int num_phase_ispec_elastic; + + // mesh coloring + int* h_num_elem_colors_elastic; + int num_colors_outer_elastic,num_colors_inner_elastic; + int nspec_elastic; + + realw* d_rmass; + + // mpi buffer + realw* d_send_accel_buffer; + + // interfaces + int* d_nibool_interfaces_ext_mesh; + int* d_ibool_interfaces_ext_mesh; + + //used for absorbing stacey boundaries + int d_num_abs_boundary_faces; + int* d_abs_boundary_ispec; + int* d_abs_boundary_ijk; + realw* d_abs_boundary_normal; + realw* d_abs_boundary_jacobian2Dw; + + realw* d_b_absorb_field; + int d_b_reclen_field; + + realw* d_rho_vp; + realw* d_rho_vs; + + // sources + int nsources_local; + realw* d_sourcearrays; + double* d_stf_pre_compute; + int* d_islice_selected_source; + int* d_ispec_selected_source; + + // receivers + int* d_number_receiver_global; + int* d_ispec_selected_rec; + int nrec_local; + realw* d_station_seismo_field; + realw* h_station_seismo_field; + + double* d_hxir, *d_hetar, *d_hgammar; + double* d_dxd, *d_dyd, *d_dzd; + double* d_vxd, *d_vyd, *d_vzd; + double* d_axd, *d_ayd, *d_azd; + realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a; + double* d_nu; + + realw* h_seismograms_d_it; + realw* h_seismograms_v_it; + realw* h_seismograms_a_it; + + // adjoint receivers/sources + int nadj_rec_local; + realw* d_adj_sourcearrays; + realw* h_adj_sourcearrays_slice; + int* d_pre_computed_irec; + + // surface elements (to save for noise tomography and acoustic simulations) + int* d_free_surface_ispec; + int* d_free_surface_ijk; + int num_free_surface_faces; + + // surface movie elements to save for noise tomography + realw* d_noise_surface_movie; + + // attenuation + realw* d_R_xx; + realw* d_R_yy; + realw* d_R_xy; + realw* d_R_xz; + realw* d_R_yz; + + realw* d_one_minus_sum_beta; + realw* d_factor_common; + + realw* d_alphaval; + realw* d_betaval; + realw* d_gammaval; + + // attenuation & kernel + realw* d_epsilondev_xx; + realw* d_epsilondev_yy; + realw* d_epsilondev_xy; + realw* d_epsilondev_xz; + realw* d_epsilondev_yz; + realw* d_epsilon_trace_over_3; + + // anisotropy + realw* d_c11store; + realw* d_c12store; + realw* d_c13store; + realw* d_c14store; + realw* d_c15store; + realw* d_c16store; + realw* d_c22store; + realw* d_c23store; + realw* d_c24store; + realw* d_c25store; + realw* d_c26store; + realw* d_c33store; + realw* d_c34store; + realw* d_c35store; + realw* d_c36store; + realw* d_c44store; + realw* d_c45store; + realw* d_c46store; + realw* d_c55store; + realw* d_c56store; + realw* d_c66store; + + // noise + realw* d_normal_x_noise; + realw* d_normal_y_noise; + realw* d_normal_z_noise; + realw* d_mask_noise; + realw* d_free_surface_jacobian2Dw; + + realw* d_noise_sourcearray; + + // attenuation & kernel backward fields + realw* d_b_R_xx; + realw* d_b_R_yy; + realw* d_b_R_xy; + realw* d_b_R_xz; + realw* d_b_R_yz; + + realw* d_b_epsilondev_xx; + realw* d_b_epsilondev_yy; + realw* d_b_epsilondev_xy; + realw* d_b_epsilondev_xz; + realw* d_b_epsilondev_yz; + realw* d_b_epsilon_trace_over_3; + + realw* d_b_alphaval; + realw* d_b_betaval; + realw* d_b_gammaval; + + // sensitivity kernels + realw* d_rho_kl; + realw* d_mu_kl; + realw* d_kappa_kl; + + // noise sensitivity kernel + realw* d_Sigma_kl; + + // approximative hessian for preconditioning kernels + realw* d_hess_el_kl; + + // oceans + realw* d_rmass_ocean_load; + realw* d_free_surface_normal; + int* d_updated_dof_ocean_load; + + // ------------------------------------------------------------------ // + // acoustic wavefield + // ------------------------------------------------------------------ // + // potential and first and second time derivative + realw* d_potential_acoustic; realw* d_potential_dot_acoustic; realw* d_potential_dot_dot_acoustic; + // backward/reconstructed wavefield + realw* d_b_potential_acoustic; realw* d_b_potential_dot_acoustic; realw* d_b_potential_dot_dot_acoustic; + + // acoustic domain parameters + int* d_ispec_is_acoustic; + + int* d_phase_ispec_inner_acoustic; + int num_phase_ispec_acoustic; + + // mesh coloring + int* h_num_elem_colors_acoustic; + int num_colors_outer_acoustic,num_colors_inner_acoustic; + int nspec_acoustic; + + realw* d_rhostore; + realw* d_kappastore; + realw* d_rmass_acoustic; + + // mpi buffer + realw* d_send_potential_dot_dot_buffer; + + realw* d_b_absorb_potential; + int d_b_reclen_potential; + + // for writing seismograms + realw* d_station_seismo_potential; + realw* h_station_seismo_potential; + + // sensitivity kernels + realw* d_rho_ac_kl; + realw* d_kappa_ac_kl; + + // approximative hessian for preconditioning kernels + realw* d_hess_ac_kl; + + // coupling acoustic-elastic + int* d_coupling_ac_el_ispec; + int* d_coupling_ac_el_ijk; + realw* d_coupling_ac_el_normal; + realw* d_coupling_ac_el_jacobian2Dw; + + // gravity + int gravity; + realw* d_minus_deriv_gravity; + realw* d_minus_g; + +} Mesh; + + +#endif diff --git a/src/cuda/noise_tomography_cuda.cu b/src/cuda/noise_tomography_cuda.cu new file mode 100644 index 000000000..e1750b28e --- /dev/null +++ b/src/cuda/noise_tomography_cuda.cu @@ -0,0 +1,305 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#ifdef WITH_MPI +#include +#endif + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +// #include "epik_user.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){ +TRACE("fortranflush"); + + fflush(stdout); + fflush(stderr); + printf("Flushing proc %d!\n",*rank); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) { +TRACE("fortranprint"); + + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + printf("%d: sends msg_id %d\n",procid,*id); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) { +TRACE("fortranprintf"); + + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + printf("%d: sends val %e\n",procid,*val); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) { +TRACE("fortranprintd"); + + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + printf("%d: sends val %e\n",procid,*val); +} + +/* ----------------------------------------------------------------------------------------------- */ + +// randomize displ for testing +extern "C" +void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) { +TRACE("make_displ_rand"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw)); + for(int i=0;iNGLOB_AB*3;i++) { + h_displ[i] = rand(); + } + cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice); +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec, + int* free_surface_ijk, + int num_free_surface_faces, + int* ibool, + realw* displ, + realw* noise_surface_movie) { + int igll = threadIdx.x; + int iface = blockIdx.x + blockIdx.y*gridDim.x; + + // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x; + + if(iface < num_free_surface_faces) { + int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing + + int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1; + int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1; + int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3]; + noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1]; + noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2]; + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_surface_to_host, + TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f, + realw* h_noise_surface_movie) { +TRACE("transfer_surface_to_host"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y,1); + dim3 threads(NGLL2,1,1); + + transfer_surface_to_host_kernel<<>>(mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->num_free_surface_faces, + mp->d_ibool, + mp->d_displ, + mp->d_noise_surface_movie); + + cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie, + 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("transfer_surface_to_host"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void noise_read_add_surface_movie_cuda_kernel(realw* accel, int* ibool, + int* free_surface_ispec, + int* free_surface_ijk, + int num_free_surface_faces, + realw* noise_surface_movie, + realw* normal_x_noise, + realw* normal_y_noise, + realw* normal_z_noise, + realw* mask_noise, + realw* free_surface_jacobian2Dw) { + + int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id + + // when nspec_top > 65535, but mod(nspec_top,2) > 0, we end up with an extra block. + if(iface < num_free_surface_faces) { + int ispec = free_surface_ispec[iface]-1; + + int igll = threadIdx.x; + + int ipoin = NGLL2*iface + igll; + int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1; + int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1; + int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1; + + int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; + + realw normal_x = normal_x_noise[ipoin]; + realw normal_y = normal_y_noise[ipoin]; + realw normal_z = normal_z_noise[ipoin]; + + realw eta = (noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x + + noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y + + noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z); + + // error from cuda-memcheck and ddt seems "incorrect", because we + // are passing a __constant__ variable pointer around like it was + // made using cudaMalloc, which *may* be "incorrect", but produces + // correct results. + + // ========= Invalid __global__ read of size + // 4 ========= at 0x00000cd8 in + // compute_add_sources_cuda.cu:260:noise_read_add_surface_movie_cuda_kernel + // ========= by thread (0,0,0) in block (3443,0) ========= Address + // 0x203000c8 is out of bounds + + // non atomic version for speed testing -- atomic updates are needed for correctness + // accel[3*iglob] += eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D]; + // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D]; + // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D]; + + // Fortran version in SVN -- note deletion of wgllwgll_xy? + // accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) & + // * free_surface_jacobian2Dw(igll,iface) + // accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) & + // * free_surface_jacobian2Dw(igll,iface) + // accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) & + // * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface) + + // atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]); + // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]); + // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]); + + atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+NGLL2*iface]); + atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+NGLL2*iface]); + atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+NGLL2*iface]); + + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(noise_read_add_surface_movie_cu, + NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f, + realw* h_noise_surface_movie, + int* NOISE_TOMOGRAPHYf) { +TRACE("noise_read_add_surface_movie_cu"); + + // EPIK_TRACER("noise_read_add_surface_movie_cu"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf; + + cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie, + 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice); + + int num_blocks_x = mp->num_free_surface_faces; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + dim3 grid(num_blocks_x,num_blocks_y,1); + dim3 threads(NGLL2,1,1); + + if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field + noise_read_add_surface_movie_cuda_kernel<<>>(mp->d_accel, + mp->d_ibool, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->num_free_surface_faces, + mp->d_noise_surface_movie, + mp->d_normal_x_noise, + mp->d_normal_y_noise, + mp->d_normal_z_noise, + mp->d_mask_noise, + mp->d_free_surface_jacobian2Dw); + } + else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field + noise_read_add_surface_movie_cuda_kernel<<>>(mp->d_b_accel, + mp->d_ibool, + mp->d_free_surface_ispec, + mp->d_free_surface_ijk, + mp->num_free_surface_faces, + mp->d_noise_surface_movie, + mp->d_normal_x_noise, + mp->d_normal_y_noise, + mp->d_normal_z_noise, + mp->d_mask_noise, + mp->d_free_surface_jacobian2Dw); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel"); +#endif +} diff --git a/src/cuda/prepare_constants_cuda.h b/src/cuda/prepare_constants_cuda.h new file mode 100644 index 000000000..04c5ea839 --- /dev/null +++ b/src/cuda/prepare_constants_cuda.h @@ -0,0 +1,99 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#ifndef CUDA_HEADER_H +#define CUDA_HEADER_H + +typedef float realw; // type of "working" variables + +/* ----------------------------------------------------------------------------------------------- */ + +// setters for these const arrays (very ugly hack, but will have to do) + +// elastic +void setConst_hprime_xx(realw* array,Mesh* mp); +void setConst_hprime_yy(realw* array,Mesh* mp); +void setConst_hprime_zz(realw* array,Mesh* mp); + +void setConst_hprimewgll_xx(realw* array,Mesh* mp); +void setConst_hprimewgll_yy(realw* array,Mesh* mp); +void setConst_hprimewgll_zz(realw* array,Mesh* mp); + +void setConst_wgllwgll_xy(realw* array,Mesh* mp); +void setConst_wgllwgll_xz(realw* array, Mesh* mp); +void setConst_wgllwgll_yz(realw* array, Mesh* mp); + +void setConst_wgll_cube(realw* array, Mesh* mp); + +/* ----------------------------------------------------------------------------------------------- */ + +/* CUDA specific things from specfem3D_kernels.cu */ + +// older TEXTURE usage. For now just acoustic simulations. See usage +// of USE_TEXTURES_FIELDS elsewhere in code for elastic case +#ifdef USE_TEXTURES + +// declaration of textures +texture tex_potential_acoustic; +texture tex_potential_dot_dot_acoustic; + + + void bindTexturesPotential(realw* d_potential_acoustic) + { + cudaError_t err; + + cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc(); + + err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic, + channelDescFloat, NGLOB*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err)); + exit(1); + } + } + + void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic) + { + cudaError_t err; + + cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc(); + + err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic, + channelDescFloat, NGLOB*sizeof(realw)); + if (err != cudaSuccess) + { + fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err)); + exit(1); + } + } + +#endif // USE_TEXTURES + + +#endif //CUDA_HEADER_H diff --git a/src/cuda/prepare_mesh_constants_cuda.cu b/src/cuda/prepare_mesh_constants_cuda.cu new file mode 100644 index 000000000..cc3060f36 --- /dev/null +++ b/src/cuda/prepare_mesh_constants_cuda.cu @@ -0,0 +1,2037 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#ifdef WITH_MPI +#include +#endif + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +#include "prepare_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// Helper functions + +/* ----------------------------------------------------------------------------------------------- */ + +double get_time() +{ + struct timeval t; + struct timezone tzp; + gettimeofday(&t, &tzp); + return t.tv_sec + t.tv_usec*1e-6; +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() { +TRACE("pause_for_debug"); + + pause_for_debugger(1); +} + +/* ----------------------------------------------------------------------------------------------- */ + +void pause_for_debugger(int pause) { + if(pause) { + int myrank; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); +#else + myrank = 0; +#endif + printf("I'm rank %d\n",myrank); + int i = 0; + char hostname[256]; + gethostname(hostname, sizeof(hostname)); + printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank); + FILE *file = fopen("/scratch/eiger/rietmann/attach_gdb.txt","w+"); + if (file != NULL){ + fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank); + fclose(file); + } + fflush(stdout); + while (0 == i) + sleep(5); + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +void exit_on_cuda_error(char* kernel_name) { + // sync and check to catch errors from previous async operations + cudaThreadSynchronize(); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err)); + pause_for_debugger(0); + //free(kernel_name); +#ifdef WITH_MPI + MPI_Abort(MPI_COMM_WORLD,1); +#endif + exit(EXIT_FAILURE); + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +void exit_on_error(char* info) +{ + printf("\nERROR: %s\n",info); + fflush(stdout); +#ifdef WITH_MPI + MPI_Abort(MPI_COMM_WORLD,1); +#endif + //free(info); + exit(EXIT_FAILURE); + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +void print_CUDA_error_if_any(cudaError_t err, int num) +{ + if (cudaSuccess != err) + { + printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num); + fflush(stdout); +#ifdef WITH_MPI + MPI_Abort(MPI_COMM_WORLD,1); +#endif + exit(EXIT_FAILURE); + } + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +void get_free_memory(double* free_db, double* used_db, double* total_db) { + + // gets memory usage in byte + size_t free_byte ; + size_t total_byte ; + cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; + if ( cudaSuccess != cuda_status ){ + printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) ); + exit(EXIT_FAILURE); + } + + *free_db = (double)free_byte ; + *total_db = (double)total_byte ; + *used_db = *total_db - *free_db ; + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +// Saves GPU memory usage to file +void output_free_memory(int myrank,char* info_str) { + + FILE* fp; + char filename[BUFSIZ]; + double free_db,used_db,total_db; + + get_free_memory(&free_db,&used_db,&total_db); + + sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank); + fp = fopen(filename,"a+"); + if (fp != NULL){ + fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str, + used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); + fclose(fp); + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +// Fortran-callable version of above method +extern "C" +void FC_FUNC_(output_free_device_memory, + OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) { +TRACE("output_free_device_memory"); + + char info[6]; + sprintf(info,"f %d:",*myrank); + output_free_memory(*myrank,info); +} + +/* ----------------------------------------------------------------------------------------------- */ + +/* +void show_free_memory(char* info_str) { + + // show memory usage of GPU + int myrank; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&myrank); +#else + myrank = 0; +#endif + double free_db,used_db,total_db; + + get_free_memory(&free_db,&used_db,&total_db); + + printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str, + used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); + +} +*/ + +/* +extern "C" +void FC_FUNC_(show_free_device_memory, + SHOW_FREE_DEVICE_MEMORY)() { + TRACE("show_free_device_memory"); + + show_free_memory("from fortran"); +} +*/ + +/* ----------------------------------------------------------------------------------------------- */ + + +extern "C" +void FC_FUNC_(get_free_device_memory, + get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) { +TRACE("get_free_device_memory"); + + double free_db,used_db,total_db; + + get_free_memory(&free_db,&used_db,&total_db); + + // converts to MB + *free = (realw) free_db/1024.0/1024.0; + *used = (realw) used_db/1024.0/1024.0; + *total = (realw) total_db/1024.0/1024.0; + return; +} + + +/* ----------------------------------------------------------------------------------------------- */ +//daniel: helper function +/* +__global__ void check_phase_ispec_kernel(int num_phase_ispec, + int* phase_ispec, + int NSPEC_AB, + int* ier) { + + int i,ispec,iphase,count0,count1; + *ier = 0; + + for(iphase=0; iphase < 2; iphase++){ + count0 = 0; + count1 = 0; + + for(i=0; i < num_phase_ispec; i++){ + ispec = phase_ispec[iphase*num_phase_ispec + i] - 1; + if( ispec < -1 || ispec >= NSPEC_AB ){ + printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec); + *ier = 1; + return; + } + if( ispec >= 0 ){ count0++;} + if( ispec < 0 ){ count1++;} + } + + printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1); + + } +} + +void check_phase_ispec(long* Mesh_pointer_f,int type){ + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + printf("check phase_ispec for type=%d\n",type); + + dim3 grid(1,1); + dim3 threads(1,1,1); + + int* h_debug = (int*) calloc(1,sizeof(int)); + int* d_debug; + cudaMalloc((void**)&d_debug,sizeof(int)); + + if( type == 1 ){ + check_phase_ispec_kernel<<>>(mp->num_phase_ispec_elastic, + mp->d_phase_ispec_inner_elastic, + mp->NSPEC_AB, + d_debug); + }else if( type == 2 ){ + check_phase_ispec_kernel<<>>(mp->num_phase_ispec_acoustic, + mp->d_phase_ispec_inner_acoustic, + mp->NSPEC_AB, + d_debug); + } + + cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost); + cudaFree(d_debug); + if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);} + free(h_debug); + fflush(stdout); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("check_phase_ispec"); +#endif + +} +*/ + +/* ----------------------------------------------------------------------------------------------- */ +//daniel: helper function +/* +__global__ void check_ispec_is_kernel(int NSPEC_AB, + int* ispec_is, + int* ier) { + + int ispec,count0,count1; + + *ier = 0; + count0 = 0; + count1 = 0; + for(ispec=0; ispec < NSPEC_AB; ispec++){ + if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){ + printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]); + *ier = 1; + return; + //exit(1); + } + if( ispec_is[ispec] == 0 ){count0++;} + if( ispec_is[ispec] != 0 ){count1++;} + } + printf("check_ispec_is done: count = %d %d\n",count0,count1); +} + +void check_ispec_is(long* Mesh_pointer_f,int type){ + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + printf("check ispec_is for type=%d\n",type); + + dim3 grid(1,1); + dim3 threads(1,1,1); + + int* h_debug = (int*) calloc(1,sizeof(int)); + int* d_debug; + cudaMalloc((void**)&d_debug,sizeof(int)); + + if( type == 0 ){ + check_ispec_is_kernel<<>>(mp->NSPEC_AB, + mp->d_ispec_is_inner, + d_debug); + }else if( type == 1 ){ + check_ispec_is_kernel<<>>(mp->NSPEC_AB, + mp->d_ispec_is_elastic, + d_debug); + }else if( type == 2 ){ + check_ispec_is_kernel<<>>(mp->NSPEC_AB, + mp->d_ispec_is_acoustic, + d_debug); + } + + cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost); + cudaFree(d_debug); + if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);} + free(h_debug); + fflush(stdout); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("check_ispec_is"); +#endif +} +*/ +/* ----------------------------------------------------------------------------------------------- */ +//daniel: helper function +/* +__global__ void check_array_ispec_kernel(int num_array_ispec, + int* array_ispec, + int NSPEC_AB, + int* ier) { + + int i,ispec,count0,count1; + + *ier = 0; + count0 = 0; + count1 = 0; + + for(i=0; i < num_array_ispec; i++){ + ispec = array_ispec[i] - 1; + if( ispec < -1 || ispec >= NSPEC_AB ){ + printf("Error in d_array_ispec %d %d\n",i,ispec); + *ier = 1; + return; + } + if( ispec >= 0 ){ count0++;} + if( ispec < 0 ){ count1++;} + } + + printf("check_array_ispec done: count = %d %d \n",count0,count1); +} + +void check_array_ispec(long* Mesh_pointer_f,int type){ + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + printf("check array_ispec for type=%d\n",type); + + dim3 grid(1,1); + dim3 threads(1,1,1); + + int* h_debug = (int*) calloc(1,sizeof(int)); + int* d_debug; + cudaMalloc((void**)&d_debug,sizeof(int)); + + if( type == 1 ){ + check_array_ispec_kernel<<>>(mp->d_num_abs_boundary_faces, + mp->d_abs_boundary_ispec, + mp->NSPEC_AB, + d_debug); + } + + cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost); + cudaFree(d_debug); + if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);} + free(h_debug); + fflush(stdout); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("check_array_ispec"); +#endif + +} +*/ + +/* ----------------------------------------------------------------------------------------------- */ + +// GPU preparation + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_cuda_device, + PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { + TRACE("prepare_cuda_device"); + + // Gets rank number of MPI process + int myrank = *myrank_f; + +/* + // cuda initialization (needs -lcuda library) + // note: cuInit initializes the driver API. + // it is needed for any following CUDA driver API function call (format cuFUNCTION(..) ) + // however, for the CUDA runtime API functions (format cudaFUNCTION(..) ) + // the initialization is implicit, thus cuInit() here would not be needed... + CUresult status = cuInit(0); + if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n"); + + // returns a handle to the first cuda compute device + CUdevice dev; + status = cuDeviceGet(&dev, 0); + if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n"); + + // gets device properties + int major,minor; + status = cuDeviceComputeCapability(&major,&minor,dev); + if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n"); + + // make sure that the device has compute capability >= 1.3 + if (major < 1){ + fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major); + exit_on_error("CUDA Compute capability major number should be at least 1\n"); + } + if (major == 1 && minor < 3){ + fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor); + exit_on_error("CUDA Compute capability major number should be at least 1.3\n"); + } +*/ + + // note: from here on we use the runtime API ... + + // Gets number of GPU devices + int device_count = 0; + cudaGetDeviceCount(&device_count); + exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n"); + + // returns device count to fortran + if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n"); + *ncuda_devices = device_count; + + + // Sets the active device + if(device_count > 1) { + // generalized for more GPUs per node + // note: without previous context release, cudaSetDevice will complain with the cuda error + // "setting the device when a process is active is not allowed" + // releases previous contexts + cudaThreadExit(); + + //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count); + //MPI_Barrier(MPI_COMM_WORLD); + + // sets active device + cudaSetDevice( myrank % device_count ); + exit_on_cuda_error("cudaSetDevice"); + } + + // returns a handle to the active device + int device; + cudaGetDevice(&device); + + // get device properties + struct cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp,device); + + // exit if the machine has no CUDA-enabled device + if (deviceProp.major == 9999 && deviceProp.minor == 9999){ + fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n"); + exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n"); + } + + // outputs device infos to file + char filename[BUFSIZ]; + FILE* fp; + sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank); + fp = fopen(filename,"a+"); + if (fp != NULL){ + // display device properties + fprintf(fp,"Device Name = %s\n",deviceProp.name); + fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount); + fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f)); + fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f)); + fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock); + fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n", + deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n", + deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor); + if(deviceProp.canMapHostMemory){ + fprintf(fp,"canMapHostMemory: TRUE\n"); + }else{ + fprintf(fp,"canMapHostMemory: FALSE\n"); + } + if(deviceProp.deviceOverlap){ + fprintf(fp,"deviceOverlap: TRUE\n"); + }else{ + fprintf(fp,"deviceOverlap: FALSE\n"); + } + + // outputs initial memory infos via cudaMemGetInfo() + double free_db,used_db,total_db; + get_free_memory(&free_db,&used_db,&total_db); + fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, + used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); + + fclose(fp); + } + + // make sure that the device has compute capability >= 1.3 + if (deviceProp.major < 1){ + fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n"); + exit_on_error("CUDA Compute capability major number should be at least 1\n"); + } + if (deviceProp.major == 1 && deviceProp.minor < 3){ + fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n"); + exit_on_error("CUDA Compute capability major number should be at least 1.3\n"); + } + // we use pinned memory for asynchronous copy + if( ! deviceProp.canMapHostMemory){ + fprintf(stderr,"Device capability should allow to map host memory, exiting...\n"); + exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n"); + } +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_constants_device, + PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer, + int* h_NGLLX, + int* NSPEC_AB, int* NGLOB_AB, + realw* h_xix, realw* h_xiy, realw* h_xiz, + realw* h_etax, realw* h_etay, realw* h_etaz, + realw* h_gammax, realw* h_gammay, realw* h_gammaz, + realw* h_kappav, realw* h_muv, + int* h_ibool, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* h_nibool_interfaces_ext_mesh, + int* h_ibool_interfaces_ext_mesh, + realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz, + realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz, + realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz, + int* ABSORBING_CONDITIONS, + int* h_abs_boundary_ispec, int* h_abs_boundary_ijk, + realw* h_abs_boundary_normal, + realw* h_abs_boundary_jacobian2Dw, + int* h_num_abs_boundary_faces, + int* h_ispec_is_inner, + int* NSOURCES, + int* nsources_local_f, + realw* h_sourcearrays, + int* h_islice_selected_source, + int* h_ispec_selected_source, + int* h_number_receiver_global, + int* h_ispec_selected_rec, + int* nrec_f, + int* nrec_local_f, + int* SIMULATION_TYPE, + int* USE_MESH_COLORING_GPU_f, + int* nspec_acoustic,int* nspec_elastic, + int* my_neighbours_ext_mesh, + int* request_send_vector_ext_mesh, + int* request_recv_vector_ext_mesh, + realw* buffer_recv_vector_ext_mesh + ) { + +TRACE("prepare_constants_device"); + + // allocates mesh parameter structure + Mesh* mp = (Mesh*) malloc( sizeof(Mesh) ); + if (mp == NULL) exit_on_error("error allocating mesh pointer"); + *Mesh_pointer = (long)mp; + + // checks if NGLLX == 5 + if( *h_NGLLX != NGLLX ){ + exit_on_error("NGLLX must be 5 for CUDA devices"); + } + + +#ifdef WITH_MPI + int nproc; + MPI_Comm_size(MPI_COMM_WORLD,&nproc); + mp->NPROCS=nproc; +#else + mp->NPROCS = 1; +#endif + + + // sets global parameters + mp->NSPEC_AB = *NSPEC_AB; + mp->NGLOB_AB = *NGLOB_AB; + + // sets constant arrays + setConst_hprime_xx(h_hprime_xx,mp); + // only needed if NGLLX != NGLLY != NGLLZ + // setConst_hprime_yy(h_hprime_yy,mp); + // setConst_hprime_zz(h_hprime_zz,mp); + setConst_hprimewgll_xx(h_hprimewgll_xx,mp); + setConst_hprimewgll_yy(h_hprimewgll_yy,mp); + setConst_hprimewgll_zz(h_hprimewgll_zz,mp); + setConst_wgllwgll_xy(h_wgllwgll_xy,mp); + setConst_wgllwgll_xz(h_wgllwgll_xz,mp); + setConst_wgllwgll_yz(h_wgllwgll_yz,mp); + + // Using texture memory for the hprime-style constants is slower on + // Fermi generation hardware, but *may* be faster on Kepler + // generation. We will reevaluate this again, so might as well leave + // in the code with with #USE_TEXTURES_FIELDS not-defined. + #ifdef USE_TEXTURES_CONSTANTS + { + const textureReference* d_hprime_xx_tex_ptr; + print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101); + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001); + } + #endif + + + // Allocate pinned mpi-buffers. + // MPI buffers use pinned memory allocated by cudaMallocHost, which + // enables the use of asynchronous memory copies from host <-> + // device + int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh); + print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004); + mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float)); + mp->size_mpi_send_buffer = size_mpi_buffer; + + print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004); + mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float)); + mp->size_mpi_recv_buffer = size_mpi_buffer; + + print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004); + // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float)); + + mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh; + mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh; + mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh; + mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh; + mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh; + mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh; + mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh; + + // setup two streams, one for compute and one for host<->device memory copies + cudaStreamCreate(&mp->compute_stream); + cudaStreamCreate(&mp->copy_stream); + cudaStreamCreate(&mp->b_copy_stream); + + /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */ + int size_padded = NGLL3_PADDED * (mp->NSPEC_AB); + + // mesh + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(realw)),1004); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(realw)),1005); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(realw)),1006); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(realw)),1007); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(realw)),1008); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(realw)),1009); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(realw)),1010); + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(realw)),1011); + + // transfer constant element data with padding + for(int i=0;i < mp->NSPEC_AB;i++) { + print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501); + print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED, &h_xiy[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502); + print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*NGLL3_PADDED, &h_xiz[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503); + print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*NGLL3_PADDED, &h_etax[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504); + print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*NGLL3_PADDED, &h_etay[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505); + print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*NGLL3_PADDED, &h_etaz[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506); + print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*NGLL3_PADDED,&h_gammax[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507); + print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*NGLL3_PADDED,&h_gammay[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508); + print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*NGLL3_PADDED,&h_gammaz[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509); + print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*NGLL3_PADDED,&h_kappav[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510); + print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*NGLL3_PADDED, &h_muv[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511); + } + + // global indexing + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool, + NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022); + + + // prepare interprocess-edge exchange information + mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh; + mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh; + if( mp->num_interfaces_ext_mesh > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh, + (mp->num_interfaces_ext_mesh)*sizeof(int)),1201); + print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh, + (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202); + + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh, + (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh, + (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int), + cudaMemcpyHostToDevice),1204); + } + + // inner elements + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner, + mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206); + + // absorbing boundaries + mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces; + if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec), + (mp->d_num_abs_boundary_faces)*sizeof(int)),1101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec, + (mp->d_num_abs_boundary_faces)*sizeof(int), + cudaMemcpyHostToDevice),1102); + + print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk), + 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103); + print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk, + 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int), + cudaMemcpyHostToDevice),1104); + + print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal), + 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105); + print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal, + 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), + cudaMemcpyHostToDevice),1106); + + print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw), + NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107); + print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw, + NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw), + cudaMemcpyHostToDevice),1108); + } + + // sources + mp->nsources_local = *nsources_local_f; + if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){ + // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2) + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays, + sizeof(realw)* *NSOURCES*3*NGLL3),1301); + print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays, + sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302); + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute, + *NSOURCES*sizeof(double)),1303); + } + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source, + sizeof(int) * *NSOURCES),1401); + print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source, + sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1402); + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source, + sizeof(int)* *NSOURCES),1403); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source, + sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1404); + + + // receiver stations + int nrec = *nrec_f; // total number of receivers + mp->nrec_local = *nrec_local_f; // number of receiver located in this partition + //int nrec_local = *nrec_local_f; + // note that: + // size(number_receiver_global) = nrec_local + // size(ispec_selected_rec) = nrec + if( mp->nrec_local > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),mp->nrec_local*sizeof(int)),1); + print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global, + mp->nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512); + } + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),1513); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec, + nrec*sizeof(int),cudaMemcpyHostToDevice),1514); + +#ifdef USE_MESH_COLORING_GPU + mp->use_mesh_coloring_gpu = 1; + if( ! *USE_MESH_COLORING_GPU_f ) exit_on_error("error with USE_MESH_COLORING_GPU constant; please re-compile\n"); +#else + // mesh coloring + // note: this here passes the coloring as an option to the kernel routines + // the performance seems to be the same if one uses the pre-processing directives above or not + mp->use_mesh_coloring_gpu = *USE_MESH_COLORING_GPU_f; +#endif + + // number of elements per domain + mp->nspec_acoustic = *nspec_acoustic; + mp->nspec_elastic = *nspec_elastic; + + // gravity flag initialization + mp->gravity = 0; + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_constants_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// for ACOUSTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_acoustic_device, + PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, + realw* rmass_acoustic, + realw* rhostore, + realw* kappastore, + int* num_phase_ispec_acoustic, + int* phase_ispec_inner_acoustic, + int* ispec_is_acoustic, + int* NOISE_TOMOGRAPHY, + int* num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + int* ABSORBING_CONDITIONS, + int* b_reclen_potential, + realw* b_absorb_potential, + int* ELASTIC_SIMULATION, + int* num_coupling_ac_el_faces, + int* coupling_ac_el_ispec, + int* coupling_ac_el_ijk, + realw* coupling_ac_el_normal, + realw* coupling_ac_el_jacobian2Dw, + int* num_colors_outer_acoustic, + int* num_colors_inner_acoustic, + int* num_elem_colors_acoustic) { + + TRACE("prepare_fields_acoustic_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */ + int size_padded = NGLL3_PADDED * mp->NSPEC_AB; + int size_nonpadded = NGLL3 * mp->NSPEC_AB; + int size_glob = mp->NGLOB_AB; + + // allocates arrays on device (GPU) + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003); + + // mpi buffer + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer), + (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005); + print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic, + sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100); + + // padded array + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006); + // transfer constant element data with padding + for(int i=0; i < mp->NSPEC_AB; i++) { + print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106); + } + + // non-padded array + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007); + print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105); + + // phase elements + mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic; + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), + mp->num_phase_ispec_acoustic*2*sizeof(int)),2008); + print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, + mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic), + mp->NSPEC_AB*sizeof(int)),2009); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic, + mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102); + + // free surface + if( *NOISE_TOMOGRAPHY == 0 ){ + // allocate surface arrays + mp->num_free_surface_faces = *num_free_surface_faces; + if( mp->num_free_surface_faces > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), + mp->num_free_surface_faces*sizeof(int)),2201); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, + mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), + 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204); + } + } + + // absorbing boundaries + if( *ABSORBING_CONDITIONS ){ + mp->d_b_reclen_potential = *b_reclen_potential; + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential, + mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302); + } + + + // for seismograms + if( mp->nrec_local > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential), + mp->nrec_local*NGLL3*sizeof(realw)),2107); + + mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) ); + if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential"); + } + + + // coupling with elastic parts + if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec), + (*num_coupling_ac_el_faces)*sizeof(int)),2601); + print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec, + (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk), + 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603); + print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk, + 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal), + 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605); + print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal, + 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw), + NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607); + print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw, + NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608); + + } + + // mesh coloring + if( mp->use_mesh_coloring_gpu ){ + mp->num_colors_outer_acoustic = *num_colors_outer_acoustic; + mp->num_colors_inner_acoustic = *num_colors_inner_acoustic; + mp->h_num_elem_colors_acoustic = (int*) num_elem_colors_acoustic; + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_fields_acoustic_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_acoustic_adj_dev, + PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* APPROXIMATE_HESS_KL) { + + TRACE("prepare_fields_acoustic_adj_dev"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + int size_glob = mp->NGLOB_AB; + + // kernel simulations + if( *SIMULATION_TYPE != 3 ) return; + + // allocates backward/reconstructed arrays on device (GPU) + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016); + + // allocates kernels + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018); + + // initializes kernel values to zero + print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),3019); + print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),3020); + + // preconditioner + if( *APPROXIMATE_HESS_KL ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030); + // initializes with zeros + print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),3031); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_fields_acoustic_adj_dev"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// for ELASTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_elastic_device, + PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f, + int* size, + realw* rmass, + realw* rho_vp, + realw* rho_vs, + int* num_phase_ispec_elastic, + int* phase_ispec_inner_elastic, + int* ispec_is_elastic, + int* ABSORBING_CONDITIONS, + realw* h_b_absorb_field, + int* h_b_reclen_field, + int* SIMULATION_TYPE,int* SAVE_FORWARD, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, + realw* epsilondev_xz,realw* epsilondev_yz, + int* ATTENUATION, + int* R_size, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + realw* one_minus_sum_beta,realw* factor_common, + realw* alphaval,realw* betaval,realw* gammaval, + int* OCEANS, + realw* rmass_ocean_load, + int* NOISE_TOMOGRAPHY, + realw* free_surface_normal, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* ACOUSTIC_SIMULATION, + int* num_colors_outer_elastic, + int* num_colors_inner_elastic, + int* num_elem_colors_elastic, + int* ANISOTROPY, + realw *c11store, + realw *c12store, + realw *c13store, + realw *c14store, + realw *c15store, + realw *c16store, + realw *c22store, + realw *c23store, + realw *c24store, + realw *c25store, + realw *c26store, + realw *c33store, + realw *c34store, + realw *c35store, + realw *c36store, + realw *c44store, + realw *c45store, + realw *c46store, + realw *c55store, + realw *c56store, + realw *c66store){ + +TRACE("prepare_fields_elastic_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */ + int size_padded = NGLL3_PADDED * (mp->NSPEC_AB); + int size_nonpadded = NGLL3 * (mp->NSPEC_AB); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003); + + #ifdef USE_TEXTURES_FIELDS + { + print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001); + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001); + } + + { + print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003); + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003); + } + #endif + + // mpi buffer + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer), + 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004); + + // mass matrix + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005); + // transfer element data + print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass, + sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010); + + + // element indices + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009); + print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic, + mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012); + + // phase elements + mp->num_phase_ispec_elastic = *num_phase_ispec_elastic; + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), + mp->num_phase_ispec_elastic*2*sizeof(int)),4008); + print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic, + mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011); + + // for seismograms + if( mp->nrec_local > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field), + 3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015); + + mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) ); + if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n"); + } + + // absorbing conditions + if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){ + // non-padded arrays + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007); + + // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition + print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013); + print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014); + + // absorb_field array used for file i/o + if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){ + mp->d_b_reclen_field = *h_b_reclen_field; + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field), + mp->d_b_reclen_field),4016); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field, + mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017); + } + } + + // strains used for attenuation and kernel simulations + if( *COMPUTE_AND_STORE_STRAIN ){ + // strains + int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx, + epsilondev_size*sizeof(realw)),4301); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw), + cudaMemcpyHostToDevice),4302); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy, + epsilondev_size*sizeof(realw)),4302); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw), + cudaMemcpyHostToDevice),4303); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy, + epsilondev_size*sizeof(realw)),4304); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw), + cudaMemcpyHostToDevice),4305); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz, + epsilondev_size*sizeof(realw)),4306); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw), + cudaMemcpyHostToDevice),4307); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz, + epsilondev_size*sizeof(realw)),4308); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw), + cudaMemcpyHostToDevice),4309); + + } + + // attenuation memory variables + if( *ATTENUATION ){ + // memory arrays + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx), + (*R_size)*sizeof(realw)),4401); + print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),4402); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy), + (*R_size)*sizeof(realw)),4403); + print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),4404); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy), + (*R_size)*sizeof(realw)),4405); + print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),4406); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz), + (*R_size)*sizeof(realw)),4407); + print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),4408); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz), + (*R_size)*sizeof(realw)),4409); + print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),4410); + + // attenuation factors + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta), + NGLL3*mp->NSPEC_AB*sizeof(realw)),4430); + print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common), + N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432); + print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common, + N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433); + + // alpha,beta,gamma factors + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval), + N_SLS*sizeof(realw)),4434); + print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval), + N_SLS*sizeof(realw)),4436); + print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval), + N_SLS*sizeof(realw)),4438); + print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439); + + } + + // anisotropy + if( *ANISOTROPY ){ + // allocates memory on GPU + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store), + size_padded*sizeof(realw)),4700); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store), + size_padded*sizeof(realw)),4701); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store), + size_padded*sizeof(realw)),4702); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store), + size_padded*sizeof(realw)),4703); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store), + size_padded*sizeof(realw)),4704); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store), + size_padded*sizeof(realw)),4705); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store), + size_padded*sizeof(realw)),4706); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store), + size_padded*sizeof(realw)),4707); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store), + size_padded*sizeof(realw)),4708); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store), + size_padded*sizeof(realw)),4709); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store), + size_padded*sizeof(realw)),4710); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store), + size_padded*sizeof(realw)),4711); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store), + size_padded*sizeof(realw)),4712); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store), + size_padded*sizeof(realw)),4713); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store), + size_padded*sizeof(realw)),4714); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store), + size_padded*sizeof(realw)),4715); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store), + size_padded*sizeof(realw)),4716); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store), + size_padded*sizeof(realw)),4717); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store), + size_padded*sizeof(realw)),4718); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store), + size_padded*sizeof(realw)),4719); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store), + size_padded*sizeof(realw)),4720); + + // transfer constant element data with padding + for(int i=0;i < mp->NSPEC_AB;i++) { + print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819); + print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820); + } + } + + // ocean load approximation + if( *OCEANS ){ + // oceans needs a free surface + mp->num_free_surface_faces = *num_free_surface_faces; + if( mp->num_free_surface_faces > 0 ){ + // mass matrix + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load), + sizeof(realw)*mp->NGLOB_AB),4501); + print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load, + sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502); + // surface normal + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal), + 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal, + 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504); + + // temporary global array: used to synchronize updates on global accel array + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load), + sizeof(int)*mp->NGLOB_AB),4505); + + if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec), + mp->num_free_surface_faces*sizeof(int)),4601); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec, + mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk), + 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604); + } + } + } + + // mesh coloring + if( mp->use_mesh_coloring_gpu ){ + mp->num_colors_outer_elastic = *num_colors_outer_elastic; + mp->num_colors_inner_elastic = *num_colors_inner_elastic; + mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic; + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_fields_elastic_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_elastic_adj_dev, + PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* size, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilon_trace_over_3, + realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy, + realw* b_epsilondev_xz,realw* b_epsilondev_yz, + realw* b_epsilon_trace_over_3, + int* ATTENUATION, + int* R_size, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + realw* b_alphaval,realw* b_betaval,realw* b_gammaval, + int* APPROXIMATE_HESS_KL){ + + TRACE("prepare_fields_elastic_adj_dev"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + // checks if kernel simulation + if( *SIMULATION_TYPE != 3 ) return; + + // kernel simulations + // allocates backward/reconstructed arrays on device (GPU) + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203); + + // allocates kernels + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206); + + // initializes kernel values to zero + print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),5207); + print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),5208); + print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),5209); + + // strains used for attenuation and kernel simulations + if( *COMPUTE_AND_STORE_STRAIN ){ + // strains + int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing + + // solid pressure + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3), + NGLL3*mp->NSPEC_AB*sizeof(realw)),5310); + print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311); + // backward solid pressure + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3), + NGLL3*mp->NSPEC_AB*sizeof(realw)),5312); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3, + NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313); + // prepares backward strains + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx), + epsilondev_size*sizeof(realw)),5321); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy), + epsilondev_size*sizeof(realw)),5322); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy), + epsilondev_size*sizeof(realw)),5323); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz), + epsilondev_size*sizeof(realw)),5324); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz), + epsilondev_size*sizeof(realw)),5325); + + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx, + epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy, + epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy, + epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz, + epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz, + epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330); + } + + // attenuation memory variables + if( *ATTENUATION ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx), + (*R_size)*sizeof(realw)),5421); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),5422); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy), + (*R_size)*sizeof(realw)),5423); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),5424); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy), + (*R_size)*sizeof(realw)),5425); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),5426); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz), + (*R_size)*sizeof(realw)),5427); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),5428); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz), + (*R_size)*sizeof(realw)),5429); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw), + cudaMemcpyHostToDevice),5420); + + // alpha,beta,gamma factors for backward fields + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval), + N_SLS*sizeof(realw)),5434); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval), + N_SLS*sizeof(realw)),5436); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval), + N_SLS*sizeof(realw)),5438); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval, + N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439); + } + + if( *APPROXIMATE_HESS_KL ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450); + // initializes with zeros + print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),5451); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_fields_elastic_adj_dev"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// purely adjoint & kernel simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_sim2_or_3_const_device, + PREPARE_SIM2_OR_3_CONST_DEVICE)( + long* Mesh_pointer_f, + int* islice_selected_rec, + int* islice_selected_rec_size, + int* nadj_rec_local, + int* nrec, + int* myrank) { + + TRACE("prepare_sim2_or_3_const_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + // adjoint source arrays + mp->nadj_rec_local = *nadj_rec_local; + if( mp->nadj_rec_local > 0 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays, + (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003); + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec, + (mp->nadj_rec_local)*sizeof(int)),6004); + + // prepares local irec array: + // the irec_local variable needs to be precomputed (as + // h_pre_comp..), because normally it is in the loop updating accel, + // and due to how it's incremented, it cannot be parallelized + int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) ); + if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n"); + + int irec_local = 0; + for(int irec = 0; irec < *nrec; irec++) { + if(*myrank == islice_selected_rec[irec]) { + irec_local++; + h_pre_computed_irec[irec_local-1] = irec; + } + } + if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n"); + // copies values onto GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec, + (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010); + free(h_pre_computed_irec); + + // temporary array to prepare extracted source array values + mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) ); + if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n"); + + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("prepare_sim2_or_3_const_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// for NOISE simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_noise_device, + PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f, + int* NSPEC_AB, int* NGLOB_AB, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* SIMULATION_TYPE, + int* NOISE_TOMOGRAPHY, + int* NSTEP, + realw* noise_sourcearray, + realw* normal_x_noise, + realw* normal_y_noise, + realw* normal_z_noise, + realw* mask_noise, + realw* free_surface_jacobian2Dw) { + + TRACE("prepare_fields_noise_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + // free surface + mp->num_free_surface_faces = *num_free_surface_faces; + + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, + mp->num_free_surface_faces*sizeof(int)),7001); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, + mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002); + + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk, + 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004); + + // alloc storage for the surface buffer to be copied + print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie, + 3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005); + + // prepares noise source array + if( *NOISE_TOMOGRAPHY == 1 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray, + 3*NGLL3*(*NSTEP)*sizeof(realw)),7101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray, + 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102); + } + + // prepares noise directions + if( *NOISE_TOMOGRAPHY > 1 ){ + int nface_size = NGLL2*(*num_free_surface_faces); + // allocates memory on GPU + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise, + nface_size*sizeof(realw)),7301); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise, + nface_size*sizeof(realw)),7302); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise, + nface_size*sizeof(realw)),7303); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, + nface_size*sizeof(realw)),7304); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw, + nface_size*sizeof(realw)),7305); + // transfers data onto GPU + print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, + nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306); + print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, + nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307); + print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, + nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308); + print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise, + nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309); + print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, + nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310); + } + + // prepares noise strength kernel + if( *NOISE_TOMOGRAPHY == 3 ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl), + NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401); + // initializes kernel values to zero + print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0, + NGLL3*mp->NSPEC_AB*sizeof(realw)),7403); + + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //printf("jacobian_size = %d\n",25*(*num_free_surface_faces)); + exit_on_cuda_error("prepare_fields_noise_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// GRAVITY simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_fields_gravity_device, + PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f, + int* GRAVITY, + realw* minus_deriv_gravity, + realw* minus_g, + realw* h_wgll_cube, + int* ACOUSTIC_SIMULATION, + realw* rhostore) { + + TRACE("prepare_fields_gravity_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + setConst_wgll_cube(h_wgll_cube,mp); + + mp->gravity = *GRAVITY; + if( mp->gravity ){ + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity), + (mp->NGLOB_AB)*sizeof(realw)),8000); + print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity, + (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g), + (mp->NGLOB_AB)*sizeof(realw)),8002); + print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g, + (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003); + + + if( *ACOUSTIC_SIMULATION == 0 ){ + // rhostore not allocated yet + int size_padded = NGLL3_PADDED * (mp->NSPEC_AB); + // padded array + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),8006); + // transfer constant element data with padding + for(int i=0; i < mp->NSPEC_AB; i++) { + print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3], + NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8007); + } + } + } + +} + +extern "C" +void FC_FUNC_(prepare_seismogram_fields, + PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) { + + TRACE("prepare_constants_device"); + Mesh* mp = (Mesh*)(*Mesh_pointer); + + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3* *nrec_local*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5* *nrec_local*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5* *nrec_local*sizeof(double)),8100); + print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5* *nrec_local*sizeof(double)),8100); + + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3**nrec_local*sizeof(realw)),8101); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3**nrec_local*sizeof(realw)),8101); + print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3**nrec_local*sizeof(realw)),8101); + + print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); + print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101); + + cudaMallocHost((void**)&mp->h_seismograms_d_it,3**nrec_local*sizeof(realw)); + cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw)); + cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw)); +} + +/* ----------------------------------------------------------------------------------------------- */ + +// cleanup + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(prepare_cleanup_device, + PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* SAVE_FORWARD, + int* ACOUSTIC_SIMULATION, + int* ELASTIC_SIMULATION, + int* ABSORBING_CONDITIONS, + int* NOISE_TOMOGRAPHY, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY, + int* OCEANS, + int* APPROXIMATE_HESS_KL) { + +TRACE("prepare_cleanup_device"); + + // frees allocated memory arrays + Mesh* mp = (Mesh*)(*Mesh_pointer_f); + + // frees memory on GPU + // mesh + cudaFree(mp->d_xix); + cudaFree(mp->d_xiy); + cudaFree(mp->d_xiz); + cudaFree(mp->d_etax); + cudaFree(mp->d_etay); + cudaFree(mp->d_etaz); + cudaFree(mp->d_gammax); + cudaFree(mp->d_gammay); + cudaFree(mp->d_gammaz); + cudaFree(mp->d_muv); + + // absorbing boundaries + if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0 ){ + cudaFree(mp->d_abs_boundary_ispec); + cudaFree(mp->d_abs_boundary_ijk); + cudaFree(mp->d_abs_boundary_normal); + cudaFree(mp->d_abs_boundary_jacobian2Dw); + } + + // interfaces + cudaFree(mp->d_nibool_interfaces_ext_mesh); + cudaFree(mp->d_ibool_interfaces_ext_mesh); + + // global indexing + cudaFree(mp->d_ispec_is_inner); + cudaFree(mp->d_ibool); + + // sources + if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){ + cudaFree(mp->d_sourcearrays); + cudaFree(mp->d_stf_pre_compute); + } + + cudaFree(mp->d_islice_selected_source); + cudaFree(mp->d_ispec_selected_source); + + // receivers + if( mp->nrec_local > 0 ) cudaFree(mp->d_number_receiver_global); + cudaFree(mp->d_ispec_selected_rec); + + // ACOUSTIC arrays + if( *ACOUSTIC_SIMULATION ){ + cudaFree(mp->d_potential_acoustic); + cudaFree(mp->d_potential_dot_acoustic); + cudaFree(mp->d_potential_dot_dot_acoustic); + cudaFree(mp->d_send_potential_dot_dot_buffer); + cudaFree(mp->d_rmass_acoustic); + cudaFree(mp->d_rhostore); + cudaFree(mp->d_kappastore); + cudaFree(mp->d_phase_ispec_inner_acoustic); + cudaFree(mp->d_ispec_is_acoustic); + + if( *NOISE_TOMOGRAPHY == 0 ){ + cudaFree(mp->d_free_surface_ispec); + cudaFree(mp->d_free_surface_ijk); + } + + if( *ABSORBING_CONDITIONS ) cudaFree(mp->d_b_absorb_potential); + + if( *SIMULATION_TYPE == 3 ) { + cudaFree(mp->d_b_potential_acoustic); + cudaFree(mp->d_b_potential_dot_acoustic); + cudaFree(mp->d_b_potential_dot_dot_acoustic); + cudaFree(mp->d_rho_ac_kl); + cudaFree(mp->d_kappa_ac_kl); + if( *APPROXIMATE_HESS_KL) cudaFree(mp->d_hess_ac_kl); + } + + + if(mp->nrec_local > 0 ){ + cudaFree(mp->d_station_seismo_potential); + free(mp->h_station_seismo_potential); + } + + } // ACOUSTIC_SIMULATION + + // ELASTIC arrays + if( *ELASTIC_SIMULATION ){ + cudaFree(mp->d_displ); + cudaFree(mp->d_veloc); + cudaFree(mp->d_accel); + cudaFree(mp->d_send_accel_buffer); + cudaFree(mp->d_rmass); + + cudaFree(mp->d_phase_ispec_inner_elastic); + cudaFree(mp->d_ispec_is_elastic); + + if( mp->nrec_local > 0 ){ + cudaFree(mp->d_station_seismo_field); + free(mp->h_station_seismo_field); + } + + if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){ + cudaFree(mp->d_rho_vp); + cudaFree(mp->d_rho_vs); + + if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )) + cudaFree(mp->d_b_absorb_field); + } + + if( *SIMULATION_TYPE == 3 ) { + cudaFree(mp->d_b_displ); + cudaFree(mp->d_b_veloc); + cudaFree(mp->d_b_accel); + cudaFree(mp->d_rho_kl); + cudaFree(mp->d_mu_kl); + cudaFree(mp->d_kappa_kl); + if( *APPROXIMATE_HESS_KL ) cudaFree(mp->d_hess_el_kl); + } + + if( *COMPUTE_AND_STORE_STRAIN ){ + cudaFree(mp->d_epsilondev_xx); + cudaFree(mp->d_epsilondev_yy); + cudaFree(mp->d_epsilondev_xy); + cudaFree(mp->d_epsilondev_xz); + cudaFree(mp->d_epsilondev_yz); + if( *SIMULATION_TYPE == 3 ){ + cudaFree(mp->d_epsilon_trace_over_3); + cudaFree(mp->d_b_epsilon_trace_over_3); + cudaFree(mp->d_b_epsilondev_xx); + cudaFree(mp->d_b_epsilondev_yy); + cudaFree(mp->d_b_epsilondev_xy); + cudaFree(mp->d_b_epsilondev_xz); + cudaFree(mp->d_b_epsilondev_yz); + } + } + + if( *ATTENUATION ){ + cudaFree(mp->d_factor_common); + cudaFree(mp->d_one_minus_sum_beta); + cudaFree(mp->d_alphaval); + cudaFree(mp->d_betaval); + cudaFree(mp->d_gammaval); + cudaFree(mp->d_R_xx); + cudaFree(mp->d_R_yy); + cudaFree(mp->d_R_xy); + cudaFree(mp->d_R_xz); + cudaFree(mp->d_R_yz); + if( *SIMULATION_TYPE == 3){ + cudaFree(mp->d_b_R_xx); + cudaFree(mp->d_b_R_yy); + cudaFree(mp->d_b_R_xy); + cudaFree(mp->d_b_R_xz); + cudaFree(mp->d_b_R_yz); + cudaFree(mp->d_b_alphaval); + cudaFree(mp->d_b_betaval); + cudaFree(mp->d_b_gammaval); + } + } + + if( *ANISOTROPY ){ + cudaFree(mp->d_c11store); + cudaFree(mp->d_c12store); + cudaFree(mp->d_c13store); + cudaFree(mp->d_c14store); + cudaFree(mp->d_c15store); + cudaFree(mp->d_c16store); + cudaFree(mp->d_c22store); + cudaFree(mp->d_c23store); + cudaFree(mp->d_c24store); + cudaFree(mp->d_c25store); + cudaFree(mp->d_c26store); + cudaFree(mp->d_c33store); + cudaFree(mp->d_c34store); + cudaFree(mp->d_c35store); + cudaFree(mp->d_c36store); + cudaFree(mp->d_c44store); + cudaFree(mp->d_c45store); + cudaFree(mp->d_c46store); + cudaFree(mp->d_c55store); + cudaFree(mp->d_c56store); + cudaFree(mp->d_c66store); + } + + if( *OCEANS ){ + if( mp->num_free_surface_faces > 0 ){ + cudaFree(mp->d_rmass_ocean_load); + cudaFree(mp->d_free_surface_normal); + cudaFree(mp->d_updated_dof_ocean_load); + if( *NOISE_TOMOGRAPHY == 0){ + cudaFree(mp->d_free_surface_ispec); + cudaFree(mp->d_free_surface_ijk); + } + } + } + } // ELASTIC_SIMULATION + + // purely adjoint & kernel array + if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){ + if(mp->nadj_rec_local > 0 ){ + cudaFree(mp->d_adj_sourcearrays); + cudaFree(mp->d_pre_computed_irec); + free(mp->h_adj_sourcearrays_slice); + } + } + + // NOISE arrays + if( *NOISE_TOMOGRAPHY > 0 ){ + cudaFree(mp->d_free_surface_ispec); + cudaFree(mp->d_free_surface_ijk); + cudaFree(mp->d_noise_surface_movie); + if( *NOISE_TOMOGRAPHY == 1 ) cudaFree(mp->d_noise_sourcearray); + if( *NOISE_TOMOGRAPHY > 1 ){ + cudaFree(mp->d_normal_x_noise); + cudaFree(mp->d_normal_y_noise); + cudaFree(mp->d_normal_z_noise); + cudaFree(mp->d_mask_noise); + cudaFree(mp->d_free_surface_jacobian2Dw); + } + if( *NOISE_TOMOGRAPHY == 3 ) cudaFree(mp->d_Sigma_kl); + } + + // mesh pointer - not needed anymore + free(mp); +} diff --git a/src/cuda/save_and_compare_cpu_vs_gpu.c b/src/cuda/save_and_compare_cpu_vs_gpu.c new file mode 100644 index 000000000..082a18f2d --- /dev/null +++ b/src/cuda/save_and_compare_cpu_vs_gpu.c @@ -0,0 +1,318 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include +#include + +#ifdef WITH_MPI +#include +#endif + +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + + +void save_to_max_surface_file_(float* maxval) { + int rank; + char filename[BUFSIZ]; + FILE* fp; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&rank); +#else + rank = 0; +#endif + sprintf(filename,"maxval_surface_proc_%03d.dat",rank); + fp = fopen(filename,"a+"); + fprintf(fp,"%e\n",*maxval); + fclose(fp); +} + + +void save_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) { + FILE* fp; + char filename[BUFSIZ]; + if(*cpu_or_gpu == 0) { + sprintf(filename, "debug_output_cpu_%d.dat",*id); + } + else { + sprintf(filename, "debug_output_gpu_%d.dat",*id); + } + fp = fopen(filename, "wb"); + printf("writing vector, vector[0]=%e\n",vector[0]); + fwrite(vector, sizeof(float), *size, fp); + fclose(fp); + +} + +void save_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) { + FILE* fp; + char filename[BUFSIZ]; + if(*cpu_or_gpu == 0) { + sprintf(filename, "debug_output_cpu_%d.dat",*id); + } + else { + sprintf(filename, "debug_output_gpu_%d.dat",*id); + } + fp = fopen(filename, "wb"); + fwrite(vector, sizeof(int), *size, fp); + fclose(fp); + +} + + +void get_max_from_surface_file_(int* nodes_per_iterationf,int* NSTEP) { + int nodes_per_iteration = *nodes_per_iterationf; + char filename[BUFSIZ]; + int procid; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD,&procid); +#else + procid = 0; +#endif + sprintf(filename,"/scratch/eiger/rietmann/SPECFEM3D_AIGLE/in_out_files/DATABASES_MPI/proc%06d_surface_movie",procid); + + FILE* fp; int it; + printf("Opening %s for analysis\n",filename); + fp = fopen(filename,"rb"); + //char* errorstr; + if(fp == 0) { + //errorstr = (char*) strerror(errno); + printf("FILE ERROR:%s\n",(char*) strerror(errno)); + perror("file error\n"); + exit(1); + } + + float* vector = (float*)malloc(nodes_per_iteration*sizeof(float)); + float max_val; + int i; + for(it=0;it<*NSTEP;it++) { + int pos = (sizeof(float)*nodes_per_iteration)*(it); + fseek(fp,pos,SEEK_SET); + fread(vector,sizeof(float),nodes_per_iteration,fp); + for(i=0;i 0.01) { + if(fabsf(vector1[i]-vector2[i]) > 1e-20) { + error_count++; + if(error_count<10) { + printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); + } + } + } + } + /* if(vector1[i] != vector2[i]) { */ + /* if(fabsf(vector1[i]-vector2[i]) > 1e-25) { */ + /* error_count++; */ + /* if(error_count<50) { */ + /* printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */ + /* } */ + /* } */ + /* } */ + } + printf("**** Error Count: %d ****\n",error_count); + *num_errors = error_count; +} + +void compare_surface_files_(int* bytes_per_iteration, int* number_of_iterations) { + + char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie"; + char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie"; + + FILE* fp_cpu; + fp_cpu = fopen(cpu_file,"rb"); + //char* errorstr; + if(fp_cpu == 0) { + //errorstr = (char*) strerror(errno); + //printf("CPU FILE ERROR:%s\n",errorstr); + printf("CPU FILE ERROR:%s\n",(char*) strerror(errno)); + perror("cpu file error\n"); + } + FILE* fp_gpu; + fp_gpu = fopen(gpu_file,"rb"); + + if(fp_gpu == NULL) { + //errorstr = (char*) strerror(errno); + //printf("GPU FILE ERROR:%s\n",errorstr); + printf("GPU FILE ERROR:%s\n",(char*) strerror(errno)); + perror("gpu file error\n"); + } + + /* pause_for_debug(); */ + + float* gpu_vector = (float*)malloc(*bytes_per_iteration); + float* cpu_vector = (float*)malloc(*bytes_per_iteration); + int i,it,error_count=0; + for(it=0;it<*number_of_iterations;it++) { + int pos = (*bytes_per_iteration)*(it); + + fseek(fp_cpu,pos,SEEK_SET); + fseek(fp_gpu,pos,SEEK_SET); + + int number_of_nodes = *bytes_per_iteration/sizeof(float); + fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu); + fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu); + int size = number_of_nodes; + float gpu_min_val=10; + float gpu_max_val=0; + float cpu_min_val=10; + float cpu_max_val=0; + if(it<100) { + for(i=0;i 0.01)) { + if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]); + if(cpu_vector[i] > 1e-30) error_count++; + } + if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i]; + if(gpu_vector[i]cpu_max_val) cpu_max_val = cpu_vector[i]; + if(cpu_vector[i] 0.0001)) { + if(error_count < 30) { + printf("ERROR[%d]: %f != %f\n",i,compare_vector[i], vector[i]); + } + error_count++; + /* if(compare_vector[i] > 1e-30) error_count++; */ + } + } + printf("%d Total Errors\n",error_count); + printf("size:%d\n",*size); + /* for(i=0;i<30;i++) { */ + /* printf("val[%d]: %g != %g\n",i,compare_vector[i], vector[i]); */ + /* /\* printf("error_check[%d]= %g\n",abs(vector[i] - compare_vector[i])/vector[i]); *\/ */ + /* } */ +} + +void compare_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) { + FILE* fp; + char cmp_filename[BUFSIZ]; + int* compare_vector = (int*)malloc(*size*sizeof(int)); + if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare + sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id); + } + else { + sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id); + } + fopen(cmp_filename, "rb"); + /* read the values */ + if((fp=fopen(cmp_filename, "rb"))==NULL) { + printf("Cannot open comparison file %s.\n",cmp_filename); + exit(1); + } + if(fread(compare_vector, sizeof(int), *size, fp) != *size) { + if(feof(fp)) + printf("Premature end of file."); + else + printf("File read error."); + } + + fclose(fp); + + int i; + int error_count=0; + for(i=0;i<*size;i++) { + if((abs(vector[i] - compare_vector[i])/vector[i] > 0.01) && error_count < 30) { + printf("ERROR[%d]: %d != %d\n",i,compare_vector[i], vector[i]); + error_count++; + } + } + printf("%d Total Errors\n",error_count); +} diff --git a/src/cuda/specfem3D_gpu_cuda_method_stubs.c b/src/cuda/specfem3D_gpu_cuda_method_stubs.c new file mode 100644 index 000000000..d87e083ce --- /dev/null +++ b/src/cuda/specfem3D_gpu_cuda_method_stubs.c @@ -0,0 +1,805 @@ +/* +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== +*/ + +#include +#include + +#include "config.h" + +typedef float realw; + + + +// +// src/cuda/check_fields_cuda.cu +// + +void FC_FUNC_(check_max_norm_displ_gpu, + CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_vector, + CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} + +void FC_FUNC_(check_max_norm_displ, + CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} + +void FC_FUNC_(check_max_norm_b_displ_gpu, + CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_accel_gpu, + CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_veloc_gpu, + CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_displ, + CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_accel, + CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} + +void FC_FUNC_(check_error_vectors, + CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} + +void FC_FUNC_(get_max_accel, + GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} + +void FC_FUNC_(get_norm_acoustic_from_device, + GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(get_norm_elastic_from_device, + GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + + +// +// src/cuda/compute_add_sources_acoustic_cuda.cu +// + +void FC_FUNC_(compute_add_sources_ac_cuda, + COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(compute_add_sources_ac_s3_cuda, + COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda, + ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_acoustic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) {} + + +// +// src/cuda/compute_add_sources_elastic_cuda.cu +// + +void FC_FUNC_(compute_add_sources_el_cuda, + COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(compute_add_sources_el_s3_cuda, + COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer, + double* h_stf_pre_compute, + int* NSOURCESf, + int* phase_is_inner, + int* myrank) {} + +void FC_FUNC_(add_source_master_rec_noise_cu, + ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f, + int* myrank_f, + int* it_f, + int* irec_master_noise_f, + int* islice_selected_rec) {} + +void FC_FUNC_(add_sources_el_sim_type_2_or_3, + ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_elastic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) {} + + +// +// src/cuda/compute_coupling_cuda.cu +// + +void FC_FUNC_(compute_coupling_ac_el_cuda, + COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) {} + +void FC_FUNC_(compute_coupling_el_ac_cuda, + COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) {} + + +// +// src/cuda/compute_forces_acoustic_cuda.cu +// + +void FC_FUNC_(transfer_boun_pot_from_device, + TRANSFER_BOUN_POT_FROM_DEVICE)( + int* size, + long* Mesh_pointer_f, + realw* potential_dot_dot_acoustic, + realw* send_potential_dot_dot_buffer, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT){} + +void FC_FUNC_(transfer_asmbl_pot_to_device, + TRANSFER_ASMBL_POT_TO_DEVICE)( + long* Mesh_pointer, + realw* potential_dot_dot_acoustic, + realw* buffer_recv_scalar_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(compute_forces_acoustic_cuda, + COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_acoustic, + int* nspec_inner_acoustic, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltatover2_F) {} + +void FC_FUNC_(acoustic_enforce_free_surf_cuda, + ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* ABSORB_FREE_SURFACE) {} + + +// +// src/cuda/compute_forces_elastic_cuda.cu +// + +//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {};} + +void FC_FUNC_(transfer_boun_accel_from_device, + TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel, + realw* send_accel_buffer, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT){} + +void FC_FUNC_(transfer_boundary_from_device_a, + TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer, + int* nspec_outer_elastic) {} + +void FC_FUNC_(transfer_boundary_to_device_a, + TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh) {} + +//void FC_FUNC_(assemble_accel_on_device, +// ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel, +// realw* buffer_recv_vector_ext_mesh, +// int* num_interfaces_ext_mesh, +// int* max_nibool_interfaces_ext_mesh, +// int* nibool_interfaces_ext_mesh, +// int* ibool_interfaces_ext_mesh, +// int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(transfer_asmbl_accel_to_device, + TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(compute_forces_elastic_cuda, + COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_elastic, + int* nspec_inner_elastic, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY) {} + +void FC_FUNC_(sync_copy_from_device, + SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f, + int* iphase, + realw* send_buffer) {} + +void FC_FUNC_(kernel_3_a_cuda, + KERNEL_3_A_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F, + int* OCEANS) {} + +void FC_FUNC_(kernel_3_b_cuda, + KERNEL_3_B_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F) {} + +void FC_FUNC_(elastic_ocean_load_cuda, + ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + + +// +// src/cuda/compute_kernels_cuda.cu +// + +void FC_FUNC_(compute_kernels_elastic_cuda, + COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer, + realw* deltat_f) {} + +void FC_FUNC_(compute_kernels_strgth_noise_cu, + COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, + realw* h_noise_surface_movie, + realw* deltat) {} + +void FC_FUNC_(compute_kernels_acoustic_cuda, + COMPUTE_KERNELS_ACOUSTIC_CUDA)( + long* Mesh_pointer, + realw* deltat_f) {} + +void FC_FUNC_(compute_kernels_hess_cuda, + COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer, + realw* deltat_f, + int* ELASTIC_SIMULATION, + int* ACOUSTIC_SIMULATION) {} + + +// +// src/cuda/compute_stacey_acoustic_cuda.cu +// + +void FC_FUNC_(compute_stacey_acoustic_cuda, + COMPUTE_STACEY_ACOUSTIC_CUDA)( + long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_potential) {} + + +// +// src/cuda/compute_stacey_elastic_cuda.cu +// + +void FC_FUNC_(compute_stacey_elastic_cuda, + COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_field) {} + + +// +// src/cuda/it_update_displacement_cuda.cu +// + +void FC_FUNC_(it_update_displacement_cuda, + IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) {} + +void FC_FUNC_(it_update_displacement_ac_cuda, + it_update_displacement_ac_cuda)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) {} + + +// +// src/cuda/noise_tomography_cuda.cu +// + +void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} + +void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} + +void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} + +void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} + +void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} + +void FC_FUNC_(transfer_surface_to_host, + TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f, + realw* h_noise_surface_movie) {} + +void FC_FUNC_(noise_read_add_surface_movie_cu, + NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f, + realw* h_noise_surface_movie, + int* NOISE_TOMOGRAPHYf) {} + + +// +// src/cuda/prepare_mesh_constants_cuda.cu +// + +void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {} + +void FC_FUNC_(output_free_device_memory, + OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} + +void FC_FUNC_(show_free_device_memory, + SHOW_FREE_DEVICE_MEMORY)() {} + +void FC_FUNC_(get_free_device_memory, + get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} + +void FC_FUNC_(prepare_cuda_device, + PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { + fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n"); + exit(1); +} + +void FC_FUNC_(prepare_constants_device, + PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer, + int* h_NGLLX, + int* NSPEC_AB, int* NGLOB_AB, + realw* h_xix, realw* h_xiy, realw* h_xiz, + realw* h_etax, realw* h_etay, realw* h_etaz, + realw* h_gammax, realw* h_gammay, realw* h_gammaz, + realw* h_kappav, realw* h_muv, + int* h_ibool, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* h_nibool_interfaces_ext_mesh, + int* h_ibool_interfaces_ext_mesh, + realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz, + realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz, + realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz, + int* ABSORBING_CONDITIONS, + int* h_abs_boundary_ispec, int* h_abs_boundary_ijk, + realw* h_abs_boundary_normal, + realw* h_abs_boundary_jacobian2Dw, + int* h_num_abs_boundary_faces, + int* h_ispec_is_inner, + int* NSOURCES, + int* nsources_local_f, + realw* h_sourcearrays, + int* h_islice_selected_source, + int* h_ispec_selected_source, + int* h_number_receiver_global, + int* h_ispec_selected_rec, + int* nrec_f, + int* nrec_local_f, + int* SIMULATION_TYPE, + int* USE_MESH_COLORING_GPU_f, + int* nspec_acoustic,int* nspec_elastic, + int* my_neighbours_ext_mesh, + int* request_send_vector_ext_mesh, + int* request_recv_vector_ext_mesh, + realw* buffer_recv_vector_ext_mesh + ) {} + +void FC_FUNC_(prepare_fields_acoustic_device, + PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, + realw* rmass_acoustic, + realw* rhostore, + realw* kappastore, + int* num_phase_ispec_acoustic, + int* phase_ispec_inner_acoustic, + int* ispec_is_acoustic, + int* NOISE_TOMOGRAPHY, + int* num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + int* ABSORBING_CONDITIONS, + int* b_reclen_potential, + realw* b_absorb_potential, + int* ELASTIC_SIMULATION, + int* num_coupling_ac_el_faces, + int* coupling_ac_el_ispec, + int* coupling_ac_el_ijk, + realw* coupling_ac_el_normal, + realw* coupling_ac_el_jacobian2Dw, + int* num_colors_outer_acoustic, + int* num_colors_inner_acoustic, + int* num_elem_colors_acoustic) {} + +void FC_FUNC_(prepare_fields_acoustic_adj_dev, + PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* APPROXIMATE_HESS_KL) {} + +void FC_FUNC_(prepare_fields_elastic_device, + PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f, + int* size, + realw* rmass, + realw* rho_vp, + realw* rho_vs, + int* num_phase_ispec_elastic, + int* phase_ispec_inner_elastic, + int* ispec_is_elastic, + int* ABSORBING_CONDITIONS, + realw* h_b_absorb_field, + int* h_b_reclen_field, + int* SIMULATION_TYPE,int* SAVE_FORWARD, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, + realw* epsilondev_xz,realw* epsilondev_yz, + int* ATTENUATION, + int* R_size, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + realw* one_minus_sum_beta,realw* factor_common, + realw* alphaval,realw* betaval,realw* gammaval, + int* OCEANS, + realw* rmass_ocean_load, + int* NOISE_TOMOGRAPHY, + realw* free_surface_normal, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* ACOUSTIC_SIMULATION, + int* num_colors_outer_elastic, + int* num_colors_inner_elastic, + int* num_elem_colors_elastic, + int* ANISOTROPY, + realw *c11store, + realw *c12store, + realw *c13store, + realw *c14store, + realw *c15store, + realw *c16store, + realw *c22store, + realw *c23store, + realw *c24store, + realw *c25store, + realw *c26store, + realw *c33store, + realw *c34store, + realw *c35store, + realw *c36store, + realw *c44store, + realw *c45store, + realw *c46store, + realw *c55store, + realw *c56store, + realw *c66store){} + +void FC_FUNC_(prepare_fields_elastic_adj_dev, + PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* size, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilon_trace_over_3, + realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy, + realw* b_epsilondev_xz,realw* b_epsilondev_yz, + realw* b_epsilon_trace_over_3, + int* ATTENUATION, + int* R_size, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + realw* b_alphaval,realw* b_betaval,realw* b_gammaval, + int* APPROXIMATE_HESS_KL){} + +void FC_FUNC_(prepare_sim2_or_3_const_device, + PREPARE_SIM2_OR_3_CONST_DEVICE)( + long* Mesh_pointer_f, + int* islice_selected_rec, + int* islice_selected_rec_size, + int* nadj_rec_local, + int* nrec, + int* myrank) {} + +void FC_FUNC_(prepare_fields_noise_device, + PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f, + int* NSPEC_AB, int* NGLOB_AB, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* SIMULATION_TYPE, + int* NOISE_TOMOGRAPHY, + int* NSTEP, + realw* noise_sourcearray, + realw* normal_x_noise, + realw* normal_y_noise, + realw* normal_z_noise, + realw* mask_noise, + realw* free_surface_jacobian2Dw) {} + +void FC_FUNC_(prepare_fields_gravity_device, + PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f, + int* GRAVITY, + realw* minus_deriv_gravity, + realw* minus_g, + realw* h_wgll_cube, + int* ACOUSTIC_SIMULATION, + realw* rhostore) {} + +void FC_FUNC_(prepare_seismogram_fields, + PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {} + +void FC_FUNC_(prepare_cleanup_device, + PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* SAVE_FORWARD, + int* ACOUSTIC_SIMULATION, + int* ELASTIC_SIMULATION, + int* ABSORBING_CONDITIONS, + int* NOISE_TOMOGRAPHY, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY, + int* OCEANS, + int* APPROXIMATE_HESS_KL) {} + + +// +// src/cuda/transfer_fields_cuda.cu +// + +void FC_FUNC_(transfer_fields_el_to_device, + TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_fields_el_from_device, + TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_to_device, + TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_from_device, + TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_accel_to_device, + TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_accel_from_device, + TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_accel_from_device, + TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_sigma_from_device, + TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_displ_from_device, + TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_displ_from_device, + TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_compute_kernel_answers_from_device, + TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa) {} + +void FC_FUNC_(transfer_compute_kernel_fields_from_device, + TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer, + realw* accel, int* size_accel, + realw* b_displ, int* size_b_displ, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_b_epsilondev, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa, + realw* epsilon_trace_over_3, + realw* b_epsilon_trace_over_3, + int* size_epsilon_trace_over_3) {} + +void FC_FUNC_(transfer_b_fields_att_to_device, + TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + int* size_R, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_epsilondev) {} + +void FC_FUNC_(transfer_fields_att_from_device, + TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + int* size_R, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev) {} + +void FC_FUNC_(transfer_kernels_el_to_host, + TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer, + realw* h_rho_kl, + realw* h_mu_kl, + realw* h_kappa_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_noise_to_host, + TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, + realw* h_Sigma_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_fields_ac_to_device, + TRANSFER_FIELDS_AC_TO_DEVICE)( + int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_ac_to_device, + TRANSFER_B_FIELDS_AC_TO_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_fields_ac_from_device, + TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_ac_from_device, + TRANSFER_B_FIELDS_AC_FROM_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_dot_dot_from_device, + TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_dot_dot_from_device, + TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_kernels_ac_to_host, + TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer, + realw* h_rho_ac_kl, + realw* h_kappa_ac_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_hess_el_tohost, + TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer, + realw* h_hess_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_hess_ac_tohost, + TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer, + realw* h_hess_ac_kl, + int* NSPEC_AB) {} + + +// +// src/cuda/write_seismograms_cuda.cu +// + +void FC_FUNC_(transfer_seismograms_el_from_d, + TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local, + long* Mesh_pointer_f, + int* SIMULATION_TYPEf, + realw* seismograms_d, + realw* seismograms_v, + realw* seismograms_a, + int* it) {} + +void FC_FUNC_(transfer_station_el_from_device, + TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel, + realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f,int* number_receiver_global, + int* ispec_selected_rec,int* ispec_selected_source, + int* ibool,int* SIMULATION_TYPEf) {} + +void FC_FUNC_(transfer_station_ac_from_device, + TRANSFER_STATION_AC_FROM_DEVICE)( + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f, + int* number_receiver_global, + int* ispec_selected_rec, + int* ispec_selected_source, + int* ibool, + int* SIMULATION_TYPEf) {} + diff --git a/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak b/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak new file mode 100644 index 000000000..d87e083ce --- /dev/null +++ b/src/cuda/specfem3D_gpu_cuda_method_stubs.c.bak @@ -0,0 +1,805 @@ +/* +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== +*/ + +#include +#include + +#include "config.h" + +typedef float realw; + + + +// +// src/cuda/check_fields_cuda.cu +// + +void FC_FUNC_(check_max_norm_displ_gpu, + CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_vector, + CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} + +void FC_FUNC_(check_max_norm_displ, + CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} + +void FC_FUNC_(check_max_norm_b_displ_gpu, + CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_accel_gpu, + CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_veloc_gpu, + CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_displ, + CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} + +void FC_FUNC_(check_max_norm_b_accel, + CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} + +void FC_FUNC_(check_error_vectors, + CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} + +void FC_FUNC_(get_max_accel, + GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} + +void FC_FUNC_(get_norm_acoustic_from_device, + GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(get_norm_elastic_from_device, + GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, + long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + + +// +// src/cuda/compute_add_sources_acoustic_cuda.cu +// + +void FC_FUNC_(compute_add_sources_ac_cuda, + COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(compute_add_sources_ac_s3_cuda, + COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + int* SIMULATION_TYPEf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda, + ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_acoustic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) {} + + +// +// src/cuda/compute_add_sources_elastic_cuda.cu +// + +void FC_FUNC_(compute_add_sources_el_cuda, + COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* NSOURCESf, + double* h_stf_pre_compute, + int* myrankf) {} + +void FC_FUNC_(compute_add_sources_el_s3_cuda, + COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer, + double* h_stf_pre_compute, + int* NSOURCESf, + int* phase_is_inner, + int* myrank) {} + +void FC_FUNC_(add_source_master_rec_noise_cu, + ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f, + int* myrank_f, + int* it_f, + int* irec_master_noise_f, + int* islice_selected_rec) {} + +void FC_FUNC_(add_sources_el_sim_type_2_or_3, + ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, + realw* h_adj_sourcearrays, + int* phase_is_inner, + int* h_ispec_is_inner, + int* h_ispec_is_elastic, + int* h_ispec_selected_rec, + int* myrank, + int* nrec, + int* time_index, + int* h_islice_selected_rec, + int* nadj_rec_local, + int* NTSTEP_BETWEEN_READ_ADJSRC) {} + + +// +// src/cuda/compute_coupling_cuda.cu +// + +void FC_FUNC_(compute_coupling_ac_el_cuda, + COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) {} + +void FC_FUNC_(compute_coupling_el_ac_cuda, + COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* num_coupling_ac_el_facesf, + int* SIMULATION_TYPEf) {} + + +// +// src/cuda/compute_forces_acoustic_cuda.cu +// + +void FC_FUNC_(transfer_boun_pot_from_device, + TRANSFER_BOUN_POT_FROM_DEVICE)( + int* size, + long* Mesh_pointer_f, + realw* potential_dot_dot_acoustic, + realw* send_potential_dot_dot_buffer, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT){} + +void FC_FUNC_(transfer_asmbl_pot_to_device, + TRANSFER_ASMBL_POT_TO_DEVICE)( + long* Mesh_pointer, + realw* potential_dot_dot_acoustic, + realw* buffer_recv_scalar_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(compute_forces_acoustic_cuda, + COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_acoustic, + int* nspec_inner_acoustic, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + int* SIMULATION_TYPE) {} + +void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)( + long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltatover2_F) {} + +void FC_FUNC_(acoustic_enforce_free_surf_cuda, + ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* ABSORB_FREE_SURFACE) {} + + +// +// src/cuda/compute_forces_elastic_cuda.cu +// + +//void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*); // {};} + +void FC_FUNC_(transfer_boun_accel_from_device, + TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel, + realw* send_accel_buffer, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT){} + +void FC_FUNC_(transfer_boundary_from_device_a, + TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer, + int* nspec_outer_elastic) {} + +void FC_FUNC_(transfer_boundary_to_device_a, + TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh) {} + +//void FC_FUNC_(assemble_accel_on_device, +// ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel, +// realw* buffer_recv_vector_ext_mesh, +// int* num_interfaces_ext_mesh, +// int* max_nibool_interfaces_ext_mesh, +// int* nibool_interfaces_ext_mesh, +// int* ibool_interfaces_ext_mesh, +// int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(transfer_asmbl_accel_to_device, + TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel, + realw* buffer_recv_vector_ext_mesh, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* nibool_interfaces_ext_mesh, + int* ibool_interfaces_ext_mesh, + int* FORWARD_OR_ADJOINT) {} + +void FC_FUNC_(compute_forces_elastic_cuda, + COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* iphase, + int* nspec_outer_elastic, + int* nspec_inner_elastic, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY) {} + +void FC_FUNC_(sync_copy_from_device, + SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f, + int* iphase, + realw* send_buffer) {} + +void FC_FUNC_(kernel_3_a_cuda, + KERNEL_3_A_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F, + int* OCEANS) {} + +void FC_FUNC_(kernel_3_b_cuda, + KERNEL_3_B_CUDA)(long* Mesh_pointer, + int* size_F, + realw* deltatover2_F, + int* SIMULATION_TYPE_f, + realw* b_deltatover2_F) {} + +void FC_FUNC_(elastic_ocean_load_cuda, + ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, + int* SIMULATION_TYPE) {} + + +// +// src/cuda/compute_kernels_cuda.cu +// + +void FC_FUNC_(compute_kernels_elastic_cuda, + COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer, + realw* deltat_f) {} + +void FC_FUNC_(compute_kernels_strgth_noise_cu, + COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, + realw* h_noise_surface_movie, + realw* deltat) {} + +void FC_FUNC_(compute_kernels_acoustic_cuda, + COMPUTE_KERNELS_ACOUSTIC_CUDA)( + long* Mesh_pointer, + realw* deltat_f) {} + +void FC_FUNC_(compute_kernels_hess_cuda, + COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer, + realw* deltat_f, + int* ELASTIC_SIMULATION, + int* ACOUSTIC_SIMULATION) {} + + +// +// src/cuda/compute_stacey_acoustic_cuda.cu +// + +void FC_FUNC_(compute_stacey_acoustic_cuda, + COMPUTE_STACEY_ACOUSTIC_CUDA)( + long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_potential) {} + + +// +// src/cuda/compute_stacey_elastic_cuda.cu +// + +void FC_FUNC_(compute_stacey_elastic_cuda, + COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, + int* phase_is_innerf, + int* SIMULATION_TYPEf, + int* SAVE_FORWARDf, + realw* h_b_absorb_field) {} + + +// +// src/cuda/it_update_displacement_cuda.cu +// + +void FC_FUNC_(it_update_displacement_cuda, + IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) {} + +void FC_FUNC_(it_update_displacement_ac_cuda, + it_update_displacement_ac_cuda)(long* Mesh_pointer_f, + int* size_F, + realw* deltat_F, + realw* deltatsqover2_F, + realw* deltatover2_F, + int* SIMULATION_TYPE, + realw* b_deltat_F, + realw* b_deltatsqover2_F, + realw* b_deltatover2_F) {} + + +// +// src/cuda/noise_tomography_cuda.cu +// + +void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} + +void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} + +void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} + +void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} + +void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} + +void FC_FUNC_(transfer_surface_to_host, + TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f, + realw* h_noise_surface_movie) {} + +void FC_FUNC_(noise_read_add_surface_movie_cu, + NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f, + realw* h_noise_surface_movie, + int* NOISE_TOMOGRAPHYf) {} + + +// +// src/cuda/prepare_mesh_constants_cuda.cu +// + +void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {} + +void FC_FUNC_(output_free_device_memory, + OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} + +void FC_FUNC_(show_free_device_memory, + SHOW_FREE_DEVICE_MEMORY)() {} + +void FC_FUNC_(get_free_device_memory, + get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} + +void FC_FUNC_(prepare_cuda_device, + PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { + fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n"); + exit(1); +} + +void FC_FUNC_(prepare_constants_device, + PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer, + int* h_NGLLX, + int* NSPEC_AB, int* NGLOB_AB, + realw* h_xix, realw* h_xiy, realw* h_xiz, + realw* h_etax, realw* h_etay, realw* h_etaz, + realw* h_gammax, realw* h_gammay, realw* h_gammaz, + realw* h_kappav, realw* h_muv, + int* h_ibool, + int* num_interfaces_ext_mesh, + int* max_nibool_interfaces_ext_mesh, + int* h_nibool_interfaces_ext_mesh, + int* h_ibool_interfaces_ext_mesh, + realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz, + realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz, + realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz, + int* ABSORBING_CONDITIONS, + int* h_abs_boundary_ispec, int* h_abs_boundary_ijk, + realw* h_abs_boundary_normal, + realw* h_abs_boundary_jacobian2Dw, + int* h_num_abs_boundary_faces, + int* h_ispec_is_inner, + int* NSOURCES, + int* nsources_local_f, + realw* h_sourcearrays, + int* h_islice_selected_source, + int* h_ispec_selected_source, + int* h_number_receiver_global, + int* h_ispec_selected_rec, + int* nrec_f, + int* nrec_local_f, + int* SIMULATION_TYPE, + int* USE_MESH_COLORING_GPU_f, + int* nspec_acoustic,int* nspec_elastic, + int* my_neighbours_ext_mesh, + int* request_send_vector_ext_mesh, + int* request_recv_vector_ext_mesh, + realw* buffer_recv_vector_ext_mesh + ) {} + +void FC_FUNC_(prepare_fields_acoustic_device, + PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, + realw* rmass_acoustic, + realw* rhostore, + realw* kappastore, + int* num_phase_ispec_acoustic, + int* phase_ispec_inner_acoustic, + int* ispec_is_acoustic, + int* NOISE_TOMOGRAPHY, + int* num_free_surface_faces, + int* free_surface_ispec, + int* free_surface_ijk, + int* ABSORBING_CONDITIONS, + int* b_reclen_potential, + realw* b_absorb_potential, + int* ELASTIC_SIMULATION, + int* num_coupling_ac_el_faces, + int* coupling_ac_el_ispec, + int* coupling_ac_el_ijk, + realw* coupling_ac_el_normal, + realw* coupling_ac_el_jacobian2Dw, + int* num_colors_outer_acoustic, + int* num_colors_inner_acoustic, + int* num_elem_colors_acoustic) {} + +void FC_FUNC_(prepare_fields_acoustic_adj_dev, + PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* APPROXIMATE_HESS_KL) {} + +void FC_FUNC_(prepare_fields_elastic_device, + PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f, + int* size, + realw* rmass, + realw* rho_vp, + realw* rho_vs, + int* num_phase_ispec_elastic, + int* phase_ispec_inner_elastic, + int* ispec_is_elastic, + int* ABSORBING_CONDITIONS, + realw* h_b_absorb_field, + int* h_b_reclen_field, + int* SIMULATION_TYPE,int* SAVE_FORWARD, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy, + realw* epsilondev_xz,realw* epsilondev_yz, + int* ATTENUATION, + int* R_size, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + realw* one_minus_sum_beta,realw* factor_common, + realw* alphaval,realw* betaval,realw* gammaval, + int* OCEANS, + realw* rmass_ocean_load, + int* NOISE_TOMOGRAPHY, + realw* free_surface_normal, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* ACOUSTIC_SIMULATION, + int* num_colors_outer_elastic, + int* num_colors_inner_elastic, + int* num_elem_colors_elastic, + int* ANISOTROPY, + realw *c11store, + realw *c12store, + realw *c13store, + realw *c14store, + realw *c15store, + realw *c16store, + realw *c22store, + realw *c23store, + realw *c24store, + realw *c25store, + realw *c26store, + realw *c33store, + realw *c34store, + realw *c35store, + realw *c36store, + realw *c44store, + realw *c45store, + realw *c46store, + realw *c55store, + realw *c56store, + realw *c66store){} + +void FC_FUNC_(prepare_fields_elastic_adj_dev, + PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f, + int* size, + int* SIMULATION_TYPE, + int* COMPUTE_AND_STORE_STRAIN, + realw* epsilon_trace_over_3, + realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy, + realw* b_epsilondev_xz,realw* b_epsilondev_yz, + realw* b_epsilon_trace_over_3, + int* ATTENUATION, + int* R_size, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + realw* b_alphaval,realw* b_betaval,realw* b_gammaval, + int* APPROXIMATE_HESS_KL){} + +void FC_FUNC_(prepare_sim2_or_3_const_device, + PREPARE_SIM2_OR_3_CONST_DEVICE)( + long* Mesh_pointer_f, + int* islice_selected_rec, + int* islice_selected_rec_size, + int* nadj_rec_local, + int* nrec, + int* myrank) {} + +void FC_FUNC_(prepare_fields_noise_device, + PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f, + int* NSPEC_AB, int* NGLOB_AB, + int* free_surface_ispec, + int* free_surface_ijk, + int* num_free_surface_faces, + int* SIMULATION_TYPE, + int* NOISE_TOMOGRAPHY, + int* NSTEP, + realw* noise_sourcearray, + realw* normal_x_noise, + realw* normal_y_noise, + realw* normal_z_noise, + realw* mask_noise, + realw* free_surface_jacobian2Dw) {} + +void FC_FUNC_(prepare_fields_gravity_device, + PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f, + int* GRAVITY, + realw* minus_deriv_gravity, + realw* minus_g, + realw* h_wgll_cube, + int* ACOUSTIC_SIMULATION, + realw* rhostore) {} + +void FC_FUNC_(prepare_seismogram_fields, + PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {} + +void FC_FUNC_(prepare_cleanup_device, + PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f, + int* SIMULATION_TYPE, + int* SAVE_FORWARD, + int* ACOUSTIC_SIMULATION, + int* ELASTIC_SIMULATION, + int* ABSORBING_CONDITIONS, + int* NOISE_TOMOGRAPHY, + int* COMPUTE_AND_STORE_STRAIN, + int* ATTENUATION, + int* ANISOTROPY, + int* OCEANS, + int* APPROXIMATE_HESS_KL) {} + + +// +// src/cuda/transfer_fields_cuda.cu +// + +void FC_FUNC_(transfer_fields_el_to_device, + TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_fields_el_from_device, + TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_to_device, + TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_from_device, + TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_accel_to_device, + TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_accel_from_device, + TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_accel_from_device, + TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_sigma_from_device, + TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_displ_from_device, + TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_displ_from_device, + TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_compute_kernel_answers_from_device, + TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa) {} + +void FC_FUNC_(transfer_compute_kernel_fields_from_device, + TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer, + realw* accel, int* size_accel, + realw* b_displ, int* size_b_displ, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_b_epsilondev, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa, + realw* epsilon_trace_over_3, + realw* b_epsilon_trace_over_3, + int* size_epsilon_trace_over_3) {} + +void FC_FUNC_(transfer_b_fields_att_to_device, + TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + int* size_R, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_epsilondev) {} + +void FC_FUNC_(transfer_fields_att_from_device, + TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + int* size_R, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev) {} + +void FC_FUNC_(transfer_kernels_el_to_host, + TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer, + realw* h_rho_kl, + realw* h_mu_kl, + realw* h_kappa_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_noise_to_host, + TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, + realw* h_Sigma_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_fields_ac_to_device, + TRANSFER_FIELDS_AC_TO_DEVICE)( + int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_ac_to_device, + TRANSFER_B_FIELDS_AC_TO_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_fields_ac_from_device, + TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_fields_ac_from_device, + TRANSFER_B_FIELDS_AC_FROM_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_dot_dot_from_device, + TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_b_dot_dot_from_device, + TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {} + +void FC_FUNC_(transfer_kernels_ac_to_host, + TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer, + realw* h_rho_ac_kl, + realw* h_kappa_ac_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_hess_el_tohost, + TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer, + realw* h_hess_kl, + int* NSPEC_AB) {} + +void FC_FUNC_(transfer_kernels_hess_ac_tohost, + TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer, + realw* h_hess_ac_kl, + int* NSPEC_AB) {} + + +// +// src/cuda/write_seismograms_cuda.cu +// + +void FC_FUNC_(transfer_seismograms_el_from_d, + TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local, + long* Mesh_pointer_f, + int* SIMULATION_TYPEf, + realw* seismograms_d, + realw* seismograms_v, + realw* seismograms_a, + int* it) {} + +void FC_FUNC_(transfer_station_el_from_device, + TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel, + realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f,int* number_receiver_global, + int* ispec_selected_rec,int* ispec_selected_source, + int* ibool,int* SIMULATION_TYPEf) {} + +void FC_FUNC_(transfer_station_ac_from_device, + TRANSFER_STATION_AC_FROM_DEVICE)( + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f, + int* number_receiver_global, + int* ispec_selected_rec, + int* ispec_selected_source, + int* ibool, + int* SIMULATION_TYPEf) {} + diff --git a/src/cuda/transfer_fields_cuda.cu b/src/cuda/transfer_fields_cuda.cu new file mode 100644 index 000000000..2e6ca44e2 --- /dev/null +++ b/src/cuda/transfer_fields_cuda.cu @@ -0,0 +1,586 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" +#include "prepare_constants_cuda.h" + +/* ----------------------------------------------------------------------------------------------- */ + +// Transfer functions + +/* ----------------------------------------------------------------------------------------------- */ + + + +/* ----------------------------------------------------------------------------------------------- */ + +// for ELASTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_fields_el_to_device, + TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) { + +TRACE("transfer_fields_el_to_device_"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003); + print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004); + print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_fields_el_from_device, + TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) { + + TRACE("transfer_fields_el_from_device_"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006); + print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007); + print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_fields_to_device, + TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f) { + + TRACE("transfer_b_fields_to_device_"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_fields_from_device, + TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) { + +TRACE("transfer_b_fields_from_device_"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost); + cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost); + cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost); + +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_accel_to_device, + TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) { + +TRACE("transfer_accel_to_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_accel_from_device, + TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) { + +TRACE("transfer_accel_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_accel_from_device, + TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) { + +TRACE("transfer_b_accel_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_sigma_from_device, + TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) { + +TRACE("transfer_sigma_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_displ_from_device, + TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) { + +TRACE("transfer_b_displ_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_displ_from_device, + TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) { + +TRACE("transfer_displ_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066); + +} + +/* ----------------------------------------------------------------------------------------------- */ +/* +extern "C" +void FC_FUNC_(transfer_compute_kernel_answers_from_device, + TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa) { +TRACE("transfer_compute_kernel_answers_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost); + +} +*/ + +/* ----------------------------------------------------------------------------------------------- */ +/* +extern "C" +void FC_FUNC_(transfer_compute_kernel_fields_from_device, + TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer, + realw* accel, int* size_accel, + realw* b_displ, int* size_b_displ, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_b_epsilondev, + realw* rho_kl,int* size_rho, + realw* mu_kl, int* size_mu, + realw* kappa_kl, int* size_kappa, + realw* epsilon_trace_over_3, + realw* b_epsilon_trace_over_3, + int* size_epsilon_trace_over_3) { +TRACE("transfer_compute_kernel_fields_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw), + cudaMemcpyDeviceToHost); + cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw), + cudaMemcpyDeviceToHost); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_compute_kernel_fields_from_device"); +#endif +} +*/ + +/* ----------------------------------------------------------------------------------------------- */ + +// attenuation fields + +extern "C" +void FC_FUNC_(transfer_b_fields_att_to_device, + TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer, + realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz, + int* size_R, + realw* b_epsilondev_xx, + realw* b_epsilondev_yy, + realw* b_epsilondev_xy, + realw* b_epsilondev_xz, + realw* b_epsilondev_yz, + int* size_epsilondev) { + TRACE("transfer_b_fields_att_to_device"); + //get mesh pointer out of fortran integer container + Mesh* mp = (Mesh*)(*Mesh_pointer); + + cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice); + + cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice); + cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice); + + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_b_fields_att_to_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +// attenuation fields + +extern "C" +void FC_FUNC_(transfer_fields_att_from_device, + TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer, + realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz, + int* size_R, + realw* epsilondev_xx, + realw* epsilondev_yy, + realw* epsilondev_xy, + realw* epsilondev_xz, + realw* epsilondev_yz, + int* size_epsilondev) { + TRACE("transfer_fields_att_from_device"); + //get mesh pointer out of fortran integer container + Mesh* mp = (Mesh*)(*Mesh_pointer); + + cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost); + + cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost); + + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_fields_att_from_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_kernels_el_to_host, + TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer, + realw* h_rho_kl, + realw* h_mu_kl, + realw* h_kappa_kl, + int* NSPEC_AB) { +TRACE("transfer_kernels_el_to_host"); + //get mesh pointer out of fortran integer container + Mesh* mp = (Mesh*)(*Mesh_pointer); + + print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw), + cudaMemcpyDeviceToHost),40101); + print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw), + cudaMemcpyDeviceToHost),40102); + print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw), + cudaMemcpyDeviceToHost),40103); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +// for NOISE simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_kernels_noise_to_host, + TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, + realw* h_Sigma_kl, + int* NSPEC_AB) { +TRACE("transfer_kernels_noise_to_host"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw), + cudaMemcpyDeviceToHost),40201); + +} + + +/* ----------------------------------------------------------------------------------------------- */ + +// for ACOUSTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_fields_ac_to_device, + TRANSFER_FIELDS_AC_TO_DEVICE)( + int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) { +TRACE("transfer_fields_ac_to_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110); + print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),50120); + print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),50130); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_fields_ac_to_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_fields_ac_to_device, + TRANSFER_B_FIELDS_AC_TO_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) { +TRACE("transfer_b_fields_ac_to_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),51120); + print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyHostToDevice),51130); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_b_fields_ac_to_device"); +#endif +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_fields_ac_from_device, + TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size, + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + long* Mesh_pointer_f) { +TRACE("transfer_fields_ac_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111); + print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52121); + print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52131); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_fields_ac_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_fields_ac_from_device, + TRANSFER_B_FIELDS_AC_FROM_DEVICE)( + int* size, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f) { +TRACE("transfer_b_fields_ac_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111); + print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53121); + print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53131); + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("after transfer_b_fields_ac_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_dot_dot_from_device, + TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) { + + TRACE("transfer_dot_dot_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_b_dot_dot_from_device, + TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) { + + TRACE("transfer_b_dot_dot_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic, + sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042); + +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_kernels_ac_to_host, + TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer, + realw* h_rho_ac_kl, + realw* h_kappa_ac_kl, + int* NSPEC_AB) { + + TRACE("transfer_kernels_ac_to_host"); + + //get mesh pointer out of fortran integer container + Mesh* mp = (Mesh*)(*Mesh_pointer); + int size = *NSPEC_AB*NGLL3; + + // copies kernel values over to CPU host + print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(realw), + cudaMemcpyDeviceToHost),54101); + print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(realw), + cudaMemcpyDeviceToHost),54102); +} + +/* ----------------------------------------------------------------------------------------------- */ + +// for Hess kernel calculations + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_kernels_hess_el_tohost, + TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer, + realw* h_hess_kl, + int* NSPEC_AB) { +TRACE("transfer_kernels_hess_el_tohost"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw), + cudaMemcpyDeviceToHost),70201); +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_kernels_hess_ac_tohost, + TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer, + realw* h_hess_ac_kl, + int* NSPEC_AB) { + TRACE("transfer_kernels_hess_ac_tohost"); + + Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container + + print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw), + cudaMemcpyDeviceToHost),70202); +} + + diff --git a/src/cuda/write_seismograms_cuda.cu b/src/cuda/write_seismograms_cuda.cu new file mode 100644 index 000000000..d706905b2 --- /dev/null +++ b/src/cuda/write_seismograms_cuda.cu @@ -0,0 +1,537 @@ +/* + !===================================================================== + ! + ! S p e c f e m 3 D V e r s i o n 2 . 0 + ! --------------------------------------- + ! + ! Main authors: Dimitri Komatitsch and Jeroen Tromp + ! Princeton University, USA and University of Pau / CNRS / INRIA + ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA + ! April 2011 + ! + ! This program is free software; you can redistribute it and/or modify + ! it under the terms of the GNU General Public License as published by + ! the Free Software Foundation; either version 2 of the License, or + ! (at your option) any later version. + ! + ! This program is distributed in the hope that it will be useful, + ! but WITHOUT ANY WARRANTY; without even the implied warranty of + ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + ! GNU General Public License for more details. + ! + ! You should have received a copy of the GNU General Public License along + ! with this program; if not, write to the Free Software Foundation, Inc., + ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + ! + !===================================================================== + */ + +#include +#include +#include + +#include +#include + +#include "config.h" +#include "mesh_constants_cuda.h" + + +/* ----------------------------------------------------------------------------------------------- */ + +// ELASTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +/* + ! gets global number of that receiver + irec = number_receiver_global(irec_local) + + ! gets local receiver interpolators + ! (1-D Lagrange interpolators) + hxir(:) = hxir_store(irec_local,:) + hetar(:) = hetar_store(irec_local,:) + hgammar(:) = hgammar_store(irec_local,:) + +*/ + + +// Initially sets the blocks_x to be the num_blocks, and adds rows as +// needed. If an additional row is added, the row length is cut in +// half. If the block count is odd, there will be 1 too many blocks, +// which must be managed at runtime with an if statement. +void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) { + *num_blocks_x = num_blocks; + *num_blocks_y = 1; + while(*num_blocks_x > 65535) { + *num_blocks_x = (int) ceil(*num_blocks_x*0.5f); + *num_blocks_y = *num_blocks_y*2; + } + return; +} + +/* ----------------------------------------------------------------------------------------------- */ + +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; +old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void compute_interpolated_dva_plus_seismogram(int nrec_local, + realw* displ, realw* veloc, realw* accel, + int* ibool, + double* hxir, double* hetar, double* hgammar, + realw* seismograms_d, realw* seismograms_v, realw* seismograms_a, + double* nu, + int* number_receiver_global, + int* ispec_selected_rec) { + int irec_local = blockIdx.x + blockIdx.y*gridDim.x; + int i = threadIdx.x; + int j = threadIdx.y; + int k = threadIdx.z; + int ijk = i+5*(j+5*(k)); + + // we do the **d variable reduction in shared memory, because the + // atomicAdd() should be faster on the shared memory registers + // according to + // http://supercomputingblog.com/cuda/cuda-tutorial-4-atomic-operations/ + __shared__ double sh_dxd[NGLL3]; + __shared__ double sh_dyd[NGLL3]; + __shared__ double sh_dzd[NGLL3]; + __shared__ double sh_vxd[NGLL3]; + __shared__ double sh_vyd[NGLL3]; + __shared__ double sh_vzd[NGLL3]; + __shared__ double sh_axd[NGLL3]; + __shared__ double sh_ayd[NGLL3]; + __shared__ double sh_azd[NGLL3]; + + if(irec_local < nrec_local) { + int irec = number_receiver_global[irec_local]-1; + int ispec = ispec_selected_rec[irec]-1; + int iglob = ibool[ijk+125*ispec]-1; + double hlagrange = hxir[irec_local + nrec_local*i]*hetar[irec_local + nrec_local*j]*hgammar[irec_local + nrec_local*k]; + sh_dxd[ijk] = hlagrange*displ[0+3*iglob]; + sh_dyd[ijk] = hlagrange*displ[1+3*iglob]; + sh_dzd[ijk] = hlagrange*displ[2+3*iglob]; + + sh_vxd[ijk] = hlagrange*veloc[0+3*iglob]; + sh_vyd[ijk] = hlagrange*veloc[1+3*iglob]; + sh_vzd[ijk] = hlagrange*veloc[2+3*iglob]; + + sh_axd[ijk] = hlagrange*accel[0+3*iglob]; + sh_ayd[ijk] = hlagrange*accel[1+3*iglob]; + sh_azd[ijk] = hlagrange*accel[2+3*iglob]; + + // the reduction has to skip the first element (we don't need to + // add element 0 to itself) This reduction serializes the code, + // but it should be fast enough --- it can be made faster with a + // proper reduction algorithm. + __syncthreads(); + + // if(ijk>0) { + // reduction needs to be done atomically to avoid race conditions + // atomicAdd(&sh_dxd[0],sh_dxd[ijk]); + // atomicAdd(&sh_dyd[0],sh_dyd[ijk]); + // atomicAdd(&sh_dzd[0],sh_dzd[ijk]); + + // atomicAdd(&sh_vxd[0],sh_vxd[ijk]); + // atomicAdd(&sh_vyd[0],sh_vyd[ijk]); + // atomicAdd(&sh_vzd[0],sh_vzd[ijk]); + + // atomicAdd(&sh_axd[0],sh_axd[ijk]); + // atomicAdd(&sh_ayd[0],sh_ayd[ijk]); + // atomicAdd(&sh_azd[0],sh_azd[ijk]); + // } + // __syncthreads(); + if(ijk==0) { + // a loop in thread 0 is 4 times faster than atomic operations + for(int i=1;i<125;i++) { + sh_dxd[0] += sh_dxd[i]; + sh_dyd[0] += sh_dyd[i]; + sh_dzd[0] += sh_dzd[i]; + + sh_vxd[0] += sh_vxd[i]; + sh_vyd[0] += sh_vyd[i]; + sh_vzd[0] += sh_vzd[i]; + + sh_axd[0] += sh_axd[i]; + sh_ayd[0] += sh_ayd[i]; + sh_azd[0] += sh_azd[i]; + + } + + seismograms_d[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_dxd[0] + nu[0+3*(1+3*irec)]*sh_dyd[0] + nu[0+3*(2+3*irec)]*sh_dzd[0]; + seismograms_d[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_dxd[0] + nu[1+3*(1+3*irec)]*sh_dyd[0] + nu[1+3*(2+3*irec)]*sh_dzd[0]; + seismograms_d[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_dxd[0] + nu[2+3*(1+3*irec)]*sh_dyd[0] + nu[2+3*(2+3*irec)]*sh_dzd[0]; + + seismograms_v[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_vxd[0] + nu[0+3*(1+3*irec)]*sh_vyd[0] + nu[0+3*(2+3*irec)]*sh_vzd[0]; + seismograms_v[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_vxd[0] + nu[1+3*(1+3*irec)]*sh_vyd[0] + nu[1+3*(2+3*irec)]*sh_vzd[0]; + seismograms_v[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_vxd[0] + nu[2+3*(1+3*irec)]*sh_vyd[0] + nu[2+3*(2+3*irec)]*sh_vzd[0]; + + seismograms_a[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_axd[0] + nu[0+3*(1+3*irec)]*sh_ayd[0] + nu[0+3*(2+3*irec)]*sh_azd[0]; + seismograms_a[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_axd[0] + nu[1+3*(1+3*irec)]*sh_ayd[0] + nu[1+3*(2+3*irec)]*sh_azd[0]; + seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0]; + + } + } +} + + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_seismograms_el_from_d, + TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local, + long* Mesh_pointer_f, + int* SIMULATION_TYPEf, + realw* seismograms_d, + realw* seismograms_v, + realw* seismograms_a, + int* it) { + +// transfers seismograms from device to host + + TRACE("transfer_seismograms_el_from_d"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + + int num_blocks_x, num_blocks_y; + get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y); + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(5,5,5); + + // double h_debug[125]; for(int i=0;i<125;i++) h_debug[i] = 0; + // double* d_debug; cudaMalloc((void**)&d_debug,125*sizeof(double)); + // cudaMemcpy(d_debug,h_debug,125*sizeof(double),cudaMemcpyHostToDevice); + // Cuda timing + // cudaEvent_t start, stop; + // realw time; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); + // cudaEventRecord( start, 0 ); + + compute_interpolated_dva_plus_seismogram<<compute_stream>>>(*nrec_local, + mp->d_displ,mp->d_veloc,mp->d_accel, + mp->d_ibool, + mp->d_hxir, mp->d_hetar, mp->d_hgammar, + mp->d_seismograms_d, + mp->d_seismograms_v, + mp->d_seismograms_a, + mp->d_nu, + mp->d_number_receiver_global, + mp->d_ispec_selected_rec + ); + + // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost); + + cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost); + cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost); + cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost); + + // cudaEventRecord( stop, 0 ); + // cudaEventSynchronize( stop ); + // cudaEventElapsedTime( &time, start, stop ); + // cudaEventDestroy( start ); + // cudaEventDestroy( stop ); + // printf("seismogram Execution Time: %f ms\n",time); + + // if(abs(mp->h_seismograms_d_it[0]) < 1e-25) printf("seismo1_x=%e\n",mp->h_seismograms_d_it[0]); + // if(abs(mp->h_seismograms_d_it[1]) < 1e-25) printf("seismo1_y=%e\n",mp->h_seismograms_d_it[1]); + + // if(abs(mp->h_seismograms_d_it[2]) < 1e-25) { + + // printf("%d:seismo1_z=%e\n",*it,mp->h_seismograms_d_it[2]); + + // } + + + memcpy(&seismograms_d[3**nrec_local*(*it-1)],mp->h_seismograms_d_it,3* *nrec_local*sizeof(realw)); + memcpy(&seismograms_v[3**nrec_local*(*it-1)],mp->h_seismograms_v_it,3* *nrec_local*sizeof(realw)); + memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw)); + +} + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global, + int* ispec_selected_rec, + int* ibool, + realw* station_seismo_field, + realw* desired_field, + int nrec_local) { + int blockID = blockIdx.x + blockIdx.y*gridDim.x; + if(blockIDnrec_local == 0 ) return; + + int blocksize = NGLL3; + int num_blocks_x = mp->nrec_local; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // prepare field transfer array on device + transfer_stations_fields_from_device_kernel<<compute_stream>>>(mp->d_number_receiver_global, + d_ispec_selected, + mp->d_ibool, + mp->d_station_seismo_field, + d_field, + mp->nrec_local); + + cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field, + (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost); + + int irec_local; + for(irec_local=0;irec_localnrec_local;irec_local++) { + int irec = number_receiver_global[irec_local] - 1; + int ispec = h_ispec_selected[irec] - 1; + + for(int i=0;ih_station_seismo_field[0+3*i+irec_local*NGLL3*3]; + h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3]; + h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3]; + } + + } +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("transfer_field_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_station_el_from_device, + TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel, + realw* b_displ, realw* b_veloc, realw* b_accel, + long* Mesh_pointer_f,int* number_receiver_global, + int* ispec_selected_rec,int* ispec_selected_source, + int* ibool,int* SIMULATION_TYPEf) { +TRACE("transfer_station_el_from_device"); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + // checks if anything to do + if( mp->nrec_local == 0 ) return; + + int SIMULATION_TYPE = *SIMULATION_TYPEf; + + if(SIMULATION_TYPE == 1) { + transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + } + else if(SIMULATION_TYPE == 2) { + transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + } + else if(SIMULATION_TYPE == 3) { + transfer_field_from_device(mp,mp->d_b_displ,b_displ, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_from_device(mp,mp->d_b_veloc,b_veloc, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_from_device(mp,mp->d_b_accel,b_accel, number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + } + +} + +/* ----------------------------------------------------------------------------------------------- */ + +// ACOUSTIC simulations + +/* ----------------------------------------------------------------------------------------------- */ + +__global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global, + int* ispec_selected_rec, + int* ibool, + realw* station_seismo_potential, + realw* desired_potential) { + + int blockID = blockIdx.x + blockIdx.y*gridDim.x; + int nodeID = threadIdx.x + blockID*blockDim.x; + + int irec = number_receiver_global[blockID]-1; + int ispec = ispec_selected_rec[irec]-1; + int iglob = ibool[threadIdx.x + NGLL3*ispec]-1; + + //if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]); + + station_seismo_potential[nodeID] = desired_potential[iglob]; +} + +/* ----------------------------------------------------------------------------------------------- */ + +void transfer_field_acoustic_from_device(Mesh* mp, + realw* d_potential, + realw* h_potential, + int* number_receiver_global, + int* d_ispec_selected, + int* h_ispec_selected, + int* ibool) { + +TRACE("transfer_field_acoustic_from_device"); + + int irec_local,irec,ispec,iglob,j; + + // checks if anything to do + if( mp->nrec_local == 0 ) return; + + // sets up kernel dimensions + int blocksize = NGLL3; + int num_blocks_x = mp->nrec_local; + int num_blocks_y = 1; + while(num_blocks_x > 65535) { + num_blocks_x = (int) ceil(num_blocks_x*0.5f); + num_blocks_y = num_blocks_y*2; + } + + dim3 grid(num_blocks_x,num_blocks_y); + dim3 threads(blocksize,1,1); + + // prepare field transfer array on device + transfer_stations_fields_acoustic_from_device_kernel<<>>(mp->d_number_receiver_global, + d_ispec_selected, + mp->d_ibool, + mp->d_station_seismo_potential, + d_potential); + + + print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential, + mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500); + + //printf("copy local receivers: %i \n",mp->nrec_local); + + for(irec_local=0; irec_local < mp->nrec_local; irec_local++) { + irec = number_receiver_global[irec_local]-1; + ispec = h_ispec_selected[irec]-1; + + // copy element values + // note: iglob may vary and can be irregularly accessing the h_potential array + for(j=0; j < NGLL3; j++){ + iglob = ibool[j+NGLL3*ispec]-1; + h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3]; + } + + // copy each station element's points to working array + // note: this works if iglob values would be all aligned... + //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw)); + + } +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + exit_on_cuda_error("transfer_field_acoustic_from_device"); +#endif +} + +/* ----------------------------------------------------------------------------------------------- */ + +extern "C" +void FC_FUNC_(transfer_station_ac_from_device, + TRANSFER_STATION_AC_FROM_DEVICE)( + realw* potential_acoustic, + realw* potential_dot_acoustic, + realw* potential_dot_dot_acoustic, + realw* b_potential_acoustic, + realw* b_potential_dot_acoustic, + realw* b_potential_dot_dot_acoustic, + long* Mesh_pointer_f, + int* number_receiver_global, + int* ispec_selected_rec, + int* ispec_selected_source, + int* ibool, + int* SIMULATION_TYPEf) { + +TRACE("transfer_station_ac_from_device"); + //double start_time = get_time(); + + Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper + // checks if anything to do + if( mp->nrec_local == 0 ) return; + + int SIMULATION_TYPE = *SIMULATION_TYPEf; + + if(SIMULATION_TYPE == 1) { + transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + } + else if(SIMULATION_TYPE == 2) { + transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic, + number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_source, ispec_selected_source, ibool); + } + else if(SIMULATION_TYPE == 3) { + transfer_field_acoustic_from_device(mp,mp->d_b_potential_acoustic,b_potential_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic, + number_receiver_global, + mp->d_ispec_selected_rec, ispec_selected_rec, ibool); + } + +#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING + //double end_time = get_time(); + //printf("Elapsed time: %e\n",end_time-start_time); + exit_on_cuda_error("transfer_station_ac_from_device"); +#endif +} + diff --git a/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 index a5bfe5752..4b4bede11 100644 --- a/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 +++ b/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 @@ -41,7 +41,7 @@ module decompose_mesh_SCOTCH integer, dimension(:,:), allocatable :: elmnts integer, dimension(:,:), allocatable :: mat integer, dimension(:), allocatable :: part - + integer :: nnodes double precision, dimension(:,:), allocatable :: nodes_coords @@ -121,7 +121,7 @@ subroutine read_mesh_files character(len=256) :: line logical :: use_poroelastic_file integer(long) :: nspec_long - + ! sets number of nodes per element ngnod = esize @@ -158,10 +158,10 @@ subroutine read_mesh_files print*,'bit size fortran: ',bit_size(nspec) stop 'error number of elements too large' endif - + ! sets number of elements (integer 4-byte) nspec = nspec_long - + allocate(elmnts(esize,nspec),stat=ier) if( ier /= 0 ) stop 'error allocating array elmnts' do ispec = 1, nspec @@ -683,6 +683,7 @@ subroutine scotch_partitioning integer, dimension(:),allocatable :: num_material integer :: ier + ! starts from 0 elmnts(:,:) = elmnts(:,:) - 1 ! determines maximum neighbors based on 1 common node @@ -725,7 +726,7 @@ subroutine scotch_partitioning call acoustic_elastic_poro_load(elmnts_load,nspec,count_def_mat,count_undef_mat, & num_material,mat_prop,undef_mat_prop) - deallocate(num_material) + ! SCOTCH partitioning @@ -810,36 +811,40 @@ subroutine scotch_partitioning if (ier /= 0) then stop 'ERROR : MAIN : Cannot destroy strat' endif - - ! re-partitioning puts poroelastic-elastic coupled elements into same partition - ! integer :: nfaces_coupled - ! integer, dimension(:,:), pointer :: faces_coupled + + ! re-partitioning puts poroelastic-elastic coupled elements into same partition + ! integer :: nfaces_coupled + ! integer, dimension(:,:), pointer :: faces_coupled + + ! TODO: supposed to rebalance, but currently broken call poro_elastic_repartitioning (nspec, nnodes, elmnts, & - count_def_mat, mat(1,:) , mat_prop, & + count_def_mat, num_material , mat_prop, & sup_neighbour, nsize, & nparts, part) - !nparts, part, nfaces_coupled, faces_coupled) - ! re-partitioning puts moho-surface coupled elements into same partition + deallocate(num_material) + + ! re-partitioning puts moho-surface coupled elements into same partition call moho_surface_repartitioning (nspec, nnodes, elmnts, & sup_neighbour, nsize, nparts, part, & nspec2D_moho,ibelm_moho,nodes_ibelm_moho ) - - ! local number of each element for each partition + ! local number of each element for each partition call build_glob2loc_elmnts(nspec, part, glob2loc_elmnts,nparts) - ! local number of each node for each partition + ! local number of each node for each partition call build_glob2loc_nodes(nspec, nnodes,nsize, nnodes_elmnts, nodes_elmnts, part, & glob2loc_nodes_nparts, glob2loc_nodes_parts, glob2loc_nodes, nparts) - ! mpi interfaces + ! mpi interfaces ! acoustic/elastic/poroelastic boundaries will be split into different MPI partitions call build_interfaces(nspec, sup_neighbour, part, elmnts, & xadj, adjncy, tab_interfaces, & tab_size_interfaces, ninterfaces, & nparts) + + !or: uncomment if you want acoustic/elastic boundaries NOT to be separated into different MPI partitions !call build_interfaces_no_ac_el_sep(nspec, sup_neighbour, part, elmnts, & ! xadj, adjncy, tab_interfaces, & @@ -877,19 +882,23 @@ subroutine write_mesh_databases endif ! gets number of nodes + call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords, & glob2loc_nodes_nparts, glob2loc_nodes_parts, & glob2loc_nodes, nnodes, 1) - ! gets number of spectral elements call write_partition_database(IIN_database, ipart, nspec_local, nspec, elmnts, & glob2loc_elmnts, glob2loc_nodes_nparts, & glob2loc_nodes_parts, glob2loc_nodes, part, mat, ngnod, 1) + !debug + !print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc + ! writes out node coordinate locations !write(IIN_database,*) nnodes_loc write(IIN_database) nnodes_loc + call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords,& glob2loc_nodes_nparts, glob2loc_nodes_parts, & glob2loc_nodes, nnodes, 2) @@ -956,4 +965,3 @@ end subroutine write_mesh_databases !end program pre_meshfem3D end module decompose_mesh_SCOTCH - diff --git a/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90 index c76c4ba7c..4bbb81cd4 100644 --- a/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90 +++ b/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90 @@ -1138,6 +1138,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte ! local_nodes(1), -1, -1, -1 write(IIN_database) local_elmnt, tab_interfaces(k*7+2), & local_nodes(1), -1, -1, -1 + case (2) ! edge element do l = glob2loc_nodes_nparts(tab_interfaces(k*7+3)), & @@ -1156,6 +1157,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte ! local_nodes(1), local_nodes(2), -1, -1 write(IIN_database) local_elmnt, tab_interfaces(k*7+2), & local_nodes(1), local_nodes(2), -1, -1 + case (4) ! face element count_faces = count_faces + 1 @@ -1187,6 +1189,7 @@ subroutine write_interfaces_database(IIN_database, tab_interfaces, tab_size_inte ! local_nodes(1), local_nodes(2),local_nodes(3), local_nodes(4) write(IIN_database) local_elmnt, tab_interfaces(k*7+2), & local_nodes(1), local_nodes(2),local_nodes(3), local_nodes(4) + case default print *, "error in write_interfaces_database!", tab_interfaces(k*7+2), iproc end select @@ -1382,12 +1385,11 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, & nb_materials, num_material, mat_prop, & sup_neighbour, nsize, & nproc, part) - !nproc, part, nfaces_coupled, faces_coupled) implicit none - integer,intent(in) :: nspec - integer, intent(in) :: nnodes, nproc, nb_materials + integer, intent(in) :: nspec + integer, intent(in) :: nnodes, nproc, nb_materials integer, intent(in) :: sup_neighbour integer, intent(in) :: nsize @@ -1398,11 +1400,10 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, & integer, dimension(0:nspec-1) :: part integer, dimension(0:esize*nspec-1) :: elmnts + ! local parameters integer :: nfaces_coupled - !integer, intent(out) :: nfaces_coupled integer, dimension(:,:), pointer :: faces_coupled - logical, dimension(nb_materials) :: is_poroelastic, is_elastic ! neighbors @@ -1429,8 +1430,8 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, & enddo ! checks if any poroelastic/elastic elements are set - !if( .not. any(is_poroelastic) ) return - !if( .not. any(is_elastic) ) return + if( .not. any(is_poroelastic) ) return + if( .not. any(is_elastic) ) return ! gets neighbors by 4 common nodes (face) allocate(xadj(0:nspec),stat=ier) @@ -1461,6 +1462,7 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, & ! coupled elements allocate(faces_coupled(2,nfaces_coupled),stat=ier) if( ier /= 0 ) stop 'error allocating array faces_coupled' + faces_coupled(:,:) = -1 ! stores elements indices nfaces_coupled = 0 @@ -1494,7 +1496,10 @@ subroutine poro_elastic_repartitioning (nspec, nnodes, elmnts, & endif enddo - end subroutine poro_elastic_repartitioning + deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts) + deallocate(faces_coupled) + + end subroutine poro_elastic_repartitioning !-------------------------------------------------- ! Repartitioning : two coupled moho surface elements are transfered to the same partition @@ -1624,6 +1629,7 @@ subroutine moho_surface_repartitioning (nspec, nnodes, elmnts, & ! coupled elements allocate(faces_coupled(2,nfaces_coupled),stat=ier) if( ier /= 0 ) stop 'error allocating array faces_coupled' + faces_coupled(:,:) = -1 ! stores elements indices nfaces_coupled = 0 @@ -1658,6 +1664,10 @@ subroutine moho_surface_repartitioning (nspec, nnodes, elmnts, & endif enddo + deallocate(is_moho,node_is_moho) + deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts) + deallocate(faces_coupled) + end subroutine moho_surface_repartitioning diff --git a/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90 b/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90 index 90c5bb961..aad5f807b 100644 --- a/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90 +++ b/src/decompose_mesh_SCOTCH/program_decompose_mesh_SCOTCH.f90 @@ -31,6 +31,9 @@ program pre_meshfem3D check_valence, & scotch_partitioning, & write_mesh_databases + +! daniel: ifort +! USE IFPORT,only: getarg implicit none integer :: i character(len=256) :: arg(3) diff --git a/src/generate_databases/Makefile.in b/src/generate_databases/Makefile.in index 3a5c6867f..c22375564 100644 --- a/src/generate_databases/Makefile.in +++ b/src/generate_databases/Makefile.in @@ -78,14 +78,15 @@ libgendatabases_a_OBJECTS = \ $O/detect_surface.o \ $O/exit_mpi.o \ $O/get_absorbing_boundary.o \ - $O/get_coupling_surfaces.o \ - $O/get_model.o \ - $O/get_MPI.o \ $O/get_attenuation_model.o \ $O/get_cmt.o \ + $O/get_coupling_surfaces.o \ $O/get_element_face.o \ $O/get_global.o \ $O/get_jacobian_boundaries.o \ + $O/get_model.o \ + $O/get_MPI.o \ + $O/get_perm_color.o \ $O/get_shape2D.o \ $O/get_shape3D.o \ $O/get_value_parameters.o \ @@ -98,6 +99,7 @@ libgendatabases_a_OBJECTS = \ $O/model_aniso.o \ $O/model_default.o \ $O/model_external_values.o \ + $O/model_ipati.o \ $O/model_gll.o \ $O/model_salton_trough.o \ $O/model_tomography.o \ @@ -199,9 +201,6 @@ $O/assemble_MPI_scalar.o: ${SHARED}/constants.h ${SHARED}/assemble_MPI_scalar.f $O/parallel.o: ${SHARED}/constants.h ${SHARED}/parallel.f90 ${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o ${SHARED}/parallel.f90 -### -### -### ### ### serial compilation without optimization @@ -230,6 +229,9 @@ $O/model_default.o: ${SHARED}/constants.h model_default.f90 $O/model_external_values.o: ${SHARED}/constants.h model_external_values.f90 ${FCCOMPILE_CHECK} -c -o $O/model_external_values.o model_external_values.f90 +$O/model_ipati.o: ${SHARED}/constants.h model_ipati.f90 + ${FCCOMPILE_CHECK} -c -o $O/model_ipati.o model_ipati.f90 + $O/model_gll.o: ${SHARED}/constants.h model_gll.f90 ${FCCOMPILE_CHECK} -c -o $O/model_gll.o model_gll.f90 @@ -351,6 +353,9 @@ $O/get_model.o: ${SHARED}/constants.h get_model.f90 $O/get_MPI.o: ${SHARED}/constants.h get_MPI.f90 ${FCCOMPILE_CHECK} -c -o $O/get_MPI.o get_MPI.f90 +$O/get_perm_color.o: ${SHARED}/constants.h get_perm_color.f90 + ${FCCOMPILE_CHECK} -c -o $O/get_perm_color.o get_perm_color.f90 + $O/create_name_database.o: ${SHARED}/constants.h ${SHARED}/create_name_database.f90 ${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${SHARED}/create_name_database.f90 diff --git a/src/generate_databases/create_mass_matrices.f90 b/src/generate_databases/create_mass_matrices.f90 index 668f9a9f2..32926ca70 100644 --- a/src/generate_databases/create_mass_matrices.f90 +++ b/src/generate_databases/create_mass_matrices.f90 @@ -160,7 +160,7 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, integer :: ier real(kind=CUSTOM_REAL) :: xloc,yloc,loc_elevation - + ! creates ocean load mass matrix if(OCEANS) then @@ -184,24 +184,24 @@ subroutine create_mass_matrices_ocean_load(nglob,nspec,ibool,OCEANS,TOPOGRAPHY, do igll=1,NGLLSQUARE ix_oceans = free_surface_ijk(1,igll,ispec2D) - iy_oceans = free_surface_ijk(1,igll,ispec2D) - iz_oceans = free_surface_ijk(1,igll,ispec2D) + iy_oceans = free_surface_ijk(2,igll,ispec2D) + iz_oceans = free_surface_ijk(3,igll,ispec2D) iglobnum=ibool(ix_oceans,iy_oceans,iz_oceans,ispec_oceans) ! compute local height of oceans if( TOPOGRAPHY ) then - + ! takes elevation from topography file xloc = xstore_dummy(iglobnum) yloc = ystore_dummy(iglobnum) - + call get_topo_bathy_elevation(xloc,yloc,loc_elevation, & itopo_bathy,NX_TOPO,NY_TOPO, & UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION) - + elevation = dble(loc_elevation) - + else ! takes elevation from z-coordinate of mesh point diff --git a/src/generate_databases/create_regions_mesh.f90 b/src/generate_databases/create_regions_mesh.f90 index 94516c734..695ce8ab3 100644 --- a/src/generate_databases/create_regions_mesh.f90 +++ b/src/generate_databases/create_regions_mesh.f90 @@ -33,7 +33,7 @@ module create_regions_mesh_ext_par real(kind=CUSTOM_REAL), dimension(:), allocatable :: ystore_dummy real(kind=CUSTOM_REAL), dimension(:), allocatable :: zstore_dummy integer :: nglob_dummy - + ! Gauss-Lobatto-Legendre points and weights of integration double precision, dimension(:), allocatable :: xigll,yigll,zigll,wxgll,wygll,wzgll @@ -148,7 +148,12 @@ module create_regions_mesh_ext_par logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION + ! mesh coloring + integer :: num_colors_outer_acoustic,num_colors_inner_acoustic + integer, dimension(:), allocatable :: num_elem_colors_acoustic + integer :: num_colors_outer_elastic,num_colors_inner_elastic + integer, dimension(:), allocatable :: num_elem_colors_elastic end module create_regions_mesh_ext_par ! @@ -160,7 +165,6 @@ end module create_regions_mesh_ext_par subroutine create_regions_mesh() ! create the different regions of the mesh - use generate_databases_par,only: & nspec => NSPEC_AB,nglob => NGLOB_AB, & ibool,xstore,ystore,zstore, & @@ -183,7 +187,7 @@ subroutine create_regions_mesh() ATTENUATION,USE_OLSEN_ATTENUATION, & UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & NX_TOPO,NY_TOPO,itopo_bathy, & - nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho + nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho use create_regions_mesh_ext_par implicit none @@ -197,26 +201,6 @@ subroutine create_regions_mesh() ! integer,dimension(:),allocatable :: itest_flag ! integer, dimension(:), allocatable :: elem_flag -! For Piero Basini : -! integer :: doubling_value_found_for_Piero -! double precision :: xmesh,ymesh,zmesh -! double precision :: rho,vp,vs - -! integer,dimension(nspec) :: idoubling -! integer :: doubling_value_found_for_Piero -! integer, parameter :: NUMBER_OF_STATIONS = 6 -! double precision, parameter :: RADIUS_TO_EXCLUDE = 250.d0 -! double precision, dimension(NUMBER_OF_STATIONS) :: utm_x_station,utm_y_station - -! logical :: is_around_a_station -! integer :: istation - -! ! store bedrock values -! integer :: icornerlat,icornerlong -! double precision :: lat,long,elevation_bedrock -! double precision :: lat_corner,long_corner,ratio_xi,ratio_eta -!real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: ibedrock - ! initializes arrays call sync_all() if( myrank == 0) then @@ -227,7 +211,6 @@ subroutine create_regions_mesh() nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, & nspec2D_bottom,nspec2D_top,ANISOTROPY) - ! fills location and weights for Gauss-Lobatto-Legendre points, shape and derivations, ! returns jacobianstore,xixstore,...gammazstore ! and GLL-point locations in xstore,ystore,zstore @@ -275,6 +258,18 @@ subroutine create_regions_mesh() nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, & nspec2D_bottom,nspec2D_top) +! sets up up Moho surface + NSPEC2D_MOHO = 0 + if( SAVE_MOHO_MESH ) then + call sync_all() + if( myrank == 0) then + write(IMAIN,*) ' ...setting up Moho surface' + endif + call crm_setup_moho(myrank,nspec, & + nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, & + nodes_coords_ext_mesh,nnodes_ext_mesh,ibool ) + endif + ! sets material velocities call sync_all() if( myrank == 0) then @@ -283,8 +278,7 @@ subroutine create_regions_mesh() call get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & materials_ext_mesh,nmat_ext_mesh, & undef_mat_prop,nundefMat_ext_mesh, & - ANISOTROPY,LOCAL_PATH) - + ANISOTROPY) ! sets up acoustic-elastic-poroelastic coupling surfaces call sync_all() @@ -297,17 +291,26 @@ subroutine create_regions_mesh() num_interfaces_ext_mesh,max_interface_size_ext_mesh, & my_neighbours_ext_mesh) -! sets up up Moho surface - NSPEC2D_MOHO = 0 - if( SAVE_MOHO_MESH ) then - call sync_all() - if( myrank == 0) then - write(IMAIN,*) ' ...setting up Moho surface' - endif - call crm_setup_moho(myrank,nspec, & - nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, & - nodes_coords_ext_mesh,nnodes_ext_mesh,ibool ) +! locates inner and outer elements + call sync_all() + if( myrank == 0) then + write(IMAIN,*) ' ...element inner/outer separation ' + endif + call crm_setup_inner_outer_elemnts(myrank,nspec, & + num_interfaces_ext_mesh,max_interface_size_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + ibool,SAVE_MESH_FILES) + +! colors mesh if requested + call sync_all() + if( myrank == 0) then + write(IMAIN,*) ' ...element mesh coloring ' endif + call crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES) + +! overwrites material parameters from external binary files + call sync_all() + call get_model_binaries(myrank,nspec,LOCAL_PATH) ! creates mass matrix call sync_all() @@ -325,16 +328,6 @@ subroutine create_regions_mesh() UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, & NX_TOPO,NY_TOPO,itopo_bathy) -! locates inner and outer elements - call sync_all() - if( myrank == 0) then - write(IMAIN,*) ' ...element inner/outer separation ' - endif - call crm_setup_inner_outer_elemnts(myrank,nspec, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - ibool,SAVE_MESH_FILES) - ! saves the binary mesh files call sync_all() if( myrank == 0) then @@ -342,44 +335,52 @@ subroutine create_regions_mesh() endif !call create_name_database(prname,myrank,LOCAL_PATH) call save_arrays_solver_ext_mesh(nspec,nglob_dummy, & - xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,& - gammaxstore,gammaystore,gammazstore, & - jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, & - rhostore,kappastore,mustore, & - rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, & - rho_vpI,rho_vpII,rho_vsI, & - rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, & - OCEANS,rmass_ocean_load,NGLOB_OCEAN, & +! xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,& +! gammaxstore,gammaystore,gammazstore, & +! jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, & +! rhostore,kappastore,mustore, & +! rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, & +! rho_vpI,rho_vpII,rho_vsI, & +! rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, & + OCEANS, & +! rmass_ocean_load,NGLOB_OCEAN, & ibool, & - xstore_dummy,ystore_dummy,zstore_dummy, & - abs_boundary_normal,abs_boundary_jacobian2Dw, & - abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, & - free_surface_normal,free_surface_jacobian2Dw, & - free_surface_ijk,free_surface_ispec, & - num_free_surface_faces, & - coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, & - coupling_ac_el_ijk,coupling_ac_el_ispec, & - num_coupling_ac_el_faces, & - coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, & - coupling_ac_po_ijk,coupling_ac_po_ispec, & - num_coupling_ac_po_faces, & - coupling_el_po_normal,coupling_el_po_jacobian2Dw, & - coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, & - coupling_po_el_ispec,num_coupling_el_po_faces, & +! xstore_dummy,ystore_dummy,zstore_dummy, & +! abs_boundary_normal,abs_boundary_jacobian2Dw, & +! abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, & +! free_surface_normal,free_surface_jacobian2Dw, & +! free_surface_ijk,free_surface_ispec, & +! num_free_surface_faces, & +! coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, & +! coupling_ac_el_ijk,coupling_ac_el_ispec, & +! num_coupling_ac_el_faces, & +! coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, & +! coupling_ac_po_ijk,coupling_ac_po_ispec, & +! num_coupling_ac_po_faces, & +! coupling_el_po_normal,coupling_el_po_jacobian2Dw, & +! coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, & +! coupling_po_el_ispec,num_coupling_el_po_faces, & num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, & max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, & - prname,SAVE_MESH_FILES, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store, & - ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, & - ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, & - nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, & - num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & - num_phase_ispec_elastic,phase_ispec_inner_elastic, & - num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic) +! prname, & + SAVE_MESH_FILES, & + ANISOTROPY & +! NSPEC_ANISO, & +! c11store,c12store,c13store,c14store,c15store,c16store, & +! c22store,c23store,c24store,c25store,c26store,c33store, & +! c34store,c35store,c36store,c44store,c45store,c46store, & +! c55store,c56store,c66store, & +! ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, & +! ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, & +! nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, & +! num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & +! num_phase_ispec_elastic,phase_ispec_inner_elastic, & +! num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, & +! num_colors_outer_acoustic,num_colors_inner_acoustic, & +! num_elem_colors_acoustic, & +! num_colors_outer_elastic,num_colors_inner_elastic, & +! num_elem_colors_elastic, & + ) ! saves moho surface if( SAVE_MOHO_MESH ) then @@ -394,12 +395,21 @@ subroutine create_regions_mesh() ! checks the mesh, stability and resolved period call sync_all() -!chris: check for poro: At the moment cpI & cpII are for eta=0 - call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,& + + if( POROELASTIC_SIMULATION ) then + !chris: check for poro: At the moment cpI & cpII are for eta=0 + call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,& xstore_dummy,ystore_dummy,zstore_dummy, & -1.0d0, model_speed_max,min_resolved_period, & phistore,tortstore,rhoarraystore,rho_vpI,rho_vpII,rho_vsI, & LOCAL_PATH,SAVE_MESH_FILES ) + else + call check_mesh_resolution(myrank,nspec,nglob_dummy, & + ibool,xstore_dummy,ystore_dummy,zstore_dummy, & + kappastore,mustore,rho_vp,rho_vs, & + -1.0d0,model_speed_max,min_resolved_period, & + LOCAL_PATH,SAVE_MESH_FILES) + endif ! saves binary mesh files for attenuation if( ATTENUATION ) then @@ -1172,6 +1182,8 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, & character(len=256) :: filename logical,dimension(:),allocatable :: iglob_is_inner + logical,parameter :: DEBUG = .false. + ! allocates arrays allocate(ispec_is_inner(nspec),stat=ier) if( ier /= 0 ) stop 'error allocating array ispec_is_inner' @@ -1205,7 +1217,7 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, & ! frees temporary array deallocate( iglob_is_inner ) - if( SAVE_MESH_FILES ) then + if( SAVE_MESH_FILES .and. DEBUG ) then filename = prname(1:len_trim(prname))//'ispec_is_inner' call write_VTK_data_elem_l(nspec,nglob_dummy, & xstore_dummy,ystore_dummy,zstore_dummy,ibool, & @@ -1354,3 +1366,678 @@ subroutine crm_setup_inner_outer_elemnts(myrank,nspec, & end subroutine crm_setup_inner_outer_elemnts +! +!------------------------------------------------------------------------------------------------- +! + + subroutine crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES) + +! sets up mesh coloring and permutes elements + + use create_regions_mesh_ext_par + implicit none + + integer :: myrank,nspec,nglob + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + logical :: ANISOTROPY,SAVE_MESH_FILES + + ! local parameters + integer, dimension(:), allocatable :: perm + integer :: ier + + ! user output + if(myrank == 0) then + write(IMAIN,*) ' use coloring = ',USE_MESH_COLORING_GPU + endif + + ! initializes + num_colors_outer_acoustic = 0 + num_colors_inner_acoustic = 0 + num_colors_outer_elastic = 0 + num_colors_inner_elastic = 0 + + ! mesh coloring + if( USE_MESH_COLORING_GPU ) then + + ! creates coloring of elements + allocate(perm(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating temporary perm array' + perm(:) = 0 + + ! acoustic domains + if( ACOUSTIC_SIMULATION ) then + if( myrank == 0) then + write(IMAIN,*) ' acoustic domains: ' + endif + call crm_setup_color(myrank,nspec,nglob,ibool,perm, & + ispec_is_acoustic,1, & + num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & + SAVE_MESH_FILES) + else + ! allocates dummy arrays + allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array' + endif + + ! elastic domains + if( ELASTIC_SIMULATION ) then + if( myrank == 0) then + write(IMAIN,*) ' elastic domains: ' + endif + call crm_setup_color(myrank,nspec,nglob,ibool,perm, & + ispec_is_elastic,2, & + num_phase_ispec_elastic,phase_ispec_inner_elastic, & + SAVE_MESH_FILES) + else + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + endif + + ! checks: after all domains are done + if(minval(perm) /= 1) & + call exit_MPI(myrank, 'minval(perm) should be 1') + if(maxval(perm) /= max(num_phase_ispec_acoustic,num_phase_ispec_elastic)) & + call exit_MPI(myrank, 'maxval(perm) should be max(num_phase_ispec_..)') + + ! sorts array according to permutation + call sync_all() + if(myrank == 0) then + write(IMAIN,*) ' mesh permutation:' + endif + call crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, & + SAVE_MESH_FILES) + + deallocate(perm) + + else + + ! allocates dummy arrays + allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array' + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + + endif ! USE_MESH_COLORING_GPU + + end subroutine crm_setup_color_perm + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine crm_setup_color(myrank,nspec,nglob,ibool,perm, & + ispec_is_d,idomain, & + num_phase_ispec_d,phase_ispec_inner_d, & + SAVE_MESH_FILES) + +! sets up mesh coloring + + use create_regions_mesh_ext_par + implicit none + + integer :: myrank,nspec,nglob + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + integer, dimension(nspec) :: perm + + ! wrapper array for ispec is in domain: + ! idomain acoustic == 1 or elastic == 2 + integer :: idomain + logical, dimension(nspec) :: ispec_is_d + integer :: num_phase_ispec_d + integer, dimension(num_phase_ispec_d,2) :: phase_ispec_inner_d + + logical :: SAVE_MESH_FILES + + ! local parameters + ! added for color permutation + integer :: nb_colors_outer_elements,nb_colors_inner_elements + integer, dimension(:), allocatable :: num_of_elems_in_this_color + integer, dimension(:), allocatable :: color + integer, dimension(:), allocatable :: first_elem_number_in_this_color + logical, dimension(:), allocatable :: is_on_a_slice_edge + + integer :: nspec_outer,nspec_inner,nspec_domain + integer :: nspec_outer_min_global,nspec_outer_max_global + integer :: nb_colors,nb_colors_min,nb_colors_max + + integer :: icolor,ispec,ispec_counter + integer :: ispec_inner,ispec_outer + integer :: ier + + character(len=2),dimension(2) :: str_domain = (/ "ac", "el" /) + character(len=256) :: filename + + logical, parameter :: DEBUG = .false. + + !!!! David Michea: detection of the edges, coloring and permutation separately + + ! implement mesh coloring for GPUs if needed, to create subsets of disconnected elements + ! to remove dependencies and the need for atomic operations in the sum of + ! elemental contributions in the solver + + ! allocates temporary array with colors + allocate(color(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating temporary color array' + allocate(first_elem_number_in_this_color(MAX_NUMBER_OF_COLORS + 1),stat=ier) + if( ier /= 0 ) stop 'error allocating first_elem_number_in_this_color array' + + ! flags for elements on outer rims + ! opposite to what is stored in ispec_is_inner + allocate(is_on_a_slice_edge(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating is_on_a_slice_edge array' + do ispec = 1,nspec + is_on_a_slice_edge(ispec) = .not. ispec_is_inner(ispec) + enddo + + ! fast element coloring scheme + call get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, & + ibool,perm,color, & + nspec,nglob, & + nb_colors_outer_elements,nb_colors_inner_elements, & + nspec_outer,nspec_inner,nspec_domain, & + first_elem_number_in_this_color, & + myrank) + + ! for the last color, the next color is fictitious and its first (fictitious) element number is nspec + 1 + first_elem_number_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements + 1) & + = nspec_domain + 1 + + allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier) + if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array' + + num_of_elems_in_this_color(:) = 0 + do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements + num_of_elems_in_this_color(icolor) = first_elem_number_in_this_color(icolor+1) - first_elem_number_in_this_color(icolor) + enddo + + ! check that the sum of all the numbers of elements found in each color is equal + ! to the total number of elements in the mesh + if(sum(num_of_elems_in_this_color) /= nspec_domain) then + print *,'error number of elements in this color:',idomain + print *,'rank: ',myrank,' nspec = ',nspec_domain + print *,' total number of elements in all the colors of the mesh = ', & + sum(num_of_elems_in_this_color) + call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh') + endif + + ! check that the sum of all the numbers of elements found in each color for the outer elements is equal + ! to the total number of outer elements found in the mesh + if(sum(num_of_elems_in_this_color(1:nb_colors_outer_elements)) /= nspec_outer) then + print *,'error number of outer elements in this color:',idomain + print *,'rank: ',myrank,' nspec_outer = ',nspec_outer + print*,'nb_colors_outer_elements = ',nb_colors_outer_elements + print *,'total number of elements in all the colors of the mesh for outer elements = ', & + sum(num_of_elems_in_this_color(1:nb_colors_outer_elements)) + call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh for outer elements') + endif + + ! debug: file output + if( SAVE_MESH_FILES .and. DEBUG ) then + filename = prname(1:len_trim(prname))//'color_'//str_domain(idomain) + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + color,filename) + endif + + deallocate(first_elem_number_in_this_color) + deallocate(is_on_a_slice_edge) + deallocate(color) + + ! debug: no mesh coloring, only creates dummy coloring arrays + if( DEBUG ) then + nb_colors_outer_elements = 0 + nb_colors_inner_elements = 0 + ispec_counter = 0 + + ! first generate all the outer elements + do ispec = 1,nspec + if( ispec_is_d(ispec) ) then + if( ispec_is_inner(ispec) .eqv. .false. ) then + ispec_counter = ispec_counter + 1 + perm(ispec) = ispec_counter + endif + endif + enddo + + ! store total number of outer elements + nspec_outer = ispec_counter + + ! only single color + if(nspec_outer > 0 ) nb_colors_outer_elements = 1 + + ! then generate all the inner elements + do ispec = 1,nspec + if( ispec_is_d(ispec) ) then + if( ispec_is_inner(ispec) .eqv. .true. ) then + ispec_counter = ispec_counter + 1 + perm(ispec) = ispec_counter - nspec_outer ! starts again at 1 + endif + endif + enddo + nspec_inner = ispec_counter - nspec_outer + + ! only single color + if(nspec_inner > 0 ) nb_colors_inner_elements = 1 + + allocate(num_of_elems_in_this_color(nb_colors_outer_elements + nb_colors_inner_elements),stat=ier) + if( ier /= 0 ) stop 'error allocating num_of_elems_in_this_color array' + + if( nspec_outer > 0 ) num_of_elems_in_this_color(1) = nspec_outer + if( nspec_inner > 0 ) num_of_elems_in_this_color(2) = nspec_inner + endif ! debug + + ! debug: saves mesh coloring numbers into files + if( DEBUG ) then + ! debug file output + filename = prname(1:len_trim(prname))//'num_of_elems_in_this_color_'//str_domain(idomain)//'.dat' + open(unit=99,file=trim(filename),status='unknown',iostat=ier) + if( ier /= 0 ) stop 'error opening num_of_elems_in_this_color file' + ! number of colors for outer elements + write(99,*) nb_colors_outer_elements + ! number of colors for inner elements + write(99,*) nb_colors_inner_elements + ! number of elements in each color + ! outer elements + do icolor = 1, nb_colors_outer_elements + nb_colors_inner_elements + write(99,*) num_of_elems_in_this_color(icolor) + enddo + close(99) + endif + + ! sets up domain coloring arrays + select case(idomain) + case( 1 ) + ! acoustic domains + num_colors_outer_acoustic = nb_colors_outer_elements + num_colors_inner_acoustic = nb_colors_inner_elements + + allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array' + + num_elem_colors_acoustic(:) = num_of_elems_in_this_color(:) + + case( 2 ) + ! elastic domains + num_colors_outer_elastic = nb_colors_outer_elements + num_colors_inner_elastic = nb_colors_inner_elements + + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + + num_elem_colors_elastic(:) = num_of_elems_in_this_color(:) + + case default + stop 'error idomain not recognized' + end select + + ! sets up elements for loops in simulations + ispec_inner = 0 + ispec_outer = 0 + do ispec = 1, nspec + ! only elements in this domain + if( ispec_is_d(ispec) ) then + + ! sets phase_ispec arrays with ordering of elements + if( ispec_is_inner(ispec) .eqv. .false. ) then + ! outer elements + ispec_outer = perm(ispec) + + ! checks + if( ispec_outer < 1 .or. ispec_outer > num_phase_ispec_d ) then + print*,'error outer permutation:',idomain + print*,'rank:',myrank,' ispec_inner = ',ispec_outer + print*,'num_phase_ispec_d = ',num_phase_ispec_d + call exit_MPI(myrank,'error outer acoustic permutation') + endif + + phase_ispec_inner_d(ispec_outer,1) = ispec + + else + ! inner elements + ispec_inner = perm(ispec) + + ! checks + if( ispec_inner < 1 .or. ispec_inner > num_phase_ispec_d ) then + print*,'error inner permutation:',idomain + print*,'rank:',myrank,' ispec_inner = ',ispec_inner + print*,'num_phase_ispec_d = ',num_phase_ispec_d + call exit_MPI(myrank,'error inner acoustic permutation') + endif + + phase_ispec_inner_d(ispec_inner,2) = ispec + + endif + endif + enddo + + ! total number of colors + nb_colors = nb_colors_inner_elements + nb_colors_outer_elements + call min_all_i(nb_colors,nb_colors_min) + call max_all_i(nb_colors,nb_colors_max) + + ! user output + call min_all_i(nspec_outer,nspec_outer_min_global) + call max_all_i(nspec_outer,nspec_outer_max_global) + call min_all_i(nspec_outer,nspec_outer_min_global) + call max_all_i(nspec_outer,nspec_outer_max_global) + if(myrank == 0) then + write(IMAIN,*) ' colors min = ',nb_colors_min + write(IMAIN,*) ' colors max = ',nb_colors_max + write(IMAIN,*) ' outer elements: min = ',nspec_outer_min_global + write(IMAIN,*) ' outer elements: max = ',nspec_outer_max_global + endif + + ! debug: outputs permutation array as vtk file + if( DEBUG ) then + filename = prname(1:len_trim(prname))//'perm_'//str_domain(idomain) + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + perm,filename) + endif + + deallocate(num_of_elems_in_this_color) + + end subroutine crm_setup_color + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine crm_setup_permutation(myrank,nspec,nglob,ibool,ANISOTROPY,perm, & + SAVE_MESH_FILES) + + use create_regions_mesh_ext_par + implicit none + + integer :: myrank,nspec,nglob + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + logical :: ANISOTROPY + + integer, dimension(nspec),intent(inout) :: perm + + logical :: SAVE_MESH_FILES + + ! local parameters + ! added for sorting + integer, dimension(:,:,:,:), allocatable :: temp_array_int + real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: temp_array_real + logical, dimension(:), allocatable :: temp_array_logical_1D + + integer, dimension(:), allocatable :: temp_perm_global + logical, dimension(:), allocatable :: mask_global + + integer :: icolor,icounter,ispec,ielem,ier,i + integer :: iface,old_ispec,new_ispec + + character(len=256) :: filename + + logical,parameter :: DEBUG = .false. + + ! sorts array according to permutation + allocate(temp_perm_global(nspec),stat=ier) + if( ier /= 0 ) stop 'error temp_perm_global array' + + ! global ordering + temp_perm_global(:) = 0 + icounter = 0 + + ! fills global permutation array + ! starts with elastic elements + if( ELASTIC_SIMULATION ) then + ! first outer elements coloring + ! phase element counter + ielem = 0 + do icolor = 1,num_colors_outer_elastic + ! loops through elements + do i = 1,num_elem_colors_elastic(icolor) + ielem = ielem + 1 + ispec = phase_ispec_inner_elastic(ielem,1) ! 1 <-- first phase, outer elements + ! reorders elements + icounter = icounter + 1 + temp_perm_global(ispec) = icounter + ! resets to new order + phase_ispec_inner_elastic(ielem,1) = icounter + enddo + enddo + ! inner elements coloring + ielem = 0 + do icolor = num_colors_outer_elastic+1,num_colors_outer_elastic+num_colors_inner_elastic + ! loops through elements + do i = 1,num_elem_colors_elastic(icolor) + ielem = ielem + 1 + ispec = phase_ispec_inner_elastic(ielem,2) ! 2 <-- second phase, inner elements + ! reorders elements + icounter = icounter + 1 + temp_perm_global(ispec) = icounter + ! resets to new order + phase_ispec_inner_elastic(ielem,2) = icounter + enddo + enddo + endif + + ! continues with acoustic elements + if( ACOUSTIC_SIMULATION ) then + ! first outer elements coloring + ! phase element counter + ielem = 0 + do icolor = 1,num_colors_outer_acoustic + ! loops through elements + do i = 1,num_elem_colors_acoustic(icolor) + ielem = ielem + 1 + ispec = phase_ispec_inner_acoustic(ielem,1) ! 1 <-- first phase, outer elements + ! reorders elements + icounter = icounter + 1 + temp_perm_global(ispec) = icounter + ! resets to new order + phase_ispec_inner_acoustic(ielem,1) = icounter + enddo + enddo + ! inner elements coloring + ielem = 0 + do icolor = num_colors_outer_acoustic+1,num_colors_outer_acoustic+num_colors_inner_acoustic + ! loops through elements + do i = 1,num_elem_colors_acoustic(icolor) + ielem = ielem + 1 + ispec = phase_ispec_inner_acoustic(ielem,2) ! 2 <-- second phase, inner elements + ! reorders elements + icounter = icounter + 1 + temp_perm_global(ispec) = icounter + ! resets to new order + phase_ispec_inner_acoustic(ielem,2) = icounter + enddo + enddo + endif + + ! checks + if( icounter /= nspec ) then + print*,'error temp perm: ',icounter,nspec + stop 'error temporary global permutation incomplete' + endif + + ! checks perm entries + if(minval(temp_perm_global) /= 1) call exit_MPI(myrank, 'minval(temp_perm_global) should be 1') + if(maxval(temp_perm_global) /= nspec) call exit_MPI(myrank, 'maxval(temp_perm_global) should be nspec') + + ! checks if every element was uniquely set + allocate(mask_global(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating temporary mask_global' + mask_global(:) = .false. + icounter = 0 ! counts permutations + do ispec = 1, nspec + new_ispec = temp_perm_global(ispec) + ! checks bounds + if( new_ispec < 1 .or. new_ispec > nspec ) call exit_MPI(myrank,'error temp_perm_global ispec bounds') + ! checks if already set + if( mask_global(new_ispec) ) then + print*,'error temp_perm_global:',ispec,new_ispec,'element already set' + call exit_MPI(myrank,'error global permutation') + else + mask_global(new_ispec) = .true. + endif + ! counts permutations + if( new_ispec /= ispec ) icounter = icounter + 1 + enddo + + ! checks number of set elements + if( count(mask_global(:)) /= nspec ) then + print*,'error temp_perm_global:',count(mask_global(:)),nspec,'permutation incomplete' + call exit_MPI(myrank,'error global permutation incomplete') + endif + deallocate(mask_global) + + ! user output + if(myrank == 0) then + write(IMAIN,*) ' number of permutations = ',icounter + endif + + ! outputs permutation array as vtk file + if( SAVE_MESH_FILES .and. DEBUG ) then + filename = prname(1:len_trim(prname))//'perm_global' + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + temp_perm_global,filename) + endif + + ! store as new permutation + perm(:) = temp_perm_global(:) + deallocate(temp_perm_global) + + ! permutes all required mesh arrays according to new ordering + + ! permutation of ibool + allocate(temp_array_int(NGLLX,NGLLY,NGLLZ,nspec)) + call permute_elements_integer(ibool,temp_array_int,perm,nspec) + deallocate(temp_array_int) + + ! element domain flags + allocate(temp_array_logical_1D(nspec)) + call permute_elements_logical1D(ispec_is_acoustic,temp_array_logical_1D,perm,nspec) + call permute_elements_logical1D(ispec_is_elastic,temp_array_logical_1D,perm,nspec) + call permute_elements_logical1D(ispec_is_poroelastic,temp_array_logical_1D,perm,nspec) + call permute_elements_logical1D(ispec_is_inner,temp_array_logical_1D,perm,nspec) + deallocate(temp_array_logical_1D) + + ! mesh arrays + allocate(temp_array_real(NGLLX,NGLLY,NGLLZ,nspec)) + call permute_elements_real(xixstore,temp_array_real,perm,nspec) + call permute_elements_real(xiystore,temp_array_real,perm,nspec) + call permute_elements_real(xizstore,temp_array_real,perm,nspec) + call permute_elements_real(etaxstore,temp_array_real,perm,nspec) + call permute_elements_real(etaystore,temp_array_real,perm,nspec) + call permute_elements_real(etazstore,temp_array_real,perm,nspec) + call permute_elements_real(gammaxstore,temp_array_real,perm,nspec) + call permute_elements_real(gammaystore,temp_array_real,perm,nspec) + call permute_elements_real(gammazstore,temp_array_real,perm,nspec) + call permute_elements_real(jacobianstore,temp_array_real,perm,nspec) + + ! material parameters + call permute_elements_real(kappastore,temp_array_real,perm,nspec) + call permute_elements_real(mustore,temp_array_real,perm,nspec) + + ! acoustic arrays + if( ACOUSTIC_SIMULATION ) then + call permute_elements_real(rhostore,temp_array_real,perm,nspec) + endif + + ! elastic arrays + if( ELASTIC_SIMULATION ) then + call permute_elements_real(rho_vp,temp_array_real,perm,nspec) + call permute_elements_real(rho_vs,temp_array_real,perm,nspec) + if( ANISOTROPY ) then + call permute_elements_real(c11store,temp_array_real,perm,nspec) + call permute_elements_real(c12store,temp_array_real,perm,nspec) + call permute_elements_real(c13store,temp_array_real,perm,nspec) + call permute_elements_real(c14store,temp_array_real,perm,nspec) + call permute_elements_real(c15store,temp_array_real,perm,nspec) + call permute_elements_real(c16store,temp_array_real,perm,nspec) + call permute_elements_real(c22store,temp_array_real,perm,nspec) + call permute_elements_real(c23store,temp_array_real,perm,nspec) + call permute_elements_real(c24store,temp_array_real,perm,nspec) + call permute_elements_real(c25store,temp_array_real,perm,nspec) + call permute_elements_real(c33store,temp_array_real,perm,nspec) + call permute_elements_real(c34store,temp_array_real,perm,nspec) + call permute_elements_real(c35store,temp_array_real,perm,nspec) + call permute_elements_real(c36store,temp_array_real,perm,nspec) + call permute_elements_real(c44store,temp_array_real,perm,nspec) + call permute_elements_real(c45store,temp_array_real,perm,nspec) + call permute_elements_real(c46store,temp_array_real,perm,nspec) + call permute_elements_real(c55store,temp_array_real,perm,nspec) + call permute_elements_real(c56store,temp_array_real,perm,nspec) + call permute_elements_real(c66store,temp_array_real,perm,nspec) + endif + endif + deallocate(temp_array_real) + + ! poroelastic arrays + if( POROELASTIC_SIMULATION ) then + stop 'mesh permutation for poroelastic simulations not supported yet' + endif + + ! boundary surface + if( num_abs_boundary_faces > 0 ) then + do iface = 1,num_abs_boundary_faces + old_ispec = abs_boundary_ispec(iface) + new_ispec = perm(old_ispec) + abs_boundary_ispec(iface) = new_ispec + enddo + endif + + ! free surface + if( num_free_surface_faces > 0 ) then + do iface = 1,num_free_surface_faces + old_ispec = free_surface_ispec(iface) + new_ispec = perm(old_ispec) + free_surface_ispec(iface) = new_ispec + enddo + endif + + ! coupling surface + if( num_coupling_ac_el_faces > 0 ) then + do iface = 1,num_coupling_ac_el_faces + old_ispec = coupling_ac_el_ispec(iface) + new_ispec = perm(old_ispec) + coupling_ac_el_ispec(iface) = new_ispec + enddo + endif + if( num_coupling_ac_po_faces > 0 ) then + do iface = 1,num_coupling_ac_po_faces + old_ispec = coupling_ac_po_ispec(iface) + new_ispec = perm(old_ispec) + coupling_ac_po_ispec(iface) = new_ispec + enddo + endif + if( num_coupling_el_po_faces > 0 ) then + do iface = 1,num_coupling_el_po_faces + ! elastic-poroelastic + old_ispec = coupling_el_po_ispec(iface) + new_ispec = perm(old_ispec) + coupling_el_po_ispec(iface) = new_ispec + ! poroelastic-elastic + old_ispec = coupling_po_el_ispec(iface) + new_ispec = perm(old_ispec) + coupling_po_el_ispec(iface) = new_ispec + enddo + endif + + ! moho surface + if( NSPEC2D_MOHO > 0 ) then + allocate(temp_array_logical_1D(nspec)) + call permute_elements_logical1D(is_moho_top,temp_array_logical_1D,perm,nspec) + call permute_elements_logical1D(is_moho_bot,temp_array_logical_1D,perm,nspec) + deallocate(temp_array_logical_1D) + do iface = 1,NSPEC2D_MOHO + ! top + old_ispec = ibelm_moho_top(iface) + new_ispec = perm(old_ispec) + ibelm_moho_top(iface) = new_ispec + ! bottom + old_ispec = ibelm_moho_bot(iface) + new_ispec = perm(old_ispec) + ibelm_moho_bot(iface) = new_ispec + enddo + endif + + end subroutine crm_setup_permutation diff --git a/src/generate_databases/generate_databases.f90 b/src/generate_databases/generate_databases.f90 index 28c888033..86c9306f5 100644 --- a/src/generate_databases/generate_databases.f90 +++ b/src/generate_databases/generate_databases.f90 @@ -295,7 +295,7 @@ module generate_databases_par ! flag for noise simulation integer :: NOISE_TOMOGRAPHY integer :: IMODEL - + end module generate_databases_par ! @@ -418,7 +418,7 @@ subroutine gd_read_parameters write(IMAIN,*) 'Shape functions defined by NGNOD = ',NGNOD,' control nodes' write(IMAIN,*) 'Surface shape functions defined by NGNOD2D = ',NGNOD2D,' control nodes' write(IMAIN,*) - + write(IMAIN,'(a)',advance='no') ' velocity model: ' select case(IMODEL) case( IMODEL_DEFAULT ) @@ -437,8 +437,10 @@ subroutine gd_read_parameters write(IMAIN,'(a)',advance='yes') ' tomo' case( IMODEL_USER_EXTERNAL ) write(IMAIN,'(a)',advance='yes') ' external' + case( IMODEL_IPATI ) + write(IMAIN,'(a)',advance='yes') ' ipati' end select - + write(IMAIN,*) endif @@ -588,8 +590,8 @@ subroutine gd_read_partition_files open(unit=IIN,file=prname(1:len_trim(prname))//'Database', & status='old',action='read',form='unformatted',iostat=ier) if( ier /= 0 ) then - write(IMAIN,*) 'error opening file: ',prname(1:len_trim(prname))//'Database' - write(IMAIN,*) 'make sure file exists' + print*,'rank ',myrank,' error opening file: ',prname(1:len_trim(prname))//'Database' + print*,'please make sure file exists' call exit_mpi(myrank,'error opening database file') endif !read(IIN,*) nnodes_ext_mesh @@ -930,7 +932,8 @@ subroutine gd_setup_mesh write(IMAIN,*) 'create regions: ' endif call create_regions_mesh() - + +! now done inside create_regions_mesh_ext routine... ! now done inside create_regions_mesh_ext routine... ! Moho boundary parameters, 2-D jacobians and normals ! if( SAVE_MOHO_MESH ) then diff --git a/src/generate_databases/get_coupling_surfaces.f90 b/src/generate_databases/get_coupling_surfaces.f90 index 8cdae42e7..f4ce27ad7 100644 --- a/src/generate_databases/get_coupling_surfaces.f90 +++ b/src/generate_databases/get_coupling_surfaces.f90 @@ -32,6 +32,7 @@ subroutine get_coupling_surfaces(myrank, & my_neighbours_ext_mesh) ! determines coupling surface for acoustic-elastic domains +! based on ispec_is_acoustic, ispec_is_elastic and ispec_is_poroelastic arrays use create_regions_mesh_ext_par implicit none @@ -49,31 +50,106 @@ subroutine get_coupling_surfaces(myrank, & ibool_interfaces_ext_mesh integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + ! local parameters + integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag + integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy + integer :: max_nibool_interfaces_ext_mesh + integer :: count_elastic,count_acoustic,count_poroelastic + integer :: ispec,i,j,k,iglob,ier,inum + ! initializes number of coupling faces num_coupling_ac_el_faces = 0 num_coupling_ac_po_faces = 0 num_coupling_el_po_faces = 0 - ! acoustic - elastic domain coupling - call get_coupling_surfaces_ac_el(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & + ! sets flags for acoustic / elastic / poroelastic on global points + allocate(acoustic_flag(nglob_dummy),stat=ier) + if( ier /= 0 ) stop 'error allocating array acoustic_flag' + allocate(elastic_flag(nglob_dummy),stat=ier) + if( ier /= 0 ) stop 'error allocating array elastic_flag' + allocate(poroelastic_flag(nglob_dummy),stat=ier) + if( ier /= 0 ) stop 'error allocating array poroelastic_flag' + + acoustic_flag(:) = 0 + elastic_flag(:) = 0 + poroelastic_flag(:) = 0 + + count_acoustic = 0 + count_elastic = 0 + count_poroelastic = 0 + + do ispec = 1, nspec + ! counts elements + if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1 + if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1 + if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1 + + ! sets flags on global points + do k = 1, NGLLZ + do j = 1, NGLLY + do i = 1, NGLLX + ! global index + iglob = ibool(i,j,k,ispec) + ! sets acoustic flag + if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) = myrank+1 + ! sets elastic flag + if( ispec_is_elastic(ispec) ) elastic_flag(iglob) = myrank+1 + ! sets poroelastic flag + if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) = myrank+1 + enddo + enddo + enddo + enddo + call sum_all_i(count_acoustic,inum) + if( myrank == 0 ) then + write(IMAIN,*) ' total acoustic elements :',inum + endif + call sum_all_i(count_elastic,inum) + if( myrank == 0 ) then + write(IMAIN,*) ' total elastic elements :',inum + endif + call sum_all_i(count_poroelastic,inum) + if( myrank == 0 ) then + write(IMAIN,*) ' total poroelastic elements:',inum + endif + + ! collects contributions from different MPI partitions + ! sets up MPI communications + max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) ) + allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier) + if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy' + do i = 1, num_interfaces_ext_mesh + ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i) + enddo + ! sums acoustic flags + call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& + my_neighbours_ext_mesh) + ! sums elastic flags + call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& my_neighbours_ext_mesh) + ! sums poroelastic flags + call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& + my_neighbours_ext_mesh) + + + ! determines common faces between different domains + ! acoustic - elastic domain coupling + call get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag) ! acoustic - poroelastic domain coupling - call get_coupling_surfaces_ac_poro(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - my_neighbours_ext_mesh) + call get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag) ! elastic - poroelastic domain coupling - call get_coupling_surfaces_el_poro(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - my_neighbours_ext_mesh) + call get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag) + + ! frees temporary arrays + deallocate(acoustic_flag,elastic_flag,poroelastic_flag) end subroutine get_coupling_surfaces @@ -81,11 +157,7 @@ end subroutine get_coupling_surfaces !------------------------------------------------------------------------------------------------- ! - subroutine get_coupling_surfaces_ac_el(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - my_neighbours_ext_mesh) + subroutine get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag) ! determines coupling surface for acoustic-elastic domains @@ -93,17 +165,12 @@ subroutine get_coupling_surfaces_ac_el(myrank, & implicit none ! number of spectral elements in each block - integer :: myrank,nspec,NPROC + integer :: myrank,nspec ! arrays with the mesh integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool -! MPI communication - integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh - integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: & - ibool_interfaces_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + integer,dimension(nglob_dummy) :: elastic_flag ! local parameters ! (assumes NGLLX=NGLLY=NGLLZ) @@ -116,16 +183,9 @@ subroutine get_coupling_surfaces_ac_el(myrank, & integer,dimension(:,:,:),allocatable :: tmp_ijk integer,dimension(:),allocatable :: tmp_ispec - integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners - integer :: ispec,i,j,k,igll,ier,iglob - integer :: inum,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor - integer :: count_elastic,count_acoustic - - ! mpi interface communication - integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,test_flag - integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy - integer :: max_nibool_interfaces_ext_mesh - logical, dimension(:), allocatable :: mask_ibool + integer,dimension(NGNOD2D) :: iglob_corners_ref + integer :: ispec,i,j,igll,ier + integer :: inum,iface_ref ! corners indices of reference cube faces integer,dimension(3,4),parameter :: iface1_corner_ijk = & @@ -153,7 +213,7 @@ subroutine get_coupling_surfaces_ac_el(myrank, & !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data !character(len=256):: prname_file -! allocates temporary arrays + ! allocates temporary arrays allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier) if( ier /= 0 ) stop 'error allocating array tmp_normal' allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier) @@ -167,184 +227,80 @@ subroutine get_coupling_surfaces_ac_el(myrank, & tmp_normal(:,:,:) = 0.0 tmp_jacobian2Dw(:,:) = 0.0 - ! sets flags for acoustic / elastic on global points - allocate(elastic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array elastic_flag' - allocate(acoustic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array acoustic_flag' - allocate(test_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array test_flag' - allocate(mask_ibool(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array mask_ibool' - elastic_flag(:) = 0 - acoustic_flag(:) = 0 - test_flag(:) = 0 - count_elastic = 0 - count_acoustic = 0 - do ispec = 1, nspec - ! counts elements - if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1 - if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1 - - ! sets flags on global points - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - ! global index - iglob = ibool(i,j,k,ispec) - ! sets elastic flag - if( ispec_is_elastic(ispec) ) elastic_flag(iglob) = myrank+1 - ! sets acoustic flag - if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) = myrank+1 - ! sets test flag - test_flag(iglob) = myrank+1 - enddo - enddo - enddo - enddo - call sum_all_i(count_acoustic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total acoustic elements:',inum - endif - call sum_all_i(count_elastic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total elastic elements :',inum - endif - - ! collects contributions from different MPI partitions - ! sets up MPI communications - max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) ) - allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier) - if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy' - do i = 1, num_interfaces_ext_mesh - ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i) - enddo - ! sums elastic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! sums acoustic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - - ! sums test flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! loops over all element faces and ! counts number of coupling faces between acoustic and elastic elements - mask_ibool(:) = .false. inum = 0 + + ! coupling surfaces: takes point of view from acoustic elements, i.e. if element is acoustic + ! and has an elastic neighbor, then this acoustic element is added to + ! the coupling elements and its corresponding coupling surface + ! ( no matter in which MPI partition it is) + ! note: we use acoustic elements as reference elements because we will need + ! density from acoustic element when coupling pressure in case of gravity do ispec=1,nspec - ! loops over each face - do iface_ref= 1, 6 - - ! takes indices of corners of reference face - do icorner = 1,NGNOD2D - i = iface_all_corner_ijk(1,icorner,iface_ref) - j = iface_all_corner_ijk(2,icorner,iface_ref) - k = iface_all_corner_ijk(3,icorner,iface_ref) - ! global reference indices - iglob_corners_ref(icorner) = ibool(i,j,k,ispec) - - ! reference corner coordinates - xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) - ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) - zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) - enddo + if( ispec_is_acoustic(ispec) ) then + + ! loops over each face + do iface_ref= 1, 6 + + ! takes indices of corners of reference face + call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, & + ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, & + iface_all_corner_ijk) - ! checks if face has acoustic side - if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(4) ) >= 1) then ! checks if face is has an elastic side if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. & elastic_flag( iglob_corners_ref(2) ) >= 1 .and. & elastic_flag( iglob_corners_ref(3) ) >= 1 .and. & elastic_flag( iglob_corners_ref(4) ) >= 1) then - ! reference midpoint on face (used to avoid redundant face counting) - i = iface_all_midpointijk(1,iface_ref) - j = iface_all_midpointijk(2,iface_ref) - k = iface_all_midpointijk(3,iface_ref) - iglob_midpoint = ibool(i,j,k,ispec) - - ! checks if points on this face are masked already - if( .not. mask_ibool(iglob_midpoint) .and. & - ( acoustic_flag(iglob_midpoint) >= 1 .and. elastic_flag(iglob_midpoint) >= 1) ) then - - ! gets face GLL points i,j,k indices from element face - call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) - - ! takes each element face only once, if it lies on an MPI interface - ! note: this is not exactly load balanced - ! lowest rank process collects as many faces as possible, second lowest as so forth - if( (test_flag(iglob_midpoint) == myrank+1) .or. & - (test_flag(iglob_midpoint) > 2*(myrank+1)) ) then - - ! gets face GLL 2Djacobian, weighted from element face - call get_jacobian_boundary_face(myrank,nspec, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & - dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) - - ! normal convention: points away from acoustic, reference element - ! switch normal direction if necessary - do j=1,NGLLY - do i=1,NGLLX - ! directs normals such that they point outwards of element - call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & - ibool,nspec,nglob_dummy, & - xstore_dummy,ystore_dummy,zstore_dummy, & - normal_face(:,i,j) ) - ! makes sure that it always points away from acoustic element, - ! otherwise switch direction - if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j) - enddo - enddo - - ! stores informations about this face - inum = inum + 1 - tmp_ispec(inum) = ispec - igll = 0 - do j=1,NGLLY - do i=1,NGLLX - ! adds all gll points on this face - igll = igll + 1 - - ! do we need to store local i,j,k,ispec info? or only global indices iglob? - tmp_ijk(:,igll,inum) = ijk_face(:,i,j) - - ! stores weighted jacobian and normals - tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) - tmp_normal(:,igll,inum) = normal_face(:,i,j) - - ! masks global points ( to avoid redundant counting of faces) - iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec) - mask_ibool(iglob) = .true. - enddo - enddo - else - ! assumes to be already collected by lower rank process, masks face points - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec) - mask_ibool(iglob) = .true. - enddo - enddo - endif ! test_flag - endif ! mask_ibool + ! gets face GLL points i,j,k indices from element face + call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) + + ! gets face GLL 2Djacobian, weighted from element face + call get_jacobian_boundary_face(myrank,nspec, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & + dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) + + ! normal convention: points away from acoustic, reference element + ! switch normal direction if necessary + do j=1,NGLLY + do i=1,NGLLX + ! directs normals such that they point outwards of element + call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & + ibool,nspec,nglob_dummy, & + xstore_dummy,ystore_dummy,zstore_dummy, & + normal_face(:,i,j) ) + ! makes sure that it always points away from acoustic element, + ! otherwise switch direction + ! note: this should not happen, since we only loop over acoustic elements + if( ispec_is_elastic(ispec) ) stop 'error acoustic-elastic coupling surface' + enddo + enddo + + ! stores informations about this face + inum = inum + 1 + tmp_ispec(inum) = ispec + igll = 0 + do j=1,NGLLY + do i=1,NGLLX + ! adds all gll points on this face + igll = igll + 1 + + ! do we need to store local i,j,k,ispec info? or only global indices iglob? + tmp_ijk(:,igll,inum) = ijk_face(:,i,j) + + ! stores weighted jacobian and normals + tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) + tmp_normal(:,igll,inum) = normal_face(:,i,j) + enddo + enddo endif ! elastic_flag - endif ! acoustic_flag - enddo ! iface_ref + enddo ! iface_ref + endif ! ispec_is_acoustic enddo ! ispec ! stores completed coupling face informations @@ -370,8 +326,7 @@ subroutine get_coupling_surfaces_ac_el(myrank, & ! user output call sum_all_i(num_coupling_ac_el_faces,inum) if( myrank == 0 ) then - write(IMAIN,*) ' acoustic-elastic coupling:' - write(IMAIN,*) ' total number of faces = ',inum + write(IMAIN,*) ' acoustic-elastic coupling : total number of faces = ',inum endif end subroutine get_coupling_surfaces_ac_el @@ -381,11 +336,7 @@ end subroutine get_coupling_surfaces_ac_el !------------------------------------------------------------------------------------------------- ! - subroutine get_coupling_surfaces_ac_poro(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - my_neighbours_ext_mesh) + subroutine get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag) ! determines coupling surface for acoustic-poroelastic domains @@ -393,17 +344,12 @@ subroutine get_coupling_surfaces_ac_poro(myrank, & implicit none ! number of spectral elements in each block - integer :: myrank,nspec,NPROC + integer :: myrank,nspec ! arrays with the mesh integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool -! MPI communication - integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh - integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: & - ibool_interfaces_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + integer,dimension(nglob_dummy) :: acoustic_flag ! local parameters ! (assumes NGLLX=NGLLY=NGLLZ) @@ -417,14 +363,8 @@ subroutine get_coupling_surfaces_ac_poro(myrank, & integer,dimension(:),allocatable :: tmp_ispec integer,dimension(NGNOD2D) :: iglob_corners_ref - integer :: ispec,i,j,k,igll,ier,iglob - integer :: inum,iface_ref,icorner - integer :: count_poroelastic,count_acoustic - - ! mpi interface communication - integer, dimension(:), allocatable :: poroelastic_flag,acoustic_flag,test_flag - integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy - integer :: max_nibool_interfaces_ext_mesh + integer :: ispec,i,j,igll,ier + integer :: inum,iface_ref ! corners indices of reference cube faces integer,dimension(3,4),parameter :: iface1_corner_ijk = & @@ -452,7 +392,7 @@ subroutine get_coupling_surfaces_ac_poro(myrank, & !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data !character(len=256):: prname_file -! allocates temporary arrays + ! allocates temporary arrays allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier) if( ier /= 0 ) stop 'error allocating array tmp_normal' allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier) @@ -466,147 +406,71 @@ subroutine get_coupling_surfaces_ac_poro(myrank, & tmp_normal(:,:,:) = 0.0 tmp_jacobian2Dw(:,:) = 0.0 - ! sets flags for acoustic / poroelastic on global points - allocate(poroelastic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array poroelastic_flag' - allocate(acoustic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array acoustic_flag' - allocate(test_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array test_flag' - poroelastic_flag(:) = 0 - acoustic_flag(:) = 0 - test_flag(:) = 0 - count_poroelastic = 0 - count_acoustic = 0 - do ispec = 1, nspec - ! counts elements - if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1 - if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1 - - ! sets flags on global points - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - ! global index - iglob = ibool(i,j,k,ispec) - ! sets poroelastic flag - if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) = myrank+1 - ! sets acoustic flag - if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) = myrank+1 - ! sets test flag - test_flag(iglob) = myrank+1 - enddo - enddo - enddo - enddo - call sum_all_i(count_acoustic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total acoustic elements:',inum - endif - call sum_all_i(count_poroelastic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total poroelastic elements :',inum - endif - - ! collects contributions from different MPI partitions - ! sets up MPI communications - max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) ) - allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier) - if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy' - do i = 1, num_interfaces_ext_mesh - ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i) - enddo - ! sums poroelastic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! sums acoustic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - - ! sums test flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! loops over all element faces and ! counts number of coupling faces between acoustic and poroelastic elements inum = 0 do ispec=1,nspec - if(ispec_is_poroelastic(ispec)) then - - ! loops over each face - do iface_ref= 1, 6 - - ! takes indices of corners of reference face - do icorner = 1,NGNOD2D - i = iface_all_corner_ijk(1,icorner,iface_ref) - j = iface_all_corner_ijk(2,icorner,iface_ref) - k = iface_all_corner_ijk(3,icorner,iface_ref) - ! global reference indices - iglob_corners_ref(icorner) = ibool(i,j,k,ispec) - - ! reference corner coordinates - xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) - ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) - zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) - enddo - - ! checks if face has acoustic side - if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. & - acoustic_flag( iglob_corners_ref(4) ) >= 1) then - - ! gets face GLL points i,j,k indices from poroelastic element face - call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) - - ! gets face GLL 2Djacobian, weighted from element face - call get_jacobian_boundary_face(myrank,nspec, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & - dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) - - ! normal convention: points away from acoustic, reference element - ! switch normal direction if necessary - do j=1,NGLLY - do i=1,NGLLX - ! directs normals such that they point outwards of element - call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & - ibool,nspec,nglob_dummy, & - xstore_dummy,ystore_dummy,zstore_dummy, & - normal_face(:,i,j) ) - ! reverse the sign, we know we are in a poroelastic element - normal_face(:,i,j) = - normal_face(:,i,j) - enddo - enddo - - ! stores informations about this face - inum = inum + 1 - tmp_ispec(inum) = ispec - igll = 0 - do j=1,NGLLY - do i=1,NGLLX - ! adds all gll points on this face - igll = igll + 1 - - ! we need to store local i,j,k,ispec info - tmp_ijk(:,igll,inum) = ijk_face(:,i,j) - - ! stores weighted jacobian and normals - tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) - tmp_normal(:,igll,inum) = normal_face(:,i,j) - enddo - enddo - endif ! acoustic_flag - enddo ! iface_ref - endif ! ispec_is_poroelastic + if(ispec_is_poroelastic(ispec)) then + + ! loops over each face + do iface_ref= 1, 6 + + ! takes indices of corners of reference face + call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, & + ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, & + iface_all_corner_ijk) + + ! checks if face has acoustic side + if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. & + acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. & + acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. & + acoustic_flag( iglob_corners_ref(4) ) >= 1) then + + ! gets face GLL points i,j,k indices from poroelastic element face + call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) + + ! gets face GLL 2Djacobian, weighted from element face + call get_jacobian_boundary_face(myrank,nspec, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & + dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) + + ! normal convention: points away from acoustic, reference element + ! switch normal direction if necessary + do j=1,NGLLY + do i=1,NGLLX + ! directs normals such that they point outwards of element + call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & + ibool,nspec,nglob_dummy, & + xstore_dummy,ystore_dummy,zstore_dummy, & + normal_face(:,i,j) ) + ! reverse the sign, we know we are in a poroelastic element + normal_face(:,i,j) = - normal_face(:,i,j) + enddo + enddo + + ! stores informations about this face + inum = inum + 1 + tmp_ispec(inum) = ispec + igll = 0 + do j=1,NGLLY + do i=1,NGLLX + ! adds all gll points on this face + igll = igll + 1 + + ! we need to store local i,j,k,ispec info + tmp_ijk(:,igll,inum) = ijk_face(:,i,j) + + ! stores weighted jacobian and normals + tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) + tmp_normal(:,igll,inum) = normal_face(:,i,j) + enddo + enddo + endif ! acoustic_flag + enddo ! iface_ref + endif ! ispec_is_poroelastic enddo ! ispec ! stores completed coupling face informations @@ -633,8 +497,7 @@ subroutine get_coupling_surfaces_ac_poro(myrank, & ! user output call sum_all_i(num_coupling_ac_po_faces,inum) if( myrank == 0 ) then - write(IMAIN,*) ' acoustic-poroelastic coupling:' - write(IMAIN,*) ' total number of faces = ',inum + write(IMAIN,*) ' acoustic-poroelastic coupling: total number of faces = ',inum endif end subroutine get_coupling_surfaces_ac_poro @@ -643,11 +506,7 @@ end subroutine get_coupling_surfaces_ac_poro !------------------------------------------------------------------------------------------------- ! - subroutine get_coupling_surfaces_el_poro(myrank, & - nspec,ibool,NPROC, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - num_interfaces_ext_mesh,max_interface_size_ext_mesh, & - my_neighbours_ext_mesh) + subroutine get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag) ! determines coupling surface for elastic-poroelastic domains @@ -655,17 +514,12 @@ subroutine get_coupling_surfaces_el_poro(myrank, & implicit none ! number of spectral elements in each block - integer :: myrank,nspec,NPROC + integer :: myrank,nspec ! arrays with the mesh integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool -! MPI communication - integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh - integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: & - ibool_interfaces_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + integer,dimension(nglob_dummy) :: elastic_flag ! local parameters ! (assumes NGLLX=NGLLY=NGLLZ) @@ -679,14 +533,9 @@ subroutine get_coupling_surfaces_el_poro(myrank, & integer,dimension(:),allocatable :: tmp_ispec,tmp_ispec_el integer,dimension(NGNOD2D) :: iglob_corners_ref,iglob_corners_ref_el - integer :: ispec,i,j,k,igll,ier,iglob,ispec_el,ispec_ref_el + integer :: ispec,i,j,k,igll,ier + integer :: ispec_el,ispec_ref_el integer :: inum,iface_ref,iface_ref_el,iface_el,icorner - integer :: count_poroelastic,count_elastic - - ! mpi interface communication - integer, dimension(:), allocatable :: poroelastic_flag,elastic_flag,test_flag - integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy - integer :: max_nibool_interfaces_ext_mesh ! corners indices of reference cube faces integer,dimension(3,4),parameter :: iface1_corner_ijk = & @@ -734,175 +583,100 @@ subroutine get_coupling_surfaces_el_poro(myrank, & tmp_normal(:,:,:) = 0.0 tmp_jacobian2Dw(:,:) = 0.0 - ! sets flags for elastic / poroelastic on global points - allocate(poroelastic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array poroelastic_flag' - allocate(elastic_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array elastic_flag' - allocate(test_flag(nglob_dummy),stat=ier) - if( ier /= 0 ) stop 'error allocating array test_flag' - poroelastic_flag(:) = 0 - elastic_flag(:) = 0 - test_flag(:) = 0 - count_poroelastic = 0 - count_elastic = 0 - do ispec = 1, nspec - ! counts elements - if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1 - if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1 - - ! sets flags on global points - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - ! global index - iglob = ibool(i,j,k,ispec) - ! sets poroelastic flag - if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) = myrank+1 - ! sets elastic flag - if( ispec_is_elastic(ispec) ) elastic_flag(iglob) = myrank+1 - ! sets test flag - test_flag(iglob) = myrank+1 - enddo - enddo - enddo - enddo - call sum_all_i(count_elastic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total elastic elements:',inum - endif - call sum_all_i(count_poroelastic,inum) - if( myrank == 0 ) then - write(IMAIN,*) ' total poroelastic elements :',inum - endif - - ! collects contributions from different MPI partitions - ! sets up MPI communications - max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) ) - allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier) - if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy' - do i = 1, num_interfaces_ext_mesh - ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i) - enddo - ! sums poroelastic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! sums elastic flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - - ! sums test flags - call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& - my_neighbours_ext_mesh) - ! loops over all element faces and ! counts number of coupling faces between elastic and poroelastic elements inum = 0 do ispec=1,nspec - if(ispec_is_poroelastic(ispec)) then - - ! loops over each face - do iface_ref= 1, 6 + if(ispec_is_poroelastic(ispec)) then - ! takes indices of corners of reference face - do icorner = 1,NGNOD2D - i = iface_all_corner_ijk(1,icorner,iface_ref) - j = iface_all_corner_ijk(2,icorner,iface_ref) - k = iface_all_corner_ijk(3,icorner,iface_ref) - ! global reference indices - iglob_corners_ref(icorner) = ibool(i,j,k,ispec) + ! loops over each face + do iface_ref= 1, 6 - ! reference corner coordinates - xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) - ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) - zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) + ! takes indices of corners of reference face + call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, & + ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, & + iface_all_corner_ijk) - enddo - - ! checks if face has elastic side - if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. & - elastic_flag( iglob_corners_ref(2) ) >= 1 .and. & - elastic_flag( iglob_corners_ref(3) ) >= 1 .and. & - elastic_flag( iglob_corners_ref(4) ) >= 1) then + ! checks if face has elastic side + if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. & + elastic_flag( iglob_corners_ref(2) ) >= 1 .and. & + elastic_flag( iglob_corners_ref(3) ) >= 1 .and. & + elastic_flag( iglob_corners_ref(4) ) >= 1) then - ! need to find elastic element for coupling + ! need to find elastic element for coupling + ! + ! note: this assumes that both, elastic and poroelastic element, are in the same + ! partition; check with decomposition that this is valid for this mesh partitioning do ispec_el=1,nspec if(ispec_is_elastic(ispec_el))then - do iface_el=6,1,-1 - ! takes indices of corners of reference face - do icorner = 1,NGNOD2D - i = iface_all_corner_ijk(1,icorner,iface_el) - j = iface_all_corner_ijk(2,icorner,iface_el) - k = iface_all_corner_ijk(3,icorner,iface_el) - ! global reference indices - iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el) - - enddo - - if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. & - (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then - - iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation - ! but the calcul is ok. - ispec_ref_el = ispec_el - - ! gets face GLL points i,j,k indices from poroelastic element face - call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY) - ! gets face GLL points i,j,k indices from elastic element face - call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY) - - ! gets face GLL 2Djacobian, weighted from element face - call get_jacobian_boundary_face(myrank,nspec, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & - dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) - - ! normal convention: points away from poroelastic, reference element - do j=1,NGLLY - do i=1,NGLLX - ! directs normals such that they point outwards of poroelastic element - call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & - ibool,nspec,nglob_dummy, & - xstore_dummy,ystore_dummy,zstore_dummy, & - normal_face(:,i,j) ) - enddo - enddo - - ! stores informations about this face - inum = inum + 1 - tmp_ispec(inum) = ispec - tmp_ispec_el(inum) = ispec_ref_el - igll = 0 - do j=1,NGLLY - do i=1,NGLLX - ! adds all gll points on this face - igll = igll + 1 - - ! we need to store local i,j,k,ispec info - tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j) - tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1) - - ! stores weighted jacobian and normals - tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) - tmp_normal(:,igll,inum) = normal_face(:,i,j) + do iface_el=6,1,-1 + ! takes indices of corners of reference face + do icorner = 1,NGNOD2D + i = iface_all_corner_ijk(1,icorner,iface_el) + j = iface_all_corner_ijk(2,icorner,iface_el) + k = iface_all_corner_ijk(3,icorner,iface_el) + ! global reference indices + iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el) enddo - enddo - endif ! if - enddo ! do iface_ref_el=1,6 - endif ! if(ispec_is_elastic(ispec_el))then + if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. & + (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then + + iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation + ! but the calcul is ok. + ispec_ref_el = ispec_el + + ! gets face GLL points i,j,k indices from poroelastic element face + call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY) + ! gets face GLL points i,j,k indices from elastic element face + call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY) + + ! gets face GLL 2Djacobian, weighted from element face + call get_jacobian_boundary_face(myrank,nspec, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & + dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) + + ! normal convention: points away from poroelastic, reference element + do j=1,NGLLY + do i=1,NGLLX + ! directs normals such that they point outwards of poroelastic element + call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & + ibool,nspec,nglob_dummy, & + xstore_dummy,ystore_dummy,zstore_dummy, & + normal_face(:,i,j) ) + enddo + enddo + + ! stores informations about this face + inum = inum + 1 + tmp_ispec(inum) = ispec + tmp_ispec_el(inum) = ispec_ref_el + igll = 0 + do j=1,NGLLY + do i=1,NGLLX + ! adds all gll points on this face + igll = igll + 1 + + ! we need to store local i,j,k,ispec info + tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j) + tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1) + + ! stores weighted jacobian and normals + tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j) + tmp_normal(:,igll,inum) = normal_face(:,i,j) + enddo + enddo + endif ! if + + enddo ! do iface_ref_el=1,6 + endif ! if(ispec_is_elastic(ispec_el))then enddo ! do ispec_el=1,nspec - endif ! elastic_flag - enddo ! iface_ref - endif ! ispec_is_poroelastic + endif ! elastic_flag + enddo ! iface_ref + endif ! ispec_is_poroelastic enddo ! ispec ! stores completed coupling face informations @@ -935,575 +709,8 @@ subroutine get_coupling_surfaces_el_poro(myrank, & ! user output call sum_all_i(num_coupling_el_po_faces,inum) if( myrank == 0 ) then - write(IMAIN,*) ' elastic-poroelastic coupling:' - write(IMAIN,*) ' total number of faces = ',inum + write(IMAIN,*) ' elastic-poroelastic coupling : total number of faces = ',inum endif end subroutine get_coupling_surfaces_el_poro -! -!------------------------------------------------------------------------------------------------- -! - -! not working properly yet... - -! subroutine get_coupling_surfaces_comb(myrank, & -! nspec,ibool,NPROC, & -! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & -! num_interfaces_ext_mesh,max_interface_size_ext_mesh, & -! my_neighbours_ext_mesh) -! -!! determines coupling surface for acoustic-elastic-poroelastic domains -! -! use create_regions_mesh_ext_par -! implicit none -! -!! number of spectral elements in each block -! integer :: myrank,nspec,NPROC -! -!! arrays with the mesh -! integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool -! -!! MPI communication -! integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh -! integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh -! integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: & -! ibool_interfaces_ext_mesh -! integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh -! -!! local parameters -! ! (assumes NGLLX=NGLLY=NGLLZ) -! real(kind=CUSTOM_REAL),dimension(NGNOD2D) :: xcoord,ycoord,zcoord -! real(kind=CUSTOM_REAL) :: jacobian2Dw_face(NGLLX,NGLLY) -! real(kind=CUSTOM_REAL) :: normal_face(NDIM,NGLLX,NGLLY) -! real(kind=CUSTOM_REAL),dimension(:,:,:),allocatable :: tmp_normal -! real(kind=CUSTOM_REAL),dimension(:,:),allocatable :: tmp_jacobian2Dw -! integer :: ijk_face(3,NGLLX,NGLLY) -! integer,dimension(:,:,:),allocatable :: tmp_ijk -! integer,dimension(:),allocatable :: tmp_ispec -! -! integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners -! integer :: ispec,i,j,k,igll,ier,iglob -! integer :: inum,inum_ac,inum_el,inum_po,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor -! integer :: inum_ac_el,inum_el_po,inum_ac_po -! integer :: count_elastic,count_acoustic,count_poroelastic -! -! ! mpi interface communication -! integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag,test_flag -! integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy -! integer :: max_nibool_interfaces_ext_mesh -! logical, dimension(:), allocatable :: mask_ibool_ac_el,mask_ibool_ac_po,mask_ibool_el_po -! -! ! corners indices of reference cube faces -! integer,dimension(3,4),parameter :: iface1_corner_ijk = & -! reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/)) ! xmin -! integer,dimension(3,4),parameter :: iface2_corner_ijk = & -! reshape( (/ NGLLX,1,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, NGLLX,1,NGLLZ /),(/3,4/)) ! xmax -! integer,dimension(3,4),parameter :: iface3_corner_ijk = & -! reshape( (/ 1,1,1, 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,1,1 /),(/3,4/)) ! ymin -! integer,dimension(3,4),parameter :: iface4_corner_ijk = & -! reshape( (/ 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ /),(/3,4/)) ! ymax -! integer,dimension(3,4),parameter :: iface5_corner_ijk = & -! reshape( (/ 1,1,1, 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,1,1 /),(/3,4/)) ! bottom -! integer,dimension(3,4),parameter :: iface6_corner_ijk = & -! reshape( (/ 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ /),(/3,4/)) ! top -! integer,dimension(3,4,6),parameter :: iface_all_corner_ijk = & -! reshape( (/ iface1_corner_ijk,iface2_corner_ijk, & -! iface3_corner_ijk,iface4_corner_ijk, & -! iface5_corner_ijk,iface6_corner_ijk /),(/3,4,6/)) ! all faces -! ! midpoint indices for each face (xmin,xmax,ymin,ymax,zmin,zmax) -! integer,dimension(3,6),parameter :: iface_all_midpointijk = & -! reshape( (/ 1,2,2, NGLLX,2,2, 2,1,2, 2,NGLLY,2, 2,2,1, 2,2,NGLLZ /),(/3,6/)) ! top -! -! -! ! test vtk output -! !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data -! !character(len=256):: prname_file -! -! ! initializes number of coupling faces -! num_coupling_ac_el_faces = 0 -! num_coupling_ac_po_faces = 0 -! num_coupling_el_po_faces = 0 -! -! ! allocates temporary arrays -! allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier) -! if( ier /= 0 ) stop 'error allocating array tmp_normal' -! allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier) -! if( ier /= 0 ) stop 'error allocating array tmp_jacobian2Dw' -! allocate(tmp_ijk(3,NGLLSQUARE,nspec*6),stat=ier) -! if( ier /= 0 ) stop 'error allocating array tmp_ijk' -! allocate(tmp_ispec(nspec*6),stat=ier) -! if( ier /= 0 ) stop 'error allocating array tmp_ispec' -! tmp_ispec(:) = 0 -! tmp_ijk(:,:,:) = 0 -! tmp_normal(:,:,:) = 0.0 -! tmp_jacobian2Dw(:,:) = 0.0 -! -! ! sets flags for acoustic / elastic /poroelastic on global points -! allocate(elastic_flag(nglob_dummy),stat=ier) -! if( ier /= 0 ) stop 'error allocating array elastic_flag' -! allocate(acoustic_flag(nglob_dummy),stat=ier) -! if( ier /= 0 ) stop 'error allocating array acoustic_flag' -! allocate(poroelastic_flag(nglob_dummy),stat=ier) -! if( ier /= 0 ) stop 'error allocating array poroelastic_flag' -! allocate(test_flag(nglob_dummy),stat=ier) -! if( ier /= 0 ) stop 'error allocating array test_flag' -! allocate(mask_ibool_ac_el(nglob_dummy),stat=ier) -! allocate(mask_ibool_ac_po(nglob_dummy),stat=ier) -! allocate(mask_ibool_el_po(nglob_dummy),stat=ier) -! if( ier /= 0 ) stop 'error allocating array mask_ibool' -! elastic_flag(:) = 0 -! acoustic_flag(:) = 0 -! poroelastic_flag(:) = 0 -! test_flag(:) = 0 -! count_elastic = 0 -! count_acoustic = 0 -! count_poroelastic = 0 -! do ispec = 1, nspec -! ! counts elements -! if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1 -! if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1 -! if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1 -! -! ! sets flags on global points -! do k = 1, NGLLZ -! do j = 1, NGLLY -! do i = 1, NGLLX -! ! global index -! iglob = ibool(i,j,k,ispec) -! ! sets elastic flag -! if( ispec_is_elastic(ispec) ) elastic_flag(iglob) = myrank+1 -! ! sets acoustic flag -! if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) = myrank+1 -! ! sets poroelastic flag -! if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) = myrank+1 -! ! sets test flag -! test_flag(iglob) = myrank+1 -! enddo -! enddo -! enddo -! enddo -! call sum_all_i(count_acoustic,inum_ac) -! if( myrank == 0 ) then -! write(IMAIN,*) ' total acoustic elements :',inum_ac -! endif -! call sum_all_i(count_elastic,inum_el) -! if( myrank == 0 ) then -! write(IMAIN,*) ' total elastic elements :',inum_el -! endif -! call sum_all_i(count_poroelastic,inum_po) -! if( myrank == 0 ) then -! write(IMAIN,*) ' total poroelastic elements :',inum_po -! endif -! -! ! collects contributions from different MPI partitions -! ! sets up MPI communications -! max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) ) -! allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier) -! if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy' -! do i = 1, num_interfaces_ext_mesh -! ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i) -! enddo -! ! sums elastic flags -! call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, & -! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & -! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& -! my_neighbours_ext_mesh) -! ! sums acoustic flags -! call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, & -! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & -! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& -! my_neighbours_ext_mesh) -! ! sums poroelastic flags -! call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, & -! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & -! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& -! my_neighbours_ext_mesh) -! ! sums test flags -! call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, & -! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & -! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,& -! my_neighbours_ext_mesh) -! -!!---------------------- -!! acoustic-elastic -!!---------------------- -! inum_ac_el = 0 -! !if (inum_el >0 .and. inum_ac >0) then -! ! loops over all element faces and -! ! counts number of coupling faces between acoustic and elastic elements -! mask_ibool_ac_el(:) = .false. -! do ispec=1,nspec -! -! ! loops over each face -! do iface_ref= 1, 6 -! -! ! takes indices of corners of reference face -! do icorner = 1,NGNOD2D -! i = iface_all_corner_ijk(1,icorner,iface_ref) -! j = iface_all_corner_ijk(2,icorner,iface_ref) -! k = iface_all_corner_ijk(3,icorner,iface_ref) -! ! global reference indices -! iglob_corners_ref(icorner) = ibool(i,j,k,ispec) -! -! ! reference corner coordinates -! xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) -! ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) -! zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) -! enddo -! -! ! checks if face has acoustic side -! if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(4) ) >= 1) then -! ! checks if face is has an elastic side -! if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(2) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(3) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(4) ) >= 1) then -! -! ! reference midpoint on face (used to avoid redundant face counting) -! i = iface_all_midpointijk(1,iface_ref) -! j = iface_all_midpointijk(2,iface_ref) -! k = iface_all_midpointijk(3,iface_ref) -! iglob_midpoint = ibool(i,j,k,ispec) -! -! ! checks if points on this face are masked already -! if( .not. mask_ibool_ac_el(iglob_midpoint) ) then -! -! ! gets face GLL points i,j,k indices from element face -! call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) -! -! ! takes each element face only once, if it lies on an MPI interface -! ! note: this is not exactly load balanced -! ! lowest rank process collects as many faces as possible, second lowest as so forth -! if( (test_flag(iglob_midpoint) == myrank+1) .or. & -! (test_flag(iglob_midpoint) > 2*(myrank+1)) ) then -! -! ! gets face GLL 2Djacobian, weighted from element face -! call get_jacobian_boundary_face(myrank,nspec, & -! xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & -! dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & -! wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & -! ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) -! -! ! normal convention: points away from acoustic, reference element -! ! switch normal direction if necessary -! do j=1,NGLLY -! do i=1,NGLLX -! ! directs normals such that they point outwards of element -! call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & -! ibool,nspec,nglob_dummy, & -! xstore_dummy,ystore_dummy,zstore_dummy, & -! normal_face(:,i,j) ) -! ! makes sure that it always points away from acoustic element, -! ! otherwise switch direction -! if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j) -! enddo -! enddo -! -! ! stores informations about this face -! inum_ac_el = inum_ac_el + 1 -! tmp_ispec(inum_ac_el) = ispec -! igll = 0 -! do j=1,NGLLY -! do i=1,NGLLX -! ! adds all gll points on this face -! igll = igll + 1 -! -! ! do we need to store local i,j,k,ispec info? or only global indices iglob? -! tmp_ijk(:,igll,inum_ac_el) = ijk_face(:,i,j) -! -! ! stores weighted jacobian and normals -! tmp_jacobian2Dw(igll,inum_ac_el) = jacobian2Dw_face(i,j) -! tmp_normal(:,igll,inum_ac_el) = normal_face(:,i,j) -! -! ! masks global points ( to avoid redundant counting of faces) -! iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec) -! mask_ibool_ac_el(iglob) = .true. -! enddo -! enddo -! else -! ! assumes to be already collected by lower rank process, masks face points -! do j=1,NGLLY -! do i=1,NGLLX -! iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec) -! mask_ibool_ac_el(iglob) = .true. -! enddo -! enddo -! endif ! test_flag -! endif ! mask_ibool -! endif ! elastic_flag -! endif ! acoustic_flag -! enddo ! iface_ref -! enddo ! ispec -! -! !endif !if (count_elastic >0 .and. count_acoustic >0) -! -!! stores completed coupling face informations -!! -!! note: no need to store material parameters on these coupling points -!! for acoustic-elastic interface -! num_coupling_ac_el_faces = inum_ac_el -! allocate(coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_el_normal' -! allocate(coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_el_jacobian2Dw' -! allocate(coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ijk' -! allocate(coupling_ac_el_ispec(num_coupling_ac_el_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ispec' -! do inum = 1,num_coupling_ac_el_faces -! coupling_ac_el_normal(:,:,inum) = tmp_normal(:,:,inum) -! coupling_ac_el_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum) -! coupling_ac_el_ijk(:,:,inum) = tmp_ijk(:,:,inum) -! coupling_ac_el_ispec(inum) = tmp_ispec(inum) -! enddo -! -!! user output -!! makes sure processes are synchronized -! call sum_all_i(num_coupling_ac_el_faces,inum_ac_el) -! if( myrank == 0 ) then -! write(IMAIN,*) ' acoustic-elastic coupling:' -! write(IMAIN,*) ' total number of faces = ',inum_ac_el -! endif -! -! -!!---------------------- -!! acoustic-poroelastic -!!---------------------- -! tmp_ispec(:) = 0 -! tmp_ijk(:,:,:) = 0 -! tmp_normal(:,:,:) = 0.0 -! tmp_jacobian2Dw(:,:) = 0.0 -! inum_ac_po = 0 -! !if (inum_po >0 .and. inum_ac >0) then -! ! loops over all element faces and -! ! counts number of coupling faces between acoustic and poroelastic elements -! do ispec=1,nspec -! -! if(ispec_is_poroelastic(ispec)) then -! -! ! loops over each face -! do iface_ref= 1, 6 -! -! ! takes indices of corners of reference face -! do icorner = 1,NGNOD2D -! i = iface_all_corner_ijk(1,icorner,iface_ref) -! j = iface_all_corner_ijk(2,icorner,iface_ref) -! k = iface_all_corner_ijk(3,icorner,iface_ref) -! ! global reference indices -! iglob_corners_ref(icorner) = ibool(i,j,k,ispec) -! -! ! reference corner coordinates -! xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) -! ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) -! zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) -! enddo -! -! ! checks if face has acoustic side -! if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. & -! acoustic_flag( iglob_corners_ref(4) ) >= 1) then -! -! -! ! gets face GLL points i,j,k indices from element face -! call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) -! -! ! gets face GLL 2Djacobian, weighted from element face -! call get_jacobian_boundary_face(myrank,nspec, & -! xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & -! dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & -! wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & -! ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) -! -! ! normal convention: points away from acoustic, reference element -! ! switch normal direction if necessary -! do j=1,NGLLY -! do i=1,NGLLX -! ! directs normals such that they point outwards of element -! call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & -! ibool,nspec,nglob_dummy, & -! xstore_dummy,ystore_dummy,zstore_dummy, & -! normal_face(:,i,j) ) -! ! makes sure that it always points away from acoustic element, -! ! otherwise switch direction -! !if( ispec_is_poroelastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j) -! normal_face(:,i,j) = - normal_face(:,i,j) -! enddo -! enddo -! -! ! stores informations about this face -! inum_ac_po = inum_ac_po + 1 -! tmp_ispec(inum_ac_po) = ispec -! igll = 0 -! do j=1,NGLLY -! do i=1,NGLLX -! ! adds all gll points on this face -! igll = igll + 1 -! -! ! do we need to store local i,j,k,ispec info? or only global indices iglob? -! tmp_ijk(:,igll,inum_ac_po) = ijk_face(:,i,j) -! -! ! stores weighted jacobian and normals -! tmp_jacobian2Dw(igll,inum_ac_po) = jacobian2Dw_face(i,j) -! tmp_normal(:,igll,inum_ac_po) = normal_face(:,i,j) -! -! enddo -! enddo -! endif ! acoustic_flag -! enddo ! iface_ref -! endif ! ispec_is_poroelastic -! enddo ! ispec -! -! !endif !if (count_poroelastic >0 .and. count_acoustic >0) -! -!! stores completed coupling face informations -!! -!! note: for this coupling we need to have access to porous properties. The construction is such -!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the -!! acoustic element -! num_coupling_ac_po_faces = inum_ac_po -! allocate(coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_po_normal' -! allocate(coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_po_jacobian2Dw' -! allocate(coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ijk' -! allocate(coupling_ac_po_ispec(num_coupling_ac_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ispec' -! do inum = 1,num_coupling_ac_po_faces -! coupling_ac_po_normal(:,:,inum) = tmp_normal(:,:,inum) -! coupling_ac_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum) -! coupling_ac_po_ijk(:,:,inum) = tmp_ijk(:,:,inum) -! coupling_ac_po_ispec(inum) = tmp_ispec(inum) -! enddo -! -!! user output -!! makes sure processes are synchronized -! call sum_all_i(num_coupling_ac_po_faces,inum_ac_po) -! if( myrank == 0 ) then -! write(IMAIN,*) ' acoustic-poroelastic coupling:' -! write(IMAIN,*) ' total number of faces = ',inum_ac_po -! endif -! -! -!!---------------------- -!! elastic-poroelastic -!!---------------------- -! tmp_ispec(:) = 0 -! tmp_ijk(:,:,:) = 0 -! tmp_normal(:,:,:) = 0.0 -! tmp_jacobian2Dw(:,:) = 0.0 -! inum_el_po = 0 -! !if (inum_el >0 .and. inum_po >0) then -! ! loops over all element faces and -! ! counts number of coupling faces between elastic and poroelastic elements -! do ispec=1,nspec -! -! if(ispec_is_poroelastic(ispec)) then -! -! ! loops over each face -! do iface_ref= 1, 6 -! -! ! takes indices of corners of reference face -! do icorner = 1,NGNOD2D -! i = iface_all_corner_ijk(1,icorner,iface_ref) -! j = iface_all_corner_ijk(2,icorner,iface_ref) -! k = iface_all_corner_ijk(3,icorner,iface_ref) -! ! global reference indices -! iglob_corners_ref(icorner) = ibool(i,j,k,ispec) -! -! ! reference corner coordinates -! xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) -! ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) -! zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) -! enddo -! -! ! checks if face has elastic side -! if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(2) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(3) ) >= 1 .and. & -! elastic_flag( iglob_corners_ref(4) ) >= 1) then -! -! ! gets face GLL points i,j,k indices from element face -! call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY) -! -! ! gets face GLL 2Djacobian, weighted from element face -! call get_jacobian_boundary_face(myrank,nspec, & -! xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, & -! dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, & -! wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & -! ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY) -! -! ! normal convention: points away from poroelastic, reference element -! do j=1,NGLLY -! do i=1,NGLLX -! ! directs normals such that they point outwards of poroelastic element -! call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, & -! ibool,nspec,nglob_dummy, & -! xstore_dummy,ystore_dummy,zstore_dummy, & -! normal_face(:,i,j) ) -! enddo -! enddo -! -! ! stores informations about this face -! inum_el_po = inum_el_po + 1 -! tmp_ispec(inum_el_po) = ispec -! igll = 0 -! do j=1,NGLLY -! do i=1,NGLLX -! ! adds all gll points on this face -! igll = igll + 1 -! -! ! do we need to store local i,j,k,ispec info? or only global indices iglob? -! tmp_ijk(:,igll,inum_el_po) = ijk_face(:,i,j) -! -! ! stores weighted jacobian and normals -! tmp_jacobian2Dw(igll,inum_el_po) = jacobian2Dw_face(i,j) -! tmp_normal(:,igll,inum_el_po) = normal_face(:,i,j) -! -! enddo -! enddo -! endif ! elastic_flag -! enddo ! iface_ref -! endif ! ispec_is_poroelastic -! enddo ! ispec -! -! !endif !if (count_elastic >0 .and. count_poroelastic >0) -! -!! stores completed coupling face informations -!! -!! note: for this coupling we need to have access to porous properties. The construction is such -!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the -!! poroelastic element -! num_coupling_el_po_faces = inum_el_po -! allocate(coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_el_po_normal' -! allocate(coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_el_po_jacobian2Dw' -! allocate(coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_el_po_ijk' -! allocate(coupling_el_po_ispec(num_coupling_el_po_faces),stat=ier) -! if( ier /= 0 ) stop 'error allocating array coupling_el_po_ispec' -! do inum = 1,num_coupling_el_po_faces -! coupling_el_po_normal(:,:,inum) = tmp_normal(:,:,inum) -! coupling_el_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum) -! coupling_el_po_ijk(:,:,inum) = tmp_ijk(:,:,inum) -! coupling_el_po_ispec(inum) = tmp_ispec(inum) -! enddo -! -!! user output -!! makes sure processes are synchronized -! call sum_all_i(num_coupling_el_po_faces,inum_el_po) -! if( myrank == 0 ) then -! write(IMAIN,*) ' elastic-poroelastic coupling:' -! write(IMAIN,*) ' total number of faces = ',inum_el_po -! endif -! -! -! -! end subroutine get_coupling_surfaces_comb - diff --git a/src/generate_databases/get_global.f90 b/src/generate_databases/get_global.f90 index 752957836..8a5ad614e 100644 --- a/src/generate_databases/get_global.f90 +++ b/src/generate_databases/get_global.f90 @@ -237,7 +237,7 @@ subroutine swap_all(IA,A,B,C,IW,W,ind,n) C(i)=W(ind(i)) enddo -end subroutine swap_all + end subroutine swap_all ! ------------------------------------------------------------------ @@ -292,4 +292,4 @@ subroutine get_global_indirect_addressing(nspec,nglob,ibool) deallocate(copy_ibool_ori,stat=ier); if(ier /= 0) stop 'error in deallocate' deallocate(mask_ibool,stat=ier); if(ier /= 0) stop 'error in deallocate' -end subroutine get_global_indirect_addressing + end subroutine get_global_indirect_addressing diff --git a/src/generate_databases/get_model.f90 b/src/generate_databases/get_model.f90 index 05b171881..91425cf59 100644 --- a/src/generate_databases/get_model.f90 +++ b/src/generate_databases/get_model.f90 @@ -24,15 +24,17 @@ ! !===================================================================== + subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & materials_ext_mesh,nmat_ext_mesh, & undef_mat_prop,nundefMat_ext_mesh, & - ANISOTROPY,LOCAL_PATH) + ANISOTROPY) use generate_databases_par,only: IMODEL use create_regions_mesh_ext_par implicit none + ! number of spectral elements in each block integer :: myrank,nspec @@ -59,26 +61,34 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & afactor,bfactor,cfactor integer :: ispec,i,j,k - + ! material domain integer :: idomain_id - + integer :: imaterial_id,imaterial_def ! gll point location - double precision :: xmesh,ymesh,zmesh + double precision :: xmesh,ymesh,zmesh integer :: iglob - character(len=256) LOCAL_PATH + + ! timing + double precision, external :: wtime + double precision :: time_start,tCPU ! initializes element domain flags ispec_is_acoustic(:) = .false. ispec_is_elastic(:) = .false. ispec_is_poroelastic(:) = .false. - ! prepares tomography model if needed for elements with undefined material definitions - if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then - call model_tomography_broadcast(myrank) - endif + !debug + !print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh + +! prepares tomography model if needed for elements with undefined material definitions + ! TODO: Max -- somehow this code is breaking when I try to run + ! Piero's PREM + ! if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then + ! call model_tomography_broadcast(myrank) + ! endif ! prepares external model values if needed select case( IMODEL ) @@ -92,6 +102,8 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & ! in case, see file model_interface_bedrock.f90: ! call model_bedrock_broadcast(myrank) + ! get MPI starting time + time_start = wtime() ! material properties on all GLL points: taken from material values defined for ! each spectral element in input mesh @@ -106,7 +118,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & vp = 0._CUSTOM_REAL vs = 0._CUSTOM_REAL rho = 0._CUSTOM_REAL - + rho_s = 0._CUSTOM_REAL kappa_s = 0._CUSTOM_REAL rho_f = 0._CUSTOM_REAL @@ -122,7 +134,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & kyy = 0._CUSTOM_REAL kyz = 0._CUSTOM_REAL kzz = 0._CUSTOM_REAL - + qmu_atten = 0._CUSTOM_REAL c11 = 0._CUSTOM_REAL @@ -154,7 +166,7 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & zmesh = zstore_dummy(iglob) ! material index 1: associated material number - ! 1 = acoustic, 2 = elastic, 3 = poroelastic, -1 = undefined tomographic + ! 1 = acoustic, 2 = elastic, 3 = poroelastic, -1 = undefined tomographic imaterial_id = mat_ext_mesh(1,ispec) ! material index 2: associated material definition @@ -173,12 +185,12 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & c22,c23,c24,c25,c26,c33, & c34,c35,c36,c44,c45,c46,c55,c56,c66, & ANISOTROPY) - + ! stores velocity model - if(idomain_id == IDOMAIN_ACOUSTIC .or. idomain_id == IDOMAIN_ELASTIC) then - + if(idomain_id == IDOMAIN_ACOUSTIC .or. idomain_id == IDOMAIN_ELASTIC) then + ! elastic or acoustic material ! density @@ -204,8 +216,8 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & tortstore(i,j,k,ispec) = 1.d0 !end pll - else - + else + ! poroelastic material ! solid properties @@ -277,11 +289,11 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & ! stores material domain select case( idomain_id ) - case( IDOMAIN_ACOUSTIC ) + case( IDOMAIN_ACOUSTIC ) ispec_is_acoustic(ispec) = .true. case( IDOMAIN_ELASTIC ) ispec_is_elastic(ispec) = .true. - case( IDOMAIN_POROELASTIC ) + case( IDOMAIN_POROELASTIC ) ispec_is_poroelastic(ispec) = .true. case default stop 'error material domain index' @@ -290,6 +302,17 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & enddo enddo enddo + + ! user output + if(myrank == 0 ) then + if( mod(ispec,nspec/10) == 0 ) then + tCPU = wtime() - time_start + ! remaining + tCPU = (10.0-ispec/(nspec/10.0))/ispec/(nspec/10.0)*tCPU + write(IMAIN,*) " ",ispec/(nspec/10) * 10," %", & + " time remaining:", tCPU,"s" + endif + endif enddo ! checks material domains @@ -318,16 +341,6 @@ subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, & endif enddo - ! GLL model - ! variables for importing models from files in SPECFEM format, e.g., proc000000_vp.bin etc. - ! can be used for importing updated model in iterative inversions - if( IMODEL == IMODEL_GLL ) then - ! note: - ! import the model from files in SPECFEM format - ! note that those those files should be saved in LOCAL_PATH - call model_gll(myrank,nspec,LOCAL_PATH) - endif - end subroutine get_model ! @@ -356,7 +369,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & integer, intent(in) :: nundefMat_ext_mesh character (len=30), dimension(6,nundefMat_ext_mesh):: undef_mat_prop - integer, intent(in) :: imaterial_id,imaterial_def + integer, intent(in) :: imaterial_id,imaterial_def double precision, intent(in) :: xmesh,ymesh,zmesh @@ -374,18 +387,20 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & ! local parameters integer :: iflag_aniso - + integer :: iundef,imaterial_PB + ! use acoustic domains for simulation logical,parameter :: USE_PURE_ACOUSTIC_MOD = .false. ! initializes with default values + ! no anisotropy iflag_aniso = 0 idomain_id = IDOMAIN_ELASTIC - + ! selects chosen velocity model select case( IMODEL ) - case( IMODEL_DEFAULT, IMODEL_GLL ) + case( IMODEL_DEFAULT,IMODEL_GLL,IMODEL_IPATI ) ! material values determined by mesh properties call model_default(materials_ext_mesh,nmat_ext_mesh, & undef_mat_prop,nundefMat_ext_mesh, & @@ -395,11 +410,21 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & iflag_aniso,qmu_atten,idomain_id, & rho_s,kappa_s,rho_f,kappa_f,eta_f,kappa_fr,mu_fr, & phi,tort,kxx,kxy,kxz,kyy,kyz,kzz) - + case( IMODEL_1D_PREM ) ! 1D model profile from PREM call model_1D_prem_iso(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) - + + case( IMODEL_1D_PREM_PB ) + ! 1D model profile from PREM modified by Piero + imaterial_PB = abs(imaterial_id) + call model_1D_PREM_routine_PB(xmesh,ymesh,zmesh,rho,vp,vs,imaterial_PB) + ! attenuation: arbitrary value, see maximum in constants.h + qmu_atten = ATTENUATION_COMP_MAXIMUM + ! sets acoustic/elastic domain as given in materials properties + iundef = - imaterial_id ! iundef must be positive + read(undef_mat_prop(6,iundef),*) idomain_id + case( IMODEL_1D_CASCADIA ) ! 1D model profile for Cascadia region call model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) @@ -411,7 +436,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & case( IMODEL_SALTON_TROUGH ) ! gets model values from tomography file call model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) - + case( IMODEL_TOMO ) ! gets model values from tomography file call model_tomography(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) @@ -420,9 +445,9 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & ! user model from external routine ! adds/gets velocity model as specified in model_external_values.f90 call model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_aniso,idomain_id) - - case default - stop 'error: model not implemented yet' + + case default + stop 'error: model not implemented yet' end select ! adds anisotropic default model @@ -430,7 +455,7 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & call model_aniso(iflag_aniso,rho,vp,vs, & c11,c12,c13,c14,c15,c16, & c22,c23,c24,c25,c26,c33, & - c34,c35,c36,c44,c45,c46,c55,c56,c66) + c34,c35,c36,c44,c45,c46,c55,c56,c66) endif ! for pure acoustic simulations (a way of avoiding re-mesh, re-partition etc.) @@ -439,5 +464,41 @@ subroutine get_model_values(materials_ext_mesh,nmat_ext_mesh, & if( USE_PURE_ACOUSTIC_MOD ) then idomain_id = IDOMAIN_ACOUSTIC endif - + end subroutine get_model_values + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine get_model_binaries(myrank,nspec,LOCAL_PATH) + +! reads in material parameters from external binary files + + use generate_databases_par,only: IMODEL + use create_regions_mesh_ext_par + implicit none + + ! number of spectral elements in each block + integer :: myrank,nspec + character(len=256) :: LOCAL_PATH + + ! external GLL models + ! variables for importing models from files in SPECFEM format, e.g., proc000000_vp.bin etc. + ! can be used for importing updated model in iterative inversions + + ! note: we read in these binary files after mesh coloring, since mesh coloring is permuting arrays. + ! here, the ordering in **_vp.bin etc. can be permuted as they are outputted when saving mesh files + + select case( IMODEL ) + case( IMODEL_GLL ) + ! note: + ! import the model from files in SPECFEM format + ! note that those those files should be saved in LOCAL_PATH + call model_gll(myrank,nspec,LOCAL_PATH) + case( IMODEL_IPATI ) + ! import the model from modified files in SPECFEM format + call model_ipati(myrank,nspec,LOCAL_PATH) + end select + + end subroutine get_model_binaries diff --git a/src/generate_databases/get_perm_color.f90 b/src/generate_databases/get_perm_color.f90 new file mode 100644 index 000000000..e0d743bb5 --- /dev/null +++ b/src/generate_databases/get_perm_color.f90 @@ -0,0 +1,1210 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== + + +! define sets of colors that contain disconnected elements for the CUDA solver. +! also split the elements into two subsets: inner and outer elements, in order +! to be able to compute the outer elements first in the solver and then +! start non-blocking MPI calls and overlap them with the calculation of the inner elements +! (which works fine because there are always far more inner elements than outer elements) + +! note: these are modified routines to use element domain flags given in ispec_is_d, thus +! coloring only acoustic or elastic (or..) elements in one run, then repeat run for other domains. +! also, the permutation re-starts at 1 for outer and for inner elements, +! making it usable for the phase_ispec_inner_** arrays for acoustic and elastic elements. + + subroutine get_perm_color_faster(is_on_a_slice_edge,ispec_is_d, & + ibool,perm,color, & + nspec,nglob, & + nb_colors_outer_elements,nb_colors_inner_elements, & + nspec_outer,nspec_inner,nspec_domain, & + first_elem_number_in_this_color, & + myrank) + + implicit none + + include "constants.h" + + integer, intent(in) :: nspec, nglob + logical, dimension(nspec), intent(in) :: is_on_a_slice_edge + logical, dimension(nspec), intent(in) :: ispec_is_d + + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool + integer, dimension(nspec),intent(inout) :: perm + + integer, dimension(nspec),intent(inout) :: color + integer, dimension(MAX_NUMBER_OF_COLORS+1),intent(inout) :: first_elem_number_in_this_color + integer, intent(out) :: nb_colors_outer_elements,nb_colors_inner_elements + + integer, intent(out) :: nspec_outer,nspec_inner,nspec_domain + integer, intent(in) :: myrank + + ! local variables + integer :: nb_colors + + ! coloring algorithm w/ Droux + call get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, & + myrank, nspec, nglob, & + color, nb_colors_outer_elements, nb_colors_inner_elements, & + nspec_outer,nspec_inner,nspec_domain) + + !debug output + if(myrank == 0) then + write(IMAIN,*) ' colors:' + write(IMAIN,*) ' number of colors for inner elements = ',nb_colors_inner_elements + write(IMAIN,*) ' number of colors for outer elements = ',nb_colors_outer_elements + write(IMAIN,*) ' total number of colors (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements + write(IMAIN,*) ' elements:' + write(IMAIN,*) ' number of elements for outer elements = ',nspec_outer + write(IMAIN,*) ' number of elements for inner elements = ',nspec_inner + write(IMAIN,*) ' total number of elements for domain elements = ',nspec_domain + endif + + ! total number of colors used + nb_colors = nb_colors_inner_elements+nb_colors_outer_elements + first_elem_number_in_this_color(:) = 0 + + ! gets element permutation depending on colors + call get_final_perm(color,perm,first_elem_number_in_this_color(1:nb_colors), & + nspec,nb_colors,nb_colors_outer_elements, & + ispec_is_d,nspec_domain) + + + end subroutine get_perm_color_faster + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine get_color_faster(ibool, is_on_a_slice_edge, ispec_is_d, & + myrank, nspec, nglob, & + color, nb_colors_outer_elements, nb_colors_inner_elements, & + nspec_outer,nspec_inner,nspec_domain) + + implicit none + + include "constants.h" + + integer nspec,nglob + logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d + + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + integer, dimension(nspec) :: color + integer :: nb_colors_outer_elements,nb_colors_inner_elements,myrank + + integer :: nspec_outer,nspec_inner,nspec_domain + + ! local variables + integer :: ispec + logical, dimension(:), allocatable :: mask_ibool + integer :: icolor, nb_already_colored + integer :: iglob1,iglob2,iglob3,iglob4,iglob5,iglob6,iglob7,iglob8 + integer :: ier + logical :: conflict_found_need_new_color + ! Droux + logical :: try_Droux_coloring + logical :: fail_safe + ! valence + integer :: maxval_count_ibool_outer,maxval_count_ibool_inner + + ! display absolute minimum possible number of colors, i.e., maximum valence (for information only) + ! beware: this wastes memory (needs an additional array called "count_ibool") + logical, parameter :: DISPLAY_MIN_POSSIBLE_COLORS = .false. + + ! user output + if( myrank == 0 ) then + if( USE_DROUX_OPTIMIZATION ) then + write(IMAIN,*) ' fast coloring mesh algorithm w/ Droux optimization' + else if( BALANCE_COLORS_SIMPLE_ALGO ) then + write(IMAIN,*) ' fast coloring mesh algorithm w/ color balancing' + else + write(IMAIN,*) ' fast coloring mesh algorithm' + endif + endif + + ! counts number of elements for inner, outer and total domain + nspec_outer = 0 + nspec_inner = 0 + nspec_domain = 0 + do ispec=1,nspec + ! domain elements + if(ispec_is_d(ispec)) then + ! outer/inner elements + if(is_on_a_slice_edge(ispec)) then + nspec_outer=nspec_outer+1 + else + nspec_inner=nspec_inner+1 + endif + nspec_domain=nspec_domain+1 + endif + enddo + + ! debug + !if(myrank == 0) then + ! print * + ! print *,'----------------------------------' + ! print *,'coloring the mesh' + ! print *,'----------------------------------' + ! print * + !endif + + ! Droux optimization + try_Droux_coloring = USE_DROUX_OPTIMIZATION + + if(BALANCE_COLORS_SIMPLE_ALGO .and. USE_DROUX_OPTIMIZATION ) then + if( myrank == 0 ) then + print *,'noticed a problem with mesh coloring options: ' + print *,' cannot set both USE_DROUX_OPTIMAL_ALGO and BALANCE_COLORS_SIMPLE_ALGO' + print *,' -> this run will use only BALANCE_COLORS_SIMPLE_ALGO' + print *,'please check parameter settings in constants.h...' + endif + try_Droux_coloring = .false. + endif + + ! gives a lower bound for the number of colors needed + if(DISPLAY_MIN_POSSIBLE_COLORS .or. try_Droux_coloring) then + ! gets maximum values of valence for inner and outer element points + call count_mesh_valence(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + maxval_count_ibool_outer,maxval_count_ibool_inner) + endif + + ! allocates mask + allocate(mask_ibool(nglob),stat=ier) + if( ier /= 0 ) stop 'error allocating mask_ibool array' + + ! entry point for fail-safe mechanism when Droux 1993 fails + 999 continue + + ! first set color of all elements to 0, + ! to use it as a flag to detect elements not yet colored + color(:) = 0 + icolor = 0 + nb_already_colored = 0 + + ! colors outer elements + do while( nb_already_colored < nspec_outer ) + + 333 continue + icolor = icolor + 1 + + ! debug: user output + !if(myrank == 0) then + ! print *,' analyzing color ',icolor,' - outer elements' + !endif + + ! resets flags + mask_ibool(:) = .false. + conflict_found_need_new_color = .false. + + ! finds un-colored elements + do ispec = 1,nspec + ! domain elements only + if( ispec_is_d(ispec) ) then + ! outer elements + if( is_on_a_slice_edge(ispec) ) then + if(color(ispec) == 0) then + ! the eight corners of the current element + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. & + mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then + ! if element of this color has a common point with another element of that same color + ! then we need to create a new color, i.e., increment the color of the current element + conflict_found_need_new_color = .true. + else + color(ispec) = icolor + nb_already_colored = nb_already_colored + 1 + mask_ibool(iglob1) = .true. + mask_ibool(iglob2) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob4) = .true. + mask_ibool(iglob5) = .true. + mask_ibool(iglob6) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob8) = .true. + endif + endif + endif + endif + enddo + + ! debug: user output + !if(myrank == 0) then + ! print *,' done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements' + !endif + + if(conflict_found_need_new_color) then + if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small' + goto 333 + endif + enddo + + nb_colors_outer_elements = icolor + + ! colors inner elements + do while(nb_already_colored < nspec_domain) + + 334 continue + icolor = icolor + 1 + + ! debug: user output + !if(myrank == 0) then + ! print *,' analyzing color ',icolor,' - inner elements' + !endif + + ! resets flags + mask_ibool(:) = .false. + conflict_found_need_new_color = .false. + + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + ! inner elements + if (.not. is_on_a_slice_edge(ispec)) then + if(color(ispec) == 0) then + ! the eight corners of the current element + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. & + mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then + ! if element of this color has a common point with another element of that same color + ! then we need to create a new color, i.e., increment the color of the current element + conflict_found_need_new_color = .true. + else + color(ispec) = icolor + nb_already_colored = nb_already_colored + 1 + mask_ibool(iglob1) = .true. + mask_ibool(iglob2) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob4) = .true. + mask_ibool(iglob5) = .true. + mask_ibool(iglob6) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob8) = .true. + endif + endif + endif + endif + enddo + + ! debug user output + !if(myrank == 0) then + ! print *,' done ',(100.0*nb_already_colored)/nspec_domain,'% of ',nspec_domain,'elements' + !endif + + if(conflict_found_need_new_color) then + if( icolor >= MAX_NUMBER_OF_COLORS ) stop 'error MAX_NUMBER_OF_COLORS too small' + goto 334 + endif + enddo + + nb_colors_inner_elements = icolor - nb_colors_outer_elements + + ! Droux optimization: + ! added this to create more balanced colors according to JJ Droux (1993) + ! note: this might not find an optimial solution. + ! we will probably have to try a few times with increasing colors + if( try_Droux_coloring ) then + ! initializes fail-safe mechanism + fail_safe = .false. + + ! tries to find a balanced coloring + call balance_colors_Droux(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + color,nb_colors_outer_elements,nb_colors_inner_elements, & + nspec_outer,nspec_inner,maxval_count_ibool_inner, & + mask_ibool,fail_safe) + + ! in case it fails go back to simple coloring algorithm + if( fail_safe ) then + try_Droux_coloring = .false. + if(myrank == 0) write(IMAIN,*) ' giving up on Droux 1993 algorithm, calling fail-safe mechanism' + goto 999 + endif + endif ! of if(try_Droux_coloring) + + ! balances colors using a simple algorithm (if Droux was not used) + if( BALANCE_COLORS_SIMPLE_ALGO ) then + call balance_colors_simple(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + color,nb_colors_outer_elements,nb_colors_inner_elements, & + nspec_outer,nspec_inner,mask_ibool) + endif + + ! checks that all the color sets are independent + do icolor = 1,maxval(color) + mask_ibool(:) = .false. + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + if(color(ispec) == icolor ) then + ! the eight corners of the current element + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + if(mask_ibool(iglob1) .or. mask_ibool(iglob2) .or. mask_ibool(iglob3) .or. mask_ibool(iglob4) .or. & + mask_ibool(iglob5) .or. mask_ibool(iglob6) .or. mask_ibool(iglob7) .or. mask_ibool(iglob8)) then + ! if element of this color has a common point with another element of that same color + ! then there is a problem, the color set is not correct + print*,'error check color:',icolor + stop 'error detected: found a common point inside a color set' + else + mask_ibool(iglob1) = .true. + mask_ibool(iglob2) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob4) = .true. + mask_ibool(iglob5) = .true. + mask_ibool(iglob6) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob8) = .true. + endif + endif + endif + enddo + + !debug output + !if(myrank == 0) print *,' color ',icolor,' has disjoint elements only and is therefore OK' + !if(myrank == 0) print *,' color ',icolor,' contains ',count(color == icolor),' elements' + enddo + ! debug output + !if(myrank == 0) then + ! print*, ' the ',maxval(color),' color sets are OK' + !endif + + deallocate(mask_ibool) + + end subroutine get_color_faster + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine count_mesh_valence(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + maxval_count_ibool_outer,maxval_count_ibool_inner) + + implicit none + + include "constants.h" + + integer :: nspec,nglob + + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d + + integer :: myrank + integer :: maxval_count_ibool_outer,maxval_count_ibool_inner + + ! local parameters + integer, dimension(:), allocatable :: count_ibool + integer :: ispec + integer :: iglob1,iglob2,iglob3,iglob4,iglob5,iglob6,iglob7,iglob8 + integer :: ier + + ! allocates count array + allocate(count_ibool(nglob),stat=ier) + if( ier /= 0 ) stop 'error allocating count_ibool array' + + ! valence numbers of the mesh + maxval_count_ibool_outer = 0 + maxval_count_ibool_inner = 0 + + ! valence for outer elements + count_ibool(:) = 0 + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + ! outer elements + if (is_on_a_slice_edge(ispec)) then + ! the eight corners of the current element + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + count_ibool(iglob1) = count_ibool(iglob1) + 1 + count_ibool(iglob2) = count_ibool(iglob2) + 1 + count_ibool(iglob3) = count_ibool(iglob3) + 1 + count_ibool(iglob4) = count_ibool(iglob4) + 1 + count_ibool(iglob5) = count_ibool(iglob5) + 1 + count_ibool(iglob6) = count_ibool(iglob6) + 1 + count_ibool(iglob7) = count_ibool(iglob7) + 1 + count_ibool(iglob8) = count_ibool(iglob8) + 1 + endif + endif + enddo + maxval_count_ibool_outer = maxval(count_ibool) + + ! valence for inner elements + count_ibool(:) = 0 + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + ! inner elements + if (.not. is_on_a_slice_edge(ispec)) then + ! the eight corners of the current element + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + count_ibool(iglob1) = count_ibool(iglob1) + 1 + count_ibool(iglob2) = count_ibool(iglob2) + 1 + count_ibool(iglob3) = count_ibool(iglob3) + 1 + count_ibool(iglob4) = count_ibool(iglob4) + 1 + count_ibool(iglob5) = count_ibool(iglob5) + 1 + count_ibool(iglob6) = count_ibool(iglob6) + 1 + count_ibool(iglob7) = count_ibool(iglob7) + 1 + count_ibool(iglob8) = count_ibool(iglob8) + 1 + endif + endif + enddo + maxval_count_ibool_inner = maxval(count_ibool) + + ! debug outupt + if( myrank == 0 ) then + write(IMAIN,*) ' maximum valence (i.e. minimum possible nb of colors) for outer = ',maxval_count_ibool_outer + write(IMAIN,*) ' maximum valence (i.e. minimum possible nb of colors) for inner = ',maxval_count_ibool_inner + endif + + deallocate(count_ibool) + + end subroutine count_mesh_valence + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine balance_colors_Droux(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + color, nb_colors_outer_elements, nb_colors_inner_elements, & + nspec_outer,nspec_inner,maxval_count_ibool_inner, & + mask_ibool,fail_safe) + + implicit none + + include "constants.h" + + integer :: nspec,nglob + + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d + logical, dimension(nglob) :: mask_ibool + + integer, dimension(nspec) :: color + + integer :: myrank + integer :: nb_colors_outer_elements,nb_colors_inner_elements + + integer :: nspec_outer,nspec_inner + integer :: maxval_count_ibool_inner + + logical :: fail_safe + + ! local parameters + logical, dimension(:), allocatable :: icolor_conflict_found + integer, dimension(:), allocatable :: nb_elems_in_this_color + integer :: ispec,ispec2,icolor,ncolors,icolormin,icolormax,icolor_chosen,nb_elems_in_color_chosen + integer :: nb_tries_of_Droux_1993,last_ispec_studied + integer :: ier + + ! debug outupt + if( myrank == 0 ) then + write(IMAIN,*) ' balancing colors: Droux algorithm' + write(IMAIN,*) ' initial number of outer element colors = ',nb_colors_outer_elements + write(IMAIN,*) ' initial number of inner element colors = ',nb_colors_inner_elements + write(IMAIN,*) ' initial number of total colors = ',nb_colors_outer_elements + nb_colors_inner_elements + endif + + ! initial guess of number of colors needed + if( maxval_count_ibool_inner > 0 .and. maxval_count_ibool_inner < nb_colors_inner_elements ) then + ! uses maximum valence to estimate number of colors for Droux + nb_colors_inner_elements = maxval_count_ibool_inner + endif + + !! DK DK do it for inner elements only for now + ! Droux optimization run + nb_tries_of_Droux_1993 = 1 + + ! entry point to re-try Droux + 765 continue + + ! initial guess of number of colors needed + ncolors = nb_colors_outer_elements + nb_colors_inner_elements + + ! debug output + if( myrank == 0 ) then + write(IMAIN,*) ' Droux optimization: try = ',nb_tries_of_Droux_1993,'colors = ',ncolors + endif + + icolormin = nb_colors_outer_elements + 1 + icolormax = ncolors + + ! allocates temporary arrays + allocate(nb_elems_in_this_color(ncolors), & + icolor_conflict_found(ncolors),stat=ier) + if( ier /= 0 ) stop 'error allocating nb_elems_in_this_color arrays' + + nb_elems_in_this_color(:) = 0 + mask_ibool(:) = .false. + last_ispec_studied = -1 + + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + + ! only inner elements + if (is_on_a_slice_edge(ispec)) cycle + + ! unmark the eight corners of the previously marked element + if(last_ispec_studied > 0) then + mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false. + endif + icolor_conflict_found(icolormin:icolormax) = .false. + + ! mark the eight corners of the current element + mask_ibool(ibool(1,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true. + last_ispec_studied = ispec + + if(ispec > 1) then + do ispec2 = 1,ispec - 1 + ! domain elements only + if(ispec_is_d(ispec2)) then + + ! only inner elements + if (is_on_a_slice_edge(ispec2)) cycle + + ! if conflict already found previously with this color, no need to test again + if (icolor_conflict_found(color(ispec2))) cycle + + ! test the eight corners of the current element for a common point with element under study + if (mask_ibool(ibool(1,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) & + icolor_conflict_found(color(ispec2)) = .true. + + endif ! domain elements + enddo + endif + + ! check if the Droux 1993 algorithm found a solution + if (all(icolor_conflict_found(icolormin:icolormax))) then + ! user output + !if(myrank == 0) write(IMAIN,*) ' Droux 1993 algorithm did not find any solution for ncolors = ',ncolors + + ! try with one more color + if(nb_tries_of_Droux_1993 < MAX_NB_TRIES_OF_DROUX_1993) then + nb_colors_inner_elements = nb_colors_inner_elements + 1 + deallocate(nb_elems_in_this_color) + deallocate(icolor_conflict_found) + nb_tries_of_Droux_1993 = nb_tries_of_Droux_1993 + 1 + goto 765 + else + ! fail-safe mechanism: if Droux 1993 still fails after all the tries with one more color, + ! then go back to my original simple and fast coloring algorithm + fail_safe = .true. + return + endif + endif + + ! loop on all the colors to determine the color with the smallest number + ! of elements and for which there is no conflict + nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value + icolor_chosen = 0 + do icolor = icolormin,icolormax + if (.not. icolor_conflict_found(icolor) .and. nb_elems_in_this_color(icolor) < nb_elems_in_color_chosen) then + icolor_chosen = icolor + nb_elems_in_color_chosen = nb_elems_in_this_color(icolor) + endif + enddo + + ! store the color finally chosen + color(ispec) = icolor_chosen + nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1 + + endif ! domain elements + enddo + + ! debug output + if(myrank == 0) then + write(IMAIN,*) ' created a total of ',maxval(color),' colors in this domain' ! 'for all the domain elements of the mesh' + if( nb_colors_outer_elements > 0 ) & + write(IMAIN,*) ' typical nb of elements per color for outer elements should be ', & + nspec_outer / nb_colors_outer_elements + if( nb_colors_inner_elements > 0 ) & + write(IMAIN,*) ' typical nb of elements per color for inner elements should be ', & + nspec_inner / nb_colors_inner_elements + endif + + + end subroutine balance_colors_Droux + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine balance_colors_simple(ibool,is_on_a_slice_edge,ispec_is_d, & + myrank, nspec, nglob, & + color, nb_colors_outer_elements, nb_colors_inner_elements, & + nspec_outer,nspec_inner,mask_ibool) + + implicit none + + include "constants.h" + + integer :: nspec,nglob + + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + + logical, dimension(nspec) :: is_on_a_slice_edge,ispec_is_d + logical, dimension(nglob) :: mask_ibool + + integer, dimension(nspec) :: color + + integer :: myrank + integer :: nb_colors_outer_elements,nb_colors_inner_elements + + integer :: nspec_outer,nspec_inner + + ! local parameters + logical, dimension(:), allocatable :: icolor_conflict_found + integer, dimension(:), allocatable :: nb_elems_in_this_color + integer :: ispec,ispec2,icolor,ncolors,icolormin,icolormax,icolor_chosen,nb_elems_in_color_chosen + integer :: last_ispec_studied + integer :: target_nb_elems_per_color,icolor_target + integer :: ier + + ! debug outupt + if( myrank == 0 ) then + write(IMAIN,*) ' balancing colors: simple algorithm' + write(IMAIN,*) ' number of outer element colors = ',nb_colors_outer_elements + write(IMAIN,*) ' number of inner element colors = ',nb_colors_inner_elements + write(IMAIN,*) ' number of total colors = ',nb_colors_outer_elements + nb_colors_inner_elements + endif + + ! balances colors in postprocess if Droux (1993) is not used + ncolors = nb_colors_outer_elements + nb_colors_inner_elements + + ! allocates temporary arrays + allocate(nb_elems_in_this_color(ncolors), & + icolor_conflict_found(ncolors),stat=ier) + if( ier /= 0 ) stop 'error allocating nb_elems_in_this_color arrays' + + !! DK DK do it for outer elements + icolormin = 1 + icolormax = nb_colors_outer_elements + + ! ideal value if all colors are perfectly balanced + if( nb_colors_outer_elements > 0 ) then + target_nb_elems_per_color = nspec_outer / nb_colors_outer_elements + 1 + else + target_nb_elems_per_color = 1 + endif + + ! print *,'nspec_outer,target_nb_elems_per_color = ',nspec_outer,target_nb_elems_per_color + + ! count the initial number of elements in each color + nb_elems_in_this_color(:) = 0 + do icolor = icolormin,icolormax + nb_elems_in_this_color(icolor) = count(color == icolor) + enddo + + ! do not balance the last one, because it will be balanced automatically by the others + do icolor = icolormin,icolormax-1 + + ! if color is already balanced, do nothing + ! (this works because in the initial set the number of elements per color decreases when the color number increases) + if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) cycle + + mask_ibool(:) = .false. + last_ispec_studied = -1 + + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + + ! only outer elements + if (.not. is_on_a_slice_edge(ispec)) cycle + + ! only elements of this color + if (color(ispec) /= icolor) cycle + + ! if color is now balanced because we have moved enough elements then stop searching + if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) exit + + ! unmark the eight corners of the previously marked element + if(last_ispec_studied > 0) then + mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false. + endif + icolor_conflict_found(icolormin:icolormax) = .false. + icolor_conflict_found(icolor) = .true. ! cannot move element to the color it already has + + ! mark the eight corners of the current element + mask_ibool(ibool(1,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true. + last_ispec_studied = ispec + + ! test if we can move this element to another color + do ispec2 = 1,nspec + ! domain elements only + if(ispec_is_d(ispec2)) then + + ! do not test that element itself + if (ispec2 == ispec) cycle + + ! only outer elements + if (.not. is_on_a_slice_edge(ispec2)) cycle + + ! if conflict already found previously with this color, no need to test again + if (icolor_conflict_found(color(ispec2))) cycle + + ! test the eight corners of the current element for a common point with element under study + if (mask_ibool(ibool(1,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) & + icolor_conflict_found(color(ispec2)) = .true. + endif ! domain elements + enddo + + ! if color is already above target size for a balanced set, do not move to it + do icolor_target = icolormin,icolormax + if(nb_elems_in_this_color(icolor_target) >= target_nb_elems_per_color) & + icolor_conflict_found(icolor_target) = .true. + enddo + + ! if cannot find any other color to move this element to + if (all(icolor_conflict_found(icolormin:icolormax))) cycle + + ! loop on all the colors to determine the color with the smallest number of elements + ! and for which there is no conflict + nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value + icolor_chosen = 0 + do icolor_target = icolormin,icolormax + if (.not. icolor_conflict_found(icolor_target) .and. & + nb_elems_in_this_color(icolor_target) < nb_elems_in_color_chosen) then + icolor_chosen = icolor_target + nb_elems_in_color_chosen = nb_elems_in_this_color(icolor_target) + endif + enddo + + ! move the element to that new color + ! remove element from its current color + nb_elems_in_this_color(color(ispec)) = nb_elems_in_this_color(color(ispec)) - 1 + color(ispec) = icolor_chosen + ! and add it to the new color + nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1 + + endif ! domain elements + enddo + + enddo ! icolor + +!! DK DK do it for inner elements + + icolormin = nb_colors_outer_elements + 1 + icolormax = ncolors + + ! ideal value if all colors are perfectly balanced + if( nb_colors_inner_elements > 0 ) then + target_nb_elems_per_color = nspec_inner / nb_colors_inner_elements + 1 + else + target_nb_elems_per_color = 1 + endif + ! print *,'nspec_inner,target_nb_elems_per_color = ',nspec_inner,target_nb_elems_per_color + + ! count the initial number of elements in each color + nb_elems_in_this_color(:) = 0 + do icolor = icolormin,icolormax + nb_elems_in_this_color(icolor) = count(color == icolor) + enddo + + ! do not balance the last one, because it will be balanced automatically by the others + do icolor = icolormin,icolormax-1 + + ! if color is already balanced, do nothing + ! (this works because in the initial set the number of elements per color decreases when the color number increases) + if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) cycle + + mask_ibool(:) = .false. + last_ispec_studied = -1 + + do ispec = 1,nspec + ! domain elements only + if(ispec_is_d(ispec)) then + + ! only inner elements + if (is_on_a_slice_edge(ispec)) cycle + + ! only elements of this color + if (color(ispec) /= icolor) cycle + + ! if color is now balanced because we have moved enough elements then stop searching + if(nb_elems_in_this_color(icolor) <= target_nb_elems_per_color) exit + + ! unmark the eight corners of the previously marked element + if(last_ispec_studied > 0) then + mask_ibool(ibool(1,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,1,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,1,last_ispec_studied)) = .false. + mask_ibool(ibool(1,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,1,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,last_ispec_studied)) = .false. + mask_ibool(ibool(1,NGLLY,NGLLZ,last_ispec_studied)) = .false. + endif + icolor_conflict_found(icolormin:icolormax) = .false. + icolor_conflict_found(icolor) = .true. ! cannot move element to the color it already has + + ! mark the eight corners of the current element + mask_ibool(ibool(1,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,1,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,1,ispec)) = .true. + mask_ibool(ibool(1,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec)) = .true. + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec)) = .true. + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec)) = .true. + last_ispec_studied = ispec + + ! test if we can move this element to another color + do ispec2 = 1,nspec + ! domain elements only + if(ispec_is_d(ispec2)) then + + ! do not test that element itself + if (ispec2 == ispec) cycle + + ! only inner elements + if (is_on_a_slice_edge(ispec2)) cycle + + ! if conflict already found previously with this color, no need to test again + if (icolor_conflict_found(color(ispec2))) cycle + + ! test the eight corners of the current element for a common point with element under study + if (mask_ibool(ibool(1,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,1,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,1,ispec2)) .or. & + mask_ibool(ibool(1,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,1,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(NGLLX,NGLLY,NGLLZ,ispec2)) .or. & + mask_ibool(ibool(1,NGLLY,NGLLZ,ispec2))) & + icolor_conflict_found(color(ispec2)) = .true. + + endif ! domain elements + enddo + + ! if color is already above target size for a balanced set, do not move to it + do icolor_target = icolormin,icolormax + if(nb_elems_in_this_color(icolor_target) >= target_nb_elems_per_color) & + icolor_conflict_found(icolor_target) = .true. + enddo + + ! if cannot find any other color to move this element to + if (all(icolor_conflict_found(icolormin:icolormax))) cycle + + ! loops on all the colors to determine the color with the smallest number of elements + ! and for which there is no conflict + nb_elems_in_color_chosen = 2147000000 ! start with extremely large unrealistic value + icolor_chosen = 0 + do icolor_target = icolormin,icolormax + if (.not. icolor_conflict_found(icolor_target) .and. & + nb_elems_in_this_color(icolor_target) < nb_elems_in_color_chosen) then + icolor_chosen = icolor_target + nb_elems_in_color_chosen = nb_elems_in_this_color(icolor_target) + endif + enddo + + ! moves the element to that new color + ! remove element from its current color + nb_elems_in_this_color(color(ispec)) = nb_elems_in_this_color(color(ispec)) - 1 + color(ispec) = icolor_chosen + ! and add it to the new color + nb_elems_in_this_color(icolor_chosen) = nb_elems_in_this_color(icolor_chosen) + 1 + + endif ! domain elements + enddo + + enddo ! icolor + + end subroutine balance_colors_simple + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine get_final_perm(color,perm,first_elem_number_in_this_color, & + nspec,nb_colors,nb_colors_outer_elements, & + ispec_is_d,nspec_domain) + + integer, intent(in) :: nspec,nb_colors + + integer,dimension(nspec), intent(in) :: color + integer,dimension(nspec), intent(inout) :: perm + + integer, intent(inout) :: first_elem_number_in_this_color(nb_colors) + + logical,dimension(nspec),intent(in) :: ispec_is_d + + integer,intent(in) :: nb_colors_outer_elements,nspec_domain + + ! local parameters + integer :: ispec,icolor,icounter,counter_outer + + ! note: permutations are only valid within each domain + ! also, the counters start at 1 for each inner/outer element range + + ! outer elements first ( note: inner / outer order sensitive) + icounter = 1 + do icolor = 1, nb_colors_outer_elements + first_elem_number_in_this_color(icolor) = icounter + do ispec = 1, nspec + ! elements in this domain only + if( ispec_is_d(ispec) ) then + if(color(ispec) == icolor) then + perm(ispec) = icounter + icounter = icounter + 1 + endif + endif + enddo + enddo + counter_outer = icounter - 1 + + ! inner elements second + icounter = 1 + do icolor = nb_colors_outer_elements+1, nb_colors + first_elem_number_in_this_color(icolor) = icounter + counter_outer + do ispec = 1, nspec + ! elements in this domain only + if( ispec_is_d(ispec) ) then + ! outer elements + if(color(ispec) == icolor) then + perm(ispec) = icounter + icounter = icounter + 1 + endif + endif + enddo + enddo + + ! checks + if( counter_outer + icounter -1 /= nspec_domain ) then + print*,'error: perm: ',nspec_domain,counter_outer,icounter,counter_outer+icounter-1 + stop 'error get_final_perm: counter incomplete' + endif + + end subroutine get_final_perm + + +! +!------------------------------------------------------------------------------------------------- +! +! PERMUTATIONS +! +!------------------------------------------------------------------------------------------------- +! + +! implement permutation of elements for arrays of real (CUSTOM_REAL) type + + subroutine permute_elements_real(array_to_permute,temp_array,perm,nspec) + + implicit none + + include "constants.h" + + integer, intent(in) :: nspec + integer, intent(in), dimension(nspec) :: perm + + real(kind=CUSTOM_REAL), intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: & + array_to_permute,temp_array + + integer old_ispec,new_ispec + + ! copy the original array + temp_array(:,:,:,:) = array_to_permute(:,:,:,:) + + do old_ispec = 1,nspec + new_ispec = perm(old_ispec) + array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec) + enddo + + end subroutine permute_elements_real + +! +!------------------------------------------------------------------------------------------------- +! + +! implement permutation of elements for arrays of integer type + + subroutine permute_elements_integer(array_to_permute,temp_array,perm,nspec) + + implicit none + + include "constants.h" + + integer, intent(in) :: nspec + integer, intent(in), dimension(nspec) :: perm + + integer, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: & + array_to_permute,temp_array + + integer old_ispec,new_ispec + + ! copy the original array + temp_array(:,:,:,:) = array_to_permute(:,:,:,:) + + do old_ispec = 1,nspec + new_ispec = perm(old_ispec) + array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec) + enddo + + end subroutine permute_elements_integer + +! +!------------------------------------------------------------------------------------------------- +! + +! implement permutation of elements for arrays of double precision type + + subroutine permute_elements_dble(array_to_permute,temp_array,perm,nspec) + + implicit none + + include "constants.h" + + integer, intent(in) :: nspec + integer, intent(in), dimension(nspec) :: perm + + double precision, intent(inout), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: & + array_to_permute,temp_array + + integer old_ispec,new_ispec + + ! copy the original array + temp_array(:,:,:,:) = array_to_permute(:,:,:,:) + + do old_ispec = 1,nspec + new_ispec = perm(old_ispec) + array_to_permute(:,:,:,new_ispec) = temp_array(:,:,:,old_ispec) + enddo + + end subroutine permute_elements_dble + +! +!------------------------------------------------------------------------------------------------- +! + +! implement permutation of elements for arrays of double precision type + + subroutine permute_elements_logical1D(array_to_permute,temp_array,perm,nspec) + + implicit none + + include "constants.h" + + integer, intent(in) :: nspec + integer, intent(in), dimension(nspec) :: perm + + logical, intent(inout), dimension(nspec) :: array_to_permute,temp_array + + integer old_ispec,new_ispec + + ! copy the original array + temp_array(:) = array_to_permute(:) + + do old_ispec = 1,nspec + new_ispec = perm(old_ispec) + array_to_permute(new_ispec) = temp_array(old_ispec) + enddo + + end subroutine permute_elements_logical1D diff --git a/src/generate_databases/memory_eval.f90 b/src/generate_databases/memory_eval.f90 index a0c592728..1942bc415 100644 --- a/src/generate_databases/memory_eval.f90 +++ b/src/generate_databases/memory_eval.f90 @@ -152,10 +152,10 @@ end subroutine memory_eval ! compute the approximate amount of static memory needed to run the mesher subroutine memory_eval_mesher(myrank,nspec,npointot,nnodes_ext_mesh, & - nelmnts_ext_mesh,nmat_ext_mesh,num_interfaces_ext_mesh, & - max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, & - nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top, & - static_memory_size_request) + nelmnts_ext_mesh,nmat_ext_mesh,num_interfaces_ext_mesh, & + max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, & + nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top, & + static_memory_size_request) implicit none @@ -166,7 +166,7 @@ subroutine memory_eval_mesher(myrank,nspec,npointot,nnodes_ext_mesh, & max_interface_size_ext_mesh,nspec2D_xmin,nspec2D_xmax, & nspec2D_ymin,nspec2D_ymax,nspec2D_bottom,nspec2D_top - integer,intent(inout) :: static_memory_size_request + double precision,intent(inout) :: static_memory_size_request ! local parameters integer :: static_memory_size diff --git a/src/generate_databases/model_1d_cascadia.f90 b/src/generate_databases/model_1d_cascadia.f90 index c3fc2d054..92bcec695 100644 --- a/src/generate_databases/model_1d_cascadia.f90 +++ b/src/generate_databases/model_1d_cascadia.f90 @@ -48,7 +48,7 @@ subroutine model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! local parameters real(kind=CUSTOM_REAL) :: x,y,z real(kind=CUSTOM_REAL) :: depth - real(kind=CUSTOM_REAL) :: elevation,distmin + real(kind=CUSTOM_REAL) :: elevation,distmin ! converts GLL point location to real x = xmesh @@ -61,19 +61,19 @@ subroutine model_1D_cascadia(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) call get_topo_elevation_free_closest(x,y,elevation,distmin, & nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, & num_free_surface_faces,free_surface_ispec,free_surface_ijk) - + ! depth in Z-direction - if( distmin < HUGEVAL ) then + if( distmin < HUGEVAL ) then depth = elevation - z else depth = - z endif ! depth in km - depth = depth / 1000.0 + depth = depth / 1000.0 ! 1D profile Cascadia - + ! super-imposes values if( depth < 1.0 ) then ! vp in m/s diff --git a/src/generate_databases/model_1d_prem.f90 b/src/generate_databases/model_1d_prem.f90 index 185c59335..93dacd96d 100644 --- a/src/generate_databases/model_1d_prem.f90 +++ b/src/generate_databases/model_1d_prem.f90 @@ -51,29 +51,29 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte implicit none ! GLL point location - double precision, intent(in) :: xmesh,ymesh,zmesh - + double precision, intent(in) :: xmesh,ymesh,zmesh + ! material properties real(kind=CUSTOM_REAL), intent(inout) :: rho_prem,vp_prem,vs_prem,qmu_atten - + ! local parameters real(kind=CUSTOM_REAL) :: xloc,yloc,zloc real(kind=CUSTOM_REAL) :: depth - real(kind=CUSTOM_REAL) :: elevation,distmin + real(kind=CUSTOM_REAL) :: elevation,distmin double precision :: x,rho,drhodr,vp,vs,Qkappa,Qmu double precision :: & - R_EARTH,RICB,RCMB,RTOPDDOUBLEPRIME, & + R_EARTH_M,RICB,RCMB,RTOPDDOUBLEPRIME, & R771,R600,R670,R400,R220,R80,RMOHO,RMIDDLE_CRUST,ROCEAN double precision :: r ! uses crustal values from other models (like crust2.0) than prem ! set to .false. to use PREM crustal values, otherwise will take mantle values up to surface - logical,parameter :: CRUSTAL = .false. + logical,parameter :: CRUSTAL = .false. ! avoids crustal values, uses Moho values up to the surface logical,parameter :: SUPPRESS_CRUSTAL_MESH = .false. ! same properties everywhere in PREM crust if we decide to define only one layer in the crust logical,parameter :: ONE_CRUST = .false. - + ! GLL point location converted to real xloc = xmesh yloc = ymesh @@ -85,16 +85,16 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte call get_topo_elevation_free_closest(xloc,yloc,elevation,distmin, & nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, & num_free_surface_faces,free_surface_ispec,free_surface_ijk) - + ! depth in Z-direction - if( distmin < HUGEVAL ) then + if( distmin < HUGEVAL ) then depth = elevation - zloc else depth = - zloc endif ! PREM layers (in m) - R_EARTH = 6371000.d0 + R_EARTH_M = 6371000.d0 ROCEAN = 6368000.d0 RMIDDLE_CRUST = 6356000.d0 RMOHO = 6346600.d0 @@ -113,10 +113,18 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte ! normalized radius x = r / R_EARTH - + ! given a normalized radius x, gives the non-dimensionalized density rho, ! speeds vp and vs, and the quality factors Qkappa and Qmu + ! initializes + rho = 0.d0 + vp = 0.d0 + vs = 0.d0 + Qmu = 0.d0 + Qkappa = 0.d0 + drhodr = 0.d0 + ! !--- inner core ! @@ -271,5 +279,102 @@ subroutine model_1D_prem_iso(xmesh,ymesh,zmesh,rho_prem,vp_prem,vs_prem,qmu_atte vs_prem=vs*1000.0d0 qmu_atten = Qmu - + end subroutine model_1D_prem_iso + + +! +!------------------------------------------------------------------------------------------------- +! + +!PB ROUTINE WHICH ASSINGS AT EACH GLL POINT THE VALUES OF v_p v_s AND rho TAKEN FROM THE PREM MODEL +!PB IT'S CALLED IN get_model.f90(:149) WITHIN THE generate_databases PROCESS + + subroutine model_1D_PREM_routine_PB(xloc,yloc,zloc,ro_prem,vp_prem,vs_prem,idom) + + use generate_databases_par,only: CUSTOM_REAL + implicit none + + double precision, intent(in) :: xloc,yloc,zloc + real(kind=CUSTOM_REAL),intent(inout) :: ro_prem,vp_prem,vs_prem + integer, intent(in) :: idom + + ! local parameters + double precision :: r0,r,x_prem + !double precision :: ro_prem,vp_prem,vs_prem + !character(len=3), intent(in) :: param !rho, vs,vp + + r0=sqrt(xloc**2+yloc**2+zloc**2) + ! print*,'xloc,yloc,zloc,r0,idom',xloc,yloc,zloc,r0,idom + r=r0/1000. + + x_prem=r/6371. ! Radius (normalized to x(surface)=1 ) + ! print*,'x_prem',x_prem + IF(idom==1)THEN ! upper crustal layer + !print*,'I am in domain 1' + ro_prem=2.6 + vp_prem=5.8 + vs_prem=3.2 + ! print*,'ro,vp,vs form domain 1',ro_prem,vp_prem,vs_prem + ELSEIF(idom==2)THEN + ro_prem=2.9 ! lower crustal layer + vp_prem=6.8 + vs_prem=3.9 + ELSEIF(idom==3)THEN + ro_prem=2.691+.6924*x_prem ! upper mantle + vp_prem=4.1875+3.9382*x_prem + vs_prem=2.1519+2.3481*x_prem + ELSEIF(idom==4)THEN + ro_prem=7.1089-3.8045*x_prem + vp_prem=20.3926-12.2569*x_prem + vs_prem=8.9496-4.4597*x_prem + ELSEIF(idom==5)THEN + ro_prem=11.2494-8.0298*x_prem + vp_prem=39.7027-32.6166*x_prem + vs_prem=22.3512-18.5856*x_prem + ELSEIF(idom==6)THEN + ro_prem=5.3197-1.4836*x_prem + vp_prem=19.0957-9.8672*x_prem + vs_prem=9.9839-4.9324*x_prem + ELSEIF(idom==7)THEN !lower mantle + ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3 + vp_prem=29.2766-23.6027*x_prem+5.5242*x_prem**2-2.5514*x_prem**3 + vs_prem=22.3459-17.2473*x_prem-2.0834*x_prem**2+0.9783*x_prem**3 + ELSEIF(idom==8)THEN + ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3 + vp_prem=24.9520-40.4673*x_prem+51.4832*x_prem**2-26.6419*x_prem**3 + vs_prem=11.1671-13.7818*x_prem+17.4575*x_prem**2-9.2777*x_prem**3 + ELSEIF(idom==9)THEN + ro_prem=7.9565-6.4761*x_prem+5.5283*x_prem**2-3.0807*x_prem**3 + vp_prem=15.3891-5.3181*x_prem+5.5242*x_prem**2-2.5514*x_prem**3 + vs_prem=6.9254+1.4672*x_prem-2.0834*x_prem**2+.9783*x_prem**3 + ELSEIF(idom==10)THEN ! outer core + ro_prem=12.5815-1.2638*x_prem-3.6426*x_prem**2-5.5281*x_prem**3 + vp_prem=11.0487-4.0362*x_prem+4.8023*x_prem**2-13.5732*x_prem**3 + vs_prem=0.00 + ELSEIF(idom==11)THEN ! inner core + ro_prem=13.0885-8.8381*x_prem**2 + vp_prem=11.2622-6.3640*x_prem**2 + vs_prem=3.6678-4.4475*x_prem**2 + ENDIF + + ! print*,'ro,vp,vs passed from routine and not multiplied',ro_prem,vp_prem,vs_prem + + ro_prem=ro_prem*1000 + vp_prem=vp_prem*1000 + vs_prem=vs_prem*1000 + + ! print*,'ro,vp,vs passed from PREM_ROUTINE multiplied by 1000',ro_prem,vp_prem,vs_prem + + ! if (param=='rho') then + ! prem_sub=ro_prem*1000. + ! elseif (param=='v_p') then + ! prem_sub=vp_prem*1000. + ! elseif (param=='v_s') then + ! prem_sub=vs_prem*1000. + ! else + ! write(6,*)'ERROR IN PREM_SUB FUNCTION:',param,'NOT AN OPTION' + ! stop + + end subroutine model_1D_PREM_routine_PB + diff --git a/src/generate_databases/model_1d_socal.f90 b/src/generate_databases/model_1d_socal.f90 index f9d95cdb4..05e813279 100644 --- a/src/generate_databases/model_1d_socal.f90 +++ b/src/generate_databases/model_1d_socal.f90 @@ -26,13 +26,13 @@ !-------------------------------------------------------------------------------------------------- ! -! 1D Southern California model +! 1D Southern California model ! ! model is the standard model used in southern California: -! Kanamori and Hadley (1975), Dreger and Helmberger (1990), Wald-Hutton,Given (1995) +! Kanamori and Hadley (1975), Dreger and Helmberger (1990), Wald-Hutton,Given (1995) ! !-------------------------------------------------------------------------------------------------- - + subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! given a GLL point, returns super-imposed velocity model values @@ -49,12 +49,12 @@ subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! local parameters real(kind=CUSTOM_REAL) :: depth,x,y,z - + ! mesh point location x = xmesh y = ymesh z = zmesh - + ! depth in m depth = -zmesh @@ -88,5 +88,5 @@ subroutine model_1D_socal(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! no attenuation information qmu_atten = 0.d0 - + end subroutine model_1D_socal diff --git a/src/generate_databases/model_aniso.f90 b/src/generate_databases/model_aniso.f90 index 7c1f7bc8e..736f9297f 100644 --- a/src/generate_databases/model_aniso.f90 +++ b/src/generate_databases/model_aniso.f90 @@ -316,5 +316,28 @@ subroutine model_aniso(iflag_aniso,rho,vp,vs, & c56 = - d46 c66 = d66 +! unused: fills values with the isotropic model +! c11 = rho*vpv*vpv +! c12 = rho*(vpv*vpv-2.*vsv*vsv) +! c13 = c12 +! c14 = 0.d0 +! c15 = 0.d0 +! c16 = 0.d0 +! c22 = c11 +! c23 = c12 +! c24 = 0.d0 +! c25 = 0.d0 +! c26 = 0.d0 +! c33 = c11 +! c34 = 0.d0 +! c35 = 0.d0 +! c36 = 0.d0 +! c44 = rho*vsv*vsv +! c45 = 0.d0 +! c46 = 0.d0 +! c55 = c44 +! c56 = 0.d0 +! c66 = c44 + end subroutine model_aniso diff --git a/src/generate_databases/model_default.f90 b/src/generate_databases/model_default.f90 index 85cb89cc5..c6c2e27d6 100644 --- a/src/generate_databases/model_default.f90 +++ b/src/generate_databases/model_default.f90 @@ -49,7 +49,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & integer, intent(in) :: nundefMat_ext_mesh character (len=30), dimension(6,nundefMat_ext_mesh):: undef_mat_prop - integer, intent(in) :: imaterial_id,imaterial_def + integer, intent(in) :: imaterial_id,imaterial_def double precision, intent(in) :: xmesh,ymesh,zmesh @@ -60,11 +60,11 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & real(kind=CUSTOM_REAL) :: kappa_s,kappa_f,kappa_fr,mu_fr,rho_s,rho_f,phi,tort,eta_f, & kxx,kxy,kxz,kyy,kyz,kzz - + ! local parameters integer :: iflag,flag_below,flag_above integer :: iundef - + ! check if the material is known or unknown if( imaterial_id > 0 ) then ! gets velocity model as specified by (cubit) mesh files for elastic & acoustic @@ -74,8 +74,8 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & idomain_id = materials_ext_mesh(6,imaterial_id) select case( idomain_id ) - - case( IDOMAIN_ACOUSTIC,IDOMAIN_ELASTIC) + + case( IDOMAIN_ACOUSTIC,IDOMAIN_ELASTIC) ! elastic or acoustic ! density @@ -93,7 +93,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & ! anisotropy iflag_aniso = materials_ext_mesh(5,imaterial_id) - case( IDOMAIN_POROELASTIC ) + case( IDOMAIN_POROELASTIC ) ! poroelastic ! materials_ext_mesh format: ! rho_s,kappa_s,rho_f,kappa_f,eta_f,kappa_fr,mu_fr,phi,tort,kxx,kxy,kxz,kyy,kyz,kzz @@ -120,9 +120,9 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & case default print*,'error: domain id = ',idomain_id,'not recognized' stop 'error: domain not recognized' - + end select - + else if ( imaterial_def == 1 ) then stop 'material: interface not implemented yet' @@ -142,7 +142,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & rho = materials_ext_mesh(1,iflag) vp = materials_ext_mesh(2,iflag) vs = materials_ext_mesh(3,iflag) - qmu_atten = materials_ext_mesh(4,iflag) + qmu_atten = materials_ext_mesh(4,iflag) iflag_aniso = materials_ext_mesh(5,iflag) idomain_id = materials_ext_mesh(6,iflag) @@ -154,7 +154,7 @@ subroutine model_default(materials_ext_mesh,nmat_ext_mesh, & call model_tomography(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) ! no anisotropy - iflag_aniso = 0 + iflag_aniso = 0 ! sets acoustic/elastic domain as given in materials properties iundef = - imaterial_id ! iundef must be positive diff --git a/src/generate_databases/model_external_values.f90 b/src/generate_databases/model_external_values.f90 index 789e7f891..15433822a 100644 --- a/src/generate_databases/model_external_values.f90 +++ b/src/generate_databases/model_external_values.f90 @@ -124,7 +124,7 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani use create_regions_mesh_ext_par implicit none - ! GLL point + ! GLL point double precision, intent(in) :: xmesh,ymesh,zmesh ! density, Vp and Vs @@ -172,14 +172,14 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, & num_free_surface_faces,free_surface_ispec,free_surface_ijk) - + ! depth in Z-direction - if( distmin < HUGEVAL ) then + if( distmin < HUGEVAL ) then depth = elevation - z else depth = zmin - z endif - + ! normalizes depth between 0 and 1 if( abs( zmax - zmin ) > TINYVAL ) depth = depth / (zmax - zmin) @@ -204,6 +204,6 @@ subroutine model_external_values(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten,iflag_ani iflag_aniso = 0 ! elastic material - idomain_id = IDOMAIN_ELASTIC - + idomain_id = IDOMAIN_ELASTIC + end subroutine model_external_values diff --git a/src/generate_databases/model_gll.f90 b/src/generate_databases/model_gll.f90 index eb0c537a3..e3622f558 100644 --- a/src/generate_databases/model_gll.f90 +++ b/src/generate_databases/model_gll.f90 @@ -39,14 +39,14 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH) use create_regions_mesh_ext_par implicit none - + integer, intent(in) :: myrank,nspec character(len=256) :: LOCAL_PATH - - ! local parameters + + ! local parameters real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read integer :: ier - character(len=256) :: prname_lp,filename + character(len=256) :: prname_lp,filename ! processors name write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_' @@ -60,7 +60,11 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH) if( ier /= 0 ) stop 'error allocating array rho_read' filename = prname_lp(1:len_trim(prname_lp))//'rho.bin' - open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted') + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading rho.bin file' + endif read(28) rho_read close(28) @@ -70,7 +74,11 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH) if( ier /= 0 ) stop 'error allocating array vp_read' filename = prname_lp(1:len_trim(prname_lp))//'vp.bin' - open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted') + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading vp.bin file' + endif read(28) vp_read close(28) @@ -80,8 +88,12 @@ subroutine model_gll(myrank,nspec,LOCAL_PATH) if( ier /= 0 ) stop 'error allocating array vs_read' filename = prname_lp(1:len_trim(prname_lp))//'vs.bin' - open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted') - + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading vs.bin file' + endif + read(28) vs_read close(28) diff --git a/src/generate_databases/model_ipati.f90 b/src/generate_databases/model_ipati.f90 new file mode 100644 index 000000000..f5739bd7f --- /dev/null +++ b/src/generate_databases/model_ipati.f90 @@ -0,0 +1,113 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== + + +!-------------------------------------------------------------------------------------------------- +! +! IPATI +! +! based on given rho and vp structure for GLL files +! +!-------------------------------------------------------------------------------------------------- + + subroutine model_ipati(myrank,nspec,LOCAL_PATH) + + use create_regions_mesh_ext_par + implicit none + + integer, intent(in) :: myrank,nspec + character(len=256) :: LOCAL_PATH + + ! local parameters + real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read + integer :: ier + character(len=256) :: prname_lp,filename + + ! ----------------------------------------------------------------------------- + + ! note: vp not vs structure is available (as is often the case in exploration seismology), + ! scaling factor + real, parameter :: SCALING_FACTOR = 1.0/1.8 + + ! ----------------------------------------------------------------------------- + + ! user output + if (myrank==0) then + write(IMAIN,*) + write(IMAIN,*) 'using external IPATI model from:',trim(LOCAL_PATH) + write(IMAIN,*) 'scaling factor: ',SCALING_FACTOR + write(IMAIN,*) + endif + + ! processors name + write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_' + + ! density + allocate( rho_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array rho_read' + + filename = prname_lp(1:len_trim(prname_lp))//'rho.bin' + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading rho.bin file' + endif + + read(28) rho_read + close(28) + + ! vp + allocate( vp_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array vp_read' + + filename = prname_lp(1:len_trim(prname_lp))//'vp.bin' + open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier) + if( ier /= 0 ) then + print*,'error opening file: ',trim(filename) + stop 'error reading vp.bin file' + endif + + read(28) vp_read + close(28) + + ! vs scaled from vp + allocate( vs_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array vs_read' + + ! scaling + vs_read = vp_read * SCALING_FACTOR + + ! isotropic model parameters + rhostore = rho_read + kappastore = rhostore * ( vp_read * vp_read - FOUR_THIRDS * vs_read * vs_read ) + mustore = rhostore * vs_read * vs_read + rho_vp = rhostore * vp_read + rho_vs = rhostore * vs_read + + ! free memory + deallocate( rho_read,vp_read,vs_read) + + end subroutine model_ipati diff --git a/src/generate_databases/model_salton_trough.f90 b/src/generate_databases/model_salton_trough.f90 index 7e17ff64d..4776887ae 100644 --- a/src/generate_databases/model_salton_trough.f90 +++ b/src/generate_databases/model_salton_trough.f90 @@ -43,7 +43,7 @@ module salton_trough_par double precision, parameter :: GOCAD_ST_V_X = 109670.74, GOCAD_ST_V_Y = 71530.72 double precision, parameter :: GOCAD_ST_W_Z = 7666.334 double precision, parameter :: GOCAD_ST_NO_DATA_VALUE = -99999 - + real,dimension(:,:,:),allocatable :: vp_array end module salton_trough_par @@ -63,8 +63,8 @@ subroutine model_salton_trough_broadcast(myrank) ! local parameters integer :: ier - - + + allocate(vp_array(GOCAD_ST_NU,GOCAD_ST_NV,GOCAD_ST_NW),stat=ier) if( ier /= 0 ) call exit_mpi(myrank,'error allocating vp_array for salton') @@ -92,11 +92,11 @@ subroutine read_salton_sea_model() ! array length reclen=(GOCAD_ST_NU * GOCAD_ST_NV * GOCAD_ST_NW) * 4 - - ! file name + + ! file name call get_value_string(SALTON_SEA_MODEL_FILE,'model.SALTON_SEA_MODEL_FILE', & 'DATA/st_3D_block_harvard/regrid3_vel_p.bin') - + ! reads in file values open(11,file=trim(SALTON_SEA_MODEL_FILE), & status='old',action='read',form='unformatted',access='direct',recl=reclen,iostat=ios) @@ -104,10 +104,10 @@ subroutine read_salton_sea_model() print *,'error opening file: ',trim(SALTON_SEA_MODEL_FILE),' iostat = ', ios call exit_mpi(0,'Error opening file salton trough') endif - - read(11,rec=1,iostat=ios) vp_array + + read(11,rec=1,iostat=ios) vp_array if (ios /= 0) stop 'Error reading vp_array' - + close(11) end subroutine read_salton_sea_model @@ -126,7 +126,7 @@ subroutine model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) use create_regions_mesh_ext_par implicit none - ! GLL point + ! GLL point double precision, intent(in) :: xmesh,ymesh,zmesh ! density, Vp and Vs @@ -139,20 +139,20 @@ subroutine model_salton_trough(xmesh,ymesh,zmesh,rho,vp,vs,qmu_atten) double precision :: uc,vc,wc double precision :: vp_st,vs_st,rho_st - ! GLL point location converted to u,v,w + ! GLL point location converted to u,v,w call vx_xyz2uvw(xmesh,ymesh,zmesh,uc,vc,wc) ! model values call vx_xyz_interp(uc,vc,wc,vp_st,vs_st,rho_st) - + ! converts to custom real vp = vp_st vs = vs_st rho = rho_st - + ! no attenuation info qmu_atten = 0.0 - + end subroutine model_salton_trough ! @@ -221,7 +221,7 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho) v8 = vp_array(i,j+1,k+1) vi = vp_array(i+ixi,j+ieta,k+iga) ! print *, v1, v2, v3, v4, v5, v6, v7, v8 - + if ((v1 - GOCAD_ST_NO_DATA_VALUE) > eps .and. & (v2 - GOCAD_ST_NO_DATA_VALUE) > eps .and. & (v3 - GOCAD_ST_NO_DATA_VALUE) > eps .and. & @@ -240,7 +240,7 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho) v8 * (1-xi) * eta * ga) else if ((vi - GOCAD_ST_NO_DATA_VALUE) > eps) then vp = dble(vi) - + ! else if ((v1 - GOCAD_ST_NO_DATA_VALUE) > eps) then ! vp = dble(v1) ! else if ((v2 - GOCAD_ST_NO_DATA_VALUE) > eps) then @@ -257,21 +257,21 @@ subroutine vx_xyz_interp(uc,vc,wc, vp, vs, rho) ! vp = dble(v7) ! else if ((v7 - GOCAD_ST_NO_DATA_VALUE) > eps) then ! vp = dble(v8) - + else vp = GOCAD_ST_NO_DATA_VALUE endif - + ! depth zmesh = wc / (GOCAD_ST_NW - 1) * GOCAD_ST_W_Z + GOCAD_ST_O_Z - + ! vs if (zmesh > -8500.) then vs = vp / (2 - (0.27*zmesh/(-8500))) else vs = vp/1.73 endif - + ! density if (vp > 2160.) then rho = vp/3 + 1280. diff --git a/src/generate_databases/model_tomography.f90 b/src/generate_databases/model_tomography.f90 index bdb83bb4e..5b4fbd802 100644 --- a/src/generate_databases/model_tomography.f90 +++ b/src/generate_databases/model_tomography.f90 @@ -82,7 +82,7 @@ subroutine model_tomography_broadcast(myrank) !if(myrank == 0) call read_external_model() ! broadcast the information read on the master to the nodes, e.g. !call bcast_all_i(nrecord,1) - + !if( myrank /= 0 ) then ! allocate( vp_tomography(1:nrecord) ,stat=ier) ! if( ier /= 0 ) stop 'error allocating array vp_tomography' @@ -161,7 +161,6 @@ subroutine read_model_tomography(myrank) end subroutine read_model_tomography - ! !------------------------------------------------------------------------------------------------- ! @@ -179,7 +178,7 @@ subroutine model_tomography(xmesh,ymesh,zmesh,rho_final,vp_final,vs_final,qmu_at !double precision, intent(in) :: VP_MIN,VS_MIN,RHO_MIN,VP_MAX,VS_MAX,RHO_MAX double precision, intent(in) :: xmesh,ymesh,zmesh - + real(kind=CUSTOM_REAL), intent(out) :: vp_final,vs_final,rho_final,qmu_atten ! local parameters @@ -367,6 +366,6 @@ subroutine model_tomography(xmesh,ymesh,zmesh,rho_final,vp_final,vs_final,qmu_at if(rho_final > RHO_MAX) rho_final = RHO_MAX ! attenuation: arbitrary value, see maximum in constants.h - qmu_atten = ATTENUATION_COMP_MAXIMUM + qmu_atten = ATTENUATION_COMP_MAXIMUM end subroutine model_tomography diff --git a/src/generate_databases/save_arrays_solver.f90 b/src/generate_databases/save_arrays_solver.f90 index 5d4a3fa55..08da5c804 100644 --- a/src/generate_databases/save_arrays_solver.f90 +++ b/src/generate_databases/save_arrays_solver.f90 @@ -28,114 +28,124 @@ ! for external mesh subroutine save_arrays_solver_ext_mesh(nspec,nglob, & - xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, & - gammaxstore,gammaystore,gammazstore, & - jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, & - rhostore,kappastore,mustore, & - rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, & - rho_vpI,rho_vpII,rho_vsI, & - rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, & - OCEANS,rmass_ocean_load,NGLOB_OCEAN,& +! xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, & +! gammaxstore,gammaystore,gammazstore, & +! jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, & +! rhostore,kappastore,mustore, & +! rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, & +! rho_vpI,rho_vpII,rho_vsI, & +! rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, & + OCEANS, & +! rmass_ocean_load,NGLOB_OCEAN,& ibool, & - xstore_dummy,ystore_dummy,zstore_dummy, & - abs_boundary_normal,abs_boundary_jacobian2Dw, & - abs_boundary_ijk,abs_boundary_ispec, & - num_abs_boundary_faces, & - free_surface_normal,free_surface_jacobian2Dw, & - free_surface_ijk,free_surface_ispec, & - num_free_surface_faces, & - coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, & - coupling_ac_el_ijk,coupling_ac_el_ispec, & - num_coupling_ac_el_faces, & - coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, & - coupling_ac_po_ijk,coupling_ac_po_ispec, & - num_coupling_ac_po_faces, & - coupling_el_po_normal,coupling_el_po_jacobian2Dw, & - coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, & - coupling_po_el_ispec,num_coupling_el_po_faces, & +! xstore_dummy,ystore_dummy,zstore_dummy, & +! abs_boundary_normal,abs_boundary_jacobian2Dw, & +! abs_boundary_ijk,abs_boundary_ispec, & +! num_abs_boundary_faces, & +! free_surface_normal,free_surface_jacobian2Dw, & +! free_surface_ijk,free_surface_ispec, & +! num_free_surface_faces, & +! coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, & +! coupling_ac_el_ijk,coupling_ac_el_ispec, & +! num_coupling_ac_el_faces, & +! coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, & +! coupling_ac_po_ijk,coupling_ac_po_ispec, & +! num_coupling_ac_po_faces, & +! coupling_el_po_normal,coupling_el_po_jacobian2Dw, & +! coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, & +! coupling_po_el_ispec,num_coupling_el_po_faces, & num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, & max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, & - prname,SAVE_MESH_FILES, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store, & - ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, & - ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, & - nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, & - num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & - num_phase_ispec_elastic,phase_ispec_inner_elastic, & - num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic) +! prname, & + SAVE_MESH_FILES, & + ANISOTROPY & +! NSPEC_ANISO, & +! c11store,c12store,c13store,c14store,c15store,c16store, & +! c22store,c23store,c24store,c25store,c26store,c33store, & +! c34store,c35store,c36store,c44store,c45store,c46store, & +! c55store,c56store,c66store, & +! ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, & +! ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, & +! nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, & +! num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & +! num_phase_ispec_elastic,phase_ispec_inner_elastic, & +! num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, & +! num_colors_outer_acoustic,num_colors_inner_acoustic, & +! num_elem_colors_acoustic, & +! num_colors_outer_elastic,num_colors_inner_elastic, & +! num_elem_colors_elastic, & + ) + + use create_regions_mesh_ext_par implicit none - include "constants.h" +! include "constants.h" integer :: nspec,nglob ! jacobian - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, & - etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, & +! etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs ! attenuation - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store ! material - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore - real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore - real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore - real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI - real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, & - rmass_solid_poroelastic,rmass_fluid_poroelastic +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore +! real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore +! real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore +! real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI +! real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, & +! rmass_solid_poroelastic,rmass_fluid_poroelastic ! ocean load logical :: OCEANS - integer :: NGLOB_OCEAN - real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load +! integer :: NGLOB_OCEAN +! real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load ! mesh coordinates integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool - real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy +! real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy ! absorbing boundary surface - integer :: num_abs_boundary_faces - real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces) - real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces) - integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces) - integer :: abs_boundary_ispec(num_abs_boundary_faces) +! integer :: num_abs_boundary_faces +! real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces) +! real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces) +! integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces) +! integer :: abs_boundary_ispec(num_abs_boundary_faces) ! free surface - integer :: num_free_surface_faces - real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces) - real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces) - integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces) - integer :: free_surface_ispec(num_free_surface_faces) +! integer :: num_free_surface_faces +! real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces) +! real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces) +! integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces) +! integer :: free_surface_ispec(num_free_surface_faces) ! acoustic-elastic coupling surface - integer :: num_coupling_ac_el_faces - real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces) - real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces) - integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces) - integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces) +! integer :: num_coupling_ac_el_faces +! real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces) +! real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces) +! integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces) +! integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces) ! acoustic-poroelastic coupling surface - integer :: num_coupling_ac_po_faces - real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces) - real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces) - integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces) - integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces) +! integer :: num_coupling_ac_po_faces +! real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces) +! real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces) +! integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces) +! integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces) ! elastic-poroelastic coupling surface - integer :: num_coupling_el_po_faces - real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces) - real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces) - integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces) - integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces) - integer :: coupling_el_po_ispec(num_coupling_el_po_faces) - integer :: coupling_po_el_ispec(num_coupling_el_po_faces) +! integer :: num_coupling_el_po_faces +! real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces) +! real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces) +! integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces) +! integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces) +! integer :: coupling_el_po_ispec(num_coupling_el_po_faces) +! integer :: coupling_po_el_ispec(num_coupling_el_po_faces) ! MPI interfaces integer :: num_interfaces_ext_mesh @@ -146,35 +156,43 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, & integer :: max_nibool_interfaces_ext_mesh ! file name - character(len=256) prname +! character(len=256) prname logical :: SAVE_MESH_FILES ! anisotropy logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store +! integer :: NSPEC_ANISO +! real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & +! c11store,c12store,c13store,c14store,c15store,c16store, & +! c22store,c23store,c24store,c25store,c26store,c33store, & +! c34store,c35store,c36store,c44store,c45store,c46store, & +! c55store,c56store,c66store ! material domain flags - logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic +! logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic ! inner/outer elements - logical,dimension(nspec) :: ispec_is_inner - integer :: nspec_inner_acoustic,nspec_outer_acoustic - integer :: nspec_inner_elastic,nspec_outer_elastic - integer :: nspec_inner_poroelastic,nspec_outer_poroelastic +! logical,dimension(nspec) :: ispec_is_inner +! integer :: nspec_inner_acoustic,nspec_outer_acoustic +! integer :: nspec_inner_elastic,nspec_outer_elastic +! integer :: nspec_inner_poroelastic,nspec_outer_poroelastic - integer :: num_phase_ispec_acoustic - integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic +! integer :: num_phase_ispec_acoustic +! integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic - integer :: num_phase_ispec_elastic - integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic +! integer :: num_phase_ispec_elastic +! integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - integer :: num_phase_ispec_poroelastic - integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic +! integer :: num_phase_ispec_poroelastic +! integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic + + ! mesh coloring +! integer :: num_colors_outer_acoustic,num_colors_inner_acoustic +! integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: & +! num_elem_colors_acoustic +! integer :: num_colors_outer_elastic,num_colors_inner_elastic +! integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: & +! num_elem_colors_elastic ! local parameters real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: v_tmp @@ -183,13 +201,15 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, & !real(kind=CUSTOM_REAL) :: minimum(1) integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy integer :: ier,i - logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION +! logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION character(len=256) :: filename integer, dimension(:), allocatable :: iglob_tmp integer :: j,inum -! saves mesh file proc***_external_mesh.bin + logical,parameter :: DEBUG = .false. + + ! saves mesh file proc***_external_mesh.bin filename = prname(1:len_trim(prname))//'external_mesh.bin' open(unit=IOUT,file=trim(filename),status='unknown',action='write',form='unformatted',iostat=ier) if( ier /= 0 ) stop 'error opening database proc######_external_mesh.bin' @@ -247,6 +267,8 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, & ! poroelastic call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION ) if( POROELASTIC_SIMULATION ) then + stop 'not implemented yet: write rmass_solid_poroelastic .. ' + write(IOUT) rmass_solid_poroelastic write(IOUT) rmass_fluid_poroelastic write(IOUT) rhoarraystore @@ -370,6 +392,18 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, & if(num_phase_ispec_poroelastic > 0 ) write(IOUT) phase_ispec_inner_poroelastic endif + ! mesh coloring + if( USE_MESH_COLORING_GPU ) then + if( ACOUSTIC_SIMULATION ) then + write(IOUT) num_colors_outer_acoustic,num_colors_inner_acoustic + write(IOUT) num_elem_colors_acoustic + endif + if( ELASTIC_SIMULATION ) then + write(IOUT) num_colors_outer_elastic,num_colors_inner_elastic + write(IOUT) num_elem_colors_elastic + endif + endif + close(IOUT) @@ -461,180 +495,181 @@ subroutine save_arrays_solver_ext_mesh(nspec,nglob, & xstore_dummy,ystore_dummy,zstore_dummy,ibool, & qmu_attenuation_store,filename) + deallocate(v_tmp) + ! VTK file output - ! acoustic-elastic domains - if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then - ! saves points on acoustic-elastic coupling interface - allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier) - if( ier /= 0 ) stop 'error allocating array iglob_tmp' - inum = 0 - iglob_tmp(:) = 0 - do i=1,num_coupling_ac_el_faces - do j=1,NGLLSQUARE - inum = inum+1 - iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), & - coupling_ac_el_ijk(2,j,i), & - coupling_ac_el_ijk(3,j,i), & - coupling_ac_el_ispec(i) ) + if( DEBUG ) then + ! acoustic-elastic domains + if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then + ! saves points on acoustic-elastic coupling interface + allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array iglob_tmp' + inum = 0 + iglob_tmp(:) = 0 + do i=1,num_coupling_ac_el_faces + do j=1,NGLLSQUARE + inum = inum+1 + iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), & + coupling_ac_el_ijk(2,j,i), & + coupling_ac_el_ijk(3,j,i), & + coupling_ac_el_ispec(i) ) + enddo enddo - enddo - filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic' - call write_VTK_data_points(nglob, & - xstore_dummy,ystore_dummy,zstore_dummy, & - iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, & - filename) - - ! saves acoustic/elastic flag - allocate(v_tmp_i(nspec),stat=ier) - if( ier /= 0 ) stop 'error allocating array v_tmp_i' - do i=1,nspec - if( ispec_is_acoustic(i) ) then - v_tmp_i(i) = 1 - else if( ispec_is_elastic(i) ) then - v_tmp_i(i) = 2 - else - v_tmp_i(i) = 0 - endif - enddo - filename = prname(1:len_trim(prname))//'acoustic_elastic_flag' - call write_VTK_data_elem_i(nspec,nglob, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool, & - v_tmp_i,filename) - - deallocate(iglob_tmp,v_tmp_i) - endif - - ! saves free surface points - if( num_free_surface_faces > 0 ) then - ! saves free surface interface points - allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier) - if( ier /= 0 ) stop 'error allocating array iglob_tmp' - inum = 0 - iglob_tmp(:) = 0 - do i=1,num_free_surface_faces - do j=1,NGLLSQUARE - inum = inum+1 - iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), & - free_surface_ijk(2,j,i), & - free_surface_ijk(3,j,i), & - free_surface_ispec(i) ) + filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic' + call write_VTK_data_points(nglob, & + xstore_dummy,ystore_dummy,zstore_dummy, & + iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, & + filename) + + ! saves acoustic/elastic flag + allocate(v_tmp_i(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array v_tmp_i' + do i=1,nspec + if( ispec_is_acoustic(i) ) then + v_tmp_i(i) = 1 + else if( ispec_is_elastic(i) ) then + v_tmp_i(i) = 2 + else + v_tmp_i(i) = 0 + endif enddo - enddo - filename = prname(1:len_trim(prname))//'free_surface' - call write_VTK_data_points(nglob, & - xstore_dummy,ystore_dummy,zstore_dummy, & - iglob_tmp,NGLLSQUARE*num_free_surface_faces, & - filename) - - deallocate(iglob_tmp) - endif - - ! debug: saves 1. MPI interface - !if( num_interfaces_ext_mesh >= 1 ) then - ! filename = prname(1:len_trim(prname))//'MPI_1_points' - ! call write_VTK_data_points(nglob, & - ! xstore_dummy,ystore_dummy,zstore_dummy, & - ! ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), & - ! nibool_interfaces_ext_mesh(1), & - ! filename) - !endif - - ! acoustic-poroelastic domains - if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then - ! saves points on acoustic-poroelastic coupling interface - allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier) - if( ier /= 0 ) stop 'error allocating array iglob_tmp' - inum = 0 - iglob_tmp(:) = 0 - do i=1,num_coupling_ac_po_faces - do j=1,NGLLSQUARE - inum = inum+1 - iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), & - coupling_ac_po_ijk(2,j,i), & - coupling_ac_po_ijk(3,j,i), & - coupling_ac_po_ispec(i) ) + filename = prname(1:len_trim(prname))//'acoustic_elastic_flag' + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + v_tmp_i,filename) + + deallocate(iglob_tmp,v_tmp_i) + endif + + ! saves free surface points + if( num_free_surface_faces > 0 ) then + ! saves free surface interface points + allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array iglob_tmp' + inum = 0 + iglob_tmp(:) = 0 + do i=1,num_free_surface_faces + do j=1,NGLLSQUARE + inum = inum+1 + iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), & + free_surface_ijk(2,j,i), & + free_surface_ijk(3,j,i), & + free_surface_ispec(i) ) + enddo enddo - enddo - filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic' - call write_VTK_data_points(nglob, & - xstore_dummy,ystore_dummy,zstore_dummy, & - iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, & - filename) - - ! saves acoustic/poroelastic flag - allocate(v_tmp_i(nspec),stat=ier) - if( ier /= 0 ) stop 'error allocating array v_tmp_i' - do i=1,nspec - if( ispec_is_acoustic(i) ) then - v_tmp_i(i) = 1 - else if( ispec_is_poroelastic(i) ) then - v_tmp_i(i) = 2 - else - v_tmp_i(i) = 0 - endif - enddo - filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag' - call write_VTK_data_elem_i(nspec,nglob, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool, & - v_tmp_i,filename) - - deallocate(v_tmp_i,iglob_tmp) - endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) - - ! elastic-poroelastic domains - if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then - ! saves points on elastic-poroelastic coupling interface - allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier) - if( ier /= 0 ) stop 'error allocating array iglob_tmp' - inum = 0 - iglob_tmp(:) = 0 - do i=1,num_coupling_el_po_faces - do j=1,NGLLSQUARE - inum = inum+1 - iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), & - coupling_el_po_ijk(2,j,i), & - coupling_el_po_ijk(3,j,i), & - coupling_el_po_ispec(i) ) + filename = prname(1:len_trim(prname))//'free_surface' + call write_VTK_data_points(nglob, & + xstore_dummy,ystore_dummy,zstore_dummy, & + iglob_tmp,NGLLSQUARE*num_free_surface_faces, & + filename) + + deallocate(iglob_tmp) + endif + + ! debug: saves 1. MPI interface + !if( num_interfaces_ext_mesh >= 1 ) then + ! filename = prname(1:len_trim(prname))//'MPI_1_points' + ! call write_VTK_data_points(nglob, & + ! xstore_dummy,ystore_dummy,zstore_dummy, & + ! ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), & + ! nibool_interfaces_ext_mesh(1), & + ! filename) + !endif + + ! acoustic-poroelastic domains + if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then + ! saves points on acoustic-poroelastic coupling interface + allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array iglob_tmp' + inum = 0 + iglob_tmp(:) = 0 + do i=1,num_coupling_ac_po_faces + do j=1,NGLLSQUARE + inum = inum+1 + iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), & + coupling_ac_po_ijk(2,j,i), & + coupling_ac_po_ijk(3,j,i), & + coupling_ac_po_ispec(i) ) + enddo enddo - enddo - filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic' - call write_VTK_data_points(nglob, & - xstore_dummy,ystore_dummy,zstore_dummy, & - iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, & - filename) - - ! saves elastic/poroelastic flag - allocate(v_tmp_i(nspec),stat=ier) - if( ier /= 0 ) stop 'error allocating array v_tmp_i' - do i=1,nspec - if( ispec_is_elastic(i) ) then - v_tmp_i(i) = 1 - else if( ispec_is_poroelastic(i) ) then - v_tmp_i(i) = 2 - else - v_tmp_i(i) = 0 - endif - enddo - filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag' - call write_VTK_data_elem_i(nspec,nglob, & - xstore_dummy,ystore_dummy,zstore_dummy,ibool, & - v_tmp_i,filename) - - deallocate(v_tmp_i,iglob_tmp) - endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION - - !debug: saves 1. MPI interface - ! if( num_interfaces_ext_mesh >= 1 ) then - ! filename = prname(1:len_trim(prname))//'MPI_1_points' - ! call write_VTK_data_points(nglob, & - ! xstore_dummy,ystore_dummy,zstore_dummy, & - ! ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), & - ! nibool_interfaces_ext_mesh(1), & - ! filename) - ! endif - ! - - deallocate(v_tmp) + filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic' + call write_VTK_data_points(nglob, & + xstore_dummy,ystore_dummy,zstore_dummy, & + iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, & + filename) + + ! saves acoustic/poroelastic flag + allocate(v_tmp_i(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array v_tmp_i' + do i=1,nspec + if( ispec_is_acoustic(i) ) then + v_tmp_i(i) = 1 + else if( ispec_is_poroelastic(i) ) then + v_tmp_i(i) = 2 + else + v_tmp_i(i) = 0 + endif + enddo + filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag' + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + v_tmp_i,filename) + + deallocate(v_tmp_i,iglob_tmp) + endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) + + ! elastic-poroelastic domains + if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then + ! saves points on elastic-poroelastic coupling interface + allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array iglob_tmp' + inum = 0 + iglob_tmp(:) = 0 + do i=1,num_coupling_el_po_faces + do j=1,NGLLSQUARE + inum = inum+1 + iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), & + coupling_el_po_ijk(2,j,i), & + coupling_el_po_ijk(3,j,i), & + coupling_el_po_ispec(i) ) + enddo + enddo + filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic' + call write_VTK_data_points(nglob, & + xstore_dummy,ystore_dummy,zstore_dummy, & + iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, & + filename) + + ! saves elastic/poroelastic flag + allocate(v_tmp_i(nspec),stat=ier) + if( ier /= 0 ) stop 'error allocating array v_tmp_i' + do i=1,nspec + if( ispec_is_elastic(i) ) then + v_tmp_i(i) = 1 + else if( ispec_is_poroelastic(i) ) then + v_tmp_i(i) = 2 + else + v_tmp_i(i) = 0 + endif + enddo + filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag' + call write_VTK_data_elem_i(nspec,nglob, & + xstore_dummy,ystore_dummy,zstore_dummy,ibool, & + v_tmp_i,filename) + + deallocate(v_tmp_i,iglob_tmp) + endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION + + !debug: saves 1. MPI interface + !if( num_interfaces_ext_mesh >= 1 ) then + ! filename = prname(1:len_trim(prname))//'MPI_1_points' + ! call write_VTK_data_points(nglob, & + ! xstore_dummy,ystore_dummy,zstore_dummy, & + ! ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), & + ! nibool_interfaces_ext_mesh(1), & + ! filename) + !endif + endif ! DEBUG endif ! SAVE_MESH_FILES diff --git a/src/meshfem3D/check_mesh_quality.f90 b/src/meshfem3D/check_mesh_quality.f90 index 8c262d69f..ad5fcaa6b 100644 --- a/src/meshfem3D/check_mesh_quality.f90 +++ b/src/meshfem3D/check_mesh_quality.f90 @@ -54,7 +54,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & logical :: CREATE_VTK_FILES character(len=256) prname - + ! local parameters integer :: ispec,ispec_min_edge_length,ispec_max_edge_length,ispec_max_skewness, & ispec_max_skewness_MPI,skewness_max_rank,NSPEC_ALL_SLICES @@ -91,6 +91,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & !logical :: USE_OPENDX !character(len=256):: line + integer,dimension(1) :: tmp_ispec_max_skewness,tmp_ispec_max_skewness_MPI ! debug: for vtk output real(kind=CUSTOM_REAL),dimension(:),allocatable :: tmp1 @@ -123,6 +124,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & ispec_min_edge_length = -1 ispec_max_edge_length = -1 + ispec_max_skewness = -1 ! debug: for vtk output if( CREATE_VTK_FILES ) then @@ -145,7 +147,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & if(equiangle_skewness > equiangle_skewness_max) ispec_max_skewness = ispec if( CREATE_VTK_FILES ) tmp1(ispec) = equiangle_skewness - + ! compute minimum and maximum of quality numbers equiangle_skewness_min = min(equiangle_skewness_min,equiangle_skewness) edge_aspect_ratio_min = min(edge_aspect_ratio_min,edge_aspect_ratio) @@ -183,7 +185,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & if((myrank == skewness_max_rank) .and. (myrank /= 0)) then - call send_i_t(ispec_max_skewness,1,0) + tmp_ispec_max_skewness(1) = ispec_max_skewness + call send_i_t(tmp_ispec_max_skewness,1,0) end if @@ -191,7 +194,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & if(skewness_max_rank /= myrank) then - call recv_i_t(ispec_max_skewness_MPI,1,skewness_max_rank) + call recv_i_t(tmp_ispec_max_skewness_MPI,1,skewness_max_rank) + ispec_max_skewness_MPI = tmp_ispec_max_skewness_MPI(1) else ispec_max_skewness_MPI = ispec_max_skewness end if @@ -347,8 +351,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & end if ! debug: for vtk output - if( CREATE_VTK_FILES ) then - ! vtk file output + if( CREATE_VTK_FILES ) then + ! vtk file output open(66,file=prname(1:len_trim(prname))//'skewness.vtk',status='unknown') write(66,'(a)') '# vtk DataFile Version 3.1' write(66,'(a)') 'material model VTK file' @@ -357,7 +361,7 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & write(66, '(a,i12,a)') 'POINTS ', nglob, ' float' do ipoin = 1,nglob write(66,*) sngl(x(ipoin)),sngl(y(ipoin)),sngl(z(ipoin)) - enddo + enddo write(66,*) "" ! note: indices for vtk start at 0 @@ -381,8 +385,8 @@ subroutine check_mesh_quality(myrank,VP_MAX,NGLOB,NSPEC,x,y,z,ibool, & write(66,*) tmp1(ispec) enddo write(66,*) "" - close(66) - + close(66) + deallocate(tmp1) endif diff --git a/src/meshfem3D/compute_parameters.f90 b/src/meshfem3D/compute_parameters.f90 index 1efb4186d..d2349d9c7 100644 --- a/src/meshfem3D/compute_parameters.f90 +++ b/src/meshfem3D/compute_parameters.f90 @@ -315,7 +315,7 @@ subroutine compute_parameters(NER,NEX_XI,NEX_ETA,NPROC_XI,NPROC_ETA, & NSPEC2DMAX_XMIN_XMAX = NSPEC2D_B_ETA NSPEC2DMAX_YMIN_YMAX = NSPEC2D_B_XI - !debug + !debug !print*,'nspec minmax:',NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_yMAX ! theoretical number of Gauss-Lobatto points in radial direction diff --git a/src/meshfem3D/create_regions_mesh.f90 b/src/meshfem3D/create_regions_mesh.f90 index a493f72f2..8433af266 100644 --- a/src/meshfem3D/create_regions_mesh.f90 +++ b/src/meshfem3D/create_regions_mesh.f90 @@ -397,7 +397,7 @@ subroutine create_regions_mesh(xgrid,ygrid,zgrid,ibool, & if( ier /= 0 ) stop 'error allocating array nodes_coords' nodes_coords(:,:) = 0.0d0 ibool(:,:,:,:) = 0 - + do ispec=1,nspec ieoff = NGLLCUBE*(ispec-1) ilocnum = 0 @@ -413,7 +413,7 @@ subroutine create_regions_mesh(xgrid,ygrid,zgrid,ibool, & enddo enddo enddo - + ! checks ibool range if(minval(ibool(:,:,:,:)) /= 1 .or. maxval(ibool(:,:,:,:)) /= nglob) then print*,'error ibool: maximum value ',maxval(ibool(:,:,:,:)) ,'should be ',nglob diff --git a/src/meshfem3D/create_visual_files.f90 b/src/meshfem3D/create_visual_files.f90 index fca4e0f13..01e304d99 100644 --- a/src/meshfem3D/create_visual_files.f90 +++ b/src/meshfem3D/create_visual_files.f90 @@ -71,7 +71,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI ibool(2,2,2,ispec) end do close(64) - + end if @@ -127,7 +127,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI end if if( CREATE_VTK_FILES ) then - ! vtk file output + ! vtk file output open(66,file=prname(1:len_trim(prname))//'mesh.vtk',status='unknown') write(66,'(a)') '# vtk DataFile Version 3.1' write(66,'(a)') 'material model VTK file' @@ -136,7 +136,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI write(66, '(a,i12,a)') 'POINTS ', nglob, ' float' do ipoin = 1,nglob write(66,*) sngl(nodes_coords(ipoin,1)),sngl(nodes_coords(ipoin,2)),sngl(nodes_coords(ipoin,3)) - enddo + enddo write(66,*) "" ! note: indices for vtk start at 0 @@ -161,7 +161,7 @@ subroutine create_visual_files(CREATE_ABAQUS_FILES,CREATE_DX_FILES,CREATE_VTK_FI enddo write(66,*) "" close(66) - + endif call sync_all() diff --git a/src/meshfem3D/meshfem3D.f90 b/src/meshfem3D/meshfem3D.f90 index a98eddaaa..fd88cd702 100644 --- a/src/meshfem3D/meshfem3D.f90 +++ b/src/meshfem3D/meshfem3D.f90 @@ -405,7 +405,7 @@ subroutine meshfem3D write(IMAIN,*) 'error: number of MPI processors actually run on: ',sizeprocs print* print*, 'error meshfem3D: number of processors supposed to run on: ',NPROC - print*, 'error meshfem3D: number of MPI processors actually run on: ',sizeprocs + print*, 'error meshfem3D: number of MPI processors actually run on: ',sizeprocs print* endif call exit_MPI(myrank,'wrong number of MPI processes') diff --git a/src/meshfem3D/store_boundaries.f90 b/src/meshfem3D/store_boundaries.f90 index 80508d785..0eb636698 100644 --- a/src/meshfem3D/store_boundaries.f90 +++ b/src/meshfem3D/store_boundaries.f90 @@ -90,21 +90,21 @@ subroutine store_boundaries(myrank,iboun,nspec,& ! on boundary: ymax if(iboun(4,ispec)) then ispecb4=ispecb4+1 - if( ispecb4 > NSPEC2DMAX_YMIN_YMAX ) stop 'error NSPEC2DMAX_YMIN_YMAX too small' + if( ispecb4 > NSPEC2DMAX_YMIN_YMAX ) stop 'error NSPEC2DMAX_YMIN_YMAX too small' ibelm_ymax(ispecb4)=ispec endif ! on boundary: bottom if(iboun(5,ispec)) then ispecb5=ispecb5+1 - if( ispecb5 > NSPEC2D_BOTTOM ) stop 'error NSPEC2D_BOTTOM too small' + if( ispecb5 > NSPEC2D_BOTTOM ) stop 'error NSPEC2D_BOTTOM too small' ibelm_bottom(ispecb5)=ispec endif ! on boundary: top if(iboun(6,ispec)) then ispecb6=ispecb6+1 - if( ispecb6 > NSPEC2D_TOP ) stop 'error NSPEC2D_TOP too small' + if( ispecb6 > NSPEC2D_TOP ) stop 'error NSPEC2D_TOP too small' ibelm_top(ispecb6)=ispec endif diff --git a/src/shared/assemble_MPI_scalar.f90 b/src/shared/assemble_MPI_scalar.f90 index 1e1c13cc0..c74035ad4 100644 --- a/src/shared/assemble_MPI_scalar.f90 +++ b/src/shared/assemble_MPI_scalar.f90 @@ -356,4 +356,3 @@ subroutine assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,array_val, & end subroutine assemble_MPI_scalar_ext_mesh_w - diff --git a/src/shared/check_mesh_resolution.f90 b/src/shared/check_mesh_resolution.f90 index b435ffe81..ab9a38505 100644 --- a/src/shared/check_mesh_resolution.f90 +++ b/src/shared/check_mesh_resolution.f90 @@ -154,6 +154,7 @@ subroutine check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ystore,zs if( SAVE_MESH_FILES ) tmp1(ispec) = cmax endif + ! suggested timestep dt_suggested = COURANT_SUGGESTED * distance_min / max( vpmax,vsmax ) dt_suggested_glob = min( dt_suggested_glob, dt_suggested) @@ -406,9 +407,11 @@ subroutine check_mesh_resolution_poro(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ysto !real(kind=CUSTOM_REAL),parameter :: NELEM_PER_WAVELENGTH = 1.5 logical :: has_vs_zero,has_vp2_zero + real(kind=CUSTOM_REAL),dimension(1) :: tmp_val ! debug: for vtk output real(kind=CUSTOM_REAL),dimension(:),allocatable :: tmp1,tmp2 + integer:: ier character(len=256) :: filename,prname @@ -680,11 +683,15 @@ subroutine check_mesh_resolution_poro(myrank,NSPEC_AB,NGLOB_AB,ibool,xstore,ysto model_speed_max = vsmax_glob endif endif - call bcast_all_cr(model_speed_max,1) + tmp_val(1) = model_speed_max + call bcast_all_cr(tmp_val,1) + model_speed_max = tmp_val(1) ! returns minimum period if( myrank == 0 ) min_resolved_period = pmax_glob - call bcast_all_cr(min_resolved_period,1) + tmp_val(1) = min_resolved_period + call bcast_all_cr(tmp_val,1) + min_resolved_period = tmp_val(1) ! debug: for vtk output if( SAVE_MESH_FILES ) then diff --git a/src/shared/combine_vol_data.f90 b/src/shared/combine_vol_data.f90 index eeb380328..332bf5847 100644 --- a/src/shared/combine_vol_data.f90 +++ b/src/shared/combine_vol_data.f90 @@ -38,7 +38,7 @@ module vtk ! maximum number of slices integer,parameter :: MAX_NUM_NODES = 600 - + end module vtk ! @@ -76,9 +76,9 @@ program combine_paraview_data_ext_mesh integer :: i, ios, it, ier integer :: iproc, proc1, proc2, num_node - + integer,dimension(MAX_NUM_NODES) :: node_list - + integer :: np, ne, npp, nee, nelement, njunk character(len=256) :: sline, arg(6), filename, indir, outdir @@ -382,9 +382,9 @@ subroutine cvd_count_totals_ext_mesh(num_node,node_list,LOCAL_PATH,& include 'constants.h' integer,intent(in) :: num_node - integer,dimension(MAX_NUM_NODES),intent(in) :: node_list + integer,dimension(MAX_NUM_NODES),intent(in) :: node_list character(len=256),intent(in) :: LOCAL_PATH - + integer,intent(out) :: npp,nee logical,intent(in) :: HIGH_RESOLUTION_MESH diff --git a/src/shared/compute_arrays_source.f90 b/src/shared/compute_arrays_source.f90 new file mode 100644 index 000000000..ef6729311 --- /dev/null +++ b/src/shared/compute_arrays_source.f90 @@ -0,0 +1,352 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== + + subroutine compute_arrays_source(ispec_selected_source, & + xi_source,eta_source,gamma_source,sourcearray, & + Mxx,Myy,Mzz,Mxy,Mxz,Myz, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + xigll,yigll,zigll,nspec) + + implicit none + + include "constants.h" + + integer ispec_selected_source + integer nspec + + double precision xi_source,eta_source,gamma_source + double precision Mxx,Myy,Mzz,Mxy,Mxz,Myz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xix,xiy,xiz,etax,etay,etaz, & + gammax,gammay,gammaz + + real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearray + + double precision xixd,xiyd,xizd,etaxd,etayd,etazd,gammaxd,gammayd,gammazd + +! Gauss-Lobatto-Legendre points of integration and weights + double precision, dimension(NGLLX) :: xigll + double precision, dimension(NGLLY) :: yigll + double precision, dimension(NGLLZ) :: zigll + +! source arrays + double precision, dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrayd + double precision, dimension(NGLLX,NGLLY,NGLLZ) :: G11,G12,G13,G21,G22,G23,G31,G32,G33 + double precision, dimension(NGLLX) :: hxis,hpxis + double precision, dimension(NGLLY) :: hetas,hpetas + double precision, dimension(NGLLZ) :: hgammas,hpgammas + + integer k,l,m + +! calculate G_ij for general source location +! the source does not necessarily correspond to a Gauss-Lobatto point + do m=1,NGLLZ + do l=1,NGLLY + do k=1,NGLLX + + xixd = dble(xix(k,l,m,ispec_selected_source)) + xiyd = dble(xiy(k,l,m,ispec_selected_source)) + xizd = dble(xiz(k,l,m,ispec_selected_source)) + etaxd = dble(etax(k,l,m,ispec_selected_source)) + etayd = dble(etay(k,l,m,ispec_selected_source)) + etazd = dble(etaz(k,l,m,ispec_selected_source)) + gammaxd = dble(gammax(k,l,m,ispec_selected_source)) + gammayd = dble(gammay(k,l,m,ispec_selected_source)) + gammazd = dble(gammaz(k,l,m,ispec_selected_source)) + + G11(k,l,m) = Mxx*xixd+Mxy*xiyd+Mxz*xizd + G12(k,l,m) = Mxx*etaxd+Mxy*etayd+Mxz*etazd + G13(k,l,m) = Mxx*gammaxd+Mxy*gammayd+Mxz*gammazd + G21(k,l,m) = Mxy*xixd+Myy*xiyd+Myz*xizd + G22(k,l,m) = Mxy*etaxd+Myy*etayd+Myz*etazd + G23(k,l,m) = Mxy*gammaxd+Myy*gammayd+Myz*gammazd + G31(k,l,m) = Mxz*xixd+Myz*xiyd+Mzz*xizd + G32(k,l,m) = Mxz*etaxd+Myz*etayd+Mzz*etazd + G33(k,l,m) = Mxz*gammaxd+Myz*gammayd+Mzz*gammazd + + enddo + enddo + enddo + +! compute Lagrange polynomials at the source location + call lagrange_any(xi_source,NGLLX,xigll,hxis,hpxis) + call lagrange_any(eta_source,NGLLY,yigll,hetas,hpetas) + call lagrange_any(gamma_source,NGLLZ,zigll,hgammas,hpgammas) + +! calculate source array + do m=1,NGLLZ + do l=1,NGLLY + do k=1,NGLLX + call multiply_arrays_source(sourcearrayd,G11,G12,G13,G21,G22,G23, & + G31,G32,G33,hxis,hpxis,hetas,hpetas,hgammas,hpgammas,k,l,m) + enddo + enddo + enddo + +! distinguish between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + sourcearray(:,:,:,:) = sngl(sourcearrayd(:,:,:,:)) + else + sourcearray(:,:,:,:) = sourcearrayd(:,:,:,:) + endif + + end subroutine compute_arrays_source + +!============================================================================= + + subroutine compute_arrays_adjoint_source(myrank, adj_source_file, & + xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, & + xigll,yigll,zigll, & + it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC) + + implicit none + + include 'constants.h' + +! input + integer myrank, NSTEP, it_sub_adj, NTSTEP_BETWEEN_READ_ADJSRC + + double precision xi_receiver, eta_receiver, gamma_receiver + + character(len=*) adj_source_file + +! output + real(kind=CUSTOM_REAL),dimension(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ) :: adj_sourcearray + +! Gauss-Lobatto-Legendre points of integration and weights + double precision, dimension(NGLLX) :: xigll + double precision, dimension(NGLLY) :: yigll + double precision, dimension(NGLLZ) :: zigll + + double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), & + hgammar(NGLLZ), hpgammar(NGLLZ) + + real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM) + + integer icomp, itime, i, j, k, ios, it_start, it_end + double precision :: junk + ! note: should have same order as orientation in write_seismograms_to_file() + character(len=3),dimension(NDIM) :: comp != (/ "BHE", "BHN", "BHZ" /) + character(len=256) :: filename + + ! gets channel names + do icomp=1,NDIM + call write_channel_name(icomp,comp(icomp)) + enddo + + ! range of the block we need to read + it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1 + it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1 + + !adj_sourcearray(:,:,:,:,:) = 0. + adj_src = 0._CUSTOM_REAL + + ! loops over components + do icomp = 1, NDIM + + filename = OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH))//'/../SEM/'//trim(adj_source_file)//'.'//comp(icomp)//'.adj' + open(unit=IIN,file=trim(filename),status='old',action='read',iostat = ios) + ! cycles to next file (this might be more error prone) + !if (ios /= 0) cycle + ! requires adjoint files to exist (users will have to be more careful in setting up adjoint runs) + if (ios /= 0) call exit_MPI(myrank, ' file '//trim(filename)//' does not exist - required for adjoint runs') + + ! reads in adjoint source trace + !! skip unused blocks + do itime = 1, it_start-1 + read(IIN,*,iostat=ios) junk, junk + if( ios /= 0 ) & + call exit_MPI(myrank, & + 'file '//trim(filename)//' has wrong length, please check with your simulation duration (1111)') + enddo + !! read the block we need + do itime = it_start, it_end + read(IIN,*,iostat=ios) junk, adj_src(itime-it_start+1,icomp) + !!! used to check whether we read the correct block + ! if (icomp==1) print *, junk, adj_src(itime-it_start+1,icomp) + if( ios /= 0 ) & + call exit_MPI(myrank, & + 'file '//trim(filename)//' has wrong length, please check with your simulation duration (2222)') + enddo + close(IIN) + + enddo + + ! lagrange interpolators for receiver location + call lagrange_any(xi_receiver,NGLLX,xigll,hxir,hpxir) + call lagrange_any(eta_receiver,NGLLY,yigll,hetar,hpetar) + call lagrange_any(gamma_receiver,NGLLZ,zigll,hgammar,hpgammar) + + ! interpolates adjoint source onto GLL points within this element + do k = 1, NGLLZ + do j = 1, NGLLY + do i = 1, NGLLX + adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) + enddo + enddo + enddo + +end subroutine compute_arrays_adjoint_source + + +! ======================================================================= + +! compute array for acoustic source + subroutine compute_arrays_source_acoustic(xi_source,eta_source,gamma_source,& + sourcearray,xigll,yigll,zigll,factor_source) + + implicit none + + include "constants.h" + + double precision :: xi_source,eta_source,gamma_source + real(kind=CUSTOM_REAL) :: factor_source + real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearray + +! Gauss-Lobatto-Legendre points of integration and weights + double precision, dimension(NGLLX) :: xigll + double precision, dimension(NGLLY) :: yigll + double precision, dimension(NGLLZ) :: zigll + +! local parameters +! source arrays + double precision, dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sourcearrayd + double precision, dimension(NGLLX) :: hxis,hpxis + double precision, dimension(NGLLY) :: hetas,hpetas + double precision, dimension(NGLLZ) :: hgammas,hpgammas + integer :: i,j,k + +! initializes + sourcearray(:,:,:,:) = 0._CUSTOM_REAL + sourcearrayd(:,:,:,:) = 0.d0 + +! computes Lagrange polynomials at the source location + call lagrange_any(xi_source,NGLLX,xigll,hxis,hpxis) + call lagrange_any(eta_source,NGLLY,yigll,hetas,hpetas) + call lagrange_any(gamma_source,NGLLZ,zigll,hgammas,hpgammas) + +! calculates source array for interpolated location + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! identical source array components in x,y,z-direction + sourcearrayd(:,i,j,k) = hxis(i)*hetas(j)*hgammas(k)*dble(factor_source) + enddo + enddo + enddo + +! distinguish between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + sourcearray(:,:,:,:) = sngl(sourcearrayd(:,:,:,:)) + else + sourcearray(:,:,:,:) = sourcearrayd(:,:,:,:) + endif + + end subroutine compute_arrays_source_acoustic + + +! testing read in adjoint sources block by block + +!!!the original version +!!! +!!!subroutine compute_arrays_adjoint_source(myrank, adj_source_file, & +!!! xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, & +!!! xigll,yigll,zigll,NSTEP) +!!! +!!! +!!! implicit none +!!! +!!! include 'constants.h' +!!! +!!!! input +!!! integer myrank, NSTEP +!!! +!!! double precision xi_receiver, eta_receiver, gamma_receiver +!!! +!!! character(len=*) adj_source_file +!!! +!!!! output +!!! real(kind=CUSTOM_REAL),dimension(NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) :: adj_sourcearray +!!! +!!!! Gauss-Lobatto-Legendre points of integration and weights +!!! double precision, dimension(NGLLX) :: xigll +!!! double precision, dimension(NGLLY) :: yigll +!!! double precision, dimension(NGLLZ) :: zigll +!!! +!!! double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), & +!!! hgammar(NGLLZ), hpgammar(NGLLZ) +!!! +!!! real(kind=CUSTOM_REAL) :: adj_src(NSTEP,NDIM) +!!! +!!! integer icomp, itime, i, j, k, ios +!!! double precision :: junk +!!! ! note: should have same order as orientation in write_seismograms_to_file() +!!! character(len=3),dimension(NDIM) :: comp = (/ "BHE", "BHN", "BHZ" /) +!!! character(len=256) :: filename +!!! +!!! !adj_sourcearray(:,:,:,:,:) = 0. +!!! adj_src = 0._CUSTOM_REAL +!!! +!!! ! loops over components +!!! do icomp = 1, NDIM +!!! +!!! filename = 'SEM/'//trim(adj_source_file) // '.'// comp(icomp) // '.adj' +!!! open(unit=IIN,file=trim(filename),status='old',action='read',iostat = ios) +!!! if (ios /= 0) cycle ! cycles to next file +!!! !if (ios /= 0) call exit_MPI(myrank, ' file '//trim(filename)//'does not exist') +!!! +!!! ! reads in adjoint source trace +!!! do itime = 1, NSTEP +!!! +!!! ! things become a bit tricky because of the Newmark time scheme at +!!! ! the very beginning of the time loop. however, when we read in the backward/reconstructed +!!! ! wavefields at the end of the first time loop, we can use the adjoint source index from 1 to NSTEP +!!! ! (and then access it in reverse NSTEP-it+1 down to 1, for it=1,..NSTEP; see compute_add_sources*.f90). +!!! read(IIN,*,iostat=ios) junk, adj_src(itime,icomp) +!!! if( ios /= 0 ) & +!!! call exit_MPI(myrank, & +!!! 'file '//trim(filename)//' has wrong length, please check with your simulation duration') +!!! enddo +!!! close(IIN) +!!! +!!! enddo +!!! +!!! ! lagrange interpolators for receiver location +!!! call lagrange_any(xi_receiver,NGLLX,xigll,hxir,hpxir) +!!! call lagrange_any(eta_receiver,NGLLY,yigll,hetar,hpetar) +!!! call lagrange_any(gamma_receiver,NGLLZ,zigll,hgammar,hpgammar) +!!! +!!! ! interpolates adjoint source onto GLL points within this element +!!! do k = 1, NGLLZ +!!! do j = 1, NGLLY +!!! do i = 1, NGLLX +!!! adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) +!!! enddo +!!! enddo +!!! enddo +!!! +!!!end subroutine compute_arrays_adjoint_source + diff --git a/src/shared/constants.h.in b/src/shared/constants.h.in index 058a919d4..c769df374 100644 --- a/src/shared/constants.h.in +++ b/src/shared/constants.h.in @@ -70,10 +70,16 @@ ! for optimized routines by Deville et al. (2002) integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY +!!----------------------------------------------------------- +!! +!! seismogram and i/o output +!! +!!----------------------------------------------------------- ! ouput format of seismograms, ASCII or binary logical, parameter :: SEISMOGRAMS_BINARY = .false. + ! output format of seismograms, Seismic Unix (binary with 240-byte-headers) - logical, parameter :: SU_FORMAT=.false. + logical, parameter :: SU_FORMAT = .false. ! input, output and main MPI I/O files integer, parameter :: ISTANDARD_OUTPUT = 6 @@ -101,6 +107,12 @@ ! ignore variable name field (junk) at the beginning of each input line logical, parameter :: IGNORE_JUNK = .true.,DONT_IGNORE_JUNK = .false. +!!----------------------------------------------------------- +!! +!! source/receiver setup +!! +!!----------------------------------------------------------- + ! flag to print the details of source location logical, parameter :: SHOW_DETAILS_LOCATE_SOURCE = .false. @@ -123,6 +135,12 @@ ! use directory OUTPUT_FILES/ for seismogram output logical,parameter :: USE_OUTPUT_FILES_PATH = .true. +!!----------------------------------------------------------- +!! +!! absorption and PML +!! +!!----------------------------------------------------------- + ! absorb top surface ! (defined in mesh as 'free_surface_file') logical,parameter :: ABSORB_FREE_SURFACE = .false. @@ -133,6 +151,12 @@ ! (user parameters can be specified in PML_init.f90) logical,parameter :: ABSORB_USE_PML = .false. +!!----------------------------------------------------------- +!! +!! directory structure +!! +!!----------------------------------------------------------- + ! paths for inputs and outputs files character(len=256), parameter :: IN_DATA_FILES_PATH = '../in_data_files/' character(len=256), parameter :: MF_IN_DATA_FILES_PATH = '../in_data_files/meshfem3D_files/' @@ -173,6 +197,9 @@ double precision, parameter :: FACTOR_FORCE_SOURCE = 1.d15 integer, parameter :: COMPONENT_FORCE_SOURCE = 3 ! takes direction in comp E/N/Z = 1/2/3 +! set to use a Ricker source time function instead of a gaussian + logical, parameter :: USE_RICKER_IPATI = .false. + ! use this t0 as earliest starting time rather than the automatically calculated one ! (must be positive and bigger than the automatically one to be effective; ! simulation will start at t = - t0) @@ -190,6 +217,13 @@ ! that completes an orthonormal. logical, parameter :: EXT_MESH_RECV_NORMAL = .false. + +!!----------------------------------------------------------- +!! +!! image outputs +!! +!!----------------------------------------------------------- + ! shakemaps and movies can not be generated during the same run. Mutually exclusive. logical, parameter :: EXTERNAL_MESH_MOVIE_SURFACE = .false. logical, parameter :: EXTERNAL_MESH_CREATE_SHAKEMAP = .false. @@ -215,6 +249,26 @@ ! this is an absolute value for normalized coordinates in the Earth double precision, parameter :: SMALLVAL_TOL = 1.d-10 +!!----------------------------------------------------------- +!! +!! mesh optimization +!! +!!----------------------------------------------------------- + +! add mesh coloring for the GPU + MPI implementation + logical, parameter :: USE_MESH_COLORING_GPU = .false. + integer, parameter :: MAX_NUMBER_OF_COLORS = 1000 + +! enhanced coloring: +! +! using Droux algorithm +! try several times with one more color before giving up + logical, parameter :: USE_DROUX_OPTIMIZATION = .false. + integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15 +! +! postprocess the colors to balance them if Droux (1993) algorithm is not used + logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .false. + !------------------------------------------------------ !----------- do not modify anything below ------------- !------------------------------------------------------ @@ -316,12 +370,44 @@ ! flag for projection from latitude/longitude to UTM, and back integer, parameter :: ILONGLAT2UTM = 0, IUTM2LONGLAT = 1 +!!----------------------------------------------------------- +!! +!! OCEANS load approximation +!! +!!----------------------------------------------------------- ! minimum thickness in meters to include the effect of the oceans ! to avoid taking into account spurious oscillations in topography model double precision, parameter :: MINIMUM_THICKNESS_3D_OCEANS = 10.d0 ! density of sea water real(kind=CUSTOM_REAL), parameter :: RHO_OCEANS = 1020.0 +!!----------------------------------------------------------- +!! +!! GRAVITY +!! +!!----------------------------------------------------------- +! gravitational constant + double precision, parameter :: GRAV = 6.6723d-11 +! number of layers in PREM + integer, parameter :: NR = 640 +! for lookup table for gravity every 100 m in radial direction of Earth model + integer, parameter :: NRAD_GRAVITY = 70000 +! R_EARTH is the radius of the bottom of the oceans (radius of Earth in m) + double precision, parameter :: R_EARTH = 6371000.d0 +! same radius in km + double precision, parameter :: R_EARTH_KM = R_EARTH / 1000.d0 +! radius of the Earth for gravity calculation + double precision, parameter :: R_EARTH_GRAVITY = 6371000.d0 +! radius of the ocean floor for gravity calculation + double precision, parameter :: ROCEAN_GRAVITY = 6368000.d0 +! average density in the full Earth to normalize equation + double precision, parameter :: RHOAV = 5514.3d0 + +!!----------------------------------------------------------- +!! +!! DOMAINS +!! +!!----------------------------------------------------------- ! material domain ids integer, parameter :: IDOMAIN_ACOUSTIC = 1 integer, parameter :: IDOMAIN_ELASTIC = 2 @@ -339,3 +425,5 @@ integer, parameter :: IMODEL_USER_EXTERNAL = 6 integer, parameter :: IMODEL_GLL = 7 integer, parameter :: IMODEL_SALTON_TROUGH = 8 + integer, parameter :: IMODEL_1D_PREM_PB = 9 + integer, parameter :: IMODEL_IPATI = 10 diff --git a/src/shared/detect_surface.f90 b/src/shared/detect_surface.f90 index a1a4c6629..17c6c80ab 100644 --- a/src/shared/detect_surface.f90 +++ b/src/shared/detect_surface.f90 @@ -703,9 +703,10 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,& real(kind=CUSTOM_REAL), dimension(nglob) :: xstore,ystore,zstore !local parameters - real(kind=CUSTOM_REAL) :: mindist + real(kind=CUSTOM_REAL) :: mindist,distance integer, dimension(:), allocatable :: valence_external_mesh integer :: ispec,i,j,k,iglob,ier,count + real(kind=CUSTOM_REAL),parameter :: TOLERANCE_DISTANCE = 0.9 ! detecting surface points/elements (based on valence check on NGLL points) for external mesh allocate(valence_external_mesh(nglob),stat=ier) @@ -722,6 +723,7 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,& + (ystore(ibool(1,1,1,:)) - ystore(ibool(2,1,1,:)))**2 & + (zstore(ibool(1,1,1,:)) - zstore(ibool(2,1,1,:)))**2 ) mindist = sqrt(mindist) + distance = TOLERANCE_DISTANCE*mindist ! sets valence value to one corresponding to process rank for points on cross-sections do ispec = 1, nspec @@ -732,7 +734,7 @@ subroutine detect_surface_PNM_GIF_image(NPROC,nglob,nspec,ibool,& ! chooses points close to cross-section if( abs((xstore(iglob)-section_xorg)*section_nx + (ystore(iglob)-section_yorg)*section_ny & - + (zstore(iglob)-section_zorg)*section_nz ) < 0.8*mindist ) then + + (zstore(iglob)-section_zorg)*section_nz ) < distance ) then ! sets valence to 1 for points on cross-sections valence_external_mesh(iglob) = myrank+1 endif diff --git a/src/shared/get_element_face.f90 b/src/shared/get_element_face.f90 index 8c1f07159..2b068d36c 100644 --- a/src/shared/get_element_face.f90 +++ b/src/shared/get_element_face.f90 @@ -24,7 +24,8 @@ ! !===================================================================== -subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,& + + subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,& ibool,nspec,nglob, & xstore_dummy,ystore_dummy,zstore_dummy, & iface_id ) @@ -174,13 +175,13 @@ subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,& endif -end subroutine get_element_face_id + end subroutine get_element_face_id ! -!---- +!------------------------------------------------------------------------------------------------- ! -subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB ) + subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB ) ! returns local indices in ijk_face for specified face @@ -368,10 +369,10 @@ subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB ) end subroutine get_element_face_gll_indices ! -!---- +!------------------------------------------------------------------------------------------------- ! -subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, & + subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, & ibool,nspec,nglob, & xstore_dummy,ystore_dummy,zstore_dummy, & normal) @@ -463,13 +464,13 @@ subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, & endif !print*,'face ',iface,'scalarproduct:',tmp -end subroutine get_element_face_normal + end subroutine get_element_face_normal ! -!---- +!------------------------------------------------------------------------------------------------- ! -subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, & + subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, & ibool,nspec,nglob, & xstore_dummy,ystore_dummy,zstore_dummy, & normal,idirect) @@ -565,5 +566,50 @@ subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, & idirect = 1 endif -end subroutine get_element_face_normal_idirect + end subroutine get_element_face_normal_idirect + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, & + ibool,nspec,nglob,xstore_dummy,ystore_dummy,zstore_dummy, & + iface_all_corner_ijk) + + implicit none + + include "constants.h" + + integer,intent(in) :: ispec,iface_ref,nspec,nglob + + ! face corner locations + real(kind=CUSTOM_REAL),dimension(NGNOD2D),intent(out) :: xcoord,ycoord,zcoord + integer,dimension(NGNOD2D),intent(out):: iglob_corners_ref + + ! index array + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + ! global point locations + real(kind=CUSTOM_REAL) :: xstore_dummy(nglob),ystore_dummy(nglob),zstore_dummy(nglob) + + ! assumes NGNOD2D == 4 + integer,dimension(3,4,6) :: iface_all_corner_ijk + + ! local parameters + integer :: icorner,i,j,k + + ! loops over corners + do icorner = 1,NGNOD2D + i = iface_all_corner_ijk(1,icorner,iface_ref) + j = iface_all_corner_ijk(2,icorner,iface_ref) + k = iface_all_corner_ijk(3,icorner,iface_ref) + + ! global reference indices + iglob_corners_ref(icorner) = ibool(i,j,k,ispec) + + ! reference corner coordinates + xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner)) + ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner)) + zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner)) + enddo + end subroutine get_element_corners diff --git a/src/shared/param_reader.c b/src/shared/param_reader.c index 701f965c3..51896d89d 100644 --- a/src/shared/param_reader.c +++ b/src/shared/param_reader.c @@ -142,7 +142,7 @@ FC_FUNC_(param_read,PARAM_READ)(char * string_read, int * string_read_len, char /* Regular expression for parsing lines from param file. ** Good luck reading this regular expression. Basically, the lines of ** the parameter file should be of the form 'parameter = value', - ** optionally followed by a #-delimited comment. + ** optionally followed by a #-delimited comment. ** 'value' can be any number of space- or tab-separated words. Blank ** lines, lines containing only white space and lines whose first non- ** whitespace character is '#' are ignored. White space is generally diff --git a/src/shared/read_parameter_file.f90 b/src/shared/read_parameter_file.f90 index 8a6e75508..999beeee0 100644 --- a/src/shared/read_parameter_file.f90 +++ b/src/shared/read_parameter_file.f90 @@ -50,7 +50,7 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & logical ANISOTROPY,SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION,SUPPRESS_UTM_PROJECTION character(len=256) LOCAL_PATH,CMTSOLUTION - + ! local variables integer ::ios,icounter,isource,idummy,nproc_eta_old,nproc_xi_old double precision :: hdur,minval_hdur @@ -101,7 +101,7 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & ! define the velocity model call read_value_string(MODEL, 'model.MODEL') if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file: MODEL' - + call read_value_logical(OCEANS, 'model.OCEANS') if(err_occurred() /= 0) return call read_value_logical(TOPOGRAPHY, 'model.TOPOGRAPHY') @@ -178,15 +178,15 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & call get_value_string(CMTSOLUTION, 'solver.CMTSOLUTION',& IN_DATA_FILES_PATH(1:len_trim(IN_DATA_FILES_PATH))//'CMTSOLUTION') - open(unit=1,file=CMTSOLUTION,iostat=ios,status='old',action='read') + open(unit=21,file=trim(CMTSOLUTION),iostat=ios,status='old',action='read') if(ios /= 0) stop 'error opening CMTSOLUTION file' icounter = 0 do while(ios == 0) - read(1,"(a)",iostat=ios) dummystring + read(21,"(a)",iostat=ios) dummystring if(ios == 0) icounter = icounter + 1 enddo - close(1) + close(21) if(mod(icounter,NLINES_PER_CMTSOLUTION_SOURCE) /= 0) & stop 'total number of lines in CMTSOLUTION file should be a multiple of NLINES_PER_CMTSOLUTION_SOURCE' @@ -195,27 +195,27 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & if(NSOURCES < 1) stop 'need at least one source in CMTSOLUTION file' ! compute the minimum value of hdur in CMTSOLUTION file - open(unit=1,file=CMTSOLUTION,status='old',action='read') + open(unit=21,file=trim(CMTSOLUTION),status='old',action='read') minval_hdur = HUGEVAL do isource = 1,NSOURCES ! skip other information do idummy = 1,3 - read(1,"(a)") dummystring + read(21,"(a)") dummystring enddo ! read half duration and compute minimum - read(1,"(a)") dummystring + read(21,"(a)") dummystring read(dummystring(15:len_trim(dummystring)),*) hdur minval_hdur = min(minval_hdur,hdur) ! skip other information do idummy = 1,9 - read(1,"(a)") dummystring + read(21,"(a)") dummystring enddo enddo - close(1) + close(21) ! one cannot use a Heaviside source for the movies if((MOVIE_SURFACE .or. MOVIE_VOLUME) .and. sqrt(minval_hdur**2 + HDUR_MOVIE**2) < TINYVAL) & @@ -242,24 +242,62 @@ subroutine read_parameter_file( NPROC,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,DT, & case( '1d_cascadia') IMODEL = IMODEL_1D_CASCADIA - ! user models + ! user models + case( '1d_prem_pb' ) + IMODEL = IMODEL_1D_PREM_PB + case( 'aniso' ) + IMODEL = IMODEL_DEFAULT + ANISOTROPY = .true. + case( 'external' ) + IMODEL = IMODEL_USER_EXTERNAL + case( 'ipati' ) + IMODEL = IMODEL_IPATI + case( 'gll' ) + IMODEL = IMODEL_GLL case( 'salton_trough') IMODEL = IMODEL_SALTON_TROUGH case( 'tomo' ) IMODEL = IMODEL_TOMO - case( 'external' ) - IMODEL = IMODEL_USER_EXTERNAL - case( 'aniso' ) - IMODEL = IMODEL_DEFAULT - ANISOTROPY = .true. - case default + + case default print* print*,'********** model not recognized: ',trim(MODEL),' **************' print*,'********** using model: default',' **************' print* IMODEL = IMODEL_DEFAULT end select - + + ! check + if( IMODEL == IMODEL_IPATI ) then + if( USE_RICKER_IPATI .eqv. .false. ) stop 'please set USE_RICKER_IPATI to true in shared/constants.h and recompile' + endif end subroutine read_parameter_file +! +!------------------------------------------------------------------------------------------------- +! + + subroutine read_gpu_mode(GPU_MODE,GRAVITY) + + implicit none + include "constants.h" + + logical :: GPU_MODE + logical :: GRAVITY + + ! initializes flags + GPU_MODE = .false. + GRAVITY = .false. + + ! opens file Par_file + call open_parameter_file() + + call read_value_logical(GPU_MODE, 'solver.GPU_MODE') + call read_value_logical(GRAVITY, 'solver.GRAVITY') + + ! close parameter file + call close_parameter_file() + + end subroutine read_gpu_mode + diff --git a/src/shared/read_topo_bathy_file.f90 b/src/shared/read_topo_bathy_file.f90 index 4cded288b..f05313209 100644 --- a/src/shared/read_topo_bathy_file.f90 +++ b/src/shared/read_topo_bathy_file.f90 @@ -48,7 +48,7 @@ subroutine read_topo_bathy_file(itopo_bathy,NX_TOPO,NY_TOPO) print*,'error opening topography file: ',trim(TOPO_FILE) stop 'error opening topography file' endif - + ! reads in values do iy=1,NY_TOPO do ix=1,NX_TOPO @@ -74,7 +74,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, & include "constants.h" real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target - + real(kind=CUSTOM_REAL),intent(out) :: target_elevation integer :: NX_TOPO,NY_TOPO @@ -87,7 +87,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, & double precision :: xval,yval,long,lat double precision :: long_corner,lat_corner,ratio_xi,ratio_eta integer :: icornerlong,icornerlat - + ! get coordinates of current point xval = dble(x_target) yval = dble(y_target) @@ -127,7 +127,7 @@ subroutine get_topo_bathy_elevation(x_target,y_target,target_elevation, & itopo_bathy(icornerlong,icornerlat+1)*(1.-ratio_xi)*ratio_eta end subroutine get_topo_bathy_elevation - + ! !------------------------------------------------------------------------------------------------- ! @@ -143,10 +143,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis include "constants.h" real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target - + real(kind=CUSTOM_REAL),intent(out) :: target_elevation real(kind=CUSTOM_REAL),intent(out) :: target_distmin - + integer :: NSPEC_AB,NGLOB_AB integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool @@ -160,7 +160,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis integer, dimension(3,NGLLSQUARE,num_free_surface_faces) :: free_surface_ijk ! local parameters - real(kind=CUSTOM_REAL),dimension(4) :: elevation_node,dist_node + real(kind=CUSTOM_REAL),dimension(4) :: elevation_node,dist_node real(kind=CUSTOM_REAL) :: distmin,dist integer :: iface,i,j,ispec,iglob,igll,jgll,kgll @@ -172,26 +172,26 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis integer,parameter :: MIDX = (NGLLX+1)/2 integer,parameter :: MIDY = (NGLLY+1)/2 integer,parameter :: MIDZ = (NGLLZ+1)/2 - + real(kind=CUSTOM_REAL) :: typical_size logical :: located_target - + ! initialize target_elevation = 0.0_CUSTOM_REAL target_distmin = HUGEVAL - - + + if(num_free_surface_faces > 0) then ! computes typical size of elements at the surface (uses first element for estimation) if( USE_DISTANCE_CRITERION ) then - ispec = free_surface_ispec(1) + ispec = free_surface_ispec(1) typical_size = (xstore(ibool(1,1,1,ispec)) - xstore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 & + (ystore(ibool(1,1,1,ispec)) - ystore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 ! use 10 times the distance as a criterion for point detection typical_size = 10. * typical_size endif - + ! flag to check that we located at least one target element located_target = .false. @@ -200,7 +200,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis iselected = 2 jselected = 2 iface_selected = 1 - + ! loops over all free surface faces do iface=1,num_free_surface_faces ispec = free_surface_ispec(iface) @@ -208,10 +208,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis ! exclude elements that are too far from target if( USE_DISTANCE_CRITERION ) then iglob = ibool(MIDX,MIDY,MIDZ,ispec) - dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 + dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 if( dist > typical_size ) cycle endif - + ! loop only on points inside the element ! exclude edges to ensure this point is not shared with other elements do j = 2,NGLLY - 1 @@ -220,13 +220,13 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis igll = free_surface_ijk(1,(j-1)*NGLLY+i,iface) jgll = free_surface_ijk(2,(j-1)*NGLLY+i,iface) kgll = free_surface_ijk(3,(j-1)*NGLLY+i,iface) - + iglob = ibool(igll,jgll,kgll,ispec) ! distance (squared) to target dist = ( x_target - xstore(iglob) )**2 + & ( y_target - ystore(iglob) )**2 - + ! keep this point if it is closer to the receiver if(dist < distmin) then distmin = dist @@ -238,8 +238,8 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis located_target = .true. endif enddo - enddo - end do + enddo + end do ! if we have not located a target element, the point is not in this slice ! therefore use first element only for fictitious iterative search @@ -248,7 +248,7 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis jselected = 2 iface_selected = 1 endif - + ! weighted mean at current point of topography elevation of the four closest nodes ! set distance to huge initial value distmin = HUGEVAL @@ -264,23 +264,23 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis jgll = free_surface_ijk(2,(j-jadjust-1)*NGLLY+i-iadjust,iface_selected) kgll = free_surface_ijk(3,(j-jadjust-1)*NGLLY+i-iadjust,iface_selected) iglob = ibool(igll,jgll,kgll,ispec) - + ! stores node infos inode = inode + 1 elevation_node(inode) = zstore(iglob) dist_node(inode) = sqrt( (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 ) end do end do - + ! weighted elevation dist = sum( dist_node(:) ) if(dist < distmin) then - + ! sets new minimum distance (of all 4 closest nodes) distmin = dist target_distmin = distmin - - ! interpolates elevation + + ! interpolates elevation if( dist > TINYVAL ) then target_elevation = (dist_node(1)/dist)*elevation_node(1) + & (dist_node(2)/dist)*elevation_node(2) + & @@ -290,10 +290,10 @@ subroutine get_topo_elevation_free(x_target,y_target,target_elevation,target_dis stop 'error summed distance to node is zero' endif endif - - end do + + end do end do - + end if end subroutine get_topo_elevation_free @@ -313,10 +313,10 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta include "constants.h" real(kind=CUSTOM_REAL),intent(in) :: x_target,y_target - + real(kind=CUSTOM_REAL),intent(out) :: target_elevation real(kind=CUSTOM_REAL),intent(out) :: target_distmin - + integer :: NSPEC_AB,NGLOB_AB integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool @@ -339,26 +339,26 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta integer,parameter :: MIDX = (NGLLX+1)/2 integer,parameter :: MIDY = (NGLLY+1)/2 integer,parameter :: MIDZ = (NGLLZ+1)/2 - + real(kind=CUSTOM_REAL) :: typical_size logical :: located_target - + ! initialize target_elevation = 0.0_CUSTOM_REAL target_distmin = HUGEVAL - - + + if(num_free_surface_faces > 0) then ! computes typical size of elements at the surface (uses first element for estimation) if( USE_DISTANCE_CRITERION ) then - ispec = free_surface_ispec(1) + ispec = free_surface_ispec(1) typical_size = (xstore(ibool(1,1,1,ispec)) - xstore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 & + (ystore(ibool(1,1,1,ispec)) - ystore(ibool(NGLLX,NGLLY,NGLLZ,ispec)))**2 ! use 10 times the distance as a criterion for point detection typical_size = 10. * typical_size endif - + ! flag to check that we located at least one target element located_target = .false. @@ -372,22 +372,22 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta ! excludes elements that are too far from target if( USE_DISTANCE_CRITERION ) then iglob = ibool(MIDX,MIDY,MIDZ,ispec) - dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 + dist = (x_target - xstore(iglob))**2 + (y_target - ystore(iglob))**2 if( dist > typical_size ) cycle endif - + ! loop only on points inside the element do i = 1,NGLLSQUARE igll = free_surface_ijk(1,i,iface) jgll = free_surface_ijk(2,i,iface) kgll = free_surface_ijk(3,i,iface) - + iglob = ibool(igll,jgll,kgll,ispec) ! distance (squared) to target dist = ( x_target - xstore(iglob) )**2 + & ( y_target - ystore(iglob) )**2 - + ! keep this point if it is closer to the receiver if(dist < distmin) then distmin = dist @@ -397,8 +397,8 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta target_distmin = dist located_target = .true. endif - enddo - end do + enddo + end do ! if we have not located a target element, the point is not in this slice ! therefore use first element only for fictitious iterative search @@ -409,10 +409,10 @@ subroutine get_topo_elevation_free_closest(x_target,y_target,target_elevation,ta ! elevation (given in z - coordinate) target_elevation = zstore(iglob) target_distmin = ( x_target - xstore(iglob) )**2 + ( y_target - ystore(iglob) )**2 - located_target = .true. + located_target = .true. endif - + end if end subroutine get_topo_elevation_free_closest - \ No newline at end of file + diff --git a/src/shared/serial.f90 b/src/shared/serial.f90 index aad0e25a5..97bc42672 100644 --- a/src/shared/serial.f90 +++ b/src/shared/serial.f90 @@ -38,6 +38,8 @@ end subroutine stop_all ! double precision function wtime() + + implicit none real :: ct ! note: for simplicity, we take cpu_time which returns the elapsed CPU time in seconds @@ -45,6 +47,7 @@ double precision function wtime() call cpu_time(ct) wtime = ct + end function wtime ! @@ -682,11 +685,10 @@ end subroutine recv_i_t subroutine send_dp(sendbuf, sendcount, dest, sendtag) implicit none - include "constants.h" integer dest,sendtag integer sendcount - real(kind=CUSTOM_REAL),dimension(sendcount):: sendbuf + double precision,dimension(sendcount):: sendbuf stop 'send_dp not implemented for serial code' @@ -697,13 +699,12 @@ end subroutine send_dp subroutine recv_dp(recvbuf, recvcount, dest, recvtag) implicit none - include "constants.h" integer dest,recvtag integer recvcount - real(kind=CUSTOM_REAL),dimension(recvcount):: recvbuf + double precision,dimension(recvcount):: recvbuf - stop 'recv_dp not implemented for parallel code' + stop 'recv_dp not implemented for serial code' end subroutine recv_dp diff --git a/src/shared/smooth_vol_data.f90 b/src/shared/smooth_vol_data.f90 index 9af47205f..9948b6bd8 100644 --- a/src/shared/smooth_vol_data.f90 +++ b/src/shared/smooth_vol_data.f90 @@ -73,7 +73,7 @@ program smooth_vol_data real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: dummy_2 real(kind=CUSTOM_REAL), dimension(:,:,:),allocatable :: dummy_3 real(kind=CUSTOM_REAL), dimension(:,:,:,:,:),allocatable :: dummy_5 - + integer, dimension(:),allocatable :: idummy logical, dimension(:),allocatable :: ldummy integer, dimension(:,:,:),allocatable :: idummy_3 @@ -262,7 +262,7 @@ program smooth_vol_data ! reads mesh file ! ! needs to get point locations, jacobians and MPI neighbours - + ! opens external mesh file write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'/proc',myrank,'_'//'external_mesh.bin' open(unit=27,file=trim(prname_lp),& @@ -273,7 +273,7 @@ program smooth_vol_data call exit_mpi(myrank, 'error reading external mesh file') endif - ! gets number of elements and global points for this partition + ! gets number of elements and global points for this partition read(27) NSPEC_AB read(27) NGLOB_AB @@ -292,7 +292,7 @@ program smooth_vol_data allocate(dummy(NGLLX,NGLLY,NGLLZ,NSPEC_AB), & jacobian(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier) if( ier /= 0 ) stop 'error allocating array dummy and jacobian' - + ! needs jacobian read(27) dummy ! xix read(27) dummy ! xiy @@ -306,7 +306,7 @@ program smooth_vol_data read(27) jacobian ! now skips all until MPI section can be read in - + ! reads in partiton neighbors read(27) dummy ! kappastore read(27) dummy ! mustore @@ -331,7 +331,7 @@ program smooth_vol_data read(27) dummy_1 ! rmass_acoustic read(27) dummy ! rhostore endif - + ! elastic if( ELASTIC_SIMULATION ) then read(27) dummy_1 ! rmass @@ -341,28 +341,28 @@ program smooth_vol_data read(27) dummy ! rho_vp read(27) dummy ! rho_vs endif - + ! poroelastic if( POROELASTIC_SIMULATION ) then read(27) dummy_1 ! rmass_solid_poroelastic read(27) dummy_1 ! rmass_fluid_poroelastic allocate(dummy_5(2,NGLLX,NGLLY,NGLLZ,NSPEC_AB)) read(27) dummy_5 ! rhoarraystore - deallocate(dummy_5) + deallocate(dummy_5) allocate(dummy_5(3,NGLLX,NGLLY,NGLLZ,NSPEC_AB)) read(27) dummy_5 ! kappaarraystore deallocate(dummy_5) read(27) dummy ! etastore - read(27) dummy ! tortstore + read(27) dummy ! tortstore allocate(dummy_5(6,NGLLX,NGLLY,NGLLZ,NSPEC_AB)) read(27) dummy_5 ! permstore deallocate(dummy_5) read(27) dummy ! phistore read(27) dummy ! rho_vpI read(27) dummy ! rho_vpII - read(27) dummy ! rho_vsI + read(27) dummy ! rho_vsI endif - + deallocate(dummy_1) deallocate(dummy) @@ -388,7 +388,7 @@ program smooth_vol_data read(27) dummy_3 ! abs_boundary_normal deallocate( idummy,idummy_3,dummy_2,dummy_3) endif - + ! free surface read(27) idummy_a ! num_free_surface_faces if( idummy_a > 0 ) then @@ -403,7 +403,7 @@ program smooth_vol_data read(27) dummy_3 ! free_surface_normal deallocate( idummy,idummy_3,dummy_2,dummy_3) endif - + ! acoustic-elastic coupling surface read(27) idummy_a ! num_coupling_ac_el_faces if( idummy_a > 0 ) then @@ -411,7 +411,7 @@ program smooth_vol_data idummy_3(3,NGLLSQUARE,idummy_a), & dummy_2(NGLLSQUARE,idummy_a), & dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier) - if( ier /= 0 ) stop 'error allocating array idummy etc.' + if( ier /= 0 ) stop 'error allocating array idummy etc.' read(27) idummy ! coupling_ac_el_ispec read(27) idummy_3 ! coupling_ac_el_ijk read(27) dummy_2 ! coupling_ac_el_jacobian2Dw @@ -426,7 +426,7 @@ program smooth_vol_data idummy_3(3,NGLLSQUARE,idummy_a), & dummy_2(NGLLSQUARE,idummy_a), & dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier) - if( ier /= 0 ) stop 'error allocating array idummy etc.' + if( ier /= 0 ) stop 'error allocating array idummy etc.' read(27) idummy ! coupling_ac_po_ispec read(27) idummy_3 ! coupling_ac_po_ijk read(27) dummy_2 ! coupling_ac_po_jacobian2Dw @@ -441,11 +441,11 @@ program smooth_vol_data idummy_3(3,NGLLSQUARE,idummy_a), & dummy_2(NGLLSQUARE,idummy_a), & dummy_3(NDIM,NGLLSQUARE,idummy_a),stat=ier) - if( ier /= 0 ) stop 'error allocating array idummy etc.' + if( ier /= 0 ) stop 'error allocating array idummy etc.' read(27) idummy ! coupling_el_po_ispec - read(27) idummy ! coupling_po_el_ispec + read(27) idummy ! coupling_po_el_ispec read(27) idummy_3 ! coupling_el_po_ijk - read(27) idummy_3 ! coupling_po_el_ijk + read(27) idummy_3 ! coupling_po_el_ijk read(27) dummy_2 ! coupling_el_po_jacobian2Dw read(27) dummy_3 ! coupling_el_po_normal deallocate( idummy,idummy_3,dummy_2,dummy_3) @@ -461,12 +461,12 @@ program smooth_vol_data read(27) my_neighbours_ext_mesh ! no more information is needed from external mesh files endif - + ! we're done reading in mesh arrays close(27) ! --------------------- - + ! for smoothing, we use cell centers to find and locate nearby elements ! ! sets the location of the center of the elements and local points diff --git a/src/shared/sum_kernels.f90 b/src/shared/sum_kernels.f90 index 143f73268..0f761f1f9 100644 --- a/src/shared/sum_kernels.f90 +++ b/src/shared/sum_kernels.f90 @@ -24,7 +24,7 @@ module sum_par include 'constants.h' - + ! USER PARAMETERS ! by default, this algorithm uses transverse isotropic (bulk,bulk_betav,bulk_betah,eta) kernels to sum up @@ -38,13 +38,13 @@ module sum_par ! 1 permille of maximum for inverting hessian real(kind=CUSTOM_REAL),parameter :: THRESHOLD_HESS = 1.e-3 - + ! sums all hessians before inverting and preconditioning logical, parameter :: USE_HESS_SUM = .true. - + ! uses source mask to blend out source elements logical, parameter :: USE_SOURCE_MASK = .false. - + ! maximum number of kernels listed integer, parameter :: MAX_NUM_NODES = 1000 @@ -53,7 +53,7 @@ module sum_par ! mesh size integer :: NSPEC_AB, NGLOB_AB - + end module sum_par ! @@ -64,7 +64,7 @@ program sum_kernels use sum_par implicit none - + include 'mpif.h' include 'precision.h' @@ -102,7 +102,7 @@ program sum_kernels write(*,*) 'reading kernel list: ' endif call mpi_barrier(MPI_COMM_WORLD,ier) - + ! reads in event list nker=0 open(unit = 20, file = trim(kernel_file_list), status = 'old',iostat = ios) @@ -150,7 +150,7 @@ program sum_kernels ! reads mesh file ! ! needs to get array dimensions - + ! opens external mesh file write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'/proc',myrank,'_'//'external_mesh.bin' open(unit=27,file=trim(prname_lp),& @@ -161,7 +161,7 @@ program sum_kernels call exit_mpi(myrank, 'error reading external mesh file') endif - ! gets number of elements and global points for this partition + ! gets number of elements and global points for this partition read(27) NSPEC_AB read(27) NGLOB_AB @@ -177,7 +177,7 @@ program sum_kernels ! synchronizes call mpi_barrier(MPI_COMM_WORLD,ier) - + ! sums up kernels if( USE_ISO_KERNELS ) then @@ -260,18 +260,18 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) allocate(kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB), & hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB), & total_kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB)) - - - if( USE_HESS_SUM ) then + + + if( USE_HESS_SUM ) then allocate( total_hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB) ) - total_hess(:,:,:,:) = 0.0_CUSTOM_REAL + total_hess(:,:,:,:) = 0.0_CUSTOM_REAL endif - - if( USE_SOURCE_MASK ) then + + if( USE_SOURCE_MASK ) then allocate( mask_source(NGLLX,NGLLY,NGLLZ,NSPEC_AB) ) - mask_source(:,:,:,:) = 1.0_CUSTOM_REAL + mask_source(:,:,:,:) = 1.0_CUSTOM_REAL endif - + ! loops over all event kernels total_kernel = 0._CUSTOM_REAL do iker = 1, nker @@ -295,7 +295,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) read(12) kernel close(12) - ! outputs norm of kernel + ! outputs norm of kernel norm = sum( kernel * kernel ) call mpi_reduce(norm,norm_sum,1,CUSTOM_MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD,ier) if( myrank == 0 ) then @@ -326,7 +326,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) ! note: we take absolute values for hessian (as proposed by Yang) hess = abs(hess) - + ! source mask if( USE_SOURCE_MASK ) then ! reads in mask @@ -339,10 +339,10 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) endif read(12) mask_source close(12) - + ! masks source elements kernel = kernel * mask_source - + endif ! precondition @@ -355,26 +355,26 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) ! inverts hessian call invert_hess( myrank,hess ) - + ! preconditions each event kernel with its hessian kernel = kernel * hess endif - + ! sums all kernels from each event total_kernel = total_kernel + kernel - + enddo ! preconditions summed kernels with summed hessians if( USE_HESS_SUM ) then - + ! inverts hessian matrix call invert_hess( myrank,total_hess ) - + ! preconditions kernel total_kernel = total_kernel * total_hess - + endif ! stores summed kernels @@ -396,7 +396,7 @@ subroutine sum_kernel_pre(kernel_name,kernel_list,nker,myrank) deallocate(kernel,hess,total_kernel) if( USE_HESS_SUM ) deallocate(total_hess) if( USE_SOURCE_MASK ) deallocate(mask_source) - + end subroutine sum_kernel_pre ! @@ -409,7 +409,7 @@ subroutine invert_hess( myrank,hess_matrix ) ! the approximate hessian is only defined for diagonal elements: like ! H_nn = \frac{ \partial^2 \chi }{ \partial \rho_n \partial \rho_n } ! on all GLL points, which are indexed (i,j,k,ispec) - + use sum_par implicit none @@ -423,30 +423,30 @@ subroutine invert_hess( myrank,hess_matrix ) ! local parameters real(kind=CUSTOM_REAL) :: maxh,maxh_all integer :: ier - + ! maximum value of hessian maxh = maxval( abs(hess_matrix) ) ! determines maximum from all slices on master call mpi_allreduce(maxh,maxh_all,1,CUSTOM_MPI_TYPE,MPI_MAX,MPI_COMM_WORLD,ier) - + ! user output if( myrank == 0 ) then print* print*,'hessian maximum: ',maxh_all print* endif - - ! normalizes hessian + + ! normalizes hessian if( maxh_all < 1.e-18 ) then - ! hessian is zero, re-initializes + ! hessian is zero, re-initializes hess_matrix = 1.0_CUSTOM_REAL !call exit_mpi(myrank,'error hessian too small') else ! since hessian has absolute values, this scales between [0,1] - hess_matrix = hess_matrix / maxh_all + hess_matrix = hess_matrix / maxh_all endif - + ! inverts hessian values where( abs(hess_matrix(:,:,:,:)) > THRESHOLD_HESS ) @@ -457,5 +457,5 @@ subroutine invert_hess( myrank,hess_matrix ) ! rescales hessian !hess_matrix = hess_matrix * maxh_all - + end subroutine invert_hess diff --git a/src/shared/write_VTK_data.f90 b/src/shared/write_VTK_data.f90 index d5691080c..b3bf95d86 100644 --- a/src/shared/write_VTK_data.f90 +++ b/src/shared/write_VTK_data.f90 @@ -38,16 +38,19 @@ subroutine write_VTK_data_elem_i(nspec,nglob, & integer :: nspec,nglob -! global coordinates + ! global coordinates integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy -! element flag array + ! element flag array integer, dimension(nspec) :: elem_flag - integer :: ispec,i -! file name - character(len=256) prname_file + ! file name + character(len=256) :: prname_file + !character(len=2), optional, intent(in) :: str_id + + ! local parameters + integer :: ispec,i ! write source and receiver VTK files for Paraview !debug @@ -79,6 +82,11 @@ subroutine write_VTK_data_elem_i(nspec,nglob, & write(IOVTK,*) "" write(IOVTK,'(a,i12)') "CELL_DATA ",nspec + !if( present( str_id ) ) then + ! write(IOVTK,'(a)') "SCALARS elem_flag_"//str_id//" integer" + !else + ! write(IOVTK,'(a)') "SCALARS elem_flag integer" + !endif write(IOVTK,'(a)') "SCALARS elem_flag integer" write(IOVTK,'(a)') "LOOKUP_TABLE default" do ispec = 1,nspec diff --git a/src/shared/write_c_binary.c b/src/shared/write_c_binary.c index 9403dfaf7..f802b60cb 100644 --- a/src/shared/write_c_binary.c +++ b/src/shared/write_c_binary.c @@ -78,7 +78,7 @@ FC_FUNC_(write_real,WRITE_REAL)(float *z) { Jul 18, 2003 - uses functions fopen/fread/fwrite for binary file I/O - + --------------------------------------- */ #define __USE_GNU @@ -122,13 +122,13 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files char * blank; FILE *ft; int ret; - + // checks filesize if( *filesize == 0 ){ perror("Error file size for reading"); exit(EXIT_FAILURE); } - + // Trim the file name. fncopy = strndup(filename, *length); blank = strchr(fncopy, ' '); @@ -137,8 +137,8 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files } /* -//daniel: debug checks file size -// see: +//debug checks file size +// see: //https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file printf("file size: %lld \n",*filesize); int fd; @@ -148,10 +148,10 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files if(fd == -1) { fprintf(stderr, "Error opening file: %s exiting\n", fncopy); exit(-1); - } - if( fstat(fd, &stbuf) == 0 ){ + } + if( fstat(fd, &stbuf) == 0 ){ size = stbuf.st_size; - printf("file size found is: %lld (Bytes) \n",size); + printf("file size found is: %lld (Bytes) \n",size); } close(fd); */ @@ -161,15 +161,15 @@ void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *files ft = fopen( fncopy, "rb+" ); // read binary file if( ft == NULL ) { perror("fopen"); exit(-1); } - + // sets mode for full buffering work_buffer[*fid] = (char *)malloc(MAX_B); ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B ); if( ret != 0 ){ perror("Error setting working buffer"); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } - + // stores file index id fid: from 0 to 8 fp_abs[*fid] = ft; @@ -188,7 +188,7 @@ void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *file char * blank; FILE *ft; int ret; - + // checks filesize if( *filesize == 0 ){ perror("Error file size for writing"); @@ -212,9 +212,9 @@ void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *file ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B ); if( ret != 0 ){ perror("Error setting working buffer"); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } - + // stores file index id fid: from 0 to 8 fp_abs[*fid] = ft; @@ -242,7 +242,7 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){ FILE *ft; int itemlen,remlen,donelen,ret; - void *buf; + char *buf; // file pointer ft = fp_abs[*fid]; @@ -253,7 +253,7 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){ ret = 0; /* -//daniel: debug +//debug float dat[*length/4]; memcpy(dat,buffer,*length); printf("buffer length: %d %d\n",*length,*index); @@ -262,23 +262,23 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){ for(i=0;i< 50;i++){ printf("buffer: %d %e \n",i,dat[i]); } - + // positions file pointer (for reverse time access) // make sure to use 64-bit arithmetic to avoid overflow for very large files long long pos,cur; - - pos = ((long long)*length) * (*index -1 ); + + pos = ((long long)*length) * (*index -1 ); cur = ftell(ft); - - printf("current position: %d %lld %lld \n",*fid,cur,pos); + + printf("current position: %d %lld %lld \n",*fid,cur,pos); ret = fseek(ft, pos , SEEK_SET); if ( ret != 0 ) { perror("Error fseek"); exit(EXIT_FAILURE); } */ - - + + // writes items of maximum MAX_B to the file while (remlen > 0){ @@ -295,10 +295,8 @@ void write_abs_fbin(int *fid, char *buffer, int *length, int *index){ } } -//daniel: debug -// printf("buffer done length: %d %d\n",donelen,*length); - - + //debug + // printf("buffer done length: %d %d\n",donelen,*length); } //void @@ -310,7 +308,7 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){ FILE *ft; int ret,itemlen,remlen,donelen; long long pos; - void *buf; + char *buf; // file pointer ft = fp_abs[*fid]; @@ -318,7 +316,7 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){ // positions file pointer (for reverse time access) // make sure to use 64-bit arithmetic to avoid overflow for very large files pos = ((long long)*length) * (*index -1 ); - + ret = fseek(ft, pos , SEEK_SET); if ( ret != 0 ) { perror("Error fseek"); @@ -330,9 +328,9 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){ buf = buffer; ret = 0; - // cleans buffer + // cleans buffer //memset( buf,0,remlen); - + // reads items of maximum MAX_B to the file while (remlen > 0){ @@ -355,19 +353,18 @@ void read_abs_fbin(int *fid, char *buffer, int *length, int *index){ } /* -//daniel: debug +// debug printf("position: %lld %d %d \n",pos,*length,*index); printf("buffer done length: %d %d\n",donelen,*length); float dat[*length/4]; memcpy(dat,buffer,*length); printf("return buffer length: %d %d\n",*length,*index); - printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer)); + printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer)); int i; for(i=0;i< 50;i++){ printf("return buffer: %d %e \n",i,dat[i]); } */ - } @@ -428,13 +425,13 @@ void open_file_abs_w_map(int *fid, char *filename, int *length, long long *files } /* - // daniel: debug check filesize below or above 2 GB + // debug check filesize below or above 2 GB // filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB float s = *filesize / 1024. / 1024. / 1024.; if( s > 2.0 ){ printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s); } - */ + */ // Trim the file name. fncopy = strndup(filename, *length); diff --git a/src/specfem3D/Makefile.in b/src/specfem3D/Makefile.in index 5ad9cd28e..c0d216071 100644 --- a/src/specfem3D/Makefile.in +++ b/src/specfem3D/Makefile.in @@ -29,6 +29,41 @@ # @configure_input@ +## example: +# CUDA_LIBS = -L/apps/eiger/Cuda-4.0/cuda/lib64 -lcuda -lcudart -lcublas +# CUDA_INC = -I/apps/eiger/Cuda-4.0/cuda/include +# MPI_INC = -I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include +## +#CUDA_LIBS= -L/u/dpeter/install/cuda/lib64 -lcudart -lcublas +#MPI_INC= -I/usr/local/openmpi/1.4.3/gcc/x86_64/include + +# CUDA +# with configure: ./configure --with-cuda CUDA_LIB=.. CUDA_INC=.. MPI_INC=.. + +# default cuda libraries +# runtime library -lcudart needed, others are optional -lcuda -lcublas +@COND_CUDA_TRUE@CUDA_LIBS = -lcudart +@COND_CUDA_FALSE@CUDA_LIBS = + +CUDA_LIB_LOCATION = @CUDA_LIB@ +CUDA_LINK = $(CUDA_LIB_LOCATION) $(CUDA_LIBS) +CUDA_INC = @CUDA_INC@ -I../../ +MPI_INC = @MPI_INC@ + +@COND_CUDA_TRUE@NVCC = nvcc +@COND_CUDA_FALSE@NVCC = @CC@ + +@COND_CUDA_TRUE@NVCC_FLAGS = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS) -DCUDA -gencode=arch=compute_20,code=sm_20 +@COND_CUDA_FALSE@NVCC_FLAGS = $(MPI_INC) $(COND_MPI_CPPFLAGS) + +# OpenMP +# with configure: ./configure --with-openmp FLAGS_NO_CHECK="-openmp .." OPENMP_LIB=.. +@COND_OPENMP_TRUE@OPENMP_LIBS = $(OPENMP_LIB) +@COND_OPENMP_FALSE@OPENMP_LIBS = +@COND_OPENMP_TRUE@COND_OPENMP_FFLAGS = -DOPENMP_MODE +@COND_OPENMP_FALSE@COND_OPENMP_FFLAGS = + + FC = @FC@ FCFLAGS = #@FCFLAGS@ MPIFC = @MPIFC@ @@ -37,7 +72,10 @@ FLAGS_CHECK = @FLAGS_CHECK@ FLAGS_NO_CHECK = @FLAGS_NO_CHECK@ FCFLAGS_f90 = @FCFLAGS_f90@ + SHARED = ../shared/ +CUDAD = ../cuda/ + # E : executables directory E = ../../bin # O : objects directory @@ -47,10 +85,10 @@ L = ../../lib # Output files directory OUTPUT=../../in_out_files/OUTPUT_FILES -FCCOMPILE_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_CHECK) -I$(SHARED) -FCCOMPILE_NO_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_NO_CHECK) -I$(SHARED) -MPIFCCOMPILE_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_CHECK) -I$(SHARED) -MPIFCCOMPILE_NO_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_NO_CHECK) -I$(SHARED) +FCCOMPILE_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED) +FCCOMPILE_NO_CHECK =@FCENV@ ${FC} ${FCFLAGS} $(FLAGS_NO_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED) +MPIFCCOMPILE_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED) +MPIFCCOMPILE_NO_CHECK =@FCENV@ ${MPIFC} ${FCFLAGS} $(FLAGS_NO_CHECK) $(COND_OPENMP_FFLAGS) -I$(SHARED) @COND_MPI_TRUE@FCLINK = $(MPIFCCOMPILE_NO_CHECK) @COND_MPI_FALSE@FCLINK = $(FCCOMPILE_NO_CHECK) @@ -65,46 +103,66 @@ ARFLAGS = cru RANLIB = ranlib libspecfem_a_OBJECTS = \ - $O/assemble_MPI_scalar.o \ - $O/check_mesh_resolution.o \ + $O/assemble_MPI_scalar.shared.o \ + $O/check_mesh_resolution.shared.o \ $O/comp_source_time_function.o \ $O/compute_adj_source_frechet.o \ - $O/compute_arrays_source.o \ - $O/multiply_arrays_source.o \ - $O/create_name_database.o \ - $O/create_serial_name_database.o \ - $O/define_derivation_matrices.o \ - $O/detect_surface.o \ - $O/exit_mpi.o \ - $O/force_ftz.o \ - $O/get_attenuation_model.o \ - $O/get_cmt.o \ - $O/get_element_face.o \ - $O/get_jacobian_boundaries.o \ - $O/get_shape2D.o \ - $O/get_shape3D.o \ - $O/get_value_parameters.o \ - $O/gll_library.o \ - $O/hex_nodes.o \ - $O/lagrange_poly.o \ + $O/compute_arrays_source.shared.o \ + $O/multiply_arrays_source.shared.o \ + $O/create_name_database.shared.o \ + $O/create_serial_name_database.shared.o \ + $O/define_derivation_matrices.shared.o \ + $O/detect_surface.shared.o \ + $O/exit_mpi.shared.o \ + $O/force_ftz.cc.o \ + $O/get_attenuation_model.shared.o \ + $O/get_cmt.shared.o \ + $O/get_element_face.shared.o \ + $O/get_jacobian_boundaries.shared.o \ + $O/get_shape2D.shared.o \ + $O/get_shape3D.shared.o \ + $O/get_value_parameters.shared.o \ + $O/gll_library.shared.o \ + $O/hex_nodes.shared.o \ + $O/lagrange_poly.shared.o \ $O/locate_receivers.o \ $O/locate_source.o \ - $O/netlib_specfun_erf.o \ - $O/param_reader.o \ - $O/prepare_assemble_MPI.o \ - $O/read_topo_bathy_file.o \ - $O/read_parameter_file.o \ - $O/read_value_parameters.o \ - $O/recompute_jacobian.o \ - $O/save_header_file.o \ - $O/sort_array_coordinates.o \ - $O/utm_geo.o \ - $O/write_VTK_data.o \ - $O/write_c_binary.o \ - $(EMPTY_MACRO) + $O/netlib_specfun_erf.shared.o \ + $O/param_reader.cc.o \ + $O/prepare_assemble_MPI.shared.o \ + $O/read_topo_bathy_file.shared.o \ + $O/read_parameter_file.shared.o \ + $O/read_value_parameters.shared.o \ + $O/recompute_jacobian.shared.o \ + $O/save_header_file.shared.o \ + $O/sort_array_coordinates.shared.o \ + $O/utm_geo.shared.o \ + $O/write_VTK_data.shared.o \ + $O/write_c_binary.cc.o + +CUDA_OBJECTS = \ + $O/check_fields_cuda.cuda.o \ + $O/compute_add_sources_acoustic_cuda.cuda.o \ + $O/compute_add_sources_elastic_cuda.cuda.o \ + $O/compute_coupling_cuda.cuda.o \ + $O/compute_forces_acoustic_cuda.cuda.o \ + $O/compute_forces_elastic_cuda.cuda.o \ + $O/compute_kernels_cuda.cuda.o \ + $O/compute_stacey_acoustic_cuda.cuda.o \ + $O/compute_stacey_elastic_cuda.cuda.o \ + $O/it_update_displacement_cuda.cuda.o \ + $O/noise_tomography_cuda.cuda.o \ + $O/prepare_mesh_constants_cuda.cuda.o \ + $O/transfer_fields_cuda.cuda.o \ + $O/write_seismograms_cuda.cuda.o \ + $O/save_and_compare_cpu_vs_gpu.cudacc.o + +CUDA_STUBS = \ + $O/specfem3D_gpu_cuda_method_stubs.cudacc.o # solver objects - no statically allocated arrays anymore SOLVER_ARRAY_OBJECTS = \ + $O/program_specfem3D.o \ $O/specfem3D_par.o \ $O/PML_init.o \ $O/compute_boundary_kernel.o \ @@ -114,6 +172,7 @@ SOLVER_ARRAY_OBJECTS = \ $O/compute_forces_acoustic_PML.o \ $O/compute_forces_elastic.o \ $O/compute_forces_elastic_Dev.o \ + $O/compute_forces_elastic_Dev2.o \ $O/compute_forces_elastic_noDev.o \ $O/compute_forces_poroelastic.o \ $O/compute_forces_solid.o \ @@ -149,36 +208,36 @@ SOLVER_ARRAY_OBJECTS = \ $O/save_adjoint_kernels.o \ $O/specfem3D.o \ $O/assemble_MPI_vector.o \ - $O/noise_tomography.o \ - $(EMPTY_MACRO) + $O/make_gravity.o \ + $O/noise_tomography.o MODEL_UPD_OBJECTS = \ $O/model_update.o \ $O/specfem3D_par.o \ $O/initialize_simulation.o \ - $O/read_parameter_file.o \ - $O/read_value_parameters.o \ - $O/get_value_parameters.o \ - $O/param_reader.o \ - $O/exit_mpi.o \ + $O/read_parameter_file.shared.o \ + $O/read_value_parameters.shared.o \ + $O/get_value_parameters.shared.o \ + $O/param_reader.cc.o \ + $O/exit_mpi.shared.o \ $O/read_mesh_databases.o \ - $O/create_name_database.o \ - $O/check_mesh_resolution.o \ - $O/gll_library.o \ - $O/get_attenuation_model.o \ + $O/create_name_database.shared.o \ + $O/check_mesh_resolution.shared.o \ + $O/gll_library.shared.o \ + $O/get_attenuation_model.shared.o \ $O/save_external_bin_m_up.o \ - $O/write_VTK_data.o \ + $O/write_VTK_data.shared.o \ $(EMPTY_MACRO) SUM_KERNELS_OBJECTS = \ $O/sum_kernels.o \ - $O/read_parameter_file.o \ - $O/read_value_parameters.o \ - $O/get_value_parameters.o \ - $O/param_reader.o \ - $O/exit_mpi.o \ + $O/read_parameter_file.shared.o \ + $O/read_value_parameters.shared.o \ + $O/get_value_parameters.shared.o \ + $O/param_reader.cc.o \ + $O/exit_mpi.shared.o \ $(EMPTY_MACRO) @@ -186,11 +245,15 @@ SUM_KERNELS_OBJECTS = \ @COND_MPI_TRUE@COND_MPI_OBJECTS = $O/parallel.o @COND_MPI_FALSE@COND_MPI_OBJECTS = $O/serial.o +# objects toggled between openmp and non-openmp version +@COND_OPENMP_TRUE@COND_OPENMP_OBJECTS = $O/compute_forces_elastic_Dev_openmp.openmp.o +@COND_OPENMP_FALSE@COND_OPENMP_OBJECTS = + LIBSPECFEM = $L/libspecfem.a # objects for the pure Fortran version -@COND_PYRE_FALSE@XGENERATE_DATABASES_OBJECTS = $O/program_generate_databases.o $(LIBSPECFEM) -@COND_PYRE_FALSE@XSPECFEM_OBJECTS = $O/program_specfem3D.o $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) +@COND_PYRE_FALSE@@COND_CUDA_TRUE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_OBJECTS) +@COND_PYRE_FALSE@@COND_CUDA_FALSE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_STUBS) #### #### targets @@ -198,7 +261,6 @@ LIBSPECFEM = $L/libspecfem.a # default targets for the pure Fortran version @COND_PYRE_FALSE@DEFAULT = \ -@COND_PYRE_FALSE@ generate_databases \ @COND_PYRE_FALSE@ specfem3D \ @COND_PYRE_FALSE@ combine_vol_data \ @COND_PYRE_FALSE@ combine_surf_data \ @@ -219,13 +281,11 @@ specfem3D: xspecfem3D #### rules for executables #### + # rules for the pure Fortran version -@COND_PYRE_FALSE@xgenerate_databases: $(XGENERATE_DATABASES_OBJECTS) $(COND_MPI_OBJECTS) -@COND_PYRE_FALSE@ ${FCLINK} -o ${E}/xgenerate_databases $(XGENERATE_DATABASES_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) -@COND_PYRE_FALSE@ @COND_PYRE_FALSE@# solver also depends on values from mesher -@COND_PYRE_FALSE@xspecfem3D: $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) -@COND_PYRE_FALSE@ ${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) +@COND_PYRE_FALSE@xspecfem3D: $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(COND_OPENMP_OBJECTS) +@COND_PYRE_FALSE@ ${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) $(COND_OPENMP_OBJECTS) $(OPENMP_LIBS) $(CUDA_LINK) @COND_PYRE_FALSE@ convolve_source_timefunction: xconvolve_source_timefunction @@ -236,30 +296,30 @@ smooth_vol_data: xsmooth_vol_data sum_kernels: xsum_kernels model_update: xmodel_update -xconvolve_source_timefunction: $O/convolve_source_timefunction.o - ${FCCOMPILE_CHECK} -o ${E}/xconvolve_source_timefunction $O/convolve_source_timefunction.o +xconvolve_source_timefunction: $O/convolve_source_timefunction.shared.o + ${FCCOMPILE_CHECK} -o ${E}/xconvolve_source_timefunction $O/convolve_source_timefunction.shared.o -@COND_PYRE_FALSE@xcreate_movie_shakemap_AVS_DX_GMT: $O/create_movie_shakemap_AVS_DX_GMT.o $(LIBSPECFEM) $(OUTPUT)/surface_from_mesher.h -@COND_PYRE_FALSE@ ${FCCOMPILE_CHECK} -o ${E}/xcreate_movie_shakemap_AVS_DX_GMT $O/create_movie_shakemap_AVS_DX_GMT.o $(LIBSPECFEM) -I$(OUTPUT) +@COND_PYRE_FALSE@xcreate_movie_shakemap_AVS_DX_GMT: $O/create_movie_shakemap_AVS_DX_GMT.shared.o $(LIBSPECFEM) $(OUTPUT)/surface_from_mesher.h +@COND_PYRE_FALSE@ ${FCCOMPILE_CHECK} -o ${E}/xcreate_movie_shakemap_AVS_DX_GMT $O/create_movie_shakemap_AVS_DX_GMT.shared.o $(LIBSPECFEM) -I$(OUTPUT) -xcombine_vol_data: $O/combine_vol_data.o $O/write_c_binary.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o - ${FCCOMPILE_CHECK} -o ${E}/xcombine_vol_data $O/combine_vol_data.o $O/write_c_binary.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o +xcombine_vol_data: $O/combine_vol_data.shared.o $O/write_c_binary.cc.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o + ${FCCOMPILE_CHECK} -o ${E}/xcombine_vol_data $O/combine_vol_data.shared.o $O/write_c_binary.cc.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o -xcombine_surf_data: $O/combine_surf_data.o $O/write_c_binary.o $O/param_reader.o - ${FCCOMPILE_CHECK} -o ${E}/xcombine_surf_data $O/combine_surf_data.o $O/write_c_binary.o $O/param_reader.o +xcombine_surf_data: $O/combine_surf_data.shared.o $O/write_c_binary.cc.o $O/param_reader.cc.o + ${FCCOMPILE_CHECK} -o ${E}/xcombine_surf_data $O/combine_surf_data.shared.o $O/write_c_binary.cc.o $O/param_reader.cc.o -xsmooth_vol_data: $O/smooth_vol_data.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o $O/gll_library.o $O/exit_mpi.o $O/parallel.o - ${FCLINK} -o ${E}/xsmooth_vol_data $O/smooth_vol_data.o $O/read_parameter_file.o $O/read_value_parameters.o $O/get_value_parameters.o $O/param_reader.o $O/gll_library.o $O/exit_mpi.o $O/parallel.o $(MPILIBS) +xsmooth_vol_data: $O/smooth_vol_data.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o $O/gll_library.shared.o $O/exit_mpi.shared.o $(COND_MPI_OBJECTS) + ${FCLINK} -o ${E}/xsmooth_vol_data $O/smooth_vol_data.o $O/read_parameter_file.shared.o $O/read_value_parameters.shared.o $O/get_value_parameters.shared.o $O/param_reader.cc.o $O/gll_library.shared.o $O/exit_mpi.shared.o $(COND_MPI_OBJECTS) $(MPILIBS) xsum_kernels: $(SUM_KERNELS_OBJECTS) $(COND_MPI_OBJECTS) ${FCLINK} -o ${E}/xsum_kernels $(SUM_KERNELS_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) -xmodel_update: $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) - ${FCLINK} -o ${E}/xmodel_update $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) +xmodel_update: $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(CUDA_STUBS) + ${FCLINK} -o ${E}/xmodel_update $(MODEL_UPD_OBJECTS) $(COND_MPI_OBJECTS) $(MPILIBS) $(CUDA_STUBS) clean: rm -f $O/* *.o *.gnu *.mod $(OUTPUT)/timestamp* $(OUTPUT)/starttime*txt work.pc* \ - xgenerate_databases xspecfem3D \ + xspecfem3D \ xconvolve_source_timefunction \ xcreate_movie_shakemap_AVS_DX_GMT xcombine_vol_data xcombine_surf_data \ xsmooth_vol_data xmodel_update xsum_kernels @@ -281,293 +341,67 @@ $L/libspecfem.a: $(libspecfem_a_OBJECTS) ### optimized flags (not dependent on values from mesher anymore) ### -$O/specfem3D_par.o: $(SHARED)constants.h specfem3D_par.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/specfem3D_par.o specfem3D_par.f90 - -$O/specfem3D.o: $(SHARED)constants.h specfem3D.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/specfem3D.o specfem3D.f90 - -$O/compute_forces_elastic_noDev.o: $(SHARED)constants.h compute_forces_elastic_noDev.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic_noDev.o compute_forces_elastic_noDev.f90 - -$O/compute_forces_elastic_Dev.o: $(SHARED)constants.h compute_forces_elastic_Dev.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic_Dev.o compute_forces_elastic_Dev.f90 - -$O/compute_forces_elastic.o: $(SHARED)constants.h compute_forces_elastic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_elastic.o compute_forces_elastic.f90 - -$O/compute_forces_poroelastic.o: $(SHARED)constants.h compute_forces_poroelastic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_poroelastic.o compute_forces_poroelastic.f90 - -$O/compute_forces_solid.o: $(SHARED)constants.h compute_forces_solid.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_solid.o compute_forces_solid.f90 - -$O/compute_forces_fluid.o: $(SHARED)constants.h compute_forces_fluid.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_fluid.o compute_forces_fluid.f90 - -$O/compute_forces_acoustic.o: $(SHARED)constants.h compute_forces_acoustic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic.o compute_forces_acoustic.f90 - -$O/compute_forces_acoustic_pot.o: $(SHARED)constants.h compute_forces_acoustic_pot.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic_pot.o compute_forces_acoustic_pot.f90 - -$O/compute_forces_acoustic_PML.o: $(SHARED)constants.h compute_forces_acoustic_PML.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_acoustic_PML.o compute_forces_acoustic_PML.f90 - -$O/compute_add_sources_acoustic.o: $(SHARED)constants.h compute_add_sources_acoustic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_acoustic.o compute_add_sources_acoustic.f90 - -$O/compute_add_sources_elastic.o: $(SHARED)constants.h compute_add_sources_elastic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_elastic.o compute_add_sources_elastic.f90 - -$O/compute_add_sources_poroelastic.o: $(SHARED)constants.h compute_add_sources_poroelastic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources_poroelastic.o compute_add_sources_poroelastic.f90 - -$O/get_poroelastic_velocities.o: $(SHARED)constants.h get_poroelastic_velocities.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/get_poroelastic_velocities.o get_poroelastic_velocities.f90 - -$O/compute_coupling_acoustic_el.o: $(SHARED)constants.h compute_coupling_acoustic_el.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_acoustic_el.o compute_coupling_acoustic_el.f90 - -$O/compute_coupling_elastic_ac.o: $(SHARED)constants.h compute_coupling_elastic_ac.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_elastic_ac.o compute_coupling_elastic_ac.f90 - -$O/compute_coupling_elastic_po.o: $(SHARED)constants.h compute_coupling_elastic_po.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_elastic_po.o compute_coupling_elastic_po.f90 - -$O/compute_coupling_acoustic_po.o: $(SHARED)constants.h compute_coupling_acoustic_po.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_acoustic_po.o compute_coupling_acoustic_po.f90 - -$O/compute_coupling_poroelastic_ac.o: $(SHARED)constants.h compute_coupling_poroelastic_ac.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_poroelastic_ac.o compute_coupling_poroelastic_ac.f90 - -$O/compute_coupling_poroelastic_el.o: $(SHARED)constants.h compute_coupling_poroelastic_el.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling_poroelastic_el.o compute_coupling_poroelastic_el.f90 - -$O/compute_stacey_acoustic.o: $(SHARED)constants.h compute_stacey_acoustic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_acoustic.o compute_stacey_acoustic.f90 - -$O/compute_stacey_elastic.o: $(SHARED)constants.h compute_stacey_elastic.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_elastic.o compute_stacey_elastic.f90 - -$O/compute_gradient.o: $(SHARED)constants.h compute_gradient.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_gradient.o compute_gradient.f90 - -$O/compute_interpolated_dva.o: $(SHARED)constants.h compute_interpolated_dva.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/compute_interpolated_dva.o compute_interpolated_dva.f90 - -### C compilation -$O/force_ftz.o: ${SHARED}/force_ftz.c ../../config.h - ${CC} -c $(CPPFLAGS) $(CFLAGS) -I../.. -o $O/force_ftz.o ${SHARED}/force_ftz.c - -$O/initialize_simulation.o: $(SHARED)constants.h initialize_simulation.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/initialize_simulation.o initialize_simulation.f90 - -$O/read_mesh_databases.o: $(SHARED)constants.h read_mesh_databases.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/read_mesh_databases.o read_mesh_databases.f90 - -$O/setup_GLL_points.o: $(SHARED)constants.h setup_GLL_points.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/setup_GLL_points.o setup_GLL_points.f90 - -$O/detect_mesh_surfaces.o: $(SHARED)constants.h detect_mesh_surfaces.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/detect_mesh_surfaces.o detect_mesh_surfaces.f90 - -$O/setup_movie_meshes.o: $(SHARED)constants.h setup_movie_meshes.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/setup_movie_meshes.o setup_movie_meshes.f90 - -$O/read_topography_bathymetry.o: $(SHARED)constants.h read_topography_bathymetry.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/read_topography_bathymetry.o read_topography_bathymetry.f90 - -$O/setup_sources_receivers.o: $(SHARED)constants.h setup_sources_receivers.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/setup_sources_receivers.o setup_sources_receivers.f90 - -$O/prepare_timerun.o: $(SHARED)constants.h prepare_timerun.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/prepare_timerun.o prepare_timerun.f90 - -$O/iterate_time.o: $(SHARED)constants.h iterate_time.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/iterate_time.o iterate_time.f90 - -$O/finalize_simulation.o: $(SHARED)constants.h finalize_simulation.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/finalize_simulation.o finalize_simulation.f90 - -$O/assemble_MPI_vector.o: $(SHARED)constants.h assemble_MPI_vector.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_vector.o assemble_MPI_vector.f90 +$O/%.o: %.f90 $(SHARED)constants.h + ${FCCOMPILE_NO_CHECK} -c -o $@ $< -$O/assemble_MPI_scalar.o: $(SHARED)constants.h ${SHARED}/assemble_MPI_scalar.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_scalar.o ${SHARED}/assemble_MPI_scalar.f90 +$O/%.o: %.F90 $(SHARED)constants.h + ${FCCOMPILE_NO_CHECK} -c -o $@ $< -$O/save_adjoint_kernels.o: $(SHARED)constants.h save_adjoint_kernels.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/save_adjoint_kernels.o save_adjoint_kernels.f90 +$O/%.shared.o: ${SHARED}%.f90 $(SHARED)constants.h + ${FCCOMPILE_NO_CHECK} -c -o $@ $< -$O/write_movie_output.o: $(SHARED)constants.h write_movie_output.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/write_movie_output.o write_movie_output.f90 - -$O/create_color_image.o: $(SHARED)constants.h create_color_image.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/create_color_image.o create_color_image.f90 - -$O/write_seismograms.o: $(SHARED)constants.h write_seismograms.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/write_seismograms.o write_seismograms.f90 - -$O/write_output_ASCII.o: $(SHARED)constants.h write_output_ASCII.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/write_output_ASCII.o write_output_ASCII.f90 - -$O/write_output_SU.o: $(SHARED)constants.h write_output_SU.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/write_output_SU.o write_output_SU.f90 - -$O/noise_tomography.o: $(SHARED)constants.h noise_tomography.f90 - ${FCCOMPILE_NO_CHECK} -c -o $O/noise_tomography.o noise_tomography.f90 +$O/%.shared.o: ${SHARED}%.F90 $(SHARED)constants.h + ${FCCOMPILE_NO_CHECK} -c -o $@ $< ### -### MPI compilation without optimization +### OpenMP compilation ### +$O/%.openmp.o: %.f90 $(SHARED)constants.h + ${FCCOMPILE_NO_CHECK} -c -o $@ $< -$O/parallel.o: $(SHARED)constants.h ${SHARED}/parallel.f90 - ${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o ${SHARED}/parallel.f90 ### -### serial compilation without optimization +### CUDA compilation ### +$O/%.cuda.o: ${CUDAD}%.cu ../../config.h $(CUDAD)mesh_constants_cuda.h $(CUDAD)prepare_constants_cuda.h + $(NVCC) -c $< -o $@ $(NVCC_FLAGS) -$O/serial.o: $(SHARED)constants.h ${SHARED}/exit_mpi.f90 - ${FCCOMPILE_CHECK} -c -o $O/serial.o ${SHARED}/serial.f90 - -$O/program_specfem3D.o: program_specfem3D.f90 - ${FCCOMPILE_CHECK} -c -o $O/program_specfem3D.o program_specfem3D.f90 - -$O/locate_source.o: $(SHARED)constants.h locate_source.f90 - ${FCCOMPILE_CHECK} -c -o $O/locate_source.o locate_source.f90 - -$O/locate_receivers.o: $(SHARED)constants.h locate_receivers.f90 - ${FCCOMPILE_CHECK} -c -o $O/locate_receivers.o locate_receivers.f90 - -$O/exit_mpi.o: $(SHARED)constants.h ${SHARED}/exit_mpi.f90 - ${FCCOMPILE_CHECK} -c -o $O/exit_mpi.o ${SHARED}/exit_mpi.f90 - -$O/convolve_source_timefunction.o: ${SHARED}/convolve_source_timefunction.f90 - ${FCCOMPILE_CHECK} -c -o $O/convolve_source_timefunction.o ${SHARED}/convolve_source_timefunction.f90 - -$O/save_header_file.o: $(SHARED)constants.h ${SHARED}/save_header_file.f90 - ${FCCOMPILE_CHECK} -c -o $O/save_header_file.o ${SHARED}/save_header_file.f90 - -$O/read_parameter_file.o: $(SHARED)constants.h ${SHARED}/read_parameter_file.f90 - ${FCCOMPILE_CHECK} -c -o $O/read_parameter_file.o ${SHARED}/read_parameter_file.f90 - -$O/read_value_parameters.o: $(SHARED)constants.h ${SHARED}/read_value_parameters.f90 - ${FCCOMPILE_CHECK} -c -o $O/read_value_parameters.o ${SHARED}/read_value_parameters.f90 - -$O/get_value_parameters.o: $(SHARED)constants.h ${SHARED}/get_value_parameters.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_value_parameters.o ${SHARED}/get_value_parameters.f90 - -$O/utm_geo.o: $(SHARED)constants.h ${SHARED}/utm_geo.f90 - ${FCCOMPILE_CHECK} -c -o $O/utm_geo.o ${SHARED}/utm_geo.f90 - -$O/check_mesh_resolution.o: $(SHARED)constants.h ${SHARED}/check_mesh_resolution.f90 - ${FCCOMPILE_CHECK} -c -o $O/check_mesh_resolution.o ${SHARED}/check_mesh_resolution.f90 - -$O/detect_surface.o: $(SHARED)constants.h ${SHARED}/detect_surface.f90 - ${FCCOMPILE_CHECK} -c -o $O/detect_surface.o ${SHARED}/detect_surface.f90 - -$O/gll_library.o: $(SHARED)constants.h ${SHARED}/gll_library.f90 - ${FCCOMPILE_CHECK} -c -o $O/gll_library.o ${SHARED}/gll_library.f90 - -$O/get_jacobian_boundaries.o: $(SHARED)constants.h ${SHARED}/get_jacobian_boundaries.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_jacobian_boundaries.o ${SHARED}/get_jacobian_boundaries.f90 - -$O/get_flags_boundaries.o: $(SHARED)constants.h ${SHARED}/get_flags_boundaries.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_flags_boundaries.o ${SHARED}/get_flags_boundaries.f90 - -$O/get_cmt.o: $(SHARED)constants.h ${SHARED}/get_cmt.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_cmt.o ${SHARED}/get_cmt.f90 - -$O/create_movie_shakemap_AVS_DX_GMT.o: $(SHARED)constants.h ${SHARED}/create_movie_shakemap_AVS_DX_GMT.f90 $(OUTPUT)/surface_from_mesher.h - ${FCCOMPILE_CHECK} -c -o $O/create_movie_shakemap_AVS_DX_GMT.o ${SHARED}/create_movie_shakemap_AVS_DX_GMT.f90 -I$(OUTPUT) - -$O/get_element_face.o: $(SHARED)constants.h ${SHARED}/get_element_face.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_element_face.o ${SHARED}/get_element_face.f90 - -$O/write_VTK_data.o: $(SHARED)constants.h ${SHARED}/write_VTK_data.f90 - ${FCCOMPILE_CHECK} -c -o $O/write_VTK_data.o ${SHARED}/write_VTK_data.f90 - -$O/get_shape3D.o: $(SHARED)constants.h ${SHARED}/get_shape3D.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_shape3D.o ${SHARED}/get_shape3D.f90 - -$O/get_shape2D.o: $(SHARED)constants.h ${SHARED}/get_shape2D.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_shape2D.o ${SHARED}/get_shape2D.f90 - -$O/hex_nodes.o: $(SHARED)constants.h ${SHARED}/hex_nodes.f90 - ${FCCOMPILE_CHECK} -c -o $O/hex_nodes.o ${SHARED}/hex_nodes.f90 - -$O/netlib_specfun_erf.o: ${SHARED}/netlib_specfun_erf.f90 - ${FCCOMPILE_CHECK} -c -o $O/netlib_specfun_erf.o ${SHARED}/netlib_specfun_erf.f90 - -$O/sort_array_coordinates.o: $(SHARED)constants.h ${SHARED}/sort_array_coordinates.f90 - ${FCCOMPILE_CHECK} -c -o $O/sort_array_coordinates.o ${SHARED}/sort_array_coordinates.f90 - -$O/comp_source_time_function.o: $(SHARED)constants.h comp_source_time_function.f90 - ${FCCOMPILE_CHECK} -c -o $O/comp_source_time_function.o comp_source_time_function.f90 - -$O/read_topo_bathy_file.o: $(SHARED)constants.h ${SHARED}/read_topo_bathy_file.f90 - ${FCCOMPILE_CHECK} -c -o $O/read_topo_bathy_file.o ${SHARED}/read_topo_bathy_file.f90 - -$O/lagrange_poly.o: $(SHARED)constants.h ${SHARED}/lagrange_poly.f90 - ${FCCOMPILE_CHECK} -c -o $O/lagrange_poly.o ${SHARED}/lagrange_poly.f90 - -$O/recompute_jacobian.o: $(SHARED)constants.h ${SHARED}/recompute_jacobian.f90 - ${FCCOMPILE_CHECK} -c -o $O/recompute_jacobian.o ${SHARED}/recompute_jacobian.f90 - -$O/create_name_database.o: $(SHARED)constants.h ${SHARED}/create_name_database.f90 - ${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${SHARED}/create_name_database.f90 - -$O/create_serial_name_database.o: $(SHARED)constants.h ${SHARED}/create_serial_name_database.f90 - ${FCCOMPILE_CHECK} -c -o $O/create_serial_name_database.o ${SHARED}/create_serial_name_database.f90 - -$O/define_derivation_matrices.o: $(SHARED)constants.h ${SHARED}/define_derivation_matrices.f90 - ${FCCOMPILE_CHECK} -c -o $O/define_derivation_matrices.o ${SHARED}/define_derivation_matrices.f90 - -$O/compute_adj_source_frechet.o: $(SHARED)constants.h compute_adj_source_frechet.f90 - ${FCCOMPILE_CHECK} -c -o $O/compute_adj_source_frechet.o compute_adj_source_frechet.f90 - -$O/compute_arrays_source.o: $(SHARED)constants.h compute_arrays_source.f90 - ${FCCOMPILE_CHECK} -c -o $O/compute_arrays_source.o compute_arrays_source.f90 - -$O/multiply_arrays_source.o: ${SHARED}/constants.h ${SHARED}/multiply_arrays_source.f90 - ${FCCOMPILE_CHECK} -c -o $O/multiply_arrays_source.o ${SHARED}/multiply_arrays_source.f90 +### +### C compilation +### +force_ftz.o: ${SHARED}force_ftz.c ../../config.h + ${CC} -c $(CPPFLAGS) $(CFLAGS) -I../.. -o $O/force_ftz.o ${SHARED}force_ftz.c -$O/get_attenuation_model.o: $(SHARED)constants.h ${SHARED}/get_attenuation_model.f90 - ${FCCOMPILE_CHECK} -c -o $O/get_attenuation_model.o ${SHARED}/get_attenuation_model.f90 +$O/%.cc.o: ${SHARED}%.c ../../config.h + ${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${SHARED}$< -I../../ -$O/compute_boundary_kernel.o: $(SHARED)constants.h compute_boundary_kernel.f90 - ${FCCOMPILE_CHECK} -c -o $O/compute_boundary_kernel.o compute_boundary_kernel.f90 +$O/%.cudacc.o: ${CUDAD}%.c ../../config.h + ${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../ -$O/compute_kernels.o: $(SHARED)constants.h compute_kernels.f90 - ${FCCOMPILE_CHECK} -c -o $O/compute_kernels.o compute_kernels.f90 -$O/combine_vol_data.o: $(SHARED)constants.h ${SHARED}/combine_vol_data.f90 - ${FCCOMPILE_CHECK} -c -o $O/combine_vol_data.o ${SHARED}/combine_vol_data.f90 +### +### MPI compilation without optimization +### -$O/combine_surf_data.o: $(SHARED)constants.h ${SHARED}/combine_surf_data.f90 - ${FCCOMPILE_CHECK} -c -o $O/combine_surf_data.o ${SHARED}/combine_surf_data.f90 +$O/parallel.o: $(SHARED)constants.h $(SHARED)parallel.f90 + ${MPIFCCOMPILE_CHECK} -c -o $O/parallel.o $(SHARED)parallel.f90 -$O/prepare_assemble_MPI.o: $(SHARED)constants.h ${SHARED}/prepare_assemble_MPI.f90 - ${FCCOMPILE_CHECK} -c -o $O/prepare_assemble_MPI.o ${SHARED}/prepare_assemble_MPI.f90 +$O/serial.o: $(SHARED)constants.h $(SHARED)serial.f90 + ${FCCOMPILE_CHECK} -c -o $O/serial.o $(SHARED)serial.f90 + +$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90 + ${MPIFCCOMPILE_NO_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90 -$O/PML_init.o: $(SHARED)constants.h PML_init.f90 - ${FCCOMPILE_CHECK} -c -o $O/PML_init.o PML_init.f90 -## -## smoothing -## -$O/smooth_vol_data.o: $(SHARED)constants.h ${SHARED}/smooth_vol_data.f90 - ${MPIFCCOMPILE_NO_CHECK} -c -o $O/smooth_vol_data.o ${SHARED}/smooth_vol_data.f90 ## ## kernel summation ## -$O/sum_kernels.o: $(SHARED)constants.h ${SHARED}/sum_kernels.f90 - ${MPIFCCOMPILE_NO_CHECK} -c -o $O/sum_kernels.o ${SHARED}/sum_kernels.f90 +$O/sum_kernels.o: $(SHARED)constants.h $(SHARED)sum_kernels.f90 + ${MPIFCCOMPILE_NO_CHECK} -c -o $O/sum_kernels.o $(SHARED)sum_kernels.f90 ## ## model update @@ -576,16 +410,6 @@ $O/sum_kernels.o: $(SHARED)constants.h ${SHARED}/sum_kernels.f90 $O/model_update.o: $(SHARED)constants.h model_update.f90 ${MPIFCCOMPILE_NO_CHECK} -c -o $O/model_update.o model_update.f90 -$O/save_external_bin_m_up.o: ${SHARED}/constants.h save_external_bin_m_up.f90 +$O/save_external_bin_m_up.o: $(SHARED)constants.h save_external_bin_m_up.f90 ${FCCOMPILE_CHECK} -c -o $O/save_external_bin_m_up.o save_external_bin_m_up.f90 -### -### C files below -### - -$O/param_reader.o: ${SHARED}/param_reader.c - ${CC} -c $(CFLAGS) -o $O/param_reader.o ${SHARED}/param_reader.c -I../../ - -$O/write_c_binary.o: ${SHARED}/write_c_binary.c - ${CC} -c $(CFLAGS) -o $O/write_c_binary.o ${SHARED}/write_c_binary.c -I../../ - diff --git a/src/specfem3D/assemble_MPI_vector.f90 b/src/specfem3D/assemble_MPI_vector.f90 index 353c653a1..b8ef9ede5 100644 --- a/src/specfem3D/assemble_MPI_vector.f90 +++ b/src/specfem3D/assemble_MPI_vector.f90 @@ -134,65 +134,65 @@ end subroutine assemble_MPI_vector_ext_mesh ! subroutine assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,array_val, & - buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,my_neighbours_ext_mesh, & - request_send_vector_ext_mesh,request_recv_vector_ext_mesh & - ) + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + my_neighbours_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh) -! sends data + ! sends data - implicit none + implicit none - include "constants.h" + include "constants.h" - integer :: NPROC - integer :: NGLOB_AB + integer :: NPROC + integer :: NGLOB_AB -! array to assemble - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val + ! array to assemble + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val - integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh - real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & - buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh + real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh - integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh - integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh + integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh - integer ipoin,iinterface + integer ipoin,iinterface -! here we have to assemble all the contributions between partitions using MPI + ! here we have to assemble all the contributions between partitions using MPI -! assemble only if more than one partition - if(NPROC > 1) then + ! assemble only if more than one partition + if(NPROC > 1) then -! partition border copy into the buffer - do iinterface = 1, num_interfaces_ext_mesh - do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) - buffer_send_vector_ext_mesh(:,ipoin,iinterface) = & - array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) - enddo - enddo + ! partition border copy into the buffer + do iinterface = 1, num_interfaces_ext_mesh + do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) + buffer_send_vector_ext_mesh(:,ipoin,iinterface) = & + array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + enddo + enddo -! send messages - do iinterface = 1, num_interfaces_ext_mesh - call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), & - NDIM*nibool_interfaces_ext_mesh(iinterface), & - my_neighbours_ext_mesh(iinterface), & - itag, & - request_send_vector_ext_mesh(iinterface) & - ) - call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), & - NDIM*nibool_interfaces_ext_mesh(iinterface), & - my_neighbours_ext_mesh(iinterface), & - itag, & - request_recv_vector_ext_mesh(iinterface) & - ) - enddo + ! send messages + do iinterface = 1, num_interfaces_ext_mesh + call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), & + NDIM*nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_send_vector_ext_mesh(iinterface) & + ) + call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), & + NDIM*nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_recv_vector_ext_mesh(iinterface) & + ) + enddo - endif + endif end subroutine assemble_MPI_vector_ext_mesh_s @@ -256,6 +256,7 @@ subroutine assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,array_val, & end subroutine assemble_MPI_vector_ext_mesh_w + ! !-------------------------------------------------------------------------------------------------- ! @@ -405,3 +406,387 @@ subroutine assemble_MPI_vector_poro_w(NPROC,NGLOB_AB,array_val1,array_val2, & endif end subroutine assemble_MPI_vector_poro_w + + + +! +!------------------------------------------------------------------------------------------------- +! + +! with cuda functions... + + subroutine transfer_boundary_to_device(NPROC, Mesh_pointer, & + buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,& + request_recv_vector_ext_mesh) + + implicit none + + include "constants.h" + + integer :: NPROC + integer(kind=8) :: Mesh_pointer + + ! array to assemble + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + + real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_recv_vector_ext_mesh + + integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh + + ! local parameters + !integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + !integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh + !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh + integer :: iinterface + + ! here we have to assemble all the contributions between partitions using MPI + + ! assemble only if more than one partition + if(NPROC > 1) then + + ! wait for communications completion (recv) + !write(IMAIN,*) "sending MPI_wait" + do iinterface = 1, num_interfaces_ext_mesh + call wait_req(request_recv_vector_ext_mesh(iinterface)) + enddo + + ! send contributions to GPU + call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh) + endif + + ! This step is done via previous function transfer_and_assemble... + ! do iinterface = 1, num_interfaces_ext_mesh + ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) + ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = & + ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface) + ! enddo + ! enddo + + end subroutine transfer_boundary_to_device + +! +!------------------------------------------------------------------------------------------------- +! + +! not used... +! subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, & +! buffer_recv_vector_ext_mesh, & +! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & +! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & +! request_send_vector_ext_mesh,request_recv_vector_ext_mesh, & +! FORWARD_OR_ADJOINT ) +! +! implicit none +! +! include "constants.h" +! +! integer :: NPROC +! integer :: NGLOB_AB +! integer(kind=8) :: Mesh_pointer +! ! array to assemble +! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val +! +! integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh +! +! real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & +! buffer_recv_vector_ext_mesh +! +! integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh +! integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh +! integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh +! !integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh +! +! integer :: FORWARD_OR_ADJOINT +! +! ! local parameters +! integer :: iinterface +! +! ! here we have to assemble all the contributions between partitions using MPI +! +! ! assemble only if more than one partition +! if(NPROC > 1) then +! +! ! adding contributions of neighbours +! call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, & +! num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, & +! nibool_interfaces_ext_mesh,& +! ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT) +! +! ! This step is done via previous function transfer_and_assemble... +! ! do iinterface = 1, num_interfaces_ext_mesh +! ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) +! ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = & +! ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface) +! ! enddo +! ! enddo +! +! ! wait for communications completion (send) +! do iinterface = 1, num_interfaces_ext_mesh +! call wait_req(request_send_vector_ext_mesh(iinterface)) +! enddo +! endif +! +! end subroutine assemble_MPI_vector_write_cuda_no_transfer + +! +!------------------------------------------------------------------------------------------------- +! + + + subroutine assemble_MPI_vector_send_cuda(NPROC, & + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh, & + my_neighbours_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh) + +! sends data +! note: array to assemble already filled into buffer_send_vector_ext_mesh array + + implicit none + + include "constants.h" + + integer :: NPROC + + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + + real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh + + integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh + + ! local parameters + integer :: iinterface + + ! note: preparation of the contribution between partitions using MPI + ! already done in transfer_boun_accel routine + + ! send only if more than one partition + if(NPROC > 1) then + + ! send messages + do iinterface = 1, num_interfaces_ext_mesh + call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), & + NDIM*nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_send_vector_ext_mesh(iinterface) & + ) + call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), & + NDIM*nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_recv_vector_ext_mesh(iinterface) & + ) + enddo + + endif + + end subroutine assemble_MPI_vector_send_cuda + + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,array_val, Mesh_pointer, & + buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh, & + FORWARD_OR_ADJOINT ) + +! waits for data to receive and assembles + + implicit none + + include "constants.h" + + integer :: NPROC + integer :: NGLOB_AB + integer(kind=8) :: Mesh_pointer +! array to assemble + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val + + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + + real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_recv_vector_ext_mesh + + integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh + + integer :: FORWARD_OR_ADJOINT + + ! local parameters + integer :: iinterface + + ! here we have to assemble all the contributions between partitions using MPI + + ! assemble only if more than one partition + if(NPROC > 1) then + + ! wait for communications completion (recv) + do iinterface = 1, num_interfaces_ext_mesh + call wait_req(request_recv_vector_ext_mesh(iinterface)) + enddo + + ! adding contributions of neighbours + call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,& + ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT) + + ! This step is done via previous function transfer_and_assemble... + ! do iinterface = 1, num_interfaces_ext_mesh + ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) + ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = & + ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface) + ! enddo + ! enddo + + ! wait for communications completion (send) + do iinterface = 1, num_interfaces_ext_mesh + call wait_req(request_send_vector_ext_mesh(iinterface)) + enddo + + endif + + end subroutine assemble_MPI_vector_write_cuda + + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine assemble_MPI_scalar_send_cuda(NPROC, & + buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh, & + my_neighbours_ext_mesh, & + request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh) + +! non-blocking MPI send + + ! sends data + ! note: assembling data already filled into buffer_send_scalar_ext_mesh array + implicit none + + include "constants.h" + + integer :: NPROC + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + + real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh + + integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh + + ! local parameters + integer :: iinterface + + ! sends only if more than one partition + if(NPROC > 1) then + + ! note: partition border copy into the buffer has already been done + ! by routine transfer_boun_pot_from_device() + + ! send messages + do iinterface = 1, num_interfaces_ext_mesh + ! non-blocking synchronous send request + call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), & + nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_send_scalar_ext_mesh(iinterface) & + ) + ! receive request + call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), & + nibool_interfaces_ext_mesh(iinterface), & + my_neighbours_ext_mesh(iinterface), & + itag, & + request_recv_scalar_ext_mesh(iinterface) & + ) + + enddo + + endif + + end subroutine assemble_MPI_scalar_send_cuda + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,array_val, & + Mesh_pointer, & + buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, & + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, & + FORWARD_OR_ADJOINT) + +! waits for send/receiver to be completed and assembles contributions + + implicit none + + include "constants.h" + + integer :: NPROC + integer :: NGLOB_AB + integer(kind=8) :: Mesh_pointer + + integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh + + ! array to assemble + real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: array_val + + + real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: & + buffer_recv_scalar_ext_mesh + + integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh + integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh + integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh + + integer FORWARD_OR_ADJOINT + + integer iinterface ! ipoin + + ! assemble only if more than one partition + if(NPROC > 1) then + + ! wait for communications completion (recv) + do iinterface = 1, num_interfaces_ext_mesh + call wait_req(request_recv_scalar_ext_mesh(iinterface)) + enddo + + ! adding contributions of neighbours + call transfer_asmbl_pot_to_device(Mesh_pointer, array_val, buffer_recv_scalar_ext_mesh, & + num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, nibool_interfaces_ext_mesh,& + ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT) + + ! note: adding contributions of neighbours has been done just above for cuda + !do iinterface = 1, num_interfaces_ext_mesh + ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface) + ! array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = & + ! array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) & + ! + buffer_recv_scalar_ext_mesh(ipoin,iinterface) + ! enddo + !enddo + + ! wait for communications completion (send) + do iinterface = 1, num_interfaces_ext_mesh + call wait_req(request_send_scalar_ext_mesh(iinterface)) + enddo + + endif + + end subroutine assemble_MPI_scalar_write_cuda diff --git a/src/specfem3D/compute_add_sources_acoustic.f90 b/src/specfem3D/compute_add_sources_acoustic.f90 index 219441554..c7434e404 100644 --- a/src/specfem3D/compute_add_sources_acoustic.f90 +++ b/src/specfem3D/compute_add_sources_acoustic.f90 @@ -35,12 +35,13 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, & nrec,islice_selected_rec,ispec_selected_rec, & nadj_rec_local,adj_sourcearrays,b_potential_dot_dot_acoustic, & - NTSTEP_BETWEEN_READ_ADJSRC ) + NTSTEP_BETWEEN_READ_ADJSRC, & + GPU_MODE, Mesh_pointer ) use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, & xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,& station_name,network_name,adj_source_file,nrec_local,number_receiver_global, & - pm1_source_encoding + pm1_source_encoding,nsources_local implicit none include "constants.h" @@ -74,13 +75,16 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou !adjoint simulations integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT + logical:: GPU_MODE + integer(kind=8) :: Mesh_pointer integer:: nrec integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec integer:: nadj_rec_local real(kind=CUSTOM_REAL),dimension(NGLOB_ADJOINT):: b_potential_dot_dot_acoustic logical :: ibool_read_adj_arrays integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC - real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: adj_sourcearrays + real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: & + adj_sourcearrays ! local parameters double precision :: f0 @@ -89,6 +93,7 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source integer :: isource,iglob,ispec,i,j,k,ier integer :: irec_local,irec + double precision, dimension(NSOURCES) :: stf_pre_compute ! adjoint sources in SU format integer :: it_start,it_end @@ -109,101 +114,136 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou endif ! forward simulations - if (SIMULATION_TYPE == 1) then - - ! adds acoustic sources - do isource = 1,NSOURCES - - ! add the source (only if this proc carries the source) - if(myrank == islice_selected_source(isource)) then - - ispec = ispec_selected_source(isource) + if (SIMULATION_TYPE == 1 .and. nsources_local > 0) then + +!way 2 + if(GPU_MODE) then + if( NSOURCES > 0 ) then + do isource = 1,NSOURCES + if(USE_FORCE_POINT_SOURCE) then + ! precomputes source time function factor + stf_pre_compute(isource) = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + if( USE_RICKER_IPATI ) then + stf_pre_compute(isource) = comp_source_time_function_rickr( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf_pre_compute(isource) = comp_source_time_function_gauss( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + endif + enddo + stf_used_total = stf_used_total + sum(stf_pre_compute(:)) + ! only implements SIMTYPE=1 and NOISE_TOM=0 + ! write(*,*) "fortran dt = ", dt + ! change dt -> DT + call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, & + NSOURCES, SIMULATION_TYPE, & + stf_pre_compute, myrank) + endif - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + else ! .NOT. GPU_MODE - if( ispec_is_acoustic(ispec) ) then + ! adds acoustic sources + do isource = 1,NSOURCES - if(USE_FORCE_POINT_SOURCE) then + ! add the source (only if this proc carries the source) + if(myrank == islice_selected_source(isource)) then - ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] - iglob = ibool(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)), & - ispec) + ispec = ispec_selected_source(isource) - f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - !if (it == 1 .and. myrank == 0) then - ! write(IMAIN,*) 'using a source of dominant frequency ',f0 - ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 - ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) - !endif + if( ispec_is_acoustic(ispec) ) then - ! gaussian source time function - !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + if(USE_FORCE_POINT_SOURCE) then - ! we use nu_source(:,3) here because we want a source normal to the surface. - ! This is the expression of a Ricker; should be changed according maybe to the Par_file. - stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0) + ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] + iglob = ibool(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)), & + ispec) - ! source encoding - stf_used = stf_used * pm1_source_encoding(isource) + f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format - ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid - ! the sign is negative because pressure p = - Chi_dot_dot therefore we need - ! to add minus the source to Chi_dot_dot to get plus the source in pressure: + !if (it == 1 .and. myrank == 0) then + ! write(IMAIN,*) 'using a source of dominant frequency ',f0 + ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 + ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) + !endif - ! acoustic source for pressure gets divided by kappa - ! source contribution - potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & - - stf_used / kappastore(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)),ispec) + ! we use nu_source(:,3) here because we want a source normal to the surface. + ! This is the expression of a Ricker; should be changed according maybe to the Par_file. + stf_used = FACTOR_FORCE_SOURCE * & + comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0) - else - ! gaussian source time - stf = comp_source_time_function_gauss(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + ! source encoding + stf_used = stf_used * pm1_source_encoding(isource) - ! quasi-heaviside - !stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid + ! the sign is negative because pressure p = - Chi_dot_dot therefore we need + ! to add minus the source to Chi_dot_dot to get plus the source in pressure: - ! source encoding - stf = stf * pm1_source_encoding(isource) + ! acoustic source for pressure gets divided by kappa + ! source contribution + potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & + - stf_used / kappastore(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)),ispec) - ! distinguishes between single and double precision for reals - if(CUSTOM_REAL == SIZE_REAL) then - stf_used = sngl(stf) else - stf_used = stf - endif - ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid - ! the sign is negative because pressure p = - Chi_dot_dot therefore we need - ! to add minus the source to Chi_dot_dot to get plus the source in pressure - - ! add source array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! adds source contribution - ! note: acoustic source for pressure gets divided by kappa - iglob = ibool(i,j,k,ispec) - potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & - - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec) - enddo + if( USE_RICKER_IPATI ) then + stf = comp_source_time_function_rickr( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + ! gaussian source time + stf = comp_source_time_function_gauss( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + + ! quasi-heaviside + !stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + + ! source encoding + stf = stf * pm1_source_encoding(isource) + + ! distinguishes between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + stf_used = sngl(stf) + else + stf_used = stf + endif + + ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid + ! the sign is negative because pressure p = - Chi_dot_dot therefore we need + ! to add minus the source to Chi_dot_dot to get plus the source in pressure + + ! add source array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! adds source contribution + ! note: acoustic source for pressure gets divided by kappa + iglob = ibool(i,j,k,ispec) + potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & + - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec) + enddo + enddo enddo - enddo - endif ! USE_FORCE_POINT_SOURCE + endif ! USE_FORCE_POINT_SOURCE - stf_used_total = stf_used_total + stf_used + stf_used_total = stf_used_total + stf_used - endif ! ispec_is_elastic - endif ! ispec_is_inner - endif ! myrank + endif ! ispec_is_acoustic + endif ! ispec_is_inner + endif ! myrank - enddo ! NSOURCES + enddo ! NSOURCES + endif ! GPU_MODE endif ! NOTE: adjoint sources and backward wavefield timing: @@ -233,119 +273,137 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou ! adjoint simulations if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then - ! read in adjoint sources block by block (for memory consideration) - ! e.g., in exploration experiments, both the number of receivers (nrec) and - ! the number of time steps (NSTEP) are huge, - ! which may cause problems since we have a large array: - ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) - - ! figure out if we need to read in a chunk of the adjoint source at this timestep - it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number - ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0)) - - ! needs to read in a new chunk/block of the adjoint source - ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner' - ! we first do calculations for the boudaries, and then start communication - ! with other partitions while we calculate for the inner part - ! this must be done carefully, otherwise the adjoint sources may be added twice - if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then - - ! allocates temporary source array - allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier) - if( ier /= 0 ) stop 'error allocating array adj_sourcearray' - - if (.not. SU_FORMAT) then - !!! read ascii adjoint sources - irec_local = 0 - do irec = 1, nrec - ! compute source arrays - if (myrank == islice_selected_rec(irec)) then - irec_local = irec_local + 1 - - ! reads in **sta**.**net**.**LH**.adj files - adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec)) - call compute_arrays_adjoint_source(myrank,adj_source_file, & - xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), & - adj_sourcearray, xigll,yigll,zigll, & - it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC) - do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC - adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) - enddo - - endif - enddo - else - !!! read SU adjoint sources - ! range of the block we need to read - it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1 - it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1 - write(procname,"(i4)") myrank - open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', & - status='old',access='direct',recl=240+4*(NSTEP),iostat = ier) - if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & - //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit') - - do irec_local = 1,nrec_local - irec = number_receiver_global(irec_local) - read(IIN_SU1,rec=irec_local) r4head, adj_temp - adj_src(:,1)=adj_temp(it_start:it_end) - adj_src(:,2)=0.0 !TRIVIAL - adj_src(:,3)=0.0 !TRIVIAL - ! lagrange interpolators for receiver location - call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir) - call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar) - call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar) - ! interpolates adjoint source onto GLL points within this element - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) + ! adds adjoint source in this partitions + if( nadj_rec_local > 0 ) then + + ! read in adjoint sources block by block (for memory consideration) + ! e.g., in exploration experiments, both the number of receivers (nrec) and + ! the number of time steps (NSTEP) are huge, + ! which may cause problems since we have a large array: + ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) + + ! figure out if we need to read in a chunk of the adjoint source at this timestep + it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number + ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0)) + + ! needs to read in a new chunk/block of the adjoint source + ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner' + ! we first do calculations for the boudaries, and then start communication + ! with other partitions while we calculate for the inner part + ! this must be done carefully, otherwise the adjoint sources may be added twice + if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then + + ! allocates temporary source array + allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier) + if( ier /= 0 ) stop 'error allocating array adj_sourcearray' + + if (.not. SU_FORMAT) then + !!! read ascii adjoint sources + irec_local = 0 + do irec = 1, nrec + ! compute source arrays + if (myrank == islice_selected_rec(irec)) then + irec_local = irec_local + 1 + + ! reads in **sta**.**net**.**LH**.adj files + adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec)) + call compute_arrays_adjoint_source(myrank,adj_source_file, & + xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), & + adj_sourcearray, xigll,yigll,zigll, & + it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC) + do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC + adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) enddo - enddo - enddo - do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC - adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) - enddo - enddo - close(IIN_SU1) - endif !if (.not. SU_FORMAT) - - deallocate(adj_sourcearray) - - endif ! if(ibool_read_adj_arrays) - - if( it < NSTEP ) then - ! receivers act as sources - irec_local = 0 - do irec = 1,nrec - ! add the source (only if this proc carries the source) - if (myrank == islice_selected_rec(irec)) then - irec_local = irec_local + 1 - ! adds source array - ispec = ispec_selected_rec(irec) - - ! checks if element is in phase_is_inner run - if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then - - do k = 1,NGLLZ - do j = 1,NGLLY - do i = 1,NGLLX - iglob = ibool(i,j,k,ispec) + endif + enddo + else + !!! read SU adjoint sources + ! range of the block we need to read + it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1 + it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1 + write(procname,"(i4)") myrank + open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', & + status='old',access='direct',recl=240+4*(NSTEP),iostat = ier) + if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & + //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit') - potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & - + adj_sourcearrays(irec_local, & - NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & - 1,i,j,k) + do irec_local = 1,nrec_local + irec = number_receiver_global(irec_local) + read(IIN_SU1,rec=irec_local) r4head, adj_temp + adj_src(:,1)=adj_temp(it_start:it_end) + adj_src(:,2)=0.0 !TRIVIAL + adj_src(:,3)=0.0 !TRIVIAL + ! lagrange interpolators for receiver location + call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir) + call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar) + call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar) + ! interpolates adjoint source onto GLL points within this element + do k = 1, NGLLZ + do j = 1, NGLLY + do i = 1, NGLLX + adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) enddo enddo enddo - - endif ! phase_is_inner - - endif - enddo ! nrec - endif ! it + do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC + adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) + enddo + enddo + close(IIN_SU1) + endif !if (.not. SU_FORMAT) + + deallocate(adj_sourcearray) + endif ! if(ibool_read_adj_arrays) + + if( it < NSTEP ) then + ! receivers act as sources + if( .NOT. GPU_MODE ) then + irec_local = 0 + do irec = 1,nrec + ! add the source (only if this proc carries the source) + if (myrank == islice_selected_rec(irec)) then + irec_local = irec_local + 1 + + ! adds source array + ispec = ispec_selected_rec(irec) + if( ispec_is_acoustic(ispec) ) then + + ! checks if element is in phase_is_inner run + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + do k = 1,NGLLZ + do j = 1,NGLLY + do i = 1,NGLLX + iglob = ibool(i,j,k,ispec) + ! beware, for acoustic medium, a pressure source would be taking the negative + ! and divide by Kappa of the fluid; + ! this would have to be done when constructing the adjoint source. + ! + ! note: we take the first component of the adj_sourcearrays + ! the idea is to have e.g. a pressure source, where all 3 components would be the same + potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & + + adj_sourcearrays(irec_local, & + NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & + 1,i,j,k) + enddo + enddo + enddo + endif ! phase_is_inner + endif + endif + enddo ! nrec + else + ! on GPU + call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, & + ispec_is_inner,ispec_is_acoustic, & + ispec_selected_rec,myrank,nrec, & + NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & + islice_selected_rec,nadj_rec_local, & + NTSTEP_BETWEEN_READ_ADJSRC) + + endif ! GPU_MODE + endif ! it + endif ! nadj_rec_local > 0 endif ! note: b_potential() is read in after Newmark time scheme, thus @@ -353,100 +411,137 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou ! thus indexing is NSTEP - it , instead of NSTEP - it - 1 ! adjoint simulations - if (SIMULATION_TYPE == 3) then - ! adds acoustic sources - do isource = 1,NSOURCES - - ! add the source (only if this proc carries the source) - if(myrank == islice_selected_source(isource)) then + if (SIMULATION_TYPE == 3 .and. nsources_local > 0) then + + ! on GPU + if(GPU_MODE) then + if( NSOURCES > 0 ) then + do isource = 1,NSOURCES + if(USE_FORCE_POINT_SOURCE) then + ! precomputes source time function factors + stf_pre_compute(isource) = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + if( USE_RICKER_IPATI ) then + stf_pre_compute(isource) = comp_source_time_function_rickr( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf_pre_compute(isource) = comp_source_time_function_gauss( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + endif + enddo + stf_used_total = stf_used_total + sum(stf_pre_compute(:)) - ispec = ispec_selected_source(isource) + ! only implements SIMTYPE=3 + call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, & + NSOURCES, SIMULATION_TYPE, & + stf_pre_compute, myrank) + endif - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + else ! .NOT. GPU_MODE - if( ispec_is_acoustic(ispec) ) then + ! adds acoustic sources + do isource = 1,NSOURCES - if(USE_FORCE_POINT_SOURCE) then + ! add the source (only if this proc carries the source) + if(myrank == islice_selected_source(isource)) then - ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] - iglob = ibool(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)), & - ispec) + ispec = ispec_selected_source(isource) - f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - !if (it == 1 .and. myrank == 0) then - ! write(IMAIN,*) 'using a source of dominant frequency ',f0 - ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 - ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) - !endif + if( ispec_is_acoustic(ispec) ) then - ! gaussian source time function - !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + if(USE_FORCE_POINT_SOURCE) then - ! we use nu_source(:,3) here because we want a source normal to the surface. - ! This is the expression of a Ricker; should be changed according maybe to the Par_file. - stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0) + ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] + iglob = ibool(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)), & + ispec) - ! source encoding - stf_used = stf_used * pm1_source_encoding(isource) + f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format - ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid - ! the sign is negative because pressure p = - Chi_dot_dot therefore we need - ! to add minus the source to Chi_dot_dot to get plus the source in pressure: + !if (it == 1 .and. myrank == 0) then + ! write(IMAIN,*) 'using a source of dominant frequency ',f0 + ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 + ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) + !endif - ! acoustic source for pressure gets divided by kappa - ! source contribution - b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & - - stf_used / kappastore(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)),ispec) + ! gaussian source time function + !stf_used = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) - else + ! we use nu_source(:,3) here because we want a source normal to the surface. + ! This is the expression of a Ricker; should be changed according maybe to the Par_file. + stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0) - ! gaussian source time - stf = comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + ! source encoding + stf_used = stf_used * pm1_source_encoding(isource) - ! quasi-heaviside - !stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid + ! the sign is negative because pressure p = - Chi_dot_dot therefore we need + ! to add minus the source to Chi_dot_dot to get plus the source in pressure: - ! source encoding - stf = stf * pm1_source_encoding(isource) + ! acoustic source for pressure gets divided by kappa + ! source contribution + b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & + - stf_used / kappastore(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)),ispec) - ! distinguishes between single and double precision for reals - if(CUSTOM_REAL == SIZE_REAL) then - stf_used = sngl(stf) else - stf_used = stf - endif - ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid - ! the sign is negative because pressure p = - Chi_dot_dot therefore we need - ! to add minus the source to Chi_dot_dot to get plus the source in pressure - - ! add source array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! adds source contribution - ! note: acoustic source for pressure gets divided by kappa - iglob = ibool(i,j,k,ispec) - b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & - - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec) - enddo + if( USE_RICKER_IPATI ) then + stf = comp_source_time_function_rickr( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + ! gaussian source time + stf = comp_source_time_function_gauss( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + + ! quasi-heaviside + !stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + + ! source encoding + stf = stf * pm1_source_encoding(isource) + + ! distinguishes between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + stf_used = sngl(stf) + else + stf_used = stf + endif + + ! beware, for acoustic medium, source is: pressure divided by Kappa of the fluid + ! the sign is negative because pressure p = - Chi_dot_dot therefore we need + ! to add minus the source to Chi_dot_dot to get plus the source in pressure + + ! add source array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! adds source contribution + ! note: acoustic source for pressure gets divided by kappa + iglob = ibool(i,j,k,ispec) + b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & + - sourcearrays(isource,1,i,j,k) * stf_used / kappastore(i,j,k,ispec) + enddo + enddo enddo - enddo - endif ! USE_FORCE_POINT_SOURCE + endif ! USE_FORCE_POINT_SOURCE - stf_used_total = stf_used_total + stf_used + stf_used_total = stf_used_total + stf_used - endif ! ispec_is_elastic - endif ! ispec_is_inner - endif ! myrank + endif ! ispec_is_elastic + endif ! ispec_is_inner + endif ! myrank - enddo ! NSOURCES + enddo ! NSOURCES + endif ! GPU_MODE endif ! master prints out source time function to file @@ -456,5 +551,4 @@ subroutine compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acou if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all endif - -end subroutine compute_add_sources_acoustic + end subroutine compute_add_sources_acoustic diff --git a/src/specfem3D/compute_add_sources_elastic.f90 b/src/specfem3D/compute_add_sources_elastic.f90 index fda038812..7f351409f 100644 --- a/src/specfem3D/compute_add_sources_elastic.f90 +++ b/src/specfem3D/compute_add_sources_elastic.f90 @@ -34,19 +34,23 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, & nrec,islice_selected_rec,ispec_selected_rec, & nadj_rec_local,adj_sourcearrays,b_accel, & - NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY ) + NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,GPU_MODE,Mesh_pointer ) use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, & xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,& station_name,network_name,adj_source_file, & LOCAL_PATH,wgllwgll_xy, & - num_free_surface_faces,free_surface_ispec,free_surface_ijk,free_surface_jacobian2Dw, & + num_free_surface_faces,free_surface_ispec, & + free_surface_ijk,free_surface_jacobian2Dw, & noise_sourcearray,irec_master_noise, & - normal_x_noise,normal_y_noise,normal_z_noise,mask_noise,noise_surface_movie, & - nrec_local,number_receiver_global + normal_x_noise,normal_y_noise,normal_z_noise, & + mask_noise,noise_surface_movie, & + nrec_local,number_receiver_global, & + nsources_local use specfem_par_movie,only: & - store_val_ux_external_mesh,store_val_uy_external_mesh,store_val_uz_external_mesh + store_val_ux_external_mesh,store_val_uy_external_mesh, & + store_val_uz_external_mesh implicit none @@ -79,21 +83,26 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & !adjoint simulations integer:: SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT + logical:: GPU_MODE + integer(kind=8) :: Mesh_pointer integer:: nrec integer,dimension(nrec) :: islice_selected_rec,ispec_selected_rec integer:: nadj_rec_local real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel logical :: ibool_read_adj_arrays integer :: it_sub_adj,itime,NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY - real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: adj_sourcearrays + real(kind=CUSTOM_REAL),dimension(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ):: & + adj_sourcearrays ! local parameters double precision :: f0 double precision :: stf real(kind=CUSTOM_REAL),dimension(:,:,:,:,:),allocatable:: adj_sourcearray real(kind=CUSTOM_REAL) stf_used,stf_used_total_all,time_source - integer :: isource,iglob,i,j,k,ispec,ier - integer :: irec_local,irec + ! for GPU_MODE + double precision, dimension(NSOURCES) :: stf_pre_compute + integer :: isource,iglob,i,j,k,ispec + integer :: irec_local,irec, ier ! adjoint sources in SU format integer :: it_start,it_end @@ -105,83 +114,108 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & !integer(kind=4) :: i4head(nheader/4) ! 4-byte-integer real(kind=4) :: r4head(nheader/4) ! 4-byte-real !equivalence (i2head,i4head,r4head) ! share the same 240-byte-memory - double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY),hgammar(NGLLZ), hpgammar(NGLLZ) + double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ) ! plotting source time function if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then - ! initializes total - stf_used_total = 0.0_CUSTOM_REAL + ! initializes total + stf_used_total = 0.0_CUSTOM_REAL endif -! forward simulations - if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0) then + ! forward simulations + if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then + + if(GPU_MODE) then + do isource = 1,NSOURCES + if( USE_RICKER_IPATI ) then + stf_pre_compute(isource) = comp_source_time_function_rickr( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf_pre_compute(isource) = comp_source_time_function( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + enddo + ! only implements SIMTYPE=1 and NOISE_TOM=0 + ! write(*,*) "fortran dt = ", dt + ! change dt -> DT + call compute_add_sources_el_cuda(Mesh_pointer, & + phase_is_inner,NSOURCES, & + stf_pre_compute, myrank) - do isource = 1,NSOURCES + else ! .NOT. GPU_MODE - ! add the source (only if this proc carries the source) - if(myrank == islice_selected_source(isource)) then + do isource = 1,NSOURCES - ispec = ispec_selected_source(isource) + ! add the source (only if this proc carries the source) + if(myrank == islice_selected_source(isource)) then - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + ispec = ispec_selected_source(isource) - if( ispec_is_elastic(ispec) ) then + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - if(USE_FORCE_POINT_SOURCE) then + if( ispec_is_elastic(ispec) ) then - ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] - iglob = ibool(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)), & - ispec_selected_source(isource)) + if(USE_FORCE_POINT_SOURCE) then - f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format + ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] + iglob = ibool(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)), & + ispec_selected_source(isource)) - !if (it == 1 .and. myrank == 0) then - ! write(IMAIN,*) 'using a source of dominant frequency ',f0 - ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 - ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) - !endif + f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format - ! This is the expression of a Ricker; should be changed according maybe to the Par_file. - stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0) + !if (it == 1 .and. myrank == 0) then + ! write(IMAIN,*) 'using a source of dominant frequency ',f0 + ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 + ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) + !endif - ! we use a force in a single direction along one of the components: - ! x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE - ! e.g. nu_source(:,3) here would be a source normal to the surface (z-direction). - accel(:,iglob) = accel(:,iglob) & - + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used + ! This is the expression of a Ricker; should be changed according maybe to the Par_file. + stf_used = FACTOR_FORCE_SOURCE * & + comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0) - else + ! we use a force in a single direction along one of the components: + ! x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE + ! e.g. nu_source(:,3) here would be a source normal to the surface (z-direction). + accel(:,iglob) = accel(:,iglob) & + + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used - stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + else - ! distinguish between single and double precision for reals - if(CUSTOM_REAL == SIZE_REAL) then - stf_used = sngl(stf) - else - stf_used = stf - endif + if( USE_RICKER_IPATI) then + stf = comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif - ! add source array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used - enddo - enddo - enddo + ! distinguish between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + stf_used = sngl(stf) + else + stf_used = stf + endif - endif ! USE_FORCE_POINT_SOURCE + ! add source array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + accel(:,iglob) = accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used + enddo + enddo + enddo - stf_used_total = stf_used_total + stf_used + endif ! USE_FORCE_POINT_SOURCE - endif ! ispec_is_elastic - endif ! ispec_is_inner - endif ! myrank + stf_used_total = stf_used_total + stf_used - enddo ! NSOURCES + endif ! ispec_is_elastic + endif ! ispec_is_inner + endif ! myrank + + enddo ! NSOURCES + endif ! GPU_MODE endif ! forward ! NOTE: adjoint sources and backward wavefield timing: @@ -211,132 +245,146 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & ! adjoint simulations if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then - ! read in adjoint sources block by block (for memory consideration) - ! e.g., in exploration experiments, both the number of receivers (nrec) and - ! the number of time steps (NSTEP) are huge, - ! which may cause problems since we have a large array: - ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) - - ! figure out if we need to read in a chunk of the adjoint source at this timestep - it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number - ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0)) - - ! needs to read in a new chunk/block of the adjoint source - ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner' - ! we first do calculations for the boudaries, and then start communication - ! with other partitions while calculate for the inner part - ! this must be done carefully, otherwise the adjoint sources may be added twice - if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then - - ! allocates temporary source array - allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier) - if( ier /= 0 ) stop 'error allocating array adj_sourcearray' - - if (.not. SU_FORMAT) then - !!! read ascii adjoint sources - irec_local = 0 - do irec = 1, nrec - ! compute source arrays - if (myrank == islice_selected_rec(irec)) then - irec_local = irec_local + 1 - ! reads in **sta**.**net**.**LH**.adj files - adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec)) - call compute_arrays_adjoint_source(myrank,adj_source_file, & - xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), & - adj_sourcearray, xigll,yigll,zigll, & - it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC) - do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC - adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) - enddo - - endif - enddo - else - !!! read SU adjoint sources - ! range of the block we need to read - it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1 - it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1 - write(procname,"(i4)") myrank - ! read adjoint sources - open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', & + ! adds adjoint source in this partitions + if( nadj_rec_local > 0 ) then + + ! read in adjoint sources block by block (for memory consideration) + ! e.g., in exploration experiments, both the number of receivers (nrec) and + ! the number of time steps (NSTEP) are huge, + ! which may cause problems since we have a large array: + ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ) + + ! figure out if we need to read in a chunk of the adjoint source at this timestep + it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number + ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0)) + + ! needs to read in a new chunk/block of the adjoint source + ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner' + ! we first do calculations for the boudaries, and then start communication + ! with other partitions while calculate for the inner part + ! this must be done carefully, otherwise the adjoint sources may be added twice + if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then + + ! allocates temporary source array + allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier) + if( ier /= 0 ) stop 'error allocating array adj_sourcearray' + + if (.not. SU_FORMAT) then + !!! read ascii adjoint sources + irec_local = 0 + do irec = 1, nrec + ! compute source arrays + if (myrank == islice_selected_rec(irec)) then + irec_local = irec_local + 1 + ! reads in **sta**.**net**.**LH**.adj files + adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec)) + call compute_arrays_adjoint_source(myrank,adj_source_file, & + xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), & + adj_sourcearray, xigll,yigll,zigll, & + it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC) + do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC + adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) + enddo + endif + enddo + else + !!! read SU adjoint sources + ! range of the block we need to read + it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1 + it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1 + write(procname,"(i4)") myrank + ! read adjoint sources + open(unit=IIN_SU1, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj', & status='old',access='direct',recl=240+4*NSTEP,iostat = ier) - if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & + if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & //'../SEM/'//trim(adjustl(procname))//'_dx_SU.adj does not exit') - open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', & + open(unit=IIN_SU2, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj', & status='old',access='direct',recl=240+4*NSTEP,iostat = ier) - if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & + if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & //'../SEM/'//trim(adjustl(procname))//'_dy_SU.adj does not exit') - open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', & + open(unit=IIN_SU3, file=trim(adjustl(OUTPUT_FILES_PATH))//'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj', & status='old',access='direct',recl=240+4*NSTEP,iostat = ier) - if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & + if( ier /= 0 ) call exit_MPI(myrank,'file '//trim(adjustl(OUTPUT_FILES_PATH)) & //'../SEM/'//trim(adjustl(procname))//'_dz_SU.adj does not exit') - do irec_local = 1,nrec_local - irec = number_receiver_global(irec_local) - read(IIN_SU1,rec=irec_local) r4head, adj_temp - adj_src(:,1)=adj_temp(it_start:it_end) - read(IIN_SU2,rec=irec_local) r4head, adj_temp - adj_src(:,2)=adj_temp(it_start:it_end) - read(IIN_SU3,rec=irec_local) r4head, adj_temp - adj_src(:,3)=adj_temp(it_start:it_end) - ! lagrange interpolators for receiver location - call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir) - call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar) - call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar) - ! interpolates adjoint source onto GLL points within this element - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) - enddo - enddo - enddo - do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC - adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) - enddo - enddo - close(IIN_SU1) - close(IIN_SU2) - close(IIN_SU3) - endif !if(.not. SU_FORMAT) - - deallocate(adj_sourcearray) - - endif ! if(ibool_read_adj_arrays) - - if( it < NSTEP ) then - - ! receivers act as sources - irec_local = 0 - do irec = 1,nrec - - ! add the source (only if this proc carries the source) - if (myrank == islice_selected_rec(irec)) then - irec_local = irec_local + 1 - - ! checks if element is in phase_is_inner run - if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then - - ! adds source array - do k = 1,NGLLZ - do j = 1,NGLLY - do i = 1,NGLLX - iglob = ibool(i,j,k,ispec_selected_rec(irec)) - - accel(:,iglob) = accel(:,iglob) & - + adj_sourcearrays(irec_local, & - NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & - :,i,j,k) + do irec_local = 1,nrec_local + irec = number_receiver_global(irec_local) + read(IIN_SU1,rec=irec_local) r4head, adj_temp + adj_src(:,1)=adj_temp(it_start:it_end) + read(IIN_SU2,rec=irec_local) r4head, adj_temp + adj_src(:,2)=adj_temp(it_start:it_end) + read(IIN_SU3,rec=irec_local) r4head, adj_temp + adj_src(:,3)=adj_temp(it_start:it_end) + ! lagrange interpolators for receiver location + call lagrange_any(xi_receiver(irec),NGLLX,xigll,hxir,hpxir) + call lagrange_any(eta_receiver(irec),NGLLY,yigll,hetar,hpetar) + call lagrange_any(gamma_receiver(irec),NGLLZ,zigll,hgammar,hpgammar) + ! interpolates adjoint source onto GLL points within this element + do k = 1, NGLLZ + do j = 1, NGLLY + do i = 1, NGLLX + adj_sourcearray(:,:,i,j,k) = hxir(i) * hetar(j) * hgammar(k) * adj_src(:,:) enddo enddo enddo + do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC + adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:) + enddo + enddo + close(IIN_SU1) + close(IIN_SU2) + close(IIN_SU3) + endif !if(.not. SU_FORMAT) - endif ! phase_is_inner - endif - enddo ! nrec + deallocate(adj_sourcearray) + endif ! if(ibool_read_adj_arrays) + + + if( it < NSTEP ) then + + if(.NOT. GPU_MODE) then + + ! receivers act as sources + irec_local = 0 + do irec = 1,nrec - endif ! it + ! add the source (only if this proc carries the source) + if (myrank == islice_selected_rec(irec)) then + irec_local = irec_local + 1 + ispec = ispec_selected_rec(irec) + if( ispec_is_elastic(ispec) ) then + + ! checks if element is in phase_is_inner run + if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then + + ! adds source array + do k = 1,NGLLZ + do j = 1,NGLLY + do i = 1,NGLLX + iglob = ibool(i,j,k,ispec_selected_rec(irec)) + + accel(:,iglob) = accel(:,iglob) & + + adj_sourcearrays(irec_local, & + NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & + :,i,j,k) + enddo + enddo + enddo + endif ! phase_is_inner + endif ! ispec_is_elastic + endif + enddo ! nrec + else ! GPU_MODE == .true. + call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, & + ispec_is_inner,ispec_is_elastic, & + ispec_selected_rec,myrank,nrec, & + NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), & + islice_selected_rec,nadj_rec_local, & + NTSTEP_BETWEEN_READ_ADJSRC) + endif ! GPU_MODE + endif ! it + endif ! nadj_rec_local endif !adjoint ! note: b_displ() is read in after Newmark time scheme, thus @@ -344,75 +392,99 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & ! thus indexing is NSTEP - it , instead of NSTEP - it - 1 ! adjoint simulations - if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0) then + if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then + + if(GPU_MODE) then + do isource = 1,NSOURCES + if( USE_RICKER_IPATI ) then + stf_pre_compute(isource) = comp_source_time_function_rickr( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf_pre_compute(isource) = comp_source_time_function( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + enddo - ! backward source reconstruction - do isource = 1,NSOURCES + call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, & + NSOURCES,phase_is_inner,myrank) - ! add the source (only if this proc carries the source) - if(myrank == islice_selected_source(isource)) then + else ! .NOT. GPU_MODE - ispec = ispec_selected_source(isource) + ! backward source reconstruction + do isource = 1,NSOURCES - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + ! add the source (only if this proc carries the source) + if(myrank == islice_selected_source(isource)) then - if( ispec_is_elastic(ispec) ) then + ispec = ispec_selected_source(isource) - if(USE_FORCE_POINT_SOURCE) then + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] - iglob = ibool(nint(xi_source(isource)), & - nint(eta_source(isource)), & - nint(gamma_source(isource)), & - ispec_selected_source(isource)) + if( ispec_is_elastic(ispec) ) then - f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format + if(USE_FORCE_POINT_SOURCE) then - !if (it == 1 .and. myrank == 0) then - ! write(IMAIN,*) 'using a source of dominant frequency ',f0 - ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 - ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) - !endif + ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] + iglob = ibool(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)), & + ispec_selected_source(isource)) - ! This is the expression of a Ricker; should be changed according maybe to the Par_file. - stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0) + f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format - ! e.g. we use nu_source(:,3) here if we want a source normal to the surface. - ! note: time step is now at NSTEP-it - b_accel(:,iglob) = b_accel(:,iglob) & - + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used + !if (it == 1 .and. myrank == 0) then + ! write(IMAIN,*) 'using a source of dominant frequency ',f0 + ! write(IMAIN,*) 'lambda_S at dominant frequency = ',3000./sqrt(3.)/f0 + ! write(IMAIN,*) 'lambda_S at highest significant frequency = ',3000./sqrt(3.)/(2.5*f0) + !endif - else + ! This is the expression of a Ricker; should be changed according maybe to the Par_file. + stf_used = FACTOR_FORCE_SOURCE * & + comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),f0) - ! see note above: time step corresponds now to NSTEP-it - ! (also compare to it-1 for forward simulation) - stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + ! e.g. we use nu_source(:,3) here if we want a source normal to the surface. + ! note: time step is now at NSTEP-it + b_accel(:,iglob) = b_accel(:,iglob) & + + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used - ! distinguish between single and double precision for reals - if(CUSTOM_REAL == SIZE_REAL) then - stf_used = sngl(stf) - else - stf_used = stf - endif + else - ! add source array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec_selected_source(isource)) - b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used - enddo - enddo - enddo - endif ! USE_FORCE_POINT_SOURCE + ! see note above: time step corresponds now to NSTEP-it + ! (also compare to it-1 for forward simulation) + if( USE_RICKER_IPATI ) then + stf = comp_source_time_function_rickr( & + dble(it-1)*DT-t0-tshift_cmt(isource),hdur(isource)) + else + stf = comp_source_time_function( & + dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource)) + endif + + ! distinguish between single and double precision for reals + if(CUSTOM_REAL == SIZE_REAL) then + stf_used = sngl(stf) + else + stf_used = stf + endif + + ! add source array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec_selected_source(isource)) + b_accel(:,iglob) = b_accel(:,iglob) + sourcearrays(isource,:,i,j,k)*stf_used + enddo + enddo + enddo + endif ! USE_FORCE_POINT_SOURCE - stf_used_total = stf_used_total + stf_used + stf_used_total = stf_used_total + stf_used - endif ! elastic - endif ! phase_inner - endif ! myrank + endif ! elastic + endif ! phase_inner + endif ! myrank - enddo ! NSOURCES + enddo ! NSOURCES + endif ! GPU_MODE endif ! adjoint ! master prints out source time function to file @@ -432,11 +504,15 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed. ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes. ! now this must be manually set in DATA/CMTSOLUTION, by USERS. - call add_source_master_rec_noise(myrank,nrec, & - NSTEP,accel,noise_sourcearray, & - ibool,islice_selected_rec,ispec_selected_rec, & - it,irec_master_noise, & - NSPEC_AB,NGLOB_AB) + if(.NOT. GPU_MODE) then + call add_source_master_rec_noise(myrank,nrec, & + NSTEP,accel,noise_sourcearray, & + ibool,islice_selected_rec,ispec_selected_rec, & + it,irec_master_noise, & + NSPEC_AB,NGLOB_AB) + else ! GPU_MODE == .true. + call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec) + endif elseif ( NOISE_TOMOGRAPHY == 2 ) then ! second step of noise tomography, i.e., read the surface movie saved at every timestep ! use the movie to drive the ensemble forward wavefield @@ -447,7 +523,8 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & NSTEP-it+1, & NSPEC_AB,NGLOB_AB, & num_free_surface_faces,free_surface_ispec,free_surface_ijk, & - free_surface_jacobian2Dw) + free_surface_jacobian2Dw,& + Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY) ! be careful, since ensemble forward sources are reversals of generating wavefield "eta" ! hence the "NSTEP-it+1", i.e., start reading from the last timestep ! note the ensemble forward sources are generally distributed on the surface of the earth @@ -465,7 +542,8 @@ subroutine compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & it, & NSPEC_AB,NGLOB_AB, & num_free_surface_faces,free_surface_ispec,free_surface_ijk, & - free_surface_jacobian2Dw) + free_surface_jacobian2Dw,& + Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY) endif endif diff --git a/src/specfem3D/compute_coupling_elastic_po.f90 b/src/specfem3D/compute_coupling_elastic_po.f90 index b9e4fac83..37f239e31 100644 --- a/src/specfem3D/compute_coupling_elastic_po.f90 +++ b/src/specfem3D/compute_coupling_elastic_po.f90 @@ -224,11 +224,11 @@ subroutine compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,& if (SIMULATION_TYPE == 3) then ! to do stop 'compute_coupling_elastic_po() : adjoint run not implemented yet' - + ! dummy to avoid compiler warnings - iglob = NGLOB_ADJOINT - iglob = NSPEC_ADJOINT - + iglob = NGLOB_ADJOINT + iglob = NSPEC_ADJOINT + endif ! adjoint !!! can merge these loops because NGLLX = NGLLY = NGLLZ enddo diff --git a/src/specfem3D/compute_coupling_poroelastic_el.f90 b/src/specfem3D/compute_coupling_poroelastic_el.f90 index cc19f10df..f46d3375f 100644 --- a/src/specfem3D/compute_coupling_poroelastic_el.f90 +++ b/src/specfem3D/compute_coupling_poroelastic_el.f90 @@ -360,12 +360,12 @@ subroutine compute_coupling_poroelastic_el(NSPEC_AB,NGLOB_AB,ibool,& if (SIMULATION_TYPE == 3) then ! to do stop 'compute_coupling_poroelastic_el() : adjoint run not implemented yet' - + ! dummy to avoid compiler warnings - iglob = NGLOB_ADJOINT - iglob = NSPEC_ADJOINT + iglob = NGLOB_ADJOINT + iglob = NSPEC_ADJOINT endif ! adjoint - + !!! can merge these loops because NGLLX = NGLLY = NGLLZ enddo !!! can merge these loops because NGLLX = NGLLY = NGLLZ do diff --git a/src/specfem3D/compute_forces_acoustic.f90 b/src/specfem3D/compute_forces_acoustic.f90 index 1e2ca39fe..89ebf7e8e 100644 --- a/src/specfem3D/compute_forces_acoustic.f90 +++ b/src/specfem3D/compute_forces_acoustic.f90 @@ -65,21 +65,36 @@ subroutine compute_forces_acoustic() logical:: phase_is_inner -! enforces free surface (zeroes potentials at free surface) - call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, & + ! enforces free surface (zeroes potentials at free surface) + if(.NOT. GPU_MODE) then + ! on CPU + call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, & potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & num_free_surface_faces,ispec_is_acoustic) - ! adjoint simulations - if( SIMULATION_TYPE == 3 ) & - call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, & + ! adjoint simulations + if( SIMULATION_TYPE == 3 ) & + call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, & b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & num_free_surface_faces,ispec_is_acoustic) - + else + ! on GPU + call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE) + endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + ! enforces free surface on PML elements + + ! note: + ! PML routines are not implemented as CUDA kernels, we just transfer the fields + ! from the GPU to the CPU and vice versa + + ! transfers potentials to the CPU + if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + call PML_acoustic_enforce_free_srfc(NSPEC_AB,NGLOB_AB, & potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & @@ -89,20 +104,26 @@ subroutine compute_forces_acoustic() chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,& chi1_dot_dot,chi2_t_dot_dot,& chi3_dot_dot,chi4_dot_dot) + + ! transfers potentials back to GPU + if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) endif -! distinguishes two runs: for points on MPI interfaces, and points within the partitions + ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions do iphase=1,2 - !first for points on MPI interfaces + !first for points on MPI interfaces, thus outer elements if( iphase == 1 ) then phase_is_inner = .false. else phase_is_inner = .true. endif -! acoustic pressure term - call compute_forces_acoustic_pot( iphase, NSPEC_AB,NGLOB_AB, & + ! acoustic pressure term + if(.NOT. GPU_MODE) then + ! on CPU + call compute_forces_acoustic_pot( iphase, NSPEC_AB,NGLOB_AB, & potential_acoustic,potential_dot_dot_acoustic, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & hprime_xx,hprime_yy,hprime_zz, & @@ -112,9 +133,9 @@ subroutine compute_forces_acoustic() num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,& phase_ispec_inner_acoustic ) - ! adjoint simulations - if( SIMULATION_TYPE == 3 ) & - call compute_forces_acoustic_pot( iphase, NSPEC_ADJOINT,NGLOB_ADJOINT, & + ! adjoint simulations + if( SIMULATION_TYPE == 3 ) & + call compute_forces_acoustic_pot( iphase, NSPEC_ADJOINT,NGLOB_ADJOINT, & b_potential_acoustic,b_potential_dot_dot_acoustic, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & hprime_xx,hprime_yy,hprime_zz, & @@ -123,8 +144,19 @@ subroutine compute_forces_acoustic() rhostore,jacobian,ibool, & num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,& phase_ispec_inner_acoustic ) + else + ! on GPU + ! includes code for SIMULATION_TYPE==3 + call compute_forces_acoustic_cuda(Mesh_pointer, iphase, & + nspec_outer_acoustic, nspec_inner_acoustic, & + SIMULATION_TYPE) + endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + ! transfers potentials to CPU + if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + call compute_forces_acoustic_PML(NSPEC_AB,NGLOB_AB, & ibool,ispec_is_inner,phase_is_inner, & rhostore,ispec_is_acoustic,potential_acoustic, & @@ -145,9 +177,13 @@ subroutine compute_forces_acoustic() num_PML_ispec,PML_ispec,iglob_is_PML_interface,& chi1_dot_dot,chi3_dot_dot,chi4_dot_dot) + ! transfers potentials back to GPU + if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + endif ! PML -! absorbing boundaries + ! absorbing boundaries if(ABSORBING_CONDITIONS) then if(ABSORB_USE_PML) then if( PML_USE_SOMMERFELD ) then @@ -161,8 +197,13 @@ subroutine compute_forces_acoustic() num_PML_ispec,PML_ispec,ispec_is_PML_inum,& chi1_dot,chi2_t,chi2_t_dot,chi3_dot,chi4_dot,& chi1_dot_dot,chi3_dot_dot,chi4_dot_dot) + + ! transfers potentials back to GPU + if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) endif else + ! Stacey boundary conditions call compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, & potential_dot_dot_acoustic,potential_dot_acoustic, & ibool,ispec_is_inner,phase_is_inner, & @@ -170,42 +211,50 @@ subroutine compute_forces_acoustic() num_abs_boundary_faces,rhostore,kappastore,ispec_is_acoustic, & SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it,NGLOB_ADJOINT, & b_potential_dot_dot_acoustic,b_reclen_potential, & - b_absorb_potential,b_num_abs_boundary_faces) + b_absorb_potential,b_num_abs_boundary_faces, & + GPU_MODE,Mesh_pointer) endif endif -! elastic coupling + ! elastic coupling if(ELASTIC_SIMULATION ) then if( num_coupling_ac_el_faces > 0 ) then - if( SIMULATION_TYPE == 1 ) then - ! forward definition: \bfs=\frac{1}{\rho}\bfnabla\phi - call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, & - ibool,displ,potential_dot_dot_acoustic, & + if( .NOT. GPU_MODE ) then + if( SIMULATION_TYPE == 1 ) then + ! forward definition: \bfs=\frac{1}{\rho}\bfnabla\phi + call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, & + ibool,displ,potential_dot_dot_acoustic, & + num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal, & + coupling_ac_el_jacobian2Dw, & + ispec_is_inner,phase_is_inner) + else + ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield + ! adjoint definition: \partial_t^2 \bfs^\dagger=-\frac{1}{\rho}\bfnabla\phi^\dagger + call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, & + ibool,-accel_adj_coupling,potential_dot_dot_acoustic, & + num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal, & + coupling_ac_el_jacobian2Dw, & + ispec_is_inner,phase_is_inner) + endif + ! adjoint/kernel simulations + if( SIMULATION_TYPE == 3 ) & + call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, & + ibool,b_displ,b_potential_dot_dot_acoustic, & num_coupling_ac_el_faces, & coupling_ac_el_ispec,coupling_ac_el_ijk, & coupling_ac_el_normal, & coupling_ac_el_jacobian2Dw, & ispec_is_inner,phase_is_inner) + else - ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield - ! adjoint definition: \partial_t^2 \bfs^\dagger=-\frac{1}{\rho}\bfnabla\phi^\dagger - call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, & - ibool,-accel_adj_coupling,potential_dot_dot_acoustic, & - num_coupling_ac_el_faces, & - coupling_ac_el_ispec,coupling_ac_el_ijk, & - coupling_ac_el_normal, & - coupling_ac_el_jacobian2Dw, & - ispec_is_inner,phase_is_inner) - endif - ! adjoint/kernel simulations - if( SIMULATION_TYPE == 3 ) & - call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, & - ibool,b_displ,b_potential_dot_dot_acoustic, & - num_coupling_ac_el_faces, & - coupling_ac_el_ispec,coupling_ac_el_ijk, & - coupling_ac_el_normal, & - coupling_ac_el_jacobian2Dw, & - ispec_is_inner,phase_is_inner) + ! on GPU + call compute_coupling_ac_el_cuda(Mesh_pointer,phase_is_inner, & + num_coupling_ac_el_faces,SIMULATION_TYPE) + endif ! GPU_MODE endif endif @@ -213,7 +262,7 @@ subroutine compute_forces_acoustic() if(POROELASTIC_SIMULATION ) then if( num_coupling_ac_po_faces > 0 ) then if( SIMULATION_TYPE == 1 ) then - call compute_coupling_acoustic_po(NSPEC_AB,NGLOB_AB, & + call compute_coupling_acoustic_po(NSPEC_AB,NGLOB_AB, & ibool,displs_poroelastic,displw_poroelastic, & potential_dot_dot_acoustic, & num_coupling_ac_po_faces, & @@ -222,14 +271,14 @@ subroutine compute_forces_acoustic() coupling_ac_po_jacobian2Dw, & ispec_is_inner,phase_is_inner) else - stop 'not implemented yet' + stop 'not implemented yet' endif if( SIMULATION_TYPE == 3 ) & - stop 'not implemented yet' + stop 'not implemented yet' endif endif -! sources + ! sources call compute_add_sources_acoustic(NSPEC_AB,NGLOB_AB,potential_dot_dot_acoustic, & ibool,ispec_is_inner,phase_is_inner, & NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,& @@ -239,54 +288,127 @@ subroutine compute_forces_acoustic() SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, & nrec,islice_selected_rec,ispec_selected_rec, & nadj_rec_local,adj_sourcearrays,b_potential_dot_dot_acoustic, & - NTSTEP_BETWEEN_READ_ADJSRC ) + NTSTEP_BETWEEN_READ_ADJSRC, & + GPU_MODE, Mesh_pointer) -! assemble all the contributions between slices using MPI + ! assemble all the contributions between slices using MPI if( phase_is_inner .eqv. .false. ) then ! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking) - call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, & + if(.NOT. GPU_MODE) then + call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, & buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, & num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& my_neighbours_ext_mesh, & request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh) + else + ! on GPU + call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, & + potential_dot_dot_acoustic, & + buffer_send_scalar_ext_mesh, & + num_interfaces_ext_mesh, & + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh, & + ibool_interfaces_ext_mesh, & + 1) ! <-- 1 == fwd accel + call assemble_MPI_scalar_send_cuda(NPROC, & + buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh) + endif + ! adjoint simulations - if( SIMULATION_TYPE == 3 ) & - call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, & + if( SIMULATION_TYPE == 3 ) then + if(.NOT. GPU_MODE) then + call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, & b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, & num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& my_neighbours_ext_mesh, & b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh) + else + ! on GPU + call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, & + b_potential_dot_dot_acoustic, & + b_buffer_send_scalar_ext_mesh,& + num_interfaces_ext_mesh, & + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh, & + ibool_interfaces_ext_mesh, & + 3) ! <-- 3 == adjoint b_accel + + call assemble_MPI_scalar_send_cuda(NPROC, & + b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh) + + endif + endif + else + ! waits for send/receive requests to be completed and assembles values - call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, & + if(.NOT. GPU_MODE) then + call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, & buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,& max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh) + else + ! on GPU + call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,potential_dot_dot_acoustic, & + Mesh_pointer,& + buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,& + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, & + 1) + endif + ! adjoint simulations - if( SIMULATION_TYPE == 3 ) & - call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, & + if( SIMULATION_TYPE == 3 ) then + if(.NOT. GPU_MODE) then + call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, & b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,& max_nibool_interfaces_ext_mesh, & nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh) - - endif - + else + ! on GPU + call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,b_potential_dot_dot_acoustic, & + Mesh_pointer, & + b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, & + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + b_request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, & + 3) + endif + endif + endif !phase_is_inner enddo - ! divides pressure with mass matrix - potential_dot_dot_acoustic(:) = potential_dot_dot_acoustic(:) * rmass_acoustic(:) + if(.NOT. GPU_MODE) then + ! divides pressure with mass matrix + potential_dot_dot_acoustic(:) = potential_dot_dot_acoustic(:) * rmass_acoustic(:) - ! adjoint simulations - if (SIMULATION_TYPE == 3) & - b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:) + ! adjoint simulations + if (SIMULATION_TYPE == 3) & + b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:) + else + ! on GPU + call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB,SIMULATION_TYPE) + endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + ! note: no need to transfer fields between CPU and GPU; + ! PML arrays are all handled on the CPU + ! divides local contributions with mass term call PML_acoustic_mass_update(NSPEC_AB,NGLOB_AB,& ispec_is_acoustic,rmass_acoustic,ibool,& @@ -317,14 +439,24 @@ subroutine compute_forces_acoustic() ! ! corrector: ! updates the chi_dot term which requires chi_dot_dot(t+delta) - potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:) + if( .NOT. GPU_MODE ) then + ! corrector + potential_dot_acoustic(:) = potential_dot_acoustic(:) + deltatover2*potential_dot_dot_acoustic(:) - ! adjoint simulations - if (SIMULATION_TYPE == 3) & - b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:) + ! adjoint simulations + if (SIMULATION_TYPE == 3) & + b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:) + else + ! on GPU + call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,SIMULATION_TYPE,b_deltatover2) + endif ! updates potential_dot_acoustic and potential_dot_dot_acoustic inside PML region for plotting seismograms/movies if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + ! transfers potentials to CPU + if(GPU_MODE) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + call PML_acoustic_update_potentials(NGLOB_AB,NSPEC_AB, & ibool,ispec_is_acoustic, & potential_dot_acoustic,potential_dot_dot_acoustic,& @@ -336,29 +468,44 @@ subroutine compute_forces_acoustic() chi1,chi2,chi2_t,chi3,& chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,& chi1_dot_dot,chi3_dot_dot,chi4_dot_dot) + + ! transfers potentials to GPU + if(GPU_MODE) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + endif ! enforces free surface (zeroes potentials at free surface) - call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, & + if(.NOT. GPU_MODE) then + ! on CPU + call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_AB, & potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & num_free_surface_faces,ispec_is_acoustic) - if( SIMULATION_TYPE /= 1 ) then - potential_acoustic_adj_coupling(:) = potential_acoustic(:) & + if( SIMULATION_TYPE /= 1 ) then + potential_acoustic_adj_coupling(:) = potential_acoustic(:) & + deltat * potential_dot_acoustic(:) & + deltatsqover2 * potential_dot_dot_acoustic(:) - endif + endif - ! adjoint simulations - if (SIMULATION_TYPE == 3) & - call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, & + ! adjoint simulations + if (SIMULATION_TYPE == 3) & + call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT, & b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & num_free_surface_faces,ispec_is_acoustic) + else + ! on GPU + call acoustic_enforce_free_surf_cuda(Mesh_pointer,SIMULATION_TYPE,ABSORB_FREE_SURFACE) + endif if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + ! enforces free surface on PML elements + if( GPU_MODE ) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + call PML_acoustic_enforce_free_srfc(NSPEC_AB,NGLOB_AB, & potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, & ibool,free_surface_ijk,free_surface_ispec, & @@ -369,6 +516,9 @@ subroutine compute_forces_acoustic() chi1_dot,chi2_t_dot,chi3_dot,chi4_dot,& chi1_dot_dot,chi2_t_dot_dot,& chi3_dot_dot,chi4_dot_dot) + + if( GPU_MODE ) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) endif end subroutine compute_forces_acoustic diff --git a/src/specfem3D/compute_forces_elastic.f90 b/src/specfem3D/compute_forces_elastic.F90 similarity index 64% rename from src/specfem3D/compute_forces_elastic.f90 rename to src/specfem3D/compute_forces_elastic.F90 index d76dee796..1ff63ce40 100644 --- a/src/specfem3D/compute_forces_elastic.f90 +++ b/src/specfem3D/compute_forces_elastic.F90 @@ -50,72 +50,107 @@ subroutine compute_forces_elastic() ! elastic term - if(USE_DEVILLE_PRODUCTS) then - ! uses Deville (2002) optimizations - call compute_forces_elastic_Dev_sim1(iphase) + if( .NOT. GPU_MODE ) then + if(USE_DEVILLE_PRODUCTS) then + ! uses Deville (2002) optimizations + call compute_forces_elastic_Dev_sim1(iphase) - ! adjoint simulations: backward/reconstructed wavefield - if( SIMULATION_TYPE == 3 ) & - call compute_forces_elastic_Dev_sim3(iphase) + ! adjoint simulations: backward/reconstructed wavefield + if( SIMULATION_TYPE == 3 ) & + call compute_forces_elastic_Dev_sim3(iphase) + + else + ! no optimizations used + call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_yy,hprime_zz, & + hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,& + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION,& + one_minus_sum_beta,factor_common, & + alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy,& + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic ) + + ! adjoint simulations: backward/reconstructed wavefield + if( SIMULATION_TYPE == 3 ) & + call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,& + b_displ,b_accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_yy,hprime_zz, & + hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,& + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION,& + one_minus_sum_beta,factor_common, & + b_alphaval,b_betaval,b_gammaval,& + NSPEC_ATTENUATION_AB, & + b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, & + b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,& + b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& + is_moho_top,is_moho_bot, & + b_dsdx_top,b_dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic ) + + endif else - ! no optimizations used - call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_yy,hprime_zz, & - hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,& - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION,& - one_minus_sum_beta,factor_common, & - alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy,& - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic ) - - ! adjoint simulations: backward/reconstructed wavefield - if( SIMULATION_TYPE == 3 ) & - call compute_forces_elastic_noDev( iphase, NSPEC_AB,NGLOB_AB,& - b_displ,b_accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_yy,hprime_zz, & - hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,& - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION,& - one_minus_sum_beta,factor_common, & - b_alphaval,b_betaval,b_gammaval,& - NSPEC_ATTENUATION_AB, & - b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, & - b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,& - b_epsilondev_xz,b_epsilondev_yz,b_epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& - is_moho_top,is_moho_bot, & - b_dsdx_top,b_dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic ) + ! on GPU + ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations + call compute_forces_elastic_cuda(Mesh_pointer, iphase, & + nspec_outer_elastic, & + nspec_inner_elastic, & + SIMULATION_TYPE, & + COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY) + + if(phase_is_inner .eqv. .true.) then + ! while Inner elements compute "Kernel_2", we wait for MPI to + ! finish and transfer the boundary terms to the device + ! asynchronously + + !daniel: todo - this avoids calling the fortran vector send from CUDA routine + ! wait for asynchronous copy to finish + call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh) + ! sends mpi buffers + call assemble_MPI_vector_send_cuda(NPROC, & + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh) + + ! transfers mpi buffers onto GPU + call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh,& + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,& + request_recv_vector_ext_mesh) + endif ! inner elements + + endif ! GPU_MODE - endif ! adds elastic absorbing boundary term to acceleration (Stacey conditions) if(ABSORBING_CONDITIONS) & @@ -127,48 +162,55 @@ subroutine compute_forces_elastic() veloc,rho_vp,rho_vs, & ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, & NSTEP,it,NGLOB_ADJOINT,b_accel, & - b_num_abs_boundary_faces,b_reclen_field,b_absorb_field ) + b_num_abs_boundary_faces,b_reclen_field,b_absorb_field,& + GPU_MODE,Mesh_pointer) ! acoustic coupling if( ACOUSTIC_SIMULATION ) then if( num_coupling_ac_el_faces > 0 ) then - if( SIMULATION_TYPE == 1 ) then - ! forward definition: pressure=-potential_dot_dot - call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, & - ibool,accel,potential_dot_dot_acoustic, & - num_coupling_ac_el_faces, & - coupling_ac_el_ispec,coupling_ac_el_ijk, & - coupling_ac_el_normal, & - coupling_ac_el_jacobian2Dw, & - ispec_is_inner,phase_is_inner) - else - ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield - ! adoint definition: pressure^\dagger=potential^\dagger - call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, & - ibool,accel,-potential_acoustic_adj_coupling, & - num_coupling_ac_el_faces, & - coupling_ac_el_ispec,coupling_ac_el_ijk, & - coupling_ac_el_normal, & - coupling_ac_el_jacobian2Dw, & - ispec_is_inner,phase_is_inner) - endif + if( .NOT. GPU_MODE ) then + if( SIMULATION_TYPE == 1 ) then + ! forward definition: pressure=-potential_dot_dot + call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, & + ibool,accel,potential_dot_dot_acoustic, & + num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal, & + coupling_ac_el_jacobian2Dw, & + ispec_is_inner,phase_is_inner) + else + ! handles adjoint runs coupling between adjoint potential and adjoint elastic wavefield + ! adoint definition: pressure^\dagger=potential^\dagger + call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, & + ibool,accel,-potential_acoustic_adj_coupling, & + num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal, & + coupling_ac_el_jacobian2Dw, & + ispec_is_inner,phase_is_inner) + endif ! adjoint simulations if( SIMULATION_TYPE == 3 ) & call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, & - ibool,b_accel,b_potential_dot_dot_acoustic, & - num_coupling_ac_el_faces, & - coupling_ac_el_ispec,coupling_ac_el_ijk, & - coupling_ac_el_normal, & - coupling_ac_el_jacobian2Dw, & - ispec_is_inner,phase_is_inner) - endif + ibool,b_accel,b_potential_dot_dot_acoustic, & + num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal, & + coupling_ac_el_jacobian2Dw, & + ispec_is_inner,phase_is_inner) + + else + ! on GPU + call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, & + num_coupling_ac_el_faces,SIMULATION_TYPE) + endif ! GPU_MODE + endif ! num_coupling_ac_el_faces endif ! poroelastic coupling - !print *,'entering poro counpling' - if( POROELASTIC_SIMULATION ) & + if( POROELASTIC_SIMULATION ) then call compute_coupling_elastic_po(NSPEC_AB,NGLOB_AB,ibool,& displs_poroelastic,displw_poroelastic,& xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & @@ -188,7 +230,7 @@ subroutine compute_forces_elastic() coupling_el_po_normal, & coupling_el_po_jacobian2Dw, & ispec_is_inner,phase_is_inner) - !print *,'ok poro counpling' + endif ! adds source term (single-force/moment-tensor solution) call compute_add_sources_elastic( NSPEC_AB,NGLOB_AB,accel, & @@ -199,43 +241,83 @@ subroutine compute_forces_elastic() ispec_is_elastic,SIMULATION_TYPE,NSTEP,NGLOB_ADJOINT, & nrec,islice_selected_rec,ispec_selected_rec, & nadj_rec_local,adj_sourcearrays,b_accel, & - NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY ) + NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, & + GPU_MODE, Mesh_pointer ) -! assemble all the contributions between slices using MPI + ! assemble all the contributions between slices using MPI if( phase_is_inner .eqv. .false. ) then - ! sends accel values to corresponding MPI interface neighbors - call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, & - buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& - my_neighbours_ext_mesh, & - request_send_vector_ext_mesh,request_recv_vector_ext_mesh) - - ! adjoint simulations - if( SIMULATION_TYPE == 3 ) then - call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, & - b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, & - num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& - my_neighbours_ext_mesh, & - b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh) - endif !adjoint + ! sends accel values to corresponding MPI interface neighbors + if(.NOT. GPU_MODE) then + call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, & + buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh) + else ! GPU_MODE==1 + + ! transfers boundary region to host asynchronously. The + ! MPI-send is done from within compute_forces_elastic_cuda, + ! once the inner element kernels are launched, and the + ! memcpy has finished. see compute_forces_elastic_cuda:1655 + call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic) + + endif ! GPU_MODE + + ! adjoint simulations + if( SIMULATION_TYPE == 3 ) then + if(.NOT. GPU_MODE) then + call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, & + b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh) + else ! GPU_MODE == 1 + call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,& + b_buffer_send_vector_ext_mesh,& + num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,& + nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel + call assemble_MPI_vector_send_cuda(NPROC, & + b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, & + num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,& + my_neighbours_ext_mesh, & + b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh) + + endif ! GPU + endif !adjoint else ! waits for send/receive requests to be completed and assembles values - call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,accel, & - buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& - max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - request_send_vector_ext_mesh,request_recv_vector_ext_mesh) - + if(.NOT. GPU_MODE) then + call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,accel, & + buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh) + else ! GPU_MODE == 1 + call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,& + buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1) + endif ! adjoint simulations if( SIMULATION_TYPE == 3 ) then - call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_accel, & - b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& - max_nibool_interfaces_ext_mesh, & - nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & - b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh) + if(.NOT. GPU_MODE) then + call assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_accel, & + b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh) + else ! GPU_MODE == 1 + call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,& + b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,& + max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, & + b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3) + endif endif !adjoint endif @@ -246,28 +328,35 @@ subroutine compute_forces_elastic() !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used. !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well. - enddo - -! multiplies with inverse of mass matrix (note: rmass has been inverted already) - accel(1,:) = accel(1,:)*rmass(:) - accel(2,:) = accel(2,:)*rmass(:) - accel(3,:) = accel(3,:)*rmass(:) - - ! adjoint simulations - if (SIMULATION_TYPE == 3) then - b_accel(1,:) = b_accel(1,:)*rmass(:) - b_accel(2,:) = b_accel(2,:)*rmass(:) - b_accel(3,:) = b_accel(3,:)*rmass(:) - endif !adjoint - + enddo + + ! multiplies with inverse of mass matrix (note: rmass has been inverted already) + if(.NOT. GPU_MODE) then + accel(1,:) = accel(1,:)*rmass(:) + accel(2,:) = accel(2,:)*rmass(:) + accel(3,:) = accel(3,:)*rmass(:) + ! adjoint simulations + if (SIMULATION_TYPE == 3) then + b_accel(1,:) = b_accel(1,:)*rmass(:) + b_accel(2,:) = b_accel(2,:)*rmass(:) + b_accel(3,:) = b_accel(3,:)*rmass(:) + endif !adjoint + else ! GPU_MODE == 1 + call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS) + endif ! updates acceleration with ocean load term if(OCEANS) then - call elastic_ocean_load(NSPEC_AB,NGLOB_AB, & + if( .NOT. GPU_MODE ) then + call elastic_ocean_load(NSPEC_AB,NGLOB_AB, & ibool,rmass,rmass_ocean_load,accel, & free_surface_normal,free_surface_ijk,free_surface_ispec, & num_free_surface_faces,SIMULATION_TYPE, & NGLOB_ADJOINT,b_accel) + else + ! on GPU + call elastic_ocean_load_cuda(Mesh_pointer,SIMULATION_TYPE) + endif endif ! updates velocities @@ -286,10 +375,14 @@ subroutine compute_forces_elastic() ! ! corrector: ! updates the velocity term which requires a(t+delta) - veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:) - - ! adjoint simulations - if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:) +! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass + if(.NOT. GPU_MODE) then + veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:) + ! adjoint simulations + if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:) + else ! GPU_MODE == 1 + if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2) + endif end subroutine compute_forces_elastic @@ -422,30 +515,65 @@ subroutine compute_forces_elastic_Dev_sim1(iphase) select case(NGLLX) case (5) + +!---------------------------------------------------------------------------------------------- + +! OpenMP routine flag for testing & benchmarking forward runs only +! configure additional flag, e.g.: FLAGS_NO_CHECK="-O3 -DOPENMP_MODE -openmp" + +!---------------------------------------------------------------------------------------------- +#ifdef OPENMP_MODE + call compute_forces_elastic_Dev_openmp(iphase, NSPEC_AB,NGLOB_AB,displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common, & + alphaval,betaval,gammaval, & + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,& + phase_ispec_inner_elastic,& + num_colors_outer_elastic,num_colors_inner_elastic) +#else call compute_forces_elastic_Dev_5p(iphase, NSPEC_AB,NGLOB_AB,displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common, & - alphaval,betaval,gammaval, & - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic ) + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common, & + alphaval,betaval,gammaval, & + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE, COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT,& + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic ) +#endif case (6) call compute_forces_elastic_Dev_6p(iphase, NSPEC_AB,NGLOB_AB,displ,accel, & diff --git a/src/specfem3D/compute_forces_elastic_Dev.f90 b/src/specfem3D/compute_forces_elastic_Dev.f90 index 3ac3a07ce..632ac70c0 100644 --- a/src/specfem3D/compute_forces_elastic_Dev.f90 +++ b/src/specfem3D/compute_forces_elastic_Dev.f90 @@ -24,8 +24,9 @@ ! !===================================================================== +! Deville routine for NGLL == 5 (default) -subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, & + subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, & displ,accel, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & hprime_xx,hprime_xxT, & @@ -655,3454 +656,5 @@ subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, & enddo ! spectral element loop -end subroutine compute_forces_elastic_Dev_5p + end subroutine compute_forces_elastic_Dev_5p -! -!===================================================================== -! - -subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, & - displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT, & - hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic) - - -! computes elastic tensor term - - use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & - N_SLS,SAVE_MOHO_MESH, & - ONE_THIRD,FOUR_THIRDS,m1,m2 - implicit none - - integer :: NSPEC_AB,NGLOB_AB - -! displacement and acceleration - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel - -! arrays with mesh parameters per slice - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - kappastore,mustore,jacobian - -! array with derivatives of Lagrange polynomials and precalculated products - real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT - real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz - real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz - -! memory variables and standard linear solids for attenuation - logical :: ATTENUATION - logical :: COMPUTE_AND_STORE_STRAIN - integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT - integer :: NSPEC_ATTENUATION_AB - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta - real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common - real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & - R_xx,R_yy,R_xy,R_xz,R_yz - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & - epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz - real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 - -! anisotropy - logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store - - integer :: iphase - integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - -! adjoint simulations - integer :: SIMULATION_TYPE - integer :: NSPEC_BOUN,NSPEC2D_MOHO - - ! moho kernel - real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & - dsdx_top,dsdx_bot - logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot - integer :: ispec2D_moho_top, ispec2D_moho_bot - -! local parameters - real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, & - newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 - real(kind=CUSTOM_REAL), dimension(6,6,6) :: & - tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 - - ! manually inline the calls to the Deville et al. (2002) routines - real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points - real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points - real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points - - equivalence(dummyx_loc,B1_m1_m2_6points) - equivalence(dummyy_loc,B2_m1_m2_6points) - equivalence(dummyz_loc,B3_m1_m2_6points) - equivalence(tempx1,C1_m1_m2_6points) - equivalence(tempy1,C2_m1_m2_6points) - equivalence(tempz1,C3_m1_m2_6points) - equivalence(newtempx1,E1_m1_m2_6points) - equivalence(newtempy1,E2_m1_m2_6points) - equivalence(newtempz1,E3_m1_m2_6points) - - real(kind=CUSTOM_REAL), dimension(36,6) :: & - A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points - real(kind=CUSTOM_REAL), dimension(36,6) :: & - C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points - real(kind=CUSTOM_REAL), dimension(36,6) :: & - E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points - - equivalence(dummyx_loc,A1_mxm_m2_m1_6points) - equivalence(dummyy_loc,A2_mxm_m2_m1_6points) - equivalence(dummyz_loc,A3_mxm_m2_m1_6points) - equivalence(tempx3,C1_mxm_m2_m1_6points) - equivalence(tempy3,C2_mxm_m2_m1_6points) - equivalence(tempz3,C3_mxm_m2_m1_6points) - equivalence(newtempx3,E1_mxm_m2_m1_6points) - equivalence(newtempy3,E2_mxm_m2_m1_6points) - equivalence(newtempz3,E3_mxm_m2_m1_6points) - - ! local attenuation parameters - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & - epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc - real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 - real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc - real(kind=CUSTOM_REAL) Sn,Snp1 - real(kind=CUSTOM_REAL) templ - - real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl - real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl - - real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl - real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl - - real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy - - real(kind=CUSTOM_REAL) fac1,fac2,fac3 - - real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul - real(kind=CUSTOM_REAL) kappal - - ! local anisotropy parameters - real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& - c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 - - integer i_SLS,imodulo_N_SLS - integer ispec,iglob,ispec_p,num_elements - integer i,j,k - - imodulo_N_SLS = mod(N_SLS,3) - - ! choses inner/outer elements - if( iphase == 1 ) then - num_elements = nspec_outer_elastic - else - num_elements = nspec_inner_elastic - endif - - do ispec_p = 1,num_elements - - ! returns element id from stored element list - ispec = phase_ispec_inner_elastic(ispec_p,iphase) - - ! adjoint simulations: moho kernel - if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - ispec2D_moho_top = ispec2D_moho_top + 1 - else if (is_moho_bot(ispec)) then - ispec2D_moho_bot = ispec2D_moho_bot + 1 - endif - endif ! adjoint - - ! stores displacment values in local array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - dummyx_loc(i,j,k) = displ(1,iglob) - dummyy_loc(i,j,k) = displ(2,iglob) - dummyz_loc(i,j,k) = displ(3,iglob) - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) - do j=1,m2 - do i=1,m1 - C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + & - hprime_xx(i,2)*B1_m1_m2_6points(2,j) + & - hprime_xx(i,3)*B1_m1_m2_6points(3,j) + & - hprime_xx(i,4)*B1_m1_m2_6points(4,j) + & - hprime_xx(i,5)*B1_m1_m2_6points(5,j) + & - hprime_xx(i,6)*B1_m1_m2_6points(6,j) - C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + & - hprime_xx(i,2)*B2_m1_m2_6points(2,j) + & - hprime_xx(i,3)*B2_m1_m2_6points(3,j) + & - hprime_xx(i,4)*B2_m1_m2_6points(4,j) + & - hprime_xx(i,5)*B2_m1_m2_6points(5,j) + & - hprime_xx(i,6)*B2_m1_m2_6points(6,j) - C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + & - hprime_xx(i,2)*B3_m1_m2_6points(2,j) + & - hprime_xx(i,3)*B3_m1_m2_6points(3,j) + & - hprime_xx(i,4)*B3_m1_m2_6points(4,j) + & - hprime_xx(i,5)*B3_m1_m2_6points(5,j) + & - hprime_xx(i,6)*B3_m1_m2_6points(6,j) - enddo - enddo - - ! call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & - ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) - do j=1,m1 - do i=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & - dummyx_loc(i,2,k)*hprime_xxT(2,j) + & - dummyx_loc(i,3,k)*hprime_xxT(3,j) + & - dummyx_loc(i,4,k)*hprime_xxT(4,j) + & - dummyx_loc(i,5,k)*hprime_xxT(5,j) + & - dummyx_loc(i,6,k)*hprime_xxT(6,j) - tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & - dummyy_loc(i,2,k)*hprime_xxT(2,j) + & - dummyy_loc(i,3,k)*hprime_xxT(3,j) + & - dummyy_loc(i,4,k)*hprime_xxT(4,j) + & - dummyy_loc(i,5,k)*hprime_xxT(5,j) + & - dummyy_loc(i,6,k)*hprime_xxT(6,j) - tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & - dummyz_loc(i,2,k)*hprime_xxT(2,j) + & - dummyz_loc(i,3,k)*hprime_xxT(3,j) + & - dummyz_loc(i,4,k)*hprime_xxT(4,j) + & - dummyz_loc(i,5,k)*hprime_xxT(5,j) + & - dummyz_loc(i,6,k)*hprime_xxT(6,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) - do j=1,m1 - do i=1,m2 - C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & - A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & - A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & - A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & - A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & - A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) - C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & - A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & - A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & - A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & - A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & - A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) - C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & - A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & - A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & - A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & - A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & - A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! get derivatives of ux, uy and uz with respect to x, y and z - xixl = xix(i,j,k,ispec) - xiyl = xiy(i,j,k,ispec) - xizl = xiz(i,j,k,ispec) - etaxl = etax(i,j,k,ispec) - etayl = etay(i,j,k,ispec) - etazl = etaz(i,j,k,ispec) - gammaxl = gammax(i,j,k,ispec) - gammayl = gammay(i,j,k,ispec) - gammazl = gammaz(i,j,k,ispec) - jacobianl = jacobian(i,j,k,ispec) - - duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) - duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) - duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) - - duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) - duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) - duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) - - duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) - duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) - duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) - - ! save strain on the Moho boundary - if (SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl - dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl - dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl - dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl - dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl - dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl - dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl - dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl - dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl - else if (is_moho_bot(ispec)) then - dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl - dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl - dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl - dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl - dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl - dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl - dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl - dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl - dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl - endif - endif - - ! precompute some sums to save CPU time - duxdxl_plus_duydyl = duxdxl + duydyl - duxdxl_plus_duzdzl = duxdxl + duzdzl - duydyl_plus_duzdzl = duydyl + duzdzl - duxdyl_plus_duydxl = duxdyl + duydxl - duzdxl_plus_duxdzl = duzdxl + duxdzl - duzdyl_plus_duydzl = duzdyl + duydzl - - ! computes deviatoric strain attenuation and/or for kernel calculations - if (COMPUTE_AND_STORE_STRAIN) then - templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) - if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ - epsilondev_xx_loc(i,j,k) = duxdxl - templ - epsilondev_yy_loc(i,j,k) = duydyl - templ - epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl - epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl - epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl - endif - - kappal = kappastore(i,j,k,ispec) - mul = mustore(i,j,k,ispec) - - ! attenuation - if(ATTENUATION) then - ! use unrelaxed parameters if attenuation - mul = mul * one_minus_sum_beta(i,j,k,ispec) - endif - - ! full anisotropic case, stress calculations - if(ANISOTROPY) then - c11 = c11store(i,j,k,ispec) - c12 = c12store(i,j,k,ispec) - c13 = c13store(i,j,k,ispec) - c14 = c14store(i,j,k,ispec) - c15 = c15store(i,j,k,ispec) - c16 = c16store(i,j,k,ispec) - c22 = c22store(i,j,k,ispec) - c23 = c23store(i,j,k,ispec) - c24 = c24store(i,j,k,ispec) - c25 = c25store(i,j,k,ispec) - c26 = c26store(i,j,k,ispec) - c33 = c33store(i,j,k,ispec) - c34 = c34store(i,j,k,ispec) - c35 = c35store(i,j,k,ispec) - c36 = c36store(i,j,k,ispec) - c44 = c44store(i,j,k,ispec) - c45 = c45store(i,j,k,ispec) - c46 = c46store(i,j,k,ispec) - c55 = c55store(i,j,k,ispec) - c56 = c56store(i,j,k,ispec) - c66 = c66store(i,j,k,ispec) - - sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & - c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl - sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & - c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl - sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & - c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl - sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & - c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl - sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & - c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl - sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & - c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl - - else - - ! isotropic case - lambdalplus2mul = kappal + FOUR_THIRDS * mul - lambdal = lambdalplus2mul - 2.*mul - - ! compute stress sigma - sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl - sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl - sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl - - sigma_xy = mul*duxdyl_plus_duydxl - sigma_xz = mul*duzdxl_plus_duxdzl - sigma_yz = mul*duzdyl_plus_duydzl - - endif ! ANISOTROPY - - ! subtract memory variables if attenuation - if(ATTENUATION) then -! way 1 -! do i_sls = 1,N_SLS -! R_xx_val = R_xx(i,j,k,ispec,i_sls) -! R_yy_val = R_yy(i,j,k,ispec,i_sls) -! sigma_xx = sigma_xx - R_xx_val -! sigma_yy = sigma_yy - R_yy_val -! sigma_zz = sigma_zz + R_xx_val + R_yy_val -! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) -! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) -! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) -! enddo - -! way 2 -! note: this should help compilers to pipeline the code and make better use of the cache; -! depending on compilers, it can further decrease the computation time by ~ 30%. -! by default, N_SLS = 3, therefore we take steps of 3 - if(imodulo_N_SLS >= 1) then - do i_sls = 1,imodulo_N_SLS - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - enddo - endif - - if(N_SLS >= imodulo_N_SLS+1) then - do i_sls = imodulo_N_SLS+1,N_SLS,3 - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - - R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) - R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) - sigma_xx = sigma_xx - R_xx_val2 - sigma_yy = sigma_yy - R_yy_val2 - sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) - - R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) - R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) - sigma_xx = sigma_xx - R_xx_val3 - sigma_yy = sigma_yy - R_yy_val3 - sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) - enddo - endif - - - endif - - ! define symmetric components of sigma - sigma_yx = sigma_xy - sigma_zx = sigma_xz - sigma_zy = sigma_yz - - ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) - tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x - tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y - tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z - - tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x - tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y - tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z - - tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x - tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y - tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z - - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) - do j=1,m2 - do i=1,m1 - E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + & - hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + & - hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + & - hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + & - hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + & - hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j) - E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + & - hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + & - hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + & - hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + & - hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + & - hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j) - E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + & - hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + & - hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + & - hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + & - hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + & - hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j) - enddo - enddo - - ! call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & - ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) - do i=1,m1 - do j=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & - tempx2(i,2,k)*hprimewgll_xx(2,j) + & - tempx2(i,3,k)*hprimewgll_xx(3,j) + & - tempx2(i,4,k)*hprimewgll_xx(4,j) + & - tempx2(i,5,k)*hprimewgll_xx(5,j) + & - tempx2(i,6,k)*hprimewgll_xx(6,j) - newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & - tempy2(i,2,k)*hprimewgll_xx(2,j) + & - tempy2(i,3,k)*hprimewgll_xx(3,j) + & - tempy2(i,4,k)*hprimewgll_xx(4,j) + & - tempy2(i,5,k)*hprimewgll_xx(5,j) + & - tempy2(i,6,k)*hprimewgll_xx(6,j) - newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & - tempz2(i,2,k)*hprimewgll_xx(2,j) + & - tempz2(i,3,k)*hprimewgll_xx(3,j) + & - tempz2(i,4,k)*hprimewgll_xx(4,j) + & - tempz2(i,5,k)*hprimewgll_xx(5,j) + & - tempz2(i,6,k)*hprimewgll_xx(6,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) - do j=1,m1 - do i=1,m2 - E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & - C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & - C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & - C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & - C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & - C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) - E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & - C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & - C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & - C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & - C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & - C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) - E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & - C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & - C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & - C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & - C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & - C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - - fac1 = wgllwgll_yz(j,k) - fac2 = wgllwgll_xz(i,k) - fac3 = wgllwgll_xy(i,j) - - ! sum contributions from each element to the global mesh using indirect addressing - iglob = ibool(i,j,k,ispec) - accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & - fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) - accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & - fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) - accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & - fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) - - ! update memory variables based upon the Runge-Kutta scheme - if(ATTENUATION) then - - ! use Runge-Kutta scheme to march in time - do i_sls = 1,N_SLS - - factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) - - alphaval_loc = alphaval(i_sls) - betaval_loc = betaval(i_sls) - gammaval_loc = gammaval(i_sls) - - ! term in xx - Sn = factor_loc * epsilondev_xx(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) - R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yy - Sn = factor_loc * epsilondev_yy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) - R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in zz not computed since zero trace - ! term in xy - Sn = factor_loc * epsilondev_xy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) - R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in xz - Sn = factor_loc * epsilondev_xz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) - R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yz - Sn = factor_loc * epsilondev_yz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) - R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - - enddo ! end of loop on memory variables - - endif ! end attenuation - - enddo - enddo - enddo - - ! save deviatoric strain for Runge-Kutta scheme - if ( COMPUTE_AND_STORE_STRAIN ) then - epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) - epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) - epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) - epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) - epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) - endif - - enddo ! spectral element loop - -end subroutine compute_forces_elastic_Dev_6p - -! -!===================================================================== -! - -subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, & - displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT, & - hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic) - - -! computes elastic tensor term - - use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & - N_SLS,SAVE_MOHO_MESH, & - ONE_THIRD,FOUR_THIRDS,m1,m2 - implicit none - - integer :: NSPEC_AB,NGLOB_AB - -! displacement and acceleration - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel - -! arrays with mesh parameters per slice - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - kappastore,mustore,jacobian - -! array with derivatives of Lagrange polynomials and precalculated products - real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT - real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz - real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz - -! memory variables and standard linear solids for attenuation - logical :: ATTENUATION - logical :: COMPUTE_AND_STORE_STRAIN - integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT - integer :: NSPEC_ATTENUATION_AB - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta - real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common - real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & - R_xx,R_yy,R_xy,R_xz,R_yz - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & - epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz - real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 - -! anisotropy - logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store - - integer :: iphase - integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - -! adjoint simulations - integer :: SIMULATION_TYPE - integer :: NSPEC_BOUN,NSPEC2D_MOHO - - ! moho kernel - real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & - dsdx_top,dsdx_bot - logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot - integer :: ispec2D_moho_top, ispec2D_moho_bot - -! local parameters - real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, & - newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 - real(kind=CUSTOM_REAL), dimension(7,7,7) :: & - tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 - - ! manually inline the calls to the Deville et al. (2002) routines - real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points - real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points - real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points - - equivalence(dummyx_loc,B1_m1_m2_7points) - equivalence(dummyy_loc,B2_m1_m2_7points) - equivalence(dummyz_loc,B3_m1_m2_7points) - equivalence(tempx1,C1_m1_m2_7points) - equivalence(tempy1,C2_m1_m2_7points) - equivalence(tempz1,C3_m1_m2_7points) - equivalence(newtempx1,E1_m1_m2_7points) - equivalence(newtempy1,E2_m1_m2_7points) - equivalence(newtempz1,E3_m1_m2_7points) - - real(kind=CUSTOM_REAL), dimension(49,7) :: & - A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points - real(kind=CUSTOM_REAL), dimension(49,7) :: & - C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points - real(kind=CUSTOM_REAL), dimension(49,7) :: & - E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points - - equivalence(dummyx_loc,A1_mxm_m2_m1_7points) - equivalence(dummyy_loc,A2_mxm_m2_m1_7points) - equivalence(dummyz_loc,A3_mxm_m2_m1_7points) - equivalence(tempx3,C1_mxm_m2_m1_7points) - equivalence(tempy3,C2_mxm_m2_m1_7points) - equivalence(tempz3,C3_mxm_m2_m1_7points) - equivalence(newtempx3,E1_mxm_m2_m1_7points) - equivalence(newtempy3,E2_mxm_m2_m1_7points) - equivalence(newtempz3,E3_mxm_m2_m1_7points) - - ! local attenuation parameters - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & - epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc - real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 - real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc - real(kind=CUSTOM_REAL) Sn,Snp1 - real(kind=CUSTOM_REAL) templ - - real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl - real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl - - real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl - real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl - - real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy - - real(kind=CUSTOM_REAL) fac1,fac2,fac3 - - real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul - real(kind=CUSTOM_REAL) kappal - - ! local anisotropy parameters - real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& - c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 - - integer i_SLS,imodulo_N_SLS - integer ispec,iglob,ispec_p,num_elements - integer i,j,k - - imodulo_N_SLS = mod(N_SLS,3) - - ! choses inner/outer elements - if( iphase == 1 ) then - num_elements = nspec_outer_elastic - else - num_elements = nspec_inner_elastic - endif - - do ispec_p = 1,num_elements - - ! returns element id from stored element list - ispec = phase_ispec_inner_elastic(ispec_p,iphase) - - ! adjoint simulations: moho kernel - if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - ispec2D_moho_top = ispec2D_moho_top + 1 - else if (is_moho_bot(ispec)) then - ispec2D_moho_bot = ispec2D_moho_bot + 1 - endif - endif ! adjoint - - ! stores displacment values in local array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - dummyx_loc(i,j,k) = displ(1,iglob) - dummyy_loc(i,j,k) = displ(2,iglob) - dummyz_loc(i,j,k) = displ(3,iglob) - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) - do j=1,m2 - do i=1,m1 - C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + & - hprime_xx(i,2)*B1_m1_m2_7points(2,j) + & - hprime_xx(i,3)*B1_m1_m2_7points(3,j) + & - hprime_xx(i,4)*B1_m1_m2_7points(4,j) + & - hprime_xx(i,5)*B1_m1_m2_7points(5,j) + & - hprime_xx(i,6)*B1_m1_m2_7points(6,j) + & - hprime_xx(i,7)*B1_m1_m2_7points(7,j) - C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + & - hprime_xx(i,2)*B2_m1_m2_7points(2,j) + & - hprime_xx(i,3)*B2_m1_m2_7points(3,j) + & - hprime_xx(i,4)*B2_m1_m2_7points(4,j) + & - hprime_xx(i,5)*B2_m1_m2_7points(5,j) + & - hprime_xx(i,6)*B2_m1_m2_7points(6,j) + & - hprime_xx(i,7)*B2_m1_m2_7points(7,j) - C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + & - hprime_xx(i,2)*B3_m1_m2_7points(2,j) + & - hprime_xx(i,3)*B3_m1_m2_7points(3,j) + & - hprime_xx(i,4)*B3_m1_m2_7points(4,j) + & - hprime_xx(i,5)*B3_m1_m2_7points(5,j) + & - hprime_xx(i,6)*B3_m1_m2_7points(6,j) + & - hprime_xx(i,7)*B3_m1_m2_7points(7,j) - enddo - enddo - - ! call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & - ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) - do j=1,m1 - do i=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & - dummyx_loc(i,2,k)*hprime_xxT(2,j) + & - dummyx_loc(i,3,k)*hprime_xxT(3,j) + & - dummyx_loc(i,4,k)*hprime_xxT(4,j) + & - dummyx_loc(i,5,k)*hprime_xxT(5,j) + & - dummyx_loc(i,6,k)*hprime_xxT(6,j) + & - dummyx_loc(i,7,k)*hprime_xxT(7,j) - tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & - dummyy_loc(i,2,k)*hprime_xxT(2,j) + & - dummyy_loc(i,3,k)*hprime_xxT(3,j) + & - dummyy_loc(i,4,k)*hprime_xxT(4,j) + & - dummyy_loc(i,5,k)*hprime_xxT(5,j) + & - dummyy_loc(i,6,k)*hprime_xxT(6,j) + & - dummyy_loc(i,7,k)*hprime_xxT(7,j) - tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & - dummyz_loc(i,2,k)*hprime_xxT(2,j) + & - dummyz_loc(i,3,k)*hprime_xxT(3,j) + & - dummyz_loc(i,4,k)*hprime_xxT(4,j) + & - dummyz_loc(i,5,k)*hprime_xxT(5,j) + & - dummyz_loc(i,6,k)*hprime_xxT(6,j) + & - dummyz_loc(i,7,k)*hprime_xxT(7,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) - do j=1,m1 - do i=1,m2 - C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & - A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & - A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & - A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & - A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & - A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & - A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) - C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & - A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & - A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & - A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & - A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & - A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & - A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) - C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & - A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & - A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & - A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & - A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & - A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & - A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! get derivatives of ux, uy and uz with respect to x, y and z - xixl = xix(i,j,k,ispec) - xiyl = xiy(i,j,k,ispec) - xizl = xiz(i,j,k,ispec) - etaxl = etax(i,j,k,ispec) - etayl = etay(i,j,k,ispec) - etazl = etaz(i,j,k,ispec) - gammaxl = gammax(i,j,k,ispec) - gammayl = gammay(i,j,k,ispec) - gammazl = gammaz(i,j,k,ispec) - jacobianl = jacobian(i,j,k,ispec) - - duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) - duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) - duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) - - duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) - duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) - duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) - - duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) - duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) - duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) - - ! save strain on the Moho boundary - if (SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl - dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl - dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl - dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl - dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl - dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl - dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl - dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl - dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl - else if (is_moho_bot(ispec)) then - dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl - dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl - dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl - dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl - dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl - dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl - dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl - dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl - dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl - endif - endif - - ! precompute some sums to save CPU time - duxdxl_plus_duydyl = duxdxl + duydyl - duxdxl_plus_duzdzl = duxdxl + duzdzl - duydyl_plus_duzdzl = duydyl + duzdzl - duxdyl_plus_duydxl = duxdyl + duydxl - duzdxl_plus_duxdzl = duzdxl + duxdzl - duzdyl_plus_duydzl = duzdyl + duydzl - - ! computes deviatoric strain attenuation and/or for kernel calculations - if (COMPUTE_AND_STORE_STRAIN) then - templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) - if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ - epsilondev_xx_loc(i,j,k) = duxdxl - templ - epsilondev_yy_loc(i,j,k) = duydyl - templ - epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl - epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl - epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl - endif - - kappal = kappastore(i,j,k,ispec) - mul = mustore(i,j,k,ispec) - - ! attenuation - if(ATTENUATION) then - ! use unrelaxed parameters if attenuation - mul = mul * one_minus_sum_beta(i,j,k,ispec) - endif - - ! full anisotropic case, stress calculations - if(ANISOTROPY) then - c11 = c11store(i,j,k,ispec) - c12 = c12store(i,j,k,ispec) - c13 = c13store(i,j,k,ispec) - c14 = c14store(i,j,k,ispec) - c15 = c15store(i,j,k,ispec) - c16 = c16store(i,j,k,ispec) - c22 = c22store(i,j,k,ispec) - c23 = c23store(i,j,k,ispec) - c24 = c24store(i,j,k,ispec) - c25 = c25store(i,j,k,ispec) - c26 = c26store(i,j,k,ispec) - c33 = c33store(i,j,k,ispec) - c34 = c34store(i,j,k,ispec) - c35 = c35store(i,j,k,ispec) - c36 = c36store(i,j,k,ispec) - c44 = c44store(i,j,k,ispec) - c45 = c45store(i,j,k,ispec) - c46 = c46store(i,j,k,ispec) - c55 = c55store(i,j,k,ispec) - c56 = c56store(i,j,k,ispec) - c66 = c66store(i,j,k,ispec) - - sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & - c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl - sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & - c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl - sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & - c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl - sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & - c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl - sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & - c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl - sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & - c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl - - else - - ! isotropic case - lambdalplus2mul = kappal + FOUR_THIRDS * mul - lambdal = lambdalplus2mul - 2.*mul - - ! compute stress sigma - sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl - sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl - sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl - - sigma_xy = mul*duxdyl_plus_duydxl - sigma_xz = mul*duzdxl_plus_duxdzl - sigma_yz = mul*duzdyl_plus_duydzl - - endif ! ANISOTROPY - - ! subtract memory variables if attenuation - if(ATTENUATION) then -! way 1 -! do i_sls = 1,N_SLS -! R_xx_val = R_xx(i,j,k,ispec,i_sls) -! R_yy_val = R_yy(i,j,k,ispec,i_sls) -! sigma_xx = sigma_xx - R_xx_val -! sigma_yy = sigma_yy - R_yy_val -! sigma_zz = sigma_zz + R_xx_val + R_yy_val -! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) -! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) -! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) -! enddo - -! way 2 -! note: this should help compilers to pipeline the code and make better use of the cache; -! depending on compilers, it can further decrease the computation time by ~ 30%. -! by default, N_SLS = 3, therefore we take steps of 3 - if(imodulo_N_SLS >= 1) then - do i_sls = 1,imodulo_N_SLS - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - enddo - endif - - if(N_SLS >= imodulo_N_SLS+1) then - do i_sls = imodulo_N_SLS+1,N_SLS,3 - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - - R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) - R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) - sigma_xx = sigma_xx - R_xx_val2 - sigma_yy = sigma_yy - R_yy_val2 - sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) - - R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) - R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) - sigma_xx = sigma_xx - R_xx_val3 - sigma_yy = sigma_yy - R_yy_val3 - sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) - enddo - endif - - - endif - - ! define symmetric components of sigma - sigma_yx = sigma_xy - sigma_zx = sigma_xz - sigma_zy = sigma_yz - - ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) - tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x - tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y - tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z - - tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x - tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y - tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z - - tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x - tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y - tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z - - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) - do j=1,m2 - do i=1,m1 - E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + & - hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + & - hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + & - hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + & - hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + & - hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + & - hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j) - E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + & - hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + & - hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + & - hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + & - hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + & - hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + & - hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j) - E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + & - hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + & - hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + & - hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + & - hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + & - hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + & - hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j) - enddo - enddo - - ! call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & - ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) - do i=1,m1 - do j=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & - tempx2(i,2,k)*hprimewgll_xx(2,j) + & - tempx2(i,3,k)*hprimewgll_xx(3,j) + & - tempx2(i,4,k)*hprimewgll_xx(4,j) + & - tempx2(i,5,k)*hprimewgll_xx(5,j) + & - tempx2(i,6,k)*hprimewgll_xx(6,j) + & - tempx2(i,7,k)*hprimewgll_xx(7,j) - newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & - tempy2(i,2,k)*hprimewgll_xx(2,j) + & - tempy2(i,3,k)*hprimewgll_xx(3,j) + & - tempy2(i,4,k)*hprimewgll_xx(4,j) + & - tempy2(i,5,k)*hprimewgll_xx(5,j) + & - tempy2(i,6,k)*hprimewgll_xx(6,j) + & - tempy2(i,7,k)*hprimewgll_xx(7,j) - newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & - tempz2(i,2,k)*hprimewgll_xx(2,j) + & - tempz2(i,3,k)*hprimewgll_xx(3,j) + & - tempz2(i,4,k)*hprimewgll_xx(4,j) + & - tempz2(i,5,k)*hprimewgll_xx(5,j) + & - tempz2(i,6,k)*hprimewgll_xx(6,j) + & - tempz2(i,7,k)*hprimewgll_xx(7,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) - do j=1,m1 - do i=1,m2 - E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & - C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & - C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & - C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & - C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & - C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & - C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) - E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & - C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & - C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & - C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & - C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & - C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & - C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) - E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & - C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & - C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & - C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & - C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & - C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & - C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - - fac1 = wgllwgll_yz(j,k) - fac2 = wgllwgll_xz(i,k) - fac3 = wgllwgll_xy(i,j) - - ! sum contributions from each element to the global mesh using indirect addressing - iglob = ibool(i,j,k,ispec) - accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & - fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) - accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & - fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) - accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & - fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) - - ! update memory variables based upon the Runge-Kutta scheme - if(ATTENUATION) then - - ! use Runge-Kutta scheme to march in time - do i_sls = 1,N_SLS - - factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) - - alphaval_loc = alphaval(i_sls) - betaval_loc = betaval(i_sls) - gammaval_loc = gammaval(i_sls) - - ! term in xx - Sn = factor_loc * epsilondev_xx(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) - R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yy - Sn = factor_loc * epsilondev_yy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) - R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in zz not computed since zero trace - ! term in xy - Sn = factor_loc * epsilondev_xy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) - R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in xz - Sn = factor_loc * epsilondev_xz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) - R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yz - Sn = factor_loc * epsilondev_yz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) - R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - - enddo ! end of loop on memory variables - - endif ! end attenuation - - enddo - enddo - enddo - - ! save deviatoric strain for Runge-Kutta scheme - if ( COMPUTE_AND_STORE_STRAIN ) then - epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) - epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) - epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) - epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) - epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) - endif - - enddo ! spectral element loop - -end subroutine compute_forces_elastic_Dev_7p - -! -!===================================================================== -! - -subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, & - displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT, & - hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic) - - -! computes elastic tensor term - - use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & - N_SLS,SAVE_MOHO_MESH, & - ONE_THIRD,FOUR_THIRDS,m1,m2 - implicit none - - integer :: NSPEC_AB,NGLOB_AB - -! displacement and acceleration - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel - -! arrays with mesh parameters per slice - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - kappastore,mustore,jacobian - -! array with derivatives of Lagrange polynomials and precalculated products - real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT - real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz - real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz - -! memory variables and standard linear solids for attenuation - logical :: ATTENUATION - logical :: COMPUTE_AND_STORE_STRAIN - integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT - integer :: NSPEC_ATTENUATION_AB - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta - real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common - real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & - R_xx,R_yy,R_xy,R_xz,R_yz - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & - epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz - real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 - -! anisotropy - logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store - - integer :: iphase - integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - -! adjoint simulations - integer :: SIMULATION_TYPE - integer :: NSPEC_BOUN,NSPEC2D_MOHO - - ! moho kernel - real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & - dsdx_top,dsdx_bot - logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot - integer :: ispec2D_moho_top, ispec2D_moho_bot - -! local parameters - real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, & - newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 - real(kind=CUSTOM_REAL), dimension(8,8,8) :: & - tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 - - ! manually inline the calls to the Deville et al. (2002) routines - real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points - real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points - real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points - - equivalence(dummyx_loc,B1_m1_m2_8points) - equivalence(dummyy_loc,B2_m1_m2_8points) - equivalence(dummyz_loc,B3_m1_m2_8points) - equivalence(tempx1,C1_m1_m2_8points) - equivalence(tempy1,C2_m1_m2_8points) - equivalence(tempz1,C3_m1_m2_8points) - equivalence(newtempx1,E1_m1_m2_8points) - equivalence(newtempy1,E2_m1_m2_8points) - equivalence(newtempz1,E3_m1_m2_8points) - - real(kind=CUSTOM_REAL), dimension(64,8) :: & - A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points - real(kind=CUSTOM_REAL), dimension(64,8) :: & - C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points - real(kind=CUSTOM_REAL), dimension(64,8) :: & - E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points - - equivalence(dummyx_loc,A1_mxm_m2_m1_8points) - equivalence(dummyy_loc,A2_mxm_m2_m1_8points) - equivalence(dummyz_loc,A3_mxm_m2_m1_8points) - equivalence(tempx3,C1_mxm_m2_m1_8points) - equivalence(tempy3,C2_mxm_m2_m1_8points) - equivalence(tempz3,C3_mxm_m2_m1_8points) - equivalence(newtempx3,E1_mxm_m2_m1_8points) - equivalence(newtempy3,E2_mxm_m2_m1_8points) - equivalence(newtempz3,E3_mxm_m2_m1_8points) - - ! local attenuation parameters - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & - epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc - real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 - real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc - real(kind=CUSTOM_REAL) Sn,Snp1 - real(kind=CUSTOM_REAL) templ - - real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl - real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl - - real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl - real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl - - real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy - - real(kind=CUSTOM_REAL) fac1,fac2,fac3 - - real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul - real(kind=CUSTOM_REAL) kappal - - ! local anisotropy parameters - real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& - c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 - - integer i_SLS,imodulo_N_SLS - integer ispec,iglob,ispec_p,num_elements - integer i,j,k - - imodulo_N_SLS = mod(N_SLS,3) - - ! choses inner/outer elements - if( iphase == 1 ) then - num_elements = nspec_outer_elastic - else - num_elements = nspec_inner_elastic - endif - - do ispec_p = 1,num_elements - - ! returns element id from stored element list - ispec = phase_ispec_inner_elastic(ispec_p,iphase) - - ! adjoint simulations: moho kernel - if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - ispec2D_moho_top = ispec2D_moho_top + 1 - else if (is_moho_bot(ispec)) then - ispec2D_moho_bot = ispec2D_moho_bot + 1 - endif - endif ! adjoint - - ! stores displacment values in local array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - dummyx_loc(i,j,k) = displ(1,iglob) - dummyy_loc(i,j,k) = displ(2,iglob) - dummyz_loc(i,j,k) = displ(3,iglob) - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) - do j=1,m2 - do i=1,m1 - C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + & - hprime_xx(i,2)*B1_m1_m2_8points(2,j) + & - hprime_xx(i,3)*B1_m1_m2_8points(3,j) + & - hprime_xx(i,4)*B1_m1_m2_8points(4,j) + & - hprime_xx(i,5)*B1_m1_m2_8points(5,j) + & - hprime_xx(i,6)*B1_m1_m2_8points(6,j) + & - hprime_xx(i,7)*B1_m1_m2_8points(7,j) + & - hprime_xx(i,8)*B1_m1_m2_8points(8,j) - C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + & - hprime_xx(i,2)*B2_m1_m2_8points(2,j) + & - hprime_xx(i,3)*B2_m1_m2_8points(3,j) + & - hprime_xx(i,4)*B2_m1_m2_8points(4,j) + & - hprime_xx(i,5)*B2_m1_m2_8points(5,j) + & - hprime_xx(i,6)*B2_m1_m2_8points(6,j) + & - hprime_xx(i,7)*B2_m1_m2_8points(7,j) + & - hprime_xx(i,8)*B2_m1_m2_8points(8,j) - C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + & - hprime_xx(i,2)*B3_m1_m2_8points(2,j) + & - hprime_xx(i,3)*B3_m1_m2_8points(3,j) + & - hprime_xx(i,4)*B3_m1_m2_8points(4,j) + & - hprime_xx(i,5)*B3_m1_m2_8points(5,j) + & - hprime_xx(i,6)*B3_m1_m2_8points(6,j) + & - hprime_xx(i,7)*B3_m1_m2_8points(7,j) + & - hprime_xx(i,8)*B3_m1_m2_8points(8,j) - enddo - enddo - - ! call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & - ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) - do j=1,m1 - do i=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & - dummyx_loc(i,2,k)*hprime_xxT(2,j) + & - dummyx_loc(i,3,k)*hprime_xxT(3,j) + & - dummyx_loc(i,4,k)*hprime_xxT(4,j) + & - dummyx_loc(i,5,k)*hprime_xxT(5,j) + & - dummyx_loc(i,6,k)*hprime_xxT(6,j) + & - dummyx_loc(i,7,k)*hprime_xxT(7,j) + & - dummyx_loc(i,8,k)*hprime_xxT(8,j) - tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & - dummyy_loc(i,2,k)*hprime_xxT(2,j) + & - dummyy_loc(i,3,k)*hprime_xxT(3,j) + & - dummyy_loc(i,4,k)*hprime_xxT(4,j) + & - dummyy_loc(i,5,k)*hprime_xxT(5,j) + & - dummyy_loc(i,6,k)*hprime_xxT(6,j) + & - dummyy_loc(i,7,k)*hprime_xxT(7,j) + & - dummyy_loc(i,8,k)*hprime_xxT(8,j) - tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & - dummyz_loc(i,2,k)*hprime_xxT(2,j) + & - dummyz_loc(i,3,k)*hprime_xxT(3,j) + & - dummyz_loc(i,4,k)*hprime_xxT(4,j) + & - dummyz_loc(i,5,k)*hprime_xxT(5,j) + & - dummyz_loc(i,6,k)*hprime_xxT(6,j) + & - dummyz_loc(i,7,k)*hprime_xxT(7,j) + & - dummyz_loc(i,8,k)*hprime_xxT(8,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) - do j=1,m1 - do i=1,m2 - C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & - A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & - A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & - A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & - A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & - A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & - A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & - A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) - C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & - A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & - A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & - A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & - A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & - A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & - A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & - A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) - C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & - A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & - A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & - A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & - A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & - A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & - A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & - A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! get derivatives of ux, uy and uz with respect to x, y and z - xixl = xix(i,j,k,ispec) - xiyl = xiy(i,j,k,ispec) - xizl = xiz(i,j,k,ispec) - etaxl = etax(i,j,k,ispec) - etayl = etay(i,j,k,ispec) - etazl = etaz(i,j,k,ispec) - gammaxl = gammax(i,j,k,ispec) - gammayl = gammay(i,j,k,ispec) - gammazl = gammaz(i,j,k,ispec) - jacobianl = jacobian(i,j,k,ispec) - - duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) - duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) - duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) - - duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) - duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) - duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) - - duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) - duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) - duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) - - ! save strain on the Moho boundary - if (SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl - dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl - dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl - dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl - dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl - dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl - dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl - dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl - dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl - else if (is_moho_bot(ispec)) then - dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl - dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl - dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl - dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl - dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl - dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl - dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl - dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl - dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl - endif - endif - - ! precompute some sums to save CPU time - duxdxl_plus_duydyl = duxdxl + duydyl - duxdxl_plus_duzdzl = duxdxl + duzdzl - duydyl_plus_duzdzl = duydyl + duzdzl - duxdyl_plus_duydxl = duxdyl + duydxl - duzdxl_plus_duxdzl = duzdxl + duxdzl - duzdyl_plus_duydzl = duzdyl + duydzl - - ! computes deviatoric strain attenuation and/or for kernel calculations - if (COMPUTE_AND_STORE_STRAIN) then - templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) - if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ - epsilondev_xx_loc(i,j,k) = duxdxl - templ - epsilondev_yy_loc(i,j,k) = duydyl - templ - epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl - epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl - epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl - endif - - kappal = kappastore(i,j,k,ispec) - mul = mustore(i,j,k,ispec) - - ! attenuation - if(ATTENUATION) then - ! use unrelaxed parameters if attenuation - mul = mul * one_minus_sum_beta(i,j,k,ispec) - endif - - ! full anisotropic case, stress calculations - if(ANISOTROPY) then - c11 = c11store(i,j,k,ispec) - c12 = c12store(i,j,k,ispec) - c13 = c13store(i,j,k,ispec) - c14 = c14store(i,j,k,ispec) - c15 = c15store(i,j,k,ispec) - c16 = c16store(i,j,k,ispec) - c22 = c22store(i,j,k,ispec) - c23 = c23store(i,j,k,ispec) - c24 = c24store(i,j,k,ispec) - c25 = c25store(i,j,k,ispec) - c26 = c26store(i,j,k,ispec) - c33 = c33store(i,j,k,ispec) - c34 = c34store(i,j,k,ispec) - c35 = c35store(i,j,k,ispec) - c36 = c36store(i,j,k,ispec) - c44 = c44store(i,j,k,ispec) - c45 = c45store(i,j,k,ispec) - c46 = c46store(i,j,k,ispec) - c55 = c55store(i,j,k,ispec) - c56 = c56store(i,j,k,ispec) - c66 = c66store(i,j,k,ispec) - - sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & - c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl - sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & - c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl - sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & - c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl - sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & - c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl - sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & - c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl - sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & - c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl - - else - - ! isotropic case - lambdalplus2mul = kappal + FOUR_THIRDS * mul - lambdal = lambdalplus2mul - 2.*mul - - ! compute stress sigma - sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl - sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl - sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl - - sigma_xy = mul*duxdyl_plus_duydxl - sigma_xz = mul*duzdxl_plus_duxdzl - sigma_yz = mul*duzdyl_plus_duydzl - - endif ! ANISOTROPY - - ! subtract memory variables if attenuation - if(ATTENUATION) then -! way 1 -! do i_sls = 1,N_SLS -! R_xx_val = R_xx(i,j,k,ispec,i_sls) -! R_yy_val = R_yy(i,j,k,ispec,i_sls) -! sigma_xx = sigma_xx - R_xx_val -! sigma_yy = sigma_yy - R_yy_val -! sigma_zz = sigma_zz + R_xx_val + R_yy_val -! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) -! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) -! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) -! enddo - -! way 2 -! note: this should help compilers to pipeline the code and make better use of the cache; -! depending on compilers, it can further decrease the computation time by ~ 30%. -! by default, N_SLS = 3, therefore we take steps of 3 - if(imodulo_N_SLS >= 1) then - do i_sls = 1,imodulo_N_SLS - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - enddo - endif - - if(N_SLS >= imodulo_N_SLS+1) then - do i_sls = imodulo_N_SLS+1,N_SLS,3 - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - - R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) - R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) - sigma_xx = sigma_xx - R_xx_val2 - sigma_yy = sigma_yy - R_yy_val2 - sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) - - R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) - R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) - sigma_xx = sigma_xx - R_xx_val3 - sigma_yy = sigma_yy - R_yy_val3 - sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) - enddo - endif - - - endif - - ! define symmetric components of sigma - sigma_yx = sigma_xy - sigma_zx = sigma_xz - sigma_zy = sigma_yz - - ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) - tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x - tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y - tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z - - tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x - tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y - tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z - - tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x - tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y - tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z - - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) - do j=1,m2 - do i=1,m1 - E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + & - hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + & - hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + & - hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + & - hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + & - hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + & - hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + & - hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j) - E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + & - hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + & - hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + & - hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + & - hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + & - hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + & - hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + & - hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j) - E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + & - hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + & - hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + & - hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + & - hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + & - hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + & - hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + & - hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j) - enddo - enddo - - ! call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & - ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) - do i=1,m1 - do j=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & - tempx2(i,2,k)*hprimewgll_xx(2,j) + & - tempx2(i,3,k)*hprimewgll_xx(3,j) + & - tempx2(i,4,k)*hprimewgll_xx(4,j) + & - tempx2(i,5,k)*hprimewgll_xx(5,j) + & - tempx2(i,6,k)*hprimewgll_xx(6,j) + & - tempx2(i,7,k)*hprimewgll_xx(7,j) + & - tempx2(i,8,k)*hprimewgll_xx(8,j) - newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & - tempy2(i,2,k)*hprimewgll_xx(2,j) + & - tempy2(i,3,k)*hprimewgll_xx(3,j) + & - tempy2(i,4,k)*hprimewgll_xx(4,j) + & - tempy2(i,5,k)*hprimewgll_xx(5,j) + & - tempy2(i,6,k)*hprimewgll_xx(6,j) + & - tempy2(i,7,k)*hprimewgll_xx(7,j) + & - tempy2(i,8,k)*hprimewgll_xx(8,j) - newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & - tempz2(i,2,k)*hprimewgll_xx(2,j) + & - tempz2(i,3,k)*hprimewgll_xx(3,j) + & - tempz2(i,4,k)*hprimewgll_xx(4,j) + & - tempz2(i,5,k)*hprimewgll_xx(5,j) + & - tempz2(i,6,k)*hprimewgll_xx(6,j) + & - tempz2(i,7,k)*hprimewgll_xx(7,j) + & - tempz2(i,8,k)*hprimewgll_xx(8,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) - do j=1,m1 - do i=1,m2 - E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & - C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & - C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & - C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & - C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & - C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & - C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & - C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) - E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & - C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & - C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & - C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & - C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & - C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & - C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & - C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) - E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & - C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & - C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & - C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & - C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & - C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & - C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & - C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - - fac1 = wgllwgll_yz(j,k) - fac2 = wgllwgll_xz(i,k) - fac3 = wgllwgll_xy(i,j) - - ! sum contributions from each element to the global mesh using indirect addressing - iglob = ibool(i,j,k,ispec) - accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & - fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) - accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & - fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) - accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & - fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) - - ! update memory variables based upon the Runge-Kutta scheme - if(ATTENUATION) then - - ! use Runge-Kutta scheme to march in time - do i_sls = 1,N_SLS - - factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) - - alphaval_loc = alphaval(i_sls) - betaval_loc = betaval(i_sls) - gammaval_loc = gammaval(i_sls) - - ! term in xx - Sn = factor_loc * epsilondev_xx(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) - R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yy - Sn = factor_loc * epsilondev_yy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) - R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in zz not computed since zero trace - ! term in xy - Sn = factor_loc * epsilondev_xy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) - R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in xz - Sn = factor_loc * epsilondev_xz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) - R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yz - Sn = factor_loc * epsilondev_yz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) - R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - - enddo ! end of loop on memory variables - - endif ! end attenuation - - enddo - enddo - enddo - - ! save deviatoric strain for Runge-Kutta scheme - if ( COMPUTE_AND_STORE_STRAIN ) then - epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) - epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) - epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) - epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) - epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) - endif - - enddo ! spectral element loop - -end subroutine compute_forces_elastic_Dev_8p - -! -!===================================================================== -! - -subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, & - displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT, & - hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic) - - -! computes elastic tensor term - - use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & - N_SLS,SAVE_MOHO_MESH, & - ONE_THIRD,FOUR_THIRDS,m1,m2 - implicit none - - integer :: NSPEC_AB,NGLOB_AB - -! displacement and acceleration - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel - -! arrays with mesh parameters per slice - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - kappastore,mustore,jacobian - -! array with derivatives of Lagrange polynomials and precalculated products - real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT - real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz - real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz - -! memory variables and standard linear solids for attenuation - logical :: ATTENUATION - logical :: COMPUTE_AND_STORE_STRAIN - integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT - integer :: NSPEC_ATTENUATION_AB - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta - real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common - real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & - R_xx,R_yy,R_xy,R_xz,R_yz - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & - epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz - real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 - -! anisotropy - logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store - - integer :: iphase - integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - -! adjoint simulations - integer :: SIMULATION_TYPE - integer :: NSPEC_BOUN,NSPEC2D_MOHO - - ! moho kernel - real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & - dsdx_top,dsdx_bot - logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot - integer :: ispec2D_moho_top, ispec2D_moho_bot - -! local parameters - real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, & - newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 - real(kind=CUSTOM_REAL), dimension(9,9,9) :: & - tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 - - ! manually inline the calls to the Deville et al. (2002) routines - real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points - real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points - real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points - - equivalence(dummyx_loc,B1_m1_m2_9points) - equivalence(dummyy_loc,B2_m1_m2_9points) - equivalence(dummyz_loc,B3_m1_m2_9points) - equivalence(tempx1,C1_m1_m2_9points) - equivalence(tempy1,C2_m1_m2_9points) - equivalence(tempz1,C3_m1_m2_9points) - equivalence(newtempx1,E1_m1_m2_9points) - equivalence(newtempy1,E2_m1_m2_9points) - equivalence(newtempz1,E3_m1_m2_9points) - - real(kind=CUSTOM_REAL), dimension(81,9) :: & - A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points - real(kind=CUSTOM_REAL), dimension(81,9) :: & - C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points - real(kind=CUSTOM_REAL), dimension(81,9) :: & - E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points - - equivalence(dummyx_loc,A1_mxm_m2_m1_9points) - equivalence(dummyy_loc,A2_mxm_m2_m1_9points) - equivalence(dummyz_loc,A3_mxm_m2_m1_9points) - equivalence(tempx3,C1_mxm_m2_m1_9points) - equivalence(tempy3,C2_mxm_m2_m1_9points) - equivalence(tempz3,C3_mxm_m2_m1_9points) - equivalence(newtempx3,E1_mxm_m2_m1_9points) - equivalence(newtempy3,E2_mxm_m2_m1_9points) - equivalence(newtempz3,E3_mxm_m2_m1_9points) - - ! local attenuation parameters - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & - epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc - real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 - real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc - real(kind=CUSTOM_REAL) Sn,Snp1 - real(kind=CUSTOM_REAL) templ - - real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl - real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl - - real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl - real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl - - real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy - - real(kind=CUSTOM_REAL) fac1,fac2,fac3 - - real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul - real(kind=CUSTOM_REAL) kappal - - ! local anisotropy parameters - real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& - c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 - - integer i_SLS,imodulo_N_SLS - integer ispec,iglob,ispec_p,num_elements - integer i,j,k - - imodulo_N_SLS = mod(N_SLS,3) - - ! choses inner/outer elements - if( iphase == 1 ) then - num_elements = nspec_outer_elastic - else - num_elements = nspec_inner_elastic - endif - - do ispec_p = 1,num_elements - - ! returns element id from stored element list - ispec = phase_ispec_inner_elastic(ispec_p,iphase) - - ! adjoint simulations: moho kernel - if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - ispec2D_moho_top = ispec2D_moho_top + 1 - else if (is_moho_bot(ispec)) then - ispec2D_moho_bot = ispec2D_moho_bot + 1 - endif - endif ! adjoint - - ! stores displacment values in local array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - dummyx_loc(i,j,k) = displ(1,iglob) - dummyy_loc(i,j,k) = displ(2,iglob) - dummyz_loc(i,j,k) = displ(3,iglob) - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) - do j=1,m2 - do i=1,m1 - C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + & - hprime_xx(i,2)*B1_m1_m2_9points(2,j) + & - hprime_xx(i,3)*B1_m1_m2_9points(3,j) + & - hprime_xx(i,4)*B1_m1_m2_9points(4,j) + & - hprime_xx(i,5)*B1_m1_m2_9points(5,j) + & - hprime_xx(i,6)*B1_m1_m2_9points(6,j) + & - hprime_xx(i,7)*B1_m1_m2_9points(7,j) + & - hprime_xx(i,8)*B1_m1_m2_9points(8,j) + & - hprime_xx(i,9)*B1_m1_m2_9points(9,j) - C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + & - hprime_xx(i,2)*B2_m1_m2_9points(2,j) + & - hprime_xx(i,3)*B2_m1_m2_9points(3,j) + & - hprime_xx(i,4)*B2_m1_m2_9points(4,j) + & - hprime_xx(i,5)*B2_m1_m2_9points(5,j) + & - hprime_xx(i,6)*B2_m1_m2_9points(6,j) + & - hprime_xx(i,7)*B2_m1_m2_9points(7,j) + & - hprime_xx(i,8)*B2_m1_m2_9points(8,j) + & - hprime_xx(i,9)*B2_m1_m2_9points(9,j) - C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + & - hprime_xx(i,2)*B3_m1_m2_9points(2,j) + & - hprime_xx(i,3)*B3_m1_m2_9points(3,j) + & - hprime_xx(i,4)*B3_m1_m2_9points(4,j) + & - hprime_xx(i,5)*B3_m1_m2_9points(5,j) + & - hprime_xx(i,6)*B3_m1_m2_9points(6,j) + & - hprime_xx(i,7)*B3_m1_m2_9points(7,j) + & - hprime_xx(i,8)*B3_m1_m2_9points(8,j) + & - hprime_xx(i,9)*B3_m1_m2_9points(9,j) - enddo - enddo - - ! call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & - ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) - do j=1,m1 - do i=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & - dummyx_loc(i,2,k)*hprime_xxT(2,j) + & - dummyx_loc(i,3,k)*hprime_xxT(3,j) + & - dummyx_loc(i,4,k)*hprime_xxT(4,j) + & - dummyx_loc(i,5,k)*hprime_xxT(5,j) + & - dummyx_loc(i,6,k)*hprime_xxT(6,j) + & - dummyx_loc(i,7,k)*hprime_xxT(7,j) + & - dummyx_loc(i,8,k)*hprime_xxT(8,j) + & - dummyx_loc(i,9,k)*hprime_xxT(9,j) - tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & - dummyy_loc(i,2,k)*hprime_xxT(2,j) + & - dummyy_loc(i,3,k)*hprime_xxT(3,j) + & - dummyy_loc(i,4,k)*hprime_xxT(4,j) + & - dummyy_loc(i,5,k)*hprime_xxT(5,j) + & - dummyy_loc(i,6,k)*hprime_xxT(6,j) + & - dummyy_loc(i,7,k)*hprime_xxT(7,j) + & - dummyy_loc(i,8,k)*hprime_xxT(8,j) + & - dummyy_loc(i,9,k)*hprime_xxT(9,j) - tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & - dummyz_loc(i,2,k)*hprime_xxT(2,j) + & - dummyz_loc(i,3,k)*hprime_xxT(3,j) + & - dummyz_loc(i,4,k)*hprime_xxT(4,j) + & - dummyz_loc(i,5,k)*hprime_xxT(5,j) + & - dummyz_loc(i,6,k)*hprime_xxT(6,j) + & - dummyz_loc(i,7,k)*hprime_xxT(7,j) + & - dummyz_loc(i,8,k)*hprime_xxT(8,j) + & - dummyz_loc(i,9,k)*hprime_xxT(9,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) - do j=1,m1 - do i=1,m2 - C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & - A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & - A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & - A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & - A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & - A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & - A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & - A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & - A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) - C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & - A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & - A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & - A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & - A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & - A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & - A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & - A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & - A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) - C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & - A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & - A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & - A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & - A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & - A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & - A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & - A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & - A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! get derivatives of ux, uy and uz with respect to x, y and z - xixl = xix(i,j,k,ispec) - xiyl = xiy(i,j,k,ispec) - xizl = xiz(i,j,k,ispec) - etaxl = etax(i,j,k,ispec) - etayl = etay(i,j,k,ispec) - etazl = etaz(i,j,k,ispec) - gammaxl = gammax(i,j,k,ispec) - gammayl = gammay(i,j,k,ispec) - gammazl = gammaz(i,j,k,ispec) - jacobianl = jacobian(i,j,k,ispec) - - duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) - duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) - duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) - - duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) - duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) - duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) - - duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) - duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) - duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) - - ! save strain on the Moho boundary - if (SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl - dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl - dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl - dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl - dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl - dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl - dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl - dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl - dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl - else if (is_moho_bot(ispec)) then - dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl - dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl - dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl - dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl - dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl - dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl - dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl - dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl - dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl - endif - endif - - ! precompute some sums to save CPU time - duxdxl_plus_duydyl = duxdxl + duydyl - duxdxl_plus_duzdzl = duxdxl + duzdzl - duydyl_plus_duzdzl = duydyl + duzdzl - duxdyl_plus_duydxl = duxdyl + duydxl - duzdxl_plus_duxdzl = duzdxl + duxdzl - duzdyl_plus_duydzl = duzdyl + duydzl - - ! computes deviatoric strain attenuation and/or for kernel calculations - if (COMPUTE_AND_STORE_STRAIN) then - templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) - if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ - epsilondev_xx_loc(i,j,k) = duxdxl - templ - epsilondev_yy_loc(i,j,k) = duydyl - templ - epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl - epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl - epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl - endif - - kappal = kappastore(i,j,k,ispec) - mul = mustore(i,j,k,ispec) - - ! attenuation - if(ATTENUATION) then - ! use unrelaxed parameters if attenuation - mul = mul * one_minus_sum_beta(i,j,k,ispec) - endif - - ! full anisotropic case, stress calculations - if(ANISOTROPY) then - c11 = c11store(i,j,k,ispec) - c12 = c12store(i,j,k,ispec) - c13 = c13store(i,j,k,ispec) - c14 = c14store(i,j,k,ispec) - c15 = c15store(i,j,k,ispec) - c16 = c16store(i,j,k,ispec) - c22 = c22store(i,j,k,ispec) - c23 = c23store(i,j,k,ispec) - c24 = c24store(i,j,k,ispec) - c25 = c25store(i,j,k,ispec) - c26 = c26store(i,j,k,ispec) - c33 = c33store(i,j,k,ispec) - c34 = c34store(i,j,k,ispec) - c35 = c35store(i,j,k,ispec) - c36 = c36store(i,j,k,ispec) - c44 = c44store(i,j,k,ispec) - c45 = c45store(i,j,k,ispec) - c46 = c46store(i,j,k,ispec) - c55 = c55store(i,j,k,ispec) - c56 = c56store(i,j,k,ispec) - c66 = c66store(i,j,k,ispec) - - sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & - c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl - sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & - c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl - sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & - c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl - sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & - c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl - sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & - c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl - sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & - c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl - - else - - ! isotropic case - lambdalplus2mul = kappal + FOUR_THIRDS * mul - lambdal = lambdalplus2mul - 2.*mul - - ! compute stress sigma - sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl - sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl - sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl - - sigma_xy = mul*duxdyl_plus_duydxl - sigma_xz = mul*duzdxl_plus_duxdzl - sigma_yz = mul*duzdyl_plus_duydzl - - endif ! ANISOTROPY - - ! subtract memory variables if attenuation - if(ATTENUATION) then -! way 1 -! do i_sls = 1,N_SLS -! R_xx_val = R_xx(i,j,k,ispec,i_sls) -! R_yy_val = R_yy(i,j,k,ispec,i_sls) -! sigma_xx = sigma_xx - R_xx_val -! sigma_yy = sigma_yy - R_yy_val -! sigma_zz = sigma_zz + R_xx_val + R_yy_val -! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) -! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) -! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) -! enddo - -! way 2 -! note: this should help compilers to pipeline the code and make better use of the cache; -! depending on compilers, it can further decrease the computation time by ~ 30%. -! by default, N_SLS = 3, therefore we take steps of 3 - if(imodulo_N_SLS >= 1) then - do i_sls = 1,imodulo_N_SLS - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - enddo - endif - - if(N_SLS >= imodulo_N_SLS+1) then - do i_sls = imodulo_N_SLS+1,N_SLS,3 - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - - R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) - R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) - sigma_xx = sigma_xx - R_xx_val2 - sigma_yy = sigma_yy - R_yy_val2 - sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) - - R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) - R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) - sigma_xx = sigma_xx - R_xx_val3 - sigma_yy = sigma_yy - R_yy_val3 - sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) - enddo - endif - - - endif - - ! define symmetric components of sigma - sigma_yx = sigma_xy - sigma_zx = sigma_xz - sigma_zy = sigma_yz - - ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) - tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x - tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y - tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z - - tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x - tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y - tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z - - tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x - tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y - tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z - - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) - do j=1,m2 - do i=1,m1 - E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + & - hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + & - hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + & - hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + & - hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + & - hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + & - hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + & - hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + & - hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j) - E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + & - hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + & - hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + & - hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + & - hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + & - hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + & - hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + & - hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + & - hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j) - E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + & - hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + & - hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + & - hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + & - hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + & - hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + & - hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + & - hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + & - hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j) - enddo - enddo - - ! call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & - ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) - do i=1,m1 - do j=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & - tempx2(i,2,k)*hprimewgll_xx(2,j) + & - tempx2(i,3,k)*hprimewgll_xx(3,j) + & - tempx2(i,4,k)*hprimewgll_xx(4,j) + & - tempx2(i,5,k)*hprimewgll_xx(5,j) + & - tempx2(i,6,k)*hprimewgll_xx(6,j) + & - tempx2(i,7,k)*hprimewgll_xx(7,j) + & - tempx2(i,8,k)*hprimewgll_xx(8,j) + & - tempx2(i,9,k)*hprimewgll_xx(9,j) - newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & - tempy2(i,2,k)*hprimewgll_xx(2,j) + & - tempy2(i,3,k)*hprimewgll_xx(3,j) + & - tempy2(i,4,k)*hprimewgll_xx(4,j) + & - tempy2(i,5,k)*hprimewgll_xx(5,j) + & - tempy2(i,6,k)*hprimewgll_xx(6,j) + & - tempy2(i,7,k)*hprimewgll_xx(7,j) + & - tempy2(i,8,k)*hprimewgll_xx(8,j) + & - tempy2(i,9,k)*hprimewgll_xx(9,j) - newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & - tempz2(i,2,k)*hprimewgll_xx(2,j) + & - tempz2(i,3,k)*hprimewgll_xx(3,j) + & - tempz2(i,4,k)*hprimewgll_xx(4,j) + & - tempz2(i,5,k)*hprimewgll_xx(5,j) + & - tempz2(i,6,k)*hprimewgll_xx(6,j) + & - tempz2(i,7,k)*hprimewgll_xx(7,j) + & - tempz2(i,8,k)*hprimewgll_xx(8,j) + & - tempz2(i,9,k)*hprimewgll_xx(9,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) - do j=1,m1 - do i=1,m2 - E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & - C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & - C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & - C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & - C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & - C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & - C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & - C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & - C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) - E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & - C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & - C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & - C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & - C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & - C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & - C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & - C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & - C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) - E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & - C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & - C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & - C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & - C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & - C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & - C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & - C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & - C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - - fac1 = wgllwgll_yz(j,k) - fac2 = wgllwgll_xz(i,k) - fac3 = wgllwgll_xy(i,j) - - ! sum contributions from each element to the global mesh using indirect addressing - iglob = ibool(i,j,k,ispec) - accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & - fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) - accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & - fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) - accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & - fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) - - ! update memory variables based upon the Runge-Kutta scheme - if(ATTENUATION) then - - ! use Runge-Kutta scheme to march in time - do i_sls = 1,N_SLS - - factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) - - alphaval_loc = alphaval(i_sls) - betaval_loc = betaval(i_sls) - gammaval_loc = gammaval(i_sls) - - ! term in xx - Sn = factor_loc * epsilondev_xx(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) - R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yy - Sn = factor_loc * epsilondev_yy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) - R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in zz not computed since zero trace - ! term in xy - Sn = factor_loc * epsilondev_xy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) - R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in xz - Sn = factor_loc * epsilondev_xz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) - R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yz - Sn = factor_loc * epsilondev_yz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) - R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - - enddo ! end of loop on memory variables - - endif ! end attenuation - - enddo - enddo - enddo - - ! save deviatoric strain for Runge-Kutta scheme - if ( COMPUTE_AND_STORE_STRAIN ) then - epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) - epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) - epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) - epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) - epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) - endif - - enddo ! spectral element loop - -end subroutine compute_forces_elastic_Dev_9p - -! -!===================================================================== -! - -subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, & - displ,accel, & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - hprime_xx,hprime_xxT, & - hprimewgll_xx,hprimewgll_xxT, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - kappastore,mustore,jacobian,ibool, & - ATTENUATION, & - one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& - NSPEC_ATTENUATION_AB, & - R_xx,R_yy,R_xy,R_xz,R_yz, & - epsilondev_xx,epsilondev_yy,epsilondev_xy, & - epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & - ANISOTROPY,NSPEC_ANISO, & - c11store,c12store,c13store,c14store,c15store,c16store,& - c22store,c23store,c24store,c25store,c26store,c33store,& - c34store,c35store,c36store,c44store,c45store,c46store,& - c55store,c56store,c66store, & - SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & - NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & - is_moho_top,is_moho_bot, & - dsdx_top,dsdx_bot, & - ispec2D_moho_top,ispec2D_moho_bot, & - num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& - phase_ispec_inner_elastic) - - -! computes elastic tensor term - - use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & - N_SLS,SAVE_MOHO_MESH, & - ONE_THIRD,FOUR_THIRDS,m1,m2 - implicit none - - integer :: NSPEC_AB,NGLOB_AB - -! displacement and acceleration - real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel - -! arrays with mesh parameters per slice - integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & - kappastore,mustore,jacobian - -! array with derivatives of Lagrange polynomials and precalculated products - real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT - real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz - real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz - -! memory variables and standard linear solids for attenuation - logical :: ATTENUATION - logical :: COMPUTE_AND_STORE_STRAIN - integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT - integer :: NSPEC_ATTENUATION_AB - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta - real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common - real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & - R_xx,R_yy,R_xy,R_xz,R_yz - - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & - epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz - real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 - -! anisotropy - logical :: ANISOTROPY - integer :: NSPEC_ANISO - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & - c11store,c12store,c13store,c14store,c15store,c16store, & - c22store,c23store,c24store,c25store,c26store,c33store, & - c34store,c35store,c36store,c44store,c45store,c46store, & - c55store,c56store,c66store - - integer :: iphase - integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic - -! adjoint simulations - integer :: SIMULATION_TYPE - integer :: NSPEC_BOUN,NSPEC2D_MOHO - - ! moho kernel - real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & - dsdx_top,dsdx_bot - logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot - integer :: ispec2D_moho_top, ispec2D_moho_bot - -! local parameters - real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, & - newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 - real(kind=CUSTOM_REAL), dimension(10,10,10) :: & - tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 - - ! manually inline the calls to the Deville et al. (2002) routines - real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points - real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points - real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points - - equivalence(dummyx_loc,B1_m1_m2_10points) - equivalence(dummyy_loc,B2_m1_m2_10points) - equivalence(dummyz_loc,B3_m1_m2_10points) - equivalence(tempx1,C1_m1_m2_10points) - equivalence(tempy1,C2_m1_m2_10points) - equivalence(tempz1,C3_m1_m2_10points) - equivalence(newtempx1,E1_m1_m2_10points) - equivalence(newtempy1,E2_m1_m2_10points) - equivalence(newtempz1,E3_m1_m2_10points) - - real(kind=CUSTOM_REAL), dimension(100,10) :: & - A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points - real(kind=CUSTOM_REAL), dimension(100,10) :: & - C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points - real(kind=CUSTOM_REAL), dimension(100,10) :: & - E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points - - equivalence(dummyx_loc,A1_mxm_m2_m1_10points) - equivalence(dummyy_loc,A2_mxm_m2_m1_10points) - equivalence(dummyz_loc,A3_mxm_m2_m1_10points) - equivalence(tempx3,C1_mxm_m2_m1_10points) - equivalence(tempy3,C2_mxm_m2_m1_10points) - equivalence(tempz3,C3_mxm_m2_m1_10points) - equivalence(newtempx3,E1_mxm_m2_m1_10points) - equivalence(newtempy3,E2_mxm_m2_m1_10points) - equivalence(newtempz3,E3_mxm_m2_m1_10points) - - ! local attenuation parameters - real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & - epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc - real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 - real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc - real(kind=CUSTOM_REAL) Sn,Snp1 - real(kind=CUSTOM_REAL) templ - - real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl - real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl - - real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl - real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl - - real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy - - real(kind=CUSTOM_REAL) fac1,fac2,fac3 - - real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul - real(kind=CUSTOM_REAL) kappal - - ! local anisotropy parameters - real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& - c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 - - integer i_SLS,imodulo_N_SLS - integer ispec,iglob,ispec_p,num_elements - integer i,j,k - - imodulo_N_SLS = mod(N_SLS,3) - - ! choses inner/outer elements - if( iphase == 1 ) then - num_elements = nspec_outer_elastic - else - num_elements = nspec_inner_elastic - endif - - do ispec_p = 1,num_elements - - ! returns element id from stored element list - ispec = phase_ispec_inner_elastic(ispec_p,iphase) - - ! adjoint simulations: moho kernel - if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - ispec2D_moho_top = ispec2D_moho_top + 1 - else if (is_moho_bot(ispec)) then - ispec2D_moho_bot = ispec2D_moho_bot + 1 - endif - endif ! adjoint - - ! stores displacment values in local array - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - iglob = ibool(i,j,k,ispec) - dummyx_loc(i,j,k) = displ(1,iglob) - dummyy_loc(i,j,k) = displ(2,iglob) - dummyz_loc(i,j,k) = displ(3,iglob) - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) - do j=1,m2 - do i=1,m1 - C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + & - hprime_xx(i,2)*B1_m1_m2_10points(2,j) + & - hprime_xx(i,3)*B1_m1_m2_10points(3,j) + & - hprime_xx(i,4)*B1_m1_m2_10points(4,j) + & - hprime_xx(i,5)*B1_m1_m2_10points(5,j) + & - hprime_xx(i,6)*B1_m1_m2_10points(6,j) + & - hprime_xx(i,7)*B1_m1_m2_10points(7,j) + & - hprime_xx(i,8)*B1_m1_m2_10points(8,j) + & - hprime_xx(i,9)*B1_m1_m2_10points(9,j) + & - hprime_xx(i,10)*B1_m1_m2_10points(10,j) - C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + & - hprime_xx(i,2)*B2_m1_m2_10points(2,j) + & - hprime_xx(i,3)*B2_m1_m2_10points(3,j) + & - hprime_xx(i,4)*B2_m1_m2_10points(4,j) + & - hprime_xx(i,5)*B2_m1_m2_10points(5,j) + & - hprime_xx(i,6)*B2_m1_m2_10points(6,j) + & - hprime_xx(i,7)*B2_m1_m2_10points(7,j) + & - hprime_xx(i,8)*B2_m1_m2_10points(8,j) + & - hprime_xx(i,9)*B2_m1_m2_10points(9,j) + & - hprime_xx(i,10)*B2_m1_m2_10points(10,j) - C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + & - hprime_xx(i,2)*B3_m1_m2_10points(2,j) + & - hprime_xx(i,3)*B3_m1_m2_10points(3,j) + & - hprime_xx(i,4)*B3_m1_m2_10points(4,j) + & - hprime_xx(i,5)*B3_m1_m2_10points(5,j) + & - hprime_xx(i,6)*B3_m1_m2_10points(6,j) + & - hprime_xx(i,7)*B3_m1_m2_10points(7,j) + & - hprime_xx(i,8)*B3_m1_m2_10points(8,j) + & - hprime_xx(i,9)*B3_m1_m2_10points(9,j) + & - hprime_xx(i,10)*B3_m1_m2_10points(10,j) - enddo - enddo - - ! call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & - ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) - do j=1,m1 - do i=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & - dummyx_loc(i,2,k)*hprime_xxT(2,j) + & - dummyx_loc(i,3,k)*hprime_xxT(3,j) + & - dummyx_loc(i,4,k)*hprime_xxT(4,j) + & - dummyx_loc(i,5,k)*hprime_xxT(5,j) + & - dummyx_loc(i,6,k)*hprime_xxT(6,j) + & - dummyx_loc(i,7,k)*hprime_xxT(7,j) + & - dummyx_loc(i,8,k)*hprime_xxT(8,j) + & - dummyx_loc(i,9,k)*hprime_xxT(9,j) + & - dummyx_loc(i,10,k)*hprime_xxT(10,j) - tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & - dummyy_loc(i,2,k)*hprime_xxT(2,j) + & - dummyy_loc(i,3,k)*hprime_xxT(3,j) + & - dummyy_loc(i,4,k)*hprime_xxT(4,j) + & - dummyy_loc(i,5,k)*hprime_xxT(5,j) + & - dummyy_loc(i,6,k)*hprime_xxT(6,j) + & - dummyy_loc(i,7,k)*hprime_xxT(7,j) + & - dummyy_loc(i,8,k)*hprime_xxT(8,j) + & - dummyy_loc(i,9,k)*hprime_xxT(9,j) + & - dummyy_loc(i,10,k)*hprime_xxT(10,j) - tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & - dummyz_loc(i,2,k)*hprime_xxT(2,j) + & - dummyz_loc(i,3,k)*hprime_xxT(3,j) + & - dummyz_loc(i,4,k)*hprime_xxT(4,j) + & - dummyz_loc(i,5,k)*hprime_xxT(5,j) + & - dummyz_loc(i,6,k)*hprime_xxT(6,j) + & - dummyz_loc(i,7,k)*hprime_xxT(7,j) + & - dummyz_loc(i,8,k)*hprime_xxT(8,j) + & - dummyz_loc(i,9,k)*hprime_xxT(9,j) + & - dummyz_loc(i,10,k)*hprime_xxT(10,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) - do j=1,m1 - do i=1,m2 - C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & - A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & - A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & - A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & - A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & - A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & - A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & - A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & - A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & - A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) - C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & - A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & - A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & - A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & - A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & - A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & - A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & - A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & - A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & - A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) - C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & - A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & - A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & - A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & - A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & - A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & - A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & - A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & - A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & - A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - ! get derivatives of ux, uy and uz with respect to x, y and z - xixl = xix(i,j,k,ispec) - xiyl = xiy(i,j,k,ispec) - xizl = xiz(i,j,k,ispec) - etaxl = etax(i,j,k,ispec) - etayl = etay(i,j,k,ispec) - etazl = etaz(i,j,k,ispec) - gammaxl = gammax(i,j,k,ispec) - gammayl = gammay(i,j,k,ispec) - gammazl = gammaz(i,j,k,ispec) - jacobianl = jacobian(i,j,k,ispec) - - duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) - duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) - duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) - - duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) - duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) - duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) - - duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) - duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) - duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) - - ! save strain on the Moho boundary - if (SAVE_MOHO_MESH ) then - if (is_moho_top(ispec)) then - dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl - dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl - dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl - dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl - dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl - dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl - dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl - dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl - dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl - else if (is_moho_bot(ispec)) then - dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl - dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl - dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl - dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl - dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl - dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl - dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl - dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl - dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl - endif - endif - - ! precompute some sums to save CPU time - duxdxl_plus_duydyl = duxdxl + duydyl - duxdxl_plus_duzdzl = duxdxl + duzdzl - duydyl_plus_duzdzl = duydyl + duzdzl - duxdyl_plus_duydxl = duxdyl + duydxl - duzdxl_plus_duxdzl = duzdxl + duxdzl - duzdyl_plus_duydzl = duzdyl + duydzl - - ! computes deviatoric strain attenuation and/or for kernel calculations - if (COMPUTE_AND_STORE_STRAIN) then - templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) - if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ - epsilondev_xx_loc(i,j,k) = duxdxl - templ - epsilondev_yy_loc(i,j,k) = duydyl - templ - epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl - epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl - epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl - endif - - kappal = kappastore(i,j,k,ispec) - mul = mustore(i,j,k,ispec) - - ! attenuation - if(ATTENUATION) then - ! use unrelaxed parameters if attenuation - mul = mul * one_minus_sum_beta(i,j,k,ispec) - endif - - ! full anisotropic case, stress calculations - if(ANISOTROPY) then - c11 = c11store(i,j,k,ispec) - c12 = c12store(i,j,k,ispec) - c13 = c13store(i,j,k,ispec) - c14 = c14store(i,j,k,ispec) - c15 = c15store(i,j,k,ispec) - c16 = c16store(i,j,k,ispec) - c22 = c22store(i,j,k,ispec) - c23 = c23store(i,j,k,ispec) - c24 = c24store(i,j,k,ispec) - c25 = c25store(i,j,k,ispec) - c26 = c26store(i,j,k,ispec) - c33 = c33store(i,j,k,ispec) - c34 = c34store(i,j,k,ispec) - c35 = c35store(i,j,k,ispec) - c36 = c36store(i,j,k,ispec) - c44 = c44store(i,j,k,ispec) - c45 = c45store(i,j,k,ispec) - c46 = c46store(i,j,k,ispec) - c55 = c55store(i,j,k,ispec) - c56 = c56store(i,j,k,ispec) - c66 = c66store(i,j,k,ispec) - - sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & - c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl - sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & - c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl - sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & - c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl - sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & - c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl - sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & - c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl - sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & - c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl - - else - - ! isotropic case - lambdalplus2mul = kappal + FOUR_THIRDS * mul - lambdal = lambdalplus2mul - 2.*mul - - ! compute stress sigma - sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl - sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl - sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl - - sigma_xy = mul*duxdyl_plus_duydxl - sigma_xz = mul*duzdxl_plus_duxdzl - sigma_yz = mul*duzdyl_plus_duydzl - - endif ! ANISOTROPY - - ! subtract memory variables if attenuation - if(ATTENUATION) then -! way 1 -! do i_sls = 1,N_SLS -! R_xx_val = R_xx(i,j,k,ispec,i_sls) -! R_yy_val = R_yy(i,j,k,ispec,i_sls) -! sigma_xx = sigma_xx - R_xx_val -! sigma_yy = sigma_yy - R_yy_val -! sigma_zz = sigma_zz + R_xx_val + R_yy_val -! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) -! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) -! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) -! enddo - -! way 2 -! note: this should help compilers to pipeline the code and make better use of the cache; -! depending on compilers, it can further decrease the computation time by ~ 30%. -! by default, N_SLS = 3, therefore we take steps of 3 - if(imodulo_N_SLS >= 1) then - do i_sls = 1,imodulo_N_SLS - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - enddo - endif - - if(N_SLS >= imodulo_N_SLS+1) then - do i_sls = imodulo_N_SLS+1,N_SLS,3 - R_xx_val1 = R_xx(i,j,k,ispec,i_sls) - R_yy_val1 = R_yy(i,j,k,ispec,i_sls) - sigma_xx = sigma_xx - R_xx_val1 - sigma_yy = sigma_yy - R_yy_val1 - sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) - - R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) - R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) - sigma_xx = sigma_xx - R_xx_val2 - sigma_yy = sigma_yy - R_yy_val2 - sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) - - R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) - R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) - sigma_xx = sigma_xx - R_xx_val3 - sigma_yy = sigma_yy - R_yy_val3 - sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 - sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) - sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) - sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) - enddo - endif - - - endif - - ! define symmetric components of sigma - sigma_yx = sigma_xy - sigma_zx = sigma_xz - sigma_zy = sigma_yz - - ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) - tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x - tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y - tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z - - tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x - tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y - tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z - - tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x - tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y - tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z - - enddo - enddo - enddo - - ! subroutines adapted from Deville, Fischer and Mund, High-order methods - ! for incompressible fluid flow, Cambridge University Press (2002), - ! pages 386 and 389 and Figure 8.3.1 - ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) - do j=1,m2 - do i=1,m1 - E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + & - hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + & - hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + & - hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + & - hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + & - hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + & - hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + & - hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + & - hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + & - hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j) - E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + & - hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + & - hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + & - hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + & - hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + & - hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + & - hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + & - hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + & - hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + & - hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j) - E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + & - hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + & - hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + & - hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + & - hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + & - hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + & - hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + & - hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + & - hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + & - hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j) - enddo - enddo - - ! call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & - ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) - do i=1,m1 - do j=1,m1 - ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code - do k = 1,NGLLX - newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & - tempx2(i,2,k)*hprimewgll_xx(2,j) + & - tempx2(i,3,k)*hprimewgll_xx(3,j) + & - tempx2(i,4,k)*hprimewgll_xx(4,j) + & - tempx2(i,5,k)*hprimewgll_xx(5,j) + & - tempx2(i,6,k)*hprimewgll_xx(6,j) + & - tempx2(i,7,k)*hprimewgll_xx(7,j) + & - tempx2(i,8,k)*hprimewgll_xx(8,j) + & - tempx2(i,9,k)*hprimewgll_xx(9,j) + & - tempx2(i,10,k)*hprimewgll_xx(10,j) - newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & - tempy2(i,2,k)*hprimewgll_xx(2,j) + & - tempy2(i,3,k)*hprimewgll_xx(3,j) + & - tempy2(i,4,k)*hprimewgll_xx(4,j) + & - tempy2(i,5,k)*hprimewgll_xx(5,j) + & - tempy2(i,6,k)*hprimewgll_xx(6,j) + & - tempy2(i,7,k)*hprimewgll_xx(7,j) + & - tempy2(i,8,k)*hprimewgll_xx(8,j) + & - tempy2(i,9,k)*hprimewgll_xx(9,j) + & - tempy2(i,10,k)*hprimewgll_xx(10,j) - newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & - tempz2(i,2,k)*hprimewgll_xx(2,j) + & - tempz2(i,3,k)*hprimewgll_xx(3,j) + & - tempz2(i,4,k)*hprimewgll_xx(4,j) + & - tempz2(i,5,k)*hprimewgll_xx(5,j) + & - tempz2(i,6,k)*hprimewgll_xx(6,j) + & - tempz2(i,7,k)*hprimewgll_xx(7,j) + & - tempz2(i,8,k)*hprimewgll_xx(8,j) + & - tempz2(i,9,k)*hprimewgll_xx(9,j) + & - tempz2(i,10,k)*hprimewgll_xx(10,j) - enddo - enddo - enddo - - ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) - do j=1,m1 - do i=1,m2 - E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & - C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & - C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & - C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & - C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & - C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & - C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & - C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & - C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & - C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) - E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & - C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & - C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & - C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & - C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & - C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & - C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & - C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & - C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & - C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) - E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & - C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & - C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & - C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & - C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & - C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & - C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & - C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & - C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & - C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) - enddo - enddo - - do k=1,NGLLZ - do j=1,NGLLY - do i=1,NGLLX - - fac1 = wgllwgll_yz(j,k) - fac2 = wgllwgll_xz(i,k) - fac3 = wgllwgll_xy(i,j) - - ! sum contributions from each element to the global mesh using indirect addressing - iglob = ibool(i,j,k,ispec) - accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & - fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) - accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & - fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) - accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & - fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) - - ! update memory variables based upon the Runge-Kutta scheme - if(ATTENUATION) then - - ! use Runge-Kutta scheme to march in time - do i_sls = 1,N_SLS - - factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) - - alphaval_loc = alphaval(i_sls) - betaval_loc = betaval(i_sls) - gammaval_loc = gammaval(i_sls) - - ! term in xx - Sn = factor_loc * epsilondev_xx(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) - R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yy - Sn = factor_loc * epsilondev_yy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) - R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in zz not computed since zero trace - ! term in xy - Sn = factor_loc * epsilondev_xy(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) - R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in xz - Sn = factor_loc * epsilondev_xz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) - R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - ! term in yz - Sn = factor_loc * epsilondev_yz(i,j,k,ispec) - Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) - R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & - betaval_loc * Sn + gammaval_loc * Snp1 - - enddo ! end of loop on memory variables - - endif ! end attenuation - - enddo - enddo - enddo - - ! save deviatoric strain for Runge-Kutta scheme - if ( COMPUTE_AND_STORE_STRAIN ) then - epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) - epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) - epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) - epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) - epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) - endif - - enddo ! spectral element loop - -end subroutine compute_forces_elastic_Dev_10p diff --git a/src/specfem3D/compute_forces_elastic_Dev2.f90 b/src/specfem3D/compute_forces_elastic_Dev2.f90 new file mode 100644 index 000000000..6ee7d610a --- /dev/null +++ b/src/specfem3D/compute_forces_elastic_Dev2.f90 @@ -0,0 +1,3477 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== + + +! higher-order Deville routines (NGLL == 6 to NGLL == 10 ) +! +! note: put these routines into this extra file to avoid reaching internal threshold +! for vectorizations when compiling + +subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic) + + +! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + implicit none + + integer :: NSPEC_AB,NGLOB_AB + +! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + +! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + +! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + +! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + +! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + +! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + +! local parameters + real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, & + newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 + real(kind=CUSTOM_REAL), dimension(6,6,6) :: & + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + + ! manually inline the calls to the Deville et al. (2002) routines + real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points + real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points + real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points + + equivalence(dummyx_loc,B1_m1_m2_6points) + equivalence(dummyy_loc,B2_m1_m2_6points) + equivalence(dummyz_loc,B3_m1_m2_6points) + equivalence(tempx1,C1_m1_m2_6points) + equivalence(tempy1,C2_m1_m2_6points) + equivalence(tempz1,C3_m1_m2_6points) + equivalence(newtempx1,E1_m1_m2_6points) + equivalence(newtempy1,E2_m1_m2_6points) + equivalence(newtempz1,E3_m1_m2_6points) + + real(kind=CUSTOM_REAL), dimension(36,6) :: & + A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points + real(kind=CUSTOM_REAL), dimension(36,6) :: & + C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points + real(kind=CUSTOM_REAL), dimension(36,6) :: & + E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points + + equivalence(dummyx_loc,A1_mxm_m2_m1_6points) + equivalence(dummyy_loc,A2_mxm_m2_m1_6points) + equivalence(dummyz_loc,A3_mxm_m2_m1_6points) + equivalence(tempx3,C1_mxm_m2_m1_6points) + equivalence(tempy3,C2_mxm_m2_m1_6points) + equivalence(tempz3,C3_mxm_m2_m1_6points) + equivalence(newtempx3,E1_mxm_m2_m1_6points) + equivalence(newtempy3,E2_mxm_m2_m1_6points) + equivalence(newtempz3,E3_mxm_m2_m1_6points) + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + + imodulo_N_SLS = mod(N_SLS,3) + + ! choses inner/outer elements + if( iphase == 1 ) then + num_elements = nspec_outer_elastic + else + num_elements = nspec_inner_elastic + endif + + do ispec_p = 1,num_elements + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k) = displ(1,iglob) + dummyy_loc(i,j,k) = displ(2,iglob) + dummyz_loc(i,j,k) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + & + hprime_xx(i,2)*B1_m1_m2_6points(2,j) + & + hprime_xx(i,3)*B1_m1_m2_6points(3,j) + & + hprime_xx(i,4)*B1_m1_m2_6points(4,j) + & + hprime_xx(i,5)*B1_m1_m2_6points(5,j) + & + hprime_xx(i,6)*B1_m1_m2_6points(6,j) + C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + & + hprime_xx(i,2)*B2_m1_m2_6points(2,j) + & + hprime_xx(i,3)*B2_m1_m2_6points(3,j) + & + hprime_xx(i,4)*B2_m1_m2_6points(4,j) + & + hprime_xx(i,5)*B2_m1_m2_6points(5,j) + & + hprime_xx(i,6)*B2_m1_m2_6points(6,j) + C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + & + hprime_xx(i,2)*B3_m1_m2_6points(2,j) + & + hprime_xx(i,3)*B3_m1_m2_6points(3,j) + & + hprime_xx(i,4)*B3_m1_m2_6points(4,j) + & + hprime_xx(i,5)*B3_m1_m2_6points(5,j) + & + hprime_xx(i,6)*B3_m1_m2_6points(6,j) + enddo + enddo + + ! call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k)*hprime_xxT(5,j) + & + dummyx_loc(i,6,k)*hprime_xxT(6,j) + tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k)*hprime_xxT(5,j) + & + dummyy_loc(i,6,k)*hprime_xxT(6,j) + tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k)*hprime_xxT(5,j) + & + dummyz_loc(i,6,k)*hprime_xxT(6,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & + A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & + A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & + A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & + A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & + A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) + C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & + A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & + A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & + A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & + A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & + A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) + C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + & + A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + & + A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + & + A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + & + A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + & + A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) + duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) + duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) + + duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) + duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) + duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) + + duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) + duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) + duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then +! way 1 +! do i_sls = 1,N_SLS +! R_xx_val = R_xx(i,j,k,ispec,i_sls) +! R_yy_val = R_yy(i,j,k,ispec,i_sls) +! sigma_xx = sigma_xx - R_xx_val +! sigma_yy = sigma_yy - R_yy_val +! sigma_zz = sigma_zz + R_xx_val + R_yy_val +! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) +! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) +! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) +! enddo + +! way 2 +! note: this should help compilers to pipeline the code and make better use of the cache; +! depending on compilers, it can further decrease the computation time by ~ 30%. +! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! define symmetric components of sigma + sigma_yx = sigma_xy + sigma_zx = sigma_xz + sigma_zy = sigma_yz + + ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) + tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x + tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y + tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z + + tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x + tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y + tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z + + tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x + tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y + tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + & + hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + & + hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + & + hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + & + hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + & + hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j) + E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + & + hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + & + hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + & + hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + & + hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + & + hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j) + E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + & + hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + & + hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + & + hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + & + hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + & + hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j) + enddo + enddo + + ! call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & + tempx2(i,2,k)*hprimewgll_xx(2,j) + & + tempx2(i,3,k)*hprimewgll_xx(3,j) + & + tempx2(i,4,k)*hprimewgll_xx(4,j) + & + tempx2(i,5,k)*hprimewgll_xx(5,j) + & + tempx2(i,6,k)*hprimewgll_xx(6,j) + newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & + tempy2(i,2,k)*hprimewgll_xx(2,j) + & + tempy2(i,3,k)*hprimewgll_xx(3,j) + & + tempy2(i,4,k)*hprimewgll_xx(4,j) + & + tempy2(i,5,k)*hprimewgll_xx(5,j) + & + tempy2(i,6,k)*hprimewgll_xx(6,j) + newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & + tempz2(i,2,k)*hprimewgll_xx(2,j) + & + tempz2(i,3,k)*hprimewgll_xx(3,j) + & + tempz2(i,4,k)*hprimewgll_xx(4,j) + & + tempz2(i,5,k)*hprimewgll_xx(5,j) + & + tempz2(i,6,k)*hprimewgll_xx(6,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & + C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & + C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & + C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & + C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & + C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) + E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & + C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & + C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & + C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & + C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & + C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) + E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + & + C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + & + C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + & + C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + & + C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + & + C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & + fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) + accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & + fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) + accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & + fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + +end subroutine compute_forces_elastic_Dev_6p + +! +!===================================================================== +! + +subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic) + + +! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + implicit none + + integer :: NSPEC_AB,NGLOB_AB + +! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + +! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + +! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + +! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + +! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + +! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + +! local parameters + real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, & + newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 + real(kind=CUSTOM_REAL), dimension(7,7,7) :: & + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + + ! manually inline the calls to the Deville et al. (2002) routines + real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points + real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points + real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points + + equivalence(dummyx_loc,B1_m1_m2_7points) + equivalence(dummyy_loc,B2_m1_m2_7points) + equivalence(dummyz_loc,B3_m1_m2_7points) + equivalence(tempx1,C1_m1_m2_7points) + equivalence(tempy1,C2_m1_m2_7points) + equivalence(tempz1,C3_m1_m2_7points) + equivalence(newtempx1,E1_m1_m2_7points) + equivalence(newtempy1,E2_m1_m2_7points) + equivalence(newtempz1,E3_m1_m2_7points) + + real(kind=CUSTOM_REAL), dimension(49,7) :: & + A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points + real(kind=CUSTOM_REAL), dimension(49,7) :: & + C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points + real(kind=CUSTOM_REAL), dimension(49,7) :: & + E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points + + equivalence(dummyx_loc,A1_mxm_m2_m1_7points) + equivalence(dummyy_loc,A2_mxm_m2_m1_7points) + equivalence(dummyz_loc,A3_mxm_m2_m1_7points) + equivalence(tempx3,C1_mxm_m2_m1_7points) + equivalence(tempy3,C2_mxm_m2_m1_7points) + equivalence(tempz3,C3_mxm_m2_m1_7points) + equivalence(newtempx3,E1_mxm_m2_m1_7points) + equivalence(newtempy3,E2_mxm_m2_m1_7points) + equivalence(newtempz3,E3_mxm_m2_m1_7points) + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + + imodulo_N_SLS = mod(N_SLS,3) + + ! choses inner/outer elements + if( iphase == 1 ) then + num_elements = nspec_outer_elastic + else + num_elements = nspec_inner_elastic + endif + + do ispec_p = 1,num_elements + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k) = displ(1,iglob) + dummyy_loc(i,j,k) = displ(2,iglob) + dummyz_loc(i,j,k) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + & + hprime_xx(i,2)*B1_m1_m2_7points(2,j) + & + hprime_xx(i,3)*B1_m1_m2_7points(3,j) + & + hprime_xx(i,4)*B1_m1_m2_7points(4,j) + & + hprime_xx(i,5)*B1_m1_m2_7points(5,j) + & + hprime_xx(i,6)*B1_m1_m2_7points(6,j) + & + hprime_xx(i,7)*B1_m1_m2_7points(7,j) + C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + & + hprime_xx(i,2)*B2_m1_m2_7points(2,j) + & + hprime_xx(i,3)*B2_m1_m2_7points(3,j) + & + hprime_xx(i,4)*B2_m1_m2_7points(4,j) + & + hprime_xx(i,5)*B2_m1_m2_7points(5,j) + & + hprime_xx(i,6)*B2_m1_m2_7points(6,j) + & + hprime_xx(i,7)*B2_m1_m2_7points(7,j) + C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + & + hprime_xx(i,2)*B3_m1_m2_7points(2,j) + & + hprime_xx(i,3)*B3_m1_m2_7points(3,j) + & + hprime_xx(i,4)*B3_m1_m2_7points(4,j) + & + hprime_xx(i,5)*B3_m1_m2_7points(5,j) + & + hprime_xx(i,6)*B3_m1_m2_7points(6,j) + & + hprime_xx(i,7)*B3_m1_m2_7points(7,j) + enddo + enddo + + ! call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k)*hprime_xxT(5,j) + & + dummyx_loc(i,6,k)*hprime_xxT(6,j) + & + dummyx_loc(i,7,k)*hprime_xxT(7,j) + tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k)*hprime_xxT(5,j) + & + dummyy_loc(i,6,k)*hprime_xxT(6,j) + & + dummyy_loc(i,7,k)*hprime_xxT(7,j) + tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k)*hprime_xxT(5,j) + & + dummyz_loc(i,6,k)*hprime_xxT(6,j) + & + dummyz_loc(i,7,k)*hprime_xxT(7,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & + A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & + A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & + A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & + A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & + A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & + A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) + C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & + A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & + A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & + A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & + A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & + A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & + A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) + C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + & + A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + & + A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + & + A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + & + A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + & + A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + & + A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) + duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) + duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) + + duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) + duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) + duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) + + duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) + duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) + duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then +! way 1 +! do i_sls = 1,N_SLS +! R_xx_val = R_xx(i,j,k,ispec,i_sls) +! R_yy_val = R_yy(i,j,k,ispec,i_sls) +! sigma_xx = sigma_xx - R_xx_val +! sigma_yy = sigma_yy - R_yy_val +! sigma_zz = sigma_zz + R_xx_val + R_yy_val +! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) +! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) +! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) +! enddo + +! way 2 +! note: this should help compilers to pipeline the code and make better use of the cache; +! depending on compilers, it can further decrease the computation time by ~ 30%. +! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! define symmetric components of sigma + sigma_yx = sigma_xy + sigma_zx = sigma_xz + sigma_zy = sigma_yz + + ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) + tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x + tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y + tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z + + tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x + tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y + tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z + + tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x + tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y + tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + & + hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + & + hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + & + hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + & + hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + & + hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + & + hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j) + E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + & + hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + & + hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + & + hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + & + hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + & + hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + & + hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j) + E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + & + hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + & + hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + & + hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + & + hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + & + hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + & + hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j) + enddo + enddo + + ! call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & + tempx2(i,2,k)*hprimewgll_xx(2,j) + & + tempx2(i,3,k)*hprimewgll_xx(3,j) + & + tempx2(i,4,k)*hprimewgll_xx(4,j) + & + tempx2(i,5,k)*hprimewgll_xx(5,j) + & + tempx2(i,6,k)*hprimewgll_xx(6,j) + & + tempx2(i,7,k)*hprimewgll_xx(7,j) + newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & + tempy2(i,2,k)*hprimewgll_xx(2,j) + & + tempy2(i,3,k)*hprimewgll_xx(3,j) + & + tempy2(i,4,k)*hprimewgll_xx(4,j) + & + tempy2(i,5,k)*hprimewgll_xx(5,j) + & + tempy2(i,6,k)*hprimewgll_xx(6,j) + & + tempy2(i,7,k)*hprimewgll_xx(7,j) + newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & + tempz2(i,2,k)*hprimewgll_xx(2,j) + & + tempz2(i,3,k)*hprimewgll_xx(3,j) + & + tempz2(i,4,k)*hprimewgll_xx(4,j) + & + tempz2(i,5,k)*hprimewgll_xx(5,j) + & + tempz2(i,6,k)*hprimewgll_xx(6,j) + & + tempz2(i,7,k)*hprimewgll_xx(7,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & + C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & + C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & + C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & + C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & + C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & + C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) + E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & + C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & + C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & + C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & + C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & + C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & + C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) + E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + & + C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + & + C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + & + C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + & + C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + & + C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + & + C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & + fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) + accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & + fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) + accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & + fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + +end subroutine compute_forces_elastic_Dev_7p + +! +!===================================================================== +! + +subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic) + + +! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + implicit none + + integer :: NSPEC_AB,NGLOB_AB + +! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + +! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + +! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + +! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + +! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + +! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + +! local parameters + real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, & + newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 + real(kind=CUSTOM_REAL), dimension(8,8,8) :: & + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + + ! manually inline the calls to the Deville et al. (2002) routines + real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points + real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points + real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points + + equivalence(dummyx_loc,B1_m1_m2_8points) + equivalence(dummyy_loc,B2_m1_m2_8points) + equivalence(dummyz_loc,B3_m1_m2_8points) + equivalence(tempx1,C1_m1_m2_8points) + equivalence(tempy1,C2_m1_m2_8points) + equivalence(tempz1,C3_m1_m2_8points) + equivalence(newtempx1,E1_m1_m2_8points) + equivalence(newtempy1,E2_m1_m2_8points) + equivalence(newtempz1,E3_m1_m2_8points) + + real(kind=CUSTOM_REAL), dimension(64,8) :: & + A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points + real(kind=CUSTOM_REAL), dimension(64,8) :: & + C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points + real(kind=CUSTOM_REAL), dimension(64,8) :: & + E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points + + equivalence(dummyx_loc,A1_mxm_m2_m1_8points) + equivalence(dummyy_loc,A2_mxm_m2_m1_8points) + equivalence(dummyz_loc,A3_mxm_m2_m1_8points) + equivalence(tempx3,C1_mxm_m2_m1_8points) + equivalence(tempy3,C2_mxm_m2_m1_8points) + equivalence(tempz3,C3_mxm_m2_m1_8points) + equivalence(newtempx3,E1_mxm_m2_m1_8points) + equivalence(newtempy3,E2_mxm_m2_m1_8points) + equivalence(newtempz3,E3_mxm_m2_m1_8points) + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + + imodulo_N_SLS = mod(N_SLS,3) + + ! choses inner/outer elements + if( iphase == 1 ) then + num_elements = nspec_outer_elastic + else + num_elements = nspec_inner_elastic + endif + + do ispec_p = 1,num_elements + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k) = displ(1,iglob) + dummyy_loc(i,j,k) = displ(2,iglob) + dummyz_loc(i,j,k) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + & + hprime_xx(i,2)*B1_m1_m2_8points(2,j) + & + hprime_xx(i,3)*B1_m1_m2_8points(3,j) + & + hprime_xx(i,4)*B1_m1_m2_8points(4,j) + & + hprime_xx(i,5)*B1_m1_m2_8points(5,j) + & + hprime_xx(i,6)*B1_m1_m2_8points(6,j) + & + hprime_xx(i,7)*B1_m1_m2_8points(7,j) + & + hprime_xx(i,8)*B1_m1_m2_8points(8,j) + C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + & + hprime_xx(i,2)*B2_m1_m2_8points(2,j) + & + hprime_xx(i,3)*B2_m1_m2_8points(3,j) + & + hprime_xx(i,4)*B2_m1_m2_8points(4,j) + & + hprime_xx(i,5)*B2_m1_m2_8points(5,j) + & + hprime_xx(i,6)*B2_m1_m2_8points(6,j) + & + hprime_xx(i,7)*B2_m1_m2_8points(7,j) + & + hprime_xx(i,8)*B2_m1_m2_8points(8,j) + C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + & + hprime_xx(i,2)*B3_m1_m2_8points(2,j) + & + hprime_xx(i,3)*B3_m1_m2_8points(3,j) + & + hprime_xx(i,4)*B3_m1_m2_8points(4,j) + & + hprime_xx(i,5)*B3_m1_m2_8points(5,j) + & + hprime_xx(i,6)*B3_m1_m2_8points(6,j) + & + hprime_xx(i,7)*B3_m1_m2_8points(7,j) + & + hprime_xx(i,8)*B3_m1_m2_8points(8,j) + enddo + enddo + + ! call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k)*hprime_xxT(5,j) + & + dummyx_loc(i,6,k)*hprime_xxT(6,j) + & + dummyx_loc(i,7,k)*hprime_xxT(7,j) + & + dummyx_loc(i,8,k)*hprime_xxT(8,j) + tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k)*hprime_xxT(5,j) + & + dummyy_loc(i,6,k)*hprime_xxT(6,j) + & + dummyy_loc(i,7,k)*hprime_xxT(7,j) + & + dummyy_loc(i,8,k)*hprime_xxT(8,j) + tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k)*hprime_xxT(5,j) + & + dummyz_loc(i,6,k)*hprime_xxT(6,j) + & + dummyz_loc(i,7,k)*hprime_xxT(7,j) + & + dummyz_loc(i,8,k)*hprime_xxT(8,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & + A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & + A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & + A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & + A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & + A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & + A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & + A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) + C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & + A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & + A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & + A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & + A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & + A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & + A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & + A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) + C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + & + A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + & + A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + & + A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + & + A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + & + A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + & + A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + & + A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) + duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) + duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) + + duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) + duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) + duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) + + duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) + duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) + duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then +! way 1 +! do i_sls = 1,N_SLS +! R_xx_val = R_xx(i,j,k,ispec,i_sls) +! R_yy_val = R_yy(i,j,k,ispec,i_sls) +! sigma_xx = sigma_xx - R_xx_val +! sigma_yy = sigma_yy - R_yy_val +! sigma_zz = sigma_zz + R_xx_val + R_yy_val +! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) +! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) +! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) +! enddo + +! way 2 +! note: this should help compilers to pipeline the code and make better use of the cache; +! depending on compilers, it can further decrease the computation time by ~ 30%. +! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! define symmetric components of sigma + sigma_yx = sigma_xy + sigma_zx = sigma_xz + sigma_zy = sigma_yz + + ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) + tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x + tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y + tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z + + tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x + tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y + tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z + + tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x + tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y + tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + & + hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + & + hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + & + hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + & + hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + & + hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + & + hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + & + hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j) + E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + & + hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + & + hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + & + hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + & + hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + & + hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + & + hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + & + hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j) + E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + & + hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + & + hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + & + hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + & + hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + & + hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + & + hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + & + hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j) + enddo + enddo + + ! call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & + tempx2(i,2,k)*hprimewgll_xx(2,j) + & + tempx2(i,3,k)*hprimewgll_xx(3,j) + & + tempx2(i,4,k)*hprimewgll_xx(4,j) + & + tempx2(i,5,k)*hprimewgll_xx(5,j) + & + tempx2(i,6,k)*hprimewgll_xx(6,j) + & + tempx2(i,7,k)*hprimewgll_xx(7,j) + & + tempx2(i,8,k)*hprimewgll_xx(8,j) + newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & + tempy2(i,2,k)*hprimewgll_xx(2,j) + & + tempy2(i,3,k)*hprimewgll_xx(3,j) + & + tempy2(i,4,k)*hprimewgll_xx(4,j) + & + tempy2(i,5,k)*hprimewgll_xx(5,j) + & + tempy2(i,6,k)*hprimewgll_xx(6,j) + & + tempy2(i,7,k)*hprimewgll_xx(7,j) + & + tempy2(i,8,k)*hprimewgll_xx(8,j) + newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & + tempz2(i,2,k)*hprimewgll_xx(2,j) + & + tempz2(i,3,k)*hprimewgll_xx(3,j) + & + tempz2(i,4,k)*hprimewgll_xx(4,j) + & + tempz2(i,5,k)*hprimewgll_xx(5,j) + & + tempz2(i,6,k)*hprimewgll_xx(6,j) + & + tempz2(i,7,k)*hprimewgll_xx(7,j) + & + tempz2(i,8,k)*hprimewgll_xx(8,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & + C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & + C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & + C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & + C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & + C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & + C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & + C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) + E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & + C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & + C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & + C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & + C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & + C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & + C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & + C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) + E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + & + C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + & + C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + & + C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + & + C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + & + C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + & + C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + & + C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & + fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) + accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & + fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) + accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & + fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + +end subroutine compute_forces_elastic_Dev_8p + +! +!===================================================================== +! + +subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic) + + +! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + implicit none + + integer :: NSPEC_AB,NGLOB_AB + +! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + +! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + +! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + +! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + +! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + +! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + +! local parameters + real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, & + newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 + real(kind=CUSTOM_REAL), dimension(9,9,9) :: & + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + + ! manually inline the calls to the Deville et al. (2002) routines + real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points + real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points + real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points + + equivalence(dummyx_loc,B1_m1_m2_9points) + equivalence(dummyy_loc,B2_m1_m2_9points) + equivalence(dummyz_loc,B3_m1_m2_9points) + equivalence(tempx1,C1_m1_m2_9points) + equivalence(tempy1,C2_m1_m2_9points) + equivalence(tempz1,C3_m1_m2_9points) + equivalence(newtempx1,E1_m1_m2_9points) + equivalence(newtempy1,E2_m1_m2_9points) + equivalence(newtempz1,E3_m1_m2_9points) + + real(kind=CUSTOM_REAL), dimension(81,9) :: & + A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points + real(kind=CUSTOM_REAL), dimension(81,9) :: & + C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points + real(kind=CUSTOM_REAL), dimension(81,9) :: & + E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points + + equivalence(dummyx_loc,A1_mxm_m2_m1_9points) + equivalence(dummyy_loc,A2_mxm_m2_m1_9points) + equivalence(dummyz_loc,A3_mxm_m2_m1_9points) + equivalence(tempx3,C1_mxm_m2_m1_9points) + equivalence(tempy3,C2_mxm_m2_m1_9points) + equivalence(tempz3,C3_mxm_m2_m1_9points) + equivalence(newtempx3,E1_mxm_m2_m1_9points) + equivalence(newtempy3,E2_mxm_m2_m1_9points) + equivalence(newtempz3,E3_mxm_m2_m1_9points) + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + + imodulo_N_SLS = mod(N_SLS,3) + + ! choses inner/outer elements + if( iphase == 1 ) then + num_elements = nspec_outer_elastic + else + num_elements = nspec_inner_elastic + endif + + do ispec_p = 1,num_elements + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k) = displ(1,iglob) + dummyy_loc(i,j,k) = displ(2,iglob) + dummyz_loc(i,j,k) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + & + hprime_xx(i,2)*B1_m1_m2_9points(2,j) + & + hprime_xx(i,3)*B1_m1_m2_9points(3,j) + & + hprime_xx(i,4)*B1_m1_m2_9points(4,j) + & + hprime_xx(i,5)*B1_m1_m2_9points(5,j) + & + hprime_xx(i,6)*B1_m1_m2_9points(6,j) + & + hprime_xx(i,7)*B1_m1_m2_9points(7,j) + & + hprime_xx(i,8)*B1_m1_m2_9points(8,j) + & + hprime_xx(i,9)*B1_m1_m2_9points(9,j) + C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + & + hprime_xx(i,2)*B2_m1_m2_9points(2,j) + & + hprime_xx(i,3)*B2_m1_m2_9points(3,j) + & + hprime_xx(i,4)*B2_m1_m2_9points(4,j) + & + hprime_xx(i,5)*B2_m1_m2_9points(5,j) + & + hprime_xx(i,6)*B2_m1_m2_9points(6,j) + & + hprime_xx(i,7)*B2_m1_m2_9points(7,j) + & + hprime_xx(i,8)*B2_m1_m2_9points(8,j) + & + hprime_xx(i,9)*B2_m1_m2_9points(9,j) + C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + & + hprime_xx(i,2)*B3_m1_m2_9points(2,j) + & + hprime_xx(i,3)*B3_m1_m2_9points(3,j) + & + hprime_xx(i,4)*B3_m1_m2_9points(4,j) + & + hprime_xx(i,5)*B3_m1_m2_9points(5,j) + & + hprime_xx(i,6)*B3_m1_m2_9points(6,j) + & + hprime_xx(i,7)*B3_m1_m2_9points(7,j) + & + hprime_xx(i,8)*B3_m1_m2_9points(8,j) + & + hprime_xx(i,9)*B3_m1_m2_9points(9,j) + enddo + enddo + + ! call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k)*hprime_xxT(5,j) + & + dummyx_loc(i,6,k)*hprime_xxT(6,j) + & + dummyx_loc(i,7,k)*hprime_xxT(7,j) + & + dummyx_loc(i,8,k)*hprime_xxT(8,j) + & + dummyx_loc(i,9,k)*hprime_xxT(9,j) + tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k)*hprime_xxT(5,j) + & + dummyy_loc(i,6,k)*hprime_xxT(6,j) + & + dummyy_loc(i,7,k)*hprime_xxT(7,j) + & + dummyy_loc(i,8,k)*hprime_xxT(8,j) + & + dummyy_loc(i,9,k)*hprime_xxT(9,j) + tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k)*hprime_xxT(5,j) + & + dummyz_loc(i,6,k)*hprime_xxT(6,j) + & + dummyz_loc(i,7,k)*hprime_xxT(7,j) + & + dummyz_loc(i,8,k)*hprime_xxT(8,j) + & + dummyz_loc(i,9,k)*hprime_xxT(9,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & + A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & + A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & + A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & + A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & + A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & + A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & + A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & + A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) + C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & + A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & + A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & + A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & + A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & + A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & + A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & + A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & + A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) + C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + & + A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + & + A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + & + A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + & + A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + & + A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + & + A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + & + A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + & + A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) + duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) + duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) + + duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) + duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) + duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) + + duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) + duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) + duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then +! way 1 +! do i_sls = 1,N_SLS +! R_xx_val = R_xx(i,j,k,ispec,i_sls) +! R_yy_val = R_yy(i,j,k,ispec,i_sls) +! sigma_xx = sigma_xx - R_xx_val +! sigma_yy = sigma_yy - R_yy_val +! sigma_zz = sigma_zz + R_xx_val + R_yy_val +! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) +! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) +! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) +! enddo + +! way 2 +! note: this should help compilers to pipeline the code and make better use of the cache; +! depending on compilers, it can further decrease the computation time by ~ 30%. +! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! define symmetric components of sigma + sigma_yx = sigma_xy + sigma_zx = sigma_xz + sigma_zy = sigma_yz + + ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) + tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x + tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y + tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z + + tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x + tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y + tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z + + tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x + tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y + tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + & + hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + & + hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + & + hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + & + hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + & + hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + & + hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + & + hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + & + hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j) + E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + & + hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + & + hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + & + hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + & + hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + & + hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + & + hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + & + hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + & + hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j) + E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + & + hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + & + hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + & + hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + & + hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + & + hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + & + hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + & + hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + & + hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j) + enddo + enddo + + ! call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & + tempx2(i,2,k)*hprimewgll_xx(2,j) + & + tempx2(i,3,k)*hprimewgll_xx(3,j) + & + tempx2(i,4,k)*hprimewgll_xx(4,j) + & + tempx2(i,5,k)*hprimewgll_xx(5,j) + & + tempx2(i,6,k)*hprimewgll_xx(6,j) + & + tempx2(i,7,k)*hprimewgll_xx(7,j) + & + tempx2(i,8,k)*hprimewgll_xx(8,j) + & + tempx2(i,9,k)*hprimewgll_xx(9,j) + newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & + tempy2(i,2,k)*hprimewgll_xx(2,j) + & + tempy2(i,3,k)*hprimewgll_xx(3,j) + & + tempy2(i,4,k)*hprimewgll_xx(4,j) + & + tempy2(i,5,k)*hprimewgll_xx(5,j) + & + tempy2(i,6,k)*hprimewgll_xx(6,j) + & + tempy2(i,7,k)*hprimewgll_xx(7,j) + & + tempy2(i,8,k)*hprimewgll_xx(8,j) + & + tempy2(i,9,k)*hprimewgll_xx(9,j) + newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & + tempz2(i,2,k)*hprimewgll_xx(2,j) + & + tempz2(i,3,k)*hprimewgll_xx(3,j) + & + tempz2(i,4,k)*hprimewgll_xx(4,j) + & + tempz2(i,5,k)*hprimewgll_xx(5,j) + & + tempz2(i,6,k)*hprimewgll_xx(6,j) + & + tempz2(i,7,k)*hprimewgll_xx(7,j) + & + tempz2(i,8,k)*hprimewgll_xx(8,j) + & + tempz2(i,9,k)*hprimewgll_xx(9,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & + C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & + C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & + C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & + C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & + C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & + C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & + C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & + C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) + E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & + C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & + C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & + C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & + C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & + C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & + C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & + C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & + C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) + E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + & + C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + & + C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + & + C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + & + C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + & + C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + & + C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + & + C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + & + C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & + fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) + accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & + fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) + accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & + fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + +end subroutine compute_forces_elastic_Dev_9p + +! +!===================================================================== +! + +subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,& + phase_ispec_inner_elastic) + + +! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + implicit none + + integer :: NSPEC_AB,NGLOB_AB + +! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + +! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + +! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + +! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + +! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + +! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + +! local parameters + real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, & + newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3 + real(kind=CUSTOM_REAL), dimension(10,10,10) :: & + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + + ! manually inline the calls to the Deville et al. (2002) routines + real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points + real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points + real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points + + equivalence(dummyx_loc,B1_m1_m2_10points) + equivalence(dummyy_loc,B2_m1_m2_10points) + equivalence(dummyz_loc,B3_m1_m2_10points) + equivalence(tempx1,C1_m1_m2_10points) + equivalence(tempy1,C2_m1_m2_10points) + equivalence(tempz1,C3_m1_m2_10points) + equivalence(newtempx1,E1_m1_m2_10points) + equivalence(newtempy1,E2_m1_m2_10points) + equivalence(newtempz1,E3_m1_m2_10points) + + real(kind=CUSTOM_REAL), dimension(100,10) :: & + A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points + real(kind=CUSTOM_REAL), dimension(100,10) :: & + C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points + real(kind=CUSTOM_REAL), dimension(100,10) :: & + E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points + + equivalence(dummyx_loc,A1_mxm_m2_m1_10points) + equivalence(dummyy_loc,A2_mxm_m2_m1_10points) + equivalence(dummyz_loc,A3_mxm_m2_m1_10points) + equivalence(tempx3,C1_mxm_m2_m1_10points) + equivalence(tempy3,C2_mxm_m2_m1_10points) + equivalence(tempz3,C3_mxm_m2_m1_10points) + equivalence(newtempx3,E1_mxm_m2_m1_10points) + equivalence(newtempy3,E2_mxm_m2_m1_10points) + equivalence(newtempz3,E3_mxm_m2_m1_10points) + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + + imodulo_N_SLS = mod(N_SLS,3) + + ! choses inner/outer elements + if( iphase == 1 ) then + num_elements = nspec_outer_elastic + else + num_elements = nspec_inner_elastic + endif + + do ispec_p = 1,num_elements + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k) = displ(1,iglob) + dummyy_loc(i,j,k) = displ(2,iglob) + dummyz_loc(i,j,k) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + & + hprime_xx(i,2)*B1_m1_m2_10points(2,j) + & + hprime_xx(i,3)*B1_m1_m2_10points(3,j) + & + hprime_xx(i,4)*B1_m1_m2_10points(4,j) + & + hprime_xx(i,5)*B1_m1_m2_10points(5,j) + & + hprime_xx(i,6)*B1_m1_m2_10points(6,j) + & + hprime_xx(i,7)*B1_m1_m2_10points(7,j) + & + hprime_xx(i,8)*B1_m1_m2_10points(8,j) + & + hprime_xx(i,9)*B1_m1_m2_10points(9,j) + & + hprime_xx(i,10)*B1_m1_m2_10points(10,j) + C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + & + hprime_xx(i,2)*B2_m1_m2_10points(2,j) + & + hprime_xx(i,3)*B2_m1_m2_10points(3,j) + & + hprime_xx(i,4)*B2_m1_m2_10points(4,j) + & + hprime_xx(i,5)*B2_m1_m2_10points(5,j) + & + hprime_xx(i,6)*B2_m1_m2_10points(6,j) + & + hprime_xx(i,7)*B2_m1_m2_10points(7,j) + & + hprime_xx(i,8)*B2_m1_m2_10points(8,j) + & + hprime_xx(i,9)*B2_m1_m2_10points(9,j) + & + hprime_xx(i,10)*B2_m1_m2_10points(10,j) + C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + & + hprime_xx(i,2)*B3_m1_m2_10points(2,j) + & + hprime_xx(i,3)*B3_m1_m2_10points(3,j) + & + hprime_xx(i,4)*B3_m1_m2_10points(4,j) + & + hprime_xx(i,5)*B3_m1_m2_10points(5,j) + & + hprime_xx(i,6)*B3_m1_m2_10points(6,j) + & + hprime_xx(i,7)*B3_m1_m2_10points(7,j) + & + hprime_xx(i,8)*B3_m1_m2_10points(8,j) + & + hprime_xx(i,9)*B3_m1_m2_10points(9,j) + & + hprime_xx(i,10)*B3_m1_m2_10points(10,j) + enddo + enddo + + ! call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k)*hprime_xxT(5,j) + & + dummyx_loc(i,6,k)*hprime_xxT(6,j) + & + dummyx_loc(i,7,k)*hprime_xxT(7,j) + & + dummyx_loc(i,8,k)*hprime_xxT(8,j) + & + dummyx_loc(i,9,k)*hprime_xxT(9,j) + & + dummyx_loc(i,10,k)*hprime_xxT(10,j) + tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k)*hprime_xxT(5,j) + & + dummyy_loc(i,6,k)*hprime_xxT(6,j) + & + dummyy_loc(i,7,k)*hprime_xxT(7,j) + & + dummyy_loc(i,8,k)*hprime_xxT(8,j) + & + dummyy_loc(i,9,k)*hprime_xxT(9,j) + & + dummyy_loc(i,10,k)*hprime_xxT(10,j) + tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k)*hprime_xxT(5,j) + & + dummyz_loc(i,6,k)*hprime_xxT(6,j) + & + dummyz_loc(i,7,k)*hprime_xxT(7,j) + & + dummyz_loc(i,8,k)*hprime_xxT(8,j) + & + dummyz_loc(i,9,k)*hprime_xxT(9,j) + & + dummyz_loc(i,10,k)*hprime_xxT(10,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & + A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & + A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & + A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & + A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & + A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & + A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & + A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & + A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & + A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) + C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & + A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & + A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & + A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & + A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & + A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & + A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & + A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & + A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & + A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) + C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + & + A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + & + A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + & + A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + & + A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + & + A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + & + A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + & + A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + & + A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + & + A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k) + duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k) + duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k) + + duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k) + duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k) + duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k) + + duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k) + duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k) + duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then +! way 1 +! do i_sls = 1,N_SLS +! R_xx_val = R_xx(i,j,k,ispec,i_sls) +! R_yy_val = R_yy(i,j,k,ispec,i_sls) +! sigma_xx = sigma_xx - R_xx_val +! sigma_yy = sigma_yy - R_yy_val +! sigma_zz = sigma_zz + R_xx_val + R_yy_val +! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) +! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) +! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) +! enddo + +! way 2 +! note: this should help compilers to pipeline the code and make better use of the cache; +! depending on compilers, it can further decrease the computation time by ~ 30%. +! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! define symmetric components of sigma + sigma_yx = sigma_xy + sigma_zx = sigma_xz + sigma_zy = sigma_yz + + ! form dot product with test vector, non-symmetric form (which is useful in the case of PML) + tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x + tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y + tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z + + tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x + tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y + tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z + + tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x + tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y + tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + & + hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + & + hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + & + hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + & + hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + & + hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + & + hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + & + hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + & + hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + & + hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j) + E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + & + hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + & + hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + & + hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + & + hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + & + hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + & + hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + & + hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + & + hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + & + hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j) + E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + & + hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + & + hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + & + hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + & + hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + & + hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + & + hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + & + hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + & + hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + & + hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j) + enddo + enddo + + ! call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + & + tempx2(i,2,k)*hprimewgll_xx(2,j) + & + tempx2(i,3,k)*hprimewgll_xx(3,j) + & + tempx2(i,4,k)*hprimewgll_xx(4,j) + & + tempx2(i,5,k)*hprimewgll_xx(5,j) + & + tempx2(i,6,k)*hprimewgll_xx(6,j) + & + tempx2(i,7,k)*hprimewgll_xx(7,j) + & + tempx2(i,8,k)*hprimewgll_xx(8,j) + & + tempx2(i,9,k)*hprimewgll_xx(9,j) + & + tempx2(i,10,k)*hprimewgll_xx(10,j) + newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + & + tempy2(i,2,k)*hprimewgll_xx(2,j) + & + tempy2(i,3,k)*hprimewgll_xx(3,j) + & + tempy2(i,4,k)*hprimewgll_xx(4,j) + & + tempy2(i,5,k)*hprimewgll_xx(5,j) + & + tempy2(i,6,k)*hprimewgll_xx(6,j) + & + tempy2(i,7,k)*hprimewgll_xx(7,j) + & + tempy2(i,8,k)*hprimewgll_xx(8,j) + & + tempy2(i,9,k)*hprimewgll_xx(9,j) + & + tempy2(i,10,k)*hprimewgll_xx(10,j) + newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + & + tempz2(i,2,k)*hprimewgll_xx(2,j) + & + tempz2(i,3,k)*hprimewgll_xx(3,j) + & + tempz2(i,4,k)*hprimewgll_xx(4,j) + & + tempz2(i,5,k)*hprimewgll_xx(5,j) + & + tempz2(i,6,k)*hprimewgll_xx(6,j) + & + tempz2(i,7,k)*hprimewgll_xx(7,j) + & + tempz2(i,8,k)*hprimewgll_xx(8,j) + & + tempz2(i,9,k)*hprimewgll_xx(9,j) + & + tempz2(i,10,k)*hprimewgll_xx(10,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & + C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & + C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & + C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & + C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & + C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & + C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & + C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & + C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & + C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) + E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & + C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & + C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & + C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & + C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & + C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & + C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & + C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & + C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & + C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) + E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + & + C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + & + C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + & + C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + & + C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + & + C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + & + C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + & + C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + & + C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + & + C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - & + fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k) + accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - & + fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k) + accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - & + fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + +end subroutine compute_forces_elastic_Dev_10p diff --git a/src/specfem3D/compute_forces_elastic_Dev_openmp.f90 b/src/specfem3D/compute_forces_elastic_Dev_openmp.f90 new file mode 100644 index 000000000..7a73f63bc --- /dev/null +++ b/src/specfem3D/compute_forces_elastic_Dev_openmp.f90 @@ -0,0 +1,786 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! October 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== + +! OpenMP Threaded variant by John Levesque, Max Rietmann and Olaf Schenk + + subroutine compute_forces_elastic_Dev_openmp(iphase ,NSPEC_AB,NGLOB_AB, & + displ,accel, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + hprime_xx,hprime_xxT, & + hprimewgll_xx,hprimewgll_xxT, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & + kappastore,mustore,jacobian,ibool, & + ATTENUATION, & + one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,& + NSPEC_ATTENUATION_AB, & + R_xx,R_yy,R_xy,R_xz,R_yz, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, & + ANISOTROPY,NSPEC_ANISO, & + c11store,c12store,c13store,c14store,c15store,c16store,& + c22store,c23store,c24store,c25store,c26store,c33store,& + c34store,c35store,c36store,c44store,c45store,c46store,& + c55store,c56store,c66store, & + SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, & + NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, & + is_moho_top,is_moho_bot, & + dsdx_top,dsdx_bot, & + ispec2D_moho_top,ispec2D_moho_bot, & + num_phase_ispec_elastic,& + phase_ispec_inner_elastic,& + num_colors_outer_elastic,num_colors_inner_elastic) + + + + ! computes elastic tensor term + + use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, & + N_SLS,SAVE_MOHO_MESH, & + ONE_THIRD,FOUR_THIRDS,m1,m2 + + ! Trying to pass these variables as subroutine arguments ran into + ! problems, so we reference them from their module, making them + ! accessible from this subroutine + use specfem_par_elastic, only:dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3,& + newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3,& + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3,num_elem_colors_elastic + + implicit none + + integer :: NSPEC_AB,NGLOB_AB + + ! displacement and acceleration + real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel + + + ! arrays with mesh parameters per slice + integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: & + kappastore,mustore,jacobian + + ! array with derivatives of Lagrange polynomials and precalculated products + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz + real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz + + ! memory variables and standard linear solids for attenuation + logical :: ATTENUATION + logical :: COMPUTE_AND_STORE_STRAIN + integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT + integer :: NSPEC_ATTENUATION_AB + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta + real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common + real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: & + R_xx,R_yy,R_xy,R_xz,R_yz + + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz + real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3 + + ! anisotropy + logical :: ANISOTROPY + integer :: NSPEC_ANISO + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store,c33store, & + c34store,c35store,c36store,c44store,c45store,c46store, & + c55store,c56store,c66store + + integer :: iphase + integer :: num_phase_ispec_elastic + integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic + + ! adjoint simulations + integer :: SIMULATION_TYPE + integer :: NSPEC_BOUN,NSPEC2D_MOHO + + ! moho kernel + real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: & + dsdx_top,dsdx_bot + logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot + integer :: ispec2D_moho_top, ispec2D_moho_bot + + ! local attenuation parameters + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, & + epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc + real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3 + real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc + real(kind=CUSTOM_REAL) Sn,Snp1 + real(kind=CUSTOM_REAL) templ + + real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl + real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl + + real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl + real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl + + real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz + + real(kind=CUSTOM_REAL) fac1,fac2,fac3 + + real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul + real(kind=CUSTOM_REAL) kappal + + integer OMP_get_thread_num + integer OMP_GET_MAX_THREADS + + ! timing + !double precision omp_get_wtime + !double precision start_time + !double precision end_time + !double precision accumulate_time_start + !double precision accumulate_time_stop + + ! local anisotropy parameters + real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + c33,c34,c35,c36,c44,c45,c46,c55,c56,c66 + + integer i_SLS,imodulo_N_SLS + integer ispec,iglob,ispec_p,num_elements + integer i,j,k + integer thread_id + integer NUM_THREADS + !integer omp_get_num_threads ! function + + ! coloring additions + ! integer, dimension(:), allocatable :: num_elem_colors_elastic + integer istart, estart, number_of_colors + integer num_colors_outer_elastic, num_colors_inner_elastic + integer icolor + + ! write(*,*) "num_elem_colors_elastic(1) = ",num_elem_colors_elastic(1) + imodulo_N_SLS = mod(N_SLS,3) + + ! NUM_THREADS = 1 + NUM_THREADS = OMP_GET_MAX_THREADS() + + + ! choses inner/outer elements + if( iphase == 1 ) then + number_of_colors = num_colors_outer_elastic + istart = 1 + else + number_of_colors = num_colors_inner_elastic + num_colors_outer_elastic + istart = num_colors_outer_elastic+1 + ! istart = num_colors_outer_elastic + endif + + ! "start" timer + ! start_time = omp_get_wtime() + + ! The mesh coloring algorithm provides disjoint sets of elements that + ! do not share degrees of freedom which is required for the assembly + ! step at the "accel(iglob) += update" step. The coloring is + ! implemented, such that the element and node indices are ordered by + ! color. This requires then only to iterate through the elements in + ! order, stopping to synchronize threads after all the elements in a + ! color are finished. + estart = 1 + do icolor = istart, number_of_colors + + !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(& + !$OMP R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3,& + !$OMP factor_loc,alphaval_loc,betaval_loc,gammaval_loc,& + !$OMP Sn,Snp1,& + !$OMP templ,& + !$OMP xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl,& + !$OMP duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,& + !$OMP duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,& + !$OMP duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,& + !$OMP sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,& + !$OMP fac1,fac2,fac3,& + !$OMP lambdal,mul,lambdalplus2mul,kappal,& + !$OMP c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,& + !$OMP c33,c34,c35,c36,c44,c45,c46,c55,c56,c66,& + !$OMP i_SLS,& + !$OMP ispec,iglob,ispec_p,& + !$OMP i,j,k,& + !$OMP thread_id) + + thread_id = OMP_get_thread_num()+1 + + ! we retrive the subset of the total elements determined by the mesh + ! coloring. This number changes as we iterate through the colors + num_elements = num_elem_colors_elastic(icolor) + !$OMP DO + do ispec_p = estart,(estart-1)+num_elements + + + ! returns element id from stored element list + ispec = phase_ispec_inner_elastic(ispec_p,iphase) + + ! adjoint simulations: moho kernel + if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + ispec2D_moho_top = ispec2D_moho_top + 1 + else if (is_moho_bot(ispec)) then + ispec2D_moho_bot = ispec2D_moho_bot + 1 + endif + endif ! adjoint + + ! stores displacment values in local array + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + iglob = ibool(i,j,k,ispec) + dummyx_loc(i,j,k,thread_id) = displ(1,iglob) + dummyy_loc(i,j,k,thread_id) = displ(2,iglob) + dummyz_loc(i,j,k,thread_id) = displ(3,iglob) + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_5points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1) + do j=1,m2 + do i=1,m1 + tempx1(i,j,1,thread_id) = & + hprime_xx(i,1)*dummyx_loc(1,j,1,thread_id) + & + hprime_xx(i,2)*dummyx_loc(2,j,1,thread_id) + & + hprime_xx(i,3)*dummyx_loc(3,j,1,thread_id) + & + hprime_xx(i,4)*dummyx_loc(4,j,1,thread_id) + & + hprime_xx(i,5)*dummyx_loc(5,j,1,thread_id) + tempy1(i,j,1,thread_id) = & + hprime_xx(i,1)*dummyy_loc(1,j,1,thread_id) + & + hprime_xx(i,2)*dummyy_loc(2,j,1,thread_id) + & + hprime_xx(i,3)*dummyy_loc(3,j,1,thread_id) + & + hprime_xx(i,4)*dummyy_loc(4,j,1,thread_id) + & + hprime_xx(i,5)*dummyy_loc(5,j,1,thread_id) + tempz1(i,j,1,thread_id) = & + hprime_xx(i,1)*dummyz_loc(1,j,1,thread_id) + & + hprime_xx(i,2)*dummyz_loc(2,j,1,thread_id) + & + hprime_xx(i,3)*dummyz_loc(3,j,1,thread_id) + & + hprime_xx(i,4)*dummyz_loc(4,j,1,thread_id) + & + hprime_xx(i,5)*dummyz_loc(5,j,1,thread_id) + enddo + enddo + + ! call mxm_m1_m1_5points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), & + ! hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k)) + do j=1,m1 + do i=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + tempx2(i,j,k,thread_id) = dummyx_loc(i,1,k,thread_id)*hprime_xxT(1,j) + & + dummyx_loc(i,2,k,thread_id)*hprime_xxT(2,j) + & + dummyx_loc(i,3,k,thread_id)*hprime_xxT(3,j) + & + dummyx_loc(i,4,k,thread_id)*hprime_xxT(4,j) + & + dummyx_loc(i,5,k,thread_id)*hprime_xxT(5,j) + tempy2(i,j,k,thread_id) = dummyy_loc(i,1,k,thread_id)*hprime_xxT(1,j) + & + dummyy_loc(i,2,k,thread_id)*hprime_xxT(2,j) + & + dummyy_loc(i,3,k,thread_id)*hprime_xxT(3,j) + & + dummyy_loc(i,4,k,thread_id)*hprime_xxT(4,j) + & + dummyy_loc(i,5,k,thread_id)*hprime_xxT(5,j) + tempz2(i,j,k,thread_id) = dummyz_loc(i,1,k,thread_id)*hprime_xxT(1,j) + & + dummyz_loc(i,2,k,thread_id)*hprime_xxT(2,j) + & + dummyz_loc(i,3,k,thread_id)*hprime_xxT(3,j) + & + dummyz_loc(i,4,k,thread_id)*hprime_xxT(4,j) + & + dummyz_loc(i,5,k,thread_id)*hprime_xxT(5,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_5points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3) + do j=1,m1 + do i=1,m2 + tempx3(i,1,j,thread_id) = & + dummyx_loc(i,1,1,thread_id)*hprime_xxT(1,j) + & + dummyx_loc(i,1,2,thread_id)*hprime_xxT(2,j) + & + dummyx_loc(i,1,3,thread_id)*hprime_xxT(3,j) + & + dummyx_loc(i,1,4,thread_id)*hprime_xxT(4,j) + & + dummyx_loc(i,1,5,thread_id)*hprime_xxT(5,j) + tempy3(i,1,j,thread_id) = & + dummyy_loc(i,1,1,thread_id)*hprime_xxT(1,j) + & + dummyy_loc(i,1,2,thread_id)*hprime_xxT(2,j) + & + dummyy_loc(i,1,3,thread_id)*hprime_xxT(3,j) + & + dummyy_loc(i,1,4,thread_id)*hprime_xxT(4,j) + & + dummyy_loc(i,1,5,thread_id)*hprime_xxT(5,j) + tempz3(i,1,j,thread_id) = & + dummyz_loc(i,1,1,thread_id)*hprime_xxT(1,j) + & + dummyz_loc(i,1,2,thread_id)*hprime_xxT(2,j) + & + dummyz_loc(i,1,3,thread_id)*hprime_xxT(3,j) + & + dummyz_loc(i,1,4,thread_id)*hprime_xxT(4,j) + & + dummyz_loc(i,1,5,thread_id)*hprime_xxT(5,j) + + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + ! get derivatives of ux, uy and uz with respect to x, y and z + xixl = xix(i,j,k,ispec) + xiyl = xiy(i,j,k,ispec) + xizl = xiz(i,j,k,ispec) + etaxl = etax(i,j,k,ispec) + etayl = etay(i,j,k,ispec) + etazl = etaz(i,j,k,ispec) + gammaxl = gammax(i,j,k,ispec) + gammayl = gammay(i,j,k,ispec) + gammazl = gammaz(i,j,k,ispec) + jacobianl = jacobian(i,j,k,ispec) + + duxdxl = xixl*tempx1(i,j,k,thread_id) + etaxl*tempx2(i,j,k,thread_id) + gammaxl*tempx3(i,j,k,thread_id) + duxdyl = xiyl*tempx1(i,j,k,thread_id) + etayl*tempx2(i,j,k,thread_id) + gammayl*tempx3(i,j,k,thread_id) + duxdzl = xizl*tempx1(i,j,k,thread_id) + etazl*tempx2(i,j,k,thread_id) + gammazl*tempx3(i,j,k,thread_id) + + duydxl = xixl*tempy1(i,j,k,thread_id) + etaxl*tempy2(i,j,k,thread_id) + gammaxl*tempy3(i,j,k,thread_id) + duydyl = xiyl*tempy1(i,j,k,thread_id) + etayl*tempy2(i,j,k,thread_id) + gammayl*tempy3(i,j,k,thread_id) + duydzl = xizl*tempy1(i,j,k,thread_id) + etazl*tempy2(i,j,k,thread_id) + gammazl*tempy3(i,j,k,thread_id) + + duzdxl = xixl*tempz1(i,j,k,thread_id) + etaxl*tempz2(i,j,k,thread_id) + gammaxl*tempz3(i,j,k,thread_id) + duzdyl = xiyl*tempz1(i,j,k,thread_id) + etayl*tempz2(i,j,k,thread_id) + gammayl*tempz3(i,j,k,thread_id) + duzdzl = xizl*tempz1(i,j,k,thread_id) + etazl*tempz2(i,j,k,thread_id) + gammazl*tempz3(i,j,k,thread_id) + + ! save strain on the Moho boundary + if (SAVE_MOHO_MESH ) then + if (is_moho_top(ispec)) then + dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl + dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl + dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl + dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl + dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl + dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl + dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl + dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl + dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl + else if (is_moho_bot(ispec)) then + dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl + dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl + dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl + dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl + dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl + dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl + dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl + dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl + dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl + endif + endif + + ! precompute some sums to save CPU time + duxdxl_plus_duydyl = duxdxl + duydyl + duxdxl_plus_duzdzl = duxdxl + duzdzl + duydyl_plus_duzdzl = duydyl + duzdzl + duxdyl_plus_duydxl = duxdyl + duydxl + duzdxl_plus_duxdzl = duzdxl + duxdzl + duzdyl_plus_duydzl = duzdyl + duydzl + + ! computes deviatoric strain attenuation and/or for kernel calculations + if (COMPUTE_AND_STORE_STRAIN) then + templ = ONE_THIRD * (duxdxl + duydyl + duzdzl) + if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ + epsilondev_xx_loc(i,j,k) = duxdxl - templ + epsilondev_yy_loc(i,j,k) = duydyl - templ + epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl + epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl + epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl + endif + + kappal = kappastore(i,j,k,ispec) + mul = mustore(i,j,k,ispec) + + ! attenuation + if(ATTENUATION) then + ! use unrelaxed parameters if attenuation + mul = mul * one_minus_sum_beta(i,j,k,ispec) + endif + + ! full anisotropic case, stress calculations + if(ANISOTROPY) then + c11 = c11store(i,j,k,ispec) + c12 = c12store(i,j,k,ispec) + c13 = c13store(i,j,k,ispec) + c14 = c14store(i,j,k,ispec) + c15 = c15store(i,j,k,ispec) + c16 = c16store(i,j,k,ispec) + c22 = c22store(i,j,k,ispec) + c23 = c23store(i,j,k,ispec) + c24 = c24store(i,j,k,ispec) + c25 = c25store(i,j,k,ispec) + c26 = c26store(i,j,k,ispec) + c33 = c33store(i,j,k,ispec) + c34 = c34store(i,j,k,ispec) + c35 = c35store(i,j,k,ispec) + c36 = c36store(i,j,k,ispec) + c44 = c44store(i,j,k,ispec) + c45 = c45store(i,j,k,ispec) + c46 = c46store(i,j,k,ispec) + c55 = c55store(i,j,k,ispec) + c56 = c56store(i,j,k,ispec) + c66 = c66store(i,j,k,ispec) + + sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + & + c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl + sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + & + c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl + sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + & + c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl + sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + & + c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl + sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + & + c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl + sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + & + c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl + + else + + ! isotropic case + lambdalplus2mul = kappal + FOUR_THIRDS * mul + lambdal = lambdalplus2mul - 2.*mul + + ! compute stress sigma + sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl + sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl + sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl + + sigma_xy = mul*duxdyl_plus_duydxl + sigma_xz = mul*duzdxl_plus_duxdzl + sigma_yz = mul*duzdyl_plus_duydzl + + endif ! ANISOTROPY + + ! subtract memory variables if attenuation + if(ATTENUATION) then + ! way 1 + ! do i_sls = 1,N_SLS + ! R_xx_val = R_xx(i,j,k,ispec,i_sls) + ! R_yy_val = R_yy(i,j,k,ispec,i_sls) + ! sigma_xx = sigma_xx - R_xx_val + ! sigma_yy = sigma_yy - R_yy_val + ! sigma_zz = sigma_zz + R_xx_val + R_yy_val + ! sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + ! sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + ! sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + ! enddo + + ! way 2 + ! note: this should help compilers to pipeline the code and make better use of the cache; + ! depending on compilers, it can further decrease the computation time by ~ 30%. + ! by default, N_SLS = 3, therefore we take steps of 3 + if(imodulo_N_SLS >= 1) then + do i_sls = 1,imodulo_N_SLS + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + enddo + endif + + if(N_SLS >= imodulo_N_SLS+1) then + do i_sls = imodulo_N_SLS+1,N_SLS,3 + R_xx_val1 = R_xx(i,j,k,ispec,i_sls) + R_yy_val1 = R_yy(i,j,k,ispec,i_sls) + sigma_xx = sigma_xx - R_xx_val1 + sigma_yy = sigma_yy - R_yy_val1 + sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls) + + R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1) + R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1) + sigma_xx = sigma_xx - R_xx_val2 + sigma_yy = sigma_yy - R_yy_val2 + sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1) + + R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2) + R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2) + sigma_xx = sigma_xx - R_xx_val3 + sigma_yy = sigma_yy - R_yy_val3 + sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3 + sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2) + sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2) + sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2) + enddo + endif + + + endif + + ! form dot product with test vector, symmetric form + tempx1(i,j,k,thread_id) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl) + tempy1(i,j,k,thread_id) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl) + tempz1(i,j,k,thread_id) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) + + tempx2(i,j,k,thread_id) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl) + tempy2(i,j,k,thread_id) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl) + tempz2(i,j,k,thread_id) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) + + tempx3(i,j,k,thread_id) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl) + tempy3(i,j,k,thread_id) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl) + tempz3(i,j,k,thread_id) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) + + enddo + enddo + enddo + + ! subroutines adapted from Deville, Fischer and Mund, High-order methods + ! for incompressible fluid flow, Cambridge University Press (2002), + ! pages 386 and 389 and Figure 8.3.1 + ! call mxm_m1_m2_5points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1) + do j=1,m2 + do i=1,m1 + newtempx1(i,j,1,thread_id) = & + hprimewgll_xxT(i,1)*tempx1(1,j,1,thread_id) + & + hprimewgll_xxT(i,2)*tempx1(2,j,1,thread_id) + & + hprimewgll_xxT(i,3)*tempx1(3,j,1,thread_id) + & + hprimewgll_xxT(i,4)*tempx1(4,j,1,thread_id) + & + hprimewgll_xxT(i,5)*tempx1(5,j,1,thread_id) + newtempy1(i,j,1,thread_id) = & + hprimewgll_xxT(i,1)*tempy1(1,j,1,thread_id) + & + hprimewgll_xxT(i,2)*tempy1(2,j,1,thread_id) + & + hprimewgll_xxT(i,3)*tempy1(3,j,1,thread_id) + & + hprimewgll_xxT(i,4)*tempy1(4,j,1,thread_id) + & + hprimewgll_xxT(i,5)*tempy1(5,j,1,thread_id) + newtempz1(i,j,1,thread_id) = & + hprimewgll_xxT(i,1)*tempz1(1,j,1,thread_id) + & + hprimewgll_xxT(i,2)*tempz1(2,j,1,thread_id) + & + hprimewgll_xxT(i,3)*tempz1(3,j,1,thread_id) + & + hprimewgll_xxT(i,4)*tempz1(4,j,1,thread_id) + & + hprimewgll_xxT(i,5)*tempz1(5,j,1,thread_id) + enddo + enddo + + ! call mxm_m1_m1_5points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), & + ! hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k)) + do i=1,m1 + do j=1,m1 + ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code + do k = 1,NGLLX + newtempx2(i,j,k,thread_id) = tempx2(i,1,k,thread_id)*hprimewgll_xx(1,j) + & + tempx2(i,2,k,thread_id)*hprimewgll_xx(2,j) + & + tempx2(i,3,k,thread_id)*hprimewgll_xx(3,j) + & + tempx2(i,4,k,thread_id)*hprimewgll_xx(4,j) + & + tempx2(i,5,k,thread_id)*hprimewgll_xx(5,j) + newtempy2(i,j,k,thread_id) = tempy2(i,1,k,thread_id)*hprimewgll_xx(1,j) + & + tempy2(i,2,k,thread_id)*hprimewgll_xx(2,j) + & + tempy2(i,3,k,thread_id)*hprimewgll_xx(3,j) + & + tempy2(i,4,k,thread_id)*hprimewgll_xx(4,j) + & + tempy2(i,5,k,thread_id)*hprimewgll_xx(5,j) + newtempz2(i,j,k,thread_id) = tempz2(i,1,k,thread_id)*hprimewgll_xx(1,j) + & + tempz2(i,2,k,thread_id)*hprimewgll_xx(2,j) + & + tempz2(i,3,k,thread_id)*hprimewgll_xx(3,j) + & + tempz2(i,4,k,thread_id)*hprimewgll_xx(4,j) + & + tempz2(i,5,k,thread_id)*hprimewgll_xx(5,j) + enddo + enddo + enddo + + ! call mxm_m2_m1_5points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3) + do j=1,m1 + do i=1,m2 + newtempx3(i,1,j,thread_id) = & + tempx3(i,1,1,thread_id)*hprimewgll_xx(1,j) + & + tempx3(i,1,2,thread_id)*hprimewgll_xx(2,j) + & + tempx3(i,1,3,thread_id)*hprimewgll_xx(3,j) + & + tempx3(i,1,4,thread_id)*hprimewgll_xx(4,j) + & + tempx3(i,1,5,thread_id)*hprimewgll_xx(5,j) + newtempy3(i,1,j,thread_id) = & + tempy3(i,1,1,thread_id)*hprimewgll_xx(1,j) + & + tempy3(i,1,2,thread_id)*hprimewgll_xx(2,j) + & + tempy3(i,1,3,thread_id)*hprimewgll_xx(3,j) + & + tempy3(i,1,4,thread_id)*hprimewgll_xx(4,j) + & + tempy3(i,1,5,thread_id)*hprimewgll_xx(5,j) + newtempz3(i,1,j,thread_id) = & + tempz3(i,1,1,thread_id)*hprimewgll_xx(1,j) + & + tempz3(i,1,2,thread_id)*hprimewgll_xx(2,j) + & + tempz3(i,1,3,thread_id)*hprimewgll_xx(3,j) + & + tempz3(i,1,4,thread_id)*hprimewgll_xx(4,j) + & + tempz3(i,1,5,thread_id)*hprimewgll_xx(5,j) + enddo + enddo + + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + + fac1 = wgllwgll_yz(j,k) + fac2 = wgllwgll_xz(i,k) + fac3 = wgllwgll_xy(i,j) + + ! sum contributions from each element to the global mesh using indirect addressing + iglob = ibool(i,j,k,ispec) + + ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id)& + ! - fac1*newtempx1(i,j,k,thread_id) - fac2*newtempx2(i,j,k,thread_id)& + ! - fac3*newtempx3(i,j,k,thread_id) + ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id)& + ! - fac1*newtempy1(i,j,k,thread_id) - fac2*newtempy2(i,j,k,thread_id)& + ! - fac3*newtempy3(i,j,k,thread_id) + ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id)& + ! - fac1*newtempz1(i,j,k,thread_id) - fac2*newtempz2(i,j,k,thread_id)& + ! - fac3*newtempz3(i,j,k,thread_id) + + ! Assembly of shared degrees of freedom fixed through mesh coloring + !! !$OMP ATOMIC + accel(1,iglob) = accel(1,iglob) & + - (fac1*newtempx1(i,j,k,thread_id) & + + fac2*newtempx2(i,j,k,thread_id) & + + fac3*newtempx3(i,j,k,thread_id)) + !! !$OMP ATOMIC + accel(2,iglob) = accel(2,iglob) & + - (fac1*newtempy1(i,j,k,thread_id) & + + fac2*newtempy2(i,j,k,thread_id) & + + fac3*newtempy3(i,j,k,thread_id)) + !! !$OMP ATOMIC + accel(3,iglob) = accel(3,iglob) & + - (fac1*newtempz1(i,j,k,thread_id) & + + fac2*newtempz2(i,j,k,thread_id) & + + fac3*newtempz3(i,j,k,thread_id)) + + ! accel(1,iglob) = accel(1,iglob) - & + ! (fac1*newtempx1(i,j,k,thread_id) + fac2*newtempx2(i,j,k,thread_id) + fac3*newtempx3(i,j,k,thread_id)) + ! accel(2,iglob) = accel(2,iglob) - & + ! (fac1*newtempy1(i,j,k,thread_id) + fac2*newtempy2(i,j,k,thread_id) + fac3*newtempy3(i,j,k,thread_id)) + ! accel(3,iglob) = accel(3,iglob) - & + ! (fac1*newtempz1(i,j,k,thread_id) + fac2*newtempz2(i,j,k,thread_id) + fac3*newtempz3(i,j,k,thread_id)) + + ! accel_omp(1,iglob,thread_id) = accel_omp(1,iglob,thread_id) - fac1*newtempx1(i,j,k,thread_id) - & + ! fac2*newtempx2(i,j,k,thread_id) - fac3*newtempx3(i,j,k,thread_id) + ! accel_omp(2,iglob,thread_id) = accel_omp(2,iglob,thread_id) - fac1*newtempy1(i,j,k,thread_id) - & + ! fac2*newtempy2(i,j,k,thread_id) - fac3*newtempy3(i,j,k,thread_id) + ! accel_omp(3,iglob,thread_id) = accel_omp(3,iglob,thread_id) - fac1*newtempz1(i,j,k,thread_id) - & + ! fac2*newtempz2(i,j,k,thread_id) - fac3*newtempz3(i,j,k,thread_id) + + ! update memory variables based upon the Runge-Kutta scheme + if(ATTENUATION) then + + ! use Runge-Kutta scheme to march in time + do i_sls = 1,N_SLS + + factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec) + + alphaval_loc = alphaval(i_sls) + betaval_loc = betaval(i_sls) + gammaval_loc = gammaval(i_sls) + + ! term in xx + Sn = factor_loc * epsilondev_xx(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xx_loc(i,j,k) + R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yy + Sn = factor_loc * epsilondev_yy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yy_loc(i,j,k) + R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in zz not computed since zero trace + ! term in xy + Sn = factor_loc * epsilondev_xy(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xy_loc(i,j,k) + R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in xz + Sn = factor_loc * epsilondev_xz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_xz_loc(i,j,k) + R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + ! term in yz + Sn = factor_loc * epsilondev_yz(i,j,k,ispec) + Snp1 = factor_loc * epsilondev_yz_loc(i,j,k) + R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + & + betaval_loc * Sn + gammaval_loc * Snp1 + + enddo ! end of loop on memory variables + + endif ! end attenuation + + enddo + enddo + enddo + + ! save deviatoric strain for Runge-Kutta scheme + if ( COMPUTE_AND_STORE_STRAIN ) then + epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:) + epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:) + epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:) + epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:) + epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:) + endif + + enddo ! spectral element loop + !$OMP END DO + !$OMP END PARALLEL + + ! The elements are in order of color. First we do color 1 elements, + ! then color 2, etc. The ispec has to moved to start at the next + ! color. + estart = estart + num_elements + + enddo ! loop over colors + + + ! "stop" timer + ! end_time = omp_get_wtime() + + ! write(*,*) "Total Elapsed time: ", (end_time-start_time) , "seconds. (Threads=",NUM_THREADS,")" + ! write(*,*) "Accumulate Elapsed time: ", (accumulate_time_stop-accumulate_time_start) , "seconds" + + + ! These are now allocated at the beginning and never deallocated + ! because the program just finishes at the end. + + ! deallocate(dummyx_loc) + ! deallocate(dummyy_loc) + ! deallocate(dummyz_loc) + ! deallocate(newtempx1) + ! deallocate(newtempx2) + ! deallocate(newtempx3) + ! deallocate(newtempy1) + ! deallocate(newtempy2) + ! deallocate(newtempy3) + ! deallocate(newtempz1) + ! deallocate(newtempz2) + ! deallocate(newtempz3) + ! deallocate(tempx1) + ! deallocate(tempx2) + ! deallocate(tempx3) + ! deallocate(tempy1) + ! deallocate(tempy2) + ! deallocate(tempy3) + ! deallocate(tempz1) + ! deallocate(tempz2) + ! deallocate(tempz3) + + ! accel(:,:) = accel_omp(:,:,1) + + end subroutine compute_forces_elastic_Dev_openmp + + diff --git a/src/specfem3D/compute_gradient.f90 b/src/specfem3D/compute_gradient.f90 index 39dd1b961..3fd2dffe3 100644 --- a/src/specfem3D/compute_gradient.f90 +++ b/src/specfem3D/compute_gradient.f90 @@ -29,14 +29,18 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & scalar_field, vector_field_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! calculates gradient of given acoustic scalar (potential) field on all GLL points in one, single element ! note: ! displacement s = (rho)^{-1} \del \chi ! velocity v = (rho)^{-1} \del \ddot \chi ! +! in case of gravity: +! displacement s = \del \chi +! velocity v = \del \ddot \chi ! returns: (1/rho) times gradient vector field (vector_field_element) in specified element +! or in gravity case, just gradient vector field implicit none include 'constants.h' @@ -59,6 +63,8 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz + logical :: GRAVITY + ! local parameters real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl real(kind=CUSTOM_REAL) temp1l,temp2l,temp3l @@ -100,7 +106,12 @@ subroutine compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & gammayl = gammay(i,j,k,ispec) gammazl = gammaz(i,j,k,ispec) - rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec) + ! daniel: TODO - check gravity case here + if( GRAVITY ) then + rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec) + else + rho_invl = 1.0_CUSTOM_REAL / rhostore(i,j,k,ispec) + endif ! derivatives of acoustic scalar potential field on GLL points vector_field_element(1,i,j,k) = (temp1l*xixl + temp2l*etaxl + temp3l*gammaxl) * rho_invl diff --git a/src/specfem3D/compute_kernels.f90 b/src/specfem3D/compute_kernels.f90 index b7fee1029..c1c7606d3 100644 --- a/src/specfem3D/compute_kernels.f90 +++ b/src/specfem3D/compute_kernels.f90 @@ -32,68 +32,160 @@ subroutine compute_kernels() use specfem_par use specfem_par_elastic use specfem_par_acoustic + implicit none + + ! elastic simulations + if( ELASTIC_SIMULATION ) then + call compute_kernels_el() + endif + + ! elastic simulations + if( ACOUSTIC_SIMULATION ) then + call compute_kernels_ac() + endif + + ! computes an approximative hessian for preconditioning kernels + if ( APPROXIMATE_HESS_KL ) then + call compute_kernels_hessian() + endif + + end subroutine compute_kernels + + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine compute_kernels_el() + +! kernel calculations +! see e.g. Tromp et al. (2005) + + use specfem_par + use specfem_par_elastic implicit none ! local parameters - real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_displ_elm,accel_elm - real(kind=CUSTOM_REAL) :: kappal,rhol integer :: i,j,k,ispec,iglob real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc,b_epsilondev_loc - ! updates kernels - do ispec = 1, NSPEC_AB + if( .not. GPU_MODE ) then + ! updates kernels on CPU + do ispec = 1, NSPEC_AB + + ! elastic domains + if( ispec_is_elastic(ispec) ) then + + do k = 1, NGLLZ + do j = 1, NGLLY + do i = 1, NGLLX + iglob = ibool(i,j,k,ispec) + + ! isotropic kernels + ! note: takes displacement from backward/reconstructed (forward) field b_displ + ! and acceleration from adjoint field accel (containing adjoint sources) + ! + ! note: : time integral summation uses deltat + ! + ! compare with Tromp et al. (2005), eq. (14), which takes adjoint displacement + ! and forward acceleration, that is the symmetric form of what is calculated here + ! however, this kernel expression is symmetric with regards + ! to interchange adjoint - forward field + rho_kl(i,j,k,ispec) = rho_kl(i,j,k,ispec) & + + deltat * dot_product(accel(:,iglob), b_displ(:,iglob)) + + ! kernel for shear modulus, see e.g. Tromp et al. (2005), equation (17) + ! note: multiplication with 2*mu(x) will be done after the time loop + epsilondev_loc(1) = epsilondev_xx(i,j,k,ispec) + epsilondev_loc(2) = epsilondev_yy(i,j,k,ispec) + epsilondev_loc(3) = epsilondev_xy(i,j,k,ispec) + epsilondev_loc(4) = epsilondev_xz(i,j,k,ispec) + epsilondev_loc(5) = epsilondev_yz(i,j,k,ispec) + + b_epsilondev_loc(1) = b_epsilondev_xx(i,j,k,ispec) + b_epsilondev_loc(2) = b_epsilondev_yy(i,j,k,ispec) + b_epsilondev_loc(3) = b_epsilondev_xy(i,j,k,ispec) + b_epsilondev_loc(4) = b_epsilondev_xz(i,j,k,ispec) + b_epsilondev_loc(5) = b_epsilondev_yz(i,j,k,ispec) + + mu_kl(i,j,k,ispec) = mu_kl(i,j,k,ispec) & + + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) & + + (epsilondev_loc(1)+epsilondev_loc(2)) * (b_epsilondev_loc(1)+b_epsilondev_loc(2)) & + + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + & + epsilondev_loc(5)*b_epsilondev_loc(5)) ) + + ! kernel for bulk modulus, see e.g. Tromp et al. (2005), equation (18) + ! note: multiplication with kappa(x) will be done after the time loop + kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) & + + deltat * (9 * epsilon_trace_over_3(i,j,k,ispec) & + * b_epsilon_trace_over_3(i,j,k,ispec)) + + enddo + enddo + enddo + endif !ispec_is_elastic + + enddo + + else + ! updates kernels on GPU + call compute_kernels_elastic_cuda(Mesh_pointer,deltat) + endif - ! elastic domains - if( ispec_is_elastic(ispec) ) then - do k = 1, NGLLZ - do j = 1, NGLLY - do i = 1, NGLLX - iglob = ibool(i,j,k,ispec) + ! moho kernel + if( SAVE_MOHO_MESH ) then + if( GPU_MODE ) then + call transfer_accel_from_device(NDIM*NGLOB_AB,accel,Mesh_pointer) + call transfer_b_displ_from_device(NDIM*NGLOB_AB,b_displ,Mesh_pointer) + endif + ! updates on CPU + call compute_boundary_kernel() + endif - ! isotropic kernels - ! note: takes displacement from backward/reconstructed (forward) field b_displ - ! and acceleration from adjoint field accel (containing adjoint sources) - ! - ! note: : time integral summation uses deltat - ! - ! compare with Tromp et al. (2005), eq. (14), which takes adjoint displacement - ! and forward acceleration, that is the symmetric form of what is calculated here - ! however, this kernel expression is symmetric with regards - ! to interchange adjoint - forward field - rho_kl(i,j,k,ispec) = rho_kl(i,j,k,ispec) & - + deltat * dot_product(accel(:,iglob), b_displ(:,iglob)) - - ! kernel for shear modulus, see e.g. Tromp et al. (2005), equation (17) - ! note: multiplication with 2*mu(x) will be done after the time loop - epsilondev_loc(1) = epsilondev_xx(i,j,k,ispec) - epsilondev_loc(2) = epsilondev_yy(i,j,k,ispec) - epsilondev_loc(3) = epsilondev_xy(i,j,k,ispec) - epsilondev_loc(4) = epsilondev_xz(i,j,k,ispec) - epsilondev_loc(5) = epsilondev_yz(i,j,k,ispec) - - b_epsilondev_loc(1) = b_epsilondev_xx(i,j,k,ispec) - b_epsilondev_loc(2) = b_epsilondev_yy(i,j,k,ispec) - b_epsilondev_loc(3) = b_epsilondev_xy(i,j,k,ispec) - b_epsilondev_loc(4) = b_epsilondev_xz(i,j,k,ispec) - b_epsilondev_loc(5) = b_epsilondev_yz(i,j,k,ispec) - - mu_kl(i,j,k,ispec) = mu_kl(i,j,k,ispec) & - + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) & - + (epsilondev_loc(1)+epsilondev_loc(2)) * (b_epsilondev_loc(1)+b_epsilondev_loc(2)) & - + 2 * (epsilondev_loc(3)*b_epsilondev_loc(3) + epsilondev_loc(4)*b_epsilondev_loc(4) + & - epsilondev_loc(5)*b_epsilondev_loc(5)) ) - - ! kernel for bulk modulus, see e.g. Tromp et al. (2005), equation (18) - ! note: multiplication with kappa(x) will be done after the time loop - kappa_kl(i,j,k,ispec) = kappa_kl(i,j,k,ispec) & - + deltat * (9 * epsilon_trace_over_3(i,j,k,ispec) & - * b_epsilon_trace_over_3(i,j,k,ispec)) + ! for noise simulations --- source strength kernel + if (NOISE_TOMOGRAPHY == 3) then + call compute_kernels_strength_noise(NGLLSQUARE*num_free_surface_faces,ibool, & + sigma_kl,displ,deltat,it, & + normal_x_noise,normal_y_noise,normal_z_noise, & + noise_surface_movie, & + NSPEC_AB,NGLOB_AB, & + num_free_surface_faces,free_surface_ispec,free_surface_ijk,& + GPU_MODE,Mesh_pointer) + endif - enddo - enddo - enddo - endif !ispec_is_elastic + end subroutine compute_kernels_el + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine compute_kernels_ac() + +! kernel calculations +! see e.g. Tromp et al. (2005) + + use specfem_par + use specfem_par_acoustic + + implicit none + ! local parameters + real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_displ_elm,accel_elm + real(kind=CUSTOM_REAL) :: kappal,rhol + integer :: i,j,k,ispec,iglob + + ! updates kernels on GPU + if(GPU_MODE) then + + ! computes contribution to density and bulk modulus kernel + call compute_kernels_acoustic_cuda(Mesh_pointer,deltat) + + ! kernels are done + return + endif + + ! updates kernels + do ispec = 1, NSPEC_AB ! acoustic domains if( ispec_is_acoustic(ispec) ) then @@ -103,41 +195,20 @@ subroutine compute_kernels() b_potential_acoustic, b_displ_elm,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! adjoint fields: acceleration vector - !YANGL + ibool,rhostore,GRAVITY) do k = 1, NGLLZ do j = 1, NGLLY do i = 1, NGLLX iglob = ibool(i,j,k,ispec) - !YANGL + * potential_acoustic(iglob) & + * b_potential_dot_dot_acoustic(iglob) enddo enddo @@ -160,29 +229,11 @@ subroutine compute_kernels() enddo - ! moho kernel - if( ELASTIC_SIMULATION .and. SAVE_MOHO_MESH ) then - call compute_boundary_kernel() - endif - - ! for noise simulations --- source strength kernel - if (NOISE_TOMOGRAPHY == 3) & - call compute_kernels_strength_noise(NGLLSQUARE*num_free_surface_faces,ibool, & - sigma_kl,displ,deltat,it, & - normal_x_noise,normal_y_noise,normal_z_noise, & - noise_surface_movie, & - NSPEC_AB,NGLOB_AB, & - num_free_surface_faces,free_surface_ispec,free_surface_ijk) + end subroutine compute_kernels_ac - ! computes an approximative hessian for preconditioning kernels - if ( APPROXIMATE_HESS_KL ) then - call compute_kernels_hessian() - endif - - end subroutine compute_kernels - - -!----------------------------------------------------------------------------- +! +!------------------------------------------------------------------------------------------------- +! subroutine compute_kernels_hessian() @@ -195,6 +246,17 @@ subroutine compute_kernels_hessian() real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_accel_elm,accel_elm integer :: i,j,k,ispec,iglob + ! updates kernels on GPU + if(GPU_MODE) then + + ! computes contribution to density and bulk modulus kernel + call compute_kernels_hess_cuda(Mesh_pointer,deltat, & + ELASTIC_SIMULATION,ACOUSTIC_SIMULATION) + + ! done on GPU + return + endif + ! loops over all elements do ispec = 1, NSPEC_AB @@ -206,14 +268,14 @@ subroutine compute_kernels_hessian() potential_dot_dot_acoustic, accel_elm,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! adjoint fields: acceleration vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & b_potential_dot_dot_acoustic, b_accel_elm,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) do k = 1, NGLLZ do j = 1, NGLLY diff --git a/src/specfem3D/compute_stacey_acoustic.f90 b/src/specfem3D/compute_stacey_acoustic.f90 index 14e01a728..422205a92 100644 --- a/src/specfem3D/compute_stacey_acoustic.f90 +++ b/src/specfem3D/compute_stacey_acoustic.f90 @@ -33,7 +33,8 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, & num_abs_boundary_faces,rhostore,kappastore,ispec_is_acoustic,& SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it,NGLOB_ADJOINT, & b_potential_dot_dot_acoustic,b_reclen_potential, & - b_absorb_potential,b_num_abs_boundary_faces) + b_absorb_potential,b_num_abs_boundary_faces, & + GPU_MODE,Mesh_pointer) implicit none @@ -67,16 +68,23 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, & real(kind=CUSTOM_REAL),dimension(NGLLSQUARE,b_num_abs_boundary_faces):: b_absorb_potential logical:: SAVE_FORWARD + ! GPU_MODE variables + integer(kind=8) :: Mesh_pointer + logical :: GPU_MODE + ! local parameters real(kind=CUSTOM_REAL) :: rhol,cpl,jacobianw,absorbl integer :: ispec,iglob,i,j,k,iface,igll !integer:: reclen1,reclen2 + ! checks if anything to do + if( num_abs_boundary_faces == 0 ) return + ! adjoint simulations: if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then ! reads in absorbing boundary array when first phase is running if( phase_is_inner .eqv. .false. ) then - ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newark scheme + ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme ! uses fortran routine !read(IOABS_AC,rec=NSTEP-it+1) reclen1,b_absorb_potential,reclen2 !if (reclen1 /= b_reclen_potential .or. reclen1 /= reclen2) & @@ -87,52 +95,61 @@ subroutine compute_stacey_acoustic(NSPEC_AB,NGLOB_AB, & endif !adjoint ! absorbs absorbing-boundary surface using Sommerfeld condition (vanishing field in the outer-space) - do iface=1,num_abs_boundary_faces + if( .NOT. GPU_MODE ) then + ! on CPU + do iface=1,num_abs_boundary_faces - ispec = abs_boundary_ispec(iface) + ispec = abs_boundary_ispec(iface) - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - if( ispec_is_acoustic(ispec) ) then + if( ispec_is_acoustic(ispec) ) then - ! reference gll points on boundary face - do igll = 1,NGLLSQUARE + ! reference gll points on boundary face + do igll = 1,NGLLSQUARE - ! gets local indices for GLL point - i = abs_boundary_ijk(1,igll,iface) - j = abs_boundary_ijk(2,igll,iface) - k = abs_boundary_ijk(3,igll,iface) + ! gets local indices for GLL point + i = abs_boundary_ijk(1,igll,iface) + j = abs_boundary_ijk(2,igll,iface) + k = abs_boundary_ijk(3,igll,iface) - ! gets global index - iglob=ibool(i,j,k,ispec) + ! gets global index + iglob=ibool(i,j,k,ispec) - ! determines bulk sound speed - rhol = rhostore(i,j,k,ispec) - cpl = sqrt( kappastore(i,j,k,ispec) / rhol ) + ! determines bulk sound speed + rhol = rhostore(i,j,k,ispec) + cpl = sqrt( kappastore(i,j,k,ispec) / rhol ) - ! gets associated, weighted jacobian - jacobianw = abs_boundary_jacobian2Dw(igll,iface) + ! gets associated, weighted jacobian + jacobianw = abs_boundary_jacobian2Dw(igll,iface) - ! Sommerfeld condition - absorbl = potential_dot_acoustic(iglob) * jacobianw / cpl / rhol - potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & - - absorbl + ! Sommerfeld condition + absorbl = potential_dot_acoustic(iglob) * jacobianw / cpl / rhol + potential_dot_dot_acoustic(iglob) = potential_dot_dot_acoustic(iglob) & + - absorbl - ! adjoint simulations - if (SIMULATION_TYPE == 3) then - b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & - - b_absorb_potential(igll,iface) + ! adjoint simulations + if (SIMULATION_TYPE == 3) then + b_potential_dot_dot_acoustic(iglob) = b_potential_dot_dot_acoustic(iglob) & + - b_absorb_potential(igll,iface) - else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then - b_absorb_potential(igll,iface) = absorbl + else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then + b_absorb_potential(igll,iface) = absorbl - endif !adjoint + endif !adjoint - enddo + enddo - endif ! ispec_is_acoustic - endif ! ispec_is_inner - enddo ! num_abs_boundary_faces + endif ! ispec_is_acoustic + endif ! ispec_is_inner + enddo ! num_abs_boundary_faces + else + ! GPU_MODE == .true. + if( num_abs_boundary_faces > 0 ) & + call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, & + SIMULATION_TYPE,SAVE_FORWARD,b_absorb_potential) + + endif ! adjoint simulations: stores absorbed wavefield part if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then diff --git a/src/specfem3D/compute_stacey_elastic.f90 b/src/specfem3D/compute_stacey_elastic.f90 index 18633e893..c4997d532 100644 --- a/src/specfem3D/compute_stacey_elastic.f90 +++ b/src/specfem3D/compute_stacey_elastic.f90 @@ -36,7 +36,8 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, & veloc,rho_vp,rho_vs, & ispec_is_elastic,SIMULATION_TYPE,SAVE_FORWARD, & NSTEP,it,NGLOB_ADJOINT,b_accel, & - b_num_abs_boundary_faces,b_reclen_field,b_absorb_field) + b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, & + GPU_MODE,Mesh_pointer) implicit none @@ -74,11 +75,17 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, & real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB_ADJOINT):: b_accel logical:: SAVE_FORWARD + ! GPU_MODE variables + integer(kind=8) :: Mesh_pointer + logical :: GPU_MODE + ! local parameters real(kind=CUSTOM_REAL) vx,vy,vz,nx,ny,nz,tx,ty,tz,vn,jacobianw integer :: ispec,iglob,i,j,k,iface,igll !integer:: reclen1,reclen2 + ! checks if anything to do + if( num_abs_boundary_faces == 0 ) return ! adjoint simulations: if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then @@ -95,63 +102,72 @@ subroutine compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, & endif !adjoint -! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist) - do iface=1,num_abs_boundary_faces + if(.NOT. GPU_MODE) then + + ! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist) + do iface=1,num_abs_boundary_faces - ispec = abs_boundary_ispec(iface) + ispec = abs_boundary_ispec(iface) - if (ispec_is_inner(ispec) .eqv. phase_is_inner) then + if (ispec_is_inner(ispec) .eqv. phase_is_inner) then - if( ispec_is_elastic(ispec) ) then + if( ispec_is_elastic(ispec) ) then - ! reference gll points on boundary face - do igll = 1,NGLLSQUARE + ! reference gll points on boundary face + do igll = 1,NGLLSQUARE - ! gets local indices for GLL point - i = abs_boundary_ijk(1,igll,iface) - j = abs_boundary_ijk(2,igll,iface) - k = abs_boundary_ijk(3,igll,iface) + ! gets local indices for GLL point + i = abs_boundary_ijk(1,igll,iface) + j = abs_boundary_ijk(2,igll,iface) + k = abs_boundary_ijk(3,igll,iface) - ! gets velocity - iglob=ibool(i,j,k,ispec) - vx=veloc(1,iglob) - vy=veloc(2,iglob) - vz=veloc(3,iglob) + ! gets velocity + iglob=ibool(i,j,k,ispec) + vx=veloc(1,iglob) + vy=veloc(2,iglob) + vz=veloc(3,iglob) - ! gets associated normal - nx = abs_boundary_normal(1,igll,iface) - ny = abs_boundary_normal(2,igll,iface) - nz = abs_boundary_normal(3,igll,iface) + ! gets associated normal + nx = abs_boundary_normal(1,igll,iface) + ny = abs_boundary_normal(2,igll,iface) + nz = abs_boundary_normal(3,igll,iface) - ! velocity component in normal direction (normal points out of element) - vn = vx*nx + vy*ny + vz*nz + ! velocity component in normal direction (normal points out of element) + vn = vx*nx + vy*ny + vz*nz - ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it - tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx) - ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny) - tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz) + ! stacey term: velocity vector component * vp * rho in normal direction + vs * rho component tangential to it + tx = rho_vp(i,j,k,ispec)*vn*nx + rho_vs(i,j,k,ispec)*(vx-vn*nx) + ty = rho_vp(i,j,k,ispec)*vn*ny + rho_vs(i,j,k,ispec)*(vy-vn*ny) + tz = rho_vp(i,j,k,ispec)*vn*nz + rho_vs(i,j,k,ispec)*(vz-vn*nz) - ! gets associated, weighted jacobian - jacobianw = abs_boundary_jacobian2Dw(igll,iface) + ! gets associated, weighted jacobian + jacobianw = abs_boundary_jacobian2Dw(igll,iface) - ! adds stacey term (weak form) - accel(1,iglob) = accel(1,iglob) - tx*jacobianw - accel(2,iglob) = accel(2,iglob) - ty*jacobianw - accel(3,iglob) = accel(3,iglob) - tz*jacobianw + ! adds stacey term (weak form) + accel(1,iglob) = accel(1,iglob) - tx*jacobianw + accel(2,iglob) = accel(2,iglob) - ty*jacobianw + accel(3,iglob) = accel(3,iglob) - tz*jacobianw - ! adjoint simulations - if (SIMULATION_TYPE == 3) then - b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface) - else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then - b_absorb_field(1,igll,iface) = tx*jacobianw - b_absorb_field(2,igll,iface) = ty*jacobianw - b_absorb_field(3,igll,iface) = tz*jacobianw - endif !adjoint + ! adjoint simulations + if (SIMULATION_TYPE == 3) then + b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface) + else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then + b_absorb_field(1,igll,iface) = tx*jacobianw + b_absorb_field(2,igll,iface) = ty*jacobianw + b_absorb_field(3,igll,iface) = tz*jacobianw + endif !adjoint - enddo - endif ! ispec_is_elastic - endif ! ispec_is_inner - enddo + enddo + endif ! ispec_is_elastic + endif ! ispec_is_inner + enddo + + else + ! GPU_MODE == .true. + if( num_abs_boundary_faces > 0 ) & + call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, & + SIMULATION_TYPE,SAVE_FORWARD,b_absorb_field) + endif ! adjoint simulations: stores absorbed wavefield part if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then diff --git a/src/specfem3D/create_color_image.f90 b/src/specfem3D/create_color_image.f90 index 76672681a..45f629df2 100644 --- a/src/specfem3D/create_color_image.f90 +++ b/src/specfem3D/create_color_image.f90 @@ -33,32 +33,32 @@ module image_PNM_GIF_par ! USER PARAMETER ! image data output: - ! type = 1 : velocity V_x component - ! type = 2 : velocity V_y component - ! type = 3 : velocity V_z component - ! type = 4 : velocity V norm - integer,parameter:: IMAGE_TYPE = 3 + ! type = 1 : displ/velocity x-component + ! type = 2 : displ/velocity y-component + ! type = 3 : displ/velocity z-component + ! type = 4 : displ/velocity norm + integer,parameter:: IMAGE_TYPE = 3 ! 4 ! cross-section surface ! cross-section origin point - real(kind=CUSTOM_REAL),parameter:: section_xorg = 67000.0 - real(kind=CUSTOM_REAL),parameter:: section_yorg = 0.0 - real(kind=CUSTOM_REAL),parameter:: section_zorg = -1000.0 + real(kind=CUSTOM_REAL),parameter:: section_xorg = 0.0 ! 67000.0 + real(kind=CUSTOM_REAL),parameter:: section_yorg = 0.0 ! 0.0 + real(kind=CUSTOM_REAL),parameter:: section_zorg = -100.0 ! 0.0 ! cross-section surface normal - real(kind=CUSTOM_REAL),parameter:: section_nx = 0.0 - real(kind=CUSTOM_REAL),parameter:: section_ny = 0.0 - real(kind=CUSTOM_REAL),parameter:: section_nz = 1.0 + real(kind=CUSTOM_REAL),parameter:: section_nx = 0.0 !1.0 + real(kind=CUSTOM_REAL),parameter:: section_ny = 0.0 !0.0 + real(kind=CUSTOM_REAL),parameter:: section_nz = 1.0 !0.0 ! cross-section (in-plane) horizontal-direction - real(kind=CUSTOM_REAL),parameter:: section_hdirx = 1.0 - real(kind=CUSTOM_REAL),parameter:: section_hdiry = 0.0 - real(kind=CUSTOM_REAL),parameter:: section_hdirz = 0.0 + real(kind=CUSTOM_REAL),parameter:: section_hdirx = 1.0 ! 0.0 + real(kind=CUSTOM_REAL),parameter:: section_hdiry = 0.0 !1.0 + real(kind=CUSTOM_REAL),parameter:: section_hdirz = 0.0 ! 0.0 ! cross-section (in-plane) vertical-direction - real(kind=CUSTOM_REAL),parameter:: section_vdirx = 0.0 - real(kind=CUSTOM_REAL),parameter:: section_vdiry = 1.0 - real(kind=CUSTOM_REAL),parameter:: section_vdirz = 0.0 + real(kind=CUSTOM_REAL),parameter:: section_vdirx = 0.0 ! 0.0 + real(kind=CUSTOM_REAL),parameter:: section_vdiry = 1.0 ! 0.0 + real(kind=CUSTOM_REAL),parameter:: section_vdirz = 0.0 ! 1.0 ! non linear display to enhance small amplitudes in color images real(kind=CUSTOM_REAL), parameter :: POWER_DISPLAY_COLOR = 0.30_CUSTOM_REAL @@ -118,6 +118,7 @@ subroutine write_PNM_GIF_initialize() real(kind=CUSTOM_REAL),dimension(:,:),allocatable :: dist_pixel_image,dist_pixel_recv real(kind=CUSTOM_REAL):: pixel_midpoint_x,pixel_midpoint_z,x_loc,z_loc,xtmp,ztmp real(kind=CUSTOM_REAL):: ratio + real(kind=CUSTOM_REAL):: distance_x1,distance_x2,distance_z1,distance_z2 integer:: npgeo,npgeo_glob integer:: i,j,k,iproc,iglob,ispec,ier ! data from mesh @@ -129,6 +130,8 @@ subroutine write_PNM_GIF_initialize() !character(len=256) :: vtkfilename integer :: zoom_factor = 4 logical :: zoom + integer, dimension(1) :: tmp_pixel_loc + integer, dimension(1,0:NPROC-1) :: tmp_pixel_per_proc ! checks image type if(IMAGE_TYPE > 4 .or. IMAGE_TYPE < 1) then @@ -263,8 +266,17 @@ subroutine write_PNM_GIF_initialize() endif ! create all the pixels - size_pixel_horizontal = (xmax_color_image - xmin_color_image) / dble(NX_IMAGE_color) - size_pixel_vertical = (zmax_color_image - zmin_color_image) / dble(NZ_IMAGE_color) + if( NX_IMAGE_color /= 0 ) then + size_pixel_horizontal = (xmax_color_image - xmin_color_image) / dble(NX_IMAGE_color) + else + size_pixel_horizontal = 0.0 + endif + + if( NZ_IMAGE_color /= 0 ) then + size_pixel_vertical = (zmax_color_image - zmin_color_image) / dble(NZ_IMAGE_color) + else + size_pixel_vertical = 0.0 + endif if (myrank == 0) then write(IMAIN,*) ' image points: ',npgeo_glob @@ -285,6 +297,19 @@ subroutine write_PNM_GIF_initialize() iglob_image_color(:,:) = -1 ispec_image_color(:,:) = 0 dist_pixel_image(:,:) = HUGEVAL + + if( zoom ) then + distance_x1 = zoom_factor*size_pixel_horizontal + distance_x2 = (zoom_factor+1)*size_pixel_horizontal + distance_z1 = zoom_factor*size_pixel_vertical + distance_z2 = (zoom_factor+1)*size_pixel_vertical + else + distance_x1 = 0.0 + distance_x2 = 2.0*size_pixel_horizontal + distance_z1 = 0.0 + distance_z2 = 2.0*size_pixel_vertical + endif + do j=1,NZ_IMAGE_color do i=1,NX_IMAGE_color ! calculates midpoint of pixel @@ -307,15 +332,8 @@ subroutine write_PNM_GIF_initialize() z_loc = zcoord(iglob) ! checks if inside pixel range for larger numbers of points, minimizing computation time - if( zoom ) then - if( x_loc < xtmp-zoom_factor*size_pixel_horizontal .or. & - x_loc > xtmp + (zoom_factor+1)*size_pixel_horizontal ) cycle - if( z_loc < ztmp-zoom_factor*size_pixel_vertical .or. & - z_loc > ztmp + (zoom_factor+1)*size_pixel_vertical ) cycle - else - if( x_loc < xtmp .or. x_loc > xtmp + size_pixel_horizontal ) cycle - if( z_loc < ztmp .or. z_loc > ztmp + size_pixel_vertical ) cycle - endif + if( x_loc < xtmp - distance_x1 .or. x_loc > xtmp + distance_x2 ) cycle + if( z_loc < ztmp - distance_z1 .or. z_loc > ztmp + distance_z2 ) cycle ! stores closest iglob x_loc = pixel_midpoint_x - x_loc @@ -382,7 +400,10 @@ subroutine write_PNM_GIF_initialize() ! filling array iglob_image_color, containing info on which process owns which pixels. allocate(nb_pixel_per_proc(0:NPROC-1),stat=ier) if( ier /= 0 ) stop 'error allocating array nb_pixel_per_proc' - call gather_all_i(nb_pixel_loc,1,nb_pixel_per_proc,1,NPROC) + + tmp_pixel_loc(1) = nb_pixel_loc + call gather_all_i(tmp_pixel_loc,1,tmp_pixel_per_proc,1,NPROC) + nb_pixel_per_proc(:) = tmp_pixel_per_proc(1,:) ! allocates receiving array if ( myrank == 0 ) then @@ -506,7 +527,7 @@ subroutine write_PNM_GIF_create_image implicit none ! local parameters - real(kind=CUSTOM_REAL),dimension(NDIM) :: veloc_val + real(kind=CUSTOM_REAL),dimension(NDIM) :: val_vector real(kind=CUSTOM_REAL):: temp integer :: i,j,k,iglob,ispec,iproc @@ -523,15 +544,15 @@ subroutine write_PNM_GIF_create_image ispec = ispec_image_color(i,j) ! gets velocity for point iglob - call get_iglob_veloc(iglob,ispec,veloc_val) + call get_iglob_veloc(iglob,ispec,val_vector) ! data type if( IMAGE_TYPE == 4 ) then ! velocity norm - temp = sqrt( veloc_val(1)**2 + veloc_val(2)**2 + veloc_val(3)**2 ) + temp = sqrt( val_vector(1)**2 + val_vector(2)**2 + val_vector(3)**2 ) else ! velocity component - temp = veloc_val(IMAGE_TYPE) + temp = val_vector(IMAGE_TYPE) endif ! stores data @@ -829,53 +850,89 @@ end subroutine get_iglob_vp !============================================================= - subroutine get_iglob_veloc(iglob,ispec,veloc_val) + subroutine get_iglob_veloc(iglob,ispec,val_vector) use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM - use specfem_par_acoustic,only: ACOUSTIC_SIMULATION,potential_dot_acoustic,& - rhostore,ispec_is_acoustic,b_potential_dot_acoustic - use specfem_par_elastic,only: ELASTIC_SIMULATION,veloc,ispec_is_elastic,b_veloc + use specfem_par_acoustic,only: ACOUSTIC_SIMULATION,potential_acoustic,potential_dot_acoustic, & + rhostore,ispec_is_acoustic, & + b_potential_acoustic,b_potential_dot_acoustic + use specfem_par_elastic,only: ELASTIC_SIMULATION,displ,veloc, & + ispec_is_elastic,b_displ,b_veloc use specfem_par,only: NSPEC_AB,NGLOB_AB,hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,SIMULATION_TYPE + ibool,SIMULATION_TYPE,GRAVITY + use specfem_par_movie,only:SAVE_DISPLACEMENT implicit none integer,intent(in) :: iglob,ispec - real(kind=CUSTOM_REAL),dimension(NDIM),intent(out):: veloc_val + real(kind=CUSTOM_REAL),dimension(NDIM),intent(out):: val_vector ! local parameters - real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: veloc_element + real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: val_element integer :: i,j,k ! returns first element encountered for iglob index if( ELASTIC_SIMULATION ) then if( ispec_is_elastic(ispec) ) then - if( SIMULATION_TYPE == 3 ) then - veloc_val(:) = b_veloc(:,iglob) + if(SAVE_DISPLACEMENT) then + if( SIMULATION_TYPE == 3 ) then + ! to display re-constructed wavefield + !val_vector(:) = b_displ(:,iglob) + ! to display adjoint wavefield + val_vector(:) = displ(:,iglob) + else + val_vector(:) = displ(:,iglob) + endif else - veloc_val(:) = veloc(:,iglob) + if( SIMULATION_TYPE == 3 ) then + ! to display re-constructed wavefield + !val_vector(:) = b_veloc(:,iglob) + ! to display adjoint wavefield + val_vector(:) = veloc(:,iglob) + else + val_vector(:) = veloc(:,iglob) + endif endif ! returns with this result return endif endif + if( ACOUSTIC_SIMULATION ) then if( ispec_is_acoustic(ispec) ) then - if( SIMULATION_TYPE == 3 ) then - ! velocity vector for backward/reconstructed wavefield - call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & - b_potential_dot_acoustic, veloc_element,& + if(SAVE_DISPLACEMENT) then + if( SIMULATION_TYPE == 3 ) then + ! displacement vector from backward potential + call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & + b_potential_acoustic, val_element,& + hprime_xx,hprime_yy,hprime_zz, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + ibool,rhostore,GRAVITY) + else + ! displacement vector + call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & + potential_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) + endif else - ! velocity vector - call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & - potential_dot_acoustic, veloc_element,& + if( SIMULATION_TYPE == 3 ) then + ! velocity vector for backward/reconstructed wavefield + call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & + b_potential_dot_acoustic, val_element,& + hprime_xx,hprime_yy,hprime_zz, & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & + ibool,rhostore,GRAVITY) + else + ! velocity vector + call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & + potential_dot_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) + endif endif ! returns corresponding iglob velocity entry @@ -883,7 +940,7 @@ subroutine get_iglob_veloc(iglob,ispec,veloc_val) do j=1,NGLLY do i=1,NGLLX if( ibool(i,j,k,ispec) == iglob ) then - veloc_val(:) = veloc_element(:,i,j,k) + val_vector(:) = val_element(:,i,j,k) return endif enddo diff --git a/src/specfem3D/finalize_simulation.f90 b/src/specfem3D/finalize_simulation.f90 index 4b5acfc40..a20d9c007 100644 --- a/src/specfem3D/finalize_simulation.f90 +++ b/src/specfem3D/finalize_simulation.f90 @@ -36,10 +36,10 @@ subroutine finalize_simulation() integer :: irec_local -! save last frame + ! save last frame if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then - open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',& + open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',& status='unknown',form='unformatted') if( ACOUSTIC_SIMULATION ) then diff --git a/src/specfem3D/initialize_simulation.f90 b/src/specfem3D/initialize_simulation.f90 index f63fd21ab..54d337be3 100644 --- a/src/specfem3D/initialize_simulation.f90 +++ b/src/specfem3D/initialize_simulation.f90 @@ -48,6 +48,9 @@ subroutine initialize_simulation() NTSTEP_BETWEEN_OUTPUT_INFO,SIMULATION_TYPE,SAVE_FORWARD, & NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,IMODEL) + ! GPU_MODE is in par_file + call read_gpu_mode(GPU_MODE,GRAVITY) + ! get the base pathname for output files call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH))) @@ -111,9 +114,11 @@ subroutine initialize_simulation() write(IMAIN,'(a)',advance='yes') ' tomo' case( IMODEL_USER_EXTERNAL ) write(IMAIN,'(a)',advance='yes') ' external' + case( IMODEL_IPATI ) + write(IMAIN,'(a)',advance='yes') ' ipati' end select - - write(IMAIN,*) + + write(IMAIN,*) endif ! reads in numbers of spectral elements and points for this process' domain @@ -167,15 +172,18 @@ subroutine initialize_simulation() gammaz(NGLLX,NGLLY,NGLLZ,NSPEC_AB), & jacobian(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier) if( ier /= 0 ) stop 'error allocating arrays for databases' + ! mesh node locations allocate(xstore(NGLOB_AB), & ystore(NGLOB_AB), & zstore(NGLOB_AB),stat=ier) if( ier /= 0 ) stop 'error allocating arrays for mesh nodes' + ! material properties allocate(kappastore(NGLLX,NGLLY,NGLLZ,NSPEC_AB), & mustore(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier) if( ier /= 0 ) stop 'error allocating arrays for material properties' + ! material flags allocate(ispec_is_acoustic(NSPEC_AB), & ispec_is_elastic(NSPEC_AB), & @@ -185,6 +193,9 @@ subroutine initialize_simulation() ! initializes adjoint simulations call initialize_simulation_adjoint() + ! initializes GPU cards + if( GPU_MODE ) call initialize_GPU() + end subroutine initialize_simulation ! @@ -214,7 +225,7 @@ subroutine initialize_simulation_check() write(IMAIN,*) 'error: number of MPI processors actually run on: ',sizeprocs print* print*, 'error specfem3D: number of processors supposed to run on: ',NPROC - print*, 'error specfem3D: number of MPI processors actually run on: ',sizeprocs + print*, 'error specfem3D: number of MPI processors actually run on: ',sizeprocs print* endif call exit_MPI(myrank,'wrong number of MPI processes') @@ -233,6 +244,10 @@ subroutine initialize_simulation_check() stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ is in [5-10]' endif + ! gravity only on GPU supported + if( .not. GPU_MODE .and. GRAVITY ) & + stop 'GRAVITY only supported in GPU mode' + ! absorbing surfaces if( ABSORBING_CONDITIONS ) then ! for arbitrary orientation of elements, which face belongs to xmin,xmax,etc... - @@ -278,6 +293,7 @@ subroutine initialize_simulation_check() endif end subroutine initialize_simulation_check + ! !------------------------------------------------------------------------------------------------- ! @@ -318,3 +334,55 @@ subroutine initialize_simulation_adjoint() endif end subroutine initialize_simulation_adjoint + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine initialize_GPU() + +! initialization for GPU cards + + use specfem_par + use specfem_par_elastic + use specfem_par_acoustic + use specfem_par_poroelastic + implicit none + integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max + + ! GPU_MODE now defined in Par_file + if(myrank == 0 ) then + write(IMAIN,*) + write(IMAIN,*) "GPU_MODE Active." + endif + + ! check for GPU runs + if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) & + stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5' + if( CUSTOM_REAL /= 4 ) & + stop 'GPU mode runs only with CUSTOM_REAL == 4' + if( SAVE_MOHO_MESH ) & + stop 'GPU mode does not support SAVE_MOHO_MESH yet' + if( ATTENUATION ) then + if( N_SLS /= 3 ) & + stop 'GPU mode does not support N_SLS /= 3 yet' + endif + if( POROELASTIC_SIMULATION ) then + stop 'poroelastic simulations on GPU not supported yet' + endif + + ! initializes GPU and outputs info to files for all processes + call prepare_cuda_device(myrank,ncuda_devices) + + ! collects min/max of local devices found for statistics + call sync_all() + call min_all_i(ncuda_devices,ncuda_devices_min) + call max_all_i(ncuda_devices,ncuda_devices_max) + + if( myrank == 0 ) then + write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min + write(IMAIN,*) " max =",ncuda_devices_max + write(IMAIN,*) + endif + + end subroutine initialize_GPU diff --git a/src/specfem3D/iterate_time.f90 b/src/specfem3D/iterate_time.f90 index bb74fadb2..d0b317748 100644 --- a/src/specfem3D/iterate_time.f90 +++ b/src/specfem3D/iterate_time.f90 @@ -90,8 +90,8 @@ subroutine iterate_time() call it_read_forward_arrays() endif - ! write the seismograms with time shift - if (nrec_local > 0) then + ! write the seismograms with time shift (GPU_MODE transfer included) + if (nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then call write_seismograms() endif @@ -111,12 +111,14 @@ subroutine iterate_time() endif ! first step of noise tomography, i.e., save a surface movie at every time step - if ( NOISE_TOMOGRAPHY == 1 ) then - call noise_save_surface_movie(displ, & - ibool, & - noise_surface_movie,it, & - NSPEC_AB,NGLOB_AB, & - num_free_surface_faces,free_surface_ispec,free_surface_ijk) + if ( NOISE_TOMOGRAPHY == 1) then + if( num_free_surface_faces > 0) then + call noise_save_surface_movie(displ,ibool, & + noise_surface_movie,it, & + NSPEC_AB,NGLOB_AB, & + num_free_surface_faces,free_surface_ispec,free_surface_ijk, & + Mesh_pointer,GPU_MODE) + endif endif ! @@ -124,9 +126,39 @@ subroutine iterate_time() ! enddo ! end of main time loop + call it_print_elapsed_time() + + ! Transfer fields from GPU card to host for further analysis + if(GPU_MODE) call it_transfer_from_GPU() + end subroutine iterate_time +!===================================================================== + + subroutine it_print_elapsed_time() + use specfem_par + use specfem_par_elastic + use specfem_par_acoustic + implicit none + + ! local parameters + double precision :: tCPU + integer :: ihours,iminutes,iseconds,int_tCPU + + if(myrank == 0) then + ! elapsed time since beginning of the simulation + tCPU = wtime() - time_start + int_tCPU = int(tCPU) + ihours = int_tCPU / 3600 + iminutes = (int_tCPU - 3600*ihours) / 60 + iseconds = int_tCPU - 3600*ihours - 60*iminutes + write(IMAIN,*) 'Time-Loop Complete. Timing info:' + write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU + write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds + endif + end subroutine it_print_elapsed_time + !===================================================================== subroutine it_check_stability() @@ -146,47 +178,104 @@ subroutine it_check_stability() ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, & ihours_total,iminutes_total,iseconds_total,int_t_total + ! maximum of the norm of the displacement + real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic + real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic + real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic + real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic + + ! norm of the backward displacement + real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all + + ! initializes + Usolidnorm_all = 0.0_CUSTOM_REAL + Usolidnormp_all = 0.0_CUSTOM_REAL + Usolidnorms_all = 0.0_CUSTOM_REAL + Usolidnormw_all = 0.0_CUSTOM_REAL + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !chris: Rewrite to get norm for each material when coupled simulations !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! compute maximum of norm of displacement in each slice - if( ELASTIC_SIMULATION ) & - Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2)) - if( ACOUSTIC_SIMULATION ) & + if( ELASTIC_SIMULATION ) then + if( GPU_MODE) then + ! way 2: just get maximum of field from GPU + call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1) + else + Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2)) + endif + + ! check stability of the code, exit if unstable + ! negative values can occur with some compilers when the unstable value is greater + ! than the greatest possible floating-point number of the machine + if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) & + call exit_MPI(myrank,'forward simulation became unstable and blew up') + + ! compute the maximum of the maxima for all the slices using an MPI reduction + call max_all_cr(Usolidnorm,Usolidnorm_all) + endif + + if( ACOUSTIC_SIMULATION ) then + if(GPU_MODE) then + ! way 2: just get maximum of field from GPU + call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1) + else Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:))) + endif + + ! compute the maximum of the maxima for all the slices using an MPI reduction + call max_all_cr(Usolidnormp,Usolidnormp_all) + endif + if( POROELASTIC_SIMULATION ) then Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + & displs_poroelastic(3,:)**2)) Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + & displw_poroelastic(3,:)**2)) + + ! compute the maximum of the maxima for all the slices using an MPI reduction + call max_all_cr(Usolidnorms,Usolidnorms_all) + call max_all_cr(Usolidnormw,Usolidnormw_all) endif -! compute the maximum of the maxima for all the slices using an MPI reduction - call max_all_cr(Usolidnorm,Usolidnorm_all) - call max_all_cr(Usolidnormp,Usolidnormp_all) - call max_all_cr(Usolidnorms,Usolidnorms_all) - call max_all_cr(Usolidnormw,Usolidnormw_all) -! adjoint simulations + ! adjoint simulations if( SIMULATION_TYPE == 3 ) then if( ELASTIC_SIMULATION ) then - b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2)) + ! way 2 + if(GPU_MODE) then + call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3) + else + b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2)) + endif else if( ACOUSTIC_SIMULATION ) then - b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:))) + ! way 2 + if(GPU_MODE) then + call get_norm_acoustic_from_device(b_Usolidnorm,Mesh_pointer,3) + else + b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:))) + endif endif endif + ! check stability of the code, exit if unstable + ! negative values can occur with some compilers when the unstable value is greater + ! than the greatest possible floating-point number of the machine + if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) & + call exit_MPI(myrank,'backward simulation became unstable and blew up') + + ! compute max of all slices call max_all_cr(b_Usolidnorm,b_Usolidnorm_all) - endif + endif -! user output + ! user output if(myrank == 0) then write(IMAIN,*) 'Time step # ',it write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds' -! elapsed time since beginning of the simulation + ! elapsed time since beginning of the simulation tCPU = wtime() - time_start int_tCPU = int(tCPU) ihours = int_tCPU / 3600 @@ -194,11 +283,13 @@ subroutine it_check_stability() iseconds = int_tCPU - 3600*ihours - 60*iminutes write(IMAIN,*) 'Elapsed time in seconds = ',tCPU write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds - write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it) - if( ELASTIC_SIMULATION ) & + write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it)) + if( ELASTIC_SIMULATION ) then write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all - if( ACOUSTIC_SIMULATION ) & + endif + if( ACOUSTIC_SIMULATION ) then write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all + endif if( POROELASTIC_SIMULATION ) then write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all @@ -207,7 +298,7 @@ subroutine it_check_stability() if (SIMULATION_TYPE == 3) write(IMAIN,*) & 'Max norm U (backward) in all slices = ',b_Usolidnorm_all -! compute estimated remaining simulation time + ! compute estimated remaining simulation time t_remain = (NSTEP - it) * (tCPU/dble(it)) int_t_remain = int(t_remain) ihours_remain = int_t_remain / 3600 @@ -215,17 +306,17 @@ subroutine it_check_stability() iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP write(IMAIN,*) 'Time steps remaining = ',NSTEP - it - write(IMAIN,*) 'Estimated remaining time in seconds = ',t_remain + write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain) write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") & ihours_remain,iminutes_remain,iseconds_remain -! compute estimated total simulation time + ! compute estimated total simulation time t_total = t_remain + tCPU int_t_total = int(t_total) ihours_total = int_t_total / 3600 iminutes_total = (int_t_total - 3600*ihours_total) / 60 iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total - write(IMAIN,*) 'Estimated total run time in seconds = ',t_total + write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total) write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") & ihours_total,iminutes_total,iseconds_total write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that' @@ -238,7 +329,7 @@ subroutine it_check_stability() endif write(IMAIN,*) -! write time stamp file to give information about progression of simulation + ! write time stamp file to give information about progression of simulation write(outputname,"('/timestamp',i6.6)") it open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown') write(IOUT,*) 'Time step # ',it @@ -246,10 +337,12 @@ subroutine it_check_stability() write(IOUT,*) 'Elapsed time in seconds = ',tCPU write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it) - if( ELASTIC_SIMULATION ) & + if( ELASTIC_SIMULATION ) then write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all - if( ACOUSTIC_SIMULATION ) & + endif + if( ACOUSTIC_SIMULATION ) then write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all + endif if( POROELASTIC_SIMULATION ) then write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all @@ -259,28 +352,27 @@ subroutine it_check_stability() 'Max norm U (backward) in all slices = ',b_Usolidnorm_all ! estimation write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP - write(IOUT,*) 'Time steps remaining = ',NSTEP - it + write(IOUT,*) 'Time steps remaining = ',NSTEP - it write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") & - ihours_remain,iminutes_remain,iseconds_remain + ihours_remain,iminutes_remain,iseconds_remain write(IOUT,*) 'Estimated total run time in seconds = ',t_total write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") & ihours_total,iminutes_total,iseconds_total - write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that' + write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that' close(IOUT) - -! check stability of the code, exit if unstable -! negative values can occur with some compilers when the unstable value is greater -! than the greatest possible floating-point number of the machine - if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0 & - .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0 & - .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0 & - .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0) & + ! check stability of the code, exit if unstable + ! negative values can occur with some compilers when the unstable value is greater + ! than the greatest possible floating-point number of the machine + if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0 & + .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0 & + .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0 & + .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0) & call exit_MPI(myrank,'forward simulation became unstable and blew up') ! adjoint simulations if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD & - .or. b_Usolidnorm_all < 0)) & + .or. b_Usolidnorm_all < 0.0)) & call exit_MPI(myrank,'backward simulation became unstable and blew up') endif ! myrank @@ -331,15 +423,27 @@ subroutine it_update_displacement_scheme() ! updates acoustic potentials if( ACOUSTIC_SIMULATION ) then - potential_acoustic(:) = potential_acoustic(:) & + + if(.NOT. GPU_MODE) then + ! on CPU + potential_acoustic(:) = potential_acoustic(:) & + deltat * potential_dot_acoustic(:) & + deltatsqover2 * potential_dot_dot_acoustic(:) - potential_dot_acoustic(:) = potential_dot_acoustic(:) & + potential_dot_acoustic(:) = potential_dot_acoustic(:) & + deltatover2 * potential_dot_dot_acoustic(:) - potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL + potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL + else + ! on GPU + call it_update_displacement_ac_cuda(Mesh_pointer, NGLOB_AB, & + deltat, deltatsqover2, deltatover2, & + SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2) + endif ! time marching potentials if(ABSORB_USE_PML .and. ABSORBING_CONDITIONS) then + if( GPU_MODE ) call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + call PML_acoustic_time_march(NSPEC_AB,NGLOB_AB,ibool,& potential_acoustic,potential_dot_acoustic,& deltat,deltatsqover2,deltatover2,& @@ -352,15 +456,28 @@ subroutine it_update_displacement_scheme() nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,& my_neighbours_ext_mesh,NPROC,& ispec_is_acoustic) + + if( GPU_MODE ) call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) endif - endif + + endif ! ACOUSTIC_SIMULATION ! updates elastic displacement and velocity if( ELASTIC_SIMULATION ) then - displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:) - veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:) - if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:) - accel(:,:) = 0._CUSTOM_REAL + + if(.NOT. GPU_MODE) then + ! on CPU + displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:) + veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:) + if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:) + accel(:,:) = 0._CUSTOM_REAL + else + ! on GPU + ! Includes SIM_TYPE 1 & 3 (for noise tomography) + call it_update_displacement_cuda(Mesh_pointer, size(displ), deltat, deltatsqover2,& + deltatover2, SIMULATION_TYPE, b_deltat, b_deltatsqover2, b_deltatover2) + endif endif ! updates poroelastic displacements and velocities @@ -379,7 +496,7 @@ subroutine it_update_displacement_scheme() endif ! adjoint simulations - if (SIMULATION_TYPE == 3) then + if (SIMULATION_TYPE == 3 .and. .NOT. GPU_MODE) then ! acoustic backward fields if( ACOUSTIC_SIMULATION ) then b_potential_acoustic(:) = b_potential_acoustic(:) & @@ -389,6 +506,7 @@ subroutine it_update_displacement_scheme() + b_deltatover2 * b_potential_dot_dot_acoustic(:) b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL endif + ! elastic backward fields if( ELASTIC_SIMULATION ) then b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:) @@ -450,6 +568,11 @@ subroutine it_read_forward_arrays() read(27) b_potential_acoustic read(27) b_potential_dot_acoustic read(27) b_potential_dot_dot_acoustic + + ! transfers fields onto GPU + if(GPU_MODE) & + call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, & + b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) endif ! elastic wavefields @@ -458,18 +581,29 @@ subroutine it_read_forward_arrays() read(27) b_veloc read(27) b_accel + ! puts elastic wavefield to GPU + if(GPU_MODE) & + call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer) + ! memory variables if attenuation if( ATTENUATION ) then - read(27) b_R_xx - read(27) b_R_yy - read(27) b_R_xy - read(27) b_R_xz - read(27) b_R_yz - read(27) b_epsilondev_xx - read(27) b_epsilondev_yy - read(27) b_epsilondev_xy - read(27) b_epsilondev_xz - read(27) b_epsilondev_yz + read(27) b_R_xx + read(27) b_R_yy + read(27) b_R_xy + read(27) b_R_xz + read(27) b_R_yz + read(27) b_epsilondev_xx + read(27) b_epsilondev_yy + read(27) b_epsilondev_xy + read(27) b_epsilondev_xz + read(27) b_epsilondev_yz + + ! puts elastic attenuation arrays to GPU + if(GPU_MODE) & + call transfer_b_fields_att_to_device(Mesh_pointer, & + b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), & + b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, & + size(b_epsilondev_xx)) endif endif @@ -514,9 +648,17 @@ subroutine it_store_attenuation_arrays() open(unit=27,file=trim(prname_Q)//trim(outputname),status='old',& action='read',form='unformatted') if( ELASTIC_SIMULATION ) then + ! reads arrays from disk files read(27) b_displ read(27) b_veloc read(27) b_accel + + ! puts elastic fields onto GPU + if(GPU_MODE) then + ! wavefields + call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer) + endif + read(27) b_R_xx read(27) b_R_yy read(27) b_R_xy @@ -527,11 +669,28 @@ subroutine it_store_attenuation_arrays() read(27) b_epsilondev_xy read(27) b_epsilondev_xz read(27) b_epsilondev_yz + + ! puts elastic fields onto GPU + if(GPU_MODE) then + ! attenuation arrays + call transfer_b_fields_att_to_device(Mesh_pointer, & + b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), & + b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, & + size(b_epsilondev_xx)) + endif endif + if( ACOUSTIC_SIMULATION ) then + ! reads arrays from disk files read(27) b_potential_acoustic read(27) b_potential_dot_acoustic read(27) b_potential_dot_dot_acoustic + + ! puts acoustic fields onto GPU + if(GPU_MODE) & + call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, & + b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) + endif close(27) else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. mod(it,NSTEP_Q_SAVE) == 0) then @@ -540,9 +699,24 @@ subroutine it_store_attenuation_arrays() open(unit=27,file=trim(prname_Q)//trim(outputname),status='unknown',& action='write',form='unformatted') if( ELASTIC_SIMULATION ) then + ! gets elastic fields from GPU onto CPU + if(GPU_MODE) then + call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer) + endif + + ! writes to disk file write(27) displ write(27) veloc write(27) accel + + if(GPU_MODE) then + ! attenuation arrays + call transfer_fields_att_from_device(Mesh_pointer, & + R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, & + size(epsilondev_xx)) + endif + write(27) R_xx write(27) R_yy write(27) R_xy @@ -555,6 +729,12 @@ subroutine it_store_attenuation_arrays() write(27) epsilondev_yz endif if( ACOUSTIC_SIMULATION ) then + ! gets acoustic fields from GPU onto CPU + if(GPU_MODE) & + call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + + ! writes to disk file write(27) potential_acoustic write(27) potential_dot_acoustic write(27) potential_dot_dot_acoustic @@ -564,3 +744,80 @@ subroutine it_store_attenuation_arrays() endif ! it end subroutine it_store_attenuation_arrays + + +!===================================================================== + + subroutine it_transfer_from_GPU() + +! transfers fields on GPU back onto CPU + + use specfem_par + use specfem_par_elastic + use specfem_par_acoustic + + implicit none + + ! to store forward wave fields + if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then + + ! acoustic potentials + if( ACOUSTIC_SIMULATION ) & + call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer) + + ! elastic wavefield + if( ELASTIC_SIMULATION ) then + call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer) + + if (ATTENUATION) & + call transfer_fields_att_from_device(Mesh_pointer, & + R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), & + epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, & + size(epsilondev_xx)) + + endif + else if (SIMULATION_TYPE == 3) then + + ! to store kernels + ! acoustic domains + if( ACOUSTIC_SIMULATION ) then + ! only in case needed... + !call transfer_b_fields_ac_from_device(NGLOB_AB,b_potential_acoustic, & + ! b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) + + ! acoustic kernels + call transfer_kernels_ac_to_host(Mesh_pointer,rho_ac_kl,kappa_ac_kl,NSPEC_AB) + endif + + ! elastic domains + if( ELASTIC_SIMULATION ) then + ! only in case needed... + !call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer) + + ! elastic kernels + call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB) + endif + + ! specific noise strength kernel + if( NOISE_TOMOGRAPHY == 3 ) then + call transfer_kernels_noise_to_host(Mesh_pointer,Sigma_kl,NSPEC_AB) + endif + + ! approximative hessian for preconditioning kernels + if ( APPROXIMATE_HESS_KL ) then + if( ELASTIC_SIMULATION ) call transfer_kernels_hess_el_tohost(Mesh_pointer,hess_kl,NSPEC_AB) + if( ACOUSTIC_SIMULATION ) call transfer_kernels_hess_ac_tohost(Mesh_pointer,hess_ac_kl,NSPEC_AB) + endif + + endif + + ! frees allocated memory on GPU + call prepare_cleanup_device(Mesh_pointer, & + SIMULATION_TYPE,SAVE_FORWARD, & + ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, & + ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, & + ATTENUATION,ANISOTROPY,OCEANS, & + APPROXIMATE_HESS_KL) + + end subroutine it_transfer_from_GPU diff --git a/src/specfem3D/locate_receivers.f90 b/src/specfem3D/locate_receivers.f90 index 079afc269..0003a9cc2 100644 --- a/src/specfem3D/locate_receivers.f90 +++ b/src/specfem3D/locate_receivers.f90 @@ -86,17 +86,17 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & integer ios double precision,dimension(1) :: altitude_rec,distmin_ele - !double precision,dimension(4) :: elevation_node,dist_node + !double precision,dimension(4) :: elevation_node,dist_node double precision,dimension(NPROC) :: distmin_ele_all,elevation_all real(kind=CUSTOM_REAL) :: xloc,yloc,loc_ele,loc_distmin - + double precision, allocatable, dimension(:) :: x_target,y_target,z_target - + double precision, allocatable, dimension(:) :: horiz_dist double precision, allocatable, dimension(:) :: x_found,y_found,z_found double precision dist - + double precision xi,eta,gamma,dx,dy,dz,dxi,deta,dgamma double precision x,y,z double precision xix,xiy,xiz @@ -220,11 +220,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & status='unknown',action='write',iostat=ios) if( ios /= 0 ) & call exit_mpi(myrank,'error opening file '//trim(OUTPUT_FILES)//'/output_list_stations.txt') - + do irec=1,nrec write(IOUT_SU,*) x_found(irec),y_found(irec),z_found(irec) enddo - + close(IOUT_SU) deallocate(x_found,y_found,z_found) endif @@ -286,7 +286,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & read(1,*,iostat=ios) station_name(irec),network_name(irec), & stlat(irec),stlon(irec),stele(irec),stbur(irec) - + if (ios /= 0) call exit_mpi(myrank, 'Error reading station file '//trim(rec_filename)) ! convert station location to UTM @@ -315,7 +315,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & num_free_surface_faces,free_surface_ispec,free_surface_ijk) altitude_rec(1) = loc_ele distmin_ele(1) = loc_distmin - + ! ! set distance to huge initial value ! distmin = HUGEVAL ! if(num_free_surface_faces > 0) then @@ -385,11 +385,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & ! end do ! end do ! end if - + ! MPI communications to determine the best slice call gather_all_dp(distmin_ele,1,distmin_ele_all,1,NPROC) call gather_all_dp(altitude_rec,1,elevation_all,1,NPROC) - + if(myrank == 0) then iproc = minloc(distmin_ele_all) altitude_rec(1) = elevation_all(iproc(1)) @@ -499,6 +499,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & ! end of loop on all the spectral elements in current slice enddo else + ! SeismicUnix format ispec_selected_rec(irec) = 0 ix_initial_guess(irec) = 0 iy_initial_guess(irec) = 0 @@ -519,7 +520,7 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & (y_target(irec)>=ymin_ELE .and. y_target(irec)<=ymax_ELE) .and. & (z_target(irec)>=zmin_ELE .and. z_target(irec)<=zmax_ELE) ) then ! we find the element (ispec) which "may" contain the receiver (irec) - ! so we only need to compute distances + ! so we only need to compute distances !(which is expensive because of "dsqrt") within those elements ispec_selected_rec(irec) = ispec do k = kmin_temp,kmax_temp @@ -994,11 +995,11 @@ subroutine locate_receivers(ibool,myrank,NSPEC_AB,NGLOB_AB, & status='unknown',action='write',iostat=ios) if( ios /= 0 ) & call exit_mpi(myrank,'error opening file '//trim(OUTPUT_FILES)//'/output_list_stations.txt') - + do irec=1,nrec write(IOUT_SU,*) x_found(irec),y_found(irec),z_found(irec) enddo - + close(IOUT_SU) ! stores station infos for later runs diff --git a/src/specfem3D/locate_source.f90 b/src/specfem3D/locate_source.f90 index db33f5aed..67d7c2649 100644 --- a/src/specfem3D/locate_source.f90 +++ b/src/specfem3D/locate_source.f90 @@ -68,7 +68,7 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z integer iprocloop integer i,j,k,ispec,iglob,isource - integer imin,imax,jmin,jmax,kmin,kmax + integer imin,imax,jmin,jmax,kmin,kmax ! integer igll,jgll,kgll,inode,iface,iglob_selected, ! integer iselected,jselected,iface_selected,iadjust,jadjust integer iproc(1) @@ -103,12 +103,12 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z double precision x_target_source,y_target_source,z_target_source double precision,dimension(1) :: altitude_source,distmin_ele - + double precision,dimension(NPROC) :: distmin_ele_all,elevation_all ! double precision,dimension(4) :: elevation_node,dist_node real(kind=CUSTOM_REAL) :: xloc,yloc,loc_ele,loc_distmin - + integer islice_selected_source(NSOURCES) @@ -220,9 +220,9 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z num_free_surface_faces,free_surface_ispec,free_surface_ijk) altitude_source(1) = loc_ele distmin_ele(1) = loc_distmin - - + + ! ! set distance to huge initial value ! distmin = HUGEVAL ! if(num_free_surface_faces > 0) then @@ -295,11 +295,11 @@ subroutine locate_source(ibool,NSOURCES,myrank,NSPEC_AB,NGLOB_AB,xstore,ystore,z ! end do ! end if ! distmin_ele(1)= distmin - + ! MPI communications to determine the best slice call gather_all_dp(distmin_ele,1,distmin_ele_all,1,NPROC) call gather_all_dp(altitude_source,1,elevation_all,1,NPROC) - + if(myrank == 0) then iproc = minloc(distmin_ele_all) altitude_source(1) = elevation_all(iproc(1)) diff --git a/src/specfem3D/make_gravity.f90 b/src/specfem3D/make_gravity.f90 new file mode 100644 index 000000000..7dce52175 --- /dev/null +++ b/src/specfem3D/make_gravity.f90 @@ -0,0 +1,680 @@ +!===================================================================== +! +! S p e c f e m 3 D V e r s i o n 2 . 0 +! --------------------------------------- +! +! Main authors: Dimitri Komatitsch and Jeroen Tromp +! Princeton University, USA and University of Pau / CNRS / INRIA +! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA +! April 2011 +! +! This program is free software; you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation; either version 2 of the License, or +! (at your option) any later version. +! +! This program is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along +! with this program; if not, write to the Free Software Foundation, Inc., +! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +! +!===================================================================== +! +! United States and French Government Sponsorship Acknowledged. + + + subroutine make_gravity(nspl,rspl,gspl,gspl2, & + ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, & + R771,RTOPDDOUBLEPRIME,RCMB,RICB) + +! creates a spline for the gravity profile in PREM +! radius and density are non-dimensional + + implicit none + + include "constants.h" + + integer:: nspl + double precision:: rspl(NR),gspl(NR),gspl2(NR) + double precision ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, & + R771,RTOPDDOUBLEPRIME,RCMB,RICB + + ! local parameters + double precision r_icb,r_cmb,r_topddoubleprime,r_771,r_670,r_600 + double precision r_400,r_220,r_80,r_moho,r_middle_crust,r_ocean,r_0 + double precision r(NR),rho(NR),g(NR),i_rho + double precision s1(NR),s2(NR),s3(NR) + double precision yp1,ypn + integer i + +! PREM + ! values in (m) + ROCEAN = 6368000.d0 + RMIDDLE_CRUST = 6356000.d0 + RMOHO = 6346600.d0 ! PREM moho depth at 24.4 km + R80 = 6291000.d0 + R220 = 6151000.d0 + R400 = 5971000.d0 + R600 = 5771000.d0 + R670 = 5701000.d0 + R771 = 5600000.d0 + RTOPDDOUBLEPRIME = 3630000.d0 + RCMB = 3480000.d0 + RICB = 1221000.d0 + +! non-dimensionalize + r_icb = RICB/R_EARTH_GRAVITY + r_cmb = RCMB/R_EARTH_GRAVITY + r_topddoubleprime = RTOPDDOUBLEPRIME/R_EARTH_GRAVITY + r_771 = R771/R_EARTH_GRAVITY + r_670 = R670/R_EARTH_GRAVITY + r_600 = R600/R_EARTH_GRAVITY + r_400 = R400/R_EARTH_GRAVITY + r_220 = R220/R_EARTH_GRAVITY + r_80 = R80/R_EARTH_GRAVITY + r_moho = RMOHO/R_EARTH_GRAVITY + r_middle_crust = RMIDDLE_CRUST/R_EARTH_GRAVITY + r_ocean = ROCEAN_GRAVITY/R_EARTH_GRAVITY + r_0 = 1.d0 + + do i=1,163 + r(i) = r_icb*dble(i-1)/dble(162) + enddo + do i=164,323 + r(i) = r_icb+(r_cmb-r_icb)*dble(i-164)/dble(159) + enddo + do i=324,336 + r(i) = r_cmb+(r_topddoubleprime-r_cmb)*dble(i-324)/dble(12) + enddo + do i=337,517 + r(i) = r_topddoubleprime+(r_771-r_topddoubleprime)*dble(i-337)/dble(180) + enddo + do i=518,530 + r(i) = r_771+(r_670-r_771)*dble(i-518)/dble(12) + enddo + do i=531,540 + r(i) = r_670+(r_600-r_670)*dble(i-531)/dble(9) + enddo + do i=541,565 + r(i) = r_600+(r_400-r_600)*dble(i-541)/dble(24) + enddo + do i=566,590 + r(i) = r_400+(r_220-r_400)*dble(i-566)/dble(24) + enddo + do i=591,609 + r(i) = r_220+(r_80-r_220)*dble(i-591)/dble(18) + enddo + do i=610,619 + r(i) = r_80+(r_moho-r_80)*dble(i-610)/dble(9) + enddo + do i=620,626 + r(i) = r_moho+(r_middle_crust-r_moho)*dble(i-620)/dble(6) + enddo + do i=627,633 + r(i) = r_middle_crust+(r_ocean-r_middle_crust)*dble(i-627)/dble(6) + enddo + do i=634,NR + r(i) = r_ocean+(r_0-r_ocean)*dble(i-634)/dble(6) + enddo + +! use PREM to get the density profile for ellipticity (fine for other 1D reference models) + do i=1,NR + call prem_density(r(i),rho(i), & + RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN_GRAVITY) + enddo + + g(1)=0.0d0 + do i=2,NR + call intgrl(i_rho,r,1,i,rho,s1,s2,s3) + g(i)=4.0d0*i_rho/(r(i)*r(i)) + enddo + +! +! get ready to spline g +! + nspl=1 + rspl(1)=r(1) + gspl(1)=g(1) + do i=2,NR + if(r(i)/=r(i-1)) then + nspl=nspl+1 + rspl(nspl)=r(i) + gspl(nspl)=g(i) + endif + enddo + yp1=(4.0d0/3.0d0)*rho(1) + ypn=4.0d0*rho(NR)-2.0d0*g(NR)/r(NR) + call spline_construction(rspl,gspl,nspl,yp1,ypn,gspl2) + + end subroutine make_gravity + + +!-------------------------------------------------------------------------------------------------- +! +! PREM [Dziewonski and Anderson, 1981]. +! +! A. M. Dziewonski and D. L. Anderson. +! Preliminary reference Earth model. +! Phys. Earth Planet. Inter., 25:297–356, 1981. +! +! Isotropic (iso) and transversely isotropic (aniso) version of the +! spherically symmetric Preliminary Reference Earth Model +! +!-------------------------------------------------------------------------------------------------- + + + subroutine model_prem_iso(x,rho,drhodr,vp,vs,Qkappa,Qmu,& + RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN) + + implicit none + + include "constants.h" + +! given a normalized radius x, gives the non-dimensionalized density rho, +! speeds vp and vs, and the quality factors Qkappa and Qmu + + double precision x,rho,drhodr,vp,vs,Qkappa,Qmu,RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN + + double precision r,scaleval + + logical,parameter :: ONE_CRUST = .false. + logical,parameter :: CRUSTAL = .false. + +! compute real physical radius in meters + r = x * R_EARTH + +! +!--- inner core +! + if(r >= 0.d0 .and. r <= RICB) then + drhodr=-2.0d0*8.8381d0*x + rho=13.0885d0-8.8381d0*x*x + vp=11.2622d0-6.3640d0*x*x + vs=3.6678d0-4.4475d0*x*x + Qmu=84.6d0 + Qkappa=1327.7d0 +! +!--- outer core +! + else if(r > RICB .and. r <= RCMB) then + drhodr=-1.2638d0-2.0d0*3.6426d0*x-3.0d0*5.5281d0*x*x + rho=12.5815d0-1.2638d0*x-3.6426d0*x*x-5.5281d0*x*x*x + vp=11.0487d0-4.0362d0*x+4.8023d0*x*x-13.5732d0*x*x*x + vs=0.0d0 + Qmu=0.0d0 + Qkappa=57827.0d0 +! +!--- D" at the base of the mantle +! + else if(r > RCMB .and. r <= RTOPDDOUBLEPRIME) then + drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + vp=15.3891d0-5.3181d0*x+5.5242d0*x*x-2.5514d0*x*x*x + vs=6.9254d0+1.4672d0*x-2.0834d0*x*x+0.9783d0*x*x*x + Qmu=312.0d0 + Qkappa=57827.0d0 +! +!--- mantle: from top of D" to d670 +! + else if(r > RTOPDDOUBLEPRIME .and. r <= R771) then + drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + vp=24.9520d0-40.4673d0*x+51.4832d0*x*x-26.6419d0*x*x*x + vs=11.1671d0-13.7818d0*x+17.4575d0*x*x-9.2777d0*x*x*x + Qmu=312.0d0 + Qkappa=57827.0d0 + else if(r > R771 .and. r <= R670) then + drhodr=-6.4761d0+2.0d0*5.5283d0*x-3.0d0*3.0807d0*x*x + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + vp=29.2766d0-23.6027d0*x+5.5242d0*x*x-2.5514d0*x*x*x + vs=22.3459d0-17.2473d0*x-2.0834d0*x*x+0.9783d0*x*x*x + Qmu=312.0d0 + Qkappa=57827.0d0 +! +!--- mantle: above d670 +! + else if(r > R670 .and. r <= R600) then + drhodr=-1.4836d0 + rho=5.3197d0-1.4836d0*x + vp=19.0957d0-9.8672d0*x + vs=9.9839d0-4.9324d0*x + Qmu=143.0d0 + Qkappa=57827.0d0 + else if(r > R600 .and. r <= R400) then + drhodr=-8.0298d0 + rho=11.2494d0-8.0298d0*x + vp=39.7027d0-32.6166d0*x + vs=22.3512d0-18.5856d0*x + Qmu=143.0d0 + Qkappa=57827.0d0 + else if(r > R400 .and. r <= R220) then + drhodr=-3.8045d0 + rho=7.1089d0-3.8045d0*x + vp=20.3926d0-12.2569d0*x + vs=8.9496d0-4.4597d0*x + Qmu=143.0d0 + Qkappa=57827.0d0 + else if(r > R220 .and. r <= R80) then + drhodr=0.6924d0 + rho=2.6910d0+0.6924d0*x + vp=4.1875d0+3.9382d0*x + vs=2.1519d0+2.3481d0*x + Qmu=80.0d0 + Qkappa=57827.0d0 + else + if(CRUSTAL) then + ! fill with PREM mantle and later add CRUST2.0 + if(r > R80) then + ! density/velocity from mantle just below moho + drhodr=0.6924d0 + rho=2.6910d0+0.6924d0*x + vp=4.1875d0+3.9382d0*x + vs=2.1519d0+2.3481d0*x + ! shear attenuation for R80 to surface + Qmu=600.0d0 + Qkappa=57827.0d0 + endif + else + ! use PREM crust + if(r > R80 .and. r <= RMOHO) then + drhodr=0.6924d0 + rho=2.6910d0+0.6924d0*x + vp=4.1875d0+3.9382d0*x + vs=2.1519d0+2.3481d0*x + Qmu=600.0d0 + Qkappa=57827.0d0 + + else if(r > RMOHO .and. r <= RMIDDLE_CRUST) then + drhodr=0.0d0 + rho=2.9d0 + vp=6.8d0 + vs=3.9d0 + Qmu=600.0d0 + Qkappa=57827.0d0 + + ! same properties everywhere in PREM crust if we decide to define only one layer in the crust + if(ONE_CRUST) then + drhodr=0.0d0 + rho=2.6d0 + vp=5.8d0 + vs=3.2d0 + Qmu=600.0d0 + Qkappa=57827.0d0 + endif + + else if(r > RMIDDLE_CRUST .and. r <= ROCEAN) then + drhodr=0.0d0 + rho=2.6d0 + vp=5.8d0 + vs=3.2d0 + Qmu=600.0d0 + Qkappa=57827.0d0 + ! for density profile for gravity, we do not check that r <= R_EARTH + else if(r > ROCEAN) then + drhodr=0.0d0 + rho=2.6d0 + vp=5.8d0 + vs=3.2d0 + Qmu=600.0d0 + Qkappa=57827.0d0 + + endif + endif + endif + +! non-dimensionalize +! time scaling (s^{-1}) is done with scaleval + scaleval=dsqrt(PI*GRAV*RHOAV) + drhodr=drhodr*1000.0d0/RHOAV + rho=rho*1000.0d0/RHOAV + vp=vp*1000.0d0/(R_EARTH*scaleval) + vs=vs*1000.0d0/(R_EARTH*scaleval) + + end subroutine model_prem_iso + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine prem_density(x,rho, & + RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN) + + implicit none + + include "constants.h" + + double precision x,rho,RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN + double precision r + + logical,parameter :: ONE_CRUST = .false. + + ! compute real physical radius in meters + r = x * R_EARTH + + ! calculates density according to radius + if(r <= RICB) then + rho=13.0885d0-8.8381d0*x*x + else if(r > RICB .and. r <= RCMB) then + rho=12.5815d0-1.2638d0*x-3.6426d0*x*x-5.5281d0*x*x*x + else if(r > RCMB .and. r <= RTOPDDOUBLEPRIME) then + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + else if(r > RTOPDDOUBLEPRIME .and. r <= R771) then + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + else if(r > R771 .and. r <= R670) then + rho=7.9565d0-6.4761d0*x+5.5283d0*x*x-3.0807d0*x*x*x + else if(r > R670 .and. r <= R600) then + rho=5.3197d0-1.4836d0*x + else if(r > R600 .and. r <= R400) then + rho=11.2494d0-8.0298d0*x + else if(r > R400 .and. r <= R220) then + rho=7.1089d0-3.8045d0*x + else if(r > R220 .and. r <= R80) then + rho=2.6910d0+0.6924d0*x + else + if(r > R80 .and. r <= RMOHO) then + rho=2.6910d0+0.6924d0*x + else if(r > RMOHO .and. r <= RMIDDLE_CRUST) then + if(ONE_CRUST) then + rho=2.6d0 + else + rho=2.9d0 + endif + else if(r > RMIDDLE_CRUST .and. r <= ROCEAN) then + rho=2.6d0 + else if(r > ROCEAN) then + rho=2.6d0 + endif + endif + + rho=rho*1000.0d0/RHOAV + + end subroutine prem_density + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine intgrl(sum,r,nir,ner,f,s1,s2,s3) + +! Computes the integral of f[i]*r[i]*r[i] from i=nir to i=ner for +! radii values as in model PREM_an640 + + implicit none + +! Argument variables + integer ner,nir + double precision f(640),r(640),s1(640),s2(640) + double precision s3(640),sum + +! Local variables + double precision, parameter :: third = 1.0d0/3.0d0 + double precision, parameter :: fifth = 1.0d0/5.0d0 + double precision, parameter :: sixth = 1.0d0/6.0d0 + + double precision rji,yprime(640) + double precision s1l,s2l,s3l + + integer i,j,n,kdis(28) + integer ndis,nir1 + + data kdis/163,323,336,517,530,540,565,590,609,619,626,633,16*0/ + + ndis = 12 + n = 640 + + call deriv(f,yprime,n,r,ndis,kdis,s1,s2,s3) + nir1 = nir + 1 + sum = 0.0d0 + do i=nir1,ner + j = i-1 + rji = r(i) - r(j) + s1l = s1(j) + s2l = s2(j) + s3l = s3(j) + sum = sum + r(j)*r(j)*rji*(f(j) & + + rji*(0.5d0*s1l + rji*(third*s2l + rji*0.25d0*s3l))) & + + 2.0d0*r(j)*rji*rji*(0.5d0*f(j) + rji*(third*s1l + rji*(0.25d0*s2l + rji*fifth*s3l))) & + + rji*rji*rji*(third*f(j) + rji*(0.25d0*s1l + rji*(fifth*s2l + rji*sixth*s3l))) + enddo + + end subroutine intgrl + +! +!------------------------------------------------------------------------------------------------- +! + + subroutine deriv(y,yprime,n,r,ndis,kdis,s1,s2,s3) + + implicit none + +! Argument variables + integer kdis(28),n,ndis + double precision r(n),s1(n),s2(n),s3(n) + double precision y(n),yprime(n) + +! Local variables + integer i,j,j1,j2 + integer k,nd,ndp + double precision a0,b0,b1 + double precision f(3,1000),h,h2,h2a + double precision h2b,h3a,ha,s13 + double precision s21,s32,yy(3) + + yy(1) = 0.d0 + yy(2) = 0.d0 + yy(3) = 0.d0 + + ndp=ndis+1 + do 3 nd=1,ndp + if(nd == 1) goto 4 + if(nd == ndp) goto 5 + j1=kdis(nd-1)+1 + j2=kdis(nd)-2 + goto 6 + 4 j1=1 + j2=kdis(1)-2 + goto 6 + 5 j1=kdis(ndis)+1 + j2=n-2 + 6 if((j2+1-j1)>0) goto 11 + j2=j2+2 + yy(1)=(y(j2)-y(j1))/(r(j2)-r(j1)) + s1(j1)=yy(1) + s1(j2)=yy(1) + s2(j1)=yy(2) + s2(j2)=yy(2) + s3(j1)=yy(3) + s3(j2)=yy(3) + goto 3 + 11 a0=0.0d0 + if(j1 == 1) goto 7 + h=r(j1+1)-r(j1) + h2=r(j1+2)-r(j1) + yy(1)=h*h2*(h2-h) + h=h*h + h2=h2*h2 + b0=(y(j1)*(h-h2)+y(j1+1)*h2-y(j1+2)*h)/yy(1) + goto 8 + 7 b0=0.0d0 + 8 b1=b0 + + if(j2 > 1000) stop 'error in subroutine deriv for j2' + + do i=j1,j2 + h=r(i+1)-r(i) + yy(1)=y(i+1)-y(i) + h2=h*h + ha=h-a0 + h2a=h-2.0d0*a0 + h3a=2.0d0*h-3.0d0*a0 + h2b=h2*b0 + s1(i)=h2/ha + s2(i)=-ha/(h2a*h2) + s3(i)=-h*h2a/h3a + f(1,i)=(yy(1)-h*b0)/(h*ha) + f(2,i)=(h2b-yy(1)*(2.0d0*h-a0))/(h*h2*h2a) + f(3,i)=-(h2b-3.0d0*yy(1)*ha)/(h*h3a) + a0=s3(i) + b0=f(3,i) + enddo + + i=j2+1 + h=r(i+1)-r(i) + yy(1)=y(i+1)-y(i) + h2=h*h + ha=h-a0 + h2a=h*ha + h2b=h2*b0-yy(1)*(2.d0*h-a0) + s1(i)=h2/ha + f(1,i)=(yy(1)-h*b0)/h2a + ha=r(j2)-r(i+1) + yy(1)=-h*ha*(ha+h) + ha=ha*ha + yy(1)=(y(i+1)*(h2-ha)+y(i)*ha-y(j2)*h2)/yy(1) + s3(i)=(yy(1)*h2a+h2b)/(h*h2*(h-2.0d0*a0)) + s13=s1(i)*s3(i) + s2(i)=f(1,i)-s13 + + do j=j1,j2 + k=i-1 + s32=s3(k)*s2(i) + s1(i)=f(3,k)-s32 + s21=s2(k)*s1(i) + s3(k)=f(2,k)-s21 + s13=s1(k)*s3(k) + s2(k)=f(1,k)-s13 + i=k + enddo + + s1(i)=b1 + j2=j2+2 + s1(j2)=yy(1) + s2(j2)=yy(2) + s3(j2)=yy(3) + 3 continue + + do i=1,n + yprime(i)=s1(i) + enddo + + end subroutine deriv + +! +!------------------------------------------------------------------------------------------------- +! + +! compute spline coefficients + + subroutine spline_construction(xpoint,ypoint,npoint,tangent_first_point,tangent_last_point,spline_coefficients) + + implicit none + +! tangent to the spline imposed at the first and last points + double precision, intent(in) :: tangent_first_point,tangent_last_point + +! number of input points and coordinates of the input points + integer, intent(in) :: npoint + double precision, dimension(npoint), intent(in) :: xpoint,ypoint + +! spline coefficients output by the routine + double precision, dimension(npoint), intent(out) :: spline_coefficients + + integer :: i + + double precision, dimension(:), allocatable :: temporary_array + + allocate(temporary_array(npoint)) + + spline_coefficients(1) = - 1.d0 / 2.d0 + + temporary_array(1) = (3.d0/(xpoint(2)-xpoint(1)))*((ypoint(2)-ypoint(1))/(xpoint(2)-xpoint(1))-tangent_first_point) + + do i = 2,npoint-1 + + spline_coefficients(i) = ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))-1.d0) & + / ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*spline_coefficients(i-1)+2.d0) + + temporary_array(i) = (6.d0*((ypoint(i+1)-ypoint(i))/(xpoint(i+1)-xpoint(i)) & + - (ypoint(i)-ypoint(i-1))/(xpoint(i)-xpoint(i-1)))/(xpoint(i+1)-xpoint(i-1)) & + - (xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*temporary_array(i-1)) & + / ((xpoint(i)-xpoint(i-1))/(xpoint(i+1)-xpoint(i-1))*spline_coefficients(i-1)+2.d0) + + enddo + + spline_coefficients(npoint) = ((3.d0/(xpoint(npoint)-xpoint(npoint-1))) & + * (tangent_last_point-(ypoint(npoint)-ypoint(npoint-1))/(xpoint(npoint)-xpoint(npoint-1))) & + - 1.d0/2.d0*temporary_array(npoint-1))/(1.d0/2.d0*spline_coefficients(npoint-1)+1.d0) + + do i = npoint-1,1,-1 + spline_coefficients(i) = spline_coefficients(i)*spline_coefficients(i+1) + temporary_array(i) + enddo + + deallocate(temporary_array) + + end subroutine spline_construction + +! +!------------------------------------------------------------------------------------------------- +! + +! evaluate a spline + + subroutine spline_evaluation(xpoint,ypoint,spline_coefficients,npoint,x_evaluate_spline,y_spline_obtained) + + implicit none + +! number of input points and coordinates of the input points + integer, intent(in) :: npoint + double precision, dimension(npoint), intent(in) :: xpoint,ypoint + +! spline coefficients to use + double precision, dimension(npoint), intent(in) :: spline_coefficients + +! abscissa at which we need to evaluate the value of the spline + double precision, intent(in):: x_evaluate_spline + +! ordinate evaluated by the routine for the spline at this abscissa + double precision, intent(out):: y_spline_obtained + + integer :: index_loop,index_lower,index_higher + + double precision :: coef1,coef2 + +! initialize to the whole interval + index_lower = 1 + index_higher = npoint + +! determine the right interval to use, by dichotomy + do while (index_higher - index_lower > 1) +! compute the middle of the interval + index_loop = (index_higher + index_lower) / 2 + if(xpoint(index_loop) > x_evaluate_spline) then + index_higher = index_loop + else + index_lower = index_loop + endif + enddo + +! test that the interval obtained does not have a size of zero +! (this could happen for instance in the case of duplicates in the input list of points) + if(xpoint(index_higher) == xpoint(index_lower)) stop 'incorrect interval found in spline evaluation' + + coef1 = (xpoint(index_higher) - x_evaluate_spline) / (xpoint(index_higher) - xpoint(index_lower)) + coef2 = (x_evaluate_spline - xpoint(index_lower)) / (xpoint(index_higher) - xpoint(index_lower)) + + y_spline_obtained = coef1*ypoint(index_lower) + coef2*ypoint(index_higher) + & + ((coef1**3 - coef1)*spline_coefficients(index_lower) + & + (coef2**3 - coef2)*spline_coefficients(index_higher))*((xpoint(index_higher) - xpoint(index_lower))**2)/6.d0 + + end subroutine spline_evaluation + + diff --git a/src/specfem3D/noise_tomography.f90 b/src/specfem3D/noise_tomography.f90 index 3ea63bbc2..9f50482e6 100644 --- a/src/specfem3D/noise_tomography.f90 +++ b/src/specfem3D/noise_tomography.f90 @@ -43,14 +43,54 @@ ! ============================================================================================================= ! ============================================================================================================= +module user_noise_distribution + +!daniel: TODO -- setting USE_PIERO_DISTRIBUTION = .true. will produce errors +! when using with the default example in "example/noise_tomography/" +! i left it here so that Max can run his example without changing this every time... + logical,parameter :: USE_PIERO_DISTRIBUTION = .true. + +contains + +! wrapper function +! this subroutine must be modified by USERS for their own noise distribution + + subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) + implicit none + include "constants.h" + ! input parameters + real(kind=CUSTOM_REAL) :: xcoord_in,ycoord_in,zcoord_in + ! output parameters + real(kind=CUSTOM_REAL) :: normal_x_noise_out,normal_y_noise_out,normal_z_noise_out,mask_noise_out + + ! Setup for NOISE_TOMOGRAPHY by Piero Basini + if( USE_PIERO_DISTRIBUTION ) then + call noise_distribution_dir_non_uni(xcoord_in,ycoord_in,zcoord_in, & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) + else + ! DEFAULT routine + call noise_distribution_direction_d(xcoord_in,ycoord_in,zcoord_in, & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) + endif + + end subroutine noise_distribution_direction + + ! + !----------------------------------------------------------------------------------------------- + ! + ! characterizes noise statistics: ! for a given point (xcoord,ycoord,zcoord), specify the noise direction "normal_x/y/z_noise" ! and noise distribution "mask_noise" ! ! USERS: need to modify this subroutine for their own noise characteristics - subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, & - normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & - mask_noise_out) + subroutine noise_distribution_direction_d(xcoord_in,ycoord_in,zcoord_in, & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) implicit none include "constants.h" ! input parameters @@ -78,7 +118,100 @@ subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, & ldummy = ycoord_in ldummy = zcoord_in - end subroutine noise_distribution_direction + end subroutine noise_distribution_direction_d + + ! + !----------------------------------------------------------------------------------------------- + ! + + subroutine noise_distribution_dir_non_uni(xcoord_in,ycoord_in,zcoord_in, & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) + implicit none + include "constants.h" + ! input parameters + real(kind=CUSTOM_REAL) :: xcoord_in,ycoord_in,zcoord_in + ! output parameters + real(kind=CUSTOM_REAL) :: normal_x_noise_out,normal_y_noise_out,normal_z_noise_out,mask_noise_out + ! local parameters + !PB VARIABLES TO DEFINE THE REGION OF NOISE + real(kind=CUSTOM_REAL) :: xcoord,ycoord,zcoord !,xcoord_center,ycoord_center + real :: lon,lat,colat,lon_cn,lat_cn,dsigma,d,dmax + + ! coordinates "x/y/zcoord_in" actually contain r theta phi, therefore convert back to x y z + ! call rthetaphi_2_xyz(xcoord,ycoord,zcoord, xcoord_in,ycoord_in,zcoord_in) + xcoord=xcoord_in + ycoord=ycoord_in + zcoord=zcoord_in + + !PB NOT UNIF DISTRIBUTION OF NOISE ON THE SURFACE OF A SPHERE + !PB lon lat colat ARE IN RADIANS SINCE ARE OBTAINED FROM CARTESIAN COORDINATES + !PB lon_cn lat_cn (cn = CENTER OF NOISE REGION) IF NOT, MUST BE CONVERTED IN RADIANS + !PB lon_cn lat_cn ARE INSERTED DIRECTLY HERE FOR SIMPLICITY + + lon_cn = (3.89)*PI/180 + lat_cn = (45.113)*PI/180 + + if (xcoord >= 0) then + lon=asin(ycoord/(sqrt(xcoord**2+ycoord**2))) + else + lon=(PI-(asin(ycoord/(sqrt(xcoord**2+ycoord**2))))) + endif + colat=atan(sqrt(xcoord**2+ycoord**2)/zcoord) + lat=(PI/2)-colat + + !PB CALCULATE THE DISTANCE BETWEEN CENTER OF NOISE REGION AND EACH + ! POINT OF THE MODEL'S FREE SURFACE !PB dsigma IS THE "3D" ANGLE BETWEEN + ! THE TWO POINTS, THEN d = R*dsigma + dsigma=acos(sin(lon)*sin(lon_cn)+cos(lon)*cos(lon_cn)*cos(lat-lat_cn)) + d=sqrt(xcoord**2+ycoord**2+zcoord**2)*dsigma + + !PB IF YOU WANT TO USE A NONUNIFORM DISTRIBUTION OF NOISE IN THE EXAMPLE + !PROVIDED WITH THE CODE, THEN UNCOMMENT THE FOLLOWING LINES (before definition + !of dmax) + + ! xcoord_center = 30000 + ! ycoord_center = 30000 + ! d = sqrt((xcoord_center-xcoord)**2+(ycoord_center-ycoord)**2) + + !PB NOTE THAT d IS EXPRESSED IN METERS REMEBER THAT WHEN YOU SET THE PARAMETER dmax + !PB dmax IS THE RADIUS OF THE AREA IN WHICH masc_noise_out IS 1 (NOISE IS DEFINED) + + dmax = 300000 + + ! NOTE that all coordinates are non-dimensionalized in GLOBAL package! + ! USERS are free to choose which set to use, + ! either "r theta phi" (xcoord_in,ycoord_in,zcoord_in) + ! or "x y z" (xcoord,ycoord,zcoord) + + + !***************************************************************************************************************** + !******************************** change your noise characteristics below **************************************** + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! noise direction + !!!!! here, the noise is assumed to be vertical (SESAME) + normal_x_noise_out = 0.0 + normal_y_noise_out = 0.0 + normal_z_noise_out = 1.0 + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! noise distribution + !!!!! here, the noise is assumed to be uniform + ! mask_noise_out = 1.0 + + !HERE IS NOT UNIFORM + if (d <= dmax) then + mask_noise_out = 1.0 + else + mask_noise_out = 0.0 + endif + + !******************************** change your noise characteristics above **************************************** + !***************************************************************************************************************** + + end subroutine noise_distribution_dir_non_uni + + +end module user_noise_distribution + + ! ============================================================================================================= ! ============================================================================================================= @@ -94,6 +227,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, & NSPEC_AB_VAL,NGLOB_AB_VAL, & num_free_surface_faces,free_surface_ispec,free_surface_ijk, & ispec_is_acoustic) + use user_noise_distribution implicit none include "constants.h" ! input parameters @@ -116,7 +250,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, & logical, dimension(NSPEC_AB_VAL) :: ispec_is_acoustic -!daniel: from global code... + !from global code... !integer, dimension(NSPEC2D_TOP_VAL) :: ibelm_top ! equals free_surface_ispec !integer :: NSPEC2D_TOP_VAL ! equals num_free_surface_faces !integer :: nspec_top ! equals num_free_surface_faces @@ -131,7 +265,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, & character(len=256) :: filename ! read master receiver ID -- the ID in "STATIONS" - filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/irec_master_noise' + filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/irec_master_noise' open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios) if( ios /= 0 ) & call exit_MPI(myrank, 'file '//trim(filename)//' does NOT exist! This file contains the ID of the master receiver') @@ -163,7 +297,7 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, & ! noise distribution and noise direction ipoin = 0 - !daniel: from global code, carefull: ngllz must not be face on top... + !from global code, carefull: ngllz must not be face on top... ! do ispec2D = 1, nspec_top ! ispec = ibelm_top(ispec2D) ! k = NGLLZ @@ -187,11 +321,13 @@ subroutine read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, & ipoin = ipoin + 1 iglob = ibool(i,j,k,ispec) - ! this subroutine must be modified by USERS + + ! this subroutine must be modified by USERS in module user_noise_distribution call noise_distribution_direction(xstore(iglob), & - ystore(iglob),zstore(iglob), & - normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & - mask_noise_out) + ystore(iglob),zstore(iglob), & + normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, & + mask_noise_out) + normal_x_noise(ipoin) = normal_x_noise_out normal_y_noise(ipoin) = normal_y_noise_out normal_z_noise(ipoin) = normal_z_noise_out @@ -314,22 +450,34 @@ subroutine check_parameters_noise(myrank,NOISE_TOMOGRAPHY,SIMULATION_TYPE,SAVE_F ! size of single record reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP - ! total file size - filesize = reclen - filesize = filesize*NSTEP - - write(outputname,"('/proc',i6.6,'_surface_movie')") myrank - if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), & - len_trim(trim(LOCAL_PATH)//trim(outputname)), & - filesize) - if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), & - len_trim(trim(LOCAL_PATH)//trim(outputname)), & - filesize) - if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), & - len_trim(trim(LOCAL_PATH)//trim(outputname)), & - filesize) - endif + ! only open files if there are surface faces in this paritition + if(NSPEC_TOP .gt. 0) then + + ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer + if( NSPEC_TOP > 2147483647 / (CUSTOM_REAL * NGLLSQUARE * NDIM) ) then + print *,'reclen of noise surface_movie needed exceeds integer 4-byte limit: ',reclen + print *,' ',CUSTOM_REAL, NDIM, NGLLSQUARE, NSPEC_TOP + print*,'bit size fortran: ',bit_size(NSPEC_TOP) + call exit_MPI(myrank,"error NSPEC_TOP integer limit") + endif + + ! total file size + filesize = reclen + filesize = filesize*NSTEP + + write(outputname,"('/proc',i6.6,'_surface_movie')") myrank + if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), & + len_trim(trim(LOCAL_PATH)//trim(outputname)), & + filesize) + if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), & + len_trim(trim(LOCAL_PATH)//trim(outputname)), & + filesize) + if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), & + len_trim(trim(LOCAL_PATH)//trim(outputname)), & + filesize) + endif + endif end subroutine check_parameters_noise ! ============================================================================================================= @@ -364,7 +512,7 @@ subroutine compute_arrays_source_noise(myrank, & noise_src(:) = 0._CUSTOM_REAL ! noise file (source time function) - filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/S_squared' + filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/S_squared' open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios) if( ios /= 0 .and. myrank == 0 ) & call exit_MPI(myrank, 'file '//trim(filename)//' does NOT exist! This file should have been generated using Matlab scripts') @@ -379,7 +527,7 @@ subroutine compute_arrays_source_noise(myrank, & ! master receiver component direction, \nu_master - filename = trim(OUTPUT_FILES_PATH)//'/../OUTPUT_FILES/NOISE_TOMOGRAPHY/nu_master' + filename = trim(OUTPUT_FILES_PATH)//'/..//NOISE_TOMOGRAPHY/nu_master' open(unit=IIN_NOISE,file=trim(filename),status='old',action='read',iostat=ios) if( ios /= 0 .and. myrank == 0 ) & call exit_MPI(myrank,& @@ -483,7 +631,8 @@ subroutine noise_save_surface_movie(displ, & ibool, & noise_surface_movie,it, & NSPEC_AB_VAL,NGLOB_AB_VAL, & - num_free_surface_faces,free_surface_ispec,free_surface_ijk) + num_free_surface_faces,free_surface_ispec,free_surface_ijk,& + Mesh_pointer,GPU_MODE) implicit none include "constants.h" ! input parameters @@ -505,25 +654,37 @@ subroutine noise_save_surface_movie(displ, & ! local parameters integer :: ispec,i,j,k,iglob,iface,igll real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie + integer(kind=8) :: Mesh_pointer + logical :: GPU_MODE + + ! writes out wavefield at surface + if( num_free_surface_faces > 0 ) then + + if(.NOT. GPU_MODE) then + ! loops over surface points + ! get coordinates of surface mesh and surface displacement + do iface = 1, num_free_surface_faces + + ispec = free_surface_ispec(iface) + + do igll = 1, NGLLSQUARE + i = free_surface_ijk(1,igll,iface) + j = free_surface_ijk(2,igll,iface) + k = free_surface_ijk(3,igll,iface) + + iglob = ibool(i,j,k,ispec) + noise_surface_movie(:,igll,iface) = displ(:,iglob) + enddo + enddo + ! TODO: Check if transfer_surface_to_hose is compatible with newer version above + else ! GPU_MODE == 1 + call transfer_surface_to_host(Mesh_pointer,noise_surface_movie) + endif - ! loops over surface points - ! get coordinates of surface mesh and surface displacement - do iface = 1, num_free_surface_faces - - ispec = free_surface_ispec(iface) - - do igll = 1, NGLLSQUARE - i = free_surface_ijk(1,igll,iface) - j = free_surface_ijk(2,igll,iface) - k = free_surface_ijk(3,igll,iface) - - iglob = ibool(i,j,k,ispec) - noise_surface_movie(:,igll,iface) = displ(:,iglob) - enddo - enddo + ! save surface motion to disk + call write_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) - ! save surface motion to disk - call write_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) + endif end subroutine noise_save_surface_movie @@ -544,7 +705,8 @@ subroutine noise_read_add_surface_movie(nmovie_points, & it, & NSPEC_AB_VAL,NGLOB_AB_VAL, & num_free_surface_faces,free_surface_ispec,free_surface_ijk, & - free_surface_jacobian2Dw) + free_surface_jacobian2Dw, & + Mesh_pointer,GPU_MODE,NOISE_TOMOGRAPHY) implicit none include "constants.h" ! input parameters @@ -572,39 +734,54 @@ subroutine noise_read_add_surface_movie(nmovie_points, & real(kind=CUSTOM_REAL) :: eta real(kind=CUSTOM_REAL), dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie - ! read surface movie - call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) + ! GPU_MODE parameters + integer(kind=8) :: Mesh_pointer + logical :: GPU_MODE + integer :: NOISE_TOMOGRAPHY - ! get coordinates of surface mesh and surface displacement - ipoin = 0 + ! reads in ensemble noise sources at surface + if( num_free_surface_faces > 0 ) then - ! loops over surface points - ! puts noise distrubution and direction onto the surface points - do iface = 1, num_free_surface_faces + ! read surface movie + call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) - ispec = free_surface_ispec(iface) + if(GPU_MODE) then + call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,NOISE_TOMOGRAPHY) + else ! GPU_MODE==0 - do igll = 1, NGLLSQUARE - i = free_surface_ijk(1,igll,iface) - j = free_surface_ijk(2,igll,iface) - k = free_surface_ijk(3,igll,iface) - - ipoin = ipoin + 1 - iglob = ibool(i,j,k,ispec) - - eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + & - noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + & - noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin) - - accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) & - * free_surface_jacobian2Dw(igll,iface) - accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) & - * free_surface_jacobian2Dw(igll,iface) - accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) & - * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface) - enddo + ! get coordinates of surface mesh and surface displacement + ipoin = 0 - enddo + ! loops over surface points + ! puts noise distrubution and direction onto the surface points + do iface = 1, num_free_surface_faces + + ispec = free_surface_ispec(iface) + + do igll = 1, NGLLSQUARE + i = free_surface_ijk(1,igll,iface) + j = free_surface_ijk(2,igll,iface) + k = free_surface_ijk(3,igll,iface) + + ipoin = ipoin + 1 + iglob = ibool(i,j,k,ispec) + + eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + & + noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + & + noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin) + + accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) & + * free_surface_jacobian2Dw(igll,iface) + accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) & + * free_surface_jacobian2Dw(igll,iface) + accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) & + * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface) + enddo + + enddo + endif ! GPU_MODE + + endif end subroutine noise_read_add_surface_movie @@ -620,7 +797,8 @@ subroutine compute_kernels_strength_noise(nmovie_points,ibool, & normal_x_noise,normal_y_noise,normal_z_noise, & noise_surface_movie, & NSPEC_AB_VAL,NGLOB_AB_VAL, & - num_free_surface_faces,free_surface_ispec,free_surface_ijk) + num_free_surface_faces,free_surface_ispec,free_surface_ijk, & + GPU_MODE,Mesh_pointer) implicit none include "constants.h" ! input parameters @@ -650,39 +828,54 @@ subroutine compute_kernels_strength_noise(nmovie_points,ibool, & real(kind=CUSTOM_REAL) :: eta real(kind=CUSTOM_REAL), dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie - ! read surface movie, needed for Sigma_kl - call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) + ! GPU_MODE parameters + integer(kind=8) :: Mesh_pointer + logical :: GPU_MODE - ! noise source strength kernel - ! to keep similar structure to other kernels, the source strength kernel is saved as a volumetric kernel - ! but only updated at the surface, because the noise is generated there - ipoin = 0 + ! updates contribution to noise strength kernel + if( num_free_surface_faces > 0 ) then - ! loops over surface points - ! puts noise distrubution and direction onto the surface points - do iface = 1, num_free_surface_faces + ! read surface movie, needed for Sigma_kl + call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it) - ispec = free_surface_ispec(iface) + if(.NOT. GPU_MODE) then - do igll = 1, NGLLSQUARE - i = free_surface_ijk(1,igll,iface) - j = free_surface_ijk(2,igll,iface) - k = free_surface_ijk(3,igll,iface) + ! noise source strength kernel + ! to keep similar structure to other kernels, the source strength kernel is saved as a volumetric kernel + ! but only updated at the surface, because the noise is generated there + ipoin = 0 - ipoin = ipoin + 1 - iglob = ibool(i,j,k,ispec) + ! loops over surface points + ! puts noise distrubution and direction onto the surface points + do iface = 1, num_free_surface_faces - eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + & - noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + & - noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin) + ispec = free_surface_ispec(iface) - Sigma_kl(i,j,k,ispec) = Sigma_kl(i,j,k,ispec) & - + deltat * eta * ( normal_x_noise(ipoin) * displ(1,iglob) & - + normal_y_noise(ipoin) * displ(2,iglob) & - + normal_z_noise(ipoin) * displ(3,iglob) ) - enddo + do igll = 1, NGLLSQUARE + i = free_surface_ijk(1,igll,iface) + j = free_surface_ijk(2,igll,iface) + k = free_surface_ijk(3,igll,iface) - enddo + ipoin = ipoin + 1 + iglob = ibool(i,j,k,ispec) + + eta = noise_surface_movie(1,igll,iface) * normal_x_noise(ipoin) + & + noise_surface_movie(2,igll,iface) * normal_y_noise(ipoin) + & + noise_surface_movie(3,igll,iface) * normal_z_noise(ipoin) + + Sigma_kl(i,j,k,ispec) = Sigma_kl(i,j,k,ispec) & + + deltat * eta * ( normal_x_noise(ipoin) * displ(1,iglob) & + + normal_y_noise(ipoin) * displ(2,iglob) & + + normal_z_noise(ipoin) * displ(3,iglob) ) + enddo + + enddo + + else ! GPU_MODE==1 + call compute_kernels_strgth_noise_cu(Mesh_pointer,noise_surface_movie,deltat) + endif ! GPU_MODE + + endif end subroutine compute_kernels_strength_noise diff --git a/src/specfem3D/prepare_timerun.f90 b/src/specfem3D/prepare_timerun.F90 similarity index 59% rename from src/specfem3D/prepare_timerun.f90 rename to src/specfem3D/prepare_timerun.F90 index 7738a3377..ebe21d418 100644 --- a/src/specfem3D/prepare_timerun.f90 +++ b/src/specfem3D/prepare_timerun.F90 @@ -75,6 +75,13 @@ subroutine prepare_timerun() write(IMAIN,*) 'no oceans' endif + write(IMAIN,*) + if(GRAVITY) then + write(IMAIN,*) 'incorporating gravity' + else + write(IMAIN,*) 'no gravity' + endif + write(IMAIN,*) if(ACOUSTIC_SIMULATION) then write(IMAIN,*) 'incorporating acoustic simulation' @@ -176,6 +183,9 @@ subroutine prepare_timerun() ! prepares attenuation arrays call prepare_timerun_attenuation() + ! prepares gravity arrays + call prepare_timerun_gravity() + ! initializes PML arrays if( ABSORBING_CONDITIONS ) then if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML ) then @@ -210,10 +220,11 @@ subroutine prepare_timerun() write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds' write(IMAIN,*) 'start time:',sngl(-t0),' seconds' write(IMAIN,*) - - !debug: time estimation + + !daniel debug: time estimation ! elastic elements: time per element t_per_element = 1.40789368e-05 s ! total time = nspec * nstep * t_per_element + endif ! prepares ADJOINT simulations @@ -222,6 +233,14 @@ subroutine prepare_timerun() ! prepares noise simulations call prepare_timerun_noise() + ! prepares GPU arrays + if(GPU_MODE) call prepare_timerun_GPU() + +#ifdef OPENMP_MODE + ! prepares arrays for OpenMP + call prepare_timerun_OpenMP() +#endif + end subroutine prepare_timerun ! @@ -416,6 +435,100 @@ subroutine prepare_timerun_attenuation() end subroutine prepare_timerun_attenuation +! +!------------------------------------------------------------------------------------------------- +! + + subroutine prepare_timerun_gravity() + +! precomputes gravity factors + + use specfem_par + use specfem_par_acoustic + use specfem_par_elastic + use specfem_par_poroelastic + implicit none + + ! local parameters + double precision RICB,RCMB,RTOPDDOUBLEPRIME, & + R80,R220,R400,R600,R670,R771,RMOHO,RMIDDLE_CRUST,ROCEAN + double precision :: rspl_gravity(NR),gspl(NR),gspl2(NR) + double precision :: radius,g,dg ! radius_km + !double precision :: g_cmb_dble,g_icb_dble + double precision :: rho,drhodr,vp,vs,Qkappa,Qmu + integer :: nspl_gravity !int_radius + integer :: i,j,k,iglob,ier + + ! sets up weights needed for integration of gravity + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + wgll_cube(i,j,k) = sngl( wxgll(i)*wygll(j)*wzgll(k) ) + enddo + enddo + enddo + + ! store g, rho and dg/dr=dg using normalized radius in lookup table every 100 m + ! get density and velocity from PREM model using dummy doubling flag + ! this assumes that the gravity perturbations are small and smooth + ! and that we can neglect the 3D model and use PREM every 100 m in all cases + ! this is probably a rather reasonable assumption + if(GRAVITY) then + + ! allocates gravity arrays + allocate( minus_deriv_gravity(NGLOB_AB), & + minus_g(NGLOB_AB), stat=ier) + if( ier /= 0 ) stop 'error allocating gravity arrays' + + ! sets up spline table + call make_gravity(nspl_gravity,rspl_gravity,gspl,gspl2, & + ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670, & + R771,RTOPDDOUBLEPRIME,RCMB,RICB) + + ! pre-calculates gravity terms for all global points + do iglob = 1,NGLOB_AB + + ! normalized radius ( zstore values given in m, negative values for depth) + radius = ( R_EARTH + zstore(iglob) ) / R_EARTH + call spline_evaluation(rspl_gravity,gspl,gspl2,nspl_gravity,radius,g) + + ! use PREM density profile to calculate gravity (fine for other 1D models) + call model_prem_iso(radius,rho,drhodr,vp,vs,Qkappa,Qmu, & + RICB,RCMB,RTOPDDOUBLEPRIME, & + R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN) + + dg = 4.0d0*rho - 2.0d0*g/radius + + ! re-dimensionalize + g = g * R_EARTH*(PI*GRAV*RHOAV) ! in m / s^2 ( should be around 10 m/s^2 ) + dg = dg * R_EARTH*(PI*GRAV*RHOAV) / R_EARTH ! gradient d/dz g , in 1/s^2 + + minus_deriv_gravity(iglob) = - dg + minus_g(iglob) = - g ! in negative z-direction + + ! debug + !if( iglob == 1 .or. iglob == 1000 .or. iglob == 10000 ) then + ! ! re-dimensionalize + ! radius = radius * R_EARTH ! in m + ! vp = vp * R_EARTH*dsqrt(PI*GRAV*RHOAV) ! in m / s + ! rho = rho * RHOAV ! in kg / m^3 + ! print*,'gravity: radius=',radius,'g=',g,'depth=',radius-R_EARTH + ! print*,'vp=',vp,'rho=',rho,'kappa=',(vp**2) * rho + ! print*,'minus_g..=',minus_g(iglob) + !endif + enddo + + else + + ! allocates dummy gravity arrays + allocate( minus_deriv_gravity(0), & + minus_g(0), stat=ier) + if( ier /= 0 ) stop 'error allocating gravity arrays' + + endif + + end subroutine prepare_timerun_gravity + ! !------------------------------------------------------------------------------------------------- @@ -553,6 +666,14 @@ subroutine prepare_timerun_adjoint() ! size of single record b_reclen_field = CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces + ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer + if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NDIM * NGLLSQUARE) ) then + print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_field + print *,' ',CUSTOM_REAL, NDIM, NGLLSQUARE, num_abs_boundary_faces + print*,'bit size fortran: ',bit_size(b_reclen_field) + call exit_MPI(myrank,"error b_reclen_field integer limit") + endif + ! total file size filesize = b_reclen_field filesize = filesize*NSTEP @@ -594,12 +715,20 @@ subroutine prepare_timerun_adjoint() ! size of single record b_reclen_potential = CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces + ! check integer size limit: size of b_reclen_potential must fit onto an 4-byte integer + if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NGLLSQUARE) ) then + print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_potential + print *,' ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces + print*,'bit size fortran: ',bit_size(b_reclen_potential) + call exit_MPI(myrank,"error b_reclen_potential integer limit") + endif + ! total file size (two lines to implicitly convert to 8-byte integers) filesize = b_reclen_potential filesize = filesize*NSTEP - ! daniel: debug check size limit - !if( NSTEP > 2147483648 / b_reclen_potential ) then + ! debug check size limit + !if( NSTEP > 2147483647 / b_reclen_potential ) then ! print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP ! print *,' ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP ! print*,'file size fortran: ',filesize @@ -633,9 +762,8 @@ subroutine prepare_timerun_adjoint() endif endif - else - ! dummy array + ! needs dummy array b_num_abs_boundary_faces = 1 if( ELASTIC_SIMULATION ) then allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier) @@ -646,10 +774,22 @@ subroutine prepare_timerun_adjoint() allocate(b_absorb_potential(NGLLSQUARE,b_num_abs_boundary_faces),stat=ier) if( ier /= 0 ) stop 'error allocating array b_absorb_potential' endif + endif + else ! ABSORBING_CONDITIONS + ! needs dummy array + b_num_abs_boundary_faces = 1 + if( ELASTIC_SIMULATION ) then + allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array b_absorb_field' + endif + if( ACOUSTIC_SIMULATION ) then + allocate(b_absorb_potential(NGLLSQUARE,b_num_abs_boundary_faces),stat=ier) + if( ier /= 0 ) stop 'error allocating array b_absorb_potential' endif endif + end subroutine prepare_timerun_adjoint ! @@ -674,7 +814,8 @@ subroutine prepare_timerun_noise() ! checks if free surface is defined if( num_free_surface_faces == 0 ) then - stop 'error: noise simulations need a free surface' + write(*,*) myrank, " doesn't have a free_surface_face" + ! stop 'error: noise simulations need a free surface' endif ! allocates arrays @@ -719,3 +860,258 @@ subroutine prepare_timerun_noise() end subroutine prepare_timerun_noise +! +!------------------------------------------------------------------------------------------------- +! + + subroutine prepare_timerun_GPU() + + use specfem_par + use specfem_par_acoustic + use specfem_par_elastic + use specfem_par_poroelastic + use specfem_par_movie + + implicit none + real :: free_mb,used_mb,total_mb + + ! GPU_MODE now defined in Par_file + if(myrank == 0 ) then + write(IMAIN,*) + write(IMAIN,*) "GPU Preparing Fields and Constants on Device." + write(IMAIN,*) + endif + + ! prepares general fields on GPU + call prepare_constants_device(Mesh_pointer, & + NGLLX, NSPEC_AB, NGLOB_AB, & + xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, & + kappastore, mustore,ibool, & + num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, & + nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, & + hprime_xx, hprime_yy, hprime_zz, & + hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, & + wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, & + ABSORBING_CONDITIONS, & + abs_boundary_ispec, abs_boundary_ijk, & + abs_boundary_normal, & + abs_boundary_jacobian2Dw, & + num_abs_boundary_faces, & + ispec_is_inner, & + NSOURCES, nsources_local, & + sourcearrays, islice_selected_source, ispec_selected_source, & + number_receiver_global, ispec_selected_rec, & + nrec, nrec_local, & + SIMULATION_TYPE, & + USE_MESH_COLORING_GPU, & + nspec_acoustic,nspec_elastic,& + my_neighbours_ext_mesh,& + request_send_vector_ext_mesh,& + request_recv_vector_ext_mesh,& + buffer_recv_vector_ext_mesh) + + + ! prepares fields on GPU for acoustic simulations + if( ACOUSTIC_SIMULATION ) then + call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, & + num_phase_ispec_acoustic,phase_ispec_inner_acoustic, & + ispec_is_acoustic, & + NOISE_TOMOGRAPHY,num_free_surface_faces, & + free_surface_ispec,free_surface_ijk, & + ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, & + ELASTIC_SIMULATION, num_coupling_ac_el_faces, & + coupling_ac_el_ispec,coupling_ac_el_ijk, & + coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, & + num_colors_outer_acoustic,num_colors_inner_acoustic, & + num_elem_colors_acoustic) + + if( SIMULATION_TYPE == 3 ) & + call prepare_fields_acoustic_adj_dev(Mesh_pointer, & + SIMULATION_TYPE, & + APPROXIMATE_HESS_KL) + + endif + + ! prepares fields on GPU for elastic simulations + if( ELASTIC_SIMULATION ) then + call prepare_fields_elastic_device(Mesh_pointer, NDIM*NGLOB_AB, & + rmass,rho_vp,rho_vs, & + num_phase_ispec_elastic,phase_ispec_inner_elastic, & + ispec_is_elastic, & + ABSORBING_CONDITIONS,b_absorb_field,b_reclen_field, & + SIMULATION_TYPE,SAVE_FORWARD, & + COMPUTE_AND_STORE_STRAIN, & + epsilondev_xx,epsilondev_yy,epsilondev_xy, & + epsilondev_xz,epsilondev_yz, & + ATTENUATION, & + size(R_xx), & + R_xx,R_yy,R_xy,R_xz,R_yz, & + one_minus_sum_beta,factor_common, & + alphaval,betaval,gammaval, & + OCEANS,rmass_ocean_load, & + NOISE_TOMOGRAPHY, & + free_surface_normal,free_surface_ispec,free_surface_ijk, & + num_free_surface_faces, & + ACOUSTIC_SIMULATION, & + num_colors_outer_elastic,num_colors_inner_elastic, & + num_elem_colors_elastic, & + ANISOTROPY, & + c11store,c12store,c13store,c14store,c15store,c16store, & + c22store,c23store,c24store,c25store,c26store, & + c33store,c34store,c35store,c36store, & + c44store,c45store,c46store,c55store,c56store,c66store) + + if( SIMULATION_TYPE == 3 ) & + call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, & + SIMULATION_TYPE, & + COMPUTE_AND_STORE_STRAIN, & + epsilon_trace_over_3, & + b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, & + b_epsilondev_xz,b_epsilondev_yz, & + b_epsilon_trace_over_3, & + ATTENUATION,size(R_xx), & + b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, & + b_alphaval,b_betaval,b_gammaval, & + APPROXIMATE_HESS_KL) + + endif + + ! prepares fields on GPU for poroelastic simulations + if( POROELASTIC_SIMULATION ) then + stop 'todo poroelastic simulations on GPU' + endif + + ! prepares needed receiver array for adjoint runs + if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) & + call prepare_sim2_or_3_const_device(Mesh_pointer, & + islice_selected_rec,size(islice_selected_rec), & + nadj_rec_local,nrec,myrank) + + ! prepares fields on GPU for noise simulations + if ( NOISE_TOMOGRAPHY > 0 ) then + ! note: noise tomography is only supported for elastic domains so far. + + ! copies noise arrays to GPU + call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, & + free_surface_ispec, & + free_surface_ijk, & + num_free_surface_faces, & + SIMULATION_TYPE,NOISE_TOMOGRAPHY, & + NSTEP,noise_sourcearray, & + normal_x_noise,normal_y_noise,normal_z_noise, & + mask_noise,free_surface_jacobian2Dw) + + endif ! NOISE_TOMOGRAPHY + + ! prepares gravity arrays + if( GRAVITY ) then + call prepare_fields_gravity_device(Mesh_pointer,GRAVITY, & + minus_deriv_gravity,minus_g,wgll_cube,& + ACOUSTIC_SIMULATION,rhostore) + endif + + ! sends initial data to device + + ! puts acoustic initial fields onto GPU + if( ACOUSTIC_SIMULATION ) then + call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer) + if( SIMULATION_TYPE == 3 ) & + call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, & + b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer) + endif + + ! puts elastic initial fields onto GPU + if( ELASTIC_SIMULATION ) then + ! transfer forward and backward fields to device with initial values + call transfer_fields_el_to_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer) + if(SIMULATION_TYPE == 3) & + call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer) + endif + + ! outputs GPU usage to files for all processes + call output_free_device_memory(myrank) + + ! outputs usage for main process + if( myrank == 0 ) then + call get_free_device_memory(free_mb,used_mb,total_mb) + write(IMAIN,*) "GPU usage: free =",free_mb," MB",nint(free_mb/total_mb*100.0),"%" + write(IMAIN,*) " used =",used_mb," MB",nint(used_mb/total_mb*100.0),"%" + write(IMAIN,*) " total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%" + write(IMAIN,*) + endif + + end subroutine prepare_timerun_GPU + +! +!------------------------------------------------------------------------------------------------- +! + +! OpenMP version uses "special" compute_forces_elastic_Dev routine +! we need to set num_elem_colors_elastic arrays + +#ifdef OPENMP_MODE + subroutine prepare_timerun_OpenMP() + + use specfem_par + use specfem_par_elastic + implicit none + + ! local parameters + integer :: ier + integer :: NUM_THREADS + integer :: OMP_GET_MAX_THREADS + + ! OpenMP for elastic simulation only supported yet + if( ELASTIC_SIMULATION ) then + + NUM_THREADS = OMP_GET_MAX_THREADS() + if( myrank == 0 ) then + write(IMAIN,*) + write(IMAIN,*) 'Using:',NUM_THREADS, ' OpenMP threads' + write(IMAIN,*) + endif + + ! allocate cfe_Dev_openmp local arrays for OpenMP version + allocate(dummyx_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(dummyy_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(dummyz_loc(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempx1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempx2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempx3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempy1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempy2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempy3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempz1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempz2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(newtempz3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempx1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempx2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempx3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempy1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempy2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempy3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempz1(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempz2(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + allocate(tempz3(NGLLX,NGLLY,NGLLZ,NUM_THREADS)) + + ! set num_elem_colors array in case no mesh coloring is used + if( .not. USE_MESH_COLORING_GPU ) then + ! deallocate dummy array + if( allocated(num_elem_colors_elastic) ) deallocate(num_elem_colors_elastic) + + ! loads with corresonding values + num_colors_outer_elastic = 1 + num_colors_inner_elastic = 1 + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + + ! sets to all elements in inner/outer phase + num_elem_colors_elastic(1) = nspec_outer_elastic + num_elem_colors_elastic(2) = nspec_inner_elastic + endif + + endif + + end subroutine prepare_timerun_OpenMP +#endif diff --git a/src/specfem3D/read_mesh_databases.f90 b/src/specfem3D/read_mesh_databases.f90 index 8281b24c4..0b97cce17 100644 --- a/src/specfem3D/read_mesh_databases.f90 +++ b/src/specfem3D/read_mesh_databases.f90 @@ -76,6 +76,8 @@ subroutine read_mesh_databases() read(27) ispec_is_poroelastic ! acoustic + ! number of acoustic elements in this partition + nspec_acoustic = count(ispec_is_acoustic(:)) ! all processes will have acoustic_simulation set if any flag is .true. call any_all_l( ANY(ispec_is_acoustic), ACOUSTIC_SIMULATION ) if( ACOUSTIC_SIMULATION ) then @@ -101,6 +103,10 @@ subroutine read_mesh_databases() endif ! elastic + ! number of elastic elements in this partition + nspec_elastic = count(ispec_is_elastic(:)) + + ! elastic simulation call any_all_l( ANY(ispec_is_elastic), ELASTIC_SIMULATION ) if( ELASTIC_SIMULATION ) then ! displacement,velocity,acceleration @@ -170,7 +176,8 @@ subroutine read_mesh_databases() if( ier /= 0 ) stop 'error allocating array one_minus_sum_beta etc.' ! reads mass matrices - read(27) rmass + read(27,iostat=ier) rmass + if( ier /= 0 ) stop 'error reading in array rmass' if( OCEANS ) then ! ocean mass matrix @@ -182,10 +189,36 @@ subroutine read_mesh_databases() allocate(rmass_ocean_load(1),stat=ier) if( ier /= 0 ) stop 'error allocating dummy array rmass_ocean_load' endif - !pll - read(27) rho_vp - read(27) rho_vs + !pll material parameters for stacey conditions + read(27,iostat=ier) rho_vp + if( ier /= 0 ) stop 'error reading in array rho_vp' + read(27,iostat=ier) rho_vs + if( ier /= 0 ) stop 'error reading in array rho_vs' + + ! checks if rhostore is available for gravity + if( GRAVITY ) then + + if( .not. ACOUSTIC_SIMULATION ) then + ! rho array needed for gravity + allocate(rhostore(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier) + if( ier /= 0 ) stop 'error allocating array rhostore' + + ! extract rho information from mu = rho * vs * vs and rho_vs = rho * vs + rhostore = 0.0_CUSTOM_REAL + where( mustore > TINYVAL ) + rhostore = (rho_vs*rho_vs) / mustore + endwhere + + ! note: the construct below leads to a segmentation fault (ifort v11.1). not sure why... + ! (where statement - standard fortran 95) + !where( mustore > TINYVAL ) + ! rhostore = (rho_vs*rho_vs) / mustore + !elsewhere + ! rhostore = 0.0_CUSTOM_REAL + !endwhere + endif + endif else ! no elastic attenuation & anisotropy ATTENUATION = .false. @@ -195,6 +228,9 @@ subroutine read_mesh_databases() ! poroelastic call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION ) if( POROELASTIC_SIMULATION ) then + + if( GPU_MODE ) call exit_mpi(myrank,'POROELASTICITY not supported by GPU mode yet...') + ! displacement,velocity,acceleration for the solid (s) & fluid (w) phases allocate(displs_poroelastic(NDIM,NGLOB_AB),stat=ier) if( ier /= 0 ) stop 'error allocating array displs_poroelastic' @@ -393,6 +429,42 @@ subroutine read_mesh_databases() if(num_phase_ispec_poroelastic > 0 ) read(27) phase_ispec_inner_poroelastic endif +! mesh coloring for GPUs + if( USE_MESH_COLORING_GPU ) then + ! acoustic domain colors + if( ACOUSTIC_SIMULATION ) then + read(27) num_colors_outer_acoustic,num_colors_inner_acoustic + + allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array' + + read(27) num_elem_colors_acoustic + endif + ! elastic domain colors + if( ELASTIC_SIMULATION ) then + read(27) num_colors_outer_elastic,num_colors_inner_elastic + + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + + read(27) num_elem_colors_elastic + endif + else + ! allocates dummy arrays + if( ACOUSTIC_SIMULATION ) then + num_colors_outer_acoustic = 0 + num_colors_inner_acoustic = 0 + allocate(num_elem_colors_acoustic(num_colors_outer_acoustic + num_colors_inner_acoustic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_acoustic array' + endif + if( ELASTIC_SIMULATION ) then + num_colors_outer_elastic = 0 + num_colors_inner_elastic = 0 + allocate(num_elem_colors_elastic(num_colors_outer_elastic + num_colors_inner_elastic),stat=ier) + if( ier /= 0 ) stop 'error allocating num_elem_colors_elastic array' + endif + endif + close(27) ! outputs total element numbers diff --git a/src/specfem3D/setup_GLL_points.f90 b/src/specfem3D/setup_GLL_points.f90 index 514522849..31d24757c 100644 --- a/src/specfem3D/setup_GLL_points.f90 +++ b/src/specfem3D/setup_GLL_points.f90 @@ -41,9 +41,9 @@ subroutine setup_GLL_points() ! set up GLL points, weights and derivation matrices for reference element (between -1,1) call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, & - hprime_xx,hprime_yy,hprime_zz, & - hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, & - wgllwgll_xy,wgllwgll_xz,wgllwgll_yz) + hprime_xx,hprime_yy,hprime_zz, & + hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, & + wgllwgll_xy,wgllwgll_xz,wgllwgll_yz) ! define transpose of derivation matrix do j = 1,NGLLY diff --git a/src/specfem3D/setup_sources_receivers.f90 b/src/specfem3D/setup_sources_receivers.f90 index c265b593e..437c80a4b 100644 --- a/src/specfem3D/setup_sources_receivers.f90 +++ b/src/specfem3D/setup_sources_receivers.f90 @@ -153,7 +153,7 @@ subroutine setup_sources() call max_all_all_dp(t0_acoustic,t0) ! point force sources will start depending on the frequency given by hdur - if( USE_FORCE_POINT_SOURCE ) then + if( USE_FORCE_POINT_SOURCE .or. USE_RICKER_IPATI ) then ! note: point force sources will give the dominant frequency in hdur, ! thus the main period is 1/hdur. ! also, these sources use a Ricker source time function instead of a gaussian. @@ -211,6 +211,12 @@ subroutine setup_sources() call exit_mpi(myrank,'error negative USER_T0 parameter in constants.h') endif + ! count number of sources located in this slice + nsources_local = 0 + do isource = 1, NSOURCES + if(myrank == islice_selected_source(isource)) nsources_local = nsources_local + 1 + enddo + ! checks if source is in an acoustic element and exactly on the free surface because pressure is zero there call setup_sources_check_acoustic() @@ -523,6 +529,7 @@ subroutine setup_sources_precompute_arrays() real(kind=CUSTOM_REAL) :: junk integer :: isource,ispec integer :: irec !,irec_local + integer :: i,j,k,iglob integer :: icomp,itime,nadj_files_found,nadj_files_found_tot,ier character(len=3),dimension(NDIM) :: comp ! = (/ "BHE", "BHN", "BHZ" /) character(len=256) :: filename @@ -577,11 +584,44 @@ subroutine setup_sources_precompute_arrays() sourcearray,xigll,yigll,zigll,factor_source) endif + ! point forces, initializes sourcearray, used for simplified CUDA routines + if(USE_FORCE_POINT_SOURCE) then + ! note: for use_force_point_source xi/eta/gamma are in the range [1,NGLL*] + iglob = ibool(nint(xi_source(isource)), & + nint(eta_source(isource)), & + nint(gamma_source(isource)), & + ispec) + ! sets sourcearrays + sourcearray(:,:,:,:) = 0.0 + do k=1,NGLLZ + do j=1,NGLLY + do i=1,NGLLX + if( ibool(i,j,k,ispec) == iglob ) then + ! acoustic source + ! identical source array components in x,y,z-direction + if( ispec_is_acoustic(ispec) ) then + sourcearray(:,i,j,k) = 1.0 + endif + ! elastic source + if( ispec_is_elastic(ispec) ) then + sourcearray(:,i,j,k) = nu_source(COMPONENT_FORCE_SOURCE,:,isource) + endif + endif + enddo + enddo + enddo + endif + ! stores source excitations sourcearrays(isource,:,:,:,:) = sourcearray(:,:,:,:) endif enddo + else + ! SIMULATION_TYPE == 2 + ! allocate dummy array (needed for subroutine calls) + allocate(sourcearrays(0,0,0,0,0),stat=ier) + if( ier /= 0 ) stop 'error allocating dummy sourcearrays' endif ! ADJOINT simulations @@ -679,17 +719,19 @@ subroutine setup_receivers_precompute_intp() integer :: irec,irec_local,isource,ier -! stores local receivers interpolation factors + ! needs to be allocate for subroutine calls (even if nrec_local == 0) + allocate(number_receiver_global(nrec_local),stat=ier) + if( ier /= 0 ) stop 'error allocating array number_receiver_global' + + ! stores local receivers interpolation factors if (nrec_local > 0) then - ! allocate Lagrange interpolators for receivers + ! allocate Lagrange interpolators for receivers allocate(hxir_store(nrec_local,NGLLX), & hetar_store(nrec_local,NGLLY), & hgammar_store(nrec_local,NGLLZ),stat=ier) if( ier /= 0 ) stop 'error allocating array hxir_store etc.' - ! define local to global receiver numbering mapping - allocate(number_receiver_global(nrec_local),stat=ier) - if( ier /= 0 ) stop 'error allocating array number_reciever_global' + ! define local to global receiver numbering mapping irec_local = 0 if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then do irec = 1,nrec @@ -762,8 +804,21 @@ subroutine setup_sources_receivers_VTKfile() double precision :: xil,etal,gammal double precision :: xmesh,ymesh,zmesh real(kind=CUSTOM_REAL),dimension(NGNOD) :: xelm,yelm,zelm - integer :: ia,ispec,isource,irec,ier - character(len=256) :: filename,filename_new,system_command + integer :: ia,ispec,isource,irec,ier,totalpoints + + !INTEGER(kind=4) :: system_command_status + !integer :: ret + !integer,external :: system + + character(len=256) :: filename,filename_new,system_command,system_command1,system_command2 + + ! determines number of points for vtk file + if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then + totalpoints = NSOURCES + nrec + else + ! pure adjoint simulation only needs receivers + totalpoints = nrec + endif if (myrank == 0) then ! vtk file @@ -774,63 +829,65 @@ subroutine setup_sources_receivers_VTKfile() write(IOVTK,'(a)') 'Source and Receiver VTK file' write(IOVTK,'(a)') 'ASCII' write(IOVTK,'(a)') 'DATASET POLYDATA' - write(IOVTK, '(a,i6,a)') 'POINTS ', NSOURCES+nrec, ' float' + write(IOVTK, '(a,i6,a)') 'POINTS ', totalpoints, ' float' endif ! sources - do isource=1,NSOURCES - ! spectral element id - ispec = ispec_selected_source(isource) + if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then + do isource=1,NSOURCES + ! spectral element id + ispec = ispec_selected_source(isource) - ! gets element ancor nodes - if( myrank == islice_selected_source(isource) ) then - ! find the coordinates of the eight corner nodes of the element - call get_shape3D_element_corners(xelm,yelm,zelm,ispec,& - ibool,xstore,ystore,zstore,NSPEC_AB,NGLOB_AB) + ! gets element ancor nodes + if( myrank == islice_selected_source(isource) ) then + ! find the coordinates of the eight corner nodes of the element + call get_shape3D_element_corners(xelm,yelm,zelm,ispec,& + ibool,xstore,ystore,zstore,NSPEC_AB,NGLOB_AB) - endif - ! master collects corner locations - if( islice_selected_source(isource) /= 0 ) then - if( myrank == 0 ) then - call recvv_cr(xelm,NGNOD,islice_selected_source(isource),0) - call recvv_cr(yelm,NGNOD,islice_selected_source(isource),0) - call recvv_cr(zelm,NGNOD,islice_selected_source(isource),0) - else if( myrank == islice_selected_source(isource) ) then - call sendv_cr(xelm,NGNOD,0,0) - call sendv_cr(yelm,NGNOD,0,0) - call sendv_cr(zelm,NGNOD,0,0) endif - endif - - if( myrank == 0 ) then - ! get the 3-D shape functions - if( USE_FORCE_POINT_SOURCE ) then - ! note: we switch xi,eta,gamma range to be [-1,1] - ! uses initial guess in xi, eta and gamma - xil = xigll(nint(xi_source(isource))) - etal = yigll(nint(eta_source(isource))) - gammal = zigll(nint(gamma_source(isource))) - else - xil = xi_source(isource) - etal = eta_source(isource) - gammal = gamma_source(isource) + ! master collects corner locations + if( islice_selected_source(isource) /= 0 ) then + if( myrank == 0 ) then + call recvv_cr(xelm,NGNOD,islice_selected_source(isource),0) + call recvv_cr(yelm,NGNOD,islice_selected_source(isource),0) + call recvv_cr(zelm,NGNOD,islice_selected_source(isource),0) + else if( myrank == islice_selected_source(isource) ) then + call sendv_cr(xelm,NGNOD,0,0) + call sendv_cr(yelm,NGNOD,0,0) + call sendv_cr(zelm,NGNOD,0,0) + endif endif - call get_shape3D_single(myrank,shape3D,xil,etal,gammal) - ! interpolates source locations - xmesh = 0.0 - ymesh = 0.0 - zmesh = 0.0 - do ia=1,NGNOD - xmesh = xmesh + shape3D(ia)*xelm(ia) - ymesh = ymesh + shape3D(ia)*yelm(ia) - zmesh = zmesh + shape3D(ia)*zelm(ia) - enddo + if( myrank == 0 ) then + ! get the 3-D shape functions + if( USE_FORCE_POINT_SOURCE ) then + ! note: we switch xi,eta,gamma range to be [-1,1] + ! uses initial guess in xi, eta and gamma + xil = xigll(nint(xi_source(isource))) + etal = yigll(nint(eta_source(isource))) + gammal = zigll(nint(gamma_source(isource))) + else + xil = xi_source(isource) + etal = eta_source(isource) + gammal = gamma_source(isource) + endif + call get_shape3D_single(myrank,shape3D,xil,etal,gammal) + + ! interpolates source locations + xmesh = 0.0 + ymesh = 0.0 + zmesh = 0.0 + do ia=1,NGNOD + xmesh = xmesh + shape3D(ia)*xelm(ia) + ymesh = ymesh + shape3D(ia)*yelm(ia) + zmesh = zmesh + shape3D(ia)*zelm(ia) + enddo - ! writes out to VTK file - write(IOVTK,*) xmesh,ymesh,zmesh - endif - enddo ! NSOURCES + ! writes out to VTK file + write(IOVTK,*) xmesh,ymesh,zmesh + endif + enddo ! NSOURCES + endif ! receivers do irec=1,nrec @@ -856,15 +913,9 @@ subroutine setup_sources_receivers_VTKfile() if( myrank == 0 ) then ! get the 3-D shape functions - if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then - xil = xi_receiver(irec) - etal = eta_receiver(irec) - gammal = gamma_receiver(irec) - else - xil = xi_source(irec) - etal = eta_source(irec) - gammal = gamma_source(irec) - endif + xil = xi_receiver(irec) + etal = eta_receiver(irec) + gammal = gamma_receiver(irec) call get_shape3D_single(myrank,shape3D,xil,etal,gammal) ! interpolates receiver locations @@ -888,22 +939,50 @@ subroutine setup_sources_receivers_VTKfile() close(IOVTK) ! creates additional receiver and source files - ! extracts receiver locations - filename = trim(OUTPUT_FILES)//'/sr.vtk' - filename_new = trim(OUTPUT_FILES)//'/receiver.vtk' - write(system_command, & - "('awk ',a1,'{if(NR<5) print $0;if(NR==6)print ',a1,'POINTS',i6,' float',a1,';if(NR>5+',i6,')print $0}',a1,' < ',a,' > ',a)")& + if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then + ! extracts receiver locations + filename = trim(OUTPUT_FILES)//'/sr.vtk' + filename_new = trim(OUTPUT_FILES)//'/receiver.vtk' + + ! vtk file for receivers only + write(system_command, & + "('awk ',a1,'{if(NR<5) print $0;if(NR==5)print ',a1,'POINTS',i6,' float',a1,';if(NR>5+',i6,')print $0}',a1,' < ',a,' > ',a)")& "'",'"',nrec,'"',NSOURCES,"'",trim(filename),trim(filename_new) - call system(system_command) - ! extracts source locations - filename_new = trim(OUTPUT_FILES)//'/source.vtk' - write(system_command, & - "('awk ',a1,'{if(NR< 6 + ',i6,') print $0}END{print}',a1,' < ',a,' > ',a)")& - "'",NSOURCES,"'",trim(filename),trim(filename_new) - call system(system_command) +!daniel: +! gfortran +! call system(trim(system_command),system_command_status) +! ifort +! ret = system(trim(system_command)) + + ! extracts source locations + !"('awk ',a1,'{if(NR< 6 + ',i6,') print $0}END{print}',a1,' < ',a,' > ',a)")& + filename_new = trim(OUTPUT_FILES)//'/source.vtk' + + write(system_command1, & + "('awk ',a1,'{if(NR<5) print $0;if(NR==5)print ',a1,'POINTS',i6,' float',a1,';')") & + "'",'"',NSOURCES,'"' + !daniel + !print*,'command 1:',trim(system_command1) + write(system_command2, & + "('if(NR>5 && NR <6+',i6,')print $0}END{print ',a,'}',a1,' < ',a,' > ',a)") & + NSOURCES,'" "',"'",trim(filename),trim(filename_new) + + !print*,'command 2:',trim(system_command2) + + system_command = trim(system_command1)//trim(system_command2) + + !print*,'command:',trim(system_command) +!daniel: +! gfortran +! call system(trim(system_command),system_command_status) +! ifort +! ret = system(trim(system_command)) + + endif endif + end subroutine setup_sources_receivers_VTKfile diff --git a/src/specfem3D/specfem3D_par.f90 b/src/specfem3D/specfem3D_par.f90 index 9de96fc53..ad549bd8f 100644 --- a/src/specfem3D/specfem3D_par.f90 +++ b/src/specfem3D/specfem3D_par.f90 @@ -42,8 +42,32 @@ module specfem_par implicit none -! attenuation - integer :: NSPEC_ATTENUATION_AB +! parameters deduced from parameters read from file + integer :: NPROC + integer :: NSPEC_AB, NGLOB_AB + +! mesh parameters + integer, dimension(:,:,:,:), allocatable :: ibool + real(kind=CUSTOM_REAL), dimension(:), allocatable :: xstore,ystore,zstore + + real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: & + xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian + +! material properties + ! isotropic + real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: kappastore,mustore + +! CUDA mesh pointer<->integer wrapper + integer(kind=8) :: Mesh_pointer + +! Global GPU toggle. Set in Par_file + logical :: GPU_MODE + +! use integer array to store topography values + integer :: NX_TOPO,NY_TOPO + !double precision :: ORIG_LAT_TOPO,ORIG_LONG_TOPO,DEGREES_PER_CELL_TOPO + !character(len=100) :: topo_file + integer, dimension(:,:), allocatable :: itopo_bathy ! absorbing boundary arrays (for all boundaries) - keeps all infos, allowing for irregular surfaces real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: abs_boundary_normal @@ -59,16 +83,10 @@ module specfem_par integer, dimension(:), allocatable :: free_surface_ispec integer :: num_free_surface_faces -! mesh parameters - integer, dimension(:,:,:,:), allocatable :: ibool - real(kind=CUSTOM_REAL), dimension(:), allocatable :: xstore,ystore,zstore - - real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: & - xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz,jacobian +! attenuation + integer :: NSPEC_ATTENUATION_AB + character(len=256) prname_Q -! material properties - ! isotropic - real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: kappastore,mustore ! additional mass matrix for ocean load real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass_ocean_load @@ -91,7 +109,7 @@ module specfem_par double precision, external :: comp_source_time_function double precision :: t0 real(kind=CUSTOM_REAL) :: stf_used_total - integer :: NSOURCES + integer :: NSOURCES,nsources_local ! source encoding ! for acoustic sources: takes +/- 1 sign, depending on sign(Mxx)[ = sign(Myy) = sign(Mzz) ! since they have to equal in the acoustic setting] @@ -101,8 +119,9 @@ module specfem_par character(len=256) :: rec_filename,filtered_rec_filename,dummystring integer :: nrec,nrec_local,nrec_tot_found integer :: nrec_simulation - integer, allocatable, dimension(:) :: islice_selected_rec,ispec_selected_rec,number_receiver_global - double precision, allocatable, dimension(:) :: xi_receiver,eta_receiver,gamma_receiver + integer, dimension(:), allocatable :: islice_selected_rec,ispec_selected_rec + integer, dimension(:), allocatable :: number_receiver_global + double precision, dimension(:), allocatable :: xi_receiver,eta_receiver,gamma_receiver double precision, dimension(:,:), allocatable :: hpxir_store,hpetar_store,hpgammar_store ! timing information for the stations @@ -137,17 +156,16 @@ module specfem_par double precision, external :: wtime double precision :: time_start -! parameters read from parameter file - integer :: NPROC_XI,NPROC_ETA - integer :: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE +! parameters integer :: SIMULATION_TYPE + integer :: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP,UTM_PROJECTION_ZONE integer :: IMODEL - + double precision :: DT - double precision :: LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX logical :: ATTENUATION,USE_OLSEN_ATTENUATION, & OCEANS,TOPOGRAPHY,ABSORBING_CONDITIONS,ANISOTROPY + logical :: GRAVITY logical :: SAVE_FORWARD,SAVE_MESH_FILES,PRINT_SOURCE_TIME_FUNCTION @@ -155,11 +173,11 @@ module specfem_par integer :: NTSTEP_BETWEEN_OUTPUT_INFO - character(len=256) OUTPUT_FILES,LOCAL_PATH,prname,prname_Q +! parameters read from mesh parameter file + integer :: NPROC_XI,NPROC_ETA + double precision :: LATITUDE_MIN,LATITUDE_MAX,LONGITUDE_MIN,LONGITUDE_MAX -! parameters deduced from parameters read from file - integer :: NPROC - integer :: NSPEC_AB, NGLOB_AB + character(len=256) OUTPUT_FILES,LOCAL_PATH,prname ! names of the data files for all the processors in MPI character(len=256) outputname @@ -194,13 +212,6 @@ module specfem_par ! MPI partition surfaces logical, dimension(:), allocatable :: ispec_is_inner -! maximum of the norm of the displacement - real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic - real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic - real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic - real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic - integer:: Usolidnorm_index(1) - ! maximum speed in velocity model real(kind=CUSTOM_REAL):: model_speed_max @@ -210,6 +221,10 @@ module specfem_par !!$ real(kind=CUSTOM_REAL) :: weight, jacobianl !!!! NL NL REGOLITH + ! gravity + real(kind=CUSTOM_REAL), dimension(:),allocatable :: minus_deriv_gravity,minus_g + real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube + ! ADJOINT parameters @@ -241,9 +256,6 @@ module specfem_par ! adjoint elements integer :: NSPEC_ADJOINT, NGLOB_ADJOINT - ! norm of the backward displacement - real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all - ! length of reading blocks integer :: NTSTEP_BETWEEN_READ_ADJSRC @@ -285,6 +297,12 @@ module specfem_par_elastic real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ,veloc,accel real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: accel_adj_coupling +! variables needed for OpenMP version + real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: & + dummyx_loc,dummyy_loc,dummyz_loc,newtempx1,newtempx2,newtempx3,& + newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3,& + tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 + ! mass matrix real(kind=CUSTOM_REAL), dimension(:), allocatable :: rmass @@ -308,8 +326,12 @@ module specfem_par_elastic integer, dimension(:,:), allocatable :: phase_ispec_inner_elastic integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic - logical :: ELASTIC_SIMULATION +! mesh coloring + integer :: num_colors_outer_elastic,num_colors_inner_elastic + integer, dimension(:), allocatable :: num_elem_colors_elastic + integer :: nspec_elastic + logical :: ELASTIC_SIMULATION ! ADJOINT elastic @@ -389,6 +411,11 @@ module specfem_par_acoustic integer, dimension(:,:), allocatable :: phase_ispec_inner_acoustic integer :: num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic +! mesh coloring + integer :: num_colors_outer_acoustic,num_colors_inner_acoustic + integer, dimension(:), allocatable :: num_elem_colors_acoustic + integer :: nspec_acoustic + logical :: ACOUSTIC_SIMULATION ! ADJOINT acoustic diff --git a/src/specfem3D/write_movie_output.f90 b/src/specfem3D/write_movie_output.f90 index f33bf475d..c03c31ad0 100644 --- a/src/specfem3D/write_movie_output.f90 +++ b/src/specfem3D/write_movie_output.f90 @@ -30,8 +30,32 @@ subroutine write_movie_output() use specfem_par use specfem_par_movie + use specfem_par_elastic + use specfem_par_acoustic implicit none + ! gets resulting array values onto CPU + if(GPU_MODE .and. & + ( & + EXTERNAL_MESH_CREATE_SHAKEMAP .or. & + CREATE_SHAKEMAP .or. & + ( MOVIE_SURFACE .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) .or. & + ( MOVIE_VOLUME .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) .or. & + ( PNM_GIF_IMAGE .and. mod(it,NTSTEP_BETWEEN_FRAMES) == 0) & + ) ) then + ! acoustic domains + if( ACOUSTIC_SIMULATION ) then + ! transfers whole fields + call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer) + endif + ! elastic domains + if( ELASTIC_SIMULATION ) then + ! transfers whole fields + call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer) + endif + endif + ! shakemap creation if (EXTERNAL_MESH_CREATE_SHAKEMAP) then call wmo_create_shakemap_em() @@ -126,19 +150,19 @@ subroutine wmo_create_shakemap_em() potential_acoustic, displ_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! accel ? call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_dot_acoustic, accel_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) endif @@ -146,7 +170,7 @@ subroutine wmo_create_shakemap_em() if (USE_HIGHRES_FOR_MOVIES) then do ipoin = 1, NGLLX*NGLLY iglob = faces_surface_ext_mesh(ipoin,ispec2D) - + ! saves norm of displacement,velocity and acceleration vector if( ispec_is_elastic(ispec) ) then ! norm of displacement @@ -286,7 +310,7 @@ subroutine wmo_get_max_vector(ispec,ispec2D,iglob,ipoin, & logical :: is_done is_done = .false. - + ! loops over all gll points from this element do k=1,NGLLZ do j=1,NGLLY @@ -374,14 +398,14 @@ subroutine wmo_create_movie_surface_em() potential_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) else ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) endif endif @@ -613,14 +637,14 @@ subroutine wmo_movie_surface_output_o() potential_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) else ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, val_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) endif endif @@ -770,19 +794,19 @@ subroutine wmo_create_shakemap_o() potential_acoustic, displ_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! accel ? call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_dot_acoustic, accel_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) endif ! save all points for high resolution, or only four corners for low resolution @@ -940,14 +964,14 @@ subroutine wmo_get_max_vector_o(ispec,iglob,ipoin,displ_element,veloc_element,ac ! velocity vector is_done = .false. - + ! loops over all gll points from this element do k=1,NGLLZ do j=1,NGLLY do i=1,NGLLX ! checks if global point is found if( iglob == ibool(i,j,k,ispec) ) then - + ! horizontal displacement store_val_ux_external_mesh(ipoin) = max(store_val_ux_external_mesh(ipoin),& abs(displ_element(1,i,j,k)),abs(displ_element(2,i,j,k))) @@ -1019,7 +1043,7 @@ subroutine wmo_movie_volume_output() potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) velocity_x(:,:,:,ispec) = veloc_element(1,:,:,:) velocity_y(:,:,:,ispec) = veloc_element(2,:,:,:) velocity_z(:,:,:,ispec) = veloc_element(3,:,:,:) diff --git a/src/specfem3D/write_seismograms.f90 b/src/specfem3D/write_seismograms.f90 index 3bd8512d9..b03cdf6da 100644 --- a/src/specfem3D/write_seismograms.f90 +++ b/src/specfem3D/write_seismograms.f90 @@ -45,6 +45,50 @@ subroutine write_seismograms() real(kind=CUSTOM_REAL):: stf_deltat double precision :: stf + ! TODO: Test and Fix CUDA seismograms code. + logical,parameter :: USE_CUDA_SEISMOGRAMS = .false. + + ! gets resulting array values onto CPU + if(GPU_MODE) then + if( nrec_local > 0 ) then + ! this transfers fields only in elements with stations for efficiency + if( ACOUSTIC_SIMULATION ) then + ! only copy corresponding elements to CPU host + ! timing: Elapsed time: 5.230904e-04 + call transfer_station_ac_from_device( & + potential_acoustic,potential_dot_acoustic,potential_dot_dot_acoustic, & + b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, & + Mesh_pointer,number_receiver_global, & + ispec_selected_rec,ispec_selected_source,ibool,SIMULATION_TYPE) + + ! alternative: transfers whole fields + ! timing: Elapsed time: 4.138947e-03 + !call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, & + ! potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer) + endif + + ! this transfers fields only in elements with stations for efficiency + if( ELASTIC_SIMULATION ) then + if(USE_CUDA_SEISMOGRAMS) then + call transfer_seismograms_el_from_d(nrec_local,Mesh_pointer, & + SIMULATION_TYPE,& + seismograms_d,seismograms_v,seismograms_a,& + it) + else + call transfer_station_el_from_device(displ,veloc,accel, & + b_displ,b_veloc,b_accel, & + Mesh_pointer,number_receiver_global, & + ispec_selected_rec,ispec_selected_source, & + ibool,SIMULATION_TYPE) + endif + ! alternative: transfers whole fields + ! call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer) + endif + endif + endif + + if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then + do irec_local = 1,nrec_local ! gets global number of that receiver @@ -57,7 +101,8 @@ subroutine write_seismograms() hgammar(:) = hgammar_store(irec_local,:) ! forward simulations - if (SIMULATION_TYPE == 1) then + select case( SIMULATION_TYPE ) + case( 1 ) ! receiver's spectral element ispec = ispec_selected_rec(irec) @@ -79,13 +124,13 @@ subroutine write_seismograms() potential_acoustic, displ_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! interpolates displ/veloc/pressure at receiver locations call compute_interpolated_dva_ac(displ_element,veloc_element,& @@ -109,7 +154,7 @@ subroutine write_seismograms() endif !poroelastic !adjoint simulations - else if (SIMULATION_TYPE == 2) then + case( 2 ) ! adjoint source is placed at receiver ispec = ispec_selected_source(irec) @@ -166,13 +211,13 @@ subroutine write_seismograms() potential_acoustic, displ_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, & potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! interpolates displ/veloc/pressure at receiver locations call compute_interpolated_dva_ac(displ_element,veloc_element,& @@ -185,7 +230,7 @@ subroutine write_seismograms() endif ! acoustic !adjoint simulations - else if (SIMULATION_TYPE == 3) then + case( 3 ) ispec = ispec_selected_rec(irec) @@ -206,13 +251,13 @@ subroutine write_seismograms() b_potential_acoustic, displ_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! backward fields: velocity vector call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, & b_potential_dot_acoustic, veloc_element,& hprime_xx,hprime_yy,hprime_zz, & xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, & - ibool,rhostore) + ibool,rhostore,GRAVITY) ! backward fields: interpolates displ/veloc/pressure at receiver locations call compute_interpolated_dva_ac(displ_element,veloc_element,& @@ -224,10 +269,10 @@ subroutine write_seismograms() dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd) endif ! acoustic - endif ! SIMULATION_TYPE + end select ! SIMULATION_TYPE -! store North, East and Vertical components -! distinguish between single and double precision for reals + ! store North, East and Vertical components + ! distinguish between single and double precision for reals if(CUSTOM_REAL == SIZE_REAL) then seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)) seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)) @@ -243,23 +288,22 @@ subroutine write_seismograms() enddo ! nrec_local -! write the current or final seismograms + endif + + ! write the current or final seismograms if((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == NSTEP) .and. (.not.SU_FORMAT)) then if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then - call write_seismograms_to_file(myrank,seismograms_d,number_receiver_global,station_name, & - network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1,SIMULATION_TYPE) - call write_seismograms_to_file(myrank,seismograms_v,number_receiver_global,station_name, & - network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,2,SIMULATION_TYPE) - call write_seismograms_to_file(myrank,seismograms_a,number_receiver_global,station_name, & - network_name,nrec,nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,3,SIMULATION_TYPE) + call write_seismograms_to_file(seismograms_d,1) + call write_seismograms_to_file(seismograms_v,2) + call write_seismograms_to_file(seismograms_a,3) else call write_adj_seismograms_to_file(myrank,seismograms_d,number_receiver_global, & nrec_local,it,DT,NSTEP,t0,LOCAL_PATH,1) endif endif -! write ONE binary file for all receivers (nrec_local) within one proc -! SU format, with 240-byte-header for each trace + ! write ONE binary file for all receivers (nrec_local) within one proc + ! SU format, with 240-byte-header for each trace if ((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it==NSTEP) .and. SU_FORMAT) & call write_output_SU() @@ -271,28 +315,19 @@ end subroutine write_seismograms ! write seismograms to text files - subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, & - station_name,network_name,nrec,nrec_local, & - it,DT,NSTEP,t0,LOCAL_PATH,istore,SIMULATION_TYPE) + subroutine write_seismograms_to_file(seismograms,istore) - implicit none - - include "constants.h" + use constants + use specfem_par,only: & + myrank,number_receiver_global,station_name,network_name, & + nrec,nrec_local,islice_selected_rec, & + it,DT,NSTEP,t0,LOCAL_PATH,SIMULATION_TYPE - integer :: NSTEP,it - integer :: nrec,nrec_local - integer :: myrank,istore - integer :: SIMULATION_TYPE + implicit none - integer, dimension(nrec_local) :: number_receiver_global + integer :: istore real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,NSTEP) :: seismograms - double precision t0,DT - - character(len=256) LOCAL_PATH - character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name - character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name - ! local parameters integer irec,irec_local integer irecord @@ -303,6 +338,8 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: one_seismogram integer :: nrec_local_received,NPROCTOT,total_seismos,receiver,sender integer :: iproc,ier + integer,dimension(1) :: tmp_nrec_local_received,tmp_irec,tmp_nrec_local + integer,dimension(:),allocatable:: islice_num_rec_local ! saves displacement, velocity or acceleration if(istore == 1) then @@ -350,12 +387,27 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, ! loop on all the slices call world_size(NPROCTOT) + + ! counts number of local receivers for each slice + allocate(islice_num_rec_local(0:NPROCTOT-1),stat=ier) + if( ier /= 0 ) call exit_mpi(myrank,'error allocating islice_num_rec_local') + islice_num_rec_local(:) = 0 + do irec = 1,nrec + iproc = islice_selected_rec(irec) + islice_num_rec_local(iproc) = islice_num_rec_local(iproc) + 1 + enddo + + ! loops on all the slices do iproc = 0,NPROCTOT-1 + ! communicate only with processes which contain local receivers + if( islice_num_rec_local(iproc) == 0 ) cycle + ! receive except from proc 0, which is me and therefore I already have this value sender = iproc if(iproc /= 0) then - call recv_i(nrec_local_received,1,sender,itag) + call recv_i(tmp_nrec_local_received,1,sender,itag) + nrec_local_received = tmp_nrec_local_received(1) if(nrec_local_received < 0) call exit_MPI(myrank,'error while receiving local number of receivers') else nrec_local_received = nrec_local @@ -369,7 +421,8 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, irec = number_receiver_global(irec_local) one_seismogram(:,:) = seismograms(:,irec_local,:) else - call recv_i(irec,1,sender,itag) + call recv_i(tmp_irec,1,sender,itag) + irec = tmp_irec(1) if(irec < 1 .or. irec > nrec) call exit_MPI(myrank,'error while receiving global receiver number') call recvv_cr(one_seismogram,NDIM*NSTEP,sender,itag) @@ -389,6 +442,7 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, enddo ! nrec_local_received endif ! if(nrec_local_received > 0 ) enddo ! NPROCTOT-1 + deallocate(islice_num_rec_local) write(IMAIN,*) 'Component: .sem'//component write(IMAIN,*) ' total number of receivers saved is ',total_seismos,' out of ',nrec @@ -398,12 +452,14 @@ subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global, else ! on the nodes, send the seismograms to the master receiver = 0 - call send_i(nrec_local,1,receiver,itag) + tmp_nrec_local(1) = nrec_local + call send_i(tmp_nrec_local,1,receiver,itag) if (nrec_local > 0) then do irec_local = 1,nrec_local ! get global number of that receiver irec = number_receiver_global(irec_local) - call send_i(irec,1,receiver,itag) + tmp_irec(1) = irec + call send_i(tmp_irec,1,receiver,itag) ! sends seismogram of that receiver one_seismogram(:,:) = seismograms(:,irec_local,:) diff --git a/utils/Cluster/pbs/valgrind_go_solver_pbs.bash b/utils/Cluster/pbs/valgrind_go_solver_pbs.bash new file mode 100755 index 000000000..3776b8e32 --- /dev/null +++ b/utils/Cluster/pbs/valgrind_go_solver_pbs.bash @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Valgrind, a suite of tools for debugging and profiling +# http://valgrind.org/ +# + +# bash script +#PBS -S /bin/bash + +# job name +#PBS -N valgrind_go_solver + +# joins output and error information +#PBS -j oe + +# job output file +#PBS -o in_out_files/OUTPUT_FILES/job.o + +########################################################### +# USER PARAMETERS + +# Queue +#PBS -q tromp + +# 150 CPUs ( 18*8+6 ), walltime 15 hour +#PBS -l nodes=18:ppn=8+1:ppn=6,walltime=15:00:00 + +# valgrind mpi library +PRELOAD_LIB=/my_valgrind_path/valgrind/lib/valgrind/libmpiwrap-x86-linux.so + +########################################################### + +cd $PBS_O_WORKDIR + +# script to run the mesher and the solver +# read Par_file to get information about the run +# compute total number of nodes needed +NPROC=`grep NPROC in_data_files/Par_file | cut -d = -f 2 ` + +# total number of nodes is the product of the values read +numnodes=$NPROC + +mkdir -p in_out_files/OUTPUT_FILES + +# backup files used for this simulation +cp in_data_files/Par_file in_out_files/OUTPUT_FILES/ +cp in_data_files/STATIONS in_out_files/OUTPUT_FILES/ +cp in_data_files/CMTSOLUTION in_out_files/OUTPUT_FILES/ + +# obtain job information +cat $PBS_NODEFILE > in_out_files/OUTPUT_FILES/compute_nodes +echo "$PBS_JOBID" > in_out_files/OUTPUT_FILES/jobid + +echo starting run in current directory $PWD +cd bin/ + +echo " " +echo "run: memory leaks" +echo " " +sleep 2 + +# memory leaks +LD_PRELOAD=$PRELOAD_LIB mpiexec -np $numnodes valgrind --leak-check=full ./xspecfem3D >& ../in_out_files/OUTPUT_FILES/output.memory-leaks.log + +sleep 2 +echo " " +echo "run: cache misses" +echo " " + +# cache misses +LD_PRELOAD=$PRELOAD_LIB mpiexec -np $numnodes valgrind --tool=cachegrind ./xspecfem3D >& ../in_out_files/OUTPUT_FILES/output.cache-misses.log + +cp cachegrind.out.* ../in_out_files/OUTPUT_FILES/ + +echo " " +echo "finished successfully" + diff --git a/utils/create_specfem3D_gpu_cuda_method_stubs.pl b/utils/create_specfem3D_gpu_cuda_method_stubs.pl new file mode 100755 index 000000000..5b0c14f0d --- /dev/null +++ b/utils/create_specfem3D_gpu_cuda_method_stubs.pl @@ -0,0 +1,141 @@ +#!/usr/bin/perl + +# +# Script to extract the function declarations in cuda files +# +# +# usage: ./ceate_specfem3D_gpu_cuda_method_stubs.pl +# run in directory root SPECFEM3D/ +# + +$outfile = "src/cuda/specfem3D_gpu_cuda_method_stubs.c"; + + +open(IOUT,"> _____temp_tutu_____"); + +$header = < +#include + +#include "config.h" + +typedef float realw; + +END + + +$warning = < _____temp_tutu01_____"); + open(IIN,"<_____temp_tutu01_____"); + + + # open the source file + $success = 1; + $do_extract = 0; + while($line = ) { + chop $line; + + # suppress trailing white spaces and carriage return + $line =~ s/\s*$//; + + # change the version number and copyright information + # $line =~ s#\(c\) California Institute of Technology and University of Pau, October 2007#\(c\) California Institute of Technology and University of Pau, November 2007#og; + # $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og; + + if($line =~ /extern "C"/){ + # new function declaration starts + #print "$line\n"; + if( $line =~/FC_FUNC/ ){ + # function declaration on same line as extern, ask for line skip + print "problem: please add a line break after extern 'C' here:"; + print "$line\n"; + $success = 0; + close(IIN); + exit; + } + $do_extract = 1; + next; + } + + # extract section + if($do_extract == 1 ){ + # function declaration + if($line =~ /{/){ + # function declaration ends + if( $line =~ /PREPARE_CUDA_DEVICE/ ){ + # adds warning + print IOUT "$line \n$warning\} \n\n"; + }else{ + print IOUT "$line\} \n\n"; + } + $do_extract = 0; + }else{ + # write line to the output file + print IOUT "$line\n"; + } + next; + } + } + close(IIN); + + if( $success == 0 ){ exit; } +} + +close(IOUT); +system("rm -f _____temp_tutu01_____"); + +# creates new stubs file if successful +if( $success == 1 ){ + print "\n\nsuccessfully extracted declarations \n\n"; + system("cp -p $outfile $outfile.bak"); + system("cp -p _____temp_tutu_____ $outfile"); + print "created new: $outfile \n"; +} +system("rm -f _____temp_tutu_____"); + + diff --git a/utils/readme_cuda_cscs.txt b/utils/readme_cuda_cscs.txt new file mode 100644 index 000000000..025321244 --- /dev/null +++ b/utils/readme_cuda_cscs.txt @@ -0,0 +1,63 @@ + +GPU_MODE Notes (by Max Rietmann) + +src/shared/constants.h: + +Change the following settings (for NOISE simulations): + +! sources and receivers Z coordinates given directly instead of with depth + logical, parameter :: USE_SOURCES_RECVS_Z = .true. + +! the seismograms are normal to surface +! Z record corresponds to the normal, while E and N are two tangent vectors +! that completes an orthonormal. + logical, parameter :: EXT_MESH_RECV_NORMAL = .true. + +Settings for Eiger +./configure --with-cuda FC=mpif90 MPIFC=mpif90 CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include +./configure --with-cuda FC=mpif90 MPIFC=mpif90 FLAGS_CHECK="-O3 -ffree-line-length-none -fopenmp" FLAGS_NO_CHECK="-Ofast -mfpmath=sse -funroll-loops -ffree-line-length-none -fopenmp" CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include + +./configure --with-cuda FC=mpif90 MPIFC=mpif90 FLAGS_CHECK="`echo $ICC_CHECK`" FLAGS_NO_CHECK='`echo $ICC_FCFLAGS`' CUDA_LIB=-L/apps/eiger/Cuda-4.0/cuda/lib64 MPI_INC=-I/apps/eiger/mvapich2/1.5.1p1/mvapich2-gnu/include + +NOTE: Need to add below to get nvcc to use the older version of gcc, as it doesn't support gcc4.5 or above. +--compiler-bindir /usr/bin/gcc-4.3 + + CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include + + +Settings for todi.cscs.ch +export GNU_FCFLAGS="-Ofast -mfpmath=sse -funroll-loops -ffree-line-length-none" +export GNU_CHECK="-O3 -ffree-line-length-none" +export DEBUG_FLAGS="-g -fbacktrace" +export ICC_FCFLAGS="-O3 -fp-model fast=2 -x SSE4.2 -ftz -funroll-loops -unroll5" +export ICC_FCFLAGS="-O3 -fp-model fast=2 -funroll-loops -unroll5 -msse3 -ftree-vectorize" +export ICC_CHECK="-O2" +export CRAY_FCFLAGS="-eF -em -rm -O3,fp3" +export CRAY_CHECK="-eF -em -rm" + +# Cray configure +./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $CRAY_CHECK`" FLAGS_NO_CHECK='`echo $CRAY_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include +# GNU configure +./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $GNU_CHECK`" FLAGS_NO_CHECK='`echo $GNU_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include +# Intel convifugre +./configure --with-cuda CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $ICC_CHECK`" FLAGS_NO_CHECK='`echo $ICC_FCFLAGS`' CUDA_LIB=-L$CUDA_HOME/lib64 MPI_INC=-I$MPICH_DIR/include + +./configure CC=cc FC=ftn MPIFC=ftn MPICC=cc FLAGS_CHECK="`echo $CRAY_CHECK`" FLAGS_NO_CHECK='`echo $CRAY_FCFLAGS`' MPI_INC=-I$MPICH_DIR/include + +suggested compiler options: +intel: -O3 -fp-model fast=2 -x SSE4.2 -ftz -funroll-loops -unroll5 +gnu: -Ofast -mfpmath=sse -funroll-loops +pgi: +#################################### +Notes from Jeff Poznanovic: + +Configure and build with: + +module load cuda cudatools cudasdk +module load PrgEnv-cray +./configure --with-cuda --with-mpi MPIFC=ftn MPICC=cc FC=ftn CC=cc FCFLAGS="-eF -em -rm" CFLAGS="-h list=m" MPI_INC=-I/opt/cray/mpt/default/xt/gemini/mpich2-cray/73/include + + +make + + diff --git a/utils/remake_makefiles.sh b/utils/remake_makefiles.sh new file mode 100755 index 000000000..06529e545 --- /dev/null +++ b/utils/remake_makefiles.sh @@ -0,0 +1,12 @@ +#!/bin/bash +rm -r autom4te.cache +aclocal -I ./m4/ +echo "autoconf!" +autoconf configure.ac > test_conf +if [ $? -eq 0 ] +then + echo "configure!" + chmod +x ./test_conf + # example on guinan, which has "normal" defaults for a workstation or server + ./test_conf MPIFC=mpif90 FC=mpif90 CUDA_LIB="-L/usr/local/cuda/lib64/" MPI_INC="-I/usr/include/mpich2/" --with-cuda CUDA_INC="-I/usr/local/cuda/include" +fi diff --git a/utils/update_headers_change_word_f90.pl b/utils/update_headers_change_word_f90.pl new file mode 100755 index 000000000..90159f7ff --- /dev/null +++ b/utils/update_headers_change_word_f90.pl @@ -0,0 +1,114 @@ +#!/usr/bin/perl + +# +# Script to change the version number in f90 codes +# +# Author : Dimitri Komatitsch, EPS - Harvard University, May 1998 +# + +# +# read all f90 and F90 (and *.h) files in the current directory +# f90 files are supposed to have the extension "*.f90" or "*.F90" or "*.h" +# + +# +# known bug : does the changes also in constant strings, and not only +# in the code (which is dangerous, but really easier to program...) +# + +# +# usage: ./update_headers_change_word_f90.pl +# run in directory root SPECFEM3D/ +# + + +@objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu`; + +$nlines_total = 0; +$nlines_noblank = 0; +$nlines_nocomment = 0; + +foreach $name (@objects) { + chop $name; + + + # change tabs to white spaces + if( 1 == 1 ){ + system("expand -2 < $name > _____tutu01_____"); + $f90name = $name; + print STDOUT "Changing word in file $name ...\n"; + + open(FILEF77,"<_____tutu01_____"); + open(FILEF90,">$f90name"); + + # open the source file + while($line = ) { + chop $line; + + # suppress trailing white spaces and carriage return + $line =~ s/\s*$//; + + # change the version number and copyright information + # $line =~ s#\(c\) California Institute of Technology and University of Pau, October 2007#\(c\) California Institute of Technology and University of Pau, November 2007#og; + # $line =~ s#rmass_sigma#rmass_time_integral_of_sigma#og; + + # write the modified line to the output file + print FILEF90 "$line\n"; + + } + + close(FILEF77); + close(FILEF90); + } + + # line count + if( 1 == 0 ){ + print STDOUT "file $name : \n"; + + # counts all lines in file + $l = `wc -l $name | awk '{print \$1}'`; + chomp $l; + print " lines = $l \n"; + + # without blank lines + $lb = 0; + # without comments + $lc = 0; + system("expand -2 < $name > _____tutu01_____"); + open(FILEF77,"<_____tutu01_____"); + # open the source file + while($line = ) { + chop $line; + chomp $line; + # remove whitespace at start + $line =~ s/^\s+//; + # remove whitespace at end + $line =~ s/\s+$//; + # write the modified line to the output file + if( $line ne ""){ $lb = $lb + 1; } + if( ($line ne "") && (substr($line,0,1) ne "!") && (substr($line,0,1) ne "/") ){ $lc = $lc + 1; } + } + close(FILEF77); + print " lines (no blank) = $lb \n"; + print " lines (no comment) = $lc \n"; + + # summations + $nlines_total = $nlines_total + $l; + $nlines_noblank = $nlines_noblank + $lb; + $nlines_nocomment = $nlines_nocomment + $lc; + print " total = $nlines_total \n"; + print " total (no blank) = $nlines_noblank \n"; + print " total (no comment) = $nlines_nocomment \n"; + } +} + +#line count output +if( 1 == 0 ){ + print "\ntotal number of lines: \n"; + print " lines = $nlines_total \n"; + print " lines (no blank) = $nlines_noblank \n"; + print " lines (no comment) = $nlines_nocomment \n\n"; +} + +system("rm -f _____tutu01_____"); +