diff --git a/benchmark/advection_basic_1d.jl b/benchmark/advection_basic_1d.jl
index aa4c0378..3b84208f 100644
--- a/benchmark/advection_basic_1d.jl
+++ b/benchmark/advection_basic_1d.jl
@@ -10,6 +10,7 @@ advection_velocity = 1.0f0
 equations = LinearScalarAdvectionEquation1D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = -1.0f0
 coordinates_max = 1.0f0
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/advection_basic_2d.jl b/benchmark/advection_basic_2d.jl
index 0b1f9e7e..9de384a9 100644
--- a/benchmark/advection_basic_2d.jl
+++ b/benchmark/advection_basic_2d.jl
@@ -10,6 +10,7 @@ advection_velocity = (0.2f0, -0.7f0)
 equations = LinearScalarAdvectionEquation2D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = (-1.0f0, -1.0f0)
 coordinates_max = (1.0f0, 1.0f0)
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/advection_basic_3d.jl b/benchmark/advection_basic_3d.jl
index d276118e..ed4dfc16 100644
--- a/benchmark/advection_basic_3d.jl
+++ b/benchmark/advection_basic_3d.jl
@@ -10,6 +10,7 @@ advection_velocity = (0.2f0, -0.7f0, 0.5f0)
 equations = LinearScalarAdvectionEquation3D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = (-1.0f0, -1.0f0, -1.0f0)
 coordinates_max = (1.0f0, 1.0f0, 1.0f0)
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/euler_ec_1d.jl b/benchmark/euler_ec_1d.jl
index 35bdfb80..6606a89a 100644
--- a/benchmark/euler_ec_1d.jl
+++ b/benchmark/euler_ec_1d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0,)
 coordinates_max = (2.0f0,)
@@ -25,7 +28,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan_gpu = (0.0f0, 0.4f0)
 t_gpu = 0.0f0
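A minimal, self-contained sketch of the CPU/GPU pairing these scripts now follow (assuming the TrixiCUDA exports used in this diff, DGSEMGPU and SemidiscretizationHyperbolicGPU; mesh values are illustrative):

using Trixi, TrixiCUDA

RealT = Float32
equations = LinearScalarAdvectionEquation1D(1.0f0)

# CPU solver and its GPU counterpart, built with identical parameters
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)

mesh = TreeMesh(-1.0f0, 1.0f0, initial_refinement_level = 4,
                n_cells_max = 30_000, RealT = RealT)

# Each solver feeds the matching semidiscretization constructor
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, solver)
semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition_convergence_test, solver_gpu)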
diff --git a/benchmark/euler_ec_2d.jl b/benchmark/euler_ec_2d.jl
index 95f7b245..a8123c89 100644
--- a/benchmark/euler_ec_2d.jl
+++ b/benchmark/euler_ec_2d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0, -2.0f0)
 coordinates_max = (2.0f0, 2.0f0)
@@ -27,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
                                           boundary_conditions = boundary_condition_periodic)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver,
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu,
                                                       boundary_conditions = boundary_condition_periodic)
 
 tspan_gpu = (0.0f0, 0.4f0)
diff --git a/benchmark/euler_ec_3d.jl b/benchmark/euler_ec_3d.jl
index 17ba65a2..f315a29e 100644
--- a/benchmark/euler_ec_3d.jl
+++ b/benchmark/euler_ec_3d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 2, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 2, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0, -2.0f0, -2.0f0)
 coordinates_max = (2.0f0, 2.0f0, 2.0f0)
@@ -25,7 +28,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan_gpu = (0.0f0, 0.4f0)
 t_gpu = 0.0f0
diff --git a/benchmark/euler_shock_1d.jl b/benchmark/euler_shock_1d.jl
index ad1a4e72..cdc2dbbb 100644
--- a/benchmark/euler_shock_1d.jl
+++ b/benchmark/euler_shock_1d.jl
@@ -12,7 +12,10 @@ initial_condition = initial_condition_weak_blast_wave
 
 surface_flux = flux_lax_friedrichs
 volume_flux = flux_shima_etal
+
 basis = LobattoLegendreBasis(RealT, 3)
+basis_gpu = LobattoLegendreBasisGPU(3, RealT)
+
 indicator_sc = IndicatorHennemannGassner(equations, basis,
                                          alpha_max = 0.5f0,
                                          alpha_min = 0.001f0,
@@ -21,7 +24,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis,
 volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
                                                  volume_flux_dg = volume_flux,
                                                  volume_flux_fv = surface_flux)
+
 solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
 
 coordinates_min = -2.0f0
 coordinates_max = 2.0f0
@@ -33,7 +38,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan = tspan_gpu = (0.0f0, 0.4f0)
 t = t_gpu = 0.0f0
@@ -63,74 +68,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach
 
 # Reset du and volume integral
 @info "Time for reset_du! and volume_integral! on GPU"
-CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
-                                           Trixi.have_nonconservative_terms(equations_gpu),
-                                           equations_gpu, solver_gpu.volume_integral, solver_gpu,
-                                           cache_gpu)
-@info "Time for reset_du! and volume_integral! on CPU"
-@time begin
-    Trixi.reset_du!(du, solver, cache)
-    Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
-                                equations, solver.volume_integral, solver, cache)
-end
-
-# Prolong to interfaces
-@info "Time for prolong2interfaces! on GPU"
-CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
-@info "Time for prolong2interfaces! on CPU"
-@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
-
-# Interface flux
-@info "Time for interface_flux! on GPU"
-CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu,
-                                          Trixi.have_nonconservative_terms(equations_gpu),
-                                          equations_gpu, solver_gpu, cache_gpu)
-@info "Time for interface_flux! on CPU"
-@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
-                                 Trixi.have_nonconservative_terms(equations), equations,
-                                 solver.surface_integral, solver, cache)
-
-# Prolong to boundaries
-@info "Time for prolong2boundaries! on GPU"
-CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
-                                              equations_gpu, cache_gpu)
-@info "Time for prolong2boundaries! on CPU"
-@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
-
-# Boundary flux
-@info "Time for boundary_flux! on GPU"
-CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
-                                         Trixi.have_nonconservative_terms(equations_gpu),
-                                         equations_gpu, solver_gpu, cache_gpu)
-@info "Time for boundary_flux! on CPU"
-@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
-                                solver.surface_integral, solver)
-
-# Surface integral
-@info "Time for surface_integral! on GPU"
-CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
-@info "Time for surface_integral! on CPU"
-@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral,
-                                   solver, cache)
-
-# Jacobian
-@info "Time for jacobian! on GPU"
-CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu)
-@info "Time for jacobian! on CPU"
-@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache)
-
-# Sources terms
-@info "Time for sources! on GPU"
-CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu,
-                                   equations_gpu, cache_gpu)
-@info "Time for sources! on CPU"
-@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache)
-
-# Semidiscretization process
-@info "Time for rhs! on GPU"
-CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu,
-                              boundary_conditions_gpu, source_terms_gpu,
-                              solver_gpu, cache_gpu)
-@info "Time for rhs! on CPU"
-@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms,
-                 solver, cache)
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
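The switch from `CUDA.@time` to `@benchmark CUDA.@sync` above matters because CUDA kernel launches are asynchronous: timing the bare call mostly measures launch overhead, while `CUDA.@sync` blocks until the device finishes, and BenchmarkTools then collects proper statistics instead of the single noisy sample `CUDA.@time` gives. A small stand-alone illustration (array and operation are placeholders):

using BenchmarkTools, CUDA

x = CUDA.rand(Float32, 10^6)

@benchmark $x .* 2              # mostly times the asynchronous launch
@benchmark CUDA.@sync $x .* 2   # times the actual GPU work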
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/euler_shock_2d.jl b/benchmark/euler_shock_2d.jl index c05f4a0d..ff5fc51c 100644 --- a/benchmark/euler_shock_2d.jl +++ b/benchmark/euler_shock_2d.jl @@ -12,7 +12,10 @@ initial_condition = initial_condition_weak_blast_wave surface_flux = flux_lax_friedrichs volume_flux = flux_shima_etal + basis = LobattoLegendreBasis(RealT, 3) +basis_gpu = LobattoLegendreBasisGPU(3, RealT) + indicator_sc = IndicatorHennemannGassner(equations, basis, alpha_max = 0.5f0, alpha_min = 0.001f0, @@ -21,7 +24,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis, volume_integral = VolumeIntegralShockCapturingHG(indicator_sc; volume_flux_dg = volume_flux, volume_flux_fv = surface_flux) + solver = DGSEM(basis, surface_flux, volume_integral) +solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral) coordinates_min = (-2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0) @@ -33,7 +38,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan = tspan_gpu = (0.0f0, 1.0f0) t = t_gpu = 0.0f0 @@ -63,93 +68,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach # Reset du and volume integral @info "Time for reset_du! and volume_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -@info "Time for reset_du! and volume_integral! on CPU" -@time begin - Trixi.reset_du!(du, solver, cache) - Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -end - -# Prolong to interfaces -@info "Time for prolong2interfaces! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for prolong2interfaces! on CPU" -@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Interface flux -@info "Time for interface_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for interface_flux! on CPU" -@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) - -# Prolong to boundaries -@info "Time for prolong2boundaries! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -@info "Time for prolong2boundaries! on CPU" -@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Boundary flux -@info "Time for boundary_flux! 
on GPU" -CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for boundary_flux! on CPU" -@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) - -# Prolong to mortars -@info "Time for prolong2mortars! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) -@info "Time for prolong2mortars! on CPU" -@time Trixi.prolong2mortars!(cache, u, mesh, equations, - solver.mortar, solver.surface_integral, solver) - -# Mortar flux -@info "Time for mortar_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for mortar_flux! on CPU" -@time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.mortar, solver.surface_integral, solver, cache) - -# Surface integral -@info "Time for surface_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -@info "Time for surface_integral! on CPU" -@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, - solver, cache) - -# Jacobian -@info "Time for jacobian! on GPU" -CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for jacobian! on CPU" -@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) - -# Sources terms -@info "Time for sources! on GPU" -CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, - equations_gpu, cache_gpu) -@info "Time for sources! on CPU" -@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) - -# Semidiscretization process -@info "Time for rhs! on GPU" -CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, - boundary_conditions_gpu, source_terms_gpu, - solver_gpu, cache_gpu) -@info "Time for rhs! 
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/euler_shock_3d.jl b/benchmark/euler_shock_3d.jl index d23ffb71..5591653d 100644 --- a/benchmark/euler_shock_3d.jl +++ b/benchmark/euler_shock_3d.jl @@ -15,6 +15,8 @@ volume_flux = flux_ranocha polydeg = 3 basis = LobattoLegendreBasis(RealT, polydeg) +basis_gpu = LobattoLegendreBasisGPU(polydeg, RealT) + indicator_sc = IndicatorHennemannGassner(equations, basis, alpha_max = 0.5f0, alpha_min = 0.001f0, @@ -23,7 +25,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis, volume_integral = VolumeIntegralShockCapturingHG(indicator_sc; volume_flux_dg = volume_flux, volume_flux_fv = surface_flux) + solver = DGSEM(basis, surface_flux, volume_integral) +solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral) coordinates_min = (-2.0f0, -2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0, 2.0f0) @@ -36,7 +40,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, source_terms = source_terms_convergence_test) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu, source_terms = source_terms_convergence_test) tspan = tspan_gpu = (0.0f0, 0.4f0) @@ -67,93 +71,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach # Reset du and volume integral @info "Time for reset_du! and volume_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -@info "Time for reset_du! and volume_integral! on CPU" -@time begin - Trixi.reset_du!(du, solver, cache) - Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -end - -# Prolong to interfaces -@info "Time for prolong2interfaces! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for prolong2interfaces! on CPU" -@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Interface flux -@info "Time for interface_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for interface_flux! on CPU" -@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) - -# Prolong to boundaries -@info "Time for prolong2boundaries! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -@info "Time for prolong2boundaries! on CPU" -@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Boundary flux -@info "Time for boundary_flux! 
on GPU" -CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for boundary_flux! on CPU" -@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) - -# Prolong to mortars -@info "Time for prolong2mortars! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) -@info "Time for prolong2mortars! on CPU" -@time Trixi.prolong2mortars!(cache, u, mesh, equations, - solver.mortar, solver.surface_integral, solver) - -# Mortar flux -@info "Time for mortar_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for mortar_flux! on CPU" -@time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.mortar, solver.surface_integral, solver, cache) - -# Surface integral -@info "Time for surface_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -@info "Time for surface_integral! on CPU" -@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, - solver, cache) - -# Jacobian -@info "Time for jacobian! on GPU" -CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for jacobian! on CPU" -@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) - -# Sources terms -@info "Time for sources! on GPU" -CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, - equations_gpu, cache_gpu) -@info "Time for sources! on CPU" -@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) - -# Semidiscretization process -@info "Time for rhs! on GPU" -CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, - boundary_conditions_gpu, source_terms_gpu, - solver_gpu, cache_gpu) -@info "Time for rhs! 
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/mhd_ec_1d.jl b/benchmark/mhd_ec_1d.jl index d9eb3411..6ef7c90d 100644 --- a/benchmark/mhd_ec_1d.jl +++ b/benchmark/mhd_ec_1d.jl @@ -15,6 +15,9 @@ volume_flux = flux_hindenlang_gassner solver = DGSEM(polydeg = 3, surface_flux = flux_hindenlang_gassner, volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_hindenlang_gassner, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = 0.0f0 coordinates_max = 1.0f0 @@ -26,7 +29,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 diff --git a/benchmark/mhd_ec_2d.jl b/benchmark/mhd_ec_2d.jl index 7d2ea8bd..6952432d 100644 --- a/benchmark/mhd_ec_2d.jl +++ b/benchmark/mhd_ec_2d.jl @@ -15,6 +15,10 @@ solver = DGSEM(polydeg = 3, surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, + surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = (-2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0) @@ -26,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 diff --git a/benchmark/mhd_ec_3d.jl b/benchmark/mhd_ec_3d.jl index 68195a34..72eb66a4 100644 --- a/benchmark/mhd_ec_3d.jl +++ b/benchmark/mhd_ec_3d.jl @@ -15,6 +15,10 @@ solver = DGSEM(polydeg = 3, surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, + surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = (-2.0f0, -2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0, 2.0f0) @@ -26,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 
diff --git a/benchmark/mhd_shock_2d.jl b/benchmark/mhd_shock_2d.jl
new file mode 100644
index 00000000..b0079a30
--- /dev/null
+++ b/benchmark/mhd_shock_2d.jl
@@ -0,0 +1,74 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = IdealGlmMhdEquations2D(1.4f0)
+
+initial_condition = initial_condition_weak_blast_wave
+
+surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = (-2.0f0, -2.0f0)
+coordinates_max = (2.0f0, 2.0f0)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000, RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
+
+tspan = tspan_gpu = (0.0f0, 1.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/benchmark/mhd_shock_3d.jl b/benchmark/mhd_shock_3d.jl
new file mode 100644
index 00000000..862d1454
--- /dev/null
+++ b/benchmark/mhd_shock_3d.jl
@@ -0,0 +1,74 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = IdealGlmMhdEquations3D(1.4f0)
+
+initial_condition = initial_condition_weak_blast_wave
+
+surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = (-2.0f0, -2.0f0, -2.0f0)
+coordinates_max = (2.0f0, 2.0f0, 2.0f0)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000, RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
+
+tspan = tspan_gpu = (0.0f0, 1.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/benchmark/shallowwater_shock_1d.jl b/benchmark/shallowwater_shock_1d.jl
new file mode 100644
index 00000000..9da3c899
--- /dev/null
+++ b/benchmark/shallowwater_shock_1d.jl
@@ -0,0 +1,106 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = ShallowWaterEquations1D(gravity_constant = 9.812f0, H0 = 1.75f0)
+
+function initial_condition_stone_throw_discontinuous_bottom(x, t,
+                                                            equations::ShallowWaterEquations1D)
+    # Flat lake
+    RealT = eltype(x)
+    H = equations.H0
+
+    # Discontinuous velocity
+    v = zero(RealT)
+    if x[1] >= -0.75f0 && x[1] <= 0
+        v = -one(RealT)
+    elseif x[1] >= 0 && x[1] <= 0.75f0
+        v = one(RealT)
+    end
+
+    b = (1.5f0 / exp(0.5f0 * ((x[1] - 1)^2)) +
+         0.75f0 / exp(0.5f0 * ((x[1] + 1)^2)))
+
+    # Force a discontinuous bottom topography
+    if x[1] >= -1.5f0 && x[1] <= 0
+        b = RealT(0.5f0)
+    end
+
+    return prim2cons(SVector(H, v, b), equations)
+end
+
+initial_condition = initial_condition_stone_throw_discontinuous_bottom
+
+boundary_condition = boundary_condition_slip_wall
+
+volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+surface_flux = (FluxHydrostaticReconstruction(flux_lax_friedrichs,
+                                              hydrostatic_reconstruction_audusse_etal),
+                flux_nonconservative_audusse_etal)
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = waterheight_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = -3.0f0
+coordinates_max = 3.0f0
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000,
+                periodicity = false,
+                RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                          boundary_conditions = boundary_condition)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu,
+                                                      boundary_conditions = boundary_condition)
+
+tspan = tspan_gpu = (0.0f0, 3.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/src/solvers/common.jl b/src/solvers/common.jl
index a7205cb8..9cd3c992 100644
--- a/src/solvers/common.jl
+++ b/src/solvers/common.jl
@@ -24,21 +24,3 @@ function last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction)
 
     return nothing
 end
-
-# Kernel for counting elements for DG-only and blended DG-FV volume integral
-# Maybe it is better to be moved to CPU
-function pure_blended_element_count_kernel!(element_ids_dg, element_ids_dgfv, alpha, atol)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    if (i <= length(alpha))
-        @inbounds dg_only = isapprox(alpha[i], 0, atol = atol)
-
-        if dg_only # bad
-            @inbounds element_ids_dg[i] = i
-        else
-            @inbounds element_ids_dgfv[i] = i
-        end
-    end
-
-    return nothing
-end
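The kernel removed above kept explicit element-ID lists and made every downstream kernel branch on `element_dg != 0` / `element_dgfv != 0`. The dg_1d.jl and dg_2d.jl changes below drop the lists and instead blend both code paths arithmetically with a per-element flag, so threads in a warp execute the same instructions. The idea in isolation (scalar names are hypothetical):

alpha_k = 0.3f0   # hypothetical blending coefficient of element k
atol = 1.0f-5     # hypothetical tolerance
dg_term = 1.0f0   # hypothetical DG volume-integral contribution

dg_only = isapprox(alpha_k, 0, atol = atol)         # Bool: promotes to 0/1 in arithmetic
du_update = dg_term * dg_only +                     # pure-DG element: full DG term
            (1 - alpha_k) * dg_term * (1 - dg_only) # blended element: scaled DG term
# The FV part is then added with weight alpha_k and the same (1 - dg_only) mask.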
diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index 4fd172e3..99c2fb9c 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -39,7 +39,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr)
 end
 
 ############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals in weak form
 function flux_weak_form_kernel!(du, u, derivative_dhat,
                                 equations::AbstractEquations{1}, flux::Any)
     # Set tile width
@@ -129,7 +129,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr,
 end
 
 ############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
 function volume_flux_integral_kernel!(du, u, derivative_split,
                                       equations::AbstractEquations{1}, volume_flux::Any)
     # Set tile width
@@ -237,8 +237,7 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, nonco
 end
 
 ############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
 function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                               equations::AbstractEquations{1},
                                               symmetric_flux::Any, nonconservative_flux::Any)
@@ -312,21 +311,16 @@
 end
 
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
-                                  element_ids_dgfv, equations::AbstractEquations{1},
+                                  alpha, atol, equations::AbstractEquations{1},
                                   volume_flux_dg::Any, volume_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^2 && k <= size(u, 3))
-        # length(element_ids_dgfv) == size(u, 3)
-
         j1 = div(j - 1, size(u, 2)) + 1
         j2 = rem(j - 1, size(u, 2)) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)
 
         u_node = get_node_vars(u, equations, j1, k)
         u_node1 = get_node_vars(u, equations, j2, k)
@@ -335,58 +329,62 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
 
         for ii in axes(u, 1)
             @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii]
-        end
 
-        if j1 != 1 && j2 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, element_dgfv)
-            flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j2) # avoid race condition
+                flux_fv_node = volume_flux_fv(u_node, u_node1, 1, equations)
 
-            for ii in axes(u, 1)
                 @inbounds begin
-                    fstar1_L[ii, j1, element_dgfv] = flux_fv_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = flux_fv_node[ii]
+                    fstar1_L[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only)
                 end
             end
         end
+
+        # if j1 != 1 && j2 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, k)
+        #     flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
     end
 
     return nothing
 end
 
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr, equations::AbstractEquations{1})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr, fstar1_L, fstar1_R, atol,
+                                      equations::AbstractEquations{1})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dg) == size(du, 3)
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
-
         @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
             alpha_element = alpha[k]
         end
 
-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds du[i, j, element_dg] += derivative_split[j, ii] *
-                                                  (1 - isequal(j, ii)) * # set diagonal elements to zeros
-                                                  volume_flux_arr[i, j, ii, element_dg]
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
 
-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds du[i, j, element_dgfv] += (1 - alpha_element) * derivative_split[j, ii] *
-                                                    (1 - isequal(j, ii)) * # set diagonal elements to zeros
-                                                    volume_flux_arr[i, j, ii, element_dgfv]
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j, k] += derivative_split[j, ii] *
+                                     (1 - isequal(j, ii)) * # set diagonal elements to zeros
+                                     volume_flux_arr[i, j, ii, k] * dg_only +
+                                     (1 - alpha_element) * derivative_split[j, ii] *
+                                     (1 - isequal(j, ii)) * # set diagonal elements to zeros
+                                     volume_flux_arr[i, j, ii, k] * (1 - dg_only)
         end
+
+        @inbounds du[i, j, k] += alpha_element * inverse_weights[j] *
+                                 (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only)
     end
 
     return nothing
 end
@@ -394,29 +392,24 @@
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
-                                  element_ids_dgfv, derivative_split,
+                                  alpha, atol, derivative_split,
                                   equations::AbstractEquations{1},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^2 && k <= size(u, 3))
-        # length(element_ids_dgfv) == size(u, 3)
-
         j1 = div(j - 1, size(u, 2)) + 1
         j2 = rem(j - 1, size(u, 2)) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)
 
         u_node = get_node_vars(u, equations, j1, k)
         u_node1 = get_node_vars(u, equations, j2, k)
 
         volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node = nonconservative_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node = noncons_flux_dg(u_node, u_node1, 1, equations)
 
         for ii in axes(u, 1)
             @inbounds begin
@@ -424,106 +417,75 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, f
                 (1 - isequal(j1, j2)) # set diagonal elements to zeros
                 noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii]
             end
-        end
 
-        if j1 != 1 && j2 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, element_dgfv)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j2) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)
 
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
-
-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
-
-            for ii in axes(u, 1)
                 @inbounds begin
-                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only)
+                    fstar1_R[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only)
                 end
             end
         end
+
+        # if j1 != 1 && j2 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, k)
+
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only)
+        #             fstar1_R[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only)
+        #         end
+        #     end
+        # end
     end
 
     return nothing
 end
 
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr, noncons_flux_arr,
-                                    equations::AbstractEquations{1})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R,
+                                      atol, equations::AbstractEquations{1})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dg) == size(du, 3)
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
-
         @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end
 
-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        dg_only = isapprox(alpha_element, 0, atol = atol)
 
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j, element_dg] += volume_flux_arr[i, j, ii, element_dg]
-                    integral_contribution += derivative_split[j, ii] *
-                                             noncons_flux_arr[i, j, ii, element_dg]
-                end
-            end
-
-            @inbounds du[i, j, element_dg] += 0.5f0 * integral_contribution
+        for ii in axes(du, 2)
+            @inbounds du[i, j, k] += (volume_flux_arr[i, j, ii, k] +
+                                      0.5f0 *
+                                      derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * dg_only +
+                                     ((1 - alpha_element) * volume_flux_arr[i, j, ii, k] +
+                                      0.5f0 * (1 - alpha_element) *
+                                      derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * (1 - dg_only)
        end
 
-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
-
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j, element_dgfv] += (1 - alpha_element) *
-                                              volume_flux_arr[i, j, ii, element_dgfv]
-                    integral_contribution += derivative_split[j, ii] *
-                                             noncons_flux_arr[i, j, ii, element_dgfv]
-                end
-            end
-
-            @inbounds du[i, j, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        @inbounds du[i, j, k] += alpha_element * inverse_weights[j] *
+                                 (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only)
    end
 
    return nothing
 end
 
-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv,
-                                    alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-
-    if (j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
-
-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j, element_dgfv] += alpha_element * inverse_weights[j] *
-                                                     (fstar1_L[ii, j + 1, element_dgfv] -
-                                                      fstar1_R[ii, j, element_dgfv])
-            end
-        end
-    end
-
-    return nothing
-end
+############################################################################## New optimization
+# Kernel for calculating pure DG and DG-FV volume integrals without nonconservative terms
 
 # Kernel for prolonging two interfaces
 function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids)
@@ -913,60 +875,39 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                      dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
 
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
     alpha = CuArray(alpha)
+
     atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights
     volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-
     volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr,
                                                                           fstar1_L, fstar1_R, u,
-                                                                          element_ids_dgfv,
+                                                                          alpha, atol,
                                                                           equations, volume_flux_dg,
                                                                           volume_flux_fv)
-    volume_flux_dgfv_kernel(volume_flux_arr, fstar1_L, fstar1_R, u, element_ids_dgfv, equations,
+    volume_flux_dgfv_kernel(volume_flux_arr, fstar1_L, fstar1_R, u, alpha, atol, equations,
                             volume_flux_dg, volume_flux_fv;
                             kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2,
                                                    size(u, 3))...)
 
-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                              element_ids_dgfv,
-                                                                              alpha,
-                                                                              derivative_split,
-                                                                              volume_flux_arr,
-                                                                              equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                              fstar1_R,
-                                                                              inverse_weights,
-                                                                              element_ids_dgfv,
-                                                                              alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2),
-                                                     size(u, 3))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr,
+                                fstar1_L, fstar1_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du)...)...)
 
     return nothing
 end
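For scale, the tolerance used above for the `dg_only` test evaluates to roughly 1.2e-5 in Float32 and 1.8e-12 in Float64 (values approximate):

atol32 = max(100 * eps(Float32), eps(Float32)^convert(Float32, 0.75f0)) # ≈ 1.2f-5
atol64 = max(100 * eps(Float64), eps(Float64)^convert(Float64, 0.75))   # ≈ 1.8e-12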
@@ -976,71 +917,50 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
     RealT = eltype(du)
 
-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
 
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
     alpha = CuArray(alpha)
+
     atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights
     volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
     noncons_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-
     volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr,
                                                                           noncons_flux_arr,
                                                                           fstar1_L, fstar1_R, u,
-                                                                          element_ids_dgfv,
+                                                                          alpha, atol,
                                                                           derivative_split,
                                                                           equations, volume_flux_dg,
-                                                                          nonconservative_flux_dg,
+                                                                          noncons_flux_dg,
                                                                           volume_flux_fv,
-                                                                          nonconservative_flux_fv)
+                                                                          noncons_flux_fv)
     volume_flux_dgfv_kernel(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
-                            element_ids_dgfv, derivative_split, equations, volume_flux_dg,
-                            nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv;
+                            alpha, atol, derivative_split, equations, volume_flux_dg,
+                            noncons_flux_dg, volume_flux_fv, noncons_flux_fv;
                             kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2,
                                                    size(u, 3))...)
 
-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                              element_ids_dgfv,
-                                                                              alpha,
-                                                                              derivative_split,
-                                                                              volume_flux_arr,
-                                                                              noncons_flux_arr,
-                                                                              equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr, noncons_flux_arr, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                              fstar1_R,
-                                                                              inverse_weights,
-                                                                              element_ids_dgfv,
-                                                                              alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2),
-                                                     size(u, 3))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr,
+                                                                                  noncons_flux_arr,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr,
+                                noncons_flux_arr, fstar1_L, fstar1_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du)...)...)
 
     return nothing
 end
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index e01ac595..eb994cd2 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -52,7 +52,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
 end
 
 ############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals in weak form
 function flux_weak_form_kernel!(du, u, derivative_dhat,
                                 equations::AbstractEquations{2}, flux::Any)
     # Set tile width
@@ -162,7 +162,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
 end
 
 ############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
 function volume_flux_integral_kernel!(du, u, derivative_split,
                                       equations::AbstractEquations{2}, volume_flux::Any)
     # Set tile width
@@ -295,8 +295,7 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symm
 end
 
 ############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
 function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                               equations::AbstractEquations{2},
                                               symmetric_flux::Any, nonconservative_flux::Any)
@@ -378,24 +377,20 @@
 end
 
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R,
-                                  fstar2_L, fstar2_R, u, element_ids_dgfv,
-                                  equations::AbstractEquations{2}, volume_flux_dg::Any,
-                                  volume_flux_fv::Any)
+                                  fstar2_L, fstar2_R, u, alpha, atol,
+                                  equations::AbstractEquations{2},
+                                  volume_flux_dg::Any, volume_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^3 && k <= size(u, 4))
-        # length(element_ids_dgfv) == size(u, 4)
-
         u2 = size(u, 2)
 
         j1 = div(j - 1, u2^2) + 1
         j2 = div(rem(j - 1, u2^2), u2) + 1
         j3 = rem(rem(j - 1, u2^2), u2) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, k)
        u_node1 = get_node_vars(u, equations, j3, j2, k)
@@ -409,86 +404,97 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L,
                volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii]
                volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii]
            end
-        end

-        if j1 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-            flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j3) # avoid race condition
+                flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, element_dgfv] = flux_fv_node1[ii]
-                    fstar1_R[ii, j1, j2, element_dgfv] = flux_fv_node1[ii]
+                    fstar1_L[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+                    fstar1_R[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
                end
            end
-        end

-        if j2 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-            flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j3) # avoid race condition
+                flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, element_dgfv] = flux_fv_node2[ii]
-                    fstar2_R[ii, j1, j2, element_dgfv] = flux_fv_node2[ii]
+                    fstar2_L[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
                end
            end
        end
+
+        # if j1 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)
+        #     flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j2 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)
+        #     flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2,
-                                    equations::AbstractEquations{2})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                      equations::AbstractEquations{2})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dg) == size(du, 4)
-        # length(element_ids_dgfv) == size(du, 4)
-
        j1 = div(j - 1, size(du, 2)) + 1
        j2 = rem(j - 1, size(du, 2)) + 1

-        @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
-
        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dg] += derivative_split[j1, ii] *
-                                                 (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                 volume_flux_arr1[i, j1, ii, j2, element_dg]
-                    du[i, j1, j2, element_dg] += derivative_split[j2, ii] *
-                                                 (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                 volume_flux_arr2[i, j1, j2, ii, element_dg]
-                end
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) * derivative_split[j1, ii] *
-                                                   (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                   volume_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) * derivative_split[j2, ii] *
-                                                   (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                   volume_flux_arr2[i, j1, j2, ii, element_dgfv]
-                end
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, k] += (derivative_split[j1, ii] *
+                                           (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr1[i, j1, ii, j2, k] +
+                                           derivative_split[j2, ii] *
+                                           (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr2[i, j1, j2, ii, k]) * dg_only +
+                                          ((1 - alpha_element) * derivative_split[j1, ii] *
+                                           (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr1[i, j1, ii, j2, k] +
+                                           (1 - alpha_element) * derivative_split[j2, ii] *
+                                           (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr2[i, j1, j2, ii, k]) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, k] += alpha_element *
+                                      (inverse_weights[j1] *
+                                       (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) +
+                                       inverse_weights[j2] *
+                                       (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only)
    end

    return nothing
@@ -497,25 +503,21 @@ end
# Kernel for calculating pure DG and DG-FV volume fluxes
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1,
                                  noncons_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                  u, element_ids_dgfv, derivative_split,
+                                  u, alpha, atol, derivative_split,
                                  equations::AbstractEquations{2},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^3 && k <= size(u, 4))
-        # length(element_ids_dgfv) == size(u, 4)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^2) + 1
        j2 = div(rem(j - 1, u2^2), u2) + 1
        j3 = rem(rem(j - 1, u2^2), u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
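+        # Same pure-DG mask as in the conservative kernel: the `dg_only`/`1 - dg_only`
+        # factors replace the former `element_ids_dg`/`element_ids_dgfv` branching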
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, k)
        u_node1 = get_node_vars(u, equations, j3, j2, k)
@@ -524,8 +526,8 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_fl
        volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations)
        volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations)

-        noncons_flux_node1 = nonconservative_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node2 = nonconservative_flux_dg(u_node, u_node2, 2, equations)
+        noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations)

        for ii in axes(u, 1)
            @inbounds begin
@@ -536,144 +538,118 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_fl
                noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii]
                noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii]
            end
-        end
-
-        if j1 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j3) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, j2, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j2 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-
-            f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
-            f2_L_node = nonconservative_flux_fv(u_ll, u_rr, 2, equations)
-            f2_R_node = nonconservative_flux_fv(u_rr, u_ll, 2, equations)
+            if isequal(j2 + 1, j3) # avoid race condition
+                f2_node = volume_flux_fv(u_node, u_node2, 2, equations)
+                f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations)
+                f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, element_dgfv] = f2_node[ii] + 0.5f0 * f2_L_node[ii]
-                    fstar2_R[ii, j1, j2, element_dgfv] = f2_node[ii] + 0.5f0 * f2_R_node[ii]
+                    fstar2_L[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
                end
            end
        end
-    end
-
-    return nothing
-end
-
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2,
-                                    noncons_flux_arr1, noncons_flux_arr2,
-                                    equations::AbstractEquations{2})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dg) == size(du, 4)
-        # length(element_ids_dgfv) == size(du, 4)
-
-        j1 = div(j - 1, size(du, 2)) + 1
-        j2 = rem(j - 1, size(du, 2)) + 1
-
-        @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
-        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
+        # if j1 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)

-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dg] += volume_flux_arr1[i, j1, ii, j2, element_dg]
-                    du[i, j1, j2, element_dg] += volume_flux_arr2[i, j1, j2, ii, element_dg]
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)

-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, element_dg]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, element_dg]
-                end
-            end
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)

-            @inbounds du[i, j1, j2, element_dg] += 0.5f0 * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
+        # if j2 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) *
-                                                   volume_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) *
-                                                   volume_flux_arr2[i, j1, j2, ii, element_dgfv]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, element_dgfv]
-                end
-            end
+        #     f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations)

-            @inbounds du[i, j1, j2, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                    inverse_weights, element_ids_dgfv, alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2,
+                                      noncons_flux_arr1, noncons_flux_arr2,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                      equations::AbstractEquations{2})
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

-    if (j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dgfv) == size(du, 4)
+    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
        j1 = div(j - 1, size(du, 2)) + 1
        j2 = rem(j - 1, size(du, 2)) + 1

        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j1, j2, element_dgfv] += alpha_element * ((inverse_weights[j1] *
-                                                                            (fstar1_L[ii, j1 + 1, j2, element_dgfv] -
-                                                                             fstar1_R[ii, j1, j2, element_dgfv]) +
-                                                                            inverse_weights[j2] *
-                                                                            (fstar2_L[ii, j1, j2 + 1, element_dgfv] -
-                                                                             fstar2_R[ii, j1, j2, element_dgfv])))
-            end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
+
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, k] += (volume_flux_arr1[i, j1, ii, j2, k] +
+                                           volume_flux_arr2[i, j1, j2, ii, k] +
+                                           0.5f0 *
+                                           (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
+                                            derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * dg_only +
+                                          ((1 - alpha_element) * volume_flux_arr1[i, j1, ii, j2, k] +
+                                           (1 - alpha_element) * volume_flux_arr2[i, j1, j2, ii, k] +
+                                           0.5f0 * (1 - alpha_element) *
+                                           (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
+                                            derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, k] += alpha_element *
+                                      (inverse_weights[j1] *
+                                       (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) +
+                                       inverse_weights[j2] *
+                                       (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only)
    end

    return nothing
end

+############################################################################## New optimization
+# Kernel for calculating DG-FV volume integrals without nonconservative terms
+
# Kernel for prolonging two interfaces
function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations,
                                    equations::AbstractEquations{2})
@@ -1390,73 +1366,51 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
    volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                     dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           fstar1_L, fstar1_R,
                                                                           fstar2_L, fstar2_R,
-                                                                           u, element_ids_dgfv,
+                                                                           u, alpha, atol,
                                                                           equations,
                                                                           volume_flux_dg,
                                                                           volume_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, fstar2_L,
-                            fstar2_R, u, element_ids_dgfv, equations, volume_flux_dg,
-                            volume_flux_fv;
+                            fstar2_R, u, alpha, atol, equations, volume_flux_dg, volume_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^3,
                                                   size(u, 4))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^2, size(du, 4))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights,
-                              element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2,
-                                                     size(u, 4))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^2, size(du, 4))...)
    return nothing
end
@@ -1466,30 +1420,23 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
    RealT = eltype(du)

-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -1499,56 +1446,41 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
    noncons_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                       size(u, 4))
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights,
-                              element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2,
-                                                     size(u, 4))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  noncons_flux_arr1,
+                                                                                  noncons_flux_arr2,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, noncons_flux_arr1, noncons_flux_arr2, fstar1_L,
+                                fstar1_R, fstar2_L, fstar2_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^2, size(du, 4))...)

    return nothing
end
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index 3055a878..f8156318 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -62,7 +62,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3)
end

############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals with weak form
function flux_weak_form_kernel!(du, u, derivative_dhat,
                                equations::AbstractEquations{3}, flux::Any)
    # Set tile width
@@ -185,7 +185,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
end

############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
function volume_flux_integral_kernel!(du, u, derivative_split,
                                      equations::AbstractEquations{3}, volume_flux::Any)
    # Set tile width
@@ -337,8 +337,7 @@ function volume_integral_kernel!(du, derivative_split,
end

############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                              equations::AbstractEquations{3},
                                              symmetric_flux::Any, nonconservative_flux::Any)
@@ -431,13 +430,12 @@ end
# Kernel for calculating pure DG and DG-FV volume fluxes
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                                  fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                                  u, element_ids_dgfv, equations::AbstractEquations{3},
+                                  u, alpha, atol, equations::AbstractEquations{3},
                                  volume_flux_dg::Any, volume_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^4 && k <= size(u, 5))
-        # length(element_ids_dgfv) == size(u, 5)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^3) + 1
@@ -445,10 +443,7 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        j3 = div(rem(j - 1, u2^2), u2) + 1
        j4 = rem(j - 1, u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
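+        # As in the 2D kernels, a vanishing blending factor marks this element as
+        # pure DG; the mask replaces the former `element_ids_dgfv` lookup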
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, j3, k)
        u_node1 = get_node_vars(u, equations, j4, j2, j3, k)
@@ -465,106 +460,130 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
                volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii]
                volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii]
            end
-        end

-        if j1 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j4) # avoid race condition
+                flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node1[ii]
-                    fstar1_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node1[ii]
+                    fstar1_L[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+                    fstar1_R[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
                end
            end
-        end

-        if j2 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j4) # avoid race condition
+                flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node2[ii]
-                    fstar2_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node2[ii]
+                    fstar2_L[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
                end
            end
-        end

-        if j3 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations)
+            if isequal(j3 + 1, j4) # avoid race condition
+                flux_fv_node3 = volume_flux_fv(u_node, u_node3, 3, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar3_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node3[ii]
-                    fstar3_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node3[ii]
+                    fstar3_L[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only)
+                    fstar3_R[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only)
                end
            end
        end
+
+        # if j1 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j2 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j3 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar3_L[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only)
+        #             fstar3_R[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                                    equations::AbstractEquations{3})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                                      atol, equations::AbstractEquations{3})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dg) == size(u, 5)
        u2 = size(du, 2) # size(du, 2) == size(u, 2)

        j1 = div(j - 1, u2^2) + 1
        j2 = div(rem(j - 1, u2^2), u2) + 1
        j3 = rem(rem(j - 1, u2^2), u2) + 1

-        @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
-
        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j1, ii] *
-                                                     (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr1[i, j1, ii, j2, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j2, ii] *
-                                                     (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr2[i, j1, j2, ii, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j3, ii] *
-                                                     (1 - isequal(j3, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr3[i, j1, j2, j3, ii, element_dg]
-                end
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j1, ii] *
-                                                       (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr1[i, j1, ii, j2, j3, element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j2, ii] *
-                                                       (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr2[i, j1, j2, ii, j3, element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j3, ii] *
-                                                       (1 - isequal(j3, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr3[i, j1, j2, j3, ii, element_dgfv]
-                end
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, j3, k] += (derivative_split[j1, ii] *
+                                               (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               derivative_split[j2, ii] *
+                                               (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               derivative_split[j3, ii] *
+                                               (1 - isequal(j3, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k]) * dg_only +
+                                              ((1 - alpha_element) * derivative_split[j1, ii] *
+                                               (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               (1 - alpha_element) * derivative_split[j2, ii] *
+                                               (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               (1 - alpha_element) * derivative_split[j3, ii] *
+                                               (1 - isequal(j3, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k]) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, j3, k] += alpha_element *
+                                          (inverse_weights[j1] *
+                                           (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j2] *
+                                           (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j3] *
+                                           (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only)
    end

    return nothing
@@ -574,15 +593,14 @@ end
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                                  noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
                                  fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                                  u, element_ids_dgfv, derivative_split,
+                                  u, alpha, atol, derivative_split,
                                  equations::AbstractEquations{3},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^4 && k <= size(u, 5))
-        # length(element_ids_dgfv) == size(u, 5)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^3) + 1
@@ -590,10 +608,7 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        j3 = div(rem(j - 1, u2^2), u2) + 1
        j4 = rem(j - 1, u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, j3, k)
        u_node1 = get_node_vars(u, equations, j4, j2, j3, k)
@@ -604,9 +619,9 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations)
        volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations)

-        noncons_flux_node1 = nonconservative_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node2 = nonconservative_flux_dg(u_node, u_node2, 2, equations)
-        noncons_flux_node3 = nonconservative_flux_dg(u_node, u_node3, 3, equations)
+        noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations)
+        noncons_flux_node3 = noncons_flux_dg(u_node, u_node3, 3, equations)

        for ii in axes(u, 1)
            @inbounds begin
@@ -621,154 +636,108 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
                noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii]
                noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii]
            end
-        end
-
-        if j1 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j4) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)

-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, j3, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, j2, j3, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j2 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j4) # avoid race condition
+                f2_node = volume_flux_fv(u_node, u_node2, 2, equations)
+                f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations)
+                f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations)

-            f2_L_node = nonconservative_flux_fv(u_ll, u_rr, 2, equations)
-            f2_R_node = nonconservative_flux_fv(u_rr, u_ll, 2, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, j3, element_dgfv] = f2_node[ii] + 0.5f0 * f2_L_node[ii]
-                    fstar2_R[ii, j1, j2, j3, element_dgfv] = f2_node[ii] + 0.5f0 * f2_R_node[ii]
+                    fstar2_L[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j3 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f3_node = volume_flux_fv(u_ll, u_rr, 3, equations)
+            if isequal(j3 + 1, j4) # avoid race condition
+                f3_node = volume_flux_fv(u_node, u_node3, 3, equations)
+                f3_L_node = noncons_flux_fv(u_node, u_node3, 3, equations)
+                f3_R_node = noncons_flux_fv(u_node3, u_node, 3, equations)

-            f3_L_node = nonconservative_flux_fv(u_ll, u_rr, 3, equations)
-            f3_R_node = nonconservative_flux_fv(u_rr, u_ll, 3, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar3_L[ii, j1, j2, j3, element_dgfv] = f3_node[ii] + 0.5f0 * f3_L_node[ii]
-                    fstar3_R[ii, j1, j2, j3, element_dgfv] = f3_node[ii] + 0.5f0 * f3_R_node[ii]
+                    fstar3_L[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only)
+                    fstar3_R[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only)
                end
            end
        end
-    end

-    return nothing
-end
+        # if j1 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                                    noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
-                                    equations::AbstractEquations{3})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)

-    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dg) == size(u, 5)
-        u2 = size(du, 2) # size(du, 2) == size(u, 2)
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)

-        j1 = div(j - 1, u2^2) + 1
-        j2 = div(rem(j - 1, u2^2), u2) + 1
-        j3 = rem(rem(j - 1, u2^2), u2) + 1
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-        @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
+        # if j2 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
+        #     f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)

-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr1[i, j1, ii, j2, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr2[i, j1, j2, ii, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr3[i, j1, j2, j3, ii, element_dg]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, j3,
-                                                               element_dg]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, j3,
-                                                               element_dg]
-                    integral_contribution += derivative_split[j3, ii] *
-                                             noncons_flux_arr3[i, j1, j2, j3, ii,
-                                                               element_dg]
-                end
-            end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-            @inbounds du[i, j1, j2, j3, element_dg] += 0.5f0 * integral_contribution
-        end
+        # if j3 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f3_node = volume_flux_fv(u_ll, u_rr, 3, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr1[i, j1, ii, j2, j3,
-                                                                        element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr2[i, j1, j2, ii, j3,
-                                                                        element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr3[i, j1, j2, j3, ii,
-                                                                        element_dgfv]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, j3,
-                                                               element_dgfv]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, j3,
-                                                               element_dgfv]
-                    integral_contribution += derivative_split[j3, ii] *
-                                             noncons_flux_arr3[i, j1, j2, j3, ii,
-                                                               element_dgfv]
-                end
-            end
+        #     f3_L_node = noncons_flux_fv(u_ll, u_rr, 3, equations)
+        #     f3_R_node = noncons_flux_fv(u_rr, u_ll, 3, equations)

-            @inbounds du[i, j1, j2, j3, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar3_L[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only)
+        #             fstar3_R[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                    fstar3_L, fstar3_R, inverse_weights, element_ids_dgfv, alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
+                                      noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                                      atol, equations::AbstractEquations{3})
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

-    if (j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dgfv) == size(du, 5)
+    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
        u2 = size(du, 2) # size(du, 2) == size(u, 2)

        j1 = div(j - 1, u2^2) + 1
@@ -776,32 +745,47 @@ function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
        j3 = rem(rem(j - 1, u2^2), u2) + 1

        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j1, j2, j3, element_dgfv] += (alpha_element *
-                                                               (inverse_weights[j1] *
-                                                                (fstar1_L[ii, j1 + 1, j2, j3,
-                                                                          element_dgfv] -
-                                                                 fstar1_R[ii, j1, j2, j3, element_dgfv]) +
-                                                                inverse_weights[j2] *
-                                                                (fstar2_L[ii, j1, j2 + 1, j3,
-                                                                          element_dgfv] -
-                                                                 fstar2_R[ii, j1, j2, j3, element_dgfv]) +
-                                                                inverse_weights[j3] *
-                                                                (fstar3_L[ii, j1, j2, j3 + 1,
-                                                                          element_dgfv] -
-                                                                 fstar3_R[ii, j1, j2, j3, element_dgfv])))
-            end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
+
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, j3, k] += (volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k] +
+                                               0.5f0 *
+                                               (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] +
+                                                derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] +
+                                                derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * dg_only +
+                                              ((1 - alpha_element) * volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               (1 - alpha_element) * volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               (1 - alpha_element) * volume_flux_arr3[i, j1, j2, j3, ii, k] +
+                                               0.5f0 * (1 - alpha_element) *
+                                               (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] +
+                                                derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] +
+                                                derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, j3, k] += alpha_element *
+                                          (inverse_weights[j1] *
+                                           (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j2] *
+                                           (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j3] *
+                                           (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only)
    end

    return nothing
end

+############################################################################## New optimization
+# Kernel for calculating DG-FV volume integrals without nonconservative terms
+
# Kernel for prolonging two interfaces
function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations,
                                    equations::AbstractEquations{3})
@@ -1990,27 +1974,22 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                     dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R
+    fstar3_L = cache.fstar3_L
+    fstar3_R = cache.fstar3_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -2018,13 +1997,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    volume_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-    fstar3_L = cache.fstar3_L
-    fstar3_R = cache.fstar3_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           volume_flux_arr3,
@@ -2032,40 +2004,31 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                                                                           fstar1_R, fstar2_L,
                                                                           fstar2_R, fstar3_L,
                                                                           fstar3_R, u,
-                                                                           element_ids_dgfv,
+                                                                           alpha, atol,
                                                                           equations,
                                                                           volume_flux_dg,
                                                                           volume_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, fstar1_L,
-                            fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, element_ids_dgfv,
+                            fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, alpha, atol,
                            equations, volume_flux_dg, volume_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4,
                                                   size(u, 5))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               volume_flux_arr3,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^3, size(du, 5))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               fstar3_L, fstar3_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                              inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3,
-                                                     size(u, 5))...)
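+    # The former volume_integral_dg_kernel!/volume_integral_fv_kernel! launches are
+    # fused into a single volume_integral_dgfv_kernel! launch below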
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  volume_flux_arr3,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  fstar3_L, fstar3_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, volume_flux_arr3, fstar1_L, fstar1_R,
+                                fstar2_L, fstar2_R, fstar3_L, fstar3_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^3, size(du, 5))...)

    return nothing
end
@@ -2075,30 +2038,25 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
    RealT = eltype(du)

-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R
+    fstar3_L = cache.fstar3_L
+    fstar3_R = cache.fstar3_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -2112,13 +2070,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    noncons_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                       size(u, 2), size(u, 5))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-    fstar3_L = cache.fstar3_L
-    fstar3_R = cache.fstar3_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           volume_flux_arr3,
@@ -2128,49 +2079,40 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                                                                           fstar1_L, fstar1_R,
                                                                           fstar2_L, fstar2_R,
                                                                           fstar3_L, fstar3_R,
-                                                                           u, element_ids_dgfv,
+                                                                           u, alpha, atol,
                                                                           derivative_split,
                                                                           equations,
                                                                           volume_flux_dg,
-                                                                           nonconservative_flux_dg,
+                                                                           noncons_flux_dg,
                                                                           volume_flux_fv,
-                                                                           nonconservative_flux_fv)
+                                                                           noncons_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                            noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
                            fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                            u, element_ids_dgfv, derivative_split, equations, volume_flux_dg,
-                            nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv;
+                            u, alpha, atol, derivative_split, equations, volume_flux_dg,
+                            noncons_flux_dg, volume_flux_fv, noncons_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4,
                                                   size(u, 5))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               volume_flux_arr3,
-                                                                               noncons_flux_arr1,
-                                                                               noncons_flux_arr2,
-                                                                               noncons_flux_arr3,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                              noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^3, size(du, 5))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               fstar3_L, fstar3_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                              inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3,
-                                                     size(u, 5))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  volume_flux_arr3,
+                                                                                  noncons_flux_arr1,
+                                                                                  noncons_flux_arr2,
+                                                                                  noncons_flux_arr3,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  fstar3_L, fstar3_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, volume_flux_arr3, noncons_flux_arr1,
+                                noncons_flux_arr2, noncons_flux_arr3, fstar1_L, fstar1_R,
+                                fstar2_L, fstar2_R, fstar3_L, fstar3_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^3, size(du, 5))...)
    return nothing
end
diff --git a/test/tree_dgsem_2d/mhd_shock.jl b/test/tree_dgsem_2d/mhd_shock.jl
index e1a02165..5aa3f75f 100644
--- a/test/tree_dgsem_2d/mhd_shock.jl
+++ b/test/tree_dgsem_2d/mhd_shock.jl
@@ -12,9 +12,10 @@ include("../test_macros.jl")
    surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
    volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)

-    polydeg = 4
-    basis = LobattoLegendreBasis(polydeg)
-    basis_gpu = LobattoLegendreBasisGPU(polydeg)
+
+    basis = LobattoLegendreBasis(4)
+    basis_gpu = LobattoLegendreBasisGPU(4)
+
    indicator_sc = IndicatorHennemannGassner(equations, basis,
                                             alpha_max = 0.5,
                                             alpha_min = 0.001,
@@ -24,10 +25,8 @@ include("../test_macros.jl")
                                                   volume_flux_dg = volume_flux,
                                                   volume_flux_fv = surface_flux)

-    solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux,
-                   volume_integral = volume_integral)
-    solver_gpu = DGSEMGPU(polydeg = polydeg, surface_flux = surface_flux,
-                          volume_integral = volume_integral)
+    solver = DGSEM(basis, surface_flux, volume_integral)
+    solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)

    coordinates_min = (-2.0, -2.0)
    coordinates_max = (2.0, 2.0)
diff --git a/test/tree_dgsem_3d/mhd_shock.jl b/test/tree_dgsem_3d/mhd_shock.jl
index 0e14cd3b..ffcf7ea4 100644
--- a/test/tree_dgsem_3d/mhd_shock.jl
+++ b/test/tree_dgsem_3d/mhd_shock.jl
@@ -12,9 +12,10 @@ include("../test_macros.jl")
    surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
    volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)

-    polydeg = 4
-    basis = LobattoLegendreBasis(polydeg)
-    basis_gpu = LobattoLegendreBasisGPU(polydeg)
+
+    basis = LobattoLegendreBasis(4)
+    basis_gpu = LobattoLegendreBasisGPU(4)
+
    indicator_sc = IndicatorHennemannGassner(equations, basis,
                                             alpha_max = 0.5,
                                             alpha_min = 0.001,
@@ -24,10 +25,8 @@ include("../test_macros.jl")
                                                   volume_flux_dg = volume_flux,
                                                   volume_flux_fv = surface_flux)

-    solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux,
-                   volume_integral = volume_integral)
-    solver_gpu = DGSEMGPU(polydeg = polydeg, surface_flux = surface_flux,
-                          volume_integral = volume_integral)
+    solver = DGSEM(basis, surface_flux, volume_integral)
+    solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)

    coordinates_min = (-2.0, -2.0, -2.0)
    coordinates_max = (2.0, 2.0, 2.0)
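The fused kernels above select between the pure DG update and the alpha-blended
DG-FV update without branching on the element index. The sketch below shows this
masking pattern on plain CPU arrays; `blended_volume_integral!`, its arguments,
and the `atol` default are hypothetical stand-ins for illustration, not functions
from this repository:

    # Hypothetical sketch: branchless blending of a pure DG update and an
    # alpha-weighted DG/FV update, mirroring the `dg_only`/`1 - dg_only`
    # masking in the fused kernels. Arrays are (node index) x (element index).
    function blended_volume_integral!(du, du_dg, du_fv, alpha; atol = 1.0f-12)
        for k in axes(du, 2)
            # Bool acts as 0/1 in arithmetic, so no `if` over elements is needed
            dg_only = isapprox(alpha[k], 0, atol = atol)
            for i in axes(du, 1)
                du[i, k] = du_dg[i, k] * dg_only +
                           ((1 - alpha[k]) * du_dg[i, k] + alpha[k] * du_fv[i, k]) *
                           (1 - dg_only)
            end
        end
        return nothing
    end

    du = zeros(Float32, 4, 3)
    du_dg = rand(Float32, 4, 3)
    du_fv = rand(Float32, 4, 3)
    alpha = Float32[0.0, 0.3, 1.0] # element 1 is pure DG, elements 2 and 3 blend
    blended_volume_integral!(du, du_dg, du_fv, alpha)

Because every thread executes the same instructions regardless of its element's
blending factor, this trades a few redundant multiplications for the removal of
the per-element index lists and their divergent branches.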