diff --git a/benchmark/advection_basic_1d.jl b/benchmark/advection_basic_1d.jl
index aa4c0378..3b84208f 100644
--- a/benchmark/advection_basic_1d.jl
+++ b/benchmark/advection_basic_1d.jl
@@ -10,6 +10,7 @@ advection_velocity = 1.0f0
 equations = LinearScalarAdvectionEquation1D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = -1.0f0
 coordinates_max = 1.0f0
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/advection_basic_2d.jl b/benchmark/advection_basic_2d.jl
index 0b1f9e7e..9de384a9 100644
--- a/benchmark/advection_basic_2d.jl
+++ b/benchmark/advection_basic_2d.jl
@@ -10,6 +10,7 @@ advection_velocity = (0.2f0, -0.7f0)
 equations = LinearScalarAdvectionEquation2D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = (-1.0f0, -1.0f0)
 coordinates_max = (1.0f0, 1.0f0)
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/advection_basic_3d.jl b/benchmark/advection_basic_3d.jl
index d276118e..ed4dfc16 100644
--- a/benchmark/advection_basic_3d.jl
+++ b/benchmark/advection_basic_3d.jl
@@ -10,6 +10,7 @@ advection_velocity = (0.2f0, -0.7f0, 0.5f0)
 equations = LinearScalarAdvectionEquation3D(advection_velocity)
 
 solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
 
 coordinates_min = (-1.0f0, -1.0f0, -1.0f0)
 coordinates_max = (1.0f0, 1.0f0, 1.0f0)
@@ -24,7 +25,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
                                           solver)
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations,
                                                       initial_condition_convergence_test,
-                                                      solver)
+                                                      solver_gpu)
 
 tspan_gpu = (0.0f0, 1.0f0)
 t_gpu = 0.0f0
diff --git a/benchmark/euler_ec_1d.jl b/benchmark/euler_ec_1d.jl
index 35bdfb80..6606a89a 100644
--- a/benchmark/euler_ec_1d.jl
+++ b/benchmark/euler_ec_1d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0,)
 coordinates_max = (2.0f0,)
@@ -25,7 +28,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan_gpu = (0.0f0, 0.4f0)
 t_gpu = 0.0f0
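A minimal, self-contained sketch of the CPU/GPU pairing these scripts now follow (assuming the TrixiCUDA exports used in this diff, DGSEMGPU and SemidiscretizationHyperbolicGPU; mesh values are illustrative):

using Trixi, TrixiCUDA

RealT = Float32
equations = LinearScalarAdvectionEquation1D(1.0f0)

# CPU solver and its GPU counterpart, built with identical parameters
solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)
solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_lax_friedrichs, RealT = RealT)

mesh = TreeMesh(-1.0f0, 1.0f0, initial_refinement_level = 4,
                n_cells_max = 30_000, RealT = RealT)

# Each solver feeds the matching semidiscretization constructor
semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, solver)
semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition_convergence_test, solver_gpu)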
diff --git a/benchmark/euler_ec_2d.jl b/benchmark/euler_ec_2d.jl
index 95f7b245..a8123c89 100644
--- a/benchmark/euler_ec_2d.jl
+++ b/benchmark/euler_ec_2d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0, -2.0f0)
 coordinates_max = (2.0f0, 2.0f0)
@@ -27,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
                                           boundary_conditions = boundary_condition_periodic)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver,
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu,
                                                       boundary_conditions = boundary_condition_periodic)
 
 tspan_gpu = (0.0f0, 0.4f0)
diff --git a/benchmark/euler_ec_3d.jl b/benchmark/euler_ec_3d.jl
index 17ba65a2..f315a29e 100644
--- a/benchmark/euler_ec_3d.jl
+++ b/benchmark/euler_ec_3d.jl
@@ -14,6 +14,9 @@ volume_flux = flux_ranocha
 solver = DGSEM(polydeg = 2, surface_flux = flux_ranocha,
                volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
                RealT = RealT)
+solver_gpu = DGSEMGPU(polydeg = 2, surface_flux = flux_ranocha,
+                      volume_integral = VolumeIntegralFluxDifferencing(volume_flux),
+                      RealT = RealT)
 
 coordinates_min = (-2.0f0, -2.0f0, -2.0f0)
 coordinates_max = (2.0f0, 2.0f0, 2.0f0)
@@ -25,7 +28,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan_gpu = (0.0f0, 0.4f0)
 t_gpu = 0.0f0
diff --git a/benchmark/euler_shock_1d.jl b/benchmark/euler_shock_1d.jl
index ad1a4e72..cdc2dbbb 100644
--- a/benchmark/euler_shock_1d.jl
+++ b/benchmark/euler_shock_1d.jl
@@ -12,7 +12,10 @@ initial_condition = initial_condition_weak_blast_wave
 
 surface_flux = flux_lax_friedrichs
 volume_flux = flux_shima_etal
+
 basis = LobattoLegendreBasis(RealT, 3)
+basis_gpu = LobattoLegendreBasisGPU(3, RealT)
+
 indicator_sc = IndicatorHennemannGassner(equations, basis,
                                          alpha_max = 0.5f0,
                                          alpha_min = 0.001f0,
@@ -21,7 +24,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis,
 volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
                                                  volume_flux_dg = volume_flux,
                                                  volume_flux_fv = surface_flux)
+
 solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
 
 coordinates_min = -2.0f0
 coordinates_max = 2.0f0
@@ -33,7 +38,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on CPU"
 @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
 @info "Time for cache initialization on GPU"
-CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
 
 tspan = tspan_gpu = (0.0f0, 0.4f0)
 t = t_gpu = 0.0f0
@@ -63,74 +68,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach
 
 # Reset du and volume integral
 @info "Time for reset_du! and volume_integral! on GPU"
-CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
-                                           Trixi.have_nonconservative_terms(equations_gpu),
-                                           equations_gpu, solver_gpu.volume_integral, solver_gpu,
-                                           cache_gpu)
-@info "Time for reset_du! and volume_integral! on CPU"
-@time begin
-    Trixi.reset_du!(du, solver, cache)
-    Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
-                                equations, solver.volume_integral, solver, cache)
-end
-
-# Prolong to interfaces
-@info "Time for prolong2interfaces! on GPU"
-CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
-@info "Time for prolong2interfaces! on CPU"
-@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
-
-# Interface flux
-@info "Time for interface_flux! on GPU"
-CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu,
-                                          Trixi.have_nonconservative_terms(equations_gpu),
-                                          equations_gpu, solver_gpu, cache_gpu)
-@info "Time for interface_flux! on CPU"
-@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
-                                 Trixi.have_nonconservative_terms(equations), equations,
-                                 solver.surface_integral, solver, cache)
-
-# Prolong to boundaries
-@info "Time for prolong2boundaries! on GPU"
-CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
-                                              equations_gpu, cache_gpu)
-@info "Time for prolong2boundaries! on CPU"
-@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
-
-# Boundary flux
-@info "Time for boundary_flux! on GPU"
-CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
-                                         Trixi.have_nonconservative_terms(equations_gpu),
-                                         equations_gpu, solver_gpu, cache_gpu)
-@info "Time for boundary_flux! on CPU"
-@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
-                                solver.surface_integral, solver)
-
-# Surface integral
-@info "Time for surface_integral! on GPU"
-CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
-@info "Time for surface_integral! on CPU"
-@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral,
-                                   solver, cache)
-
-# Jacobian
-@info "Time for jacobian! on GPU"
-CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu)
-@info "Time for jacobian! on CPU"
-@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache)
-
-# Sources terms
-@info "Time for sources! on GPU"
-CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu,
-                                   equations_gpu, cache_gpu)
-@info "Time for sources! on CPU"
-@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache)
-
-# Semidiscretization process
-@info "Time for rhs! on GPU"
-CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu,
-                              boundary_conditions_gpu, source_terms_gpu,
-                              solver_gpu, cache_gpu)
-@info "Time for rhs! on CPU"
-@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms,
-                 solver, cache)
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
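The switch from `CUDA.@time` to `@benchmark CUDA.@sync` above matters because CUDA kernel launches are asynchronous: timing the bare call mostly measures launch overhead, while `CUDA.@sync` blocks until the device finishes, and BenchmarkTools then collects proper statistics instead of the single noisy sample `CUDA.@time` gives. A small stand-alone illustration (array and operation are placeholders):

using BenchmarkTools, CUDA

x = CUDA.rand(Float32, 10^6)

@benchmark $x .* 2              # mostly times the asynchronous launch
@benchmark CUDA.@sync $x .* 2   # times the actual GPU work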
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/euler_shock_2d.jl b/benchmark/euler_shock_2d.jl index c05f4a0d..ff5fc51c 100644 --- a/benchmark/euler_shock_2d.jl +++ b/benchmark/euler_shock_2d.jl @@ -12,7 +12,10 @@ initial_condition = initial_condition_weak_blast_wave surface_flux = flux_lax_friedrichs volume_flux = flux_shima_etal + basis = LobattoLegendreBasis(RealT, 3) +basis_gpu = LobattoLegendreBasisGPU(3, RealT) + indicator_sc = IndicatorHennemannGassner(equations, basis, alpha_max = 0.5f0, alpha_min = 0.001f0, @@ -21,7 +24,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis, volume_integral = VolumeIntegralShockCapturingHG(indicator_sc; volume_flux_dg = volume_flux, volume_flux_fv = surface_flux) + solver = DGSEM(basis, surface_flux, volume_integral) +solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral) coordinates_min = (-2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0) @@ -33,7 +38,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan = tspan_gpu = (0.0f0, 1.0f0) t = t_gpu = 0.0f0 @@ -63,93 +68,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach # Reset du and volume integral @info "Time for reset_du! and volume_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -@info "Time for reset_du! and volume_integral! on CPU" -@time begin - Trixi.reset_du!(du, solver, cache) - Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -end - -# Prolong to interfaces -@info "Time for prolong2interfaces! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for prolong2interfaces! on CPU" -@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Interface flux -@info "Time for interface_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for interface_flux! on CPU" -@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) - -# Prolong to boundaries -@info "Time for prolong2boundaries! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -@info "Time for prolong2boundaries! on CPU" -@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Boundary flux -@info "Time for boundary_flux! 
on GPU" -CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for boundary_flux! on CPU" -@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) - -# Prolong to mortars -@info "Time for prolong2mortars! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) -@info "Time for prolong2mortars! on CPU" -@time Trixi.prolong2mortars!(cache, u, mesh, equations, - solver.mortar, solver.surface_integral, solver) - -# Mortar flux -@info "Time for mortar_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for mortar_flux! on CPU" -@time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.mortar, solver.surface_integral, solver, cache) - -# Surface integral -@info "Time for surface_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -@info "Time for surface_integral! on CPU" -@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, - solver, cache) - -# Jacobian -@info "Time for jacobian! on GPU" -CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for jacobian! on CPU" -@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) - -# Sources terms -@info "Time for sources! on GPU" -CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, - equations_gpu, cache_gpu) -@info "Time for sources! on CPU" -@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) - -# Semidiscretization process -@info "Time for rhs! on GPU" -CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, - boundary_conditions_gpu, source_terms_gpu, - solver_gpu, cache_gpu) -@info "Time for rhs! 
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/euler_shock_3d.jl b/benchmark/euler_shock_3d.jl index d23ffb71..5591653d 100644 --- a/benchmark/euler_shock_3d.jl +++ b/benchmark/euler_shock_3d.jl @@ -15,6 +15,8 @@ volume_flux = flux_ranocha polydeg = 3 basis = LobattoLegendreBasis(RealT, polydeg) +basis_gpu = LobattoLegendreBasisGPU(polydeg, RealT) + indicator_sc = IndicatorHennemannGassner(equations, basis, alpha_max = 0.5f0, alpha_min = 0.001f0, @@ -23,7 +25,9 @@ indicator_sc = IndicatorHennemannGassner(equations, basis, volume_integral = VolumeIntegralShockCapturingHG(indicator_sc; volume_flux_dg = volume_flux, volume_flux_fv = surface_flux) + solver = DGSEM(basis, surface_flux, volume_integral) +solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral) coordinates_min = (-2.0f0, -2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0, 2.0f0) @@ -36,7 +40,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, source_terms = source_terms_convergence_test) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu, source_terms = source_terms_convergence_test) tspan = tspan_gpu = (0.0f0, 0.4f0) @@ -67,93 +71,7 @@ du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cach # Reset du and volume integral @info "Time for reset_du! and volume_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -@info "Time for reset_du! and volume_integral! on CPU" -@time begin - Trixi.reset_du!(du, solver, cache) - Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -end - -# Prolong to interfaces -@info "Time for prolong2interfaces! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for prolong2interfaces! on CPU" -@time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Interface flux -@info "Time for interface_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for interface_flux! on CPU" -@time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) - -# Prolong to boundaries -@info "Time for prolong2boundaries! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -@info "Time for prolong2boundaries! on CPU" -@time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) - -# Boundary flux -@info "Time for boundary_flux! 
on GPU" -CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for boundary_flux! on CPU" -@time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) - -# Prolong to mortars -@info "Time for prolong2mortars! on GPU" -CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) -@info "Time for prolong2mortars! on CPU" -@time Trixi.prolong2mortars!(cache, u, mesh, equations, - solver.mortar, solver.surface_integral, solver) - -# Mortar flux -@info "Time for mortar_flux! on GPU" -CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -@info "Time for mortar_flux! on CPU" -@time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.mortar, solver.surface_integral, solver, cache) - -# Surface integral -@info "Time for surface_integral! on GPU" -CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -@info "Time for surface_integral! on CPU" -@time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, - solver, cache) - -# Jacobian -@info "Time for jacobian! on GPU" -CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -@info "Time for jacobian! on CPU" -@time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) - -# Sources terms -@info "Time for sources! on GPU" -CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, - equations_gpu, cache_gpu) -@info "Time for sources! on CPU" -@time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) - -# Semidiscretization process -@info "Time for rhs! on GPU" -CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, - boundary_conditions_gpu, source_terms_gpu, - solver_gpu, cache_gpu) -@info "Time for rhs! 
on CPU" -@time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, - solver, cache) +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) diff --git a/benchmark/mhd_ec_1d.jl b/benchmark/mhd_ec_1d.jl index d9eb3411..6ef7c90d 100644 --- a/benchmark/mhd_ec_1d.jl +++ b/benchmark/mhd_ec_1d.jl @@ -15,6 +15,9 @@ volume_flux = flux_hindenlang_gassner solver = DGSEM(polydeg = 3, surface_flux = flux_hindenlang_gassner, volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, surface_flux = flux_hindenlang_gassner, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = 0.0f0 coordinates_max = 1.0f0 @@ -26,7 +29,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 diff --git a/benchmark/mhd_ec_2d.jl b/benchmark/mhd_ec_2d.jl index 7d2ea8bd..6952432d 100644 --- a/benchmark/mhd_ec_2d.jl +++ b/benchmark/mhd_ec_2d.jl @@ -15,6 +15,10 @@ solver = DGSEM(polydeg = 3, surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, + surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = (-2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0) @@ -26,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 diff --git a/benchmark/mhd_ec_3d.jl b/benchmark/mhd_ec_3d.jl index 68195a34..72eb66a4 100644 --- a/benchmark/mhd_ec_3d.jl +++ b/benchmark/mhd_ec_3d.jl @@ -15,6 +15,10 @@ solver = DGSEM(polydeg = 3, surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), volume_integral = VolumeIntegralFluxDifferencing(volume_flux), RealT = RealT) +solver_gpu = DGSEMGPU(polydeg = 3, + surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) coordinates_min = (-2.0f0, -2.0f0, -2.0f0) coordinates_max = (2.0f0, 2.0f0, 2.0f0) @@ -26,7 +30,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, @info "Time for cache initialization on CPU" @time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) @info "Time for cache initialization on GPU" -CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu) tspan_gpu = (0.0f0, 0.4f0) t_gpu = 0.0f0 
diff --git a/benchmark/mhd_shock_2d.jl b/benchmark/mhd_shock_2d.jl
new file mode 100644
index 00000000..b0079a30
--- /dev/null
+++ b/benchmark/mhd_shock_2d.jl
@@ -0,0 +1,74 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = IdealGlmMhdEquations2D(1.4f0)
+
+initial_condition = initial_condition_weak_blast_wave
+
+surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = (-2.0f0, -2.0f0)
+coordinates_max = (2.0f0, 2.0f0)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000, RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
+
+tspan = tspan_gpu = (0.0f0, 1.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/benchmark/mhd_shock_3d.jl b/benchmark/mhd_shock_3d.jl
new file mode 100644
index 00000000..862d1454
--- /dev/null
+++ b/benchmark/mhd_shock_3d.jl
@@ -0,0 +1,74 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = IdealGlmMhdEquations3D(1.4f0)
+
+initial_condition = initial_condition_weak_blast_wave
+
+surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = (-2.0f0, -2.0f0, -2.0f0)
+coordinates_max = (2.0f0, 2.0f0, 2.0f0)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000, RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu)
+
+tspan = tspan_gpu = (0.0f0, 1.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/benchmark/shallowwater_shock_1d.jl b/benchmark/shallowwater_shock_1d.jl
new file mode 100644
index 00000000..9da3c899
--- /dev/null
+++ b/benchmark/shallowwater_shock_1d.jl
@@ -0,0 +1,106 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+# Set the precision
+RealT = Float32
+
+# Set up the problem
+equations = ShallowWaterEquations1D(gravity_constant = 9.812f0, H0 = 1.75f0)
+
+function initial_condition_stone_throw_discontinuous_bottom(x, t,
+                                                            equations::ShallowWaterEquations1D)
+    # Flat lake
+    RealT = eltype(x)
+    H = equations.H0
+
+    # Discontinuous velocity
+    v = zero(RealT)
+    if x[1] >= -0.75f0 && x[1] <= 0
+        v = -one(RealT)
+    elseif x[1] >= 0 && x[1] <= 0.75f0
+        v = one(RealT)
+    end
+
+    b = (1.5f0 / exp(0.5f0 * ((x[1] - 1)^2)) +
+         0.75f0 / exp(0.5f0 * ((x[1] + 1)^2)))
+
+    # Force a discontinuous bottom topography
+    if x[1] >= -1.5f0 && x[1] <= 0
+        b = RealT(0.5f0)
+    end
+
+    return prim2cons(SVector(H, v, b), equations)
+end
+
+initial_condition = initial_condition_stone_throw_discontinuous_bottom
+
+boundary_condition = boundary_condition_slip_wall
+
+volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+surface_flux = (FluxHydrostaticReconstruction(flux_lax_friedrichs,
+                                              hydrostatic_reconstruction_audusse_etal),
+                flux_nonconservative_audusse_etal)
+basis = LobattoLegendreBasis(RealT, 4)
+basis_gpu = LobattoLegendreBasisGPU(4, RealT)
+
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = waterheight_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+
+solver = DGSEM(basis, surface_flux, volume_integral)
+solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)
+
+coordinates_min = -3.0f0
+coordinates_max = 3.0f0
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 3,
+                n_cells_max = 10_000,
+                periodicity = false,
+                RealT = RealT)
+
+# Cache initialization
+@info "Time for cache initialization on CPU"
+@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                          boundary_conditions = boundary_condition)
+@info "Time for cache initialization on GPU"
+CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver_gpu,
+                                                      boundary_conditions = boundary_condition)
+
+tspan = tspan_gpu = (0.0f0, 3.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu_ = copy(ode_gpu.u0)
+du_gpu_ = similar(u_gpu_)
+u_gpu = TrixiCUDA.wrap_array(u_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+du_gpu = TrixiCUDA.wrap_array(du_gpu_, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+
+# Reset du and volume integral
+@info "Time for reset_du! and volume_integral! on GPU"
+@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                                      Trixi.have_nonconservative_terms(equations_gpu),
+                                                      equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                                      cache_gpu)
diff --git a/src/solvers/common.jl b/src/solvers/common.jl
index a7205cb8..9cd3c992 100644
--- a/src/solvers/common.jl
+++ b/src/solvers/common.jl
@@ -24,21 +24,3 @@ function last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction)
 
     return nothing
 end
-
-# Kernel for counting elements for DG-only and blended DG-FV volume integral
-# Maybe it is better to be moved to CPU
-function pure_blended_element_count_kernel!(element_ids_dg, element_ids_dgfv, alpha, atol)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-    if (i <= length(alpha))
-        @inbounds dg_only = isapprox(alpha[i], 0, atol = atol)
-
-        if dg_only # bad
-            @inbounds element_ids_dg[i] = i
-        else
-            @inbounds element_ids_dgfv[i] = i
-        end
-    end
-
-    return nothing
-end
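The kernel removed above kept explicit element-ID lists and made every downstream kernel branch on `element_dg != 0` / `element_dgfv != 0`. The dg_1d.jl and dg_2d.jl changes below drop the lists and instead blend both code paths arithmetically with a per-element flag, so threads in a warp execute the same instructions. The idea in isolation (scalar names are hypothetical):

alpha_k = 0.3f0   # hypothetical blending coefficient of element k
atol = 1.0f-5     # hypothetical tolerance
dg_term = 1.0f0   # hypothetical DG volume-integral contribution

dg_only = isapprox(alpha_k, 0, atol = atol)         # Bool: promotes to 0/1 in arithmetic
du_update = dg_term * dg_only +                     # pure-DG element: full DG term
            (1 - alpha_k) * dg_term * (1 - dg_only) # blended element: scaled DG term
# The FV part is then added with weight alpha_k and the same (1 - dg_only) mask.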
diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index 4fd172e3..99c2fb9c 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -39,7 +39,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr)
 end
 
 ############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals in weak form
 function flux_weak_form_kernel!(du, u, derivative_dhat,
                                 equations::AbstractEquations{1}, flux::Any)
     # Set tile width
@@ -129,7 +129,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr,
 end
 
 ############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
 function volume_flux_integral_kernel!(du, u, derivative_split,
                                       equations::AbstractEquations{1}, volume_flux::Any)
     # Set tile width
@@ -237,8 +237,7 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, nonco
 end
 
 ############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
 function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                               equations::AbstractEquations{1},
                                               symmetric_flux::Any, nonconservative_flux::Any)
@@ -312,21 +311,16 @@
 end
 
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
-                                  element_ids_dgfv, equations::AbstractEquations{1},
+                                  alpha, atol, equations::AbstractEquations{1},
                                   volume_flux_dg::Any, volume_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^2 && k <= size(u, 3))
-        # length(element_ids_dgfv) == size(u, 3)
-
         j1 = div(j - 1, size(u, 2)) + 1
         j2 = rem(j - 1, size(u, 2)) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)
 
         u_node = get_node_vars(u, equations, j1, k)
         u_node1 = get_node_vars(u, equations, j2, k)
@@ -335,58 +329,62 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
 
         for ii in axes(u, 1)
             @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii]
-        end
 
-        if j1 != 1 && j2 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, element_dgfv)
-            flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j2) # avoid race condition
+                flux_fv_node = volume_flux_fv(u_node, u_node1, 1, equations)
 
-            for ii in axes(u, 1)
                 @inbounds begin
-                    fstar1_L[ii, j1, element_dgfv] = flux_fv_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = flux_fv_node[ii]
+                    fstar1_L[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only)
                 end
             end
         end
+
+        # if j1 != 1 && j2 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, k)
+        #     flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
     end
 
     return nothing
 end
 
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr, equations::AbstractEquations{1})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr, fstar1_L, fstar1_R, atol,
+                                      equations::AbstractEquations{1})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dg) == size(du, 3)
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
-
         @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
             alpha_element = alpha[k]
         end
 
-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds du[i, j, element_dg] += derivative_split[j, ii] *
-                                                  (1 - isequal(j, ii)) * # set diagonal elements to zeros
-                                                  volume_flux_arr[i, j, ii, element_dg]
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
 
-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds du[i, j, element_dgfv] += (1 - alpha_element) * derivative_split[j, ii] *
-                                                    (1 - isequal(j, ii)) * # set diagonal elements to zeros
-                                                    volume_flux_arr[i, j, ii, element_dgfv]
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j, k] += derivative_split[j, ii] *
+                                     (1 - isequal(j, ii)) * # set diagonal elements to zeros
+                                     volume_flux_arr[i, j, ii, k] * dg_only +
+                                     (1 - alpha_element) * derivative_split[j, ii] *
+                                     (1 - isequal(j, ii)) * # set diagonal elements to zeros
+                                     volume_flux_arr[i, j, ii, k] * (1 - dg_only)
         end
+
+        @inbounds du[i, j, k] += alpha_element * inverse_weights[j] *
+                                 (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only)
     end
 
     return nothing
 end
@@ -394,29 +392,24 @@
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
-                                  element_ids_dgfv, derivative_split,
+                                  alpha, atol, derivative_split,
                                   equations::AbstractEquations{1},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^2 && k <= size(u, 3))
-        # length(element_ids_dgfv) == size(u, 3)
-
         j1 = div(j - 1, size(u, 2)) + 1
         j2 = rem(j - 1, size(u, 2)) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)
 
         u_node = get_node_vars(u, equations, j1, k)
         u_node1 = get_node_vars(u, equations, j2, k)
 
         volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node = nonconservative_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node = noncons_flux_dg(u_node, u_node1, 1, equations)
 
         for ii in axes(u, 1)
             @inbounds begin
@@ -424,106 +417,75 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, f
                 (1 - isequal(j1, j2)) # set diagonal elements to zeros
                 noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii]
             end
-        end
 
-        if j1 != 1 && j2 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, element_dgfv)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j2) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)
 
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
-
-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
-
-            for ii in axes(u, 1)
                 @inbounds begin
-                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only)
+                    fstar1_R[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only)
                 end
             end
         end
+
+        # if j1 != 1 && j2 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, k)
+
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only)
+        #             fstar1_R[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only)
+        #         end
+        #     end
+        # end
     end
 
     return nothing
 end
 
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr, noncons_flux_arr,
-                                    equations::AbstractEquations{1})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R,
+                                      atol, equations::AbstractEquations{1})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dg) == size(du, 3)
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
-
         @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end
 
-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        dg_only = isapprox(alpha_element, 0, atol = atol)
 
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j, element_dg] += volume_flux_arr[i, j, ii, element_dg]
-                    integral_contribution += derivative_split[j, ii] *
-                                             noncons_flux_arr[i, j, ii, element_dg]
-                end
-            end
-
-            @inbounds du[i, j, element_dg] += 0.5f0 * integral_contribution
+        for ii in axes(du, 2)
+            @inbounds du[i, j, k] += (volume_flux_arr[i, j, ii, k] +
+                                      0.5f0 *
+                                      derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * dg_only +
+                                     ((1 - alpha_element) * volume_flux_arr[i, j, ii, k] +
+                                      0.5f0 * (1 - alpha_element) *
+                                      derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * (1 - dg_only)
        end
 
-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
-
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j, element_dgfv] += (1 - alpha_element) *
-                                              volume_flux_arr[i, j, ii, element_dgfv]
-                    integral_contribution += derivative_split[j, ii] *
-                                             noncons_flux_arr[i, j, ii, element_dgfv]
-                end
-            end
-
-            @inbounds du[i, j, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        @inbounds du[i, j, k] += alpha_element * inverse_weights[j] *
+                                 (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only)
    end
 
    return nothing
 end
 
-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv,
-                                    alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-
-    if (j <= size(du, 2) && k <= size(du, 3))
-        # length(element_ids_dgfv) == size(du, 3)
-
-        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
-
-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j, element_dgfv] += alpha_element * inverse_weights[j] *
-                                                     (fstar1_L[ii, j + 1, element_dgfv] -
-                                                      fstar1_R[ii, j, element_dgfv])
-            end
-        end
-    end
-
-    return nothing
-end
+############################################################################## New optimization
+# Kernel for calculating pure DG and DG-FV volume integrals without nonconservative terms
 
 # Kernel for prolonging two interfaces
 function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids)
@@ -913,60 +875,39 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                      dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
 
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
     alpha = CuArray(alpha)
+
     atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights
     volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-
     volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr,
                                                                           fstar1_L, fstar1_R, u,
-                                                                          element_ids_dgfv,
+                                                                          alpha, atol,
                                                                           equations, volume_flux_dg,
                                                                           volume_flux_fv)
-    volume_flux_dgfv_kernel(volume_flux_arr, fstar1_L, fstar1_R, u, element_ids_dgfv, equations,
+    volume_flux_dgfv_kernel(volume_flux_arr, fstar1_L, fstar1_R, u, alpha, atol, equations,
                             volume_flux_dg, volume_flux_fv;
                             kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2,
                                                    size(u, 3))...)
 
-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                              element_ids_dgfv,
-                                                                              alpha,
-                                                                              derivative_split,
-                                                                              volume_flux_arr,
-                                                                              equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                              fstar1_R,
-                                                                              inverse_weights,
-                                                                              element_ids_dgfv,
-                                                                              alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2),
-                                                     size(u, 3))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr,
+                                fstar1_L, fstar1_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du)...)...)
 
     return nothing
 end
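For scale, the tolerance used above for the `dg_only` test evaluates to roughly 1.2e-5 in Float32 and 1.8e-12 in Float64 (values approximate):

atol32 = max(100 * eps(Float32), eps(Float32)^convert(Float32, 0.75f0)) # ≈ 1.2f-5
atol64 = max(100 * eps(Float64), eps(Float64)^convert(Float64, 0.75))   # ≈ 1.8e-12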
@@ -976,71 +917,50 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
     RealT = eltype(du)
 
-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
 
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
     alpha = CuArray(alpha)
+
     atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights
     volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
     noncons_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-
     volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr,
                                                                           noncons_flux_arr,
                                                                           fstar1_L, fstar1_R, u,
-                                                                          element_ids_dgfv,
+                                                                          alpha, atol,
                                                                           derivative_split,
                                                                           equations, volume_flux_dg,
-                                                                          nonconservative_flux_dg,
+                                                                          noncons_flux_dg,
                                                                           volume_flux_fv,
-                                                                          nonconservative_flux_fv)
+                                                                          noncons_flux_fv)
     volume_flux_dgfv_kernel(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
-                            element_ids_dgfv, derivative_split, equations, volume_flux_dg,
-                            nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv;
+                            alpha, atol, derivative_split, equations, volume_flux_dg,
+                            noncons_flux_dg, volume_flux_fv, noncons_flux_fv;
                             kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2,
                                                    size(u, 3))...)
 
-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                              element_ids_dgfv,
-                                                                              alpha,
-                                                                              derivative_split,
-                                                                              volume_flux_arr,
-                                                                              noncons_flux_arr,
-                                                                              equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr, noncons_flux_arr, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                              fstar1_R,
-                                                                              inverse_weights,
-                                                                              element_ids_dgfv,
-                                                                              alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2),
-                                                     size(u, 3))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr,
+                                                                                  noncons_flux_arr,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr,
+                                noncons_flux_arr, fstar1_L, fstar1_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du)...)...)
 
     return nothing
 end
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index e01ac595..eb994cd2 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -52,7 +52,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
 end
 
 ############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals in weak form
 function flux_weak_form_kernel!(du, u, derivative_dhat,
                                 equations::AbstractEquations{2}, flux::Any)
     # Set tile width
@@ -162,7 +162,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
 end
 
 ############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
 function volume_flux_integral_kernel!(du, u, derivative_split,
                                       equations::AbstractEquations{2}, volume_flux::Any)
     # Set tile width
@@ -295,8 +295,7 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symm
 end
 
 ############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
 function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                               equations::AbstractEquations{2},
                                               symmetric_flux::Any, nonconservative_flux::Any)
@@ -378,24 +377,20 @@
 end
 
 # Kernel for calculating pure DG and DG-FV volume fluxes
 function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R,
-                                  fstar2_L, fstar2_R, u, element_ids_dgfv,
-                                  equations::AbstractEquations{2}, volume_flux_dg::Any,
-                                  volume_flux_fv::Any)
+                                  fstar2_L, fstar2_R, u, alpha, atol,
+                                  equations::AbstractEquations{2},
+                                  volume_flux_dg::Any, volume_flux_fv::Any)
     j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
 
     if (j <= size(u, 2)^3 && k <= size(u, 4))
-        # length(element_ids_dgfv) == size(u, 4)
-
         u2 = size(u, 2)
 
         j1 = div(j - 1, u2^2) + 1
         j2 = div(rem(j - 1, u2^2), u2) + 1
         j3 = rem(rem(j - 1, u2^2), u2) + 1
 
-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, k)
        u_node1 = get_node_vars(u, equations, j3, j2, k)
@@ -409,86 +404,97 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L,
                volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii]
                volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii]
            end
-        end

-        if j1 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-            flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j3) # avoid race condition
+                flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, element_dgfv] = flux_fv_node1[ii]
-                    fstar1_R[ii, j1, j2, element_dgfv] = flux_fv_node1[ii]
+                    fstar1_L[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+                    fstar1_R[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
                end
            end
-        end

-        if j2 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-            flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j3) # avoid race condition
+                flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, element_dgfv] = flux_fv_node2[ii]
-                    fstar2_R[ii, j1, j2, element_dgfv] = flux_fv_node2[ii]
+                    fstar2_L[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
                end
            end
        end
+
+        # if j1 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)
+        #     flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j2 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)
+        #     flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2,
-                                    equations::AbstractEquations{2})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                      equations::AbstractEquations{2})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dg) == size(du, 4)
-        # length(element_ids_dgfv) == size(du, 4)
-
        j1 = div(j - 1, size(du, 2)) + 1
        j2 = rem(j - 1, size(du, 2)) + 1

-        @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
-
        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dg] += derivative_split[j1, ii] *
-                                                 (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                 volume_flux_arr1[i, j1, ii, j2, element_dg]
-                    du[i, j1, j2, element_dg] += derivative_split[j2, ii] *
-                                                 (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                 volume_flux_arr2[i, j1, j2, ii, element_dg]
-                end
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) * derivative_split[j1, ii] *
-                                                   (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                   volume_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) * derivative_split[j2, ii] *
-                                                   (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                   volume_flux_arr2[i, j1, j2, ii, element_dgfv]
-                end
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, k] += (derivative_split[j1, ii] *
+                                           (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr1[i, j1, ii, j2, k] +
+                                           derivative_split[j2, ii] *
+                                           (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr2[i, j1, j2, ii, k]) * dg_only +
+                                          ((1 - alpha_element) * derivative_split[j1, ii] *
+                                           (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr1[i, j1, ii, j2, k] +
+                                           (1 - alpha_element) * derivative_split[j2, ii] *
+                                           (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                           volume_flux_arr2[i, j1, j2, ii, k]) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, k] += alpha_element *
+                                      (inverse_weights[j1] *
+                                       (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) +
+                                       inverse_weights[j2] *
+                                       (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only)
    end

    return nothing
@@ -497,25 +503,21 @@ end
# Kernel for calculating pure DG and DG-FV volume fluxes
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1,
                                  noncons_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                  u, element_ids_dgfv, derivative_split,
+                                  u, alpha, atol, derivative_split,
                                  equations::AbstractEquations{2},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^3 && k <= size(u, 4))
-        # length(element_ids_dgfv) == size(u, 4)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^2) + 1
        j2 = div(rem(j - 1, u2^2), u2) + 1
        j3 = rem(rem(j - 1, u2^2), u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
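+        # Same pure-DG mask as in the conservative kernel: the `dg_only`/`1 - dg_only`
+        # factors replace the former `element_ids_dg`/`element_ids_dgfv` branching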
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, k)
        u_node1 = get_node_vars(u, equations, j3, j2, k)
@@ -524,8 +526,8 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_fl
        volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations)
        volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations)

-        noncons_flux_node1 = nonconservative_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node2 = nonconservative_flux_dg(u_node, u_node2, 2, equations)
+        noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations)

        for ii in axes(u, 1)
            @inbounds begin
@@ -536,144 +538,118 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_fl
                noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii]
                noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii]
            end
-        end
-
-        if j1 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j3) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, j2, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j2 != 1 && j3 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, element_dgfv)
-
-            f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
-            f2_L_node = nonconservative_flux_fv(u_ll, u_rr, 2, equations)
-            f2_R_node = nonconservative_flux_fv(u_rr, u_ll, 2, equations)
+            if isequal(j2 + 1, j3) # avoid race condition
+                f2_node = volume_flux_fv(u_node, u_node2, 2, equations)
+                f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations)
+                f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, element_dgfv] = f2_node[ii] + 0.5f0 * f2_L_node[ii]
-                    fstar2_R[ii, j1, j2, element_dgfv] = f2_node[ii] + 0.5f0 * f2_R_node[ii]
+                    fstar2_L[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
                end
            end
        end
-    end
-
-    return nothing
-end
-
-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2,
-                                    noncons_flux_arr1, noncons_flux_arr2,
-                                    equations::AbstractEquations{2})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dg) == size(du, 4)
-        # length(element_ids_dgfv) == size(du, 4)
-
-        j1 = div(j - 1, size(du, 2)) + 1
-        j2 = rem(j - 1, size(du, 2)) + 1
-
-        @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
-        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
+        # if j1 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)

-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dg] += volume_flux_arr1[i, j1, ii, j2, element_dg]
-                    du[i, j1, j2, element_dg] += volume_flux_arr2[i, j1, j2, ii, element_dg]
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)

-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, element_dg]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, element_dg]
-                end
-            end
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)

-            @inbounds du[i, j1, j2, element_dg] += 0.5f0 * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
+        # if j2 != 1 && j3 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, k)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) *
-                                                   volume_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    du[i, j1, j2, element_dgfv] += (1 - alpha_element) *
-                                                   volume_flux_arr2[i, j1, j2, ii, element_dgfv]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, element_dgfv]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, element_dgfv]
-                end
-            end
+        #     f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations)

-            @inbounds du[i, j1, j2, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                    inverse_weights, element_ids_dgfv, alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2,
+                                      noncons_flux_arr1, noncons_flux_arr2,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                      equations::AbstractEquations{2})
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

-    if (j <= size(du, 2)^2 && k <= size(du, 4))
-        # length(element_ids_dgfv) == size(du, 4)
+    if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4))
        j1 = div(j - 1, size(du, 2)) + 1
        j2 = rem(j - 1, size(du, 2)) + 1

        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j1, j2, element_dgfv] += alpha_element * ((inverse_weights[j1] *
-                                                                            (fstar1_L[ii, j1 + 1, j2, element_dgfv] -
-                                                                             fstar1_R[ii, j1, j2, element_dgfv]) +
-                                                                            inverse_weights[j2] *
-                                                                            (fstar2_L[ii, j1, j2 + 1, element_dgfv] -
-                                                                             fstar2_R[ii, j1, j2, element_dgfv])))
-            end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
+
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, k] += (volume_flux_arr1[i, j1, ii, j2, k] +
+                                           volume_flux_arr2[i, j1, j2, ii, k] +
+                                           0.5f0 *
+                                           (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
+                                            derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * dg_only +
+                                          ((1 - alpha_element) * volume_flux_arr1[i, j1, ii, j2, k] +
+                                           (1 - alpha_element) * volume_flux_arr2[i, j1, j2, ii, k] +
+                                           0.5f0 * (1 - alpha_element) *
+                                           (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
+                                            derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, k] += alpha_element *
+                                      (inverse_weights[j1] *
+                                       (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) +
+                                       inverse_weights[j2] *
+                                       (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only)
    end

    return nothing
end

+############################################################################## New optimization
+# Kernel for calculating DG-FV volume integrals without nonconservative terms
+
# Kernel for prolonging two interfaces
function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations,
                                    equations::AbstractEquations{2})
@@ -1390,73 +1366,51 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
    volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                     dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           fstar1_L, fstar1_R,
                                                                           fstar2_L, fstar2_R,
-                                                                           u, element_ids_dgfv,
+                                                                           u, alpha, atol,
                                                                           equations,
                                                                           volume_flux_dg,
                                                                           volume_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, fstar2_L,
-                            fstar2_R, u, element_ids_dgfv, equations, volume_flux_dg,
-                            volume_flux_fv;
+                            fstar2_R, u, alpha, atol, equations, volume_flux_dg, volume_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^3,
                                                   size(u, 4))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^2, size(du, 4))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights,
-                              element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2,
-                                                     size(u, 4))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol,
+                                equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^2, size(du, 4))...)
    return nothing
end
@@ -1466,30 +1420,23 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
    RealT = eltype(du)

-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 4))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -1499,56 +1446,41 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
    noncons_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                       size(u, 4))
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights,
-                              element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2,
-                                                     size(u, 4))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  noncons_flux_arr1,
+                                                                                  noncons_flux_arr2,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, noncons_flux_arr1, noncons_flux_arr2, fstar1_L,
+                                fstar1_R, fstar2_L, fstar2_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^2, size(du, 4))...)

    return nothing
end
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index 3055a878..f8156318 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -62,7 +62,7 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3)
end

############################################################################## New optimization
-# Kernel for calculating fluxes and weak form
+# Kernel for calculating volume integrals with weak form
function flux_weak_form_kernel!(du, u, derivative_dhat,
                                equations::AbstractEquations{3}, flux::Any)
    # Set tile width
@@ -185,7 +185,7 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
end

############################################################################## New optimization
-# Kernel for calculating volume fluxes and volume integrals
+# Kernel for calculating volume integrals without nonconservative terms
function volume_flux_integral_kernel!(du, u, derivative_split,
                                      equations::AbstractEquations{3}, volume_flux::Any)
    # Set tile width
@@ -337,8 +337,7 @@ function volume_integral_kernel!(du, derivative_split,
end

############################################################################## New optimization
-# Kernel for calculating symmetric and nonconservative volume fluxes and
-# corresponding volume integrals
+# Kernel for calculating volume integrals with nonconservative terms
function noncons_volume_flux_integral_kernel!(du, u, derivative_split,
                                              equations::AbstractEquations{3},
                                              symmetric_flux::Any, nonconservative_flux::Any)
@@ -431,13 +430,12 @@ end
# Kernel for calculating pure DG and DG-FV volume fluxes
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                                  fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                                  u, element_ids_dgfv, equations::AbstractEquations{3},
+                                  u, alpha, atol, equations::AbstractEquations{3},
                                  volume_flux_dg::Any, volume_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^4 && k <= size(u, 5))
-        # length(element_ids_dgfv) == size(u, 5)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^3) + 1
@@ -445,10 +443,7 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        j3 = div(rem(j - 1, u2^2), u2) + 1
        j4 = rem(j - 1, u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
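+        # As in the 2D kernels, a vanishing blending factor marks this element as
+        # pure DG; the mask replaces the former `element_ids_dgfv` lookup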
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, j3, k)
        u_node1 = get_node_vars(u, equations, j4, j2, j3, k)
@@ -465,106 +460,130 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
                volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii]
                volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii]
            end
-        end

-        if j1 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j4) # avoid race condition
+                flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node1[ii]
-                    fstar1_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node1[ii]
+                    fstar1_L[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+                    fstar1_R[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
                end
            end
-        end

-        if j2 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j4) # avoid race condition
+                flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node2[ii]
-                    fstar2_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node2[ii]
+                    fstar2_L[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
                end
            end
-        end

-        if j3 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations)
+            if isequal(j3 + 1, j4) # avoid race condition
+                flux_fv_node3 = volume_flux_fv(u_node, u_node3, 3, equations)

-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar3_L[ii, j1, j2, j3, element_dgfv] = flux_fv_node3[ii]
-                    fstar3_R[ii, j1, j2, j3, element_dgfv] = flux_fv_node3[ii]
+                    fstar3_L[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only)
+                    fstar3_R[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only)
                end
            end
        end
+
+        # if j1 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j2 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
+
+        # if j3 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)
+        #     flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations)
+
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar3_L[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only)
+        #             fstar3_R[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                                    equations::AbstractEquations{3})
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                                      atol, equations::AbstractEquations{3})
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dg) == size(u, 5)
        u2 = size(du, 2) # size(du, 2) == size(u, 2)

        j1 = div(j - 1, u2^2) + 1
        j2 = div(rem(j - 1, u2^2), u2) + 1
        j3 = rem(rem(j - 1, u2^2), u2) + 1

-        @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
-
        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dg != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j1, ii] *
-                                                     (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr1[i, j1, ii, j2, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j2, ii] *
-                                                     (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr2[i, j1, j2, ii, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += derivative_split[j3, ii] *
-                                                     (1 - isequal(j3, ii)) * # set diagonal elements to zeros
-                                                     volume_flux_arr3[i, j1, j2, j3, ii, element_dg]
-                end
-            end
-        end
+        dg_only = isapprox(alpha_element, 0, atol = atol)

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j1, ii] *
-                                                       (1 - isequal(j1, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr1[i, j1, ii, j2, j3, element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j2, ii] *
-                                                       (1 - isequal(j2, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr2[i, j1, j2, ii, j3, element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) * derivative_split[j3, ii] *
-                                                       (1 - isequal(j3, ii)) * # set diagonal elements to zeros
-                                                       volume_flux_arr3[i, j1, j2, j3, ii, element_dgfv]
-                end
-            end
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, j3, k] += (derivative_split[j1, ii] *
+                                               (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               derivative_split[j2, ii] *
+                                               (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               derivative_split[j3, ii] *
+                                               (1 - isequal(j3, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k]) * dg_only +
+                                              ((1 - alpha_element) * derivative_split[j1, ii] *
+                                               (1 - isequal(j1, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               (1 - alpha_element) * derivative_split[j2, ii] *
+                                               (1 - isequal(j2, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               (1 - alpha_element) * derivative_split[j3, ii] *
+                                               (1 - isequal(j3, ii)) * # set diagonal elements to zeros
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k]) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, j3, k] += alpha_element *
+                                          (inverse_weights[j1] *
+                                           (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j2] *
+                                           (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j3] *
+                                           (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only)
    end

    return nothing
@@ -574,15 +593,14 @@ end
function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                                  noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
                                  fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                                  u, element_ids_dgfv, derivative_split,
+                                  u, alpha, atol, derivative_split,
                                  equations::AbstractEquations{3},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+                                  volume_flux_dg::Any, noncons_flux_dg::Any,
+                                  volume_flux_fv::Any, noncons_flux_fv::Any)
    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if (j <= size(u, 2)^4 && k <= size(u, 5))
-        # length(element_ids_dgfv) == size(u, 5)
        u2 = size(u, 2)

        j1 = div(j - 1, u2^3) + 1
@@ -590,10 +608,7 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        j3 = div(rem(j - 1, u2^2), u2) + 1
        j4 = rem(j - 1, u2) + 1

-        @inbounds element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operation for better performance (to be explored).
+        dg_only = isapprox(alpha[k], 0, atol = atol)

        u_node = get_node_vars(u, equations, j1, j2, j3, k)
        u_node1 = get_node_vars(u, equations, j4, j2, j3, k)
@@ -604,9 +619,9 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
        volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations)
        volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations)

-        noncons_flux_node1 = nonconservative_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node2 = nonconservative_flux_dg(u_node, u_node2, 2, equations)
-        noncons_flux_node3 = nonconservative_flux_dg(u_node, u_node3, 3, equations)
+        noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations)
+        noncons_flux_node3 = noncons_flux_dg(u_node, u_node3, 3, equations)

        for ii in axes(u, 1)
            @inbounds begin
@@ -621,154 +636,108 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
                noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii]
                noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii]
            end
-        end
-
-        if j1 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+            # Small optimization, not much performance gain
+            if isequal(j1 + 1, j4) # avoid race condition
+                f1_node = volume_flux_fv(u_node, u_node1, 1, equations)
+                f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations)
+                f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations)

-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar1_L[ii, j1, j2, j3, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
-                    fstar1_R[ii, j1, j2, j3, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
+                    fstar1_L[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+                    fstar1_R[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j2 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)
+            if isequal(j2 + 1, j4) # avoid race condition
+                f2_node = volume_flux_fv(u_node, u_node2, 2, equations)
+                f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations)
+                f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations)

-            f2_L_node = nonconservative_flux_fv(u_ll, u_rr, 2, equations)
-            f2_R_node = nonconservative_flux_fv(u_rr, u_ll, 2, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar2_L[ii, j1, j2, j3, element_dgfv] = f2_node[ii] + 0.5f0 * f2_L_node[ii]
-                    fstar2_R[ii, j1, j2, j3, element_dgfv] = f2_node[ii] + 0.5f0 * f2_R_node[ii]
+                    fstar2_L[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+                    fstar2_R[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
                end
            end
-        end
-
-        if j3 != 1 && j4 == 1 && element_dgfv != 0 # bad
-            u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, j2, j3, element_dgfv)
-            f3_node = volume_flux_fv(u_ll, u_rr, 3, equations)
+            if isequal(j3 + 1, j4) # avoid race condition
+                f3_node = volume_flux_fv(u_node, u_node3, 3, equations)
+                f3_L_node = noncons_flux_fv(u_node, u_node3, 3, equations)
+                f3_R_node = noncons_flux_fv(u_node3, u_node, 3, equations)

-            f3_L_node = nonconservative_flux_fv(u_ll, u_rr, 3, equations)
-            f3_R_node = nonconservative_flux_fv(u_rr, u_ll, 3, equations)
-
-            for ii in axes(u, 1)
                @inbounds begin
-                    fstar3_L[ii, j1, j2, j3, element_dgfv] = f3_node[ii] + 0.5f0 * f3_L_node[ii]
-                    fstar3_R[ii, j1, j2, j3, element_dgfv] = f3_node[ii] + 0.5f0 * f3_R_node[ii]
+                    fstar3_L[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only)
+                    fstar3_R[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only)
                end
            end
        end
-    end

-    return nothing
-end
+        # if j1 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-# Kernel for calculating DG volume integral contribution
-function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                                    volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                                    noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
-                                    equations::AbstractEquations{3})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+        #     f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)

-    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dg) == size(u, 5)
-        u2 = size(du, 2) # size(du, 2) == size(u, 2)
+        #     f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations)
+        #     f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations)

-        j1 = div(j - 1, u2^2) + 1
-        j2 = div(rem(j - 1, u2^2), u2) + 1
-        j3 = rem(rem(j - 1, u2^2), u2) + 1
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar1_L[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only)
+        #             fstar1_R[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-        @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
+        # if j2 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-        @inbounds begin
-            element_dg = element_ids_dg[k] # check if `element_dg` is zero
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-            alpha_element = alpha[k]
-        end
+        #     f2_node = volume_flux_fv(u_ll, u_rr, 2, equations)

-        if element_dg != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations)
+        #     f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr1[i, j1, ii, j2, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr2[i, j1, j2, ii, j3, element_dg]
-                    du[i, j1, j2, j3, element_dg] += volume_flux_arr3[i, j1, j2, j3, ii, element_dg]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, j3,
-                                                               element_dg]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, j3,
-                                                               element_dg]
-                    integral_contribution += derivative_split[j3, ii] *
-                                             noncons_flux_arr3[i, j1, j2, j3, ii,
-                                                               element_dg]
-                end
-            end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar2_L[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only)
+        #             fstar2_R[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end

-            @inbounds du[i, j1, j2, j3, element_dg] += 0.5f0 * integral_contribution
-        end
+        # if j3 != 1 && j4 == 1 # bad
+        #     u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k)
+        #     u_rr = get_node_vars(u, equations, j1, j2, j3, k)

-        if element_dgfv != 0 # bad
-            integral_contribution = zero(eltype(du))
+        #     f3_node = volume_flux_fv(u_ll, u_rr, 3, equations)

-            for ii in axes(du, 2)
-                @inbounds begin
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr1[i, j1, ii, j2, j3,
-                                                                        element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr2[i, j1, j2, ii, j3,
-                                                                        element_dgfv]
-                    du[i, j1, j2, j3, element_dgfv] += (1 - alpha_element) *
-                                                       volume_flux_arr3[i, j1, j2, j3, ii,
-                                                                        element_dgfv]
-
-                    integral_contribution += derivative_split[j1, ii] *
-                                             noncons_flux_arr1[i, j1, ii, j2, j3,
-                                                               element_dgfv]
-                    integral_contribution += derivative_split[j2, ii] *
-                                             noncons_flux_arr2[i, j1, j2, ii, j3,
-                                                               element_dgfv]
-                    integral_contribution += derivative_split[j3, ii] *
-                                             noncons_flux_arr3[i, j1, j2, j3, ii,
-                                                               element_dgfv]
-                end
-            end
+        #     f3_L_node = noncons_flux_fv(u_ll, u_rr, 3, equations)
+        #     f3_R_node = noncons_flux_fv(u_rr, u_ll, 3, equations)

-            @inbounds du[i, j1, j2, j3, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
-        end
+        #     for ii in axes(u, 1)
+        #         @inbounds begin
+        #             fstar3_L[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only)
+        #             fstar3_R[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only)
+        #         end
+        #     end
+        # end
    end

    return nothing
end

-# Kernel for calculating FV volume integral contribution
-function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                    fstar3_L, fstar3_R, inverse_weights, element_ids_dgfv, alpha)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+# Kernel for calculating pure DG and DG-FV volume integrals
+function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights,
+                                      volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
+                                      noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
+                                      fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                                      atol, equations::AbstractEquations{3})
+    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

-    if (j <= size(du, 2)^3 && k <= size(du, 5))
-        # length(element_ids_dgfv) == size(du, 5)
+    if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5))
        u2 = size(du, 2) # size(du, 2) == size(u, 2)

        j1 = div(j - 1, u2^2) + 1
@@ -776,32 +745,47 @@ function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R,
        j3 = rem(rem(j - 1, u2^2), u2) + 1

        @inbounds begin
-            element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+            du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros
            alpha_element = alpha[k]
        end

-        if element_dgfv != 0 # bad
-            for ii in axes(du, 1)
-                @inbounds du[ii, j1, j2, j3, element_dgfv] += (alpha_element *
-                                                               (inverse_weights[j1] *
-                                                                (fstar1_L[ii, j1 + 1, j2, j3,
-                                                                          element_dgfv] -
-                                                                 fstar1_R[ii, j1, j2, j3, element_dgfv]) +
-                                                                inverse_weights[j2] *
-                                                                (fstar2_L[ii, j1, j2 + 1, j3,
-                                                                          element_dgfv] -
-                                                                 fstar2_R[ii, j1, j2, j3, element_dgfv]) +
-                                                                inverse_weights[j3] *
-                                                                (fstar3_L[ii, j1, j2, j3 + 1,
-                                                                          element_dgfv] -
-                                                                 fstar3_R[ii, j1, j2, j3, element_dgfv])))
-            end
+        dg_only = isapprox(alpha_element, 0, atol = atol)
+
+        for ii in axes(du, 2)
+            @inbounds du[i, j1, j2, j3, k] += (volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               volume_flux_arr3[i, j1, j2, j3, ii, k] +
+                                               0.5f0 *
+                                               (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] +
+                                                derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] +
+                                                derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * dg_only +
+                                              ((1 - alpha_element) * volume_flux_arr1[i, j1, ii, j2, j3, k] +
+                                               (1 - alpha_element) * volume_flux_arr2[i, j1, j2, ii, j3, k] +
+                                               (1 - alpha_element) * volume_flux_arr3[i, j1, j2, j3, ii, k] +
+                                               0.5f0 * (1 - alpha_element) *
+                                               (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] +
+                                                derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] +
+                                                derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * (1 - dg_only)
        end
+
+        @inbounds du[i, j1, j2, j3, k] += alpha_element *
+                                          (inverse_weights[j1] *
+                                           (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j2] *
+                                           (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) +
+                                           inverse_weights[j3] *
+                                           (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only)
    end

    return nothing
end

+############################################################################## New optimization
+# Kernel for calculating DG-FV volume integrals without nonconservative terms
+
# Kernel for prolonging two interfaces
function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations,
                                    equations::AbstractEquations{3})
@@ -1990,27 +1974,22 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                     dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R
+    fstar3_L = cache.fstar3_L
+    fstar3_R = cache.fstar3_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -2018,13 +1997,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    volume_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-    fstar3_L = cache.fstar3_L
-    fstar3_R = cache.fstar3_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           volume_flux_arr3,
@@ -2032,40 +2004,31 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                                                                           fstar1_R, fstar2_L,
                                                                           fstar2_R, fstar3_L,
                                                                           fstar3_R, u,
-                                                                           element_ids_dgfv,
+                                                                           alpha, atol,
                                                                           equations,
                                                                           volume_flux_dg,
                                                                           volume_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, fstar1_L,
-                            fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, element_ids_dgfv,
+                            fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, alpha, atol,
                            equations, volume_flux_dg, volume_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4,
                                                   size(u, 5))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               volume_flux_arr3,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^3, size(du, 5))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               fstar3_L, fstar3_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                              inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3,
-                                                     size(u, 5))...)
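+    # The former volume_integral_dg_kernel!/volume_integral_fv_kernel! launches are
+    # fused into a single volume_integral_dgfv_kernel! launch below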
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  volume_flux_arr3,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  fstar3_L, fstar3_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, volume_flux_arr3, fstar1_L, fstar1_R,
+                                fstar2_L, fstar2_R, fstar3_L, fstar3_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^3, size(du, 5))...)

    return nothing
end
@@ -2075,30 +2038,25 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
    RealT = eltype(du)

-    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
-    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    volume_flux_dg, noncons_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, noncons_flux_fv = dg.volume_integral.volume_flux_fv
    indicator = dg.volume_integral.indicator

+    derivative_split = dg.basis.derivative_split
+    inverse_weights = dg.basis.inverse_weights
+
+    fstar1_L = cache.fstar1_L
+    fstar1_R = cache.fstar1_R
+    fstar2_L = cache.fstar2_L
+    fstar2_R = cache.fstar2_R
+    fstar3_L = cache.fstar3_L
+    fstar3_R = cache.fstar3_R

    # TODO: Get copies of `u` and `du` on both device and host
    # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
    alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
    alpha = CuArray(alpha)

-    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
-    element_ids_dg = CUDA.zeros(Int, length(alpha))
-    element_ids_dgfv = CUDA.zeros(Int, length(alpha))
-
-    # See `pure_and_blended_element_ids!` in Trixi.jl (now deprecated)
-    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
-                                                                                              element_ids_dgfv,
-                                                                                              alpha,
-                                                                                              atol)
-    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
-                                      kernel_configurator_1d(pure_blended_element_count_kernel,
-                                                             length(alpha))...)
-    derivative_split = dg.basis.derivative_split
-    inverse_weights = dg.basis.inverse_weights

    volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                      size(u, 2), size(u, 5))
    volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -2112,13 +2070,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
    noncons_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                       size(u, 2), size(u, 5))

-    fstar1_L = cache.fstar1_L
-    fstar1_R = cache.fstar1_R
-    fstar2_L = cache.fstar2_L
-    fstar2_R = cache.fstar2_R
-    fstar3_L = cache.fstar3_L
-    fstar3_R = cache.fstar3_R
-
    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1,
                                                                           volume_flux_arr2,
                                                                           volume_flux_arr3,
@@ -2128,49 +2079,40 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
                                                                           fstar1_L, fstar1_R,
                                                                           fstar2_L, fstar2_R,
                                                                           fstar3_L, fstar3_R,
-                                                                           u, element_ids_dgfv,
+                                                                           u, alpha, atol,
                                                                           derivative_split,
                                                                           equations,
                                                                           volume_flux_dg,
-                                                                           nonconservative_flux_dg,
+                                                                           noncons_flux_dg,
                                                                           volume_flux_fv,
-                                                                           nonconservative_flux_fv)
+                                                                           noncons_flux_fv)
    volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
                            noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3,
                            fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                            u, element_ids_dgfv, derivative_split, equations, volume_flux_dg,
-                            nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv;
+                            u, alpha, atol, derivative_split, equations, volume_flux_dg,
+                            noncons_flux_dg, volume_flux_fv, noncons_flux_fv;
                            kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4,
                                                   size(u, 5))...)

-    volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
-                                                                               element_ids_dgfv,
-                                                                               alpha,
-                                                                               derivative_split,
-                                                                               volume_flux_arr1,
-                                                                               volume_flux_arr2,
-                                                                               volume_flux_arr3,
-                                                                               noncons_flux_arr1,
-                                                                               noncons_flux_arr2,
-                                                                               noncons_flux_arr3,
-                                                                               equations)
-    volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-                              volume_flux_arr1, volume_flux_arr2, volume_flux_arr3,
-                              noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, equations;
-                              kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1),
-                                                     size(du, 2)^3, size(du, 5))...)
-
-    volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L,
-                                                                               fstar1_R,
-                                                                               fstar2_L, fstar2_R,
-                                                                               fstar3_L, fstar3_R,
-                                                                               inverse_weights,
-                                                                               element_ids_dgfv,
-                                                                               alpha)
-    volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                              inverse_weights, element_ids_dgfv, alpha;
-                              kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3,
-                                                     size(u, 5))...)
+    volume_integral_dgfv_kernel = @cuda launch=false volume_integral_dgfv_kernel!(du, alpha,
+                                                                                  derivative_split,
+                                                                                  inverse_weights,
+                                                                                  volume_flux_arr1,
+                                                                                  volume_flux_arr2,
+                                                                                  volume_flux_arr3,
+                                                                                  noncons_flux_arr1,
+                                                                                  noncons_flux_arr2,
+                                                                                  noncons_flux_arr3,
+                                                                                  fstar1_L, fstar1_R,
+                                                                                  fstar2_L, fstar2_R,
+                                                                                  fstar3_L, fstar3_R,
+                                                                                  atol, equations)
+    volume_integral_dgfv_kernel(du, alpha, derivative_split, inverse_weights, volume_flux_arr1,
+                                volume_flux_arr2, volume_flux_arr3, noncons_flux_arr1,
+                                noncons_flux_arr2, noncons_flux_arr3, fstar1_L, fstar1_R,
+                                fstar2_L, fstar2_R, fstar3_L, fstar3_R, atol, equations;
+                                kernel_configurator_3d(volume_integral_dgfv_kernel, size(du, 1),
+                                                       size(du, 2)^3, size(du, 5))...)
    return nothing
end
diff --git a/test/tree_dgsem_2d/mhd_shock.jl b/test/tree_dgsem_2d/mhd_shock.jl
index e1a02165..5aa3f75f 100644
--- a/test/tree_dgsem_2d/mhd_shock.jl
+++ b/test/tree_dgsem_2d/mhd_shock.jl
@@ -12,9 +12,10 @@ include("../test_macros.jl")
    surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
    volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)

-    polydeg = 4
-    basis = LobattoLegendreBasis(polydeg)
-    basis_gpu = LobattoLegendreBasisGPU(polydeg)
+
+    basis = LobattoLegendreBasis(4)
+    basis_gpu = LobattoLegendreBasisGPU(4)
+
    indicator_sc = IndicatorHennemannGassner(equations, basis,
                                             alpha_max = 0.5,
                                             alpha_min = 0.001,
@@ -24,10 +25,8 @@ include("../test_macros.jl")
                                                   volume_flux_dg = volume_flux,
                                                   volume_flux_fv = surface_flux)

-    solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux,
-                   volume_integral = volume_integral)
-    solver_gpu = DGSEMGPU(polydeg = polydeg, surface_flux = surface_flux,
-                          volume_integral = volume_integral)
+    solver = DGSEM(basis, surface_flux, volume_integral)
+    solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)

    coordinates_min = (-2.0, -2.0)
    coordinates_max = (2.0, 2.0)
diff --git a/test/tree_dgsem_3d/mhd_shock.jl b/test/tree_dgsem_3d/mhd_shock.jl
index 0e14cd3b..ffcf7ea4 100644
--- a/test/tree_dgsem_3d/mhd_shock.jl
+++ b/test/tree_dgsem_3d/mhd_shock.jl
@@ -12,9 +12,10 @@ include("../test_macros.jl")
    surface_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
    volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)

-    polydeg = 4
-    basis = LobattoLegendreBasis(polydeg)
-    basis_gpu = LobattoLegendreBasisGPU(polydeg)
+
+    basis = LobattoLegendreBasis(4)
+    basis_gpu = LobattoLegendreBasisGPU(4)
+
    indicator_sc = IndicatorHennemannGassner(equations, basis,
                                             alpha_max = 0.5,
                                             alpha_min = 0.001,
@@ -24,10 +25,8 @@ include("../test_macros.jl")
                                                   volume_flux_dg = volume_flux,
                                                   volume_flux_fv = surface_flux)

-    solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux,
-                   volume_integral = volume_integral)
-    solver_gpu = DGSEMGPU(polydeg = polydeg, surface_flux = surface_flux,
-                          volume_integral = volume_integral)
+    solver = DGSEM(basis, surface_flux, volume_integral)
+    solver_gpu = DGSEMGPU(basis_gpu, surface_flux, volume_integral)

    coordinates_min = (-2.0, -2.0, -2.0)
    coordinates_max = (2.0, 2.0, 2.0)
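The fused kernels above select between the pure DG update and the alpha-blended
DG-FV update without branching on the element index. The sketch below shows this
masking pattern on plain CPU arrays; `blended_volume_integral!`, its arguments,
and the `atol` default are hypothetical stand-ins for illustration, not functions
from this repository:

    # Hypothetical sketch: branchless blending of a pure DG update and an
    # alpha-weighted DG/FV update, mirroring the `dg_only`/`1 - dg_only`
    # masking in the fused kernels. Arrays are (node index) x (element index).
    function blended_volume_integral!(du, du_dg, du_fv, alpha; atol = 1.0f-12)
        for k in axes(du, 2)
            # Bool acts as 0/1 in arithmetic, so no `if` over elements is needed
            dg_only = isapprox(alpha[k], 0, atol = atol)
            for i in axes(du, 1)
                du[i, k] = du_dg[i, k] * dg_only +
                           ((1 - alpha[k]) * du_dg[i, k] + alpha[k] * du_fv[i, k]) *
                           (1 - dg_only)
            end
        end
        return nothing
    end

    du = zeros(Float32, 4, 3)
    du_dg = rand(Float32, 4, 3)
    du_fv = rand(Float32, 4, 3)
    alpha = Float32[0.0, 0.3, 1.0] # element 1 is pure DG, elements 2 and 3 blend
    blended_volume_integral!(du, du_dg, du_fv, alpha)

Because every thread executes the same instructions regardless of its element's
blending factor, this trades a few redundant multiplications for the removal of
the per-element index lists and their divergent branches.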