diff --git a/docs/src/examples/3rd.md b/docs/src/examples/3rd.md
index 25135593e6..762b0b8d54 100644
--- a/docs/src/examples/3rd.md
+++ b/docs/src/examples/3rd.md
@@ -43,11 +43,11 @@ discretization = PhysicsInformedNN(chain, QuasiRandomTraining(20))
 prob = discretize(pde_system, discretization)

 callback = function (p, l)
-    println("Current loss is: $l")
+    (p.iter % 500 == 0 || p.iter == 2000) && println("Current loss is: $l")
     return false
 end

-res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000)
+res = solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000, callback)

 phi = discretization.phi
 ```
diff --git a/docs/src/examples/complex.md b/docs/src/examples/complex.md
index ff9f1339a5..8d69dacc8a 100644
--- a/docs/src/examples/complex.md
+++ b/docs/src/examples/complex.md
@@ -5,10 +5,7 @@ NeuralPDE supports training PINNs with complex differential equations. This exam
 As the input to this neural network is time which is real, we need to initialize the parameters of the neural network with complex values for it to output and train with complex values.

 ```@example complex
-using Random, NeuralPDE
-using OrdinaryDiffEq
-using Lux, OptimizationOptimisers
-using Plots
+using Random, NeuralPDE, OrdinaryDiffEq, Lux, OptimizationOptimisers, Plots

 rng = Random.default_rng()
 Random.seed!(100)
@@ -30,11 +27,9 @@ parameters = [2.0, 0.0, 1.0]

 problem = ODEProblem(bloch_equations, u0, time_span, parameters)

-chain = Lux.Chain(
-    Lux.Dense(1, 16, tanh;
-        init_weight = (rng, a...) -> Lux.kaiming_normal(rng, ComplexF64, a...)),
-    Lux.Dense(
-        16, 4; init_weight = (rng, a...) -> Lux.kaiming_normal(rng, ComplexF64, a...))
+chain = Chain(
+    Dense(1, 16, tanh; init_weight = kaiming_normal(ComplexF64)),
+    Dense(16, 4; init_weight = kaiming_normal(ComplexF64))
 )

 ps, st = Lux.setup(rng, chain)
diff --git a/docs/src/examples/heterogeneous.md b/docs/src/examples/heterogeneous.md
index 069116dede..ff04042081 100644
--- a/docs/src/examples/heterogeneous.md
+++ b/docs/src/examples/heterogeneous.md
@@ -31,11 +31,9 @@
 domains = [x ∈ Interval(0.0, 1.0), y ∈ Interval(0.0, 1.0)]

 numhid = 3
-chains = [[Lux.Chain(Dense(1, numhid, Lux.σ), Dense(numhid, numhid, Lux.σ),
-              Dense(numhid, 1)) for i in 1:2]
-          [Lux.Chain(Dense(2, numhid, Lux.σ), Dense(numhid, numhid, Lux.σ),
-              Dense(numhid, 1)) for i in 1:2]]
-discretization = NeuralPDE.PhysicsInformedNN(chains, QuadratureTraining())
+chains = [[Chain(Dense(1, numhid, σ), Dense(numhid, numhid, σ), Dense(numhid, 1)) for i in 1:2]
+          [Chain(Dense(2, numhid, σ), Dense(numhid, numhid, σ), Dense(numhid, 1)) for i in 1:2]]
+discretization = PhysicsInformedNN(chains, QuadratureTraining())

 @named pde_system = PDESystem(eq, bcs, domains, [x, y], [p(x), q(y), r(x, y), s(y, x)])
 prob = SciMLBase.discretize(pde_system, discretization)
diff --git a/docs/src/examples/ks.md b/docs/src/examples/ks.md
index 55f75f825d..8afff0e29f 100644
--- a/docs/src/examples/ks.md
+++ b/docs/src/examples/ks.md
@@ -53,14 +53,13 @@ bcs = [u(x, 0) ~ u_analytic(x, 0),
        Dx(u(10, t)) ~ du(10, t)]

 # Space and time domains
-domains = [x ∈ Interval(-10.0, 10.0),
-    t ∈ Interval(0.0, 1.0)]
+domains = [x ∈ Interval(-10.0, 10.0), t ∈ Interval(0.0, 1.0)]

 # Discretization
 dx = 0.4; dt = 0.2;

 # Neural network
-chain = Lux.Chain(Dense(2, 12, Lux.σ), Dense(12, 12, Lux.σ), Dense(12, 1))
+chain = Chain(Dense(2, 12, σ), Dense(12, 12, σ), Dense(12, 1))

 discretization = PhysicsInformedNN(chain, GridTraining([dx, dt]))
 @named pde_system = PDESystem(eq, bcs, domains, [x, t], [u(x, t)])
@@ -72,7 +71,7 @@ callback = function (p, l)
 end

 opt = OptimizationOptimJL.BFGS()
-res = Optimization.solve(prob, opt; maxiters = 2000)
+res = Optimization.solve(prob, opt; maxiters = 2000, callback)

 phi = discretization.phi
 ```
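Note on the `complex.md` hunk above: the rewrite relies on initializer partial application, where calling an initializer with only an element type returns a new initializer closure. A minimal sketch of what the shorthand does, assuming a Lux version that re-exports `kaiming_normal` from WeightInitializers.jl:

```julia
using Lux, Random

# Partially applying the element type yields a closure with the
# (rng, dims...) signature that `init_weight` expects, replacing the
# explicit `(rng, a...) -> Lux.kaiming_normal(rng, ComplexF64, a...)` lambdas.
init = kaiming_normal(ComplexF64)
W = init(Random.default_rng(), 16, 1)  # weight shape of Dense(1, 16)
@assert eltype(W) == ComplexF64
```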
diff --git a/docs/src/examples/linear_parabolic.md b/docs/src/examples/linear_parabolic.md
index c481114a20..6f454f1261 100644
--- a/docs/src/examples/linear_parabolic.md
+++ b/docs/src/examples/linear_parabolic.md
@@ -70,7 +70,7 @@ domains = [x ∈ Interval(0.0, 1.0),
 # Neural network
 input_ = length(domains)
 n = 15
-chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:2]
+chain = [Chain(Dense(input_, n, σ), Dense(n, n, σ), Dense(n, 1)) for _ in 1:2]

 strategy = StochasticTraining(500)
 discretization = PhysicsInformedNN(chain, strategy)
@@ -82,18 +82,17 @@ sym_prob = symbolic_discretize(pdesystem, discretization)
 pde_inner_loss_functions = sym_prob.loss_functions.pde_loss_functions
 bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions

-global iteration = 0
 callback = function (p, l)
-    if iteration % 10 == 0
+    if p.iter % 500 == 0
+        println("iter: ", p.iter)
         println("loss: ", l)
         println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions))
         println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions))
     end
-    global iteration += 1
     return false
 end

-res = Optimization.solve(prob, OptimizationOptimisers.Adam(1e-2); maxiters = 10000)
+res = solve(prob, OptimizationOptimisers.Adam(1e-2); maxiters = 5000, callback)

 phi = discretization.phi

diff --git a/docs/src/examples/nonlinear_elliptic.md b/docs/src/examples/nonlinear_elliptic.md
index d7f8a58579..50e2ab3351 100644
--- a/docs/src/examples/nonlinear_elliptic.md
+++ b/docs/src/examples/nonlinear_elliptic.md
@@ -71,13 +71,12 @@ der_ = [Dy(u(x, y)) ~ Dyu(x, y),
 bcs__ = [bcs_; der_]

 # Space and time domains
-domains = [x ∈ Interval(0.0, 1.0),
-    y ∈ Interval(0.0, 1.0)]
+domains = [x ∈ Interval(0.0, 1.0), y ∈ Interval(0.0, 1.0)]

 # Neural network
 input_ = length(domains)
 n = 15
-chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:6] # 1:number of @variables
+chain = [Chain(Dense(input_, n, σ), Dense(n, n, σ), Dense(n, 1)) for _ in 1:6] # 1:number of @variables

 strategy = GridTraining(0.01)
 discretization = PhysicsInformedNN(chain, strategy)
@@ -91,19 +90,17 @@ pde_inner_loss_functions = sym_prob.loss_functions.pde_loss_functions
 bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions[1:6]
 approx_derivative_loss_functions = sym_prob.loss_functions.bc_loss_functions[7:end]

-global iteration = 0
 callback = function (p, l)
-    if iteration % 10 == 0
+    if p.iter % 10 == 0
         println("loss: ", l)
         println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions))
         println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions))
         println("der_losses: ", map(l_ -> l_(p.u), approx_derivative_loss_functions))
     end
-    global iteration += 1
     return false
 end

-res = Optimization.solve(prob, BFGS(); maxiters = 100)
+res = solve(prob, BFGS(); maxiters = 100, callback)

 phi = discretization.phi

diff --git a/docs/src/examples/nonlinear_hyperbolic.md b/docs/src/examples/nonlinear_hyperbolic.md
index 08e2552c71..14688b8e9c 100644
--- a/docs/src/examples/nonlinear_hyperbolic.md
+++ b/docs/src/examples/nonlinear_hyperbolic.md
@@ -81,7 +81,7 @@ domains = [t ∈ Interval(0.0, 1.0),
 # Neural network
 input_ = length(domains)
 n = 15
-chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:2]
+chain = [Chain(Dense(input_, n, σ), Dense(n, n, σ), Dense(n, 1)) for _ in 1:2]

 strategy = QuadratureTraining()
 discretization = PhysicsInformedNN(chain, strategy)
@@ -100,7 +100,7 @@ callback = function (p, l)
     return false
 end

-res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 200)
+res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 200, callback)

 phi = discretization.phi
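For reference, the per-component monitoring pattern that `linear_parabolic.md` and `nonlinear_elliptic.md` now share: the component losses are pulled out of the symbolic problem once, then evaluated at the current parameters inside the callback. A fragment, assuming `sym_prob = symbolic_discretize(pdesystem, discretization)` has been built as in those examples:

```julia
# Assumes `sym_prob` from `symbolic_discretize`, as in the hunks above.
pde_inner_loss_functions = sym_prob.loss_functions.pde_loss_functions
bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions

callback = function (p, l)
    if p.iter % 500 == 0
        println("iter: ", p.iter, "  loss: ", l)
        # Each component loss is a function of the flat parameter vector,
        # so it can be evaluated at the current parameters `p.u`.
        println("pde_losses: ", map(l_ -> l_(p.u), pde_inner_loss_functions))
        println("bcs_losses: ", map(l_ -> l_(p.u), bcs_inner_loss_functions))
    end
    return false
end
```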
diff --git a/docs/src/examples/wave.md b/docs/src/examples/wave.md
index d53e4df65a..8ef6d33085 100644
--- a/docs/src/examples/wave.md
+++ b/docs/src/examples/wave.md
@@ -42,7 +42,7 @@ domains = [t ∈ Interval(0.0, 1.0),
 dx = 0.1

 # Neural network
-chain = Lux.Chain(Dense(2, 16, Lux.σ), Dense(16, 16, Lux.σ), Dense(16, 1))
+chain = Chain(Dense(2, 16, σ), Dense(16, 16, σ), Dense(16, 1))

 discretization = PhysicsInformedNN(chain, GridTraining(dx))
 @named pde_system = PDESystem(eq, bcs, domains, [t, x], [u(t, x)])
@@ -55,7 +55,7 @@ end

 # optimizer
 opt = OptimizationOptimJL.BFGS()
-res = Optimization.solve(prob, opt; callback = callback, maxiters = 1200)
+res = Optimization.solve(prob, opt; callback, maxiters = 1200)
 phi = discretization.phi
 ```
@@ -138,11 +138,11 @@ domains = [t ∈ Interval(0.0, L),
 # Neural network
 inn = 25
 innd = 4
-chain = [[Lux.Chain(Dense(2, inn, Lux.tanh),
-              Dense(inn, inn, Lux.tanh),
-              Dense(inn, inn, Lux.tanh),
+chain = [[Chain(Dense(2, inn, tanh),
+              Dense(inn, inn, tanh),
+              Dense(inn, inn, tanh),
               Dense(inn, 1)) for _ in 1:3]
-          [Lux.Chain(Dense(2, innd, Lux.tanh), Dense(innd, 1)) for _ in 1:2]]
+          [Chain(Dense(2, innd, tanh), Dense(innd, 1)) for _ in 1:2]]

 strategy = GridTraining(0.02)
 discretization = PhysicsInformedNN(chain, strategy;)
diff --git a/docs/src/tutorials/Lotka_Volterra_BPINNs.md b/docs/src/tutorials/Lotka_Volterra_BPINNs.md
index a8a2bb0eb3..e7d62c926f 100644
--- a/docs/src/tutorials/Lotka_Volterra_BPINNs.md
+++ b/docs/src/tutorials/Lotka_Volterra_BPINNs.md
@@ -70,8 +70,7 @@ Let's define a PINN.

 ```@example bpinn
 # Neural Networks must have 2 outputs as u -> [dx,dy] in function lotka_volterra()
-chain = Lux.Chain(Lux.Dense(1, 6, tanh), Lux.Dense(6, 6, tanh),
-    Lux.Dense(6, 2))
+chain = Chain(Dense(1, 6, tanh), Dense(6, 6, tanh), Dense(6, 2))
 ```

 The dataset we generated can be passed for doing parameter estimation using provided priors in `param` keyword argument for [`BNNODE`](@ref).
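A short aside on the `Lux.` prefixes being dropped throughout these hunks: `Chain`, `Dense`, and activations such as `σ` are exported by Lux itself (and `tanh` is `Base.tanh`), so a bare `using Lux` suffices. A minimal sketch, assuming a Lux version that re-exports NNlib's activations:

```julia
using Lux  # exports Chain, Dense, and NNlib activations such as σ

# `tanh` is Base's hyperbolic tangent; `σ` is the logistic sigmoid from
# NNlib, re-exported by Lux -- neither needs the `Lux.` qualification.
chain = Chain(Dense(1, 6, tanh), Dense(6, 6, tanh), Dense(6, 2))
```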
diff --git a/docs/src/tutorials/dae.md b/docs/src/tutorials/dae.md
index 1f468caedd..29491e77ab 100644
--- a/docs/src/tutorials/dae.md
+++ b/docs/src/tutorials/dae.md
@@ -12,10 +12,7 @@ This tutorial is an introduction to using physics-informed neural networks (PINN
 Let's solve a simple DAE system:

 ```@example dae
-using NeuralPDE
-using Random
-using OrdinaryDiffEq, Statistics
-using Lux, OptimizationOptimisers
+using NeuralPDE, Random, OrdinaryDiffEq, Statistics, Lux, OptimizationOptimisers

 example = (du, u, p, t) -> [cos(2pi * t) - du[1], u[2] + cos(2pi * t) - du[2]]
 u₀ = [1.0, -1.0]
diff --git a/docs/src/tutorials/derivative_neural_network.md b/docs/src/tutorials/derivative_neural_network.md
index 3963be4308..bd26ce50fe 100644
--- a/docs/src/tutorials/derivative_neural_network.md
+++ b/docs/src/tutorials/derivative_neural_network.md
@@ -91,14 +91,13 @@ input_ = length(domains)
 n = 15
 chain = [Lux.Chain(Dense(input_, n, Lux.σ), Dense(n, n, Lux.σ), Dense(n, 1)) for _ in 1:7]

-training_strategy = NeuralPDE.QuadratureTraining(;
-    batch = 200, reltol = 1e-6, abstol = 1e-6)
-discretization = NeuralPDE.PhysicsInformedNN(chain, training_strategy)
+training_strategy = QuadratureTraining(; batch = 200, reltol = 1e-6, abstol = 1e-6)
+discretization = PhysicsInformedNN(chain, training_strategy)

 vars = [u1(t, x), u2(t, x), u3(t, x), Dxu1(t, x), Dtu1(t, x), Dxu2(t, x), Dtu2(t, x)]
 @named pdesystem = PDESystem(eqs_, bcs__, domains, [t, x], vars)
-prob = NeuralPDE.discretize(pdesystem, discretization)
-sym_prob = NeuralPDE.symbolic_discretize(pdesystem, discretization)
+prob = discretize(pdesystem, discretization)
+sym_prob = symbolic_discretize(pdesystem, discretization)

 pde_inner_loss_functions = sym_prob.loss_functions.pde_loss_functions
 bcs_inner_loss_functions = sym_prob.loss_functions.bc_loss_functions[1:7]
@@ -112,9 +111,9 @@ callback = function (p, l)
     return false
 end

-res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000)
+res = Optimization.solve(prob, OptimizationOptimisers.Adam(0.01); maxiters = 2000, callback)
 prob = remake(prob, u0 = res.u)
-res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); maxiters = 200)
+res = Optimization.solve(prob, LBFGS(linesearch = BackTracking()); maxiters = 200, callback)

 phi = discretization.phi
 ```
diff --git a/docs/src/tutorials/dgm.md b/docs/src/tutorials/dgm.md
index a769795eff..f684d419c5 100644
--- a/docs/src/tutorials/dgm.md
+++ b/docs/src/tutorials/dgm.md
@@ -53,7 +53,6 @@ u(t, 1) & = 0
 ```@example dgm
 using NeuralPDE
 using ModelingToolkit, Optimization, OptimizationOptimisers
-using Lux: tanh, identity
 using Distributions
 using ModelingToolkit: Interval, infimum, supremum
 using MethodOfLines, OrdinaryDiffEq
@@ -95,18 +94,15 @@ strategy = QuasiRandomTraining(256, minibatch = 32)
 discretization = DeepGalerkin(2, 1, 50, 5, tanh, tanh, identity, strategy)
 @named pde_system = PDESystem(eq, bcs, domains, [t, x], [u(t, x)])
 prob = discretize(pde_system, discretization)
-global iter = 0
+
 callback = function (p, l)
-    global iter += 1
-    if iter % 20 == 0
-        println("$iter => $l")
-    end
+    (p.iter % 20 == 0) && println("$(p.iter) => $l")
     return false
 end

-res = Optimization.solve(prob, Adam(0.1); maxiters = 100)
+res = solve(prob, Adam(0.1); maxiters = 100)
 prob = remake(prob, u0 = res.u)
-res = Optimization.solve(prob, Adam(0.01); maxiters = 500)
+res = solve(prob, Adam(0.01); maxiters = 500)

 phi = discretization.phi
 u_predict = [first(phi([t, x], res.minimizer)) for t in ts, x in xs]
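The `dgm.md` hunk above shows the two patterns this PR standardizes on: reading the iteration count from the state that Optimization.jl passes into the callback, and warm-starting a second solve via `remake`. A self-contained sketch on a stand-in objective, assuming an Optimization.jl version whose callback state exposes `iter` and `u`:

```julia
using Optimization, OptimizationOptimisers, ForwardDiff

# Stand-in objective, just to exercise the callback and warm-start machinery.
rosenbrock(u, p) = (1 - u[1])^2 + 100 * (u[2] - u[1]^2)^2
optf = OptimizationFunction(rosenbrock, Optimization.AutoForwardDiff())
prob = OptimizationProblem(optf, zeros(2))

# `p.iter` and `p.u` come from the optimization state, replacing the old
# `global iter` bookkeeping; returning `true` would stop the solve early.
callback = function (p, l)
    p.iter % 20 == 0 && println("$(p.iter) => $l")
    return false
end

res = solve(prob, Adam(0.1); maxiters = 100, callback)
prob = remake(prob, u0 = res.u)  # warm start the fine-tuning pass
res = solve(prob, Adam(0.01); maxiters = 500, callback)
```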
diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md
index 82a07dceb2..b1f2923471 100644
--- a/docs/src/tutorials/gpu.md
+++ b/docs/src/tutorials/gpu.md
@@ -33,11 +33,8 @@ using the `gpu` function on the initial parameters, like:
 using Lux, LuxCUDA, ComponentArrays, Random
 const gpud = gpu_device()
 inner = 25
-chain = Chain(Dense(3, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, 1))
+chain = Chain(Dense(3, inner, σ), Dense(inner, inner, σ), Dense(inner, inner, σ),
+    Dense(inner, inner, σ), Dense(inner, 1))
 ps = Lux.setup(Random.default_rng(), chain)[1]
 ps = ps |> ComponentArray |> gpud .|> Float64
 ```
@@ -82,18 +79,13 @@ domains = [t ∈ Interval(t_min, t_max),

 # Neural network
 inner = 25
-chain = Chain(Dense(3, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, inner, Lux.σ),
-    Dense(inner, 1))
+chain = Chain(Dense(3, inner, σ), Dense(inner, inner, σ), Dense(inner, inner, σ),
+    Dense(inner, inner, σ), Dense(inner, 1))

 strategy = QuasiRandomTraining(100)
 ps = Lux.setup(Random.default_rng(), chain)[1]
 ps = ps |> ComponentArray |> gpud .|> Float64
-discretization = PhysicsInformedNN(chain,
-    strategy,
-    init_params = ps)
+discretization = PhysicsInformedNN(chain, strategy; init_params = ps)

 @named pde_system = PDESystem(eq, bcs, domains, [t, x, y], [u(t, x, y)])
 prob = discretize(pde_system, discretization)
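The parameter pipeline in the `gpu.md` hunks is worth spelling out: `Lux.setup` returns a nested `NamedTuple`, which is flattened into a `ComponentArray`, moved to the device, and promoted elementwise to `Float64`. A sketch, assuming a CUDA device is available through LuxCUDA:

```julia
using Lux, LuxCUDA, ComponentArrays, Random

const gpud = gpu_device()  # device handle, as in the tutorial

inner = 25
chain = Chain(Dense(3, inner, σ), Dense(inner, inner, σ), Dense(inner, 1))

ps = Lux.setup(Random.default_rng(), chain)[1]  # NamedTuple of parameters
# Flatten -> move to GPU -> promote each element to Float64 (`.|>` broadcasts).
ps = ps |> ComponentArray |> gpud .|> Float64

# The parameters are then handed to the discretization as a keyword:
# discretization = PhysicsInformedNN(chain, strategy; init_params = ps)
```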
diff --git a/docs/src/tutorials/low_level.md b/docs/src/tutorials/low_level.md
index 90c75de303..4f7a232654 100644
--- a/docs/src/tutorials/low_level.md
+++ b/docs/src/tutorials/low_level.md
@@ -36,8 +36,8 @@ domains = [t ∈ Interval(0.0, 1.0),
            x ∈ Interval(-1.0, 1.0)]

 # Neural network
-chain = Lux.Chain(Dense(2, 16, Lux.σ), Dense(16, 16, Lux.σ), Dense(16, 1))
-strategy = NeuralPDE.QuadratureTraining(; abstol = 1e-6, reltol = 1e-6, batch = 200)
+chain = Chain(Dense(2, 16, σ), Dense(16, 16, σ), Dense(16, 1))
+strategy = QuadratureTraining(; abstol = 1e-6, reltol = 1e-6, batch = 200)

 indvars = [t, x]
 depvars = [u(t, x)]
@@ -60,14 +60,12 @@ end

 loss_functions = [pde_loss_functions; bc_loss_functions]

-function loss_function(θ, p)
-    sum(map(l -> l(θ), loss_functions))
-end
+loss_function(θ, p) = sum(map(l -> l(θ), loss_functions))

-f_ = OptimizationFunction(loss_function, Optimization.AutoZygote())
-prob = Optimization.OptimizationProblem(f_, sym_prob.flat_init_params)
+f_ = OptimizationFunction(loss_function, AutoZygote())
+prob = OptimizationProblem(f_, sym_prob.flat_init_params)

-res = Optimization.solve(prob, BFGS(linesearch = BackTracking()); maxiters = 3000)
+res = solve(prob, BFGS(linesearch = BackTracking()); maxiters = 3000)
 ```

 And some analysis:
diff --git a/docs/src/tutorials/low_level_2.md b/docs/src/tutorials/low_level_2.md
index 381026ab67..3a3b008c27 100644
--- a/docs/src/tutorials/low_level_2.md
+++ b/docs/src/tutorials/low_level_2.md
@@ -27,7 +27,7 @@ where $\theta = t - x/2$ and with initial and boundary conditions:
 With Bayesian Physics-Informed Neural Networks, here is an example of using `BayesianPINN` discretization with `ahmc_bayesian_pinn_pde` :

 ```@example low_level_2
-using NeuralPDE, Flux, Lux, ModelingToolkit, LinearAlgebra, AdvancedHMC
+using NeuralPDE, Lux, ModelingToolkit, LinearAlgebra, AdvancedHMC
 import ModelingToolkit: Interval, infimum, supremum, Distributions
 using Plots, MonteCarloMeasurements
@@ -102,9 +102,7 @@ plot!(noisydataset[1][:, 2], noisydataset[1][:, 1])

 ```@example low_level_2
 # Neural network
-chain = Lux.Chain(Lux.Dense(2, 8, Lux.tanh),
-    Lux.Dense(8, 8, Lux.tanh),
-    Lux.Dense(8, 1))
+chain = Chain(Dense(2, 8, tanh), Dense(8, 8, tanh), Dense(8, 1))

 discretization = NeuralPDE.BayesianPINN([chain], GridTraining([dx, dt]),
     param_estim = true, dataset = [noisydataset, nothing])
diff --git a/src/dgm.jl b/src/dgm.jl
index e274975e10..15b872ef60 100644
--- a/src/dgm.jl
+++ b/src/dgm.jl
@@ -107,8 +107,6 @@ end
         activation2::Function, out_activation::Function,
         strategy::AbstractTrainingStrategy; kwargs...)

-returns a `discretize` algorithm for the ModelingToolkit PDESystem interface, which transforms a `PDESystem` into an `OptimizationProblem` using the Deep Galerkin method.
-
 ## Arguments:

 - `in_dims`: number of input dimensions = (spatial dimension + 1).
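For reference, the constructor this docstring documents is the one invoked in docs/src/tutorials/dgm.md above; the positional arguments follow the Arguments list retained in the docstring. A hedged sketch of the call as it appears in that tutorial:

```julia
using NeuralPDE

# As invoked in docs/src/tutorials/dgm.md: 2 input dims (t, x), 1 output,
# then the network's width and depth, the three activations, and the
# training strategy -- argument meanings per the docstring's Arguments list.
strategy = QuasiRandomTraining(256, minibatch = 32)
discretization = DeepGalerkin(2, 1, 50, 5, tanh, tanh, identity, strategy)
```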