diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index 7f1de74..cb40e92 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,3 +1,2 @@ style = "blue" format_markdown = true - diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 0109f73..da6639a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -39,31 +39,4 @@ jobs: - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v3 with: - files: lcov.info - docs: - name: Documentation - runs-on: ubuntu-latest - permissions: - contents: write - statuses: write - steps: - - uses: actions/checkout@v4 - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - name: Configure doc environment - run: | - julia --project=docs/ -e ' - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate()' - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-docdeploy@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using AbstractDifferentiation - DocMeta.setdocmeta!(AbstractDifferentiation, :DocTestSetup, :(using AbstractDifferentiation); recursive=true) - doctest(AbstractDifferentiation)' \ No newline at end of file + files: lcov.info \ No newline at end of file diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index cba9134..b60ef72 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -13,4 +13,4 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + run: julia -e 'using CompatHelper; CompatHelper.main(; subdirs=["", "docs"])' diff --git a/.github/workflows/Docs-Preview-Cleanup.yml b/.github/workflows/Docs-Preview-Cleanup.yml new file mode 100644 index 0000000..c80fa76 --- /dev/null +++ b/.github/workflows/Docs-Preview-Cleanup.yml @@ -0,0 +1,28 @@ +name: Doc Preview Cleanup + +on: + pull_request: + types: [closed] + +jobs: + doc-preview-cleanup: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout gh-pages branch + uses: actions/checkout@v4 + with: + ref: gh-pages + - name: Delete preview and history + push changes + run: | + if [ -d "previews/PR$PRNUM" ]; then + git config user.name "Documenter.jl" + git config user.email "documenter@juliadocs.github.io" + git rm -rf "previews/PR$PRNUM" + git commit -m "delete preview" + git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree}) + git push --force origin gh-pages-new:gh-pages + fi + env: + PRNUM: ${{ github.event.number }} \ No newline at end of file diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml new file mode 100644 index 0000000..01a2f3b --- /dev/null +++ b/.github/workflows/Docs.yml @@ -0,0 +1,36 @@ +name: Docs + +on: + push: + branches: [master] + tags: '*' + pull_request: + +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + +jobs: + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + contents: write + statuses: write + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v1 + with: + version: '1' + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - run: | + julia --project=docs -e ' + using Documenter: DocMeta, doctest + using AbstractDifferentiation + DocMeta.setdocmeta!(AbstractDifferentiation, :DocTestSetup, :(using AbstractDifferentiation); recursive=true) + doctest(AbstractDifferentiation)' \ No newline at end of file diff --git a/.gitignore b/.gitignore index afacc72..5189bff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /Manifest.toml /docs/build -/docs/src/index.md \ No newline at end of file +/docs/src/index.md +/docs/Manifest.toml \ No newline at end of file diff --git a/Project.toml b/Project.toml index 3bb3a50..11f1470 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AbstractDifferentiation" uuid = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" authors = ["Mohamed Tarek and contributors"] -version = "0.6.0-DEV" +version = "0.6.2" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" @@ -13,6 +13,7 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df" [compat] ChainRulesCore = "1" DiffResults = "1" +Documenter = "1" ExprTools = "0.1" FiniteDifferences = "0.12" ForwardDiff = "0.10" @@ -35,6 +36,7 @@ AbstractDifferentiationEnzymeExt = "Enzyme" [extras] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -45,14 +47,4 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" [targets] -test = ["Test", "ChainRulesCore", "DiffResults", "FiniteDifferences", "ForwardDiff", "Random", "ReverseDiff", "Tracker", "Zygote", "Enzyme"] - -[weakdeps] -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +test = ["ChainRulesCore", "DiffResults", "Documenter", "FiniteDifferences", "ForwardDiff", "Random", "ReverseDiff", "Test", "Tracker", "Zygote", "Enzyme"] \ No newline at end of file diff --git a/README.md b/README.md index 6ef4087..a640687 100644 --- a/README.md +++ b/README.md @@ -13,100 +13,10 @@ This is a package that implements an abstract interface for differentiation in J Julia has more (automatic) differentiation packages than you can count on 2 hands. Different packages have different user interfaces. Therefore, having a backend-agnostic interface to request the function value and its gradient for example is necessary to avoid a combinatorial explosion of code when trying to support every differentiation package in Julia in every algorithm package requiring gradients. 
For higher order derivatives, the situation is even more dire since you can combine any 2 differentiation backends together to create a new higher-order backend. -## Loading `AbstractDifferentiation` +## Getting started -To load `AbstractDifferentiation`, it is recommended to use - -```julia -import AbstractDifferentiation as AD -``` - -With the `AD` alias you can access names inside of `AbstractDifferentiation` using `AD.<>` instead of typing the long name `AbstractDifferentiation`. - -## `AbstractDifferentiation` backends - -To use `AbstractDifferentiation`, first construct a backend instance `ab::AD.AbstractBackend` using your favorite differentiation package in Julia that supports `AbstractDifferentiation`. -In particular, you may want to use `AD.ReverseRuleConfigBackend(ruleconfig)` for any [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl)-compatible reverse mode differentiation package. - -The following backends are temporarily made available by `AbstractDifferentiation` as soon as their corresponding package is loaded (thanks to [weak dependencies](https://pkgdocs.julialang.org/dev/creating-packages/#Weak-dependencies) on Julia ≥ 1.9 and [Requires.jl](https://github.com/JuliaPackaging/Requires.jl) on older Julia versions): - - - `AD.ForwardDiffBackend()` for [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) - - `AD.FiniteDifferencesBackend()` for [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl) - - `AD.ReverseDiffBackend()` for [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) - - `AD.TrackerBackend()` for [Tracker.jl](https://github.com/FluxML/Tracker.jl) - - `AD.ZygoteBackend()` for [Zygote.jl](https://github.com/FluxML/Zygote.jl), which is a special case of `AD.ReverseRuleConfigBackend` - -In the long term, these backend objects (and many more) will be defined within their respective packages to enforce the `AbstractDifferentiation` interface. -This is already the case for some of them: - - - `Diffractor.DiffractorForwardBackend()` for [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl) in forward mode - -Here's an example: - -```julia -julia> import AbstractDifferentiation as AD, Zygote - -julia> ab = AD.ZygoteBackend() -AbstractDifferentiation.ReverseRuleConfigBackend{Zygote.ZygoteRuleConfig{Zygote.Context}}(Zygote.ZygoteRuleConfig{Zygote.Context}(Zygote.Context(nothing))) - -julia> f(x) = log(sum(exp, x)) -f (generic function with 1 method) - -julia> AD.gradient(ab, f, rand(10)) -([0.07163448353282538, 0.08520350535348796, 0.09675622487503996, 0.1522744408520505, 0.12174662595572318, 0.07996969757526722, 0.07832665607158593, 0.11001685581681672, 0.06691909637037166, 0.1371524135968315],) -``` - -For higher order derivatives, you can build higher order backends using `AD.HigherOrderBackend`. For instance, let `ab_f` be a forward-mode automatic differentiation backend and let `ab_r` be a reverse-mode automatic differentiation backend. To construct a higher order backend for doing forward-over-reverse-mode automatic differentiation, use `AD.HigherOrderBackend((ab_f, ab_r))`. To construct a higher order backend for doing reverse-over-forward-mode automatic differentiation, use `AD.HigherOrderBackend((ab_r, ab_f))`. - -## Backend-agnostic interface - -The following list of functions is the officially supported differentiation interface in `AbstractDifferentiation`. 
- -### Derivative/Gradient/Jacobian/Hessian - -The following list of functions can be used to request the derivative, gradient, Jacobian or Hessian without the function value. - - - `ds = AD.derivative(ab::AD.AbstractBackend, f, xs::Number...)`: computes the derivatives `ds` of `f` wrt the numbers `xs` using the backend `ab`. `ds` is a tuple of derivatives, one for each element in `xs`. - - `gs = AD.gradient(ab::AD.AbstractBackend, f, xs...)`: computes the gradients `gs` of `f` wrt the inputs `xs` using the backend `ab`. `gs` is a tuple of gradients, one for each element in `xs`. - - `js = AD.jacobian(ab::AD.AbstractBackend, f, xs...)`: computes the Jacobians `js` of `f` wrt the inputs `xs` using the backend `ab`. `js` is a tuple of Jacobians, one for each element in `xs`. - - `h = AD.hessian(ab::AD.AbstractBackend, f, x)`: computes the Hessian `h` of `f` wrt the input `x` using the backend `ab`. `hessian` currently only supports a single input. - -### Value and Derivative/Gradient/Jacobian/Hessian - -The following list of functions can be used to request the function value along with its derivative, gradient, Jacobian or Hessian. You can also request the function value, its gradient and Hessian for single-input functions. - - - `(v, ds) = AD.value_and_derivative(ab::AD.AbstractBackend, f, xs::Number...)`: computes the function value `v = f(xs...)` and the derivatives `ds` of `f` wrt the numbers `xs` using the backend `ab`. `ds` is a tuple of derivatives, one for each element in `xs`. - - `(v, gs) = AD.value_and_gradient(ab::AD.AbstractBackend, f, xs...)`: computes the function value `v = f(xs...)` and the gradients `gs` of `f` wrt the inputs `xs` using the backend `ab`. `gs` is a tuple of gradients, one for each element in `xs`. - - `(v, js) = AD.value_and_jacobian(ab::AD.AbstractBackend, f, xs...)`: computes the function value `v = f(xs...)` and the Jacobians `js` of `f` wrt the inputs `xs` using the backend `ab`. `js` is a tuple of Jacobians, one for each element in `xs`. - - `(v, h) = AD.value_and_hessian(ab::AD.AbstractBackend, f, x)`: computes the function value `v = f(x)` and the Hessian `h` of `f` wrt the input `x` using the backend `ab`. `hessian` currently only supports a single input. - - `(v, g, h) = AD.value_gradient_and_hessian(ab::AD.AbstractBackend, f, x)`: computes the function value `v = f(x)` and the gradient `g` and Hessian `h` of `f` wrt the input `x` using the backend `ab`. `hessian` currently only supports a single input. - -### Jacobian vector products (aka pushforward) - -This operation goes by a few names. Refer to the [ChainRules documentation](https://juliadiff.org/ChainRulesCore.jl/stable/#The-propagators:-pushforward-and-pullback) for more on terminology. For a single input, single output function `f` with a Jacobian `J`, the pushforward operator `pf_f` is equivalent to applying the function `v -> J * v` on a (tangent) vector `v`. - -The following functions can be used to request a function that returns the pushforward operator/function. In order to request the pushforward function `pf_f` of a function `f` at the inputs `xs`, you can use either of: - - - `pf_f = AD.pushforward_function(ab::AD.AbstractBackend, f, xs...)`: returns the pushforward function `pf_f` of the function `f` at the inputs `xs`. `pf_f` is a function that accepts the tangents `vs` as input which is a tuple of length equal to the length of the tuple `xs`. If `f` has a single input, `pf_f` can also accept a single input instead of a 1-tuple. 
- - `value_and_pf_f = AD.value_and_pushforward_function(ab::AD.AbstractBackend, f, xs...)`: returns a function `value_and_pf_f` which accepts the tangent `vs` as input which is a tuple of length equal to the length of the tuple `xs`. If `f` has a single input, `value_and_pf_f` can accept a single input instead of a 1-tuple. `value_and_pf_f` returns a 2-tuple, namely the value `f(xs...)` and output of the pushforward operator. - -### Vector Jacobian products (aka pullback) - -This operation goes by a few names. Refer to the [ChainRules documentation](https://juliadiff.org/ChainRulesCore.jl/stable/#The-propagators:-pushforward-and-pullback) for more on terminology. For a single input, single output function `f` with a Jacobian `J`, the pullback operator `pb_f` is equivalent to applying the function `v -> v' * J` on a (co-tangent) vector `v`. - -The following functions can be used to request the pullback operator/function with or without the function value. In order to request the pullback function `pb_f` of a function `f` at the inputs `xs`, you can use either of: - - - `pb_f = AD.pullback_function(ab::AD.AbstractBackend, f, xs...)`: returns the pullback function `pb_f` of the function `f` at the inputs `xs`. `pb_f` is a function that accepts the co-tangents `vs` as input which is a tuple of length equal to the number of outputs of `f`. If `f` has a single output, `pb_f` can also accept a single input instead of a 1-tuple. - - `value_and_pb_f = AD.value_and_pullback_function(ab::AD.AbstractBackend, f, xs...)`: computes the function value `v = f(xs...)` and returns a 2-tuple containing the value `v` and a function `pb_f` that accepts the co-tangent `vs` as input, which is a tuple of length equal to the number of outputs of `f`. If `f` has a single output, `pb_f` can accept a single input instead of a 1-tuple. - -### Lazy operators - -You can also get a struct for the lazy derivative/gradient/Jacobian/Hessian of a function. You can then use the `*` operator to apply the lazy operator on a value or tuple of the correct shape. To get a lazy derivative/gradient/Jacobian/Hessian use any one of: - - - `ld = lazy_derivative(ab::AbstractBackend, f, xs::Number...)`: returns an operator `ld` for multiplying by the derivative of `f` at `xs`. You can apply the operator by multiplication e.g. `ld * y` where `y` is a number if `f` has a single input, a tuple of the same length as `xs` if `f` has multiple inputs, or an array of numbers/tuples. - - `lg = lazy_gradient(ab::AbstractBackend, f, xs...)`: returns an operator `lg` for multiplying by the gradient of `f` at `xs`. You can apply the operator by multiplication e.g. `lg * y` where `y` is a number if `f` has a single input or a tuple of the same length as `xs` if `f` has multiple inputs. - - `lh = lazy_hessian(ab::AbstractBackend, f, x)`: returns an operator `lh` for multiplying by the Hessian of the scalar-valued function `f` at `x`. You can apply the operator by multiplication e.g. `lh * y` or `y' * lh` where `y` is a number or a vector of the appropriate length. - - `lj = lazy_jacobian(ab::AbstractBackend, f, xs...)`: returns an operator `lj` for multiplying by the Jacobian of `f` at `xs`. You can apply the operator by multiplication e.g. `lj * y` or `y' * lj` where `y` is a number, vector or tuple of numbers and/or vectors. If `f` has multiple inputs, `y` in `lj * y` should be a tuple. If `f` has multiply outputs, `y` in `y' * lj` should be a tuple. Otherwise, it should be a scalar or a vector of the appropriate length. 
+ - If you are an autodiff user and want to write code in a backend-agnostic way, read the _user guide_ in the docs. + - If you are an autodiff developer and want your backend to implement the interface, read the _implementer guide_ in the docs (still in construction). ## Citing this package diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index 8e7cbc8..0000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,178 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.9.2" -manifest_format = "2.0" -project_hash = "2ea368e6dfe17054675f1828ff2b2c55382ede29" - -[[deps.ANSIColoredPrinters]] -git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" -uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" -version = "0.0.1" - -[[deps.AbstractDifferentiation]] -deps = ["ExprTools", "LinearAlgebra", "Requires"] -path = ".." -uuid = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" -version = "0.5.2" - - [deps.AbstractDifferentiation.extensions] - AbstractDifferentiationChainRulesCoreExt = "ChainRulesCore" - AbstractDifferentiationFiniteDifferencesExt = "FiniteDifferences" - AbstractDifferentiationForwardDiffExt = ["DiffResults", "ForwardDiff"] - AbstractDifferentiationReverseDiffExt = ["DiffResults", "ReverseDiff"] - AbstractDifferentiationTrackerExt = "Tracker" - AbstractDifferentiationZygoteExt = "Zygote" - - [deps.AbstractDifferentiation.weakdeps] - ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" - DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" - FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" - ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" - ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" - Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" - Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+0" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.9.3" - -[[deps.Documenter]] -deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "39fd748a73dce4c05a9655475e437170d8fb1b67" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.27.25" - -[[deps.ExprTools]] -git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.10" - -[[deps.IOCapture]] -deps = ["Logging", "Random"] -git-tree-sha1 = "d75853a0bdbfb1ac815478bacd89cd27b550ace6" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.3" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.4" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.LinearAlgebra]] -deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] 
-uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.21+4" - -[[deps.Parsers]] -deps = ["Dates", "PrecompileTools", "UUIDs"] -git-tree-sha1 = "716e24b21538abc91f6205fd1d8363f39b442851" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.7.2" - -[[deps.PrecompileTools]] -deps = ["Preferences"] -git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" -uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.1.2" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.0" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.3.0" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.libblastrampoline_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+0" diff --git a/docs/Project.toml b/docs/Project.toml index 1629afd..a9bd692 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,3 +1,9 @@ [deps] AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[compat] +AbstractDifferentiation = "0.6" +Documenter = "1" +Zygote = "0.6" diff --git a/docs/make.jl b/docs/make.jl index 29b1212..656656d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,8 +1,12 @@ using AbstractDifferentiation +import AbstractDifferentiation as AD using Documenter DocMeta.setdocmeta!( - AbstractDifferentiation, :DocTestSetup, :(using AbstractDifferentiation); recursive=true + AbstractDifferentiation, + :DocTestSetup, + :(import AbstractDifferentiation as AD); + recursive=true, ) generated_path = joinpath(@__DIR__, "src") @@ -28,15 +32,23 @@ end makedocs(; modules=[AbstractDifferentiation], authors="Mohamed Tarek and contributors", - repo="https://github.com/JuliaDiff/AbstractDifferentiation.jl/blob/{commit}{path}#{line}", sitename="AbstractDifferentiation.jl", format=Documenter.HTML(; + repolink="https://github.com/JuliaDiff/AbstractDifferentiation.jl", prettyurls=get(ENV, "CI", "false") == "true", 
canonical="https://JuliaDiff.github.io/AbstractDifferentiation.jl", edit_link="master", assets=String[], ), - pages=["Home" => "index.md", "API reference" => "api.md"], + pages=[ + "Home" => "index.md", + "User guide" => "user_guide.md", + "Implementer guide" => "implementer_guide.md", + ], ) -deploydocs(; repo="github.com/JuliaDiff/AbstractDifferentiation.jl", devbranch="master") +deploydocs(; + repo="github.com/JuliaDiff/AbstractDifferentiation.jl", + devbranch="master", + push_preview=true, +) diff --git a/docs/src/api.md b/docs/src/api.md deleted file mode 100644 index 9d7eac0..0000000 --- a/docs/src/api.md +++ /dev/null @@ -1,12 +0,0 @@ -# API reference - -## Index - -```@index -``` - -## Docstrings - -```@autodocs -Modules = [AbstractDifferentiation] -``` diff --git a/docs/src/implementer_guide.md b/docs/src/implementer_guide.md new file mode 100644 index 0000000..4a14e43 --- /dev/null +++ b/docs/src/implementer_guide.md @@ -0,0 +1,43 @@ +# Implementer guide + +!!! warning "Work in progress" + + Come back later! + +## The macro `@primitive` + +To implement the `AbstractDifferentiation` interface for your backend, you only _need_ to provide a "primitive" from which the rest of the functions can be deduced. +However, for performance reasons, you _can_ implement more of the interface to make certain calls faster. + +At the moment, the only primitives supported are `AD.pushforward_function` and `AD.value_and_pullback_function`. +The `AD.@primitive` macro uses the provided function to implement `AD.jacobian`, and all the other functions follow. + +```julia +AD.@primitive function AD.myprimitive(ab::MyBackend, f, xs...) + # write your code here +end +``` + +See the backend-specific extensions in the `ext/` folder of the repository for example implementations. + +## Function dependency graph + +These details are not part of the public API and are expected to change. +They are just listed here to help readers figure out the code structure: + + - `jacobian` has no default implementation + - `derivative` calls `jacobian` + - `gradient` calls `jacobian` + - `hessian` calls `jacobian` and `gradient` + - `second_derivative` calls `derivative` + - `value_and_jacobian` calls `jacobian` + - `value_and_derivative` calls `value_and_jacobian` + - `value_and_gradient` calls `value_and_jacobian` + - `value_and_hessian` calls `jacobian` and `gradient` + - `value_and_second_derivative` calls `second_derivative` + - `value_gradient_and_hessian` calls `value_and_jacobian` and `gradient` + - `value_derivative_and_second_derivative` calls `value_and_derivative` and `second_derivative` + - `pushforward_function` calls `jacobian` + - `value_and_pushforward_function` calls `pushforward_function` + - `pullback_function` calls `value_and_pullback_function` + - `value_and_pullback_function` calls `gradient` diff --git a/docs/src/user_guide.md b/docs/src/user_guide.md new file mode 100644 index 0000000..e09768c --- /dev/null +++ b/docs/src/user_guide.md @@ -0,0 +1,116 @@ +# User guide + +The list of functions on this page is the officially supported differentiation interface in `AbstractDifferentiation`. + +## Loading `AbstractDifferentiation` + +To load `AbstractDifferentiation`, it is recommended to use + +```julia +import AbstractDifferentiation as AD +``` + +With the `AD` alias you can access names inside of `AbstractDifferentiation` using `AD.<>` instead of typing the long name `AbstractDifferentiation`. 
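+
+For example, the interface functions documented below are then written as `AD.gradient`, `AD.jacobian`, and so on. A minimal sketch (assuming a backend `ab` has already been constructed as shown in the next section):
+
+```julia
+import AbstractDifferentiation as AD
+
+# `AD.gradient` is the same function as `AbstractDifferentiation.gradient`.
+AD.gradient(ab, sum, [1.0, 2.0, 3.0])  # ([1.0, 1.0, 1.0],)
+```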
+ +## `AbstractDifferentiation` backends + +To use `AbstractDifferentiation`, first construct a backend instance `ab::AD.AbstractBackend` using your favorite differentiation package in Julia that supports `AbstractDifferentiation`. + +Here's an example: + +```jldoctest +julia> import AbstractDifferentiation as AD, Zygote + +julia> backend = AD.ZygoteBackend(); + +julia> f(x) = log(sum(exp, x)); + +julia> AD.gradient(backend, f, collect(1:3)) +([0.09003057317038046, 0.2447284710547977, 0.665240955774822],) +``` + +The following backends are temporarily made available by `AbstractDifferentiation` as soon as their corresponding package is loaded (thanks to [weak dependencies](https://pkgdocs.julialang.org/dev/creating-packages/#Weak-dependencies) on Julia ≥ 1.9 and [Requires.jl](https://github.com/JuliaPackaging/Requires.jl) on older Julia versions): + +```@docs +AbstractDifferentiation.ReverseDiffBackend +AbstractDifferentiation.ReverseRuleConfigBackend +AbstractDifferentiation.FiniteDifferencesBackend +AbstractDifferentiation.ZygoteBackend +AbstractDifferentiation.ForwardDiffBackend +AbstractDifferentiation.TrackerBackend +``` + +In the long term, these backend objects (and many more) will be defined within their respective packages to enforce the `AbstractDifferentiation` interface. +This is already the case for: + + - `Diffractor.DiffractorForwardBackend()` for [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl) in forward mode + +For higher order derivatives, you can build higher order backends using `AD.HigherOrderBackend`. + +```@docs +AbstractDifferentiation.HigherOrderBackend +``` + +## Derivatives + +The following list of functions can be used to request the derivative, gradient, Jacobian, second derivative or Hessian without the function value. + +```@docs +AbstractDifferentiation.derivative +AbstractDifferentiation.gradient +AbstractDifferentiation.jacobian +AbstractDifferentiation.second_derivative +AbstractDifferentiation.hessian +``` + +## Value and derivatives + +The following list of functions can be used to request the function value along with its derivative, gradient, Jacobian, second derivative, or Hessian. You can also request the function value, its derivative (or its gradient) and its second derivative (or Hessian) for single-input functions. + +```@docs +AbstractDifferentiation.value_and_derivative +AbstractDifferentiation.value_and_gradient +AbstractDifferentiation.value_and_jacobian +AbstractDifferentiation.value_and_second_derivative +AbstractDifferentiation.value_and_hessian +AbstractDifferentiation.value_derivative_and_second_derivative +AbstractDifferentiation.value_gradient_and_hessian +``` + +## Jacobian-vector products + +This operation goes by a few names, like "pushforward". Refer to the [ChainRules documentation](https://juliadiff.org/ChainRulesCore.jl/stable/#The-propagators:-pushforward-and-pullback) for more on terminology. For a single input, single output function `f` with a Jacobian `J`, the pushforward operator `pf_f` is equivalent to applying the function `v -> J * v` on a (tangent) vector `v`. + +The following functions can be used to request a function that returns the pushforward operator/function. In order to request the pushforward function `pf_f` of a function `f` at the inputs `xs`, you can use either of: + +```@docs +AbstractDifferentiation.pushforward_function +AbstractDifferentiation.value_and_pushforward_function +``` + +## Vector-Jacobian products + +This operation goes by a few names, like "pullback". 
Refer to the [ChainRules documentation](https://juliadiff.org/ChainRulesCore.jl/stable/#The-propagators:-pushforward-and-pullback) for more on terminology. For a single input, single output function `f` with a Jacobian `J`, the pullback operator `pb_f` is equivalent to applying the function `v -> v' * J` on a (co-tangent) vector `v`. + +The following functions can be used to request the pullback operator/function with or without the function value. In order to request the pullback function `pb_f` of a function `f` at the inputs `xs`, you can use either of: + +```@docs +AbstractDifferentiation.pullback_function +AbstractDifferentiation.value_and_pullback_function +``` + +## Lazy operators + +You can also get a struct for the lazy derivative/gradient/Jacobian/Hessian of a function. You can then use the `*` operator to apply the lazy operator on a value or tuple of the correct shape. To get a lazy derivative/gradient/Jacobian/Hessian use any one of: + +```@docs +AbstractDifferentiation.lazy_derivative +AbstractDifferentiation.lazy_gradient +AbstractDifferentiation.lazy_jacobian +AbstractDifferentiation.lazy_hessian +``` + +## Index + +```@index +``` diff --git a/ext/AbstractDifferentiationForwardDiffExt.jl b/ext/AbstractDifferentiationForwardDiffExt.jl index ff0c52c..b8e82cf 100644 --- a/ext/AbstractDifferentiationForwardDiffExt.jl +++ b/ext/AbstractDifferentiationForwardDiffExt.jl @@ -61,6 +61,12 @@ function AD.hessian(ba::AD.ForwardDiffBackend, f, x::AbstractArray) return (ForwardDiff.hessian(f, x, cfg),) end +function AD.value_and_derivative(::AD.ForwardDiffBackend, f, x::Real) + T = typeof(ForwardDiff.Tag(f, typeof(x))) + ydual = f(ForwardDiff.Dual{T}(x, one(x))) + return ForwardDiff.value(T, ydual), (ForwardDiff.partials(T, ydual, 1),) +end + function AD.value_and_gradient(ba::AD.ForwardDiffBackend, f, x::AbstractArray) result = DiffResults.GradientResult(x) cfg = ForwardDiff.GradientConfig(f, x, chunk(ba, x)) @@ -68,6 +74,16 @@ function AD.value_and_gradient(ba::AD.ForwardDiffBackend, f, x::AbstractArray) return DiffResults.value(result), (DiffResults.derivative(result),) end +function AD.value_and_second_derivative(ba::AD.ForwardDiffBackend, f, x::Real) + T = typeof(ForwardDiff.Tag(f, typeof(x))) + xdual = ForwardDiff.Dual{T}(x, one(x)) + T2 = typeof(ForwardDiff.Tag(f, typeof(xdual))) + ydual = f(ForwardDiff.Dual{T2}(xdual, one(xdual))) + v = ForwardDiff.value(T, ForwardDiff.value(T2, ydual)) + d2 = ForwardDiff.partials(T, ForwardDiff.partials(T2, ydual, 1), 1) + return v, (d2,) +end + function AD.value_and_hessian(ba::AD.ForwardDiffBackend, f, x) result = DiffResults.HessianResult(x) cfg = ForwardDiff.HessianConfig(f, result, x, chunk(ba, x)) @@ -75,6 +91,17 @@ function AD.value_and_hessian(ba::AD.ForwardDiffBackend, f, x) return DiffResults.value(result), (DiffResults.hessian(result),) end +function AD.value_derivative_and_second_derivative(ba::AD.ForwardDiffBackend, f, x::Real) + T = typeof(ForwardDiff.Tag(f, typeof(x))) + xdual = ForwardDiff.Dual{T}(x, one(x)) + T2 = typeof(ForwardDiff.Tag(f, typeof(xdual))) + ydual = f(ForwardDiff.Dual{T2}(xdual, one(xdual))) + v = ForwardDiff.value(T, ForwardDiff.value(T2, ydual)) + d = ForwardDiff.partials(T, ForwardDiff.value(T2, ydual), 1) + d2 = ForwardDiff.partials(T, ForwardDiff.partials(T2, ydual, 1), 1) + return v, (d,), (d2,) +end + @inline step_toward(x::Number, v::Number, h) = x + h * v # support arrays and tuples @noinline step_toward(x, v, h) = x .+ h .* v diff --git a/ext/AbstractDifferentiationZygoteExt.jl 
b/ext/AbstractDifferentiationZygoteExt.jl index 808bf39..1610006 100644 --- a/ext/AbstractDifferentiationZygoteExt.jl +++ b/ext/AbstractDifferentiationZygoteExt.jl @@ -8,11 +8,27 @@ else using ..Zygote: Zygote end -AD.ZygoteBackend() = AD.ReverseRuleConfigBackend(Zygote.ZygoteRuleConfig()) - # Context should not persist between different AD calls: fixes #69 function AD.ruleconfig(::AD.ReverseRuleConfigBackend{<:Zygote.ZygoteRuleConfig}) return Zygote.ZygoteRuleConfig() end +function AD.value_and_pullback_function(::AD.ZygoteBackend, f, args...) + return Zygote.pullback(f, args...) +end + +AD.gradient(::AD.ZygoteBackend, f, args...) = Zygote.gradient(f, args...) +function AD.value_and_gradient(::AD.ZygoteBackend, f, args...) + res = Zygote.withgradient(f, args...) + return res.val, res.grad +end + +AD.jacobian(::AD.ZygoteBackend, f, args...) = Zygote.jacobian(f, args...) +function AD.value_and_jacobian(::AD.ZygoteBackend, f, args...) + res = Zygote.withjacobian(f, args...) + return res.val, res.grad +end + +AD.hessian(::AD.ZygoteBackend, f, arg) = Zygote.hessian(f, arg) + end # module diff --git a/src/AbstractDifferentiation.jl b/src/AbstractDifferentiation.jl index fb14e3e..755c7f5 100644 --- a/src/AbstractDifferentiation.jl +++ b/src/AbstractDifferentiation.jl @@ -7,10 +7,23 @@ abstract type AbstractFiniteDifference <: AbstractBackend end abstract type AbstractForwardMode <: AbstractBackend end abstract type AbstractReverseMode <: AbstractBackend end +""" + AD.HigherOrderBackend{B} + +Let `ab_f` be a forward-mode automatic differentiation backend and let `ab_r` be a reverse-mode automatic differentiation backend. +To construct a higher order backend for doing forward-over-reverse-mode automatic differentiation, use `AD.HigherOrderBackend((ab_f, ab_r))`. +To construct a higher order backend for doing reverse-over-forward-mode automatic differentiation, use `AD.HigherOrderBackend((ab_r, ab_f))`. + +# Fields + +- `backends::B` +""" struct HigherOrderBackend{B} <: AbstractBackend backends::B end + reduce_order(b::AbstractBackend) = b + function reduce_order(b::HigherOrderBackend) if length(b.backends) == 1 return lowest(b) # prevent zero tuple and subsequent error when reducing over HigherOrderBackend @@ -18,8 +31,10 @@ function reduce_order(b::HigherOrderBackend) return HigherOrderBackend(reverse(Base.tail(reverse(b.backends)))) end end + lowest(b::AbstractBackend) = b lowest(b::HigherOrderBackend) = b.backends[end] + second_lowest(b::AbstractBackend) = b second_lowest(b::HigherOrderBackend) = lowest(reduce_order(b)) @@ -30,6 +45,13 @@ primal_value(::AbstractBackend, ys, ::Any, ::Any) = primal_value(ys) primal_value(x::Tuple) = map(primal_value, x) primal_value(x) = x +""" + AD.derivative(ab::AD.AbstractBackend, f, xs::Number...) + +Compute the derivatives of `f` with respect to the numbers `xs` using the backend `ab`. + +The function returns a `Tuple` of derivatives, one for each element in `xs`. +""" function derivative(ab::AbstractBackend, f, xs::Number...) der = getindex.(jacobian(lowest(ab), f, xs...), 1) if der isa Tuple @@ -39,14 +61,55 @@ function derivative(ab::AbstractBackend, f, xs::Number...) end end +""" + AD.gradient(ab::AD.AbstractBackend, f, xs...) + +Compute the gradients of `f` with respect to the inputs `xs` using the backend `ab`. + +The function returns a `Tuple` of gradients, one for each element in `xs`. +""" function gradient(ab::AbstractBackend, f, xs...) 
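+    # Default fallback: compute the Jacobian (a row vector for scalar-valued `f`),
+    # take its adjoint, and reshape it to match each input's shape.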
return reshape.(adjoint.(jacobian(lowest(ab), f, xs...)), size.(xs)) end + +""" + AD.jacobian(ab::AD.AbstractBackend, f, xs...) + +Compute the Jacobians of `f` with respect to the inputs `xs` using the backend `ab`. + +The function returns a `Tuple` of Jacobians, one for each element in `xs`. +""" function jacobian(ab::AbstractBackend, f, xs...) end + function jacobian(ab::HigherOrderBackend, f, xs...) return jacobian(lowest(ab), f, xs...) end +""" + AD.second_derivative(ab::AD.AbstractBackend, f, x) + +Compute the second derivative of `f` with respect to the input `x` using the backend `ab`. + +The function returns a single value because `second_derivative` currently only supports a single input. +""" +function second_derivative(ab::AbstractBackend, f, x) + if x isa Tuple + # only support computation of second derivative for functions with single input argument + x = only(x) + end + return derivative(second_lowest(ab), x -> begin + d = derivative(lowest(ab), f, x) + return d[1] # derivative returns a tuple + end, x) +end + +""" + AD.hessian(ab::AD.AbstractBackend, f, x) + +Compute the Hessian of `f` wrt the input `x` using the backend `ab`. + +The function returns a single matrix because `hessian` currently only supports a single input. +""" function hessian(ab::AbstractBackend, f, x) if x isa Tuple # only support computation of Hessian for functions with single input argument @@ -57,19 +120,61 @@ function hessian(ab::AbstractBackend, f, x) end, x) end +""" + AD.value_and_derivative(ab::AD.AbstractBackend, f, xs::Number...) + +Return the tuple `(v, ds)` of the function value `v = f(xs...)` and the derivatives `ds = AD.derivative(ab, f, xs...)`. + +See also [`AbstractDifferentiation.derivative`](@ref). +""" function value_and_derivative(ab::AbstractBackend, f, xs::Number...) value, jacs = value_and_jacobian(lowest(ab), f, xs...) return value[1], getindex.(jacs, 1) end + +""" + AD.value_and_gradient(ab::AD.AbstractBackend, f, xs...) + +Return the tuple `(v, gs)` of the function value `v = f(xs...)` and the gradients `gs = AD.gradient(ab, f, xs...)`. + +See also [`AbstractDifferentiation.gradient`](@ref). +""" function value_and_gradient(ab::AbstractBackend, f, xs...) value, jacs = value_and_jacobian(lowest(ab), f, xs...) return value, reshape.(adjoint.(jacs), size.(xs)) end + +""" + AD.value_and_jacobian(ab::AD.AbstractBackend, f, xs...) + +Return the tuple `(v, Js)` of the function value `v = f(xs...)` and the Jacobians `Js = AD.jacobian(ab, f, xs...)`. + +See also [`AbstractDifferentiation.jacobian`](@ref). +""" function value_and_jacobian(ab::AbstractBackend, f, xs...) value = f(xs...) jacs = jacobian(lowest(ab), f, xs...) return value, jacs end + +""" + AD.value_and_second_derivative(ab::AD.AbstractBackend, f, x) + +Return the tuple `(v, d2)` of the function value `v = f(x)` and the second derivative `d2 = AD.second_derivative(ab, f, x)`. + +See also [`AbstractDifferentiation.second_derivative`](@ref) +""" +function value_and_second_derivative(ab::AbstractBackend, f, x) + return f(x), second_derivative(ab, f, x) +end + +""" + AD.value_and_hessian(ab::AD.AbstractBackend, f, x) + +Return the tuple `(v, H)` of the function value `v = f(x)` and the Hessian `H = AD.hessian(ab, f, x)`. + +See also [`AbstractDifferentiation.hessian`](@ref). 
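+
+A minimal usage sketch (assuming some loaded backend `ab`; note that `H` is a 1-tuple wrapping the Hessian matrix):
+
+```julia
+v, H = AD.value_and_hessian(ab, x -> sum(abs2, x), [1.0, 2.0])
+# v == 5.0 and only(H) ≈ [2.0 0.0; 0.0 2.0]
+```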
+""" function value_and_hessian(ab::AbstractBackend, f, x) if x isa Tuple # only support computation of Hessian for functions with single input argument @@ -84,37 +189,37 @@ function value_and_hessian(ab::AbstractBackend, f, x) return value, hess end -function value_and_hessian(ab::HigherOrderBackend, f, x) - if x isa Tuple - # only support computation of Hessian for functions with single input argument - x = only(x) - end - value = f(x) - hess = jacobian(second_lowest(ab), (_x,) -> begin - g = gradient(lowest(ab), f, _x) - return g[1] # gradient returns a tuple - end, x) +""" + AD.value_derivative_and_second_derivative(ab::AD.AbstractBackend, f, x) - return value, hess -end -function value_gradient_and_hessian(ab::AbstractBackend, f, x) +Return the tuple `(v, d, d2)` of the function value `v = f(x)`, the first derivative `d = AD.derivative(ab, f, x)`, and the second derivative `d2 = AD.second_derivative(ab, f, x)`. +""" +function value_derivative_and_second_derivative(ab::AbstractBackend, f, x) if x isa Tuple # only support computation of Hessian for functions with single input argument x = only(x) end value = f(x) - grads, hess = value_and_jacobian( + deriv, secondderiv = value_and_derivative( second_lowest(ab), _x -> begin - g = gradient(lowest(ab), f, _x) - return g[1] # gradient returns a tuple + d = derivative(lowest(ab), f, _x) + return d[1] # derivative returns a tuple end, x ) - return value, (grads,), hess + return value, (deriv,), secondderiv end -function value_gradient_and_hessian(ab::HigherOrderBackend, f, x) + +""" + AD.value_gradient_and_hessian(ab::AD.AbstractBackend, f, x) + +Return the tuple `(v, g, H)` of the function value `v = f(x)`, the gradient `g = AD.gradient(ab, f, x)`, and the Hessian `H = AD.hessian(ab, f, x)`. + +See also [`AbstractDifferentiation.gradient`](@ref) and [`AbstractDifferentiation.hessian`](@ref). +""" +function value_gradient_and_hessian(ab::AbstractBackend, f, x) if x isa Tuple # only support computation of Hessian for functions with single input argument x = only(x) @@ -131,37 +236,57 @@ function value_gradient_and_hessian(ab::HigherOrderBackend, f, x) return value, (grads,), hess end +""" + AD.pushforward_function(ab::AD.AbstractBackend, f, xs...) + +Return the pushforward function `pff` of the function `f` at the inputs `xs` using backend `ab`. + +The pushfoward function `pff` accepts as input a `Tuple` of tangents, one for each element in `xs`. +If `xs` consists of a single element, `pff` can also accept a single tangent instead of a 1-tuple. +""" function pushforward_function(ab::AbstractBackend, f, xs...) - return (ds) -> begin - return jacobian( - lowest(ab), - (xds...,) -> begin - if ds isa Tuple - @assert length(xs) == length(ds) - newxs = xs .+ ds .* xds - return f(newxs...) - else - newx = only(xs) + ds * only(xds) - return f(newx) - end - end, - _zero.(xs, ds)..., - ) + function pff(ds) + function pff_aux(xds...) + if ds isa Tuple + @assert length(xs) == length(ds) + newxs = xs .+ ds .* xds + return f(newxs...) + else + newx = only(xs) + ds * only(xds) + return f(newx) + end + end + return jacobian(lowest(ab), pff_aux, _zero.(xs, ds)...) end + return pff end + +""" + AD.value_and_pushforward_function(ab::AD.AbstractBackend, f, xs...) + +Return a single function `vpff` which, given tangents `ts`, computes the tuple `(v, p) = vpff(ts)` composed of + +- the function value `v = f(xs...)` +- the pushforward value `p = pff(ts)` given by the pushforward function `pff = AD.pushforward_function(ab, f, xs...)` applied to `ts`. 
+ +See also [`AbstractDifferentiation.pushforward_function`](@ref). + +!!! warning + This name should be understood as "(value and pushforward) function", and thus is not aligned with the reverse mode counterpart [`AbstractDifferentiation.value_and_pullback_function`](@ref). +""" function value_and_pushforward_function(ab::AbstractBackend, f, xs...) n = length(xs) value = f(xs...) - pf_function = pushforward_function(lowest(ab), f, xs...) + pff = pushforward_function(lowest(ab), f, xs...) - return ds -> begin + function vpff(ds) if !(ds isa Tuple) ds = (ds,) end @assert length(ds) == n - pf = pf_function(ds) - return value, pf + return value, pff(ds) end + return vpff end _zero(::Number, d::Number) = zero(d) @@ -180,14 +305,33 @@ end return dot(x, y) end +""" + AD.pullback_function(ab::AD.AbstractBackend, f, xs...) + +Return the pullback function `pbf` of the function `f` at the inputs `xs` using backend `ab`. + +The pullback function `pbf` accepts as input a `Tuple` of cotangents, one for each output of `f`. +If `f` has a single output, `pbf` can also accept a single input instead of a 1-tuple. +""" function pullback_function(ab::AbstractBackend, f, xs...) _, pbf = value_and_pullback_function(ab, f, xs...) return pbf end + +""" + AD.value_and_pullback_function(ab::AD.AbstractBackend, f, xs...) + +Return a tuple `(v, pbf)` of the function value `v = f(xs...)` and the pullback function `pbf = AD.pullback_function(ab, f, xs...)`. + +See also [`AbstractDifferentiation.pullback_function`](@ref). + +!!! warning + This name should be understood as "value and (pullback function)", and thus is not aligned with the forward mode counterpart [`AbstractDifferentiation.value_and_pushforward_function`](@ref). +""" function value_and_pullback_function(ab::AbstractBackend, f, xs...) value = f(xs...) - function pullback_function(ws) - function pullback_gradient_function(_xs...) + function pbf(ws) + function pbf_aux(_xs...) vs = f(_xs...) if ws isa Tuple @assert length(vs) == length(ws) @@ -198,9 +342,9 @@ function value_and_pullback_function(ab::AbstractBackend, f, xs...) return _dot(vs, ws) end end - return gradient(lowest(ab), pullback_gradient_function, xs...) + return gradient(lowest(ab), pbf_aux, xs...) end - return value, pullback_function + return value, pbf end struct LazyDerivative{B,F,X} @@ -244,6 +388,7 @@ struct LazyGradient{B,F,X} f::F xs::X end + Base.:*(d::LazyGradient, y) = gradient(d.backend, d.f, d.xs...) * y Base.:*(y, d::LazyGradient) = y * gradient(d.backend, d.f, d.xs...) @@ -392,15 +537,49 @@ function Base.:*(ys::Number, d::LazyHessian) end end +""" + AD.lazy_derivative(ab::AbstractBackend, f, xs::Number...) + +Return an operator `ld` for multiplying by the derivative of `f` at `xs`. + +You can apply the operator by multiplication e.g. `ld * y` where `y` is a number if `f` has a single input, a tuple of the same length as `xs` if `f` has multiple inputs, or an array of numbers/tuples. +""" function lazy_derivative(ab::AbstractBackend, f, xs::Number...) return LazyDerivative(ab, f, xs) end + +""" + AD.lazy_gradient(ab::AbstractBackend, f, xs...) + +Return an operator `lg` for multiplying by the gradient of `f` at `xs`. + +You can apply the operator by multiplication e.g. `lg * y` where `y` is a number if `f` has a single input or a tuple of the same length as `xs` if `f` has multiple inputs. +""" function lazy_gradient(ab::AbstractBackend, f, xs...) 
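+    # No differentiation happens here; the gradient is evaluated only when the
+    # returned operator is multiplied with a value.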
return LazyGradient(ab, f, xs) end + +""" + AD.lazy_hessian(ab::AbstractBackend, f, x) + +Return an operator `lh` for multiplying by the Hessian of the scalar-valued function `f` at `x`. + +You can apply the operator by multiplication e.g. `lh * y` or `y' * lh` where `y` is a number or a vector of the appropriate length. +""" function lazy_hessian(ab::AbstractBackend, f, xs...) return LazyHessian(ab, f, xs) end + +""" + AD.lazy_jacobian(ab::AbstractBackend, f, xs...) + +Return an operator `lj` for multiplying by the Jacobian of `f` at `xs`. + +You can apply the operator by multiplication e.g. `lj * y` or `y' * lj` where `y` is a number, vector or tuple of numbers and/or vectors. +If `f` has multiple inputs, `y` in `lj * y` should be a tuple. +If `f` has multiple outputs, `y` in `y' * lj` should be a tuple. +Otherwise, it should be a scalar or a vector of the appropriate length. +""" function lazy_jacobian(ab::AbstractBackend, f, xs...) return LazyJacobian(ab, f, xs) end @@ -409,8 +588,10 @@ struct D{B,F} backend::B f::F end + D(b::AbstractBackend, d::D) = H(HigherOrderBackend((b, d.b)), d.f) D(d::D) = H(HigherOrderBackend((d.backend, d.backend)), d.f) + function (d::D)(xs...; lazy=true) if lazy return lazy_jacobian(d.ab, d.f, xs...) @@ -423,6 +604,7 @@ struct H{B,F} backend::B f::F end + function (h::H)(xs...; lazy=true) if lazy return lazy_hessian(h.ab, h.f, xs...) @@ -514,13 +696,17 @@ _eachcol(a) = eachcol(a) function identity_matrix_like(x) throw("The function `identity_matrix_like` is not defined for the type $(typeof(x)).") end + function identity_matrix_like(x::AbstractVector) return (Matrix{eltype(x)}(I, length(x), length(x)),) end + function identity_matrix_like(x::Number) return (one(x),) end + identity_matrix_like(x::Tuple) = identity_matrix_like(x...) + @generated function identity_matrix_like(x...) expr = :(()) for i in 1:length(x) @@ -540,6 +726,7 @@ zero_matrix_like(x::Tuple) = zero_matrix_like(x...) zero_matrix_like(x...) = map(zero_matrix_like, x) zero_matrix_like(x::AbstractVector) = (zero(similar(x, length(x), length(x))),) zero_matrix_like(x::Number) = (zero(x),) + function zero_matrix_like(x) throw("The function `zero_matrix_like` is not defined for the type $(typeof(x)).") end @@ -551,10 +738,12 @@ include("backends.jl") # TODO: Replace with proper version const EXTENSIONS_SUPPORTED = isdefined(Base, :get_extension) + if !EXTENSIONS_SUPPORTED using Requires: @require include("../ext/AbstractDifferentiationChainRulesCoreExt.jl") end + @static if !EXTENSIONS_SUPPORTED function __init__() @require DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" begin diff --git a/src/backends.jl b/src/backends.jl index 095f30c..4d77284 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -1,7 +1,7 @@ """ FiniteDifferencesBackend{M} -AD backend that uses forward mode with FiniteDifferences.jl. +AD backend that uses forward mode with [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl). The type parameter `M` is the type of the method used to perform finite differences. @@ -15,10 +15,10 @@ end """ ForwardDiffBackend{CS} -AD backend that uses forward mode with ForwardDiff.jl. +AD backend that uses forward mode with [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl). -The type parameter `CS` denotes the chunk size of the differentiation algorithm. If it is -`Nothing`, then ForwardiffDiff uses a heuristic to set the chunk size based on the input. +The type parameter `CS` denotes the chunk size of the differentiation algorithm. 
+If it is `Nothing`, then ForwardiffDiff uses a heuristic to set the chunk size based on the input. See also: [ForwardDiff.jl: Configuring Chunk Size](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Configuring-Chunk-Size) @@ -30,7 +30,7 @@ struct ForwardDiffBackend{CS} <: AbstractForwardMode end """ ReverseDiffBackend -AD backend that uses reverse mode with ReverseDiff.jl. +AD backend that uses reverse mode with [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl). !!! note To be able to use this backend, you have to load ReverseDiff. @@ -40,7 +40,7 @@ struct ReverseDiffBackend <: AbstractReverseMode end """ TrackerBackend -AD backend that uses reverse mode with Tracker.jl. +AD backend that uses reverse mode with [Tracker.jl](https://github.com/FluxML/Tracker.jl). !!! note To be able to use this backend, you have to load Tracker. @@ -50,11 +50,16 @@ struct TrackerBackend <: AbstractReverseMode end """ ReverseRuleConfigBackend -AD backend that uses reverse mode with any ChainRules-compatible reverse-mode AD package. +AD backend that uses reverse mode with any [ChainRulesCore.jl](https://github.com/JuliaDiff/ChainRulesCore.jl)-compatible reverse-mode AD package. + +Constructed with a [`RuleConfig`](https://juliadiff.org/ChainRulesCore.jl/stable/rule_author/superpowers/ruleconfig.html) object: + +```julia +backend = AD.ReverseRuleConfigBackend(rc) +``` !!! note - On Julia >= 1.9, you have to load ChainRulesCore (possibly implicitly by loading - a ChainRules-compatible AD package) to be able to use this backend. + On Julia >= 1.9, you have to load ChainRulesCore (possibly implicitly by loading a ChainRules-compatible AD package) to be able to use this backend. """ struct ReverseRuleConfigBackend{RC} <: AbstractReverseMode ruleconfig::RC @@ -66,16 +71,17 @@ end ruleconfig(ba::ReverseRuleConfigBackend) = ba.ruleconfig """ - ZygoteBackend() + ZygoteBackend -Create an AD backend that uses reverse mode with Zygote.jl. +Create an AD backend that uses reverse mode with [Zygote.jl](https://github.com/FluxML/Zygote.jl). -It is a special case of [`ReverseRuleConfigBackend`](@ref). +Alternatively, you can perform AD with Zygote using a special [`ReverseRuleConfigBackend`](@ref), namely `ReverseRuleConfigBackend(Zygote.ZygoteRuleConfig())`. +Note, however, that the behaviour of this backend is not equivalent to `ZygoteBackend()` since the former uses a generic implementation of jacobian etc. for ChainRules-compatible AD backends whereas `ZygoteBackend` uses implementations in Zygote.jl. !!! note To be able to use this backend, you have to load Zygote. """ -function ZygoteBackend end +struct ZygoteBackend <: AbstractReverseMode end """ EnzymeReverseBackend @@ -95,4 +101,4 @@ AD backend that uses forward mode of Enzyme.jl. !!! note To be able to use this backend, you have to load Enzyme. 
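+
+A minimal construction sketch (Enzyme must be loaded first):
+
+```julia
+using Enzyme
+backend = AD.EnzymeForwardBackend()
+```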
""" -struct EnzymeForwardBackend <: AbstractForwardMode end +struct EnzymeForwardBackend <: AbstractForwardMode end \ No newline at end of file diff --git a/test/finitedifferences.jl b/test/finitedifferences.jl index 568f0e9..df97b5e 100644 --- a/test/finitedifferences.jl +++ b/test/finitedifferences.jl @@ -21,6 +21,9 @@ using FiniteDifferences @testset "Jacobian" begin test_jacobians(backend) end + @testset "Second derivative" begin + test_second_derivatives(backend) + end @testset "Hessian" begin test_hessians(backend) end diff --git a/test/forwarddiff.jl b/test/forwarddiff.jl index 0b6bf26..47a95c9 100644 --- a/test/forwarddiff.jl +++ b/test/forwarddiff.jl @@ -19,6 +19,9 @@ using ForwardDiff @testset "Jacobian" begin test_jacobians(backend) end + @testset "Second derivative" begin + test_second_derivatives(backend) + end @testset "Hessian" begin test_hessians(backend) end diff --git a/test/reversediff.jl b/test/reversediff.jl index 06da46f..ed6ad21 100644 --- a/test/reversediff.jl +++ b/test/reversediff.jl @@ -14,6 +14,9 @@ using ReverseDiff @testset "Jacobian" begin test_jacobians(backend) end + @testset "Second derivative" begin + test_second_derivatives(backend) + end @testset "Hessian" begin test_hessians(backend) end diff --git a/test/ruleconfig.jl b/test/ruleconfig.jl index 0a97b66..37ce561 100644 --- a/test/ruleconfig.jl +++ b/test/ruleconfig.jl @@ -4,7 +4,10 @@ using Test using Zygote @testset "ReverseRuleConfigBackend(ZygoteRuleConfig())" begin - backends = [@inferred(AD.ZygoteBackend())] + backends = [ + @inferred(AD.ZygoteBackend()), + @inferred(AD.ReverseRuleConfigBackend(Zygote.ZygoteRuleConfig())) + ] @testset for backend in backends @testset "Derivative" begin test_derivatives(backend) @@ -21,6 +24,9 @@ using Zygote @testset "j′vp" begin test_j′vp(backend) end + @testset "Second derivative" begin + test_second_derivatives(backend) + end @testset "Lazy Derivative" begin test_lazy_derivatives(backend) end @@ -34,7 +40,7 @@ using Zygote # issue #69 @testset "Zygote context" begin - ad = AD.ZygoteBackend() + ad = AD.ReverseRuleConfigBackend(Zygote.ZygoteRuleConfig()) # example in #69: context is not mutated @test ad.ruleconfig.context.cache === nothing @@ -53,6 +59,13 @@ using Zygote end @test AD.jacobian(ad, f, [1, 2, 3], 3) == ([6.0 0.0 0.0; 0.0 6.0 0.0; 0.0 0.0 6.0], [2.0, 4.0, 6.0]) + + # With `AD.ZygoteBackend`: + ad = AD.ZygoteBackend() + @test AD.derivative(ad, exp, 1.0) === (exp(1.0),) + @test AD.derivative(ad, exp, 1.0) === (exp(1.0),) + @test AD.jacobian(ad, f, [1, 2, 3], 3) == + ([6.0 0.0 0.0; 0.0 6.0 0.0; 0.0 0.0 6.0], [2.0, 4.0, 6.0]) end # issue #57 @@ -65,5 +78,17 @@ using Zygote @test_logs Zygote.gradient(myfunc, 1) # nothing is logged @test_logs AD.derivative(AD.ZygoteBackend(), myfunc, 1) # nothing is logged + @test_logs AD.derivative( + AD.ReverseRuleConfigBackend(Zygote.ZygoteRuleConfig()), myfunc, 1 + ) # nothing is logged + end + + # issue #54 + @testset "allocations of jacobian" begin + f(x) = x .^ 2 + x = rand(100) + ad = AD.ZygoteBackend() + @test AD.jacobian(ad, f, x) == Zygote.jacobian(f, x) + @test @allocated(AD.jacobian(ad, f, x)) == @allocated(Zygote.jacobian(f, x)) end end diff --git a/test/runtests.jl b/test/runtests.jl index 5f6d1d1..abfbc49 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,9 @@ using AbstractDifferentiation +using Documenter using Test -@testset "AbstractDifferentiation.jl" begin +@testset verbose = true "AbstractDifferentiation.jl" begin + doctest(AbstractDifferentiation) include("test_utils.jl") 
include("defaults.jl") include("forwarddiff.jl") diff --git a/test/test_utils.jl b/test/test_utils.jl index 6eb4677..22e00c3 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -6,6 +6,7 @@ Random.seed!(1234) fder(x, y) = exp(y) * x + y * log(x) dfderdx(x, y) = exp(y) + y * 1 / x dfderdy(x, y) = exp(y) * x + log(x) +dfderdxdx(x, y) = -y / x^2 fgrad(x, y) = prod(x) + sum(y ./ (1:length(y))) dfgraddx(x, y) = prod(x) ./ x @@ -143,6 +144,44 @@ function test_jacobians(backend; multiple_inputs=true, test_types=true) @test yvec == yvec2 end +function test_second_derivatives(backend; test_types=true) + # explicit test that AbstractDifferentiation throws an error + # don't support tuple of second derivatives + @test_throws ArgumentError AD.second_derivative( + backend, x -> fder(x, yscalar), (xscalar, yscalar) + ) + @test_throws MethodError AD.second_derivative( + backend, x -> fder(x, yscalar), xscalar, yscalar + ) + + # test if single input (no tuple works) + dder1 = AD.second_derivative(backend, x -> fder(x, yscalar), xscalar) + if test_types + @test only(dder1) isa Float64 + end + @test dfderdxdx(xscalar, yscalar) ≈ only(dder1) atol = 1e-8 + valscalar, dder2 = AD.value_and_second_derivative( + backend, x -> fder(x, yscalar), xscalar + ) + if test_types + @test valscalar isa Float64 + @test only(dder2) isa Float64 + end + @test valscalar == fder(xscalar, yscalar) + @test dder2 == dder1 + valscalar, der, dder3 = AD.value_derivative_and_second_derivative( + backend, x -> fder(x, yscalar), xscalar + ) + if test_types + @test valscalar isa Float64 + @test only(der) isa Float64 + @test only(dder3) isa Float64 + end + @test valscalar == fder(xscalar, yscalar) + @test der == AD.derivative(backend, x -> fder(x, yscalar), xscalar) + @test dder3 == dder1 +end + function test_hessians(backend; multiple_inputs=false, test_types=true) if multiple_inputs # ... but