diff --git a/AUTHORS b/AUTHORS
deleted file mode 100644
index 16012dc..0000000
--- a/AUTHORS
+++ /dev/null
@@ -1,10 +0,0 @@
-AUTHORS
-=======
-
-Florent Hivert <florent.hivert@lri.fr>
-
-Contributors
-============
-
-- James Mitchell <jdm3@st-andrews.ac.uk> : discussions + test cases + Travis CI
-- Viviane Pons <viviane.pons@lri.fr> : algorithms discussions
diff --git a/BUILDING.md b/BUILDING.md
index cd6bb1f..289246a 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -18,7 +18,7 @@ tests, experiments, examples, or benchmarks.
 
 - [optional] : Google `sparsehash/dense_hash_map` and/or `sparsehash/dense_hash_set`.
 
-- [optional] Doxygen for generating the API documentation (in progress).
+- [optional] Doxygen for generating the API documentation (in [build/doc/html/index.html](build/doc/html/index.html)).
 
 ## Building
 
diff --git a/README.md b/README.md
index 034b66f..ed2333d 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,9 @@
 High Performance Combinatorics in C++ using vector instructions v1.0.1
 
 HPCombi is a C++17 header-only library using the SSE and AVX instruction sets,
-and some equivalents, for very fast manipulation of combinatorial objects such
-as transformations, permutations, and boolean matrices of small size. The goal
-of this project is to implement various new algorithms and benchmark them on
-various compiler and architectures.
+and some equivalents, for very fast manipulation of small combinatorial objects
+such as transformations, permutations, and boolean matrices. HPCombi implements
+new algorithms and benchmarks them on various compilers and architectures.
 
 HPCombi was initially designed using the SSE and AVX instruction sets, and did
 not work on machines without these instructions (such as ARM). From v1.0.1
@@ -25,12 +24,12 @@ other processors too.
 
 - Reinis Cirpons <rc234@st-andrews.ac.uk> : CI
 - Finn Smith <fls3@st-andrews.ac.uk> : discussions + BMat8 reference code
-- Viviane Pons : algorithms discussions
+- Viviane Pons <viviane.pons@lri.fr> : algorithms discussions
 - Daniel Vanzo <daniel.vanzo@lri.fr> : GPU experiments
 
 ## Documentation
 
-- The Doxygen auto generated [API](https://libsemigroups.github.io/HPCombi/)
+- See the [Doxygen generated doc](https://libsemigroups.github.io/HPCombi/)
 
 ## Thanks
 
diff --git a/TODO.txt b/TODO.txt
index 229d50d..9109b08 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,5 +2,6 @@
 - https://cmake.org/Wiki/CMake:How_To_Write_Platform_Checks
 - https://stackoverflow.com/questions/11944060/how-to-detect-target-architecture-using-cmake
 
-
 Add method data in perm16 and perm_generic
+
+Document examples. E.g., for each file foo.cpp in examples/, add "@example foo.cpp" in a relevant file of include/hpcombi/.
diff --git a/doc.html b/doc.html
new file mode 120000
index 0000000..f1c101d
--- /dev/null
+++ b/doc.html
@@ -0,0 +1 @@
+build/doc/html/index.html
\ No newline at end of file
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 51a7ff2..b62bed7 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -934,8 +934,7 @@ WARN_LOGFILE           =
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = ${CMAKE_SOURCE_DIR}/include/hpcombi \
-                         ${CMAKE_SOURCE_DIR}/examples \
-                         ${CMAKE_SOURCE_DIR}/README.md
+                         ${CMAKE_SOURCE_DIR}/examples
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1101,7 +1100,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
-USE_MDFILE_AS_MAINPAGE = ${CMAKE_SOURCE_DIR}/README.md
+USE_MDFILE_AS_MAINPAGE =
 
 # The Fortran standard specifies that for fixed formatted Fortran code all
 # characters from position 72 are to be considered as comment. A common
diff --git a/include/hpcombi/arch.hpp b/include/hpcombi/arch.hpp
index eaf9da6..cb395c2 100644
--- a/include/hpcombi/arch.hpp
+++ b/include/hpcombi/arch.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief check the required compiler flags for SSE-4.1 */
+
 #ifndef HPCOMBI_ARCH_HPP_
 #define HPCOMBI_ARCH_HPP_
 
diff --git a/include/hpcombi/bmat8.hpp b/include/hpcombi/bmat8.hpp
index cb789fa..ef68187 100644
--- a/include/hpcombi/bmat8.hpp
+++ b/include/hpcombi/bmat8.hpp
@@ -19,7 +19,8 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-// This file contains a declaration of fast boolean matrices up to dimension 8.
+/** @file
+@brief declaration of HPCombi::BMat8 */
 
 #ifndef HPCOMBI_BMAT8_HPP_
 #define HPCOMBI_BMAT8_HPP_
@@ -40,15 +41,17 @@
 
 namespace HPCombi {
 
-//! Class for fast boolean matrices of dimension up to 8 x 8
-//!
-//! The methods for these small matrices over the boolean semiring
-//! are more optimised than the generic methods for boolean matrices.
-//! Note that all BMat8 are represented internally as an 8 x 8 matrix;
-//! any entries not defined by the user are taken to be 0. This does
-//! not affect the results of any calculations.
-//!
-//! BMat8 is a trivial class.
+/** Boolean matrices of dimension up to 8×8, stored as a single uint64;
+isomorphic to binary relations with methods for composition.
+
+The methods for these small matrices over the boolean semiring
+are more optimised than the generic methods for boolean matrices.
+Note that all BMat8 are represented internally as an 8×8 matrix;
+any entries not defined by the user are taken to be 0. This does
+not affect the results of any calculation.
+
+BMat8 is a trivial class.
+*/
 class BMat8 {
  public:
     //! A default constructor.
diff --git a/include/hpcombi/bmat8_impl.hpp b/include/hpcombi/bmat8_impl.hpp
index da0f18b..35dd9fa 100644
--- a/include/hpcombi/bmat8_impl.hpp
+++ b/include/hpcombi/bmat8_impl.hpp
@@ -19,8 +19,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-// This file contains an implementation of fast boolean matrices up to
-// dimension 8 x 8.
+/** @file
+@brief implementation of bmat8.hpp; this file should not be included directly.
+*/
 
 // NOLINT(build/header_guard)
 
diff --git a/include/hpcombi/builder.hpp b/include/hpcombi/builder.hpp
index f8cfeab..20d553c 100644
--- a/include/hpcombi/builder.hpp
+++ b/include/hpcombi/builder.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::TPUBuild and casts from HPCombi::TPU */
+
 #ifndef HPCOMBI_BUILDER_HPP_
 #define HPCOMBI_BUILDER_HPP_
 
@@ -30,8 +33,11 @@
 
 namespace HPCombi {
 
-/** Class for factory object associated to a SIMD packed unsigned integers.
+/** Given a transformation from 0..15 → 0..15,
+ * build at compile-time the array representing the transformation.
+ *
  * @details
+ * Factory class associated with a SIMD vector of packed unsigned integers.
  * The main purpose of this class is to be able to construct in a \c constexpr
  * way various instances of the \c TPU SIMD vector type. The behavior of
  * an instance of \c TPUBuild<TPU> is designed to mimic the behavior of \c TPU
diff --git a/include/hpcombi/debug.hpp b/include/hpcombi/debug.hpp
index 2844316..b3fdf94 100644
--- a/include/hpcombi/debug.hpp
+++ b/include/hpcombi/debug.hpp
@@ -18,6 +18,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief defines the macro \c HPCOMBI_ASSERT */
+
 #ifndef HPCOMBI_DEBUG_HPP_
 #define HPCOMBI_DEBUG_HPP_
 
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 10678a2..2d8af73 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -17,6 +17,12 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of HPCombi::epu8.
+
+Contains renaming of some low level functions,
+e.g. simde_mm_testz_si128(a,a) → is_all_zero(a) */
+
 #ifndef HPCOMBI_EPU8_HPP_
 #define HPCOMBI_EPU8_HPP_
 
@@ -41,7 +47,14 @@ operator"" _u8(unsigned long long arg) noexcept {  // NOLINT
     return static_cast<uint8_t>(arg);
 }
 
-/// SIMD vector of 16 unsigned bytes
+/**
+epu8 stands for *Extended Packed Unsigned, grouped by 8 bits*;
+this is the low level type chosen by Intel for their API to intrinsics,
+i.e. a SIMD vector of 16 unsigned bytes (16×8 = 128 bits).
+Functions using this type use semantically equivalent types,
+e.g. a \c __m128i, which is a vector containing 2 signed 64-bit integers.
+A flag tells the compiler to silently consider those types equivalent.
+*/
 using epu8 = uint8_t __attribute__((vector_size(16)));
 
 static_assert(alignof(epu8) == 16,
@@ -66,22 +79,28 @@ inline bool equal(epu8 a, epu8 b) noexcept {
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) noexcept { return !equal(a, b); }
 
-/** Permuting a #HPCombi::epu8 */
+/** Apply a permutation \c b on the vector \c a: for i=0..15 {result[i] =
+ * a[b[i]]} */
 inline epu8 permuted_ref(epu8 a, epu8 b) noexcept;
-/** Permuting a #HPCombi::epu8 */
+
+/** Same as \ref HPCombi::permuted_ref "permuted_ref"
+but with an optimized implementation using intrinsics. */
 inline epu8 permuted(epu8 a, epu8 b) noexcept {
     return simde_mm_shuffle_epi8(a, b);
 }
+
 /** Left shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_right(epu8 a) noexcept {
     return simde_mm_bslli_si128(a, 1);
 }
+
 /** Right shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_left(epu8 a) noexcept { return simde_mm_bsrli_si128(a, 1); }
+
 /** Reverting a #HPCombi::epu8 */
 inline epu8 reverted(epu8 a) noexcept { return permuted(a, Epu8.rev()); }
 
@@ -98,7 +117,7 @@ inline bool is_sorted(epu8 a) noexcept;
  * Uses the 9 stages sorting network #sorting_rounds
  */
 inline epu8 sorted(epu8 a) noexcept;
-/** Return a #HPCombi::epu8 with the two half sorted
+/** Return a #HPCombi::epu8 with both halves sorted
  * @details
  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
  */
@@ -109,7 +128,7 @@ inline epu8 sorted8(epu8 a) noexcept;
  * Uses the 9 stages sorting network #sorting_rounds
  */
 inline epu8 revsorted(epu8 a) noexcept;
-/** Return a #HPCombi::epu8 with the two half reverse sorted
+/** Return a #HPCombi::epu8 with both halves reverse sorted
  * @details
  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
  */
@@ -126,37 +145,37 @@ inline epu8 sort_perm(epu8 &a) noexcept;
  */
 inline epu8 sort8_perm(epu8 &a) noexcept;
 
-/** @class common_merge
+/**
  * @brief Merge two sorted epu8
  * @details
  * @param a, b: two #HPCombi::epu8
- * after executing merge, \c a and \c are sorted \c a[15] <= \c b[0]
- */
-/** @copydoc common_merge
- *  @par Algorithm: bitonic merge sorting network
+ * after executing merge, \c a and \c b are sorted and \c a[15] <= \c b[0]
+ * @par Algorithm: bitonic merge sorting network
  */
 inline void merge(epu8 &a, epu8 &b) noexcept;
 
-/** @class common_permutation_of
- * @brief Find if a vector is a permutation of one other
- * @details
- * @param a, b: two #HPCombi::epu8
- * @returns a #HPCombi::epu8
- * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
-     if \c b[i] appears exactly once in \c a, or undefined if not.
- */
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_permutation_of
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: uses string matching cpmestrm intrinsics
  */
 inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 #endif
-/** @copydoc common_permutation_of
+
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: reference implementation
  */
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
-/** @copydoc common_permutation_of
-    @par Algorithm: architecture dependent
+
+/**
+ * @brief Find if a vector is a permutation of another one
+ * @details
+ * @param a, b: two #HPCombi::epu8
+ * @returns a #HPCombi::epu8
+ * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
+     if \c b[i] appears exactly once in \c a, or undefined if not.
+ * @par Algorithm: architecture dependent
  */
 inline epu8 permutation_of(epu8 a, epu8 b) noexcept;
 
@@ -181,233 +200,302 @@ inline epu8 random_epu8(uint16_t bnd);
  */
 inline epu8 remove_dups(epu8 a, uint8_t repl = 0) noexcept;
 
-/** @class common_horiz_sum
- * @brief Horizontal sum of a  #HPCombi::epu8
- * @details
- * @returns the horizontal sum of the input
- * @par Example:
- * @code
- * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
- * @endcode
- * Returns `110`
- * @warning The result is supposed to fit in a \c uint8_t
- */
-/** @copydoc common_horiz_sum
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
+
 inline uint8_t horiz_sum_ref(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
+
 inline uint8_t horiz_sum_gen(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_sum4(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_sum3(epu8) noexcept;
-/** @copydoc common_horiz_sum */
-inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
 
-/** @class common_partial_sums
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal sum of a #HPCombi::epu8
  * @details
- * @returns the partials sums of the input
+ * @returns the horizontal sum of the input
  * @par Example:
  * @code
- * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
+ * Returns `110`
+ * @warning The result is supposed to fit in a \c uint8_t
  */
-/** @copydoc common_partial_sums
+inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_sums_ref(epu8) noexcept;
-/** @copydoc common_partial_sums
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_sums_gen(epu8) noexcept;
-/** @copydoc common_partial_sums
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_sums_round(epu8) noexcept;
-/** @copydoc common_partial_sums */
-inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
 
-/** @class common_horiz_max
- * @brief Horizontal sum of a  #HPCombi::epu8
+/**
+ * @brief Horizontal partial sum of a #HPCombi::epu8
  * @details
- * @returns the horizontal sum of the input
+ * @returns the partial sums of the input
  * @par Example:
  * @code
- * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
+ * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `12`
+ * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
  */
-/** @copydoc common_horiz_max
+inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_max_ref(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_max_gen(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_max4(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_max3(epu8) noexcept;
-/** @copydoc common_horiz_max */
-inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
 
-/** @class common_partial_max
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal max of a #HPCombi::epu8
  * @details
- * @returns the partials max of the input
+ * @returns the horizontal max of the input
  * @par Example:
  * @code
- * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
  * @endcode
- * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
+ * Returns `12`
  */
-/** @copydoc common_partial_max
+inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_max_ref(epu8) noexcept;
-/** @copydoc common_partial_max
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_max_gen(epu8) noexcept;
-/** @copydoc common_partial_max
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_max_round(epu8) noexcept;
-/** @copydoc common_partial_max */
-inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
 
-/** @class common_horiz_min
- * @brief Horizontal sum of a  #HPCombi::epu8
+/**
+ * @brief Horizontal partial max of a #HPCombi::epu8
  * @details
- * @returns the horizontal sum of the input
+ * @returns the partial max of the input
  * @par Example:
  * @code
- * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
+ * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `1`
+ * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
  */
-/** @copydoc common_horiz_min
+inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_min_ref(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_min_gen(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_min4(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_min3(epu8) noexcept;
-/** @copydoc common_horiz_min */
-inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
 
-/** @class common_partial_min
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal min of a #HPCombi::epu8
  * @details
- * @returns the partials min of the input
+ * @returns the horizontal min of the input
  * @par Example:
  * @code
- * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
  * @endcode
- * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
+ * Returns `1`
  */
-/** @copydoc common_partial_min
+inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_min_ref(epu8) noexcept;
-/** @copydoc common_partial_min
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_min_gen(epu8) noexcept;
-/** @copydoc common_partial_min
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_min_round(epu8) noexcept;
-/** @copydoc common_partial_min */
-inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
 
-/** @class common_eval16
- * @brief Evaluation of a #HPCombi::epu8
+/**
+ * @brief Horizontal partial min of a #HPCombi::epu8
  * @details
- * @param v : a #HPCombi::epu8
- * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
- *     \c r[i] is the number of occurrence of \c i in the input \c v
+ * @returns the partial min of the input
  * @par Example:
  * @code
- * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
- * @warning The entries larger than 15 are ignored
+ * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
  */
-/** @copydoc common_eval16
+inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 eval16_ref(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and cast to array
  */
 inline epu8 eval16_arr(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using cyclic shifting
  */
 inline epu8 eval16_cycle(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using popcount
  */
 inline epu8 eval16_popcount(epu8 v) noexcept;
-/** @copydoc common_eval16 */
+
+/**
+ * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15
+ * appears in the input.
+ * @details
+ * @param v : a #HPCombi::epu8
+ * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
+ *     \c r[i] is the number of occurrences of \c i in the input \c v
+ * @par Example:
+ * @code
+ * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * @endcode
+ * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
+ * @warning The entries larger than 15 are ignored
+ */
 inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
 
-/** @class common_first_diff
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
+ *  @par Algorithm:
+ *  Reference @f$O(n)@f$ algorithm using loop and indexed access
+ */
+inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
+#ifdef SIMDE_X86_SSE4_2_NATIVE
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
+ *  @par Algorithm:
+ *  Using \c cmpestri instruction
+ */
+inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
+#endif
+
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
+ *  @par Algorithm:
+ *  Using vector comparison and mask
+ */
+inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
+/**
  * @brief The first difference between two #HPCombi::epu8
  * @details
  * @param a, b : two #HPCombi::epu8
@@ -425,32 +513,37 @@ inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
  * `first_diff(a, b, 7)` returns `3`.
  * @warning `bound` is assumed to be smaller or equal than 16
  */
-/** @copydoc common_first_diff
+inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
+    return first_diff_mask(a, b, bound);
+}
+
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
-inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_first_diff
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
-inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
+inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
-/** @copydoc common_first_diff
+
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
-inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
-/** @copydoc common_first_diff */
-inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
-    return first_diff_mask(a, b, bound);
-}
+inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
-/** @class common_last_diff
+/**
  * @brief The last difference between two #HPCombi::epu8
  * @details
- * @param a, b : two #HPCombi::epu8
+ * @param a, b : #HPCombi::epu8
  * @param bound : a \c size_t
  * @returns the largest index @f$i<bound@f$ such that \c a[i] and \c b[i]
  * differ, 16 if there is no differences before bound.
@@ -465,24 +558,6 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
  * `last_diff(a, b, 7)` returns `3`.
  * @warning `bound` is assumed to be smaller or equal than 16
  */
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Reference @f$O(n)@f$ algorithm using loop and indexed access
- */
-inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
-#ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Using \c cmpestri instruction
- */
-inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
-#endif
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Using vector comparison and mask
- */
-inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
-/** @copydoc common_last_diff */
 inline uint64_t last_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
     return last_diff_mask(a, b, bound);
 }
@@ -569,7 +644,26 @@ inline bool is_transformation(epu8 v, const size_t k = 16) noexcept;
  */
 inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
 
-/** @class common_is_permutation
+#ifdef SIMDE_X86_SSE4_2_NATIVE
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
+    @par Algorithm: uses string matching cmpestri intrinsics
+ */
+inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
+#endif
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
+    @par Algorithm: sort the vector and compare to identity
+ */
+inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
+
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
+    @par Algorithm: uses evaluation
+ */
+inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
+
+/**
  * @details
  * @returns whether \c *this is a permutation.
  * @param v the vector to test
@@ -578,28 +672,12 @@ inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
  * If \c *this is a permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
  * it should be completed to a permutation of @f$0\dots 15@f$
  * by adding fixed points. That is the values @f$i\geq n@f$ should be
- * mapped to themself.
+ * mapped to themselves.
  * @par Example:
  * The permutation
  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 3 1 4 \end{matrix}@f$
  * is encoded by the array {2,0,5,3,1,4,6,7,8,9,10,11,12,13,14,15}
- */
-#ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_is_permutation
-    @par Algorithm: uses string matching cpmestri intrinsics
- */
-inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
-#endif
-/** @copydoc common_is_permutation
-    @par Algorithm: sort the vector and compare to identity
- */
-inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
-/** @copydoc common_is_permutation
-    @par Algorithm: uses evaluation
- */
-inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
-/** @copydoc common_is_permutation
-    @par Algorithm: architecture dependent
+ * @par Algorithm: architecture dependent
  */
 inline bool is_permutation(epu8 v, const size_t k = 16) noexcept;
 
diff --git a/include/hpcombi/epu8_impl.hpp b/include/hpcombi/epu8_impl.hpp
index 69b0035..702372c 100644
--- a/include/hpcombi/epu8_impl.hpp
+++ b/include/hpcombi/epu8_impl.hpp
@@ -19,8 +19,9 @@
 
 // NOLINT(build/header_guard)
 
-// This is the implementation part of epu8.hpp this should be seen as
-// implementation details and should not be included directly.
+/** @file
+@brief implementation of epu8.hpp; this file should not be included directly.
+*/
 
 #include <initializer_list>
 #include <iostream>
@@ -553,6 +554,8 @@ inline std::string to_string(HPCombi::epu8 const &a) {
     return ss.str();
 }
 
+//! This type appears in the doc because we provide an equal operator for
+//! HPCombi::epu8.
 template <> struct equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -560,6 +563,8 @@ template <> struct equal_to<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a not_equal operator for
+//! HPCombi::epu8.
 template <> struct not_equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -567,6 +572,8 @@ template <> struct not_equal_to<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::epu8.
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const noexcept {
         unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
@@ -583,10 +590,12 @@ template <> struct hash<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a less operator for
+//! HPCombi::epu8.
 template <> struct less<HPCombi::epu8> {
     // WARNING: due to endianness this is not lexicographic comparison,
     //          but we don't care when using in std::set.
-    // 10% faster than calling the lexicographic comparison operator !
+    // 10% faster than calling the lexicographic comparison operator!
     inline size_t operator()(const HPCombi::epu8 &v1,
                              const HPCombi::epu8 &v2) const noexcept {
         simde__m128 v1v = simde__m128(v1), v2v = simde__m128(v2);
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index bdc8145..af9282c 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -17,6 +17,10 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief Main entry point; includes the API files: bmat8.hpp, perm16.hpp, etc
+and also debug.hpp, epu8.hpp, etc.*/
+
 #ifndef HPCOMBI_HPCOMBI_HPP_
 #define HPCOMBI_HPCOMBI_HPP_
 
@@ -30,3 +34,87 @@
 #include "vect_generic.hpp"
 
 #endif  // HPCOMBI_HPCOMBI_HPP_
+
+/*! \mainpage HPCombi
+
+\section readme_sec Readme
+
+You might want to have a look at [the Readme in the
+sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
+
+\section sec_philo Philosophy
+This library provides high performance computations in combinatorics (hence its
+name). In practice we observe large speedups in several enumeration problems.
+
+The main idea of the library is a way to encode data as a small sequence of
+small integers, that can be handled efficiently by a creative use of vector
+instructions. For example, on the current x86 machines, small permutations (N ≤
+16) are very well handled. Indeed thanks to machine instructions such as PSHUFB
+(Packed SHUFfle Bytes), applying a permutation on a vector only takes a few CPU
+cycles.
+
+Further ideas are:
+- Vectorization (MMX, SSE, AVX instruction sets) and careful memory alignment,
+- Careful memory management: avoid all dynamic allocation during the
+computation,
+- Avoid all unnecessary copies (it is often needed to rewrite the containers),
+- Due to combinatorial explosion, sets often don’t fit in memory or disk and are
+enumerated on the fly.
+
+Here are some examples,
+the speedup is in comparison to an implementation without vector instructions:
+
+Operation |   Speedup
+----------|-----------
+Inverting a permutation | 1.28
+Sorting a list of bytes | 21.3
+Number of cycles of a permutation |  41.5
+Number of inversions of a permutation  | 9.39
+Cycle type of a permutation | 8.94
+
+
+
+\section sec_tips Tips to the user
+
+Note that memory access can become a problem.
+If your algorithm stores many things, most of the time will be spent in fetching
+from RAM, not computing. The data structures your client code uses should
+preserve locality. You might want to compute some stats on data structure usage
+(eg avg size of buckets used, lengths of lists, lifetime of objects, etc.)
+and write custom data structure optimized for your usage profile.
+
+This lib is implemented with speed in mind, not code safety.
+E.g. there are no checks when building a permutation, which could be invalid
+(for instance non-injective).
+
+We suggest having a look, in the menus above, at Classes → [Class
+list](annotated.html), esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
+
+\section Parallelism
+There is no parallelisation here. To use parallelism with this lib, see for
+instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and
+Algebraic Combinatorics
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf),
+[DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing
+framework.
+
+Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to
+ease parallelism. Intel decided not to maintain Cilk anymore so it is
+deprecated. [OpenCilk](https://github.com/OpenCilk/) is an open source project
+to continue it.
+
+We tested OpenMP and it was 2 orders of magnitude slower.
+
+OpenCilk adds the keyword `spawn`,
+which adds a special tag to the stack and launches a recursive call.
+If a thread finishes its work, it will look at other threads' stacks and steal
+their work. The value of Cilk is that recursive calls cost only 4 or 5 times
+more, much faster than launching true threads (which would take 6-7 orders of
+magnitude more time to create, measured in μs).
+
+OpenCilk provides some primitives for concurrent access to data.
+It guarantees the semantics of serial execution.
+
+*/
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index 3170e4d..05c4a36 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -17,6 +17,14 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of
+\ref HPCombi::PTransf16 "PTransf16",
+\ref HPCombi::Transf16  "Transf16",
+\ref HPCombi::PPerm16   "PPerm16" and
+\ref HPCombi::Perm16    "Perm16"
+*/
+
 #ifndef HPCOMBI_PERM16_HPP_
 #define HPCOMBI_PERM16_HPP_
 
@@ -41,9 +49,9 @@ struct Perm16;
 struct PTransf16;
 struct Transf16;
 
-/** Partial transformation of @f$\{0\dots 15\}@f$
- *
- */
+/** Partial transformation of @f$\{0\dots 15\}@f$; see HPCombi::Transf16;
+partial means it might not be defined everywhere.
+Undefined images are encoded as 0xFF. */
 struct alignas(16) PTransf16 : public Vect16 {
     static constexpr size_t size() { return 16; }
 
@@ -116,9 +124,10 @@ struct alignas(16) PTransf16 : public Vect16 {
     uint8_t nb_fix_points() const;
 };
 
-/** Full transformation of @f$\{0\dots 15\}@f$
- *
- */
+/** Full transformation of @f$\{0\dots 15\}@f$:
+a transformation is a mapping of a set of n elements *into* itself;
+ie as opposed to a permutation, it is not necessarily injective.
+Here n is hard-coded to 16. */
 struct Transf16 : public PTransf16 {
     Transf16() = default;
     constexpr Transf16(const Transf16 &v) = default;
@@ -145,7 +154,9 @@ struct Transf16 : public PTransf16 {
     explicit operator uint64_t() const;
 };
 
-//! Partial permutation of @f$\{0, \dots, 15\}@f$
+/** Partial permutation of @f$\{0\dots 15\}@f$; see also HPCombi::Perm16;
+partial means it might not be defined everywhere (but where it's defined, it's
+injective). Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
     PPerm16() = default;
     constexpr PPerm16(const PPerm16 &v) = default;
@@ -169,7 +180,7 @@ struct PPerm16 : public PTransf16 {
         return this->PTransf16::operator*(p);
     }
 
-    /** @class common_inverse_pperm
+    /**
      * @brief The inverse of a partial permutation
      * @details
      * @returns the inverse of \c *this. The inverse of @f$p@f$ is the unique
@@ -183,14 +194,14 @@ struct PPerm16 : public PTransf16 {
      * Returns
      * @verbatim {0,0xFF,2,1,3,5,6,0xFF,8,9,0xFF,10,12,0xFF,0xFF,0xFF}
      * @endverbatim
-     */
-    /** @copydoc common_inverse_pperm
-     *  @par Algorithm:
-     *  @f$O(n)@f$ algorithm using reference cast to arrays
+     * @par Algorithm:
+     * @f$O(n)@f$ algorithm using reference cast to arrays
      */
     PPerm16 inverse_ref() const;
+
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-    /** @copydoc common_inverse_pperm
+    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a
+     * different algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
@@ -202,8 +213,9 @@ struct PPerm16 : public PTransf16 {
     PPerm16 left_one() const { return PTransf16::left_one(); }
 };
 
-/** Permutations of @f$\{0\dots 15\}@f$
- *
+/** Permutations of @f$\{0\dots 15\}@f$:
+ * A permutation is a bijective mapping of a set of n elements onto itself.
+ * Here n is hard-coded to 16.
  */
 struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
     Perm16() = default;
@@ -229,8 +241,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
     //! Construct a permutations from its 64 bits compressed.
     explicit Perm16(uint64_t compressed) : Transf16(compressed) {}
 
-    /** @class common_inverse
-     * @brief The inverse permutation
+    /** @brief The inverse permutation
+     *
      * @details
      * @returns the inverse of \c *this
      * @par Example:
@@ -240,47 +252,58 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * @endcode
      * Returns
      * @verbatim {0,4,2,1,3,5,6,7,8,9,10,11,12,13,14,15} @endverbatim
-     */
-    /** @copydoc common_inverse
+
+     * Frontend method: currently aliased to #inverse_cycl */
+    Perm16 inverse() const { return inverse_cycl(); }
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
      */
     Perm16 inverse_ref() const;
-    /** @copydoc common_inverse
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
      */
     Perm16 inverse_arr() const;
-    /** @copydoc common_inverse
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Insert the identity in the least significant bits and sort using a
-     *  sorting network. The number of round of the optimal sorting network is
+     *  sorting network. The number of rounds of the optimal sorting network is
      *  open as far as I know, therefore the complexity is unknown.
      */
     Perm16 inverse_sort() const;
-    /** @copydoc common_inverse
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
      */
     Perm16 inverse_find() const { return permutation_of(v, one()); }
-    /** @copydoc common_inverse
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *
-     * Raise \e *this to power @f$\text{LCM}(1, 2, ..., n) - 1@f$ so complexity
-     * is in @f$O(log (\text{LCM}(1, 2, ..., n) - 1)) = O(n)@f$
+     * Use HPCombi::pow to
+     * raise \e *this to power @f$\text{LCM}(1, 2, ..., n) - 1@f$ so complexity
+     * is @f$O(\log (\text{LCM}(1, 2, ..., n) - 1)) = O(n)@f$
      */
     Perm16 inverse_pow() const;
-    /** @copydoc common_inverse
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Compute power from @f$n/2@f$ to @f$n@f$, when @f$\sigma^k(i)=i@f$ then
      *  @f$\sigma^{-1}(i)=\sigma^{k-1}(i)@f$. Complexity @f$O(n)@f$
      */
     Perm16 inverse_cycl() const;
-    /** @copydoc common_inverse
-     *
-     *  Frontend method: currently aliased to #inverse_cycl */
-    Perm16 inverse() const { return inverse_cycl(); }
 
     /** The elementary transposition exchanging @f$i@f$ and @f$i+1@f$ */
     static Perm16 elementary_transposition(uint64_t i);
@@ -291,7 +314,7 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     static Perm16 unrankSJT(int n, int r);
 
-    /** @class common_lehmer
+    /**
      * @brief The Lehmer code of a permutation
      * @details
      * @returns the Lehmer code of \c *this
@@ -302,24 +325,26 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * @endcode
      * Returns
      * @verbatim {0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0} @endverbatim
+     * @par Algorithm:
+     * Fast @f$O(n)@f$ algorithm using vector comparison
      */
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
+    epu8 lehmer() const;
+
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
+     * @par Algorithm:
+     * Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     epu8 lehmer_ref() const;
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
+
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
+     * @par Algorithm:
+     * Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
      */
     epu8 lehmer_arr() const;
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Fast @f$O(n)@f$ algorithm using vector comparison
-     */
-    epu8 lehmer() const;
 
-    /** @class common_length
+    /**
      * @brief The Coxeter length (ie: number of inversion) of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -329,25 +354,27 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.length()
      * @endcode
      * Returns @verbatim 4 @endverbatim
+     *  @par Algorithm:
+     *  @f$O(n)@f$ using vector lehmer and fast horizontal sum
      */
-    /** @copydoc common_length
+    uint8_t length() const;
+
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     uint8_t length_ref() const;
-    /** @copydoc common_length
+
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access after
      *     a cast to \c std::array
      */
     uint8_t length_arr() const;
-    /** @copydoc common_length
-     *  @par Algorithm:
-     *  @f$O(n)@f$ using vector lehmer and fast horizontal sum
-     */
-    uint8_t length() const;
 
-    /** @class common_nb_descent
+    /**
      * @brief The number of descent of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -357,17 +384,17 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.length()
      * @endcode
      * Returns @verbatim 2 @endverbatim
+     *  @par Algorithm:
+     *  Reference @f$O(1)@f$ using vector shift and comparison
      */
-    /** @copydoc common_nb_descent
+    uint8_t nb_descents() const;
+
+    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with
+     * a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a loop
      */
     uint8_t nb_descents_ref() const;
-    /** @copydoc common_nb_descent
-     *  @par Algorithm:
-     *  Reference @f$O(1)@f$ using vector shift and comparison
-     */
-    uint8_t nb_descents() const;
 
     /** The set partition of the cycles of a permutation
      * @details
@@ -385,7 +412,7 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 cycles_partition() const;
 
-    /** @class common_nb_cycles
+    /**
      * @brief The number of cycles of a permutation
      * @details
      * @returns the number of cycles of \c *this
@@ -395,23 +422,25 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.nb_cycles()
      * @endcode
      * Returns @verbatim 10 @endverbatim
+     *  @par Algorithm: aliased to #nb_cycles_unroll
      */
-    /** @copydoc common_nb_cycles
+    uint8_t nb_cycles() const { return nb_cycles_unroll(); }
+
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a boolean vector
      */
     uint8_t nb_cycles_ref() const;
-    /** @copydoc common_nb_cycles
+
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(\log(n))@f$ using #cycles_partition
      */
     uint8_t nb_cycles_unroll() const;
-    /** @copydoc common_nb_cycles
-     *  @par Algorithm: aliased to #nb_cycles_unroll
-     */
-    uint8_t nb_cycles() const { return nb_cycles_unroll(); }
 
-    /** @class common_left_weak_leq
+    /**
      * @brief Compare two permutations for the left weak order
      * @par Example:
      * @code
@@ -419,22 +448,24 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.left_weak_leq(y)
      * @endcode
      * Returns @verbatim true @endverbatim
+     *  @par Algorithm:
+     *  @f$O(n)@f$ algorithm using length
      */
-    /** @copydoc common_left_weak_leq
+    bool left_weak_leq(Perm16 other) const;
+
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ testing inclusion of inversions one by one
      */
     bool left_weak_leq_ref(Perm16 other) const;
-    /** @copydoc common_left_weak_leq
+
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ with vectorized test of inclusion
      */
     bool left_weak_leq_length(Perm16 other) const;
-    /** @copydoc common_left_weak_leq
-     *  @par Algorithm:
-     *  @f$O(n)@f$ algorithm using length
-     */
-    bool left_weak_leq(Perm16 other) const;
 };
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -451,7 +482,10 @@ static_assert(std::is_trivial<Perm16>(), "Perm16 is not a trivial class !");
 #include "perm16_impl.hpp"
 
 namespace std {
+// Hash operators for Transf and Perm:
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PTransf16.
 template <> struct hash<HPCombi::PTransf16> {
     //! A hash operator for #HPCombi::PTransf16
     size_t operator()(const HPCombi::PTransf16 &ar) const {
@@ -459,6 +493,8 @@ template <> struct hash<HPCombi::PTransf16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Transf16.
 template <> struct hash<HPCombi::Transf16> {
     //! A hash operator for #HPCombi::Transf16
     size_t operator()(const HPCombi::Transf16 &ar) const {
@@ -466,6 +502,8 @@ template <> struct hash<HPCombi::Transf16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PPerm16.
 template <> struct hash<HPCombi::PPerm16> {
     //! A hash operator for #HPCombi::PPerm16
     size_t operator()(const HPCombi::PPerm16 &ar) const {
@@ -473,6 +511,8 @@ template <> struct hash<HPCombi::PPerm16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Perm16.
 template <> struct hash<HPCombi::Perm16> {
     //! A hash operator for #HPCombi::Perm16
     size_t operator()(const HPCombi::Perm16 &ar) const { return uint64_t(ar); }
diff --git a/include/hpcombi/perm16_impl.hpp b/include/hpcombi/perm16_impl.hpp
index 544b8a0..4d2daed 100644
--- a/include/hpcombi/perm16_impl.hpp
+++ b/include/hpcombi/perm16_impl.hpp
@@ -19,12 +19,11 @@
 
 // NOLINT(build/header_guard)
 
-namespace HPCombi {
-
-///////////////////////////////////////////////////////////////////////////////
-// Implementation part for inline functions  //////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
+/** @file
+@brief implementation of perm16.hpp ; this file should not be included directly.
+*/
 
+namespace HPCombi {
 inline PTransf16::PTransf16(std::initializer_list<uint8_t> il)
     : Vect16(Epu8.id()) {
     HPCOMBI_ASSERT(il.size() <= 16);
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index b48be21..164cb71 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of HPCombi::PermGeneric */
+
 #ifndef HPCOMBI_PERM_GENERIC_HPP_
 #define HPCOMBI_PERM_GENERIC_HPP_
 
@@ -35,6 +38,14 @@
 
 namespace HPCombi {
 
+/** Vanilla (ie NOT optimized) implementation of a permutation, used to check
+for test correctness and as baseline to measure speedup. Implemented as an std
+array, so the permutation is not necessarily of size n=16. PermGeneric<16>
+should implement as much as possible of Perm16 (currently not everything due to
+lack of time/need). No optimisation, so prefer to use Perm16.
+
+About Expo, see comment on HPCombi::VectGeneric.
+*/
 template <size_t Size, typename Expo = uint8_t>
 struct PermGeneric : public VectGeneric<Size, Expo> {
     using vect = VectGeneric<Size, Expo>;
diff --git a/include/hpcombi/perm_generic_impl.hpp b/include/hpcombi/perm_generic_impl.hpp
index 9930e0b..744ac3c 100644
--- a/include/hpcombi/perm_generic_impl.hpp
+++ b/include/hpcombi/perm_generic_impl.hpp
@@ -19,6 +19,10 @@
 
 // NOLINT(build/header_guard)
 
+/** @file
+@brief implementation of perm_generic.hpp ;
+this file should not be included directly. */
+
 namespace HPCombi {
 
 template <size_t Size, typename Expo>
@@ -115,6 +119,8 @@ bool PermGeneric<Size, Expo>::left_weak_leq(PermGeneric other) const {
 
 namespace std {
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PermGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::PermGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::PermGeneric<Size, Expo> &ar) const {
diff --git a/include/hpcombi/power.hpp b/include/hpcombi/power.hpp
index 06a9349..6afb7b2 100644
--- a/include/hpcombi/power.hpp
+++ b/include/hpcombi/power.hpp
@@ -18,23 +18,31 @@
 //****************************************************************************//
 
 /** @file
- * @brief Generic compile time power
- *
- * The goal of this file is to be able to write expressions such as @c
- * pow<23>(2.5) or @c pow<n>(x) where the first expression is entirely
- * computed as compile time and the second one is expanded also as compile
- * time to a O(log n) long sequence of multiplication. Furthermore such
- * expression not only works for numbers for for any type where there is a
- * neutral element and an associative (non necessarily commutative) product,
- * namely what mathematicians call \e monoids. These include for example,
- * strings where the neutral element is the empty string and the product is
- * the concatenation.
- *
- * see HPCombi::power_helper::Monoid<std::string>
- *
- * @example stringmonoid.cpp
- * This is an example of how to use pow with a non numerical Monoid.
- */
+@brief  Generic compile-time unrolling of the fast exponentiation algorithm.
+
+Allows writing expressions such as
+- @c pow<23>(2.5) : entirely computed at compile time
+- @c pow<n>(x) expanded at compile time to a O(log n) long sequence of
+multiplications.
+
+Such expressions work for numbers but also for any type where there is a
+neutral element and an associative (not necessarily commutative) product,
+ie what mathematicians call \e monoids.
+These include for example
+strings where the neutral element is the empty string and the product is
+the concatenation.
+
+See HPCombi::power_helper::Monoid<std::string>
+
+The algorithm used here is based on the base-2 representation of n,
+it is a 2-approximation of the optimum number of multiplications.
+The general problem is called *addition chain* and one can sometimes do better,
+e.g. for Fibonacci numbers, rather use the Fibonacci recurrence relation
+to choose which products to compute.
+
+@example stringmonoid.cpp
+how to use pow with a non numerical Monoid.
+*/
 
 #ifndef HPCOMBI_POWER_HPP_
 #define HPCOMBI_POWER_HPP_
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index 14be7a2..d0e13b3 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::Vect16 */
+
 #ifndef HPCOMBI_VECT16_HPP_
 #define HPCOMBI_VECT16_HPP_
 
@@ -31,6 +34,8 @@
 
 namespace HPCombi {
 
+/** Vector of 16 bytes, with some optimized methods, superclass of
+ * HPCombi::Transf16. */
 struct alignas(16) Vect16 {
     static constexpr size_t size() { return 16; }
     using array = typename decltype(Epu8)::array;
@@ -113,6 +118,8 @@ inline std::ostream &operator<<(std::ostream &stream,
     return operator<<(stream, ar.v);
 }
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Vect16.
 template <> struct hash<HPCombi::Vect16> {
     size_t operator()(const HPCombi::Vect16 &ar) const {
         return std::hash<HPCombi::epu8>{}(ar.v);
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index 767512d..0927f18 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::VectGeneric */
+
 #ifndef HPCOMBI_VECT_GENERIC_HPP_
 #define HPCOMBI_VECT_GENERIC_HPP_
 
@@ -43,7 +46,12 @@ std::array<Expo, Size> sorted_vect(std::array<Expo, Size> v) {
     return v;
 }
 
-/** A generic class for combinatorial integer vectors.
+/** \ref HPCombi::VectGeneric "VectGeneric" is to \ref HPCombi::Vect16 "Vect16"
+what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16
+"Perm16"; see \ref HPCombi::PermGeneric "PermGeneric".
+
+HPCombi started as a library to manipulate monomials on several variables,
+ie a tuple of *expo*nents. The elements of arrays were thus named Expo.
  */
 template <size_t Size, typename Expo = uint8_t> struct VectGeneric {
     static constexpr size_t size() { return Size; }
@@ -236,6 +244,8 @@ std::ostream &operator<<(std::ostream &stream,
     return stream;
 }
 
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::VectGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::VectGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::VectGeneric<Size, Expo> &ar) const {