From e5347212ceb86a10e31e9bf355e69aaa0cf3d7a3 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Tue, 1 Oct 2024 23:24:48 +0200
Subject: [PATCH 01/15] remove file AUTHORS, as it's already included in
 README.md

---
 AUTHORS   | 10 ----------
 README.md |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)
 delete mode 100644 AUTHORS

diff --git a/AUTHORS b/AUTHORS
deleted file mode 100644
index 16012dcc..00000000
--- a/AUTHORS
+++ /dev/null
@@ -1,10 +0,0 @@
-AUTHORS
-=======
-
-Florent Hivert <florent.hivert@lri.fr>
-
-Contributors
-============
-
-- James Mitchell <jdm3@st-andrews.ac.uk> : discussions + test cases + Travis CI
-- Viviane Pons <viviane.pons@lri.fr> : algorithms discussions
diff --git a/README.md b/README.md
index 034b66f3..63c4423a 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ other processors too.
 
 - Reinis Cirpons <rc234@st-andrews.ac.uk> : CI
 - Finn Smith <fls3@st-andrews.ac.uk> : discussions + BMat8 reference code
-- Viviane Pons : algorithms discussions
+- Viviane Pons <viviane.pons@lri.fr> : algorithms discussions
 - Daniel Vanzo <daniel.vanzo@lri.fr> : GPU experiments
 
 ## Documentation

From b28f49b376c4be231de229edf232db0b65baa6ec Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Wed, 11 Dec 2024 17:06:44 +0100
Subject: [PATCH 02/15] doc

---
 include/hpcombi/epu8.hpp   | 8 ++++----
 include/hpcombi/perm16.hpp | 7 ++++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 10678a2e..a8623446 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -98,7 +98,7 @@ inline bool is_sorted(epu8 a) noexcept;
  * Uses the 9 stages sorting network #sorting_rounds
  */
 inline epu8 sorted(epu8 a) noexcept;
-/** Return a #HPCombi::epu8 with the two half sorted
+/** Return a #HPCombi::epu8 with both halves sorted
  * @details
  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
  */
@@ -109,7 +109,7 @@ inline epu8 sorted8(epu8 a) noexcept;
  * Uses the 9 stages sorting network #sorting_rounds
  */
 inline epu8 revsorted(epu8 a) noexcept;
-/** Return a #HPCombi::epu8 with the two half reverse sorted
+/** Return a #HPCombi::epu8 with both halves reverse sorted
  * @details
  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
  */
@@ -372,7 +372,7 @@ inline epu8 partial_min_round(epu8) noexcept;
 inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
 
 /** @class common_eval16
- * @brief Evaluation of a #HPCombi::epu8
+ * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15 appears in the input.
  * @details
  * @param v : a #HPCombi::epu8
  * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
@@ -450,7 +450,7 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
 /** @class common_last_diff
  * @brief The last difference between two #HPCombi::epu8
  * @details
- * @param a, b : two #HPCombi::epu8
+ * @param a, b : #HPCombi::epu8
  * @param bound : a \c size_t
  * @returns the largest index @f$i<bound@f$ such that \c a[i] and \c b[i]
  * differ, 16 if there is no differences before bound.
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index 3170e4d3..bd83ea15 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -116,7 +116,11 @@ struct alignas(16) PTransf16 : public Vect16 {
     uint8_t nb_fix_points() const;
 };
 
-/** Full transformation of @f$\{0\dots 15\}@f$
+/** Full transformation of @f$\{0\dots 15\}@f$.
+ *
+ * A transformation is a mapping of a set of n elements *into* itself.
+ * I.e. as opposed to a permutation, it is not necessarily injective.
+ * Here n=16.
  *
  */
 struct Transf16 : public PTransf16 {
@@ -204,6 +208,7 @@ struct PPerm16 : public PTransf16 {
 
 /** Permutations of @f$\{0\dots 15\}@f$
  *
+ * A permutation is a bijective mapping of a set of n elements onto itself. Here n=16.
  */
 struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
     Perm16() = default;

From cf724d28376d9ff76c9ef63518221d7305fbe237 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Wed, 11 Dec 2024 17:21:37 +0100
Subject: [PATCH 03/15] doc: do not duplicate readme.md

Avoid copy-pasting doc: it's a waste of time for the reader.
Just point to relevant docs but have them
---
 README.md                   | 6 +++---
 doc/Doxyfile.in             | 5 ++---
 include/hpcombi/hpcombi.hpp | 7 +++++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 63c4423a..2a9dfb7e 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 High Performance Combinatorics in C++ using vector instructions v1.0.1
 
 HPCombi is a C++17 header-only library using the SSE and AVX instruction sets,
-and some equivalents, for very fast manipulation of combinatorial objects such
-as transformations, permutations, and boolean matrices of small size. The goal
+and some equivalents, for very fast manipulation of small combinatorial objects such
+as transformations, permutations, and boolean matrices. The goal
 of this project is to implement various new algorithms and benchmark them on
 various compiler and architectures.
 
@@ -30,7 +30,7 @@ other processors too.
 
 ## Documentation
 
-- The Doxygen auto generated [API](https://libsemigroups.github.io/HPCombi/)
+- See the [Doxygen generated doc](https://libsemigroups.github.io/HPCombi/)
 
 ## Thanks
 
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 51a7ff29..b62bed77 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -934,8 +934,7 @@ WARN_LOGFILE           =
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = ${CMAKE_SOURCE_DIR}/include/hpcombi \
-                         ${CMAKE_SOURCE_DIR}/examples \
-                         ${CMAKE_SOURCE_DIR}/README.md
+                         ${CMAKE_SOURCE_DIR}/examples
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1101,7 +1100,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
-USE_MDFILE_AS_MAINPAGE = ${CMAKE_SOURCE_DIR}/README.md
+USE_MDFILE_AS_MAINPAGE =
 
 # The Fortran standard specifies that for fixed formatted Fortran code all
 # characters from position 72 are to be considered as comment. A common
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index bdc81453..aa7a21ab 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -30,3 +30,10 @@
 #include "vect_generic.hpp"
 
 #endif  // HPCOMBI_HPCOMBI_HPP_
+
+/*! \mainpage HPCombi
+ *
+ * \section readme_sec Readme
+ *
+ * You might want to have a look at [the Readme in the sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
+ */
\ No newline at end of file

From 40589ebb2c7a7cce5c5be94f6fd4e953abce1cc2 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Wed, 11 Dec 2024 22:23:16 +0100
Subject: [PATCH 04/15] doc: intro

---
 BUILDING.md                 |  2 +-
 include/hpcombi/hpcombi.hpp | 54 +++++++++++++++++++++++++++++++++----
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/BUILDING.md b/BUILDING.md
index cd6bb1f4..289246a4 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -18,7 +18,7 @@ tests, experiments, examples, or benchmarks.
 
 - [optional] : Google `sparsehash/dense_hash_map` and/or `sparsehash/dense_hash_set`.
 
-- [optional] Doxygen for generating the API documentation (in progress).
+- [optional] Doxygen for generating the API documentation (in [build/doc/html/index.html](build/doc/html/index.html)).
 
 ## Building
 
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index aa7a21ab..74286f7d 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -32,8 +32,52 @@
 #endif  // HPCOMBI_HPCOMBI_HPP_
 
 /*! \mainpage HPCombi
- *
- * \section readme_sec Readme
- *
- * You might want to have a look at [the Readme in the sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
- */
\ No newline at end of file
+
+\section readme_sec Readme
+
+You might want to have a look at [the Readme in the sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
+
+\section sec_philo Philosophy
+This library provides high performance computations in combinatorics (hence its name).
+In practice we observe large speedups in several enumeration problems.
+
+The main idea of the library is a way to encode data as a small sequence of small integers,
+that can be handled efficiently by a creative use of vector instructions.
+For example, on the current x86 machines, small permutations (N ≤ 16) are very well handled.
+Indeed thanks to machine instructions such as PSHUFB (Packed SHUFfle Bytes),
+applying a permutation on a vector only takes a few CPU cycles.
+
+Further ideas are:
+- Vectorization (MMX, SSE, AVX instruction sets) and careful memory alignment,
+- Careful memory management: avoiding all dynamic allocation during the computation,
+- Avoid all unnecessary copies (often needed to rewrite the containers),
+- Due to combinatorial explosion, sets often don’t fit in the computer’s memory or disks and are enumerated on the fly.
+
+Here are some examples,
+the speedup is in comparison to an implementation without vector instructions:
+
+Operation |   Speedup
+----------|-----------
+Inverting a permutation | 1.28
+Sorting a list of bytes | 21.3
+Number of cycles of a permutation |  41.5
+Number of inversions of a permutation  | 9.39
+Cycle type of a permutation | 8.94
+
+
+
+\section sec_tips Tips to the user
+
+There is no parallelisation here. To use parallelism with this lib, see for instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
+
+Note that memory access can become a problem. If you store many things, most of the time will be spent in fetching from RAM, not computing.
+Data structures should preserve locality. You might want to compute some stats on data structure usage and write custom ones.
+
+This lib is implemented with speed in mind, not code safety.
+E.g. there are no checks when building a permutation, which could be invalid (e.g. non-injective).
+
+We now suggest having a look, in the menus above, at Classes → [Class list](annotated.html).
+*/
\ No newline at end of file

From 199589f63f807e03ec059248c6fe88071aab8326 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Wed, 11 Dec 2024 22:23:33 +0100
Subject: [PATCH 05/15] doc: add link

---
 doc.html | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 doc.html

diff --git a/doc.html b/doc.html
new file mode 120000
index 00000000..f1c101d8
--- /dev/null
+++ b/doc.html
@@ -0,0 +1 @@
+build/doc/html/index.html
\ No newline at end of file

From f456b29e60201dfcd8acfe767876f76a4ec50433 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Thu, 12 Dec 2024 10:36:04 +0100
Subject: [PATCH 06/15] style: fix big hole

---
 include/hpcombi/bmat8_impl.hpp | 129 +++++++++++++++++----------------
 1 file changed, 65 insertions(+), 64 deletions(-)

diff --git a/include/hpcombi/bmat8_impl.hpp b/include/hpcombi/bmat8_impl.hpp
index da0f18bd..0bc15296 100644
--- a/include/hpcombi/bmat8_impl.hpp
+++ b/include/hpcombi/bmat8_impl.hpp
@@ -36,70 +36,71 @@ static const constexpr std::array<uint64_t, 8> COL_MASK = {
     0x1010101010101010, 0x808080808080808,  0x404040404040404,
     0x202020202020202,  0x101010101010101};
 
-static const constexpr std::array<uint64_t, 64> BIT_MASK = {{0x8000000000000000,
-                                                             0x4000000000000000,
-                                                             0x2000000000000000,
-                                                             0x1000000000000000,
-                                                             0x800000000000000,
-                                                             0x400000000000000,
-                                                             0x200000000000000,
-                                                             0x100000000000000,
-                                                             0x80000000000000,
-                                                             0x40000000000000,
-                                                             0x20000000000000,
-                                                             0x10000000000000,
-                                                             0x8000000000000,
-                                                             0x4000000000000,
-                                                             0x2000000000000,
-                                                             0x1000000000000,
-                                                             0x800000000000,
-                                                             0x400000000000,
-                                                             0x200000000000,
-                                                             0x100000000000,
-                                                             0x80000000000,
-                                                             0x40000000000,
-                                                             0x20000000000,
-                                                             0x10000000000,
-                                                             0x8000000000,
-                                                             0x4000000000,
-                                                             0x2000000000,
-                                                             0x1000000000,
-                                                             0x800000000,
-                                                             0x400000000,
-                                                             0x200000000,
-                                                             0x100000000,
-                                                             0x80000000,
-                                                             0x40000000,
-                                                             0x20000000,
-                                                             0x10000000,
-                                                             0x8000000,
-                                                             0x4000000,
-                                                             0x2000000,
-                                                             0x1000000,
-                                                             0x800000,
-                                                             0x400000,
-                                                             0x200000,
-                                                             0x100000,
-                                                             0x80000,
-                                                             0x40000,
-                                                             0x20000,
-                                                             0x10000,
-                                                             0x8000,
-                                                             0x4000,
-                                                             0x2000,
-                                                             0x1000,
-                                                             0x800,
-                                                             0x400,
-                                                             0x200,
-                                                             0x100,
-                                                             0x80,
-                                                             0x40,
-                                                             0x20,
-                                                             0x10,
-                                                             0x8,
-                                                             0x4,
-                                                             0x2,
-                                                             0x1}};
+static const constexpr std::array<uint64_t, 64> BIT_MASK = {{
+    0x8000000000000000,
+    0x4000000000000000,
+    0x2000000000000000,
+    0x1000000000000000,
+    0x800000000000000,
+    0x400000000000000,
+    0x200000000000000,
+    0x100000000000000,
+    0x80000000000000,
+    0x40000000000000,
+    0x20000000000000,
+    0x10000000000000,
+    0x8000000000000,
+    0x4000000000000,
+    0x2000000000000,
+    0x1000000000000,
+    0x800000000000,
+    0x400000000000,
+    0x200000000000,
+    0x100000000000,
+    0x80000000000,
+    0x40000000000,
+    0x20000000000,
+    0x10000000000,
+    0x8000000000,
+    0x4000000000,
+    0x2000000000,
+    0x1000000000,
+    0x800000000,
+    0x400000000,
+    0x200000000,
+    0x100000000,
+    0x80000000,
+    0x40000000,
+    0x20000000,
+    0x10000000,
+    0x8000000,
+    0x4000000,
+    0x2000000,
+    0x1000000,
+    0x800000,
+    0x400000,
+    0x200000,
+    0x100000,
+    0x80000,
+    0x40000,
+    0x20000,
+    0x10000,
+    0x8000,
+    0x4000,
+    0x2000,
+    0x1000,
+    0x800,
+    0x400,
+    0x200,
+    0x100,
+    0x80,
+    0x40,
+    0x20,
+    0x10,
+    0x8,
+    0x4,
+    0x2,
+    0x1}};
 
 inline bool BMat8::operator()(size_t i, size_t j) const noexcept {
     HPCOMBI_ASSERT(i < 8);

From 4b66b049050aba5e5d6148608356ce939f631c1a Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Thu, 12 Dec 2024 11:16:15 +0100
Subject: [PATCH 07/15] doc: add doc for all files

---
 include/hpcombi/arch.hpp              |  3 +++
 include/hpcombi/bmat8.hpp             |  3 ++-
 include/hpcombi/bmat8_impl.hpp        |  4 ++--
 include/hpcombi/builder.hpp           |  3 +++
 include/hpcombi/debug.hpp             |  3 +++
 include/hpcombi/epu8.hpp              |  3 +++
 include/hpcombi/epu8_impl.hpp         |  4 ++--
 include/hpcombi/hpcombi.hpp           |  4 ++++
 include/hpcombi/perm16.hpp            |  3 +++
 include/hpcombi/perm16_impl.hpp       |  8 +++----
 include/hpcombi/perm_generic.hpp      |  3 +++
 include/hpcombi/perm_generic_impl.hpp |  4 ++++
 include/hpcombi/power.hpp             | 34 +++++++++++++--------------
 include/hpcombi/vect16.hpp            |  3 +++
 include/hpcombi/vect_generic.hpp      |  3 +++
 15 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/include/hpcombi/arch.hpp b/include/hpcombi/arch.hpp
index eaf9da60..cb395c22 100644
--- a/include/hpcombi/arch.hpp
+++ b/include/hpcombi/arch.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief check the required compiler flags for SSE-4.1 */
+
 #ifndef HPCOMBI_ARCH_HPP_
 #define HPCOMBI_ARCH_HPP_
 
diff --git a/include/hpcombi/bmat8.hpp b/include/hpcombi/bmat8.hpp
index cb789fa3..15b12305 100644
--- a/include/hpcombi/bmat8.hpp
+++ b/include/hpcombi/bmat8.hpp
@@ -19,7 +19,8 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-// This file contains a declaration of fast boolean matrices up to dimension 8.
+/** @file
+@brief declaration of HPCombi::BMat8 */
 
 #ifndef HPCOMBI_BMAT8_HPP_
 #define HPCOMBI_BMAT8_HPP_
diff --git a/include/hpcombi/bmat8_impl.hpp b/include/hpcombi/bmat8_impl.hpp
index 0bc15296..6d17585c 100644
--- a/include/hpcombi/bmat8_impl.hpp
+++ b/include/hpcombi/bmat8_impl.hpp
@@ -19,8 +19,8 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-// This file contains an implementation of fast boolean matrices up to
-// dimension 8 x 8.
+/** @file
+@brief implementation of bmat8.hpp ; this file should not be included directly. */
 
 // NOLINT(build/header_guard)
 
diff --git a/include/hpcombi/builder.hpp b/include/hpcombi/builder.hpp
index f8cfeabe..aa48d6e8 100644
--- a/include/hpcombi/builder.hpp
+++ b/include/hpcombi/builder.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::TPUBuild and casts from HPCombi::TPU */
+
 #ifndef HPCOMBI_BUILDER_HPP_
 #define HPCOMBI_BUILDER_HPP_
 
diff --git a/include/hpcombi/debug.hpp b/include/hpcombi/debug.hpp
index 28443167..b3fdf942 100644
--- a/include/hpcombi/debug.hpp
+++ b/include/hpcombi/debug.hpp
@@ -18,6 +18,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief defines the macro \c HPCOMBI_ASSERT */
+
 #ifndef HPCOMBI_DEBUG_HPP_
 #define HPCOMBI_DEBUG_HPP_
 
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index a8623446..0fd2598f 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of HPCombi::epu8 */
+
 #ifndef HPCOMBI_EPU8_HPP_
 #define HPCOMBI_EPU8_HPP_
 
diff --git a/include/hpcombi/epu8_impl.hpp b/include/hpcombi/epu8_impl.hpp
index 69b0035a..a143b818 100644
--- a/include/hpcombi/epu8_impl.hpp
+++ b/include/hpcombi/epu8_impl.hpp
@@ -19,8 +19,8 @@
 
 // NOLINT(build/header_guard)
 
-// This is the implementation part of epu8.hpp this should be seen as
-// implementation details and should not be included directly.
+/** @file
+@brief implementation of epu8.hpp ; this file should not be included directly. */
 
 #include <initializer_list>
 #include <iostream>
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index 74286f7d..352c10df 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -17,6 +17,10 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief Main entry point; includes the API files: bmat8.hpp, perm16.hpp, etc
+and also debug.hpp, epu8.hpp, etc.*/
+
 #ifndef HPCOMBI_HPCOMBI_HPP_
 #define HPCOMBI_HPCOMBI_HPP_
 
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index bd83ea15..a4126379 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of HPCombi::PTransf16, HPCombi::Transf16, HPCombi::PPerm16 and HPCombi::Perm16. */
+
 #ifndef HPCOMBI_PERM16_HPP_
 #define HPCOMBI_PERM16_HPP_
 
diff --git a/include/hpcombi/perm16_impl.hpp b/include/hpcombi/perm16_impl.hpp
index 544b8a03..5c4d1029 100644
--- a/include/hpcombi/perm16_impl.hpp
+++ b/include/hpcombi/perm16_impl.hpp
@@ -19,12 +19,10 @@
 
 // NOLINT(build/header_guard)
 
-namespace HPCombi {
-
-///////////////////////////////////////////////////////////////////////////////
-// Implementation part for inline functions  //////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
+/** @file
+@brief implementation of perm16.hpp ; this file should not be included directly. */
 
+namespace HPCombi {
 inline PTransf16::PTransf16(std::initializer_list<uint8_t> il)
     : Vect16(Epu8.id()) {
     HPCOMBI_ASSERT(il.size() <= 16);
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index b48be212..b641abd6 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief declaration of HPCombi::PermGeneric */
+
 #ifndef HPCOMBI_PERM_GENERIC_HPP_
 #define HPCOMBI_PERM_GENERIC_HPP_
 
diff --git a/include/hpcombi/perm_generic_impl.hpp b/include/hpcombi/perm_generic_impl.hpp
index 9930e0be..53693877 100644
--- a/include/hpcombi/perm_generic_impl.hpp
+++ b/include/hpcombi/perm_generic_impl.hpp
@@ -19,6 +19,10 @@
 
 // NOLINT(build/header_guard)
 
+/** @file
+@brief implementation of perm_generic.hpp ;
+this file should not be included directly. */
+
 namespace HPCombi {
 
 template <size_t Size, typename Expo>
diff --git a/include/hpcombi/power.hpp b/include/hpcombi/power.hpp
index 06a93491..eb621c4a 100644
--- a/include/hpcombi/power.hpp
+++ b/include/hpcombi/power.hpp
@@ -18,23 +18,23 @@
 //****************************************************************************//
 
 /** @file
- * @brief Generic compile time power
- *
- * The goal of this file is to be able to write expressions such as @c
- * pow<23>(2.5) or @c pow<n>(x) where the first expression is entirely
- * computed as compile time and the second one is expanded also as compile
- * time to a O(log n) long sequence of multiplication. Furthermore such
- * expression not only works for numbers for for any type where there is a
- * neutral element and an associative (non necessarily commutative) product,
- * namely what mathematicians call \e monoids. These include for example,
- * strings where the neutral element is the empty string and the product is
- * the concatenation.
- *
- * see HPCombi::power_helper::Monoid<std::string>
- *
- * @example stringmonoid.cpp
- * This is an example of how to use pow with a non numerical Monoid.
- */
+@brief Generic compile-time exponentiation algorithm.
+
+The goal of this file is to be able to write expressions such as @c
+pow<23>(2.5) or @c pow<n>(x) where the first expression is entirely
+computed at compile time and the second one is expanded also at compile
+time to an O(log n) long sequence of multiplications. Furthermore such
+expressions not only work for numbers but for any type where there is a
+neutral element and an associative (not necessarily commutative) product,
+namely what mathematicians call \e monoids. These include for example,
+strings where the neutral element is the empty string and the product is
+the concatenation.
+
+see HPCombi::power_helper::Monoid<std::string>
+
+@example stringmonoid.cpp
+This is an example of how to use pow with a non numerical Monoid.
+*/
 
 #ifndef HPCOMBI_POWER_HPP_
 #define HPCOMBI_POWER_HPP_
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index 14be7a2d..251167cb 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::Vect16 */
+
 #ifndef HPCOMBI_VECT16_HPP_
 #define HPCOMBI_VECT16_HPP_
 
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index 767512d5..fa89254a 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -17,6 +17,9 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+/** @file
+@brief HPCombi::VectGeneric */
+
 #ifndef HPCOMBI_VECT_GENERIC_HPP_
 #define HPCOMBI_VECT_GENERIC_HPP_
 

From 01002eb2bf24a587f8010eb57c0581a80cf585fa Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Thu, 12 Dec 2024 17:48:13 +0100
Subject: [PATCH 08/15] doc: types in namespace std

---
 include/hpcombi/epu8_impl.hpp         | 6 +++++-
 include/hpcombi/perm16.hpp            | 5 +++++
 include/hpcombi/perm_generic_impl.hpp | 1 +
 include/hpcombi/vect16.hpp            | 1 +
 include/hpcombi/vect_generic.hpp      | 1 +
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/hpcombi/epu8_impl.hpp b/include/hpcombi/epu8_impl.hpp
index a143b818..2cc6c872 100644
--- a/include/hpcombi/epu8_impl.hpp
+++ b/include/hpcombi/epu8_impl.hpp
@@ -553,6 +553,7 @@ inline std::string to_string(HPCombi::epu8 const &a) {
     return ss.str();
 }
 
+//! This type appears in the doc because we provide an equal operator for HPCombi::epu8.
 template <> struct equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -560,6 +561,7 @@ template <> struct equal_to<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a not_equal operator for HPCombi::epu8.
 template <> struct not_equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -567,6 +569,7 @@ template <> struct not_equal_to<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for HPCombi::epu8.
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const noexcept {
         unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
@@ -583,10 +586,11 @@ template <> struct hash<HPCombi::epu8> {
     }
 };
 
+//! This type appears in the doc because we provide a less operator for HPCombi::epu8.
 template <> struct less<HPCombi::epu8> {
     // WARNING: due to endianness this is not lexicographic comparison,
     //          but we don't care when using in std::set.
-    // 10% faster than calling the lexicographic comparison operator !
+    // 10% faster than calling the lexicographic comparison operator!
     inline size_t operator()(const HPCombi::epu8 &v1,
                              const HPCombi::epu8 &v2) const noexcept {
         simde__m128 v1v = simde__m128(v1), v2v = simde__m128(v2);
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index a4126379..b0ff471a 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -459,7 +459,9 @@ static_assert(std::is_trivial<Perm16>(), "Perm16 is not a trivial class !");
 #include "perm16_impl.hpp"
 
 namespace std {
+// Hash operators for Transf and Perm:
 
+//! This type appears in the doc because we provide a hash function for HPCombi::PTransf16.
 template <> struct hash<HPCombi::PTransf16> {
     //! A hash operator for #HPCombi::PTransf16
     size_t operator()(const HPCombi::PTransf16 &ar) const {
@@ -467,6 +469,7 @@ template <> struct hash<HPCombi::PTransf16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for HPCombi::Transf16.
 template <> struct hash<HPCombi::Transf16> {
     //! A hash operator for #HPCombi::Transf16
     size_t operator()(const HPCombi::Transf16 &ar) const {
@@ -474,6 +477,7 @@ template <> struct hash<HPCombi::Transf16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for HPCombi::PPerm16.
 template <> struct hash<HPCombi::PPerm16> {
     //! A hash operator for #HPCombi::PPerm16
     size_t operator()(const HPCombi::PPerm16 &ar) const {
@@ -481,6 +485,7 @@ template <> struct hash<HPCombi::PPerm16> {
     }
 };
 
+//! This type appears in the doc because we provide a hash function for HPCombi::Perm16.
 template <> struct hash<HPCombi::Perm16> {
     //! A hash operator for #HPCombi::Perm16
     size_t operator()(const HPCombi::Perm16 &ar) const { return uint64_t(ar); }
diff --git a/include/hpcombi/perm_generic_impl.hpp b/include/hpcombi/perm_generic_impl.hpp
index 53693877..72b5a45b 100644
--- a/include/hpcombi/perm_generic_impl.hpp
+++ b/include/hpcombi/perm_generic_impl.hpp
@@ -119,6 +119,7 @@ bool PermGeneric<Size, Expo>::left_weak_leq(PermGeneric other) const {
 
 namespace std {
 
+//! This type appears in the doc because we provide a hash function for HPCombi::PermGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::PermGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::PermGeneric<Size, Expo> &ar) const {
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index 251167cb..20197040 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -116,6 +116,7 @@ inline std::ostream &operator<<(std::ostream &stream,
     return operator<<(stream, ar.v);
 }
 
+//! This type appears in the doc because we provide a hash function for HPCombi::Vect16.
 template <> struct hash<HPCombi::Vect16> {
     size_t operator()(const HPCombi::Vect16 &ar) const {
         return std::hash<HPCombi::epu8>{}(ar.v);
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index fa89254a..381b941e 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -239,6 +239,7 @@ std::ostream &operator<<(std::ostream &stream,
     return stream;
 }
 
+//! This type appears in the doc because we provide a hash function for HPCombi::VectGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::VectGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::VectGeneric<Size, Expo> &ar) const {

From ae7bed2081c0d586b0de90862d9b59ab5337b51b Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 12:35:23 +0100
Subject: [PATCH 09/15] doc: add doc for all classes

---
 include/hpcombi/bmat8.hpp        | 20 ++++++++++---------
 include/hpcombi/builder.hpp      |  5 ++++-
 include/hpcombi/epu8.hpp         | 14 +++++++++++--
 include/hpcombi/perm16.hpp       | 34 ++++++++++++++++++--------------
 include/hpcombi/perm_generic.hpp |  7 +++++++
 include/hpcombi/power.hpp        | 26 +++++++++++++++---------
 include/hpcombi/vect16.hpp       |  1 +
 include/hpcombi/vect_generic.hpp |  7 ++++++-
 8 files changed, 77 insertions(+), 37 deletions(-)

diff --git a/include/hpcombi/bmat8.hpp b/include/hpcombi/bmat8.hpp
index 15b12305..ef681873 100644
--- a/include/hpcombi/bmat8.hpp
+++ b/include/hpcombi/bmat8.hpp
@@ -41,15 +41,17 @@
 
 namespace HPCombi {
 
-//! Class for fast boolean matrices of dimension up to 8 x 8
-//!
-//! The methods for these small matrices over the boolean semiring
-//! are more optimised than the generic methods for boolean matrices.
-//! Note that all BMat8 are represented internally as an 8 x 8 matrix;
-//! any entries not defined by the user are taken to be 0. This does
-//! not affect the results of any calculations.
-//!
-//! BMat8 is a trivial class.
+/** Boolean matrices of dimension up to 8×8, stored as a single uint64;
+isomorphic to binary relations with methods for composition.
+
+The methods for these small matrices over the boolean semiring
+are more optimised than the generic methods for boolean matrices.
+Note that all BMat8 are represented internally as an 8×8 matrix;
+any entries not defined by the user are taken to be 0. This does
+not affect the results of any calculation.
+
+BMat8 is a trivial class.
+*/
 class BMat8 {
  public:
     //! A default constructor.
diff --git a/include/hpcombi/builder.hpp b/include/hpcombi/builder.hpp
index aa48d6e8..20d553c7 100644
--- a/include/hpcombi/builder.hpp
+++ b/include/hpcombi/builder.hpp
@@ -33,8 +33,11 @@
 
 namespace HPCombi {
 
-/** Class for factory object associated to a SIMD packed unsigned integers.
+/** Given a transformation from 0..15 → 0..15,
+ * build at compile-time the array representing the transformation.
+ *
  * @details
+ * Class for factory object associated to a SIMD packed unsigned integers.
  * The main purpose of this class is to be able to construct in a \c constexpr
  * way various instances of the \c TPU SIMD vector type. The behavior of
  * an instance of \c TPUBuild<TPU> is designed to mimic the behavior of \c TPU
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 0fd2598f..36587c81 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -18,7 +18,10 @@
 //****************************************************************************//
 
 /** @file
-@brief declaration of HPCombi::epu8 */
+@brief declaration of HPCombi::epu8.
+
+Contains renaming of some low level functions,
+eg simde_mm_testz_si128(a,a) → is_all_zero(a) */
 
 #ifndef HPCOMBI_EPU8_HPP_
 #define HPCOMBI_EPU8_HPP_
@@ -44,7 +47,14 @@ operator"" _u8(unsigned long long arg) noexcept {  // NOLINT
     return static_cast<uint8_t>(arg);
 }
 
-/// SIMD vector of 16 unsigned bytes
+/**
+epu8 stands for *Extended Packed Unsigned, grouped by 8 bits*;
+this is the low level type chosen by Intel for their API to intrinsics,
+ie a SIMD vector of 16 unsigned bytes (16×8 = 128bits).
+Functions using this type uses semantically equivalent types,
+eg a _m128 which is 2 vect of 64bits.
+a flag tells the compiler to silently consider those types equivalent.
+ */
 using epu8 = uint8_t __attribute__((vector_size(16)));
 
 static_assert(alignof(epu8) == 16,
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index b0ff471a..0b177517 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -18,7 +18,12 @@
 //****************************************************************************//
 
 /** @file
-@brief declaration of HPCombi::PTransf16, HPCombi::Transf16, HPCombi::PPerm16 and HPCombi::Perm16. */
+@brief declaration of
+\ref HPCombi::PTransf16 "PTransf16",
+\ref HPCombi::Transf16  "Transf16",
+\ref HPCombi::PPerm16   "PPerm16" and
+\ref HPCombi::Perm16    "Perm16"
+*/
 
 #ifndef HPCOMBI_PERM16_HPP_
 #define HPCOMBI_PERM16_HPP_
@@ -44,9 +49,9 @@ struct Perm16;
 struct PTransf16;
 struct Transf16;
 
-/** Partial transformation of @f$\{0\dots 15\}@f$
- *
- */
+/** Partial transformation of @f$\{0\dots 15\}@f$; see HPCombi::Transf16;
+partial means it might not be defined everywhere.
+Undefined images are encoded as 0xFF. */
 struct alignas(16) PTransf16 : public Vect16 {
     static constexpr size_t size() { return 16; }
 
@@ -119,13 +124,10 @@ struct alignas(16) PTransf16 : public Vect16 {
     uint8_t nb_fix_points() const;
 };
 
-/** Full transformation of @f$\{0\dots 15\}@f$.
- *
- * A transformation is a mapping of a set of n elements *into* itself.
- * I.e. as opposed to a permutation, it is not necessarily injective.
- * Here n=16.
- *
- */
+/** Full transformation of @f$\{0\dots 15\}@f$:
+a transformation is a mapping of a set of n elements *into* itself;
+ie as opposed to a permutation, it is not necessarily injective.
+Here n is hard-coded to 16. */
 struct Transf16 : public PTransf16 {
     Transf16() = default;
     constexpr Transf16(const Transf16 &v) = default;
@@ -152,7 +154,9 @@ struct Transf16 : public PTransf16 {
     explicit operator uint64_t() const;
 };
 
-//! Partial permutation of @f$\{0, \dots, 15\}@f$
+/** Partial permutation of @f$\{0\dots 15\}@f$; see HPCombi::Perm16;
+partial means it might not be defined everywhere (but where it's defined, it's injective).
+Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
     PPerm16() = default;
     constexpr PPerm16(const PPerm16 &v) = default;
@@ -209,9 +213,9 @@ struct PPerm16 : public PTransf16 {
     PPerm16 left_one() const { return PTransf16::left_one(); }
 };
 
-/** Permutations of @f$\{0\dots 15\}@f$
- *
- * A permutation is a bijective mapping of a set of n elements onto itself. Here n=16.
+/** Permutations of @f$\{0\dots 15\}@f$:
+ * A permutation is a bijective mapping of a set of n elements onto itself.
+ * Here n is hard-coded to 16.
  */
 struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
     Perm16() = default;
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index b641abd6..9335a3f9 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -38,6 +38,13 @@
 
 namespace HPCombi {
 
+/** Vanilla (ie NOT optimized) implementation of a permutation, used to check for test correctness and as baseline to measure speedup.
+Implemented as an std array, so the permutation is not necessarily of size n=16.
+PermGeneric<16> should implement as much as possible of Perm16 (currently not everything due to lack of time/need).
+No optimisation, so prefer to use Perm16.
+
+About Expo, see comment on HPCombi::VectGeneric.
+*/
 template <size_t Size, typename Expo = uint8_t>
 struct PermGeneric : public VectGeneric<Size, Expo> {
     using vect = VectGeneric<Size, Expo>;
diff --git a/include/hpcombi/power.hpp b/include/hpcombi/power.hpp
index eb621c4a..a98db24f 100644
--- a/include/hpcombi/power.hpp
+++ b/include/hpcombi/power.hpp
@@ -17,23 +17,31 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
+
 /** @file
-@brief Generic compile-time exponentiation algorithm.
+@brief  Generic compile-time unrolling of the fast exponentiation algorithm.
+
+Allows to write expressions such as
+- @c pow<23>(2.5) : entirely computed at compile time
+- @c pow<n>(x) expanded at compile time to a O(log n) long sequence of multiplications.
 
-The goal of this file is to be able to write expressions such as @c
-pow<23>(2.5) or @c pow<n>(x) where the first expression is entirely
-computed as compile time and the second one is expanded also as compile
-time to a O(log n) long sequence of multiplication. Furthermore such
-expression not only works for numbers for for any type where there is a
+Such expressions work for numbers but also for any type where there is a
 neutral element and an associative (non necessarily commutative) product,
-namely what mathematicians call \e monoids. These include for example,
+ie what mathematicians call \e monoids.
+These include for example
 strings where the neutral element is the empty string and the product is
 the concatenation.
 
-see HPCombi::power_helper::Monoid<std::string>
+See HPCombi::power_helper::Monoid<std::string>
+
+The algorithm used here is based on the base-2 representation of n,
+it is a 2-approximation of the optimum number of multiplications.
+The general problem is called *addition chain* and one can sometimes do better,
+eg on Fibonacci numbers, use rather the Fibonacci recurrence relation
+to choose which products to compute.
 
 @example stringmonoid.cpp
-This is an example of how to use pow with a non numerical Monoid.
+how to use pow with a non numerical Monoid.
 */
 
 #ifndef HPCOMBI_POWER_HPP_
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index 20197040..f9a0b2ea 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -34,6 +34,7 @@
 
 namespace HPCombi {
 
+/** Vector of 16 bytes, with some optimized methods, superclass of HPCombi::Transf16. */
 struct alignas(16) Vect16 {
     static constexpr size_t size() { return 16; }
     using array = typename decltype(Epu8)::array;
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index 381b941e..65cf4dab 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -46,7 +46,12 @@ std::array<Expo, Size> sorted_vect(std::array<Expo, Size> v) {
     return v;
 }
 
-/** A generic class for combinatorial integer vectors.
+/** \ref HPCombi::VectGeneric "VectGeneric" is to \ref HPCombi::Vect16 "Vect16"
+what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16 "Perm16";
+see \ref HPCombi::PermGeneric "PermGeneric".
+
+HPCombi started as a library to manipulate monomials on several variables,
+ie a tuple of *expo*nents. The elements of arrays were thus named Expo.
  */
 template <size_t Size, typename Expo = uint8_t> struct VectGeneric {
     static constexpr size_t size() { return Size; }

From 588774aaa53a7c0f12a5421b4e641b56dff9aca0 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 12:41:26 +0100
Subject: [PATCH 10/15] doc: misc

---
 include/hpcombi/epu8.hpp    |  2 +-
 include/hpcombi/hpcombi.hpp |  3 ++-
 include/hpcombi/perm16.hpp  | 15 ++++++++++++---
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 36587c81..b13301b8 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -51,7 +51,7 @@ operator"" _u8(unsigned long long arg) noexcept {  // NOLINT
 epu8 stands for *Extended Packed Unsigned, grouped by 8 bits*;
 this is the low level type chosen by Intel for their API to intrinsics,
 ie a SIMD vector of 16 unsigned bytes (16×8 = 128bits).
-Functions using this type uses semantically equivalent types,
+Functions using this type use semantically equivalent types,
 eg a _m128 which is 2 vect of 64bits.
 a flag tells the compiler to silently consider those types equivalent.
  */
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index 352c10df..fcb094a4 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -83,5 +83,6 @@ Data structure should preserve locality. You might want to compute some stats on
 This lib is implemented with speed in mind, not code safety.
 Eg. there are no checks when building a permutation, which could be invalid (like non injective).
 
-We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html).
+We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html),
+esp. at classes HPCombi::Perm16 and HPCombi::BMat8.
 */
\ No newline at end of file
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index 0b177517..f4098f01 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -195,6 +195,7 @@ struct PPerm16 : public PTransf16 {
      * @verbatim {0,0xFF,2,1,3,5,6,0xFF,8,9,0xFF,10,12,0xFF,0xFF,0xFF}
      * @endverbatim
      */
+
     /** @copydoc common_inverse_pperm
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
@@ -253,42 +254,50 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * Returns
      * @verbatim {0,4,2,1,3,5,6,7,8,9,10,11,12,13,14,15} @endverbatim
      */
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
      */
     Perm16 inverse_ref() const;
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
      */
     Perm16 inverse_arr() const;
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *  Insert the identity in the least significant bits and sort using a
-     *  sorting network. The number of round of the optimal sorting network is
+     *  sorting network. The number of rounds of the optimal sorting network is
      *  open as far as I know, therefore the complexity is unknown.
      */
     Perm16 inverse_sort() const;
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
      */
     Perm16 inverse_find() const { return permutation_of(v, one()); }
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *
-     * Raise \e *this to power @f$\text{LCM}(1, 2, ..., n) - 1@f$ so complexity
-     * is in @f$O(log (\text{LCM}(1, 2, ..., n) - 1)) = O(n)@f$
+     * Use HPCombi::pow to
+     * raise \e *this to power @f$\text{LCM}(1, 2, ..., n) - 1@f$ so complexity
+     * is @f$O(\log (\text{LCM}(1, 2, ..., n) - 1)) = O(n)@f$
      */
     Perm16 inverse_pow() const;
+
     /** @copydoc common_inverse
      *  @par Algorithm:
      *  Compute power from @f$n/2@f$ to @f$n@f$, when @f$\sigma^k(i)=i@f$ then
      *  @f$\sigma^{-1}(i)=\sigma^{k-1}(i)@f$. Complexity @f$O(n)@f$
      */
     Perm16 inverse_cycl() const;
+
     /** @copydoc common_inverse
      *
      *  Frontend method: currently aliased to #inverse_cycl */

From ad17e3c0589e1f6111c1a5c856e73147739f1e48 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 14:05:54 +0100
Subject: [PATCH 11/15] typo

---
 include/hpcombi/epu8.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index b13301b8..c31fbf1d 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -143,7 +143,7 @@ inline epu8 sort8_perm(epu8 &a) noexcept;
  * @brief Merge two sorted epu8
  * @details
  * @param a, b: two #HPCombi::epu8
- * after executing merge, \c a and \c are sorted \c a[15] <= \c b[0]
+ * after executing merge, \c a and \c b are sorted and \c a[15] <= \c b[0]
  */
 /** @copydoc common_merge
  *  @par Algorithm: bitonic merge sorting network

From 2a1f8dcc3c9606d365a8737a42a8f419ae9353aa Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 14:46:58 +0100
Subject: [PATCH 12/15] doc: DRY. remove the @copydoc's

Let the reader know they have already read some text
(and don't need to read this doc),
just point to it if they need to read it again.
---
 include/hpcombi/epu8.hpp   | 318 ++++++++++++++++++++-----------------
 include/hpcombi/perm16.hpp | 123 +++++++-------
 2 files changed, 230 insertions(+), 211 deletions(-)

diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index c31fbf1d..7b616227 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -139,37 +139,35 @@ inline epu8 sort_perm(epu8 &a) noexcept;
  */
 inline epu8 sort8_perm(epu8 &a) noexcept;
 
-/** @class common_merge
+/**
  * @brief Merge two sorted epu8
  * @details
  * @param a, b: two #HPCombi::epu8
  * after executing merge, \c a and \c b are sorted and \c a[15] <= \c b[0]
- */
-/** @copydoc common_merge
- *  @par Algorithm: bitonic merge sorting network
+ * @par Algorithm: bitonic merge sorting network
  */
 inline void merge(epu8 &a, epu8 &b) noexcept;
 
-/** @class common_permutation_of
- * @brief Find if a vector is a permutation of one other
- * @details
- * @param a, b: two #HPCombi::epu8
- * @returns a #HPCombi::epu8
- * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
-     if \c b[i] appears exactly once in \c a, or undefined if not.
- */
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_permutation_of
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
     @par Algorithm: uses string matching cpmestrm intrinsics
  */
 inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 #endif
-/** @copydoc common_permutation_of
+
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
     @par Algorithm: reference implementation
  */
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
-/** @copydoc common_permutation_of
-    @par Algorithm: architecture dependent
+
+/**
+ * @brief Find if a vector is a permutation of one other
+ * @details
+ * @param a, b: two #HPCombi::epu8
+ * @returns a #HPCombi::epu8
+ * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
+     if \c b[i] appears exactly once in \c a, or undefined if not.
+ * @par Algorithm: architecture dependent
  */
 inline epu8 permutation_of(epu8 a, epu8 b) noexcept;
 
@@ -194,233 +192,273 @@ inline epu8 random_epu8(uint16_t bnd);
  */
 inline epu8 remove_dups(epu8 a, uint8_t repl = 0) noexcept;
 
-/** @class common_horiz_sum
- * @brief Horizontal sum of a  #HPCombi::epu8
- * @details
- * @returns the horizontal sum of the input
- * @par Example:
- * @code
- * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
- * @endcode
- * Returns `110`
- * @warning The result is supposed to fit in a \c uint8_t
- */
-/** @copydoc common_horiz_sum
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
+
 inline uint8_t horiz_sum_ref(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
+
 inline uint8_t horiz_sum_gen(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_sum4(epu8) noexcept;
-/** @copydoc common_horiz_sum
+
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_sum3(epu8) noexcept;
-/** @copydoc common_horiz_sum */
-inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
 
-/** @class common_partial_sums
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal sum of a #HPCombi::epu8
  * @details
- * @returns the partials sums of the input
+ * @returns the horizontal sum of the input
  * @par Example:
  * @code
- * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
+ * Returns `110`
+ * @warning The result is supposed to fit in a \c uint8_t
  */
-/** @copydoc common_partial_sums
+inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_sums_ref(epu8) noexcept;
-/** @copydoc common_partial_sums
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_sums_gen(epu8) noexcept;
-/** @copydoc common_partial_sums
+
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_sums_round(epu8) noexcept;
-/** @copydoc common_partial_sums */
-inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
 
-/** @class common_horiz_max
- * @brief Horizontal sum of a  #HPCombi::epu8
+/**
+ * @brief Horizontal partial sum of a #HPCombi::epu8
  * @details
- * @returns the horizontal sum of the input
+ * @returns the partial sums of the input
  * @par Example:
  * @code
- * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
+ * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `12`
+ * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
  */
-/** @copydoc common_horiz_max
+inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_max_ref(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_max_gen(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_max4(epu8) noexcept;
-/** @copydoc common_horiz_max
+
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_max3(epu8) noexcept;
-/** @copydoc common_horiz_max */
-inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
 
-/** @class common_partial_max
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal max of a #HPCombi::epu8
  * @details
- * @returns the partials max of the input
+ * @returns the horizontal max of the input
  * @par Example:
  * @code
- * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
  * @endcode
- * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
+ * Returns `12`
  */
-/** @copydoc common_partial_max
+inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_max_ref(epu8) noexcept;
-/** @copydoc common_partial_max
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_max_gen(epu8) noexcept;
-/** @copydoc common_partial_max
+
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_max_round(epu8) noexcept;
-/** @copydoc common_partial_max */
-inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
 
-/** @class common_horiz_min
- * @brief Horizontal sum of a  #HPCombi::epu8
+/**
+ * @brief Horizontal partial max of a #HPCombi::epu8
  * @details
- * @returns the horizontal sum of the input
+ * @returns the partial max of the input
  * @par Example:
  * @code
- * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
+ * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `1`
+ * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
  */
-/** @copydoc common_horiz_min
+inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_min_ref(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_min_gen(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_min4(epu8) noexcept;
-/** @copydoc common_horiz_min
+
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
 inline uint8_t horiz_min3(epu8) noexcept;
-/** @copydoc common_horiz_min */
-inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
 
-/** @class common_partial_min
- * @brief Horizontal partial sum of a #HPCombi::epu8
+/**
+ * @brief Horizontal min of a #HPCombi::epu8
  * @details
- * @returns the partials min of the input
+ * @returns the horizontal min of the input
  * @par Example:
  * @code
- * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
  * @endcode
- * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
- */
-/** @copydoc common_partial_min
+ * Returns `1`
+*/
+inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_min_ref(epu8) noexcept;
-/** @copydoc common_partial_min
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_min_gen(epu8) noexcept;
-/** @copydoc common_partial_min
+
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline epu8 partial_min_round(epu8) noexcept;
-/** @copydoc common_partial_min */
-inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
 
-/** @class common_eval16
- * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15 appears in the input.
+/**
+ * @brief Horizontal partial min of a #HPCombi::epu8
  * @details
- * @param v : a #HPCombi::epu8
- * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
- *     \c r[i] is the number of occurrence of \c i in the input \c v
+ * @returns the partial min of the input
  * @par Example:
  * @code
- * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
  * @endcode
- * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
- * @warning The entries larger than 15 are ignored
+ * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
  */
-/** @copydoc common_eval16
+inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 eval16_ref(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and cast to array
  */
 inline epu8 eval16_arr(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using cyclic shifting
  */
 inline epu8 eval16_cycle(epu8 v) noexcept;
-/** @copydoc common_eval16
+
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using popcount
  */
 inline epu8 eval16_popcount(epu8 v) noexcept;
-/** @copydoc common_eval16 */
+
+/**
+ * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15 appears in the input.
+ * @details
+ * @param v : a #HPCombi::epu8
+ * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
+ *     \c r[i] is the number of occurrence of \c i in the input \c v
+ * @par Example:
+ * @code
+ * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
+ * @endcode
+ * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
+ * @warning The entries larger than 15 are ignored
+ */
 inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
 
-/** @class common_first_diff
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+ *  @par Algorithm:
+ *  Reference @f$O(n)@f$ algorithm using loop and indexed access
+ */
+inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
+#ifdef SIMDE_X86_SSE4_2_NATIVE
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+ *  @par Algorithm:
+ *  Using \c cmpestri instruction
+ */
+inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
+#endif
+
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+ *  @par Algorithm:
+ *  Using vector comparison and mask
+ */
+inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
+/**
  * @brief The first difference between two #HPCombi::epu8
  * @details
  * @param a, b : two #HPCombi::epu8
@@ -438,29 +476,31 @@ inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
  * `first_diff(a, b, 7)` returns `3`.
  * @warning `bound` is assumed to be smaller or equal than 16
  */
-/** @copydoc common_first_diff
+inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
+    return first_diff_mask(a, b, bound);
+}
+
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
-inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
+
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_first_diff
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
-inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
+inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
-/** @copydoc common_first_diff
+
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
-inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
-/** @copydoc common_first_diff */
-inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
-    return first_diff_mask(a, b, bound);
-}
+inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
-/** @class common_last_diff
+/**
  * @brief The last difference between two #HPCombi::epu8
  * @details
  * @param a, b : #HPCombi::epu8
@@ -478,24 +518,6 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
  * `last_diff(a, b, 7)` returns `3`.
  * @warning `bound` is assumed to be smaller or equal than 16
  */
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Reference @f$O(n)@f$ algorithm using loop and indexed access
- */
-inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
-#ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Using \c cmpestri instruction
- */
-inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
-#endif
-/** @copydoc common_last_diff
- *  @par Algorithm:
- *  Using vector comparison and mask
- */
-inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16) noexcept;
-/** @copydoc common_last_diff */
 inline uint64_t last_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
     return last_diff_mask(a, b, bound);
 }
@@ -582,7 +604,23 @@ inline bool is_transformation(epu8 v, const size_t k = 16) noexcept;
  */
 inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
 
-/** @class common_is_permutation
+#ifdef SIMDE_X86_SSE4_2_NATIVE
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+    @par Algorithm: uses string matching cpmestri intrinsics
+ */
+inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
+#endif
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+    @par Algorithm: sort the vector and compare to identity
+ */
+inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
+
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+    @par Algorithm: uses evaluation
+ */
+inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
+
+/**
  * @details
  * @returns whether \c *this is a permutation.
  * @param v the vector to test
@@ -591,28 +629,12 @@ inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
  * If \c *this is a permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
  * it should be completed to a permutation of @f$0\dots 15@f$
  * by adding fixed points. That is the values @f$i\geq n@f$ should be
- * mapped to themself.
+ * mapped to themselves.
  * @par Example:
  * The permutation
  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 3 1 4 \end{matrix}@f$
  * is encoded by the array {2,0,5,3,1,4,6,7,8,9,10,11,12,13,14,15}
- */
-#ifdef SIMDE_X86_SSE4_2_NATIVE
-/** @copydoc common_is_permutation
-    @par Algorithm: uses string matching cpmestri intrinsics
- */
-inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
-#endif
-/** @copydoc common_is_permutation
-    @par Algorithm: sort the vector and compare to identity
- */
-inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
-/** @copydoc common_is_permutation
-    @par Algorithm: uses evaluation
- */
-inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
-/** @copydoc common_is_permutation
-    @par Algorithm: architecture dependent
+ * @par Algorithm: architecture dependent
  */
 inline bool is_permutation(epu8 v, const size_t k = 16) noexcept;
 
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index f4098f01..d6207490 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -180,7 +180,7 @@ struct PPerm16 : public PTransf16 {
         return this->PTransf16::operator*(p);
     }
 
-    /** @class common_inverse_pperm
+    /**
      * @brief The inverse of a partial permutation
      * @details
      * @returns the inverse of \c *this. The inverse of @f$p@f$ is the unique
@@ -194,15 +194,13 @@ struct PPerm16 : public PTransf16 {
      * Returns
      * @verbatim {0,0xFF,2,1,3,5,6,0xFF,8,9,0xFF,10,12,0xFF,0xFF,0xFF}
      * @endverbatim
-     */
-
-    /** @copydoc common_inverse_pperm
-     *  @par Algorithm:
-     *  @f$O(n)@f$ algorithm using reference cast to arrays
+     * @par Algorithm:
+     * @f$O(n)@f$ algorithm using reference cast to arrays
      */
     PPerm16 inverse_ref() const;
+
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-    /** @copydoc common_inverse_pperm
+    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a different algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
@@ -242,8 +240,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
     //! Construct a permutations from its 64 bits compressed.
     explicit Perm16(uint64_t compressed) : Transf16(compressed) {}
 
-    /** @class common_inverse
-     * @brief The inverse permutation
+    /** @brief The inverse permutation
+     *
      * @details
      * @returns the inverse of \c *this
      * @par Example:
@@ -253,21 +251,24 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * @endcode
      * Returns
      * @verbatim {0,4,2,1,3,5,6,7,8,9,10,11,12,13,14,15} @endverbatim
-     */
 
-    /** @copydoc common_inverse
+     * Frontend method: currently aliased to #inverse_cycl */
+    Perm16 inverse() const { return inverse_cycl(); }
+
+
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
      */
     Perm16 inverse_ref() const;
 
-    /** @copydoc common_inverse
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
      */
     Perm16 inverse_arr() const;
 
-    /** @copydoc common_inverse
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  Insert the identity in the least significant bits and sort using a
      *  sorting network. The number of rounds of the optimal sorting network is
@@ -275,14 +276,14 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_sort() const;
 
-    /** @copydoc common_inverse
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
      */
     Perm16 inverse_find() const { return permutation_of(v, one()); }
 
-    /** @copydoc common_inverse
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *
      * Use HPCombi::pow to
@@ -291,18 +292,13 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_pow() const;
 
-    /** @copydoc common_inverse
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  Compute power from @f$n/2@f$ to @f$n@f$, when @f$\sigma^k(i)=i@f$ then
      *  @f$\sigma^{-1}(i)=\sigma^{k-1}(i)@f$. Complexity @f$O(n)@f$
      */
     Perm16 inverse_cycl() const;
 
-    /** @copydoc common_inverse
-     *
-     *  Frontend method: currently aliased to #inverse_cycl */
-    Perm16 inverse() const { return inverse_cycl(); }
-
     /** The elementary transposition exchanging @f$i@f$ and @f$i+1@f$ */
     static Perm16 elementary_transposition(uint64_t i);
     /** A random permutation of size @f$n@f$*/
@@ -312,7 +308,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     static Perm16 unrankSJT(int n, int r);
 
-    /** @class common_lehmer
+
+    /**
      * @brief The Lehmer code of a permutation
      * @details
      * @returns the Lehmer code of \c *this
@@ -323,24 +320,24 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * @endcode
      * Returns
      * @verbatim {0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0} @endverbatim
+     * @par Algorithm:
+     * Fast @f$O(n)@f$ algorithm using vector comparison
      */
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
+    epu8 lehmer() const;
+
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+     * @par Algorithm:
+     * Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     epu8 lehmer_ref() const;
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
+
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+     * @par Algorithm:
+     * Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
      */
     epu8 lehmer_arr() const;
-    /** @copydoc common_lehmer
-     *  @par Algorithm:
-     *  Fast @f$O(n)@f$ algorithm using vector comparison
-     */
-    epu8 lehmer() const;
 
-    /** @class common_length
+    /** 
      * @brief The Coxeter length (ie: number of inversion) of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -350,25 +347,25 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.length()
      * @endcode
      * Returns @verbatim 4 @endverbatim
+     *  @par Algorithm:
+     *  @f$O(n)@f$ using vector lehmer and fast horizontal sum
      */
-    /** @copydoc common_length
+    uint8_t length() const;
+
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     uint8_t length_ref() const;
-    /** @copydoc common_length
+
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access after
      *     a cast to \c std::array
      */
     uint8_t length_arr() const;
-    /** @copydoc common_length
-     *  @par Algorithm:
-     *  @f$O(n)@f$ using vector lehmer and fast horizontal sum
-     */
-    uint8_t length() const;
 
-    /** @class common_nb_descent
+    /**
      * @brief The number of descent of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -378,17 +375,16 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.length()
      * @endcode
      * Returns @verbatim 2 @endverbatim
+     *  @par Algorithm:
+     *  Reference @f$O(1)@f$ using vector shift and comparison
      */
-    /** @copydoc common_nb_descent
+    uint8_t nb_descents() const;
+
+    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a loop
      */
     uint8_t nb_descents_ref() const;
-    /** @copydoc common_nb_descent
-     *  @par Algorithm:
-     *  Reference @f$O(1)@f$ using vector shift and comparison
-     */
-    uint8_t nb_descents() const;
 
     /** The set partition of the cycles of a permutation
      * @details
@@ -406,7 +402,7 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 cycles_partition() const;
 
-    /** @class common_nb_cycles
+    /**
      * @brief The number of cycles of a permutation
      * @details
      * @returns the number of cycles of \c *this
@@ -416,23 +412,24 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.nb_cycles()
      * @endcode
      * Returns @verbatim 10 @endverbatim
+     *  @par Algorithm: aliased to #nb_cycles_unroll
      */
-    /** @copydoc common_nb_cycles
+    uint8_t nb_cycles() const { return nb_cycles_unroll(); }
+
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a boolean vector
      */
     uint8_t nb_cycles_ref() const;
-    /** @copydoc common_nb_cycles
+
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(\log(n))@f$ using #cycles_partition
      */
     uint8_t nb_cycles_unroll() const;
-    /** @copydoc common_nb_cycles
-     *  @par Algorithm: aliased to #nb_cycles_unroll
-     */
-    uint8_t nb_cycles() const { return nb_cycles_unroll(); }
 
-    /** @class common_left_weak_leq
+
+    /**
      * @brief Compare two permutations for the left weak order
      * @par Example:
      * @code
@@ -440,22 +437,22 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * x.left_weak_leq(y)
      * @endcode
      * Returns @verbatim true @endverbatim
+     *  @par Algorithm:
+     *  @f$O(n)@f$ algorithm using length
      */
-    /** @copydoc common_left_weak_leq
+    bool left_weak_leq(Perm16 other) const;
+
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ testing inclusion of inversions one by one
      */
     bool left_weak_leq_ref(Perm16 other) const;
-    /** @copydoc common_left_weak_leq
+
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ with vectorized test of inclusion
      */
     bool left_weak_leq_length(Perm16 other) const;
-    /** @copydoc common_left_weak_leq
-     *  @par Algorithm:
-     *  @f$O(n)@f$ algorithm using length
-     */
-    bool left_weak_leq(Perm16 other) const;
 };
 
 ///////////////////////////////////////////////////////////////////////////////

From 0f36ef6efae75c1108e1652a3a77943548529f4c Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 16:35:29 +0100
Subject: [PATCH 13/15] doc: misc

---
 TODO.txt                         |  3 ++-
 include/hpcombi/epu8.hpp         | 11 ++++++++---
 include/hpcombi/hpcombi.hpp      | 28 +++++++++++++++++++++++-----
 include/hpcombi/perm_generic.hpp |  2 +-
 4 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 229d50df..9109b089 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,5 +2,6 @@
 - https://cmake.org/Wiki/CMake:How_To_Write_Platform_Checks
 - https://stackoverflow.com/questions/11944060/how-to-detect-target-architecture-using-cmake
 
-
 Add method data in perm16 and perm_generic
+
+Document the examples: e.g., for each file foo.cpp in examples/, add "@example foo.cpp" to a relevant header in include/hpcombi/.
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 7b616227..8c47b5ef 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -79,22 +79,27 @@ inline bool equal(epu8 a, epu8 b) noexcept {
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) noexcept { return !equal(a, b); }
 
-/** Permuting a #HPCombi::epu8 */
+/** Apply a permutation \c b on the vector \c a: for i=0..16 {result[i] = a[b[i]} */
 inline epu8 permuted_ref(epu8 a, epu8 b) noexcept;
-/** Permuting a #HPCombi::epu8 */
+
+/** Same as \ref HPCombi::permuted_ref "permuted_ref"
+but with an optimized implementation using intrinsics. */
 inline epu8 permuted(epu8 a, epu8 b) noexcept {
     return simde_mm_shuffle_epi8(a, b);
 }
+
 /** Left shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_right(epu8 a) noexcept {
     return simde_mm_bslli_si128(a, 1);
 }
+
 /** Right shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_left(epu8 a) noexcept { return simde_mm_bsrli_si128(a, 1); }
+
 /** Reverting a #HPCombi::epu8 */
 inline epu8 reverted(epu8 a) noexcept { return permuted(a, Epu8.rev()); }
 
@@ -161,7 +166,7 @@ inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
 
 /**
- * @brief Find if a vector is a permutation of one other
+ * @brief Find if a vector is a permutation of another one
  * @details
  * @param a, b: two #HPCombi::epu8
  * @returns a #HPCombi::epu8
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index fcb094a4..c4cacad5 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -72,11 +72,6 @@ Cycle type of a permutation | 8.94
 
 \section sec_tips Tips to the user
 
-There is no parallelisation here. To use parallelism with this lib, see for instance:
-- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
-([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
-- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
-
 Note that memory access can become a problem. It you store many things, most of the time will be spent in fetching from RAM, not computing.
 Data structure should preserve locality. You might want to compute some stats on data structure usage and write custom ones.
 
@@ -85,4 +80,27 @@ Eg. there are no checks when building a permutation, which could be invalid (lik
 
 We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html),
 esp. at classes are HPCombi::Perm16 and HPCombi::BMat8.
+
+\section Parallelism
+There is no parallelisation here. To use parallelism with this lib, see for instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
+
+Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
+Intel decided not to maintain it anymore so its deprecated.
+OpencilK is an open source project to continue it.
+
+We tested OpenMP and it was 2 orders of magnitude slower.
+
+OpencilK adds the keyword `spawn`,
+which adds a special tag to the stack and launches a recursive call.
+If a thread finishes its work, it will look at other threads' stacks and steal their work.
+The value of Cilk is that recursive calls cost only 4 or 5 times more,
+much faster than launching true threads
+(which would take 6-7 orders of magnitude more time to create, measured in μs).
+
+OpencilK provides some primitives for concurrent access to data.
+It guarantees the semantics of serial execution.
+
 */
\ No newline at end of file
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index 9335a3f9..db441266 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -40,7 +40,7 @@ namespace HPCombi {
 
 /** Vanilla (ie NOT optimized) implementation of a permutation, used to check for test correctness and as baseline to measure speedup.
 Implemented as an std array, so the permutation is not necessarily of size n=16.
-PermGeneric<16> should implment as much as possibles of Perm16 (currently not everything due to lack of time/need).
+PermGeneric<16> should implement as much as possibles of Perm16 (currently not everything due to lack of time/need).
 No optimisation, so prefer to use Perm16.
 
 About Expo, see comment on HPCombi::VectGeneric.

From 9de3055c46907a9f00d8321451885c8fc40e0e41 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Tue, 17 Dec 2024 12:07:09 +0100
Subject: [PATCH 14/15] proofread the pull request

---
 README.md                   |  7 +++----
 include/hpcombi/epu8.hpp    |  6 +++---
 include/hpcombi/hpcombi.hpp | 27 ++++++++++++++++-----------
 include/hpcombi/perm16.hpp  | 25 ++++++++++++++-----------
 4 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 2a9dfb7e..ed2333d3 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,9 @@
 High Performance Combinatorics in C++ using vector instructions v1.0.1
 
 HPCombi is a C++17 header-only library using the SSE and AVX instruction sets,
-and some equivalents, for very fast manipulation of small combinatorial objects such
-as transformations, permutations, and boolean matrices. The goal
-of this project is to implement various new algorithms and benchmark them on
-various compiler and architectures.
+and some equivalents, for very fast manipulation of small combinatorial objects
+such as transformations, permutations, and boolean matrices. HPCombi implements
+new algorithms and benchmarks them on various compilers and architectures.
 
 HPCombi was initially designed using the SSE and AVX instruction sets, and did
 not work on machines without these instructions (such as ARM). From v1.0.1
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 8c47b5ef..c5f9b3b7 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -52,9 +52,9 @@ epu8 stands for *Extended Packed Unsigned, grouped by 8 bits*;
 this is the low level type chosen by Intel for their API to intrinsics,
 ie a SIMD vector of 16 unsigned bytes (16×8 = 128bits).
 Functions using this type use semantically equivalent types,
-eg a _m128 which is 2 vect of 64bits.
-a flag tells the compiler to silently consider those types equivalent.
- */
+eg a _m128 which is a vector containing 2 signed 64 bits integers.
+A flag tells the compiler to silently consider those types equivalent.
+*/
 using epu8 = uint8_t __attribute__((vector_size(16)));
 
 static_assert(alignof(epu8) == 16,
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index c4cacad5..b84f6fdc 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -53,9 +53,9 @@ applying a permutation on a vector only takes a few CPU cycles.
 
 Further ideas are:
 - Vectorization (MMX, SSE, AVX instructions sets) and careful memory alignment,
-- Careful memory management: avoiding all dynamic allocation during the computation,
-- Avoid all unnecessary copies (often needed to rewrite the containers),
-- Due to combinatorial explosion, sets often don’t fit in the computer’s memory or disks and are enumerated on the fly.
+- Careful memory management: avoid all dynamic allocation during the computation,
+- Avoid all unnecessary copies (it is often needed to rewrite the containers),
+- Due to combinatorial explosion, sets often don’t fit in memory or disk and are enumerated on the fly.
 
 Here are some examples,
 the speedup is in comparison to an implementation without vector instructions:
@@ -72,24 +72,29 @@ Cycle type of a permutation | 8.94
 
 \section sec_tips Tips to the user
 
-Note that memory access can become a problem. It you store many things, most of the time will be spent in fetching from RAM, not computing.
-Data structure should preserve locality. You might want to compute some stats on data structure usage and write custom ones.
+Note that memory access can become a problem.
+If your algorithm stores many things, most of the time will be spent in fetching from RAM, not computing.
+The data structures your client code uses should preserve locality.
+You might want to compute some stats on data structure usage
+(eg avg size of buckets used, lengths of lists, lifetime of objects, etc.)
+and write custom data structure optimized for your usage profile.
 
 This lib is implemented with speed in mind, not code safety.
 Eg. there are no checks when building a permutation, which could be invalid (like non injective).
 
-We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html),
-esp. at classes are HPCombi::Perm16 and HPCombi::BMat8.
+We suggest having a look, in the menus above, at Classes → [Class list](annotated.html),
+esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
 
 \section Parallelism
 There is no parallelisation here. To use parallelism with this lib, see for instance:
 - Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
-([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf),
+[DOI](https://dx.doi.org/10.1145/3115936.3115938)).
 - [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
 
 Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
-Intel decided not to maintain it anymore so its deprecated.
-OpencilK is an open source project to continue it.
+Intel decided not to maintain Cilk anymore so it is deprecated.
+[OpencilK](https://github.com/OpenCilk/) is an open source project to continue it.
 
 We tested OpenMP and it was 2 orders of magnitude slower.
 
@@ -103,4 +108,4 @@ much faster than launching true threads
 OpencilK provides some primitives for concurrent access to data.
 It guarantees the semantics of serial execution.
 
-*/
\ No newline at end of file
+*/
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index d6207490..b4e09599 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -154,7 +154,7 @@ struct Transf16 : public PTransf16 {
     explicit operator uint64_t() const;
 };
 
-/** Partial permutation of @f$\{0\dots 15\}@f$; see HPCombi::Perm16;
+/** Partial permutation of @f$\{0\dots 15\}@f$; see also HPCombi::Perm16;
 partial means it might not be defined everywhere (but where it's defined, it's injective).
 Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
@@ -255,7 +255,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * Frontend method: currently aliased to #inverse_cycl */
     Perm16 inverse() const { return inverse_cycl(); }
 
-
     /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
@@ -308,7 +307,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     static Perm16 unrankSJT(int n, int r);
 
-
     /**
      * @brief The Lehmer code of a permutation
      * @details
@@ -337,7 +335,7 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 lehmer_arr() const;
 
-    /** 
+    /**
      * @brief The Coxeter length (ie: number of inversion) of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -428,7 +426,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_cycles_unroll() const;
 
-
     /**
      * @brief Compare two permutations for the left weak order
      * @par Example:
@@ -442,13 +439,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     bool left_weak_leq(Perm16 other) const;
 
-    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ testing inclusion of inversions one by one
      */
     bool left_weak_leq_ref(Perm16 other) const;
 
-    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ with vectorized test of inclusion
      */
@@ -471,7 +470,8 @@ static_assert(std::is_trivial<Perm16>(), "Perm16 is not a trivial class !");
 namespace std {
 // Hash operators for Transf and Perm:
 
-//! This type appears in the doc because we provide a hash function for HPCombi::PTransf16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PTransf16.
 template <> struct hash<HPCombi::PTransf16> {
     //! A hash operator for #HPCombi::PTransf16
     size_t operator()(const HPCombi::PTransf16 &ar) const {
@@ -479,7 +479,8 @@ template <> struct hash<HPCombi::PTransf16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Transf16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Transf16.
 template <> struct hash<HPCombi::Transf16> {
     //! A hash operator for #HPCombi::Transf16
     size_t operator()(const HPCombi::Transf16 &ar) const {
@@ -487,7 +488,8 @@ template <> struct hash<HPCombi::Transf16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::PPerm16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PPerm16.
 template <> struct hash<HPCombi::PPerm16> {
     //! A hash operator for #HPCombi::PPerm16
     size_t operator()(const HPCombi::PPerm16 &ar) const {
@@ -495,7 +497,8 @@ template <> struct hash<HPCombi::PPerm16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Perm16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Perm16.
 template <> struct hash<HPCombi::Perm16> {
     //! A hash operator for #HPCombi::Perm16
     size_t operator()(const HPCombi::Perm16 &ar) const { return uint64_t(ar); }

From 7581f3f7b9a2eca4ff92f476be78627e98764fea Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Thu, 19 Dec 2024 17:51:55 +0100
Subject: [PATCH 15/15] git clang-format main

---
 include/hpcombi/bmat8_impl.hpp        | 132 +++++++++++++-------------
 include/hpcombi/epu8.hpp              | 116 ++++++++++++++--------
 include/hpcombi/epu8_impl.hpp         |  15 ++-
 include/hpcombi/hpcombi.hpp           |  61 +++++++-----
 include/hpcombi/perm16.hpp            |  46 +++++----
 include/hpcombi/perm16_impl.hpp       |   3 +-
 include/hpcombi/perm_generic.hpp      |   9 +-
 include/hpcombi/perm_generic_impl.hpp |   3 +-
 include/hpcombi/power.hpp             |   4 +-
 include/hpcombi/vect16.hpp            |   6 +-
 include/hpcombi/vect_generic.hpp      |   7 +-
 11 files changed, 237 insertions(+), 165 deletions(-)

diff --git a/include/hpcombi/bmat8_impl.hpp b/include/hpcombi/bmat8_impl.hpp
index 6d17585c..35dd9faa 100644
--- a/include/hpcombi/bmat8_impl.hpp
+++ b/include/hpcombi/bmat8_impl.hpp
@@ -20,7 +20,8 @@
 //****************************************************************************//
 
 /** @file
-@brief implementation of bmat8.hpp ; this file should not be included directly. */
+@brief implementation of bmat8.hpp ; this file should not be included directly.
+*/
 
 // NOLINT(build/header_guard)
 
@@ -36,71 +37,70 @@ static const constexpr std::array<uint64_t, 8> COL_MASK = {
     0x1010101010101010, 0x808080808080808,  0x404040404040404,
     0x202020202020202,  0x101010101010101};
 
-static const constexpr std::array<uint64_t, 64> BIT_MASK = {{
-    0x8000000000000000,
-    0x4000000000000000,
-    0x2000000000000000,
-    0x1000000000000000,
-    0x800000000000000,
-    0x400000000000000,
-    0x200000000000000,
-    0x100000000000000,
-    0x80000000000000,
-    0x40000000000000,
-    0x20000000000000,
-    0x10000000000000,
-    0x8000000000000,
-    0x4000000000000,
-    0x2000000000000,
-    0x1000000000000,
-    0x800000000000,
-    0x400000000000,
-    0x200000000000,
-    0x100000000000,
-    0x80000000000,
-    0x40000000000,
-    0x20000000000,
-    0x10000000000,
-    0x8000000000,
-    0x4000000000,
-    0x2000000000,
-    0x1000000000,
-    0x800000000,
-    0x400000000,
-    0x200000000,
-    0x100000000,
-    0x80000000,
-    0x40000000,
-    0x20000000,
-    0x10000000,
-    0x8000000,
-    0x4000000,
-    0x2000000,
-    0x1000000,
-    0x800000,
-    0x400000,
-    0x200000,
-    0x100000,
-    0x80000,
-    0x40000,
-    0x20000,
-    0x10000,
-    0x8000,
-    0x4000,
-    0x2000,
-    0x1000,
-    0x800,
-    0x400,
-    0x200,
-    0x100,
-    0x80,
-    0x40,
-    0x20,
-    0x10,
-    0x8,
-    0x4,
-    0x2,
-    0x1}};
+static const constexpr std::array<uint64_t, 64> BIT_MASK = {{0x8000000000000000,
+                                                             0x4000000000000000,
+                                                             0x2000000000000000,
+                                                             0x1000000000000000,
+                                                             0x800000000000000,
+                                                             0x400000000000000,
+                                                             0x200000000000000,
+                                                             0x100000000000000,
+                                                             0x80000000000000,
+                                                             0x40000000000000,
+                                                             0x20000000000000,
+                                                             0x10000000000000,
+                                                             0x8000000000000,
+                                                             0x4000000000000,
+                                                             0x2000000000000,
+                                                             0x1000000000000,
+                                                             0x800000000000,
+                                                             0x400000000000,
+                                                             0x200000000000,
+                                                             0x100000000000,
+                                                             0x80000000000,
+                                                             0x40000000000,
+                                                             0x20000000000,
+                                                             0x10000000000,
+                                                             0x8000000000,
+                                                             0x4000000000,
+                                                             0x2000000000,
+                                                             0x1000000000,
+                                                             0x800000000,
+                                                             0x400000000,
+                                                             0x200000000,
+                                                             0x100000000,
+                                                             0x80000000,
+                                                             0x40000000,
+                                                             0x20000000,
+                                                             0x10000000,
+                                                             0x8000000,
+                                                             0x4000000,
+                                                             0x2000000,
+                                                             0x1000000,
+                                                             0x800000,
+                                                             0x400000,
+                                                             0x200000,
+                                                             0x100000,
+                                                             0x80000,
+                                                             0x40000,
+                                                             0x20000,
+                                                             0x10000,
+                                                             0x8000,
+                                                             0x4000,
+                                                             0x2000,
+                                                             0x1000,
+                                                             0x800,
+                                                             0x400,
+                                                             0x200,
+                                                             0x100,
+                                                             0x80,
+                                                             0x40,
+                                                             0x20,
+                                                             0x10,
+                                                             0x8,
+                                                             0x4,
+                                                             0x2,
+                                                             0x1}};
 
 inline bool BMat8::operator()(size_t i, size_t j) const noexcept {
     HPCOMBI_ASSERT(i < 8);
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index c5f9b3b7..2d8af734 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -79,7 +79,8 @@ inline bool equal(epu8 a, epu8 b) noexcept {
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) noexcept { return !equal(a, b); }
 
-/** Apply a permutation \c b on the vector \c a: for i=0..16 {result[i] = a[b[i]} */
+/** Apply a permutation \c b on the vector \c a: for i=0..16 {result[i] =
+ * a[b[i]]} */
 inline epu8 permuted_ref(epu8 a, epu8 b) noexcept;
 
 /** Same as \ref HPCombi::permuted_ref "permuted_ref"
@@ -154,13 +155,15 @@ inline epu8 sort8_perm(epu8 &a) noexcept;
 inline void merge(epu8 &a, epu8 &b) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: uses string matching cpmestrm intrinsics
  */
 inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: reference implementation
  */
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
@@ -197,14 +200,16 @@ inline epu8 random_epu8(uint16_t bnd);
  */
 inline epu8 remove_dups(epu8 a, uint8_t repl = 0) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 
 inline uint8_t horiz_sum_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
@@ -212,13 +217,15 @@ inline uint8_t horiz_sum_ref(epu8) noexcept;
 
 inline uint8_t horiz_sum_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_sum4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -237,20 +244,23 @@ inline uint8_t horiz_sum3(epu8) noexcept;
  */
 inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_sums_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_sums_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -268,26 +278,30 @@ inline epu8 partial_sums_round(epu8) noexcept;
  */
 inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_max_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_max_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_max4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -305,20 +319,23 @@ inline uint8_t horiz_max3(epu8) noexcept;
  */
 inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_max_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_max_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -336,26 +353,30 @@ inline epu8 partial_max_round(epu8) noexcept;
  */
 inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_min_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_min_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_min4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -370,23 +391,26 @@ inline uint8_t horiz_min3(epu8) noexcept;
  * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
  * @endcode
  * Returns `1`
-*/
+ */
 inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_min_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_min_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -404,32 +428,37 @@ inline epu8 partial_min_round(epu8) noexcept;
  */
 inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 eval16_ref(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and cast to array
  */
 inline epu8 eval16_arr(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using cyclic shifting
  */
 inline epu8 eval16_cycle(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using popcount
  */
 inline epu8 eval16_popcount(epu8 v) noexcept;
 
 /**
- * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15 appears in the input.
+ * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15
+ * appears in the input.
  * @details
  * @param v : a #HPCombi::epu8
  * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
@@ -443,21 +472,24 @@ inline epu8 eval16_popcount(epu8 v) noexcept;
  */
 inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
 
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
@@ -485,21 +517,24 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
     return first_diff_mask(a, b, bound);
 }
 
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
@@ -610,17 +645,20 @@ inline bool is_transformation(epu8 v, const size_t k = 16) noexcept;
 inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: uses string matching cpmestri intrinsics
  */
 inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
 #endif
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: sort the vector and compare to identity
  */
 inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
 
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: uses evaluation
  */
 inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
diff --git a/include/hpcombi/epu8_impl.hpp b/include/hpcombi/epu8_impl.hpp
index 2cc6c872..702372c3 100644
--- a/include/hpcombi/epu8_impl.hpp
+++ b/include/hpcombi/epu8_impl.hpp
@@ -20,7 +20,8 @@
 // NOLINT(build/header_guard)
 
 /** @file
-@brief implementation of epu8.hpp ; this file should not be included directly. */
+@brief implementation of epu8.hpp ; this file should not be included directly.
+*/
 
 #include <initializer_list>
 #include <iostream>
@@ -553,7 +554,8 @@ inline std::string to_string(HPCombi::epu8 const &a) {
     return ss.str();
 }
 
-//! This type appears in the doc because we provide an equal operator for HPCombi::epu8.
+//! This type appears in the doc because we provide an equal operator for
+//! HPCombi::epu8.
 template <> struct equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -561,7 +563,8 @@ template <> struct equal_to<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a not_equal operator for HPCombi::epu8.
+//! This type appears in the doc because we provide a not_equal operator for
+//! HPCombi::epu8.
 template <> struct not_equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -569,7 +572,8 @@ template <> struct not_equal_to<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::epu8.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::epu8.
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const noexcept {
         unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
@@ -586,7 +590,8 @@ template <> struct hash<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a less operator for HPCombi::epu8.
+//! This type appears in the doc because we provide a less operator for
+//! HPCombi::epu8.
 template <> struct less<HPCombi::epu8> {
     // WARNING: due to endianness this is not lexicographic comparison,
     //          but we don't care when using in std::set.
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index b84f6fdc..af9282ce 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -39,23 +39,27 @@ and also debug.hpp, epu8.hpp, etc.*/
 
 \section readme_sec Readme
 
-You might want to have a look at [the Readme in the sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
+You might want to have a look at [the Readme in the
+sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
 
 \section sec_philo Philosophy
-This library provides high performance computations in combinatorics (hence its name).
-In practice we observe large speedups in several enumeration problems.
+This library provides high performance computations in combinatorics (hence its
+name). In practice we observe large speedups in several enumeration problems.
 
-The main idea of the library is a way to encode data as a small sequence of small integers,
-that can be handled efficiently by a creative use of vector instructions.
-For example, on the current x86 machines, small permutations (N ≤ 16) are very well handled.
-Indeed thanks to machine instructions such as PSHUFB (Packed SHUFfle Bytes),
-applying a permutation on a vector only takes a few CPU cycles.
+The main idea of the library is a way to encode data as a small sequence of
+small integers, that can be handled efficiently by a creative use of vector
+instructions. For example, on the current x86 machines, small permutations (N ≤
+16) are very well handled. Indeed thanks to machine instructions such as PSHUFB
+(Packed SHUFfle Bytes), applying a permutation on a vector only takes a few CPU
+cycles.
 
 Further ideas are:
 - Vectorization (MMX, SSE, AVX instructions sets) and careful memory alignment,
-- Careful memory management: avoid all dynamic allocation during the computation,
+- Careful memory management: avoid all dynamic allocation during the
+computation,
 - Avoid all unnecessary copies (it is often needed to rewrite the containers),
-- Due to combinatorial explosion, sets often don’t fit in memory or disk and are enumerated on the fly.
+- Due to combinatorial explosion, sets often don’t fit in memory or disk and are
+enumerated on the fly.
 
 Here are some examples,
 the speedup is in comparison to an implementation without vector instructions:
@@ -73,37 +77,42 @@ Cycle type of a permutation | 8.94
 \section sec_tips Tips to the user
 
 Note that memory access can become a problem.
-If your algorithm stores many things, most of the time will be spent in fetching from RAM, not computing.
-The data structures your client code uses should preserve locality.
-You might want to compute some stats on data structure usage
+If your algorithm stores many things, most of the time will be spent in fetching
+from RAM, not computing. The data structures your client code uses should
+preserve locality. You might want to compute some stats on data structure usage
 (eg avg size of buckets used, lengths of lists, lifetime of objects, etc.)
 and write custom data structure optimized for your usage profile.
 
 This lib is implemented with speed in mind, not code safety.
-Eg. there are no checks when building a permutation, which could be invalid (like non injective).
+E.g. there are no checks when building a permutation, which could be invalid
+(like non-injective).
 
-We suggest having a look, in the menus above, at Classes → [Class list](annotated.html),
-esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
+We suggest having a look, in the menus above, at Classes → [Class
+list](annotated.html), esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
 
 \section Parallelism
-There is no parallelisation here. To use parallelism with this lib, see for instance:
-- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
+There is no parallelisation here. To use parallelism with this lib, see for
+instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and
+Algebraic Combinatorics
 ([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf),
 [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
-- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing
+framework.
 
-Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
-Intel decided not to maintain Cilk anymore so it is deprecated.
-[OpencilK](https://github.com/OpenCilk/) is an open source project to continue it.
+Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to
+ease parallelism. Intel decided not to maintain Cilk anymore so it is
+deprecated. [OpenCilk](https://github.com/OpenCilk/) is an open source project
+to continue it.
 
 We tested OpenMP and it was 2 orders of magnitude slower.
 
 OpencilK adds the keyword `spawn`,
 which adds a special tag to the stack and launches a recursive call.
-If a thread finishes its work, it will look at other threads' stacks and steal their work.
-The value of Cilk is that recursive calls cost only 4 or 5 times more,
-much faster than launching true threads
-(which would take 6-7 orders of magnitude more time to create, measured in μs).
+If a thread finishes its work, it will look at other threads' stacks and steal
+their work. The value of Cilk is that recursive calls cost only 4 or 5 times
+more, much faster than launching true threads (which would take 6-7 orders of
+magnitude more time to create, measured in μs).
 
 OpencilK provides some primitives for concurrent access to data.
 It guarantees the semantics of serial execution.
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index b4e09599..05c4a362 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -155,8 +155,8 @@ struct Transf16 : public PTransf16 {
 };
 
 /** Partial permutation of @f$\{0\dots 15\}@f$; see also HPCombi::Perm16;
-partial means it might not be defined everywhere (but where it's defined, it's injective).
-Undefined images are encoded as 0xFF. */
+partial means it might not be defined everywhere (but where it's defined, it's
+injective). Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
     PPerm16() = default;
     constexpr PPerm16(const PPerm16 &v) = default;
@@ -200,7 +200,8 @@ struct PPerm16 : public PTransf16 {
     PPerm16 inverse_ref() const;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a different algorithm.
+    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a
+     * different algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
@@ -255,19 +256,22 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * Frontend method: currently aliased to #inverse_cycl */
     Perm16 inverse() const { return inverse_cycl(); }
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
      */
     Perm16 inverse_ref() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
      */
     Perm16 inverse_arr() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Insert the identity in the least significant bits and sort using a
      *  sorting network. The number of rounds of the optimal sorting network is
@@ -275,14 +279,16 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_sort() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
      */
     Perm16 inverse_find() const { return permutation_of(v, one()); }
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *
      * Use HPCombi::pow to
@@ -291,7 +297,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_pow() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Compute power from @f$n/2@f$ to @f$n@f$, when @f$\sigma^k(i)=i@f$ then
      *  @f$\sigma^{-1}(i)=\sigma^{k-1}(i)@f$. Complexity @f$O(n)@f$
@@ -323,13 +330,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 lehmer() const;
 
-    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
      * @par Algorithm:
      * Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     epu8 lehmer_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
      * @par Algorithm:
      * Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
      */
@@ -350,13 +359,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t length() const;
 
-    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     uint8_t length_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access after
      *     a cast to \c std::array
@@ -378,7 +389,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_descents() const;
 
-    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with
+     * a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a loop
      */
@@ -414,13 +426,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_cycles() const { return nb_cycles_unroll(); }
 
-    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a boolean vector
      */
     uint8_t nb_cycles_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(\log(n))@f$ using #cycles_partition
      */
diff --git a/include/hpcombi/perm16_impl.hpp b/include/hpcombi/perm16_impl.hpp
index 5c4d1029..4d2daed9 100644
--- a/include/hpcombi/perm16_impl.hpp
+++ b/include/hpcombi/perm16_impl.hpp
@@ -20,7 +20,8 @@
 // NOLINT(build/header_guard)
 
 /** @file
-@brief implementation of perm16.hpp ; this file should not be included directly. */
+@brief implementation of perm16.hpp ; this file should not be included directly.
+*/
 
 namespace HPCombi {
 inline PTransf16::PTransf16(std::initializer_list<uint8_t> il)
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index db441266..164cb711 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -38,10 +38,11 @@
 
 namespace HPCombi {
 
-/** Vanilla (ie NOT optimized) implementation of a permutation, used to check for test correctness and as baseline to measure speedup.
-Implemented as an std array, so the permutation is not necessarily of size n=16.
-PermGeneric<16> should implement as much as possibles of Perm16 (currently not everything due to lack of time/need).
-No optimisation, so prefer to use Perm16.
+/** Vanilla (ie NOT optimized) implementation of a permutation, used to check
+for test correctness and as baseline to measure speedup. Implemented as an std
+array, so the permutation is not necessarily of size n=16. PermGeneric<16>
+should implement as much as possible of Perm16 (currently not everything due to
+lack of time/need). No optimisation, so prefer to use Perm16.
 
 About Expo, see comment on HPCombi::VectGeneric.
 */
diff --git a/include/hpcombi/perm_generic_impl.hpp b/include/hpcombi/perm_generic_impl.hpp
index 72b5a45b..744ac3c0 100644
--- a/include/hpcombi/perm_generic_impl.hpp
+++ b/include/hpcombi/perm_generic_impl.hpp
@@ -119,7 +119,8 @@ bool PermGeneric<Size, Expo>::left_weak_leq(PermGeneric other) const {
 
 namespace std {
 
-//! This type appears in the doc because we provide a hash function for HPCombi::PermGeneric.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PermGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::PermGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::PermGeneric<Size, Expo> &ar) const {
diff --git a/include/hpcombi/power.hpp b/include/hpcombi/power.hpp
index a98db24f..6afb7b2d 100644
--- a/include/hpcombi/power.hpp
+++ b/include/hpcombi/power.hpp
@@ -17,13 +17,13 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-
 /** @file
 @brief  Generic compile-time unrolling of the fast exponentiation algorithm.
 
 Allows to write expressions such as
 - @c pow<23>(2.5) : entirely computed at compile time
-- @c pow<n>(x) expanded at compile time to a O(log n) long sequence of multiplications.
+- @c pow<n>(x) expanded at compile time to an O(log n) long sequence of
+multiplications.
 
 Such expressions work for numbers but also for any type where there is a
 neutral element and an associative (non necessarily commutative) product,
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index f9a0b2ea..d0e13b3b 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -34,7 +34,8 @@
 
 namespace HPCombi {
 
-/** Vector of 16 bytes, with some optimized methods, superclass of HPCombi::Transf16. */
+/** Vector of 16 bytes, with some optimized methods, superclass of
+ * HPCombi::Transf16. */
 struct alignas(16) Vect16 {
     static constexpr size_t size() { return 16; }
     using array = typename decltype(Epu8)::array;
@@ -117,7 +118,8 @@ inline std::ostream &operator<<(std::ostream &stream,
     return operator<<(stream, ar.v);
 }
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Vect16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Vect16.
 template <> struct hash<HPCombi::Vect16> {
     size_t operator()(const HPCombi::Vect16 &ar) const {
         return std::hash<HPCombi::epu8>{}(ar.v);
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index 65cf4dab..0927f180 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -47,8 +47,8 @@ std::array<Expo, Size> sorted_vect(std::array<Expo, Size> v) {
 }
 
 /** \ref HPCombi::VectGeneric "VectGeneric" is to \ref HPCombi::Vect16 "Vect16"
-what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16 "Perm16";
-see \ref HPCombi::PermGeneric "PermGeneric".
+what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16
+"Perm16"; see \ref HPCombi::PermGeneric "PermGeneric".
 
 HPCombi started as a library to manipulate monomials on several variables,
 ie a tuple of *expo*nents. The elements of arrays were thus named Expo.
@@ -244,7 +244,8 @@ std::ostream &operator<<(std::ostream &stream,
     return stream;
 }
 
-//! This type appears in the doc because we provide a hash function for HPCombi::VectGeneric.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::VectGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::VectGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::VectGeneric<Size, Expo> &ar) const {