
Commit

[feature/SPO_apply] Storing the map to slightly improve speed on the final contraction. This is also how we'd do it if we were going to reuse that map from P^n --> P^2n
jamesETsmith committed Nov 4, 2024
1 parent 777d8df commit e616935
Showing 2 changed files with 34 additions and 67 deletions.
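The change in a nutshell: every pairwise product h_i * h_j of Pauli strings lands on some squared string h'_a with a phase, so the products can be computed once and stored as a map a -> {(i, j, phase)} that the final contraction replays without re-multiplying strings. Below is a self-contained toy sketch of that construction; the string-based Paulis and the mul() helper are illustrative stand-ins, not the fast_pauli API.

// toy_product_map.cpp -- minimal sketch (NOT fast_pauli) of the two structures
// this commit stores: sq_idx_map (product string -> index a) and t_aij
// (per a, the list of (i, j, phase) contributions).
#include <complex>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

using cplx = std::complex<double>;

// Single-qubit products: I*P = P, P*P = I, XY = iZ, YZ = iX, ZX = iY
// (reversed order flips the sign).
std::pair<cplx, char> mul1(char p, char q)
{
    if (p == 'I')
        return {1.0, q};
    if (q == 'I')
        return {1.0, p};
    if (p == q)
        return {1.0, 'I'};
    static const std::string order = "XYZ";
    size_t a = order.find(p), b = order.find(q);
    cplx i{0.0, 1.0};
    return {(b == (a + 1) % 3) ? i : -i, order[3 - a - b]};
}

// n-qubit product: elementwise single-qubit products, phases multiplied.
std::pair<cplx, std::string> mul(std::string const &p, std::string const &q)
{
    cplx phase = 1.0;
    std::string r(p.size(), 'I');
    for (size_t k = 0; k < p.size(); ++k)
    {
        auto [ph, c] = mul1(p[k], q[k]);
        phase *= ph;
        r[k] = c;
    }
    return {phase, r};
}

int main()
{
    std::vector<std::string> strings = {"XI", "IY", "ZZ"};

    // Part 1 analogue: index every distinct product (sq_idx_map in the commit).
    std::unordered_map<std::string, size_t> sq_idx_map;
    std::vector<std::string> strings_sq;
    for (auto const &p : strings)
        for (auto const &q : strings)
        {
            auto [phase, prod] = mul(p, q);
            if (!sq_idx_map.count(prod))
            {
                sq_idx_map[prod] = strings_sq.size();
                strings_sq.push_back(prod);
            }
        }

    // Part 2 analogue: group (i, j, phase) triplets by output index a so the
    // contraction (and any later P^n --> P^2n reuse) is a lookup, not a product.
    std::vector<std::vector<std::tuple<size_t, size_t, cplx>>> t_aij(strings_sq.size());
    for (size_t i = 0; i < strings.size(); ++i)
        for (size_t j = 0; j < strings.size(); ++j)
        {
            auto [phase, prod] = mul(strings[i], strings[j]);
            t_aij[sq_idx_map[prod]].emplace_back(i, j, phase);
        }

    for (size_t a = 0; a < strings_sq.size(); ++a)
        std::cout << strings_sq[a] << ": " << t_aij[a].size() << " (i, j) pairs\n";
}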
21 changes: 1 addition & 20 deletions fast_pauli/cpp/examples/05_summed_pauli_op_sq.cpp
@@ -28,10 +28,9 @@ int main()
     //
     // User settings
     //
-    size_t const n_operators = 100;
+    size_t const n_operators = 1000;
     size_t const n_qubits = 12;
     size_t const weight = 2;
-    size_t const n_states = 100;
     using fp_type = float;
 
     //
@@ -43,24 +42,6 @@ int main()
     std::vector<std::complex<fp_type>> coeff_raw(n_paulis_per_operator * n_operators, 1);
     fp::SummedPauliOp<fp_type> summed_op{pauli_strings, coeff_raw};
 
-    //
-    // Setup states
-    //
-    size_t const dim = summed_op.dim();
-    size_t const n_ops = summed_op.n_operators();
-
-    std::vector<std::complex<fp_type>> states_raw(dim * n_states, 1);
-    std::mdspan<std::complex<fp_type>, std::dextents<size_t, 2>> states(states_raw.data(), dim, n_states);
-
-    auto new_states_raw = std::vector<std::complex<fp_type>>(dim * n_states, 0);
-    std::mdspan<std::complex<fp_type>, std::dextents<size_t, 2>> new_states(new_states_raw.data(), dim, n_states);
-
-    //
-    // Init weights (aka data)
-    //
-    std::vector<fp_type> weights_raw(n_ops * n_states, 1);
-    std::mdspan<fp_type, std::dextents<size_t, 2>> weights(weights_raw.data(), n_ops, n_states);
-
     //
     // Apply the states
     //
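With the states and weights gone, what remains of this example presumably just times the squaring itself. A minimal sketch of how the elided remainder might drive it; this body is an assumption for illustration, not the file's actual content, reusing only the summed_op, fmt, and std::chrono facilities already visible in this commit:

    // Hypothetical continuation (assumption): square() is the call being benchmarked.
    auto t0 = std::chrono::high_resolution_clock::now();
    auto summed_op_sq = summed_op.square(); // the method changed in the header below
    auto t1 = std::chrono::high_resolution_clock::now();
    fmt::print("square() took {} ms\n",
               std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count());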
80 changes: 33 additions & 47 deletions fast_pauli/cpp/include/__summed_pauli_op.hpp
@@ -204,11 +204,15 @@ template <std::floating_point T> struct SummedPauliOp
         return pauli_strings.size();
     }
 
+    /**
+     * @brief Square the SummedPauliOp, mathematically \f$ A_k \rightarrow A_k^2 = \sum_{a,b} T_{ab} A_a A_b \f$.
+     *
+     * We use the sparse structure of the A_k operators and PauliString products to calculate the new coefficients.
+     *
+     * @return SummedPauliOp<T>
+     */
     fast_pauli::SummedPauliOp<T> square() const
     {
-        using namespace std::chrono;
-        auto start = high_resolution_clock::now();
-
         // Part 1: Get the squared pauli strings
         size_t weight = std::reduce(
             pauli_strings.begin(), pauli_strings.end(), size_t(0),
@@ -223,37 +227,28 @@ template <std::floating_point T> struct SummedPauliOp
             sq_idx_map[pauli_strings_sq[i]] = i;
         }
 
-        auto end = high_resolution_clock::now();
-        fmt::print("Part 1 took {} ms\n", duration_cast<milliseconds>(end - start).count());
-
         // Part 2: Create the T_aij tensor that maps the coefficients from h_i * h_j to h'_a
-        start = high_resolution_clock::now();
-
-        // std::vector<std::complex<T>> t_aij_raw(pauli_strings_sq.size() * pauli_strings.size() *
-        // pauli_strings.size()); Tensor<3> t_aij(t_aij_raw.data(), pauli_strings_sq.size(), pauli_strings.size(),
-        // pauli_strings.size());
-
-        // #pragma omp parallel for collapse(2)
-        // for (size_t i = 0; i < pauli_strings.size(); ++i)
-        // {
-        //     for (size_t j = 0; j < pauli_strings.size(); ++j)
-        //     {
-        //         std::complex<T> phase;
-        //         fast_pauli::PauliString prod;
-        //         std::tie(phase, prod) = pauli_strings[i] * pauli_strings[j];
-        //         size_t a = sq_idx_map[prod];
-        //         t_aij(a, i, j) = phase;
-        //     }
-        // }
-
-        end = high_resolution_clock::now();
-        fmt::print("Part 2 took {} ms\n", duration_cast<milliseconds>(end - start).count());
-
-        // Part 3: Create the new coeffs tensor
-        start = high_resolution_clock::now();
+        // Create t_aij sparse tensor
+        std::vector<std::vector<std::tuple<size_t, size_t, std::complex<T>>>> t_aij(
+            pauli_strings_sq.size(), std::vector<std::tuple<size_t, size_t, std::complex<T>>>());
+
+        // Serial because we'll have a race condition on the t_aij[a] vector
+        for (size_t i = 0; i < pauli_strings.size(); ++i)
+        {
+            for (size_t j = 0; j < pauli_strings.size(); ++j)
+            {
+                std::complex<T> phase;
+                fast_pauli::PauliString prod;
+                std::tie(phase, prod) = pauli_strings[i] * pauli_strings[j];
+                size_t a = sq_idx_map[prod];
+                t_aij[a].emplace_back(i, j, phase);
+            }
+        }
+
+        // Part 3: Create the new coeffs tensor (transpose of the old one for better memory access)
         std::vector<std::complex<T>> coeffs_t_raw(coeffs.size());
         Tensor<2> coeffs_t(coeffs_t_raw.data(), coeffs.extent(1), coeffs.extent(0));
 
 #pragma omp parallel for collapse(2)
         for (size_t i = 0; i < coeffs.extent(0); ++i)
         {
@@ -263,35 +258,26 @@ template <std::floating_point T> struct SummedPauliOp
             }
         }
 
-        end = high_resolution_clock::now();
-        fmt::print("Part 3 took {} ms\n", duration_cast<milliseconds>(end - start).count());
-
         // Part 4: Contract the T_aij tensor with the coeffs to get the new coeffs
-        start = high_resolution_clock::now();
-
         std::vector<std::complex<T>> coeffs_sq_raw(pauli_strings_sq.size() * _n_operators);
         Tensor<2> coeffs_sq(coeffs_sq_raw.data(), pauli_strings_sq.size(), _n_operators);
 
-#pragma omp parallel for
-        for (size_t k = 0; k < _n_operators; ++k)
+        size_t i, j;
+        std::complex<T> phase;
+
+#pragma omp parallel for schedule(dynamic) collapse(2) private(i, j, phase)
+        for (size_t a = 0; a < pauli_strings_sq.size(); ++a)
         {
-            for (size_t i = 0; i < pauli_strings.size(); ++i)
+            for (size_t k = 0; k < _n_operators; ++k)
             {
-                for (size_t j = 0; j < pauli_strings.size(); ++j)
+                for (size_t x = 0; x < t_aij[a].size(); ++x)
                 {
-                    // coeffs_sq(a, k) += t_aij(a, i, j) * coeffs_t(k, i) * coeffs_t(k, j);
-                    std::complex<T> phase;
-                    fast_pauli::PauliString prod;
-                    std::tie(phase, prod) = pauli_strings[i] * pauli_strings[j];
-                    size_t a = sq_idx_map[prod];
+                    std::tie(i, j, phase) = t_aij[a][x];
                     coeffs_sq(a, k) += phase * coeffs_t(k, i) * coeffs_t(k, j);
                 }
             }
         }
 
-        end = high_resolution_clock::now();
-        fmt::print("Part 4 took {} ms\n", duration_cast<milliseconds>(end - start).count());
-
         return SummedPauliOp<T>(pauli_strings_sq, coeffs_sq);
     }
 
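Why the Part 4 loops flipped: with a outermost, each (a, k) cell of coeffs_sq is written by exactly one iteration, so the parallel region is race-free, and the stored triplets replace the PauliString multiplications the old k-outer version redid on every pass. Below is a standalone sketch of the same contraction, with plain row-major vectors standing in for the library's Tensor type; the names and toy sizes are illustrative, not fast_pauli code.

// Sketch only (not fast_pauli): the Part 4 contraction over stored triplets,
// coeffs_sq(a, k) = sum over (i, j, phase) in t_aij[a] of
//                   phase * coeffs_t(k, i) * coeffs_t(k, j).
#include <complex>
#include <iostream>
#include <tuple>
#include <vector>

using cplx = std::complex<float>;
using trip = std::tuple<size_t, size_t, cplx>; // (i, j, phase)

std::vector<cplx> contract(std::vector<std::vector<trip>> const &t_aij,
                           std::vector<cplx> const &coeffs_t, // [n_ops][n_strings] row-major
                           size_t n_ops, size_t n_strings)
{
    std::vector<cplx> coeffs_sq(t_aij.size() * n_ops); // [n_sq][n_ops] row-major
#pragma omp parallel for collapse(2)
    for (size_t a = 0; a < t_aij.size(); ++a)
        for (size_t k = 0; k < n_ops; ++k)
        {
            cplx acc = 0;
            for (auto const &[i, j, phase] : t_aij[a]) // replay the stored map; no Pauli products
                acc += phase * coeffs_t[k * n_strings + i] * coeffs_t[k * n_strings + j];
            coeffs_sq[a * n_ops + k] = acc;
        }
    return coeffs_sq;
}

int main()
{
    // Toy data: 2 squared strings, 2 operators, 2 original strings.
    std::vector<std::vector<trip>> t_aij = {
        {trip{0, 0, cplx{1, 0}}, trip{1, 1, cplx{1, 0}}},
        {trip{0, 1, cplx{0, 1}}, trip{1, 0, cplx{0, -1}}}};
    std::vector<cplx> coeffs_t = {1, 2, 3, 4};
    for (auto c : contract(t_aij, coeffs_t, 2, 2))
        std::cout << c << " "; // prints (5,0) (25,0) (0,0) (0,0)
    std::cout << "\n";
}

Accumulating into the local acc is an alternative to the committed private(i, j, phase) clause; both keep the loop temporaries thread-private, and the local accumulator additionally avoids re-reading coeffs_sq(a, k) from memory on every triplet.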
