From aeb2eee306b87e2a9d43ab1aecc5779d1f5f008a Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@jlab.org>
Date: Fri, 1 May 2020 18:53:51 -0400
Subject: [PATCH 1/5] Added general permute in simd_common.hpp. Added integer
 simd in avx.h, and single prec permute in avx.h

---
 avx.hpp         | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 simd_common.hpp | 20 +++++++++++
 2 files changed, 116 insertions(+)
diff --git a/avx.hpp b/avx.hpp
index e1e6bf9..ba80a1b 100644
--- a/avx.hpp
+++ b/avx.hpp
@@ -392,6 +392,102 @@ SIMD_ALWAYS_INLINE inline simd<double, simd_abi::avx> choose(
   return simd<double, simd_abi::avx>(_mm256_blendv_pd(c.get(), b.get(), a.get()));
 }
 
+
+template <>
+class simd<int, simd_abi::avx> {
+  __m256i m_value;
+ public:
+  using value_type = int;
+  using abi_type = simd_abi::avx;
+  using mask_type = simd_mask<int, abi_type>;
+  using storage_type = simd_storage<int, abi_type>;
+  SIMD_ALWAYS_INLINE inline simd() = default;
+  SIMD_ALWAYS_INLINE inline static constexpr int size() { return 8; }
+  SIMD_ALWAYS_INLINE inline simd(int value)
+    :m_value(_mm256_set1_epi32(value))
+  {}
+  SIMD_ALWAYS_INLINE inline simd(
+      int a, int b, int c, int d,
+      int e, int f, int g, int h)
+    :m_value(_mm256_setr_epi32(a, b, c, d, e, f, g, h))
+  {}
+  SIMD_ALWAYS_INLINE inline simd(
+       int a, int b, int c, int d)
+     :m_value(_mm256_setr_epi32(a, b, c, d, 0, 0, 0, 0))
+   {}
+
+  SIMD_ALWAYS_INLINE inline
+  simd(storage_type const& value) {
+    copy_from(value.data(), element_aligned_tag());
+  }
+  SIMD_ALWAYS_INLINE inline
+  simd& operator=(storage_type const& value) {
+    copy_from(value.data(), element_aligned_tag());
+    return *this;
+  }
+  template <class Flags>
+  SIMD_ALWAYS_INLINE inline simd(int const* ptr, Flags /*flags*/)
+    :m_value(::_mm256_loadu_si256((__m256i const *)ptr))
+  {}
+  SIMD_ALWAYS_INLINE inline simd(int const* ptr, int stride)
+    :simd(ptr[0],        ptr[stride],   ptr[2*stride], ptr[3*stride],
+          ptr[4*stride], ptr[5*stride], ptr[6*stride], ptr[7*stride])
+  {}
+  SIMD_ALWAYS_INLINE inline constexpr simd(__m256i const& value_in)
+    :m_value(value_in)
+  {}
+
+#if defined(__AVX2__)
+
+  SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const {
+    return simd(_mm256_mul_epi32(m_value, other.m_value));
+  }
+
+  SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const {
+    return simd(_mm256_add_epi32(m_value, other.m_value));
+  }
+  SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const {
+    return simd(_mm256_sub_epi32(m_value, other.m_value));
+  }
+
+
+  SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const {
+    return simd(_mm256_sub_epi32(_mm256_set1_epi32(0.0), m_value));
+  }
+#endif
+
+#ifdef __INTEL_COMPILER
+  SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const {
+    return simd(_mm256_div_epi32(m_value, other.m_value));
+  }
+#endif
+
+  SIMD_ALWAYS_INLINE inline void copy_from(int const* ptr, element_aligned_tag) {
+    m_value = _mm256_loadu_si256((__m256i const *)ptr);
+  }
+  SIMD_ALWAYS_INLINE inline void copy_to(int* ptr, element_aligned_tag) const {
+    _mm256_storeu_si256((__m256i *)ptr, m_value);
+  }
+  SIMD_ALWAYS_INLINE inline constexpr __m256i get() const { return m_value; }
+
+  /*
+  SIMD_ALWAYS_INLINE inline simd_mask<float, simd_abi::avx> operator<(simd const& other) const {
+    return simd_mask<float, simd_abi::avx>(_mm256_cmp_ps(m_value, other.m_value, _CMP_LT_OS));
+  }
+  SIMD_ALWAYS_INLINE inline simd_mask<float, simd_abi::avx> operator==(simd const& other) const {
+    return simd_mask<float, simd_abi::avx>(_mm256_cmp_ps(m_value, other.m_value, _CMP_EQ_OS));
+  }
+  */
+};
+
+// Specialized permute
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+inline simd<float, simd_abi::avx> permute(simd<int, simd_abi::avx> const& control,
+		simd<float, simd_abi::avx> const& a) {
+   	simd<float,simd_abi::avx> result(_mm256_permutexvar_ps(control.get(),a.get())  );
+  return result;
+}
+
 }
 
 #endif
diff --git a/simd_common.hpp b/simd_common.hpp
index 6d4d4be..bc3f5b0 100644
--- a/simd_common.hpp
+++ b/simd_common.hpp
@@ -278,4 +278,24 @@ class simd_size<simd<T, Abi>> {
   static constexpr int value = simd<T, Abi>::size();
 };
 
+
+// Generic Permute
+template <class T, class Abi>
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+inline simd<T, Abi> permute(simd<int, Abi> const& control, simd<T, Abi> const& a) {
+  T stack_a[simd<T, Abi>::size()];
+  T stack_res[simd<T,Abi>::size()];
+  int stack_control[ simd<T,Abi>::size()];
+
+  a.copy_to(stack_a, element_aligned_tag());
+  control.copy_to(stack_control, element_aligned_tag());
+
+  for (int i = 0; i < simd<T, Abi>::size(); ++i) {
+	  stack_res[i] = stack_a[ stack_control[i] ];
+  }
+  simd<T,Abi> result;
+  result.copy_from(stack_res, element_aligned_tag());
+  return result;
+}
+
 }

From 0b390a914e6c6b918bd0acf5cbe619eb7edc2602 Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@sciml2001.jlab.org>
Date: Sat, 2 May 2020 14:15:13 -0400
Subject: [PATCH 2/5] AVX512 int simd, int simd mask and permutes added

---
 avx512.hpp | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/avx512.hpp b/avx512.hpp
index f592b98..9bb5a30 100644
--- a/avx512.hpp
+++ b/avx512.hpp
@@ -397,6 +397,153 @@ SIMD_ALWAYS_INLINE inline simd<double, simd_abi::avx512> choose(
   return simd<double, simd_abi::avx512>(_mm512_mask_blend_pd(a.get(), c.get(), b.get()));
 }
 
+  // SIMD MASK FOR SIMD INT
+  // Essentially this is the same as the mask for simd<float> but I tried
+  // Deriving from that and that didn't work for me.
+  // The only difference is the line 'using simd_type='
+template <>
+class simd_mask<int, simd_abi::avx512>
+{
+  __mmask16 m_value;
+ public:
+  using value_type = bool;
+  using simd_type = simd<int, simd_abi::avx512>;
+  using abi_type = simd_abi::avx512;
+  SIMD_ALWAYS_INLINE inline simd_mask() = default;
+  SIMD_ALWAYS_INLINE inline simd_mask(bool value)
+    :m_value(-std::int16_t(value))
+  {}
+  SIMD_ALWAYS_INLINE inline static constexpr int size() { return 16; }
+  SIMD_ALWAYS_INLINE inline constexpr simd_mask(__mmask16 const& value_in)
+    :m_value(value_in)
+  {}
+  SIMD_ALWAYS_INLINE inline constexpr __mmask16 get() const { return m_value; }
+  SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const {
+    return simd_mask(_kor_mask16(m_value, other.m_value));
+  }
+  SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const {
+    return simd_mask(_kand_mask16(m_value, other.m_value));
+  }
+  SIMD_ALWAYS_INLINE inline simd_mask operator!() const {
+    return simd_mask(_knot_mask16(m_value));
+  }
+};
+
+  // An integer SIMD class for AVX512
+template <>
+class simd<int, simd_abi::avx512> {
+  __m512i m_value;
+ public:
+  SIMD_ALWAYS_INLINE simd() = default;
+  using value_type = float;
+  using abi_type = simd_abi::avx512;
+  using mask_type = simd_mask<float, abi_type>;
+  using storage_type = simd_storage<int, abi_type>;
+  SIMD_ALWAYS_INLINE inline static constexpr int size() { return 16; }
+  SIMD_ALWAYS_INLINE inline simd(int value)
+    :m_value(_mm512_set1_epi32(value))
+  {}
+  SIMD_ALWAYS_INLINE inline simd(
+      int a, int b, int c, int d,
+      int e, int f, int g, int h,
+      int i, int j, int k, int l,
+      int m, int n, int o, int p)
+    :m_value(_mm512_setr_epi32(
+          a, b, c, d, e, f, g, h,
+          i, j, k, l, m, n, o, p))
+  {}
+
+  /*
+   * CAVEAT: Loading 8 element here
+   *  sets every second element as needed
+   *  for permutes. Is this a general pattern?
+   *  This is different from AVX & AVX2 where
+   *  there is not a DP shuffle actually,
+   *  We can use the 256 bit permute from AVX512
+   *  buf I believe that expects zeros at the high
+   *  end only 
+   */
+  SIMD_ALWAYS_INLINE inline simd(
+      int a, int b, int c, int d,
+      int e, int f, int g, int h)
+    :m_value(_mm512_setr_epi32(
+			       a, 0, b, 0, c, 0, d, 0,
+			       e, 0, f, 0, g, 0, h, 0))  {}
+  SIMD_ALWAYS_INLINE inline
+  simd(storage_type const& value) {
+    copy_from(value.data(), element_aligned_tag());
+  }
+  SIMD_ALWAYS_INLINE inline
+  simd& operator=(storage_type const& value) {
+    copy_from(value.data(), element_aligned_tag());
+    return *this;
+  }
+  template <class Flags>
+  SIMD_ALWAYS_INLINE inline simd(int const* ptr, Flags /*flags*/)
+    :m_value(_mm512_load_epi32(static_cast<void const*>(ptr)))
+  {}
+  SIMD_ALWAYS_INLINE inline simd(int const* ptr, int stride)
+    :simd(ptr[0],        ptr[stride],   ptr[2*stride], ptr[3*stride],
+          ptr[4*stride], ptr[5*stride], ptr[6*stride], ptr[7*stride],
+          ptr[8*stride], ptr[9*stride], ptr[10*stride], ptr[11*stride],
+          ptr[12*stride], ptr[13*stride], ptr[14*stride], ptr[15*stride])
+  {}
+  SIMD_ALWAYS_INLINE inline constexpr simd(__m512i const& value_in)
+    :m_value(value_in)
+  {}
+  SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const {
+    return simd(_mm512_mul_epi32(m_value, other.m_value));
+  }
+
+#if 0
+  // This needs SVML extension
+  SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const {
+    return simd(_mm512_div_epi32(m_value, other.m_value));
+  }
+#endif
+
+  SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const {
+    return simd(_mm512_add_epi32(m_value, other.m_value));
+  }
+  SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const {
+    return simd(_mm512_sub_epi32(m_value, other.m_value));
+  }
+  SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const {
+    return simd(_mm512_sub_epi32(_mm512_set1_epi32(0), m_value));
+  }
+  SIMD_ALWAYS_INLINE inline void copy_from(int const* ptr, element_aligned_tag) {
+    m_value = _mm512_load_epi32(static_cast<void const*>(ptr));
+  }
+  SIMD_ALWAYS_INLINE inline void copy_to(int* ptr, element_aligned_tag) const {
+    _mm512_store_epi32(ptr, m_value);
+  }
+  SIMD_ALWAYS_INLINE inline constexpr __m512i get() const { return m_value; }
+  SIMD_ALWAYS_INLINE inline simd_mask<int, simd_abi::avx512> operator<(simd const& other) const {
+    return simd_mask<int, simd_abi::avx512>(_mm512_cmp_epi32_mask(m_value, other.m_value, _CMP_LT_OS));
+  }
+  SIMD_ALWAYS_INLINE inline simd_mask<int, simd_abi::avx512> operator==(simd const& other) const {
+    return simd_mask<int, simd_abi::avx512>(_mm512_cmp_epi32_mask(m_value, other.m_value, _CMP_EQ_OS));
+  }
+};
+
+// Specialized permute
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+inline simd<float, simd_abi::avx512> permute(simd<int, simd_abi::avx512> const& control,
+		simd<float, simd_abi::avx512> const& a) {
+   	simd<float,simd_abi::avx512> result(_mm512_permutexvar_ps(control.get(),a.get())  );
+  return result;
+}
+
+// Specialized permute
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+inline simd<double, simd_abi::avx512> permute(simd<int, simd_abi::avx512> const& control,
+		simd<double, simd_abi::avx512> const& a) {
+   	simd<double,simd_abi::avx512> result(_mm512_permutexvar_pd(control.get(),a.get())  );
+  return result;
+}
+
+
+
 }
 
 #endif

From 99ad38057a8d66f562ea8e481e9b47b78cf18c73 Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@jlab.org>
Date: Mon, 4 May 2020 12:32:15 -0500
Subject: [PATCH 3/5] Added Permutes for CUDA and HIP Guarded permute in
 simd_common.h as it won't work for GPU architectures  FIXME: need better
 guards for Clang + GPU targets

Fixed: any_of all_of in HIP backend
Fixed: potential mask overflow in cuda_warp
---
 cuda_warp.hpp     | 11 ++++++++++-
 hip_wavefront.hpp | 30 ++++++++++++++++++++++++++----
 simd_common.hpp   |  6 +++++-
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/cuda_warp.hpp b/cuda_warp.hpp
index 40a1233..53a8adc 100644
--- a/cuda_warp.hpp
+++ b/cuda_warp.hpp
@@ -72,7 +72,8 @@ class cuda_warp {
   static_assert(N <= 32, "CUDA warps can't be more than 32 threads");
  public:
   SIMD_HOST_DEVICE static unsigned mask() {
-    return (unsigned(1) << N) - unsigned(1);
+
+    return N == 32 ? 0xffffffff : ( (unsigned(1) << N) - unsigned(1) ) ;
   }
 };
 
@@ -267,6 +268,14 @@ SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd<T, simd_abi::cuda_warp<N>> choose(
   return simd<T, simd_abi::cuda_warp<N>>(a.get() ? b.get() : c.get());
 }
 
+  // Generic Permute
+  template <class T, int N>
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+  inline simd<T, simd_abi::cuda_warp<N>> permute(simd<int, simd_abi::cuda_warp<N>> const& control, simd<T, simd_abi::cuda_warp<N>> const& a) {
+    return simd<T,simd_abi::cuda_warp<N>>(  __shfl_sync(simd_abi::cuda_warp<N>::mask(), a.get(),  control.get(), N));
+  }
+
+
 }
 
 #endif
diff --git a/hip_wavefront.hpp b/hip_wavefront.hpp
index 888708d..e460cc3 100644
--- a/hip_wavefront.hpp
+++ b/hip_wavefront.hpp
@@ -64,6 +64,8 @@
 #ifdef __HIPCC__
 #include <hip/math_functions.h>
 
+#include <cstdint>
+
 namespace SIMD_NAMESPACE {
 
 namespace simd_abi {
@@ -72,8 +74,11 @@ template <int N>
 class hip_wavefront {
   static_assert(N <= 64, "HIP wavefronts can't be more than 64 threads");
  public:
-  SIMD_HOST_DEVICE static unsigned mask() {
-    return (unsigned(1) << N) - unsigned(1);
+
+  // Do we need strictly 64 bit masks now?
+  // 
+  SIMD_HOST_DEVICE static uint64_t mask() {
+    return ( N==64 ) ? 0xffffffffffffffff : ( ( uint64_t(1) << N ) - uint64_t(1) ); 
   }
 };
 
@@ -144,16 +149,23 @@ class simd_mask<T, simd_abi::hip_wavefront<N>> {
   }
 };
 
+/*! FIXME: HIP does not support warp lane masked __all_of like CUDA 
+ *         yet 
+ */
 template <class T, int N>
 SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE
 bool all_of(simd_mask<T, simd_abi::hip_wavefront<N>> const& a) {
-  return bool(__all_sync(simd_abi::hip_wavefront<N>::mask(), int(a.get())));
+  return bool(__all(int(a.get())));
 }
 
+/*! FIXME: HIP does not support warp lane masked __any_of  like CUDA
+ *  yet
+ */
+ 
 template <class T, int N>
 SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE
 bool any_of(simd_mask<T, simd_abi::hip_wavefront<N>> const& a) {
-  return bool(__any_sync(simd_abi::hip_wavefront<N>::mask(), int(a.get())));
+  return bool(__any(int(a.get())));
 }
 
 template <class T, int N>
@@ -289,6 +301,16 @@ SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE simd<T, simd_abi::hip_wavefront<N>> choo
   return simd<T, simd_abi::hip_wavefront<N>>(a.get() ? b.get() : c.get());
 }
 
+
+// Generic Permute
+template <class T, int N>
+SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+inline simd<T, simd_abi::hip_wavefront<N>> permute(simd<int, simd_abi::hip_wavefront<N>> const& control, simd<T, simd_abi::hip_wavefront<N>> const& a) {
+  return simd<T,simd_abi::hip_wavefront<N>>(  __shfl(a.get(), control.get(), N));
+}
+
+
+
 } // SIMD_NAMESPACE
 
 #endif
diff --git a/simd_common.hpp b/simd_common.hpp
index bc3f5b0..5dd92e2 100644
--- a/simd_common.hpp
+++ b/simd_common.hpp
@@ -279,7 +279,10 @@ class simd_size<simd<T, Abi>> {
 };
 
 
-// Generic Permute
+// Generic Permute -- does not work on GPUs
+// FIXME: This guard is not sufficient: it should guard against 
+// General Clang GPU Combinations too
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
 template <class T, class Abi>
 SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
 inline simd<T, Abi> permute(simd<int, Abi> const& control, simd<T, Abi> const& a) {
@@ -297,5 +300,6 @@ inline simd<T, Abi> permute(simd<int, Abi> const& control, simd<T, Abi> const& a
   result.copy_from(stack_res, element_aligned_tag());
   return result;
 }
+#endif
 
 }

From 7c0880037aee6018721820008d9a52bd4c505781 Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@jlab.org>
Date: Mon, 4 May 2020 17:31:00 -0500
Subject: [PATCH 4/5] Added static make_permute() function in simd_utils<
 simd<T,Abi> > traits class

---
 simd_common.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/simd_common.hpp b/simd_common.hpp
index 5dd92e2..6171962 100644
--- a/simd_common.hpp
+++ b/simd_common.hpp
@@ -279,6 +279,24 @@ class simd_size<simd<T, Abi>> {
 };
 
 
+template<typename T> 
+class simd_utils;
+
+template<typename T, class Abi>
+class simd_utils< simd<T,Abi> > {
+public:
+  SIMD_ALWAYS_INLINE 
+  static inline typename simd<int, Abi>::storage_type make_permute(const int source_lanes[simd<T,Abi>::size()] ) {
+    using control_storage_t = typename simd<int, Abi>::storage_type;
+    control_storage_t my_mask_storage;
+    for(int i=0; i < simd<T,Abi>::size(); ++i) {
+      my_mask_storage[i] = source_lanes[i];
+    }
+
+    return my_mask_storage;
+  }
+};
+
 // Generic Permute -- does not work on GPUs
 // FIXME: This guard is not sufficient: it should guard against 
 // General Clang GPU Combinations too

From 9440b03f4f8fe7f125ae7501da30595264a9cd8a Mon Sep 17 00:00:00 2001
From: Balint Joo <bjoo@sciml2001.jlab.org>
Date: Mon, 4 May 2020 19:02:21 -0400
Subject: [PATCH 5/5] Added simd_utils< simd<T,Abi>::make_permute() function
 for AVX and AVX512 Removed weird simd<int,> initialization for AVX512 --
 essentially the zeros get put into a simd_t::storage_type by make_permute
 now.

---
 avx.hpp    |  1 +
 avx512.hpp | 48 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/avx.hpp b/avx.hpp
index ba80a1b..f13846a 100644
--- a/avx.hpp
+++ b/avx.hpp
@@ -480,6 +480,7 @@ class simd<int, simd_abi::avx> {
   */
 };
 
+
 // Specialized permute
 SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
 inline simd<float, simd_abi::avx> permute(simd<int, simd_abi::avx> const& control,
diff --git a/avx512.hpp b/avx512.hpp
index 9bb5a30..dc7c43c 100644
--- a/avx512.hpp
+++ b/avx512.hpp
@@ -453,22 +453,6 @@ class simd<int, simd_abi::avx512> {
           i, j, k, l, m, n, o, p))
   {}
 
-  /*
-   * CAVEAT: Loading 8 element here
-   *  sets every second element as needed
-   *  for permutes. Is this a general pattern?
-   *  This is different from AVX & AVX2 where
-   *  there is not a DP shuffle actually,
-   *  We can use the 256 bit permute from AVX512
-   *  buf I believe that expects zeros at the high
-   *  end only 
-   */
-  SIMD_ALWAYS_INLINE inline simd(
-      int a, int b, int c, int d,
-      int e, int f, int g, int h)
-    :m_value(_mm512_setr_epi32(
-			       a, 0, b, 0, c, 0, d, 0,
-			       e, 0, f, 0, g, 0, h, 0))  {}
   SIMD_ALWAYS_INLINE inline
   simd(storage_type const& value) {
     copy_from(value.data(), element_aligned_tag());
@@ -526,6 +510,38 @@ class simd<int, simd_abi::avx512> {
   }
 };
 
+
+  template<>
+  class simd_utils< simd<float,simd_abi::avx512> > {
+public:
+  SIMD_ALWAYS_INLINE
+  inline static typename simd<int, simd_abi::avx512>::storage_type make_permute(const int source_lanes[simd<float,simd_abi::avx512>::size()] ) {
+    using simd_t = simd<float,simd_abi::avx512>;
+    using control_storage_t = typename simd<int, simd_abi::avx512>::storage_type;
+    control_storage_t my_mask_storage;
+    for(int i=0; i < simd_t::size(); ++i) {
+      my_mask_storage[i] = source_lanes[i];
+    }
+    return my_mask_storage;
+  }
+};
+
+  template<>
+  class simd_utils< simd<double,simd_abi::avx512> > {
+public:
+  SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
+  inline static typename simd<int, simd_abi::avx512>::storage_type make_permute(const int source_lanes[simd<double,simd_abi::avx512>::size()] ) {
+    using control_storage_t = typename simd<int, simd_abi::avx512>::storage_type;
+    control_storage_t my_mask_storage;
+    for(int i=0; i < simd<double,simd_abi::avx512>::size(); ++i) {
+      my_mask_storage[2*i] = source_lanes[i];
+      my_mask_storage[2*i+1] = 0;
+    }
+
+    return my_mask_storage;
+  }
+};
+
 // Specialized permute
 SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE
 inline simd<float, simd_abi::avx512> permute(simd<int, simd_abi::avx512> const& control,