Commit 99c173b2 authored by Martin Reinecke
Browse files

add unaligned stores and loads

parent 64866dcd
Pipeline #77564 passed with stages
in 16 minutes and 19 seconds
......@@ -41,6 +41,7 @@
#endif
#endif
#include <cstdint>
#include <cstdlib>
#include <cmath>
#include <algorithm>
......@@ -104,6 +105,10 @@ template<typename T, size_t len> class vtp
// Copy constructor: explicitly requests the compiler-generated implementation.
vtp(const vtp &) = default;
// Scalar assignment: stores hlp::from_scalar(other) in the wrapped value `v`
// and returns a reference to this object so assignments can be chained.
vtp &operator=(const T &other)
{
  v = hlp::from_scalar(other);
  return *this;
}
// Implicit conversion to the underlying type Tv; yields a copy of the
// wrapped value `v`.
operator Tv() const
{
  return v;
}
// Builds a vtp from memory at `ptr` by delegating to hlp::loadu.
// The "u" variants were added as the unaligned load/store entry points;
// the helper decides how the memory is actually read.
static vtp loadu(const T *ptr)
{
  return vtp(hlp::loadu(ptr));
}
// Writes the wrapped value `v` to memory at `ptr` by delegating to
// hlp::storeu (counterpart of loadu).
void storeu(T *ptr) const
{
  hlp::storeu(ptr, v);
}
// Unary minus: returns a new vtp wrapping -v; this object is not modified.
vtp operator-() const
{
  return vtp(-v);
}
// Addition: returns a new vtp wrapping v + other.v; neither operand is
// modified.
vtp operator+(vtp other) const
{
  return vtp(v + other.v);
}
// Subtraction: returns a new vtp wrapping v - other.v; neither operand is
// modified.
vtp operator-(vtp other) const
{
  return vtp(v - other.v);
}
......@@ -240,6 +245,9 @@ template<typename T> class helper_<T,1>
// Value type handled by this length-1 helper: a pseudoscalar wrapper around T.
using Tv = pseudoscalar<T>;
// NOTE(review): presumably the mask/comparison-result type for the
// single-lane case; confirm against the code that uses Tm.
using Tm = bool;
// Single-element load: reads the one T at `ptr` and returns it converted
// to Tv.
static Tv loadu(const T *ptr)
{
  return ptr[0];
}
// Single-element store: writes the member `v.v` of the pseudoscalar to the
// location `ptr` points at.
static void storeu(T *ptr, Tv v)
{
  ptr[0] = v.v;
}
// Turns a plain scalar into a Tv; the conversion from T happens implicitly
// on return.
static Tv from_scalar(T v)
{
  return v;
}
// Forwards to the abs() member of the pseudoscalar and returns its result.
static Tv abs(Tv v)
{
  return v.abs();
}
// Forwards to the max() member of the pseudoscalar: returns v1.max(v2).
static Tv max(Tv v1, Tv v2)
{
  return v1.max(v2);
}
......@@ -265,6 +273,9 @@ template<> class helper_<double,8>
// Underlying register type: a 512-bit vector of packed doubles.
using Tv = __m512d;
// AVX-512 8-bit mask type.
// NOTE(review): presumably one bit per lane; confirm against users of Tm.
using Tm = __mmask8;
// Loads a full 512-bit vector of doubles starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm512_loadu_pd(ptr);
}
// Stores the 512-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm512_storeu_pd(ptr, v);
}
// Broadcasts the scalar `v` into every lane of a 512-bit double vector.
static Tv from_scalar(T v)
{
  return _mm512_set1_pd(v);
}
// Lane-wise absolute value, done on the raw bits: -0.0 has only the sign
// bit set, so and-not with that pattern ((~mask) & v) clears each sign bit.
static Tv abs(Tv v)
{
  const __m512i signbits = __m512i(_mm512_set1_pd(-0.));
  return __m512d(_mm512_andnot_epi64(signbits, __m512i(v)));
}
// Lane-wise maximum of two 512-bit double vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm512_max_pd(v1, v2);
}
......@@ -286,6 +297,9 @@ template<> class helper_<float,16>
// Underlying register type: a 512-bit vector of packed floats.
using Tv = __m512;
// AVX-512 16-bit mask type.
// NOTE(review): presumably one bit per lane; confirm against users of Tm.
using Tm = __mmask16;
// Loads a full 512-bit vector of floats starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm512_loadu_ps(ptr);
}
// Stores the 512-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm512_storeu_ps(ptr, v);
}
// Broadcasts the scalar `v` into every lane of a 512-bit float vector.
static Tv from_scalar(T v)
{
  return _mm512_set1_ps(v);
}
// Lane-wise absolute value, done on the raw bits: -0.0 has only the sign
// bit set, so and-not with that pattern ((~mask) & v) clears each sign bit.
static Tv abs(Tv v)
{
  const __m512i signbits = __m512i(_mm512_set1_ps(-0.));
  return __m512(_mm512_andnot_epi32(signbits, __m512i(v)));
}
// Lane-wise maximum of two 512-bit float vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm512_max_ps(v1, v2);
}
......@@ -310,6 +324,9 @@ template<> class helper_<double,4>
// Underlying register type: a 256-bit vector of packed doubles.
using Tv = __m256d;
// Aliased to the same register type as Tv.
// NOTE(review): presumably holds per-lane masks; confirm against users of Tm.
using Tm = __m256d;
// Loads a full 256-bit vector of doubles starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm256_loadu_pd(ptr);
}
// Stores the 256-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm256_storeu_pd(ptr, v);
}
// Broadcasts the scalar `v` into every lane of a 256-bit double vector.
static Tv from_scalar(T v)
{
  return _mm256_set1_pd(v);
}
// Lane-wise absolute value: -0.0 has only the sign bit set, so and-not with
// that pattern ((~mask) & v) clears the sign bit of every lane.
static Tv abs(Tv v)
{
  const Tv signbits = _mm256_set1_pd(-0.);
  return _mm256_andnot_pd(signbits, v);
}
// Lane-wise maximum of two 256-bit double vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm256_max_pd(v1, v2);
}
......@@ -331,6 +348,9 @@ template<> class helper_<float,8>
// Underlying register type: a 256-bit vector of packed floats.
using Tv = __m256;
// Aliased to the same register type as Tv.
// NOTE(review): presumably holds per-lane masks; confirm against users of Tm.
using Tm = __m256;
// Loads a full 256-bit vector of floats starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm256_loadu_ps(ptr);
}
// Stores the 256-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm256_storeu_ps(ptr, v);
}
// Broadcasts the scalar `v` into every lane of a 256-bit float vector.
static Tv from_scalar(T v)
{
  return _mm256_set1_ps(v);
}
// Lane-wise absolute value: -0.0 has only the sign bit set, so and-not with
// that pattern ((~mask) & v) clears the sign bit of every lane.
static Tv abs(Tv v)
{
  const Tv signbits = _mm256_set1_ps(-0.);
  return _mm256_andnot_ps(signbits, v);
}
// Lane-wise maximum of two 256-bit float vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm256_max_ps(v1, v2);
}
......@@ -355,6 +375,9 @@ template<> class helper_<double,2>
// Underlying register type: a 128-bit vector of packed doubles.
using Tv = __m128d;
// Aliased to the same register type as Tv.
// NOTE(review): presumably holds per-lane masks; confirm against users of Tm.
using Tm = __m128d;
// Loads a full 128-bit vector of doubles starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm_loadu_pd(ptr);
}
// Stores the 128-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm_storeu_pd(ptr, v);
}
// Broadcasts the scalar `v` into both lanes of a 128-bit double vector.
static Tv from_scalar(T v)
{
  return _mm_set1_pd(v);
}
// Lane-wise absolute value: -0.0 has only the sign bit set, so and-not with
// that pattern ((~mask) & v) clears the sign bit of every lane.
static Tv abs(Tv v)
{
  const Tv signbits = _mm_set1_pd(-0.);
  return _mm_andnot_pd(signbits, v);
}
// Lane-wise maximum of two 128-bit double vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm_max_pd(v1, v2);
}
......@@ -383,6 +406,9 @@ template<> class helper_<float,4>
// Underlying register type: a 128-bit vector of packed floats.
using Tv = __m128;
// Aliased to the same register type as Tv.
// NOTE(review): presumably holds per-lane masks; confirm against users of Tm.
using Tm = __m128;
// Loads a full 128-bit vector of floats starting at `ptr` using the
// unaligned-load intrinsic, so `ptr` needs no special alignment.
static Tv loadu(const T *ptr)
{
  return _mm_loadu_ps(ptr);
}
// Stores the 128-bit vector `v` to memory starting at `ptr` using the
// unaligned-store intrinsic, so `ptr` needs no special alignment.
static void storeu(T *ptr, Tv v)
{
  _mm_storeu_ps(ptr, v);
}
// Broadcasts the scalar `v` into every lane of a 128-bit float vector.
static Tv from_scalar(T v)
{
  return _mm_set1_ps(v);
}
// Lane-wise absolute value: -0.0 has only the sign bit set, so and-not with
// that pattern ((~mask) & v) clears the sign bit of every lane.
static Tv abs(Tv v)
{
  const Tv signbits = _mm_set1_ps(-0.);
  return _mm_andnot_ps(signbits, v);
}
// Lane-wise maximum of two 128-bit float vectors.
static Tv max(Tv v1, Tv v2)
{
  return _mm_max_ps(v1, v2);
}
......@@ -429,7 +455,6 @@ using std::abs;
using std::sqrt;
using std::max;
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment