/* * This file is part of the MR utility library. * * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Copyright (C) 2019-2020 Max-Planck-Society Author: Martin Reinecke */ #ifndef MRUTIL_SIMD_H #define MRUTIL_SIMD_H // only enable SIMD support for gcc>=5.0 and clang>=5.0 #ifndef MRUTIL_NO_SIMD #define MRUTIL_NO_SIMD #if defined(__clang__) // AppleClang has their own version numbering #ifdef __apple_build_version__ # if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) # undef MRUTIL_NO_SIMD # endif #elif __clang_major__ >= 5 # undef MRUTIL_NO_SIMD #endif #elif defined(__GNUC__) #if __GNUC__>=5 #undef MRUTIL_NO_SIMD #endif #endif #endif #include #include #include #ifndef MRUTIL_NO_SIMD #include #endif namespace mr { namespace detail_simd { template constexpr static const bool vectorizable = false; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template<> constexpr static const bool vectorizable = true; template constexpr size_t vlen = vectorizable ? reglen/sizeof(T) : 1; template class helper_; template struct vmask_ { private: using hlp = helper_; using Tm = typename hlp::Tm; Tm v; public: vmask_() = default; vmask_(const vmask_ &other) = default; vmask_(Tm v_): v(v_) {} operator Tm() const { return v; } size_t bits() const { return hlp::maskbits(v); } vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); } }; template class vtp { private: using hlp = helper_; public: using Tv = typename hlp::Tv; using Tm = vmask_; static constexpr size_t size() { return len; } private: Tv v; public: vtp () = default; vtp(T other): vtp(hlp::from_scalar(other)) {} vtp(const Tv &other) : v(other) {} vtp(const vtp &other) = default; vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; } operator Tv() const { return v; } vtp operator-() const { return vtp(-v); } vtp operator+(vtp other) const { return vtp(v+other.v); } vtp operator-(vtp other) const { return vtp(v-other.v); } vtp operator*(vtp other) const { return vtp(v*other.v); } vtp operator/(vtp other) const { return vtp(v/other.v); } vtp &operator+=(vtp other) { v+=other.v; return *this; } vtp &operator-=(vtp other) { v-=other.v; return *this; } vtp &operator*=(vtp other) { v*=other.v; return *this; } vtp &operator/=(vtp other) { v/=other.v; return *this; } vtp abs() const { return hlp::abs(v); } inline vtp sqrt() const { return hlp::sqrt(v); } vtp max(const vtp &other) const { return hlp::max(v, other.v); } Tm operator>(const vtp &other) const { return hlp::gt(v, other.v); } Tm operator>=(const vtp &other) const { return hlp::ge(v, other.v); } Tm operator<(const vtp &other) const { return hlp::lt(v, other.v); } Tm operator!=(const vtp &other) const { return hlp::ne(v, other.v); } class reference { private: vtp &v; size_t i; public: reference (vtp &v_, size_t i_) : v(v_), i(i_) {} reference &operator= (T other) { v.v[i] = other; return *this; } reference &operator*= (T other) { v.v[i] *= other; return *this; } operator T() const { return v.v[i]; } }; reference operator[](size_t i) { return reference(*this, i); } T operator[](size_t i) const { return v[i]; } class where_expr { private: vtp &v; Tm m; public: where_expr (Tm m_, vtp &v_) : v(v_), m(m_) {} where_expr &operator*= (const vtp &other) { v=hlp::blend(m, v.v*other.v, v.v); return *this; } where_expr &operator+= (const vtp &other) { v=hlp::blend(m, v.v+other.v, v.v); return *this; } where_expr &operator-= (const vtp &other) { v=hlp::blend(m, v.v-other.v, v.v); return *this; } }; }; template inline vtp abs(vtp v) { return v.abs(); } template typename vtp::where_expr where(typename vtp::Tm m, vtp &v) { return typename vtp::where_expr(m, v); } template vtp operator*(T0 a, vtp b) { return b*a; } template vtp operator+(T a, vtp b) { return b+a; } template vtp operator-(T a, vtp b) { return vtp(a) - b; } template vtp max(vtp a, vtp b) { return a.max(b); } template vtp sqrt(vtp v) { return v.sqrt(); } template inline bool any_of(const vmask_ &mask) { return mask.bits()!=0; } template inline bool none_of(const vmask_ &mask) { return mask.bits()==0; } template inline bool all_of(const vmask_ &mask) { return mask.bits()==(size_t(1)< T reduce(const vtp &v, Op op) { T res=v[0]; for (size_t i=1; i class pseudoscalar { private: T v; public: pseudoscalar() = default; pseudoscalar(const pseudoscalar &other) = default; pseudoscalar(T v_):v(v_) {} pseudoscalar operator-() const { return pseudoscalar(-v); } pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); } pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); } pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); } pseudoscalar operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); } pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; } pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; } pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; } pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; } pseudoscalar abs() const { return std::abs(v); } inline pseudoscalar sqrt() const { return std::sqrt(v); } pseudoscalar max(const pseudoscalar &other) const { return std::max(v, other.v); } bool operator>(const pseudoscalar &other) const { return v>other.v; } bool operator>=(const pseudoscalar &other) const { return v>=other.v; } bool operator<(const pseudoscalar &other) const { return v class helper_ { private: static constexpr size_t len = 1; public: using Tv = pseudoscalar; using Tm = bool; static Tv from_scalar(T v) { return v; } static Tv abs(Tv v) { return v.abs(); } static Tv max(Tv v1, Tv v2) { return v1.max(v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; } static Tv sqrt(Tv v) { return v.sqrt(); } static Tm gt (Tv v1, Tv v2) { return v1>v2; } static Tm ge (Tv v1, Tv v2) { return v1>=v2; } static Tm lt (Tv v1, Tv v2) { return v1 class helper_ { private: using T = double; static constexpr size_t len = 8; public: using Tv = __m512d; using Tm = __mmask8; static Tv from_scalar(T v) { return _mm512_set1_pd(v); } static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); } static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); } static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } static size_t maskbits(Tm v) { return v; } }; template<> class helper_ { private: using T = float; static constexpr size_t len = 16; public: using Tv = __m512; using Tm = __mmask16; static Tv from_scalar(T v) { return _mm512_set1_ps(v); } static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); } static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); } static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } static size_t maskbits(Tm v) { return v; } }; template using native_simd = vtp>; #elif defined(__AVX__) template<> class helper_ { private: using T = double; static constexpr size_t len = 4; public: using Tv = __m256d; using Tm = __m256d; static Tv from_scalar(T v) { return _mm256_set1_pd(v); } static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); } static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); } }; template<> class helper_ { private: using T = float; static constexpr size_t len = 8; public: using Tv = __m256; using Tm = __m256; static Tv from_scalar(T v) { return _mm256_set1_ps(v); } static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_ps(v2, v1, m); } static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); } }; template using native_simd = vtp>; #elif defined(__SSE2__) template<> class helper_ { private: using T = double; static constexpr size_t len = 2; public: using Tv = __m128d; using Tm = __m128d; static Tv from_scalar(T v) { return _mm_set1_pd(v); } static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { #if defined(__SSE4_1__) return _mm_blendv_pd(v2,v1,m); #else return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2)); #endif } static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); } static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); } static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); } static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_pd(v1,v2); } static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); } }; template<> class helper_ { private: using T = float; static constexpr size_t len = 4; public: using Tv = __m128; using Tm = __m128; static Tv from_scalar(T v) { return _mm_set1_ps(v); } static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { #if defined(__SSE4_1__) return _mm_blendv_ps(v2,v1,m); #else return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2)); #endif } static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); } static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); } static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); } static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); } static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); } }; template using native_simd = vtp>; #else template using native_simd = vtp; #endif #else template using native_simd = vtp; #endif } using detail_simd::native_simd; using detail_simd::reduce; using detail_simd::max; using detail_simd::abs; using detail_simd::sqrt; using detail_simd::any_of; using detail_simd::none_of; using detail_simd::all_of; } #endif