simd.h 16.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 *  This file is part of the MR utility library.
 *
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

19
/* Copyright (C) 2019-2020 Max-Planck-Society
20
21
   Author: Martin Reinecke */

Martin Reinecke's avatar
Martin Reinecke committed
22
23
#ifndef DUCC0_SIMD_H
#define DUCC0_SIMD_H
24
25

// only enable SIMD support for gcc>=5.0 and clang>=5.0
Martin Reinecke's avatar
Martin Reinecke committed
26
27
#ifndef DUCC0_NO_SIMD
#define DUCC0_NO_SIMD
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
28
29
30
31
#if defined(__clang__)
// AppleClang has their own version numbering
#ifdef __apple_build_version__
#  if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
Martin Reinecke's avatar
Martin Reinecke committed
32
#     undef DUCC0_NO_SIMD
33
34
#  endif
#elif __clang_major__ >= 5
Martin Reinecke's avatar
Martin Reinecke committed
35
#  undef DUCC0_NO_SIMD
36
37
38
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
Martin Reinecke's avatar
Martin Reinecke committed
39
#undef DUCC0_NO_SIMD
40
41
42
43
#endif
#endif
#endif

44
#include <cstdint>
45
46
#include <cstdlib>
#include <cmath>
47
#include <algorithm>
Martin Reinecke's avatar
Martin Reinecke committed
48
#ifndef DUCC0_NO_SIMD
49
#include <x86intrin.h>
50
#endif
51

Martin Reinecke's avatar
Martin Reinecke committed
52
namespace ducc0 {
53
54
55

namespace detail_simd {

Martin Reinecke's avatar
Martin Reinecke committed
56
57
58
59
60
61
62
63
64
65
66
template<typename T> constexpr inline bool vectorizable = false;
template<> constexpr inline bool vectorizable<float> = true;
template<> constexpr inline bool vectorizable<double> = true;
template<> constexpr inline bool vectorizable<int8_t> = true;
template<> constexpr inline bool vectorizable<uint8_t> = true;
template<> constexpr inline bool vectorizable<int16_t> = true;
template<> constexpr inline bool vectorizable<uint16_t> = true;
template<> constexpr inline bool vectorizable<int32_t> = true;
template<> constexpr inline bool vectorizable<uint32_t> = true;
template<> constexpr inline bool vectorizable<int64_t> = true;
template<> constexpr inline bool vectorizable<uint64_t> = true;
Martin Reinecke's avatar
Martin Reinecke committed
67
68
69
70

template<typename T, size_t reglen> constexpr size_t vlen
  = vectorizable<T> ? reglen/sizeof(T) : 1;

Martin Reinecke's avatar
Martin Reinecke committed
71
72
73
74
75
76
77
template<typename T, size_t len> class helper_;
template<typename T, size_t len> struct vmask_
  {
  private:
    using hlp = helper_<T, len>;
    using Tm = typename hlp::Tm;
    Tm v;
78

Martin Reinecke's avatar
Martin Reinecke committed
79
80
81
82
83
  public:
    vmask_() = default;
    vmask_(const vmask_ &other) = default;
    vmask_(Tm v_): v(v_) {}
    operator Tm() const  { return v; }
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
84
    size_t bits() const { return hlp::maskbits(v); }
Martin Reinecke's avatar
Martin Reinecke committed
85
86
    vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); }
  };
87

Martin Reinecke's avatar
Martin Reinecke committed
88
template<typename T, size_t len> class vtp
89
  {
Martin Reinecke's avatar
Martin Reinecke committed
90
91
92
  private:
    using hlp = helper_<T, len>;

93
  public:
Martin Reinecke's avatar
Martin Reinecke committed
94
95
96
    using Tv = typename hlp::Tv;
    using Tm = vmask_<T, len>;
    static constexpr size_t size() { return len; }
97

Martin Reinecke's avatar
Martin Reinecke committed
98
99
  private:
    Tv v;
100
101
102

  public:
    vtp () = default;
Martin Reinecke's avatar
Martin Reinecke committed
103
104
    vtp(T other): vtp(hlp::from_scalar(other)) {}
    vtp(const Tv &other) : v(other) {}
105
    vtp(const vtp &other) = default;
Martin Reinecke's avatar
Martin Reinecke committed
106
107
    vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; }
    operator Tv() const { return v; }
108
109
110
111

    static vtp loadu(const T *ptr) { return vtp(hlp::loadu(ptr)); }
    void storeu(T *ptr) const { hlp::storeu(ptr, v); }

112
113
114
115
116
117
118
119
120
    vtp operator-() const { return vtp(-v); }
    vtp operator+(vtp other) const { return vtp(v+other.v); }
    vtp operator-(vtp other) const { return vtp(v-other.v); }
    vtp operator*(vtp other) const { return vtp(v*other.v); }
    vtp operator/(vtp other) const { return vtp(v/other.v); }
    vtp &operator+=(vtp other) { v+=other.v; return *this; }
    vtp &operator-=(vtp other) { v-=other.v; return *this; }
    vtp &operator*=(vtp other) { v*=other.v; return *this; }
    vtp &operator/=(vtp other) { v/=other.v; return *this; }
Martin Reinecke's avatar
Martin Reinecke committed
121
    vtp abs() const { return hlp::abs(v); }
Martin Reinecke's avatar
Martin Reinecke committed
122
123
124
125
126
127
128
    template<typename Func> vtp apply(Func func) const
      {
      vtp res;
      for (size_t i=0; i<len; ++i)
        res[i] = func(v[i]);
      return res;
      }
129
    inline vtp sqrt() const
Martin Reinecke's avatar
Martin Reinecke committed
130
      { return hlp::sqrt(v); }
131
    vtp max(const vtp &other) const
Martin Reinecke's avatar
Martin Reinecke committed
132
133
134
135
136
137
138
139
140
      { return hlp::max(v, other.v); }
    Tm operator>(const vtp &other) const
      { return hlp::gt(v, other.v); }
    Tm operator>=(const vtp &other) const
      { return hlp::ge(v, other.v); }
    Tm operator<(const vtp &other) const
      { return hlp::lt(v, other.v); }
    Tm operator!=(const vtp &other) const
      { return hlp::ne(v, other.v); }
141
142
143
144
145
146
147

    class reference
      {
      private:
        vtp &v;
        size_t i;
      public:
Martin Reinecke's avatar
Martin Reinecke committed
148
        reference (vtp<T, len> &v_, size_t i_)
149
150
151
152
153
154
155
156
          : v(v_), i(i_) {}
        reference &operator= (T other)
          { v.v[i] = other; return *this; }
        reference &operator*= (T other)
          { v.v[i] *= other; return *this; }
        operator T() const { return v.v[i]; }
      };

Martin Reinecke's avatar
Martin Reinecke committed
157
158
159
    reference operator[](size_t i) { return reference(*this, i); }
    T operator[](size_t i) const { return v[i]; }

160
161
162
163
    class where_expr
      {
      private:
        vtp &v;
Martin Reinecke's avatar
Martin Reinecke committed
164
165
        Tm m;

166
      public:
Martin Reinecke's avatar
Martin Reinecke committed
167
        where_expr (Tm m_, vtp &v_)
168
169
          : v(v_), m(m_) {}
        where_expr &operator*= (const vtp &other)
Martin Reinecke's avatar
Martin Reinecke committed
170
          { v=hlp::blend(m, v.v*other.v, v.v); return *this; }
171
        where_expr &operator+= (const vtp &other)
Martin Reinecke's avatar
Martin Reinecke committed
172
          { v=hlp::blend(m, v.v+other.v, v.v); return *this; }
173
        where_expr &operator-= (const vtp &other)
Martin Reinecke's avatar
Martin Reinecke committed
174
          { v=hlp::blend(m, v.v-other.v, v.v); return *this; }
175
176
      };
  };
Martin Reinecke's avatar
Martin Reinecke committed
177
178
template<typename T, size_t len> inline vtp<T, len> abs(vtp<T, len> v) { return v.abs(); }
template<typename T, size_t len> typename vtp<T, len>::where_expr where(typename vtp<T, len>::Tm m, vtp<T, len> &v)
179
180
181
182
183
184
185
186
187
188
189
  { return typename vtp<T, len>::where_expr(m, v); }
template<typename T0, typename T, size_t len> vtp<T, len> operator*(T0 a, vtp<T, len> b)
  { return b*a; }
template<typename T, size_t len> vtp<T, len> operator+(T a, vtp<T, len> b)
  { return b+a; }
template<typename T, size_t len> vtp<T, len> operator-(T a, vtp<T, len> b)
  { return vtp<T, len>(a) - b; }
template<typename T, size_t len> vtp<T, len> max(vtp<T, len> a, vtp<T, len> b)
  { return a.max(b); }
template<typename T, size_t len> vtp<T, len> sqrt(vtp<T, len> v)
  { return v.sqrt(); }
Martin Reinecke's avatar
Martin Reinecke committed
190
191
192
193
194
template<typename T, size_t len> inline bool any_of(const vmask_<T, len> &mask)
  { return mask.bits()!=0; }
template<typename T, size_t len> inline bool none_of(const vmask_<T, len> &mask)
  { return mask.bits()==0; }
template<typename T, size_t len> inline bool all_of(const vmask_<T, len> &mask)
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
195
  { return mask.bits()==(size_t(1)<<len)-1; }
196
197
198
199
200
201
202
template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op op)
  {
  T res=v[0];
  for (size_t i=1; i<len; ++i)
    res = op(res, v[i]);
  return res;
  }
203
204
template<typename Op, typename T, size_t len> T accumulate(const vtp<T, len> &v, Op op)
  { return reduce(v, op); }
205
206
207
208
template<typename T> class pseudoscalar
  {
  private:
    T v;
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
209

210
211
212
213
214
215
216
217
218
219
220
221
222
  public:
    pseudoscalar() = default;
    pseudoscalar(const pseudoscalar &other) = default;
    pseudoscalar(T v_):v(v_) {}
    pseudoscalar operator-() const { return pseudoscalar(-v); }
    pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); }
    pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); }
    pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); }
    pseudoscalar operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); }
    pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; }
    pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; }
    pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; }
    pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; }
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
223
224
225

    pseudoscalar abs() const { return std::abs(v); }
    inline pseudoscalar sqrt() const { return std::sqrt(v); }
226
    pseudoscalar max(const pseudoscalar &other) const
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
227
228
229
230
231
232
233
234
235
236
      { return std::max(v, other.v); }

    bool operator>(const pseudoscalar &other) const
      { return v>other.v; }
    bool operator>=(const pseudoscalar &other) const
      { return v>=other.v; }
    bool operator<(const pseudoscalar &other) const
      { return v<other.v; }
    bool operator!=(const pseudoscalar &other) const
      { return v!=other.v; }
237
238
239
    const T &operator[] (size_t /*i*/) const { return v; }
    T &operator[](size_t /*i*/) { return v; }
  };
Martin Reinecke's avatar
Martin Reinecke committed
240
template<typename T> class helper_<T,1>
241
  {
Martin Reinecke's avatar
Martin Reinecke committed
242
243
244
  private:
    static constexpr size_t len = 1;
  public:
245
    using Tv = pseudoscalar<T>;
Martin Reinecke's avatar
Martin Reinecke committed
246
247
    using Tm = bool;

248
249
250
    static Tv loadu(const T *ptr) { return *ptr; }
    static void storeu(T *ptr, Tv v) { *ptr = v.v; }

Martin Reinecke's avatar
Martin Reinecke committed
251
    static Tv from_scalar(T v) { return v; }
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
252
253
254
255
    static Tv abs(Tv v) { return v.abs(); }
    static Tv max(Tv v1, Tv v2) { return v1.max(v2); }
    static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; }
    static Tv sqrt(Tv v) { return v.sqrt(); }
Martin Reinecke's avatar
Martin Reinecke committed
256
257
258
259
260
261
262
263
    static Tm gt (Tv v1, Tv v2) { return v1>v2; }
    static Tm ge (Tv v1, Tv v2) { return v1>=v2; }
    static Tm lt (Tv v1, Tv v2) { return v1<v2; }
    static Tm ne (Tv v1, Tv v2) { return v1!=v2; }
    static Tm mask_and (Tm v1, Tm v2) { return v1&&v2; }
    static size_t maskbits(Tm v) { return v; }
  };

Martin Reinecke's avatar
Martin Reinecke committed
264
#ifndef DUCC0_NO_SIMD
265

Martin Reinecke's avatar
Martin Reinecke committed
266
267
#if defined(__AVX512F__)
template<> class helper_<double,8>
268
  {
Martin Reinecke's avatar
Martin Reinecke committed
269
270
271
272
273
274
275
  private:
    using T = double;
    static constexpr size_t len = 8;
  public:
    using Tv = __m512d;
    using Tm = __mmask8;

276
277
278
    static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); }
    static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
279
    static Tv from_scalar(T v) { return _mm512_set1_pd(v); }
Martin Reinecke's avatar
Martin Reinecke committed
280
    static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); }
Martin Reinecke's avatar
Martin Reinecke committed
281
    static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); }
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
282
    static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); }
Martin Reinecke's avatar
Martin Reinecke committed
283
284
285
286
287
288
289
290
    static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); }
    static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); }
    static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); }
    static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); }
    static Tm mask_and (Tm v1, Tm v2) { return v1&v2; }
    static size_t maskbits(Tm v) { return v; }
  };
Martin Reinecke's avatar
fix    
Martin Reinecke committed
291
template<> class helper_<float,16>
292
  {
Martin Reinecke's avatar
Martin Reinecke committed
293
294
  private:
    using T = float;
Martin Reinecke's avatar
fix    
Martin Reinecke committed
295
    static constexpr size_t len = 16;
Martin Reinecke's avatar
Martin Reinecke committed
296
297
  public:
    using Tv = __m512;
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
298
    using Tm = __mmask16;
Martin Reinecke's avatar
Martin Reinecke committed
299

300
301
302
    static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); }
    static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
303
    static Tv from_scalar(T v) { return _mm512_set1_ps(v); }
Martin Reinecke's avatar
Martin Reinecke committed
304
    static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); }
Martin Reinecke's avatar
Martin Reinecke committed
305
    static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); }
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
306
    static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); }
Martin Reinecke's avatar
Martin Reinecke committed
307
308
309
310
311
312
313
314
315
    static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); }
    static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); }
    static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); }
    static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); }
    static Tm mask_and (Tm v1, Tm v2) { return v1&v2; }
    static size_t maskbits(Tm v) { return v; }
  };

Martin Reinecke's avatar
Martin Reinecke committed
316
template<typename T> using native_simd = vtp<T,vlen<T,64>>;
317
#elif defined(__AVX__)
Martin Reinecke's avatar
Martin Reinecke committed
318
319
320
321
322
323
324
325
326
template<> class helper_<double,4>
  {
  private:
    using T = double;
    static constexpr size_t len = 4;
  public:
    using Tv = __m256d;
    using Tm = __m256d;

327
328
329
    static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); }
    static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
330
331
332
333
334
335
336
337
338
339
    static Tv from_scalar(T v) { return _mm256_set1_pd(v); }
    static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); }
    static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); }
    static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); }
    static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); }
    static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); }
    static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); }
    static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); }
    static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); }
340
    static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); }
Martin Reinecke's avatar
Martin Reinecke committed
341
342
343
344
345
346
347
348
349
350
  };
template<> class helper_<float,8>
  {
  private:
    using T = float;
    static constexpr size_t len = 8;
  public:
    using Tv = __m256;
    using Tm = __m256;

351
352
353
    static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); }
    static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
354
355
356
357
358
359
360
361
362
363
    static Tv from_scalar(T v) { return _mm256_set1_ps(v); }
    static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); }
    static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); }
    static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_ps(v2, v1, m); }
    static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); }
    static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); }
    static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); }
    static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); }
    static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); }
364
    static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); }
Martin Reinecke's avatar
Martin Reinecke committed
365
366
  };

Martin Reinecke's avatar
Martin Reinecke committed
367
template<typename T> using native_simd = vtp<T,vlen<T,32>>;
Martin Reinecke's avatar
Martin Reinecke committed
368
369
370
371
372
373
374
375
376
377
#elif defined(__SSE2__)
template<> class helper_<double,2>
  {
  private:
    using T = double;
    static constexpr size_t len = 2;
  public:
    using Tv = __m128d;
    using Tm = __m128d;

378
379
380
    static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); }
    static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
381
382
383
384
385
386
387
388
389
    static Tv from_scalar(T v) { return _mm_set1_pd(v); }
    static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); }
    static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); }
    static Tv blend(Tm m, Tv v1, Tv v2)
      {
#if defined(__SSE4_1__)
      return _mm_blendv_pd(v2,v1,m);
#else
      return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2));
390
#endif
Martin Reinecke's avatar
Martin Reinecke committed
391
392
393
394
395
396
397
      }
    static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); }
    static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); }
    static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); }
    static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_pd(v1,v2); }
    static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); }
398
    static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); }
Martin Reinecke's avatar
Martin Reinecke committed
399
400
401
402
403
404
405
406
407
408
  };
template<> class helper_<float,4>
  {
  private:
    using T = float;
    static constexpr size_t len = 4;
  public:
    using Tv = __m128;
    using Tm = __m128;

409
410
411
    static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); }
    static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); }

Martin Reinecke's avatar
Martin Reinecke committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
    static Tv from_scalar(T v) { return _mm_set1_ps(v); }
    static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); }
    static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); }
    static Tv blend(Tm m, Tv v1, Tv v2)
      {
#if defined(__SSE4_1__)
      return _mm_blendv_ps(v2,v1,m);
#else
      return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2));
#endif
      }
    static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); }
    static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); }
    static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); }
    static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); }
    static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); }
    static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); }
429
    static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); }
Martin Reinecke's avatar
Martin Reinecke committed
430
  };
431

Martin Reinecke's avatar
Martin Reinecke committed
432
template<typename T> using native_simd = vtp<T,vlen<T,16>>;
Martin Reinecke's avatar
fixes    
Martin Reinecke committed
433
434
#else
template<typename T> using native_simd = vtp<T,1>;
Martin Reinecke's avatar
Martin Reinecke committed
435
#endif
436
437
438
#else
template<typename T> using native_simd = vtp<T,1>;
#endif
Martin Reinecke's avatar
Martin Reinecke committed
439
}
440

441
442
443
444
445
446
447
448
using detail_simd::native_simd;
using detail_simd::reduce;
using detail_simd::max;
using detail_simd::abs;
using detail_simd::sqrt;
using detail_simd::any_of;
using detail_simd::none_of;
using detail_simd::all_of;
449
450
451
452
453
454
455
456
457

// since we are explicitly introducing a few names that are also available in
// std::, we need to import them from std::as well, otherwise name resolution
// can fail in certain circumstances.

using std::abs;
using std::sqrt;
using std::max;

458
459
460
}

#endif