Commit e0d1c40c authored by Martin Reinecke's avatar Martin Reinecke
Browse files

make FFT use experimental::simd

parent 462974b0
...@@ -54,7 +54,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ...@@ -54,7 +54,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <array> #include <array>
#endif #endif
#include "mr_util/threading.h" #include "mr_util/threading.h"
#include "mr_util/simd.h" #include <experimental/simd>
#include "mr_util/cmplx.h" #include "mr_util/cmplx.h"
#include "mr_util/aligned_array.h" #include "mr_util/aligned_array.h"
#include "mr_util/unity_roots.h" #include "mr_util/unity_roots.h"
...@@ -75,12 +75,11 @@ namespace mr { ...@@ -75,12 +75,11 @@ namespace mr {
namespace detail_fft { namespace detail_fft {
using namespace std; using namespace std;
using namespace std::experimental;
using shape_t = vector<size_t>; using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>; using stride_t = vector<ptrdiff_t>;
template<typename T> using arr=aligned_array<T>;
constexpr bool FORWARD = true, constexpr bool FORWARD = true,
BACKWARD = false; BACKWARD = false;
...@@ -103,8 +102,8 @@ constexpr bool FORWARD = true, ...@@ -103,8 +102,8 @@ constexpr bool FORWARD = true,
template<typename T> struct VLEN { static constexpr size_t val=1; }; template<typename T> struct VLEN { static constexpr size_t val=1; };
#ifndef MRUTIL_NO_VECTORS #ifndef MRUTIL_NO_VECTORS
template<> struct VLEN<float> { static constexpr size_t val=vtp<float>::vlen; }; template<> struct VLEN<float> { static constexpr size_t val=simd<float>::size(); };
template<> struct VLEN<double> { static constexpr size_t val=vtp<double>::vlen; }; template<> struct VLEN<double> { static constexpr size_t val=simd<double>::size(); };
#endif #endif
template<typename T> inline void PM(T &a, T &b, T c, T d) template<typename T> inline void PM(T &a, T &b, T c, T d)
...@@ -158,7 +157,7 @@ struct util // hack to avoid duplicate symbols ...@@ -158,7 +157,7 @@ struct util // hack to avoid duplicate symbols
} }
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
static POCKETFFT_NOINLINE size_t good_size_Cmplx(size_t n) static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
{ {
if (n<=12) return n; if (n<=12) return n;
...@@ -288,7 +287,7 @@ template<typename T0> class cfftp ...@@ -288,7 +287,7 @@ template<typename T0> class cfftp
}; };
size_t length; size_t length;
arr<Cmplx<T0>> mem; aligned_array<Cmplx<T0>> mem;
vector<fctdata> fact; vector<fctdata> fact;
void add_factor(size_t factor) void add_factor(size_t factor)
...@@ -785,7 +784,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip, ...@@ -785,7 +784,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
auto CH2 = [ch, idl1](size_t a, size_t b) -> const T& auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
{ return ch[a+idl1*b]; }; { return ch[a+idl1*b]; };
arr<Cmplx<T0>> wal(ip); aligned_array<Cmplx<T0>> wal(ip);
wal[0] = Cmplx<T0>(1., 0.); wal[0] = Cmplx<T0>(1., 0.);
for (size_t i=1; i<ip; ++i) for (size_t i=1; i<ip; ++i)
wal[i]=Cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i); wal[i]=Cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
...@@ -878,7 +877,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const ...@@ -878,7 +877,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
{ {
if (length==1) { c[0]*=fct; return; } if (length==1) { c[0]*=fct; return; }
size_t l1=1; size_t l1=1;
arr<T> ch(length); aligned_array<T> ch(length);
T *p1=c, *p2=ch.data(); T *p1=c, *p2=ch.data();
for(size_t k1=0; k1<fact.size(); k1++) for(size_t k1=0; k1<fact.size(); k1++)
...@@ -1014,7 +1013,7 @@ template<typename T0> class rfftp ...@@ -1014,7 +1013,7 @@ template<typename T0> class rfftp
}; };
size_t length; size_t length;
arr<T0> mem; aligned_array<T0> mem;
vector<fctdata> fact; vector<fctdata> fact;
void add_factor(size_t factor) void add_factor(size_t factor)
...@@ -1683,7 +1682,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, ...@@ -1683,7 +1682,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
{ {
if (length==1) { c[0]*=fct; return; } if (length==1) { c[0]*=fct; return; }
size_t n=length, nf=fact.size(); size_t n=length, nf=fact.size();
arr<T> ch(n); aligned_array<T> ch(n);
T *p1=c, *p2=ch.data(); T *p1=c, *p2=ch.data();
if (r2hc) if (r2hc)
...@@ -1818,12 +1817,12 @@ template<typename T0> class fftblue ...@@ -1818,12 +1817,12 @@ template<typename T0> class fftblue
private: private:
size_t n, n2; size_t n, n2;
cfftp<T0> plan; cfftp<T0> plan;
arr<Cmplx<T0>> mem; aligned_array<Cmplx<T0>> mem;
Cmplx<T0> *bk, *bkf; Cmplx<T0> *bk, *bkf;
template<bool fwd, typename T> void fft(Cmplx<T> c[], T0 fct) const template<bool fwd, typename T> void fft(Cmplx<T> c[], T0 fct) const
{ {
arr<Cmplx<T>> akf(n2); aligned_array<Cmplx<T>> akf(n2);
/* initialize a_k and FFT it */ /* initialize a_k and FFT it */
for (size_t m=0; m<n; ++m) for (size_t m=0; m<n; ++m)
...@@ -1854,7 +1853,7 @@ template<typename T0> class fftblue ...@@ -1854,7 +1853,7 @@ template<typename T0> class fftblue
public: public:
POCKETFFT_NOINLINE fftblue(size_t length) POCKETFFT_NOINLINE fftblue(size_t length)
: n(length), n2(util::good_size_Cmplx(n*2-1)), plan(n2), mem(n+n2/2+1), : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
bk(mem.data()), bkf(mem.data()+n) bk(mem.data()), bkf(mem.data()+n)
{ {
/* initialize b_k */ /* initialize b_k */
...@@ -1870,7 +1869,7 @@ template<typename T0> class fftblue ...@@ -1870,7 +1869,7 @@ template<typename T0> class fftblue
} }
/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
arr<Cmplx<T0>> tbkf(n2); aligned_array<Cmplx<T0>> tbkf(n2);
T0 xn2 = T0(1)/T0(n2); T0 xn2 = T0(1)/T0(n2);
tbkf[0] = bk[0]*xn2; tbkf[0] = bk[0]*xn2;
for (size_t m=1; m<n; ++m) for (size_t m=1; m<n; ++m)
...@@ -1887,7 +1886,7 @@ template<typename T0> class fftblue ...@@ -1887,7 +1886,7 @@ template<typename T0> class fftblue
template<typename T> void exec_r(T c[], T0 fct, bool fwd) template<typename T> void exec_r(T c[], T0 fct, bool fwd)
{ {
arr<Cmplx<T>> tmp(n); aligned_array<Cmplx<T>> tmp(n);
if (fwd) if (fwd)
{ {
auto zero = T0(0)*c[0]; auto zero = T0(0)*c[0];
...@@ -1935,7 +1934,7 @@ template<typename T0> class pocketfft_c ...@@ -1935,7 +1934,7 @@ template<typename T0> class pocketfft_c
return; return;
} }
double comp1 = util::cost_guess(length); double comp1 = util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size_Cmplx(2*length-1)); double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */ comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
...@@ -1972,7 +1971,7 @@ template<typename T0> class pocketfft_r ...@@ -1972,7 +1971,7 @@ template<typename T0> class pocketfft_r
return; return;
} }
double comp1 = 0.5*util::cost_guess(length); double comp1 = 0.5*util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size_Cmplx(2*length-1)); double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */ comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
...@@ -2007,7 +2006,7 @@ template<typename T0> class T_dct1 ...@@ -2007,7 +2006,7 @@ template<typename T0> class T_dct1
size_t N=fftplan.length(), n=N/2+1; size_t N=fftplan.length(), n=N/2+1;
if (ortho) if (ortho)
{ c[0]*=sqrt2; c[n-1]*=sqrt2; } { c[0]*=sqrt2; c[n-1]*=sqrt2; }
arr<T> tmp(N); aligned_array<T> tmp(N);
tmp[0] = c[0]; tmp[0] = c[0];
for (size_t i=1; i<n; ++i) for (size_t i=1; i<n; ++i)
tmp[i] = tmp[N-i] = c[i]; tmp[i] = tmp[N-i] = c[i];
...@@ -2035,7 +2034,7 @@ template<typename T0> class T_dst1 ...@@ -2035,7 +2034,7 @@ template<typename T0> class T_dst1
bool /*ortho*/, int /*type*/, bool /*cosine*/) const bool /*ortho*/, int /*type*/, bool /*cosine*/) const
{ {
size_t N=fftplan.length(), n=N/2-1; size_t N=fftplan.length(), n=N/2-1;
arr<T> tmp(N); aligned_array<T> tmp(N);
tmp[0] = tmp[n+1] = c[0]*0; tmp[0] = tmp[n+1] = c[0]*0;
for (size_t i=0; i<n; ++i) for (size_t i=0; i<n; ++i)
{ tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; } { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
...@@ -2123,7 +2122,7 @@ template<typename T0> class T_dcst4 ...@@ -2123,7 +2122,7 @@ template<typename T0> class T_dcst4
size_t N; size_t N;
unique_ptr<pocketfft_c<T0>> fft; unique_ptr<pocketfft_c<T0>> fft;
unique_ptr<pocketfft_r<T0>> rfft; unique_ptr<pocketfft_r<T0>> rfft;
arr<Cmplx<T0>> C2; aligned_array<Cmplx<T0>> C2;
public: public:
POCKETFFT_NOINLINE T_dcst4(size_t length) POCKETFFT_NOINLINE T_dcst4(size_t length)
...@@ -2153,7 +2152,7 @@ template<typename T0> class T_dcst4 ...@@ -2153,7 +2152,7 @@ template<typename T0> class T_dcst4
// and is released under the 3-clause BSD license with friendly // and is released under the 3-clause BSD license with friendly
// permission of Matteo Frigo and Steven G. Johnson. // permission of Matteo Frigo and Steven G. Johnson.
arr<T> y(N); aligned_array<T> y(N);
{ {
size_t i=0, m=n2; size_t i=0, m=n2;
for (; m<N; ++i, m+=4) for (; m<N; ++i, m+=4)
...@@ -2196,7 +2195,7 @@ template<typename T0> class T_dcst4 ...@@ -2196,7 +2195,7 @@ template<typename T0> class T_dcst4
{ {
// even length algorithm from // even length algorithm from
// https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
arr<Cmplx<T>> y(n2); aligned_array<Cmplx<T>> y(n2);
for(size_t i=0; i<n2; ++i) for(size_t i=0; i<n2; ++i)
{ {
y[i].Set(c[2*i],c[N-1-2*i]); y[i].Set(c[2*i],c[N-1-2*i]);
...@@ -2502,26 +2501,26 @@ template <typename T> using vtype_t = typename VTYPE<T>::type; ...@@ -2502,26 +2501,26 @@ template <typename T> using vtype_t = typename VTYPE<T>::type;
#ifndef POCKETFFT_NO_VECTORS #ifndef POCKETFFT_NO_VECTORS
template<> struct VTYPE<float> template<> struct VTYPE<float>
{ {
using type = vtp<float>; using type = simd<float>;
}; };
template<> struct VTYPE<double> template<> struct VTYPE<double>
{ {
using type = vtp<double>; using type = simd<double>;
}; };
template<> struct VTYPE<long double> template<> struct VTYPE<long double>
{ {
using type =vtp<long double>; using type =simd<long double, simd_abi::fixed_size<1>>;
}; };
#endif #endif
template<typename T> arr<char> alloc_tmp(const shape_t &shape, template<typename T> aligned_array<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize) size_t axsize, size_t elemsize)
{ {
auto othersize = util::prod(shape)/axsize; auto othersize = util::prod(shape)/axsize;
auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1); auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
return arr<char>(tmpsize*elemsize); return aligned_array<char>(tmpsize*elemsize);
} }
template<typename T> arr<char> alloc_tmp(const shape_t &shape, template<typename T> aligned_array<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize) const shape_t &axes, size_t elemsize)
{ {
size_t fullsize=util::prod(shape); size_t fullsize=util::prod(shape);
...@@ -2533,7 +2532,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape, ...@@ -2533,7 +2532,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1); auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
if (sz>tmpsize) tmpsize=sz; if (sz>tmpsize) tmpsize=sz;
} }
return arr<char>(tmpsize*elemsize); return aligned_array<char>(tmpsize*elemsize);
} }
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
...@@ -2542,8 +2541,8 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, ...@@ -2542,8 +2541,8 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
for (size_t i=0; i<it.length_in(); ++i) for (size_t i=0; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
{ {
dst[i].r.Set(j,src[it.iofs(j,i)].r); dst[i].r[j] = src[it.iofs(j,i)].r;
dst[i].i.Set(j,src[it.iofs(j,i)].i); dst[i].i[j] = src[it.iofs(j,i)].i;
} }
} }
...@@ -2786,11 +2785,17 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r( ...@@ -2786,11 +2785,17 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
if (forward) if (forward)
for (; i<len-1; i+=2, ++ii) for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].SplitConj(tdatav[i][j], tdatav[i+1][j]); {
tdatav[i ][j] = in[it.iofs(j,ii)].r;
tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
}
else else
for (; i<len-1; i+=2, ++ii) for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].Split(tdatav[i][j], tdatav[i+1][j]); {
tdatav[i ][j] = in[it.iofs(j,ii)].r;
tdatav[i+1][j] = in[it.iofs(j,ii)].i;
}
if (i<len) if (i<len)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = in[it.iofs(j,ii)].r; tdatav[i][j] = in[it.iofs(j,ii)].r;
...@@ -2808,10 +2813,16 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r( ...@@ -2808,10 +2813,16 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
size_t i=1, ii=1; size_t i=1, ii=1;
if (forward) if (forward)
for (; i<len-1; i+=2, ++ii) for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].SplitConj(tdata[i], tdata[i+1]); {
tdata[i ] = in[it.iofs(ii)].r;
tdata[i+1] = -in[it.iofs(ii)].i;
}
else else
for (; i<len-1; i+=2, ++ii) for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].Split(tdata[i], tdata[i+1]); {
tdata[i ] = in[it.iofs(ii)].r;
tdata[i+1] = in[it.iofs(ii)].i;
}
if (i<len) if (i<len)
tdata[i] = in[it.iofs(ii)].r; tdata[i] = in[it.iofs(ii)].r;
} }
...@@ -2953,7 +2964,7 @@ template<typename T> void c2r(const shape_t &shape_out, ...@@ -2953,7 +2964,7 @@ template<typename T> void c2r(const shape_t &shape_out,
for (int i=int(shape_in.size())-2; i>=0; --i) for (int i=int(shape_in.size())-2; i>=0; --i)
stride_inter[size_t(i)] = stride_inter[size_t(i)] =
stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]); stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
arr<complex<T>> tmp(nval); aligned_array<complex<T>> tmp(nval);
auto newaxes = shape_t({axes.begin(), --axes.end()}); auto newaxes = shape_t({axes.begin(), --axes.end()});
c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(), c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
T(1), nthreads); T(1), nthreads);
...@@ -2997,7 +3008,7 @@ template<typename T> void r2r_genuine_hartley(const shape_t &shape, ...@@ -2997,7 +3008,7 @@ template<typename T> void r2r_genuine_hartley(const shape_t &shape,
util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
shape_t tshp(shape); shape_t tshp(shape);
tshp[axes.back()] = tshp[axes.back()]/2+1; tshp[axes.back()] = tshp[axes.back()]/2+1;
arr<complex<T>> tdata(util::prod(tshp)); aligned_array<complex<T>> tdata(util::prod(tshp));
stride_t tstride(shape.size()); stride_t tstride(shape.size());
tstride.back()=sizeof(complex<T>); tstride.back()=sizeof(complex<T>);
for (size_t i=tstride.size()-1; i>0; --i) for (size_t i=tstride.size()-1; i>0; --i)
......
/*
* This file is part of the MR utility library.
*
* This code is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this code; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Copyright (C) 2019 Max-Planck-Society
Author: Martin Reinecke */
#ifndef MRUTIL_SIMD_H
#define MRUTIL_SIMD_H
// only enable SIMD support for gcc>=5.0 and clang>=5.0
#ifndef MRUTIL_NO_SIMD
#define MRUTIL_NO_SIMD
#if defined(__INTEL_COMPILER)
// do nothing. This is necessary because this compiler also sets __GNUC__.
#elif defined(__clang__)
#if __clang_major__>=5
#undef MRUTIL_NO_SIMD
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef MRUTIL_NO_SIMD
#endif
#endif
#endif
#ifndef MRUTIL_NO_SIMD
#include <cstdlib>
#include <cmath>
namespace mr {
namespace detail_simd {
using namespace std;
#if (defined(__AVX512F__))
constexpr size_t vbytes = 64;
#elif (defined(__AVX__))
constexpr size_t vbytes = 32;
#elif (defined(__SSE2__))
constexpr size_t vbytes = 16;
#elif (defined(__VSX__))
constexpr size_t vbytes = 16;
#endif
template<typename T, size_t len=vbytes/sizeof(T)> class vtp
{
protected:
using Tv [[gnu::vector_size (len*sizeof(T))]] = T;
static_assert((len>0) && ((len&(len-1))==0), "bad vector length");
Tv v;
void from_scalar(T other)
// { for (size_t i=0; i<len; ++i) v[i]=other; }
{ v=v*0+other; }
public:
static constexpr size_t vlen=len;
vtp () {}
vtp(T other)
{ from_scalar(other); }
vtp(const Tv &other)
: v(other) {}
vtp(const vtp &other) = default;
vtp &operator=(T other)
{ from_scalar(other); return *this; }
vtp operator-() const { return vtp(-v); }
vtp operator+(vtp other) const { return vtp(v+other.v); }
vtp operator-(vtp other) const { return vtp(v-other.v); }
vtp operator*(vtp other) const { return vtp(v*other.v); }
vtp operator/(vtp other) const { return vtp(v/other.v); }
vtp &operator+=(vtp other) { v+=other.v; return *this; }
vtp &operator-=(vtp other) { v-=other.v; return *this; }
vtp &operator*=(vtp other) { v*=other.v; return *this; }
vtp &operator/=(vtp other) { v/=other.v; return *this; }
inline vtp exp() const
{
vtp res;
for (size_t i=0; i<len; ++i) res.v[i] = std::exp(v[i]);
return res;
}
vtp sqrt() const
{
vtp res;
for (size_t i=0; i<len; ++i) res.v[i] = std::sqrt(v[i]);
return res;
}
template<typename I> void Set (I i, T val) { v[i]=val; }
template<typename I> T operator[](I i) const { return v[i]; }
};
template<typename T, size_t len> vtp<T, len> operator*(T a, vtp<T, len> b)
{ return b*a; }
}
using detail_simd::vtp;
}
#endif
#endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment