Commit e0d1c40c authored by Martin Reinecke's avatar Martin Reinecke
Browse files

make FFT use experimental::simd

parent 462974b0
......@@ -54,7 +54,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <array>
#endif
#include "mr_util/threading.h"
#include "mr_util/simd.h"
#include <experimental/simd>
#include "mr_util/cmplx.h"
#include "mr_util/aligned_array.h"
#include "mr_util/unity_roots.h"
......@@ -75,12 +75,11 @@ namespace mr {
namespace detail_fft {
using namespace std;
using namespace std::experimental;
using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;
template<typename T> using arr=aligned_array<T>;
constexpr bool FORWARD = true,
BACKWARD = false;
......@@ -103,8 +102,8 @@ constexpr bool FORWARD = true,
template<typename T> struct VLEN { static constexpr size_t val=1; };
#ifndef MRUTIL_NO_VECTORS
template<> struct VLEN<float> { static constexpr size_t val=vtp<float>::vlen; };
template<> struct VLEN<double> { static constexpr size_t val=vtp<double>::vlen; };
template<> struct VLEN<float> { static constexpr size_t val=simd<float>::size(); };
template<> struct VLEN<double> { static constexpr size_t val=simd<double>::size(); };
#endif
template<typename T> inline void PM(T &a, T &b, T c, T d)
......@@ -158,7 +157,7 @@ struct util // hack to avoid duplicate symbols
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
static POCKETFFT_NOINLINE size_t good_size_Cmplx(size_t n)
static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
{
if (n<=12) return n;
......@@ -288,7 +287,7 @@ template<typename T0> class cfftp
};
size_t length;
arr<Cmplx<T0>> mem;
aligned_array<Cmplx<T0>> mem;
vector<fctdata> fact;
void add_factor(size_t factor)
......@@ -785,7 +784,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
{ return ch[a+idl1*b]; };
arr<Cmplx<T0>> wal(ip);
aligned_array<Cmplx<T0>> wal(ip);
wal[0] = Cmplx<T0>(1., 0.);
for (size_t i=1; i<ip; ++i)
wal[i]=Cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
......@@ -878,7 +877,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
{
if (length==1) { c[0]*=fct; return; }
size_t l1=1;
arr<T> ch(length);
aligned_array<T> ch(length);
T *p1=c, *p2=ch.data();
for(size_t k1=0; k1<fact.size(); k1++)
......@@ -1014,7 +1013,7 @@ template<typename T0> class rfftp
};
size_t length;
arr<T0> mem;
aligned_array<T0> mem;
vector<fctdata> fact;
void add_factor(size_t factor)
......@@ -1683,7 +1682,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
{
if (length==1) { c[0]*=fct; return; }
size_t n=length, nf=fact.size();
arr<T> ch(n);
aligned_array<T> ch(n);
T *p1=c, *p2=ch.data();
if (r2hc)
......@@ -1818,12 +1817,12 @@ template<typename T0> class fftblue
private:
size_t n, n2;
cfftp<T0> plan;
arr<Cmplx<T0>> mem;
aligned_array<Cmplx<T0>> mem;
Cmplx<T0> *bk, *bkf;
template<bool fwd, typename T> void fft(Cmplx<T> c[], T0 fct) const
{
arr<Cmplx<T>> akf(n2);
aligned_array<Cmplx<T>> akf(n2);
/* initialize a_k and FFT it */
for (size_t m=0; m<n; ++m)
......@@ -1854,7 +1853,7 @@ template<typename T0> class fftblue
public:
POCKETFFT_NOINLINE fftblue(size_t length)
: n(length), n2(util::good_size_Cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
: n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
bk(mem.data()), bkf(mem.data()+n)
{
/* initialize b_k */
......@@ -1870,7 +1869,7 @@ template<typename T0> class fftblue
}
/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
arr<Cmplx<T0>> tbkf(n2);
aligned_array<Cmplx<T0>> tbkf(n2);
T0 xn2 = T0(1)/T0(n2);
tbkf[0] = bk[0]*xn2;
for (size_t m=1; m<n; ++m)
......@@ -1887,7 +1886,7 @@ template<typename T0> class fftblue
template<typename T> void exec_r(T c[], T0 fct, bool fwd)
{
arr<Cmplx<T>> tmp(n);
aligned_array<Cmplx<T>> tmp(n);
if (fwd)
{
auto zero = T0(0)*c[0];
......@@ -1935,7 +1934,7 @@ template<typename T0> class pocketfft_c
return;
}
double comp1 = util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size_Cmplx(2*length-1));
double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
......@@ -1972,7 +1971,7 @@ template<typename T0> class pocketfft_r
return;
}
double comp1 = 0.5*util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size_Cmplx(2*length-1));
double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
......@@ -2007,7 +2006,7 @@ template<typename T0> class T_dct1
size_t N=fftplan.length(), n=N/2+1;
if (ortho)
{ c[0]*=sqrt2; c[n-1]*=sqrt2; }
arr<T> tmp(N);
aligned_array<T> tmp(N);
tmp[0] = c[0];
for (size_t i=1; i<n; ++i)
tmp[i] = tmp[N-i] = c[i];
......@@ -2035,7 +2034,7 @@ template<typename T0> class T_dst1
bool /*ortho*/, int /*type*/, bool /*cosine*/) const
{
size_t N=fftplan.length(), n=N/2-1;
arr<T> tmp(N);
aligned_array<T> tmp(N);
tmp[0] = tmp[n+1] = c[0]*0;
for (size_t i=0; i<n; ++i)
{ tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
......@@ -2123,7 +2122,7 @@ template<typename T0> class T_dcst4
size_t N;
unique_ptr<pocketfft_c<T0>> fft;
unique_ptr<pocketfft_r<T0>> rfft;
arr<Cmplx<T0>> C2;
aligned_array<Cmplx<T0>> C2;
public:
POCKETFFT_NOINLINE T_dcst4(size_t length)
......@@ -2153,7 +2152,7 @@ template<typename T0> class T_dcst4
// and is released under the 3-clause BSD license with friendly
// permission of Matteo Frigo and Steven G. Johnson.
arr<T> y(N);
aligned_array<T> y(N);
{
size_t i=0, m=n2;
for (; m<N; ++i, m+=4)
......@@ -2196,7 +2195,7 @@ template<typename T0> class T_dcst4
{
// even length algorithm from
// https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
arr<Cmplx<T>> y(n2);
aligned_array<Cmplx<T>> y(n2);
for(size_t i=0; i<n2; ++i)
{
y[i].Set(c[2*i],c[N-1-2*i]);
......@@ -2502,26 +2501,26 @@ template <typename T> using vtype_t = typename VTYPE<T>::type;
#ifndef POCKETFFT_NO_VECTORS
// Map a scalar type to its SIMD counterpart (std::experimental::simd).
// The scrape retained both the old vtp<>-based aliases and the new simd<>
// aliases; only the post-change simd<> versions are kept here.
template<> struct VTYPE<float>
  {
  using type = simd<float>;
  };
template<> struct VTYPE<double>
  {
  using type = simd<double>;
  };
template<> struct VTYPE<long double>
  {
  // long double has no hardware SIMD; use a 1-lane fixed-size simd so the
  // generic code still compiles and behaves like scalar arithmetic.
  using type = simd<long double, simd_abi::fixed_size<1>>;
  };
#endif
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
template<typename T> aligned_array<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize)
{
auto othersize = util::prod(shape)/axsize;
auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
return arr<char>(tmpsize*elemsize);
return aligned_array<char>(tmpsize*elemsize);
}
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
template<typename T> aligned_array<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize)
{
size_t fullsize=util::prod(shape);
......@@ -2533,7 +2532,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
if (sz>tmpsize) tmpsize=sz;
}
return arr<char>(tmpsize*elemsize);
return aligned_array<char>(tmpsize*elemsize);
}
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
......@@ -2542,8 +2541,8 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
for (size_t i=0; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
{
dst[i].r.Set(j,src[it.iofs(j,i)].r);
dst[i].i.Set(j,src[it.iofs(j,i)].i);
dst[i].r[j] = src[it.iofs(j,i)].r;
dst[i].i[j] = src[it.iofs(j,i)].i;
}
}
......@@ -2786,11 +2785,17 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
if (forward)
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].SplitConj(tdatav[i][j], tdatav[i+1][j]);
{
tdatav[i ][j] = in[it.iofs(j,ii)].r;
tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
}
else
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].Split(tdatav[i][j], tdatav[i+1][j]);
{
tdatav[i ][j] = in[it.iofs(j,ii)].r;
tdatav[i+1][j] = in[it.iofs(j,ii)].i;
}
if (i<len)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = in[it.iofs(j,ii)].r;
......@@ -2808,10 +2813,16 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].SplitConj(tdata[i], tdata[i+1]);
{
tdata[i ] = in[it.iofs(ii)].r;
tdata[i+1] = -in[it.iofs(ii)].i;
}
else
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].Split(tdata[i], tdata[i+1]);
{
tdata[i ] = in[it.iofs(ii)].r;
tdata[i+1] = in[it.iofs(ii)].i;
}
if (i<len)
tdata[i] = in[it.iofs(ii)].r;
}
......@@ -2953,7 +2964,7 @@ template<typename T> void c2r(const shape_t &shape_out,
for (int i=int(shape_in.size())-2; i>=0; --i)
stride_inter[size_t(i)] =
stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
arr<complex<T>> tmp(nval);
aligned_array<complex<T>> tmp(nval);
auto newaxes = shape_t({axes.begin(), --axes.end()});
c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
T(1), nthreads);
......@@ -2997,7 +3008,7 @@ template<typename T> void r2r_genuine_hartley(const shape_t &shape,
util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
shape_t tshp(shape);
tshp[axes.back()] = tshp[axes.back()]/2+1;
arr<complex<T>> tdata(util::prod(tshp));
aligned_array<complex<T>> tdata(util::prod(tshp));
stride_t tstride(shape.size());
tstride.back()=sizeof(complex<T>);
for (size_t i=tstride.size()-1; i>0; --i)
......
/*
* This file is part of the MR utility library.
*
* This code is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this code; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Copyright (C) 2019 Max-Planck-Society
Author: Martin Reinecke */
#ifndef MRUTIL_SIMD_H
#define MRUTIL_SIMD_H
// only enable SIMD support for gcc>=5.0 and clang>=5.0
#ifndef MRUTIL_NO_SIMD
#define MRUTIL_NO_SIMD
#if defined(__INTEL_COMPILER)
// do nothing. This is necessary because this compiler also sets __GNUC__.
#elif defined(__clang__)
#if __clang_major__>=5
#undef MRUTIL_NO_SIMD
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef MRUTIL_NO_SIMD
#endif
#endif
#endif
#ifndef MRUTIL_NO_SIMD
#include <cstdlib>
#include <cmath>
namespace mr {
namespace detail_simd {
using namespace std;
#if (defined(__AVX512F__))
constexpr size_t vbytes = 64;
#elif (defined(__AVX__))
constexpr size_t vbytes = 32;
#elif (defined(__SSE2__))
constexpr size_t vbytes = 16;
#elif (defined(__VSX__))
constexpr size_t vbytes = 16;
#else
// Fallback: no recognized SIMD ISA macro. Use 16-byte vectors; the compiler
// synthesizes the arithmetic in scalar code, so results stay correct.
// (Previously vbytes was left undeclared here, which broke the build on any
// target not listed above even though MRUTIL_NO_SIMD was not set.)
constexpr size_t vbytes = 16;
#endif

/// Minimal SIMD vector of `len` elements of type T, built on the GCC/Clang
/// vector extension. Supports elementwise +,-,*,/ (plain and compound),
/// unary minus, elementwise exp()/sqrt(), and per-lane access via
/// Set()/operator[]. `len` must be a power of two.
template<typename T, size_t len=vbytes/sizeof(T)> class vtp
  {
  protected:
    using Tv [[gnu::vector_size (len*sizeof(T))]] = T;
    static_assert((len>0) && ((len&(len-1))==0), "bad vector length");
    Tv v;

    // Broadcast a scalar into all lanes with an explicit per-lane store.
    // The previous shortcut `v = v*0 + other;` read the uninitialized
    // member `v`: if the garbage bits happened to encode NaN or Inf,
    // `v*0` evaluated to NaN and every lane became NaN instead of `other`.
    void from_scalar(T other)
      { for (size_t i=0; i<len; ++i) v[i]=other; }

  public:
    static constexpr size_t vlen=len;

    vtp () {}   // deliberately uninitialized: this type lives on hot paths
    // implicit on purpose: enables mixed scalar/vector expressions (e.g. b*a)
    vtp(T other)
      { from_scalar(other); }
    vtp(const Tv &other)
      : v(other) {}
    vtp(const vtp &other) = default;
    vtp &operator=(T other)
      { from_scalar(other); return *this; }

    vtp operator-() const { return vtp(-v); }
    vtp operator+(vtp other) const { return vtp(v+other.v); }
    vtp operator-(vtp other) const { return vtp(v-other.v); }
    vtp operator*(vtp other) const { return vtp(v*other.v); }
    vtp operator/(vtp other) const { return vtp(v/other.v); }
    vtp &operator+=(vtp other) { v+=other.v; return *this; }
    vtp &operator-=(vtp other) { v-=other.v; return *this; }
    vtp &operator*=(vtp other) { v*=other.v; return *this; }
    vtp &operator/=(vtp other) { v/=other.v; return *this; }

    /// Elementwise std::exp (no vectorized libm is assumed to exist).
    inline vtp exp() const
      {
      vtp res;
      for (size_t i=0; i<len; ++i) res.v[i] = std::exp(v[i]);
      return res;
      }
    /// Elementwise std::sqrt.
    vtp sqrt() const
      {
      vtp res;
      for (size_t i=0; i<len; ++i) res.v[i] = std::sqrt(v[i]);
      return res;
      }
    /// Set lane i to val.
    template<typename I> void Set (I i, T val) { v[i]=val; }
    /// Read lane i (by value).
    template<typename I> T operator[](I i) const { return v[i]; }
  };
/// Scalar-times-vector: broadcast `a` into all lanes and reuse the member
/// elementwise product (multiplication is commutative for these types).
template<typename T, size_t len> vtp<T, len> operator*(T a, vtp<T, len> b)
  {
  vtp<T, len> lhs(a);   // scalar broadcast via the converting constructor
  return lhs * b;
  }
}
using detail_simd::vtp;
}
#endif
#endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment