Commit a872975f authored by Martin Reinecke's avatar Martin Reinecke
Browse files

vectorize!

parent ebd22a9f
......@@ -1109,8 +1109,8 @@ template<typename T> NOINLINE void radf5(size_t ido, size_t l1, const T * restri
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
template<typename T> NOINLINE void radfg(size_t ido, size_t ip, size_t l1,
T0 * restrict cc, T0 * restrict ch, const T * restrict wa,
const T * restrict csarr)
T * restrict cc, T * restrict ch, const T0 * restrict wa,
const T0 * restrict csarr)
{
const size_t cdim=ip;
size_t ipph=(ip+1)/2;
......@@ -1289,7 +1289,7 @@ template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restri
T tr2=2.*CC(ido-1,1,k);
T cr2=CC(0,0,k)+taur*tr2;
CH(0,k,0)=CC(0,0,k)+tr2;
T ci3=2.*taui*CC(0,2,k);
T ci3=T0(2.)*taui*CC(0,2,k);
PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
}
if (ido==1) return;
......@@ -1736,7 +1736,7 @@ template<typename T0> class fftblue
template<bool bwd, typename T> NOINLINE
void fft(T c[], T0 fct)
{
constexpr int isign = bwd ? 1 : -1;
constexpr T0 isign = bwd ? 1 : -1;
arr<T> akf(2*n2);
/* initialize a_k and FFT it */
......@@ -1812,9 +1812,9 @@ template<typename T0> class fftblue
{
arr<T> tmp(2*n);
tmp[0]=c[0];
tmp[1]=0.;
tmp[1]=c[0]*0.;
memcpy (tmp.data()+2,c+1, (n-1)*sizeof(T));
if ((n&1)==0) tmp[n+1]=0.;
if ((n&1)==0) tmp[n+1]=c[0]*0.;
for (size_t m=2; m<n; m+=2)
{
tmp[2*n-m]=tmp[m];
......@@ -1831,7 +1831,7 @@ template<typename T0> class fftblue
for (size_t m=0; m<n; ++m)
{
tmp[2*m] = c[m];
tmp[2*m+1] = 0.;
tmp[2*m+1] = c[m]*0.;
}
fft<false>(tmp.data(),fct);
c[0] = tmp[0];
......@@ -1946,12 +1946,13 @@ class multi_iter
vector<size_t> pos;
size_t ofs_, len;
int64_t str;
int64_t rem;
bool done_;
public:
multi_iter(const multiarr &arr, size_t idim)
: pos(arr.ndim()-1, 0), ofs_(0), len(arr.size(idim)),
str(arr.stride(idim)), done_(false)
str(arr.stride(idim)), rem(1), done_(false)
{
dim.reserve(arr.ndim()-1);
for (size_t i=0; i<arr.ndim(); ++i)
......@@ -1959,10 +1960,12 @@ class multi_iter
{
dim.push_back({arr.size(i), arr.stride(i)});
done_ = done_ || (arr.size(i)==0);
rem *= arr.size(i);
}
}
void advance()
{
if (--rem<=0) {done_=true; return; }
for (int i=pos.size()-1; i>=0; --i)
{
++pos[i];
......@@ -1978,8 +1981,10 @@ class multi_iter
size_t offset() const { return ofs_; }
size_t length() const { return len; }
int64_t stride() const { return str; }
int64_t remaining() const { return rem; }
};
#if 0
template<typename T> void pocketfft_general_c(int ndim, const size_t *shape,
const int64_t *stride_in, const int64_t *stride_out, int nax,
const size_t *axes, bool forward, const cmplx<T> *data_in,
......@@ -2036,7 +2041,100 @@ template<typename T> void pocketfft_general_c(int ndim, const size_t *shape,
fct = T(1);
}
}
#else
#if defined(__AVX__)
#include <x86intrin.h>
#define HAVE_VECSUPPORT
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{ using type = __m256d; };
template<> struct VTYPE<float>
{ using type = __m256; };
#elif defined(__SSE2__)
#include <x86intrin.h>
#define HAVE_VECSUPPORT
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{ using type = __m128d; };
template<> struct VTYPE<float>
{ using type = __m128; };
#endif
template<typename T> void pocketfft_general_c(int ndim, const size_t *shape,
const int64_t *stride_in, const int64_t *stride_out, int nax,
const size_t *axes, bool forward, const cmplx<T> *data_in,
cmplx<T> *data_out, T fct)
{
// allocate temporary 1D array storage
size_t tmpsize = 0;
for (int iax=0; iax<nax; ++iax)
tmpsize = max(tmpsize, shape[axes[iax]]);
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
arr<cmplx<vtype>> xdata(tmpsize);
auto tdata = (cmplx<T> *)xdata.data();
auto tdatav = xdata.data();
#else
arr<cmplx<T>> xdata(tmpsize);
auto tdata = xdata.data();
#endif
multiarr a_in(ndim, shape, stride_in), a_out(ndim, shape, stride_out);
unique_ptr<pocketfft_c<T>> plan;
for (int iax=0; iax<nax; ++iax)
{
int axis = axes[iax];
multi_iter it_in(a_in, axis), it_out(a_out, axis);
if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_c<T>(it_in.length()));
#ifdef HAVE_VECSUPPORT
while (it_in.remaining()>=vlen)
{
size_t p_i[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_i[i] = it_in.offset(); it_in.advance(); }
size_t p_o[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_o[i] = it_out.offset(); it_out.advance(); }
for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j)
{
tdatav[i].r[j] = data_in[p_i[j]+i*it_in.stride()].r;
tdatav[i].i[j] = data_in[p_i[j]+i*it_in.stride()].i;
}
forward ? plan->forward((vtype *)tdatav, fct)
: plan->backward((vtype *)tdatav, fct);
for (size_t i=0; i<it_out.length(); ++i)
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+i*it_out.stride()].r = tdatav[i].r[j];
data_out[p_o[j]+i*it_out.stride()].i = tdatav[i].i[j];
}
}
#endif
while (it_in.remaining()>0)
{
for (size_t i=0; i<it_in.length(); ++i)
tdata[i] = data_in[it_in.offset() + i*it_in.stride()];
forward ? plan->forward((T *)tdata, fct)
: plan->backward((T *)tdata, fct);
for (size_t i=0; i<it_out.length(); ++i)
data_out[it_out.offset()+i*it_out.stride()] = tdata[i];
it_in.advance();
it_out.advance();
}
// after the first dimension, take data from output array
a_in = a_out;
data_in = data_out;
// factor has been applied, use 1 for remaining axes
fct = T(1);
}
}
#endif
template<typename T> void pocketfft_general_hartley(int ndim, const size_t *shape,
const int64_t *stride_in, const int64_t *stride_out, int nax,
const size_t *axes, const T *data_in, T *data_out, T fct)
......@@ -2044,8 +2142,19 @@ template<typename T> void pocketfft_general_hartley(int ndim, const size_t *shap
// allocate temporary 1D array storage, if necessary
size_t tmpsize = 0;
for (int iax=0; iax<nax; ++iax)
if (shape[axes[iax]] > tmpsize) tmpsize = shape[axes[iax]];
arr<T> tdata(tmpsize);
tmpsize = max(tmpsize, shape[axes[iax]]);
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
arr<vtype> xdata(tmpsize);
auto tdata = (T *)xdata.data();
auto tdatav = xdata.data();
#else
arr<T> xdata(tmpsize);
auto tdata = xdata.data();
#endif
multiarr a_in(ndim, shape, stride_in), a_out(ndim, shape, stride_out);
unique_ptr<pocketfft_r<T>> plan;
......@@ -2055,11 +2164,38 @@ template<typename T> void pocketfft_general_hartley(int ndim, const size_t *shap
multi_iter it_in(a_in, axis), it_out(a_out, axis);
if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_r<T>(it_in.length()));
while (!it_in.done())
#ifdef HAVE_VECSUPPORT
while (it_in.remaining()>=vlen)
{
size_t p_i[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_i[i] = it_in.offset(); it_in.advance(); }
size_t p_o[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_o[i] = it_out.offset(); it_out.advance(); }
for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()];
plan->forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]] = tdatav[0][j];
size_t i=1, i1=1, i2=it_out.length()-1;
for (i=1; i<it_out.length()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j]+tdatav[i+1][j];
data_out[p_o[j]+i2*it_out.stride()] = tdatav[i][j]-tdatav[i+1][j];
}
if (i<it_out.length())
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j];
}
#endif
while (it_in.remaining()>0)
{
for (size_t i=0; i<it_in.length(); ++i)
tdata[i] = data_in[it_in.offset()+i*it_in.stride()];
plan->forward((T *)tdata.data(), fct);
plan->forward((T *)tdata, fct);
// Hartley order
data_out[it_out.offset()] = tdata[0];
size_t i=1, i1=1, i2=it_out.length()-1;
......@@ -2086,18 +2222,67 @@ template<typename T> void pocketfft_general_r2c(int ndim, const size_t *shape,
const T *data_in, cmplx<T> *data_out, T fct)
{
// allocate temporary 1D array storage
arr<T> tdata(shape[axis]);
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
arr<vtype> xdata(shape[axis]);
auto tdata = (T *)xdata.data();
auto tdatav = xdata.data();
#else
arr<T> xdata(shape[axis]);
auto tdata = xdata.data();
#endif
multiarr a_in(ndim, shape, stride_in), a_out(ndim, shape, stride_out);
pocketfft_r<T> plan(shape[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape[axis], s_i=it_in.stride(), s_o=it_out.stride();
while (!it_in.done())
#ifdef HAVE_VECSUPPORT
while (it_in.remaining()>=vlen)
{
size_t p_i[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_i[i] = it_in.offset(); it_in.advance(); }
size_t p_o[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_o[i] = it_out.offset(); it_out.advance(); }
for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()];
plan.forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]].r = tdatav[0][j];
data_out[p_o[j]].i = 0.;
}
size_t i;
for (i=1; i<len-1; i+=2)
{
size_t io = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+io*it_out.stride()].r = tdatav[i][j];
data_out[p_o[j]+io*it_out.stride()].i = tdatav[i+1][j];
}
}
if (i<len)
{
size_t io = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+io*it_out.stride()].r = tdatav[i][j];
data_out[p_o[j]+io*it_out.stride()].i = 0.;
}
}
}
#endif
while (it_in.remaining()>0)
{
const T *d_i = data_in+it_in.offset();
cmplx<T> *d_o = data_out+it_out.offset();
for (size_t i=0; i<len; ++i)
tdata[i] = d_i[i*s_i];
plan.forward(tdata.data(), fct);
plan.forward(tdata, fct);
d_o[0].r = tdata[0];
d_o[0].i = 0.;
size_t i;
......@@ -2122,11 +2307,54 @@ template<typename T> void pocketfft_general_c2r(int ndim, const size_t *shape_ou
const cmplx<T> *data_in, T *data_out, T fct)
{
// allocate temporary 1D array storage
arr<T> tdata(shape_out[axis]);
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
arr<vtype> xdata(shape_out[axis]);
auto tdata = (T *)xdata.data();
auto tdatav = xdata.data();
#else
arr<T> xdata(shape_out[axis]);
auto tdata = xdata.data();
#endif
multiarr a_in(ndim, shape_out, stride_in), a_out(ndim, shape_out, stride_out);
pocketfft_r<T> plan(shape_out[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape_out[axis], s_i=it_in.stride(), s_o=it_out.stride();
#ifdef HAVE_VECSUPPORT
while (it_in.remaining()>=vlen)
{
size_t p_i[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_i[i] = it_in.offset(); it_in.advance(); }
size_t p_o[vlen];
for (size_t i=0; i<vlen; ++i)
{ p_o[i] = it_out.offset(); it_out.advance(); }
for (size_t j=0; j<vlen; ++j)
tdatav[0][j]=data_in[p_i[j]].r;
size_t i;
for (i=1; i<len-1; i+=2)
{
size_t ii = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
{
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
tdatav[i+1][j] = data_in[p_i[j]+ii*s_i].i;
}
}
if (i<len)
{
size_t ii = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
}
plan.backward(tdatav, fct);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o] = tdatav[i][j];
}
#endif
while (!it_in.done())
{
const cmplx<T> *d_i = data_in+it_in.offset();
......@@ -2144,7 +2372,7 @@ template<typename T> void pocketfft_general_c2r(int ndim, const size_t *shape_ou
size_t ii = (i+1)/2;
tdata[i] = d_i[ii*s_i].r;
}
plan.backward(tdata.data(), fct);
plan.backward(tdata, fct);
for (size_t i=0; i<len; ++i)
d_o[i*s_o] = tdata[i];
it_in.advance();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment