Commit 09034339 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

nicer looking vector support

parent 09e1d7c9
...@@ -2035,57 +2035,56 @@ class multi_iter ...@@ -2035,57 +2035,56 @@ class multi_iter
#if (defined(__AVX512F__)) #if (defined(__AVX512F__))
#include <x86intrin.h>
#define HAVE_VECSUPPORT #define HAVE_VECSUPPORT
template<typename T> struct VTYPE{}; constexpr int VBYTELEN=64;
template<> struct VTYPE<double>
{ using type = __m512d; };
template<> struct VTYPE<float>
{ using type = __m512; };
#elif (defined(__AVX__)) #elif (defined(__AVX__))
#include <x86intrin.h>
#define HAVE_VECSUPPORT #define HAVE_VECSUPPORT
constexpr int VBYTELEN=32;
#elif (defined(__SSE2__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=16;
#endif
#if defined(HAVE_VECSUPPORT)
template<typename T> struct VTYPE{}; template<typename T> struct VTYPE{};
template<> struct VTYPE<double> template<> struct VTYPE<double>
{ using type = __m256d; }; {
using type = double __attribute__ ((vector_size (VBYTELEN)));
static constexpr int vlen=VBYTELEN/8;
};
template<> struct VTYPE<float> template<> struct VTYPE<float>
{ using type = __m256; }; {
#elif (defined(__SSE2__)) using type = float __attribute__ ((vector_size (VBYTELEN)));
#include <x86intrin.h> static constexpr int vlen=VBYTELEN/4;
#define HAVE_VECSUPPORT };
#else
template<typename T> struct VTYPE{}; template<typename T> struct VTYPE{};
template<> struct VTYPE<double> template<> struct VTYPE<double>
{ using type = __m128d; }; {
using type = double;
constexpr int vlen=1;
};
template<> struct VTYPE<float> template<> struct VTYPE<float>
{ using type = __m128; }; {
using type = float;
constexpr int vlen=1;
};
#endif #endif
template<typename T> arr<char> alloc_tmp(const shape_t &shape, template<typename T> arr<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize) size_t axsize, size_t elemsize)
{ {
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
#else
constexpr int vlen = 1;
#endif
size_t fullsize=1; size_t fullsize=1;
size_t ndim = shape.size(); size_t ndim = shape.size();
for (size_t i=0; i<ndim; ++i) for (size_t i=0; i<ndim; ++i)
fullsize*=shape[i]; fullsize*=shape[i];
auto othersize = fullsize/axsize; auto othersize = fullsize/axsize;
auto tmpsize = axsize*((othersize>vlen) ? vlen : 1); auto tmpsize = axsize*((othersize>VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1);
return arr<char>(tmpsize*elemsize); return arr<char>(tmpsize*elemsize);
} }
template<typename T> arr<char> alloc_tmp(const shape_t &shape, template<typename T> arr<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize) const shape_t &axes, size_t elemsize)
{ {
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
#else
constexpr int vlen = 1;
#endif
size_t fullsize=1; size_t fullsize=1;
size_t ndim = shape.size(); size_t ndim = shape.size();
for (size_t i=0; i<ndim; ++i) for (size_t i=0; i<ndim; ++i)
...@@ -2095,7 +2094,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape, ...@@ -2095,7 +2094,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
{ {
auto axsize = shape[axes[i]]; auto axsize = shape[axes[i]];
auto othersize = fullsize/axsize; auto othersize = fullsize/axsize;
tmpsize = max(tmpsize, axsize*((othersize>vlen) ? vlen : 1)); tmpsize = max(tmpsize, axsize*((othersize>VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1));
} }
return arr<char>(tmpsize*elemsize); return arr<char>(tmpsize*elemsize);
} }
...@@ -2117,39 +2116,35 @@ template<typename T> void pocketfft_general_c(const shape_t &shape, ...@@ -2117,39 +2116,35 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
cmplx<T> *data_out, T fct) cmplx<T> *data_out, T fct)
{ {
auto storage = alloc_tmp<T>(shape, axes, sizeof(cmplx<T>)); auto storage = alloc_tmp<T>(shape, axes, sizeof(cmplx<T>));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type; using vtype = typename VTYPE<T>::type;
auto tdatav = (cmplx<vtype> *)storage.data(); auto tdatav = (cmplx<vtype> *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T);
#endif
auto tdata = (cmplx<T> *)storage.data(); auto tdata = (cmplx<T> *)storage.data();
constexpr int vlen = VTYPE<T>::vlen;
multiarr a_in(shape, stride_in), a_out(shape, stride_out); multiarr a_in(shape, stride_in), a_out(shape, stride_out);
unique_ptr<pocketfft_c<T>> plan; unique_ptr<pocketfft_c<T>> plan;
for (size_t iax=0; iax<axes.size(); ++iax) for (size_t iax=0; iax<axes.size(); ++iax)
{ {
int axis = axes[iax]; multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
if ((!plan) || (it_in.length()!=plan->length())) if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_c<T>(it_in.length())); plan.reset(new pocketfft_c<T>(it_in.length()));
#ifdef HAVE_VECSUPPORT if (vlen>1)
while (it_in.remaining()>=vlen) while (it_in.remaining()>=vlen)
{ {
multioffset<vlen> p_i(it_in), p_o(it_out); multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t i=0; i<it_in.length(); ++i) for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
{ {
tdatav[i].r[j] = data_in[p_i[j]+i*it_in.stride()].r; tdatav[i].r[j] = data_in[p_i[j]+i*it_in.stride()].r;
tdatav[i].i[j] = data_in[p_i[j]+i*it_in.stride()].i; tdatav[i].i[j] = data_in[p_i[j]+i*it_in.stride()].i;
} }
forward ? plan->forward((vtype *)tdatav, fct) forward ? plan->forward((vtype *)tdatav, fct)
: plan->backward((vtype *)tdatav, fct); : plan->backward((vtype *)tdatav, fct);
for (size_t i=0; i<it_out.length(); ++i) for (size_t i=0; i<it_out.length(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*it_out.stride()].Set(tdatav[i].r[j],tdatav[i].i[j]); data_out[p_o[j]+i*it_out.stride()].Set(tdatav[i].r[j],tdatav[i].i[j]);
} }
#endif
while (it_in.remaining()>0) while (it_in.remaining()>0)
{ {
for (size_t i=0; i<it_in.length(); ++i) for (size_t i=0; i<it_in.length(); ++i)
...@@ -2174,11 +2169,9 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape, ...@@ -2174,11 +2169,9 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
const shape_t &axes, const T *data_in, T *data_out, T fct) const shape_t &axes, const T *data_in, T *data_out, T fct)
{ {
auto storage = alloc_tmp<T>(shape, axes, sizeof(T)); auto storage = alloc_tmp<T>(shape, axes, sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type; using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data(); auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T); constexpr int vlen = VTYPE<T>::vlen;
#endif
auto tdata = (T *)storage.data(); auto tdata = (T *)storage.data();
multiarr a_in(shape, stride_in), a_out(shape, stride_out); multiarr a_in(shape, stride_in), a_out(shape, stride_out);
...@@ -2186,32 +2179,30 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape, ...@@ -2186,32 +2179,30 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
for (size_t iax=0; iax<axes.size(); ++iax) for (size_t iax=0; iax<axes.size(); ++iax)
{ {
int axis = axes[iax]; multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
if ((!plan) || (it_in.length()!=plan->length())) if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_r<T>(it_in.length())); plan.reset(new pocketfft_r<T>(it_in.length()));
#ifdef HAVE_VECSUPPORT if (vlen>1)
while (it_in.remaining()>=vlen) while (it_in.remaining()>=vlen)
{ {
multioffset<vlen> p_i(it_in), p_o(it_out); multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t i=0; i<it_in.length(); ++i) for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()]; tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()];
plan->forward((vtype *)tdatav, fct); plan->forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]] = tdatav[0][j];
size_t i=1, i1=1, i2=it_out.length()-1;
for (i=1; i<it_out.length()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j]+tdatav[i+1][j];
data_out[p_o[j]+i2*it_out.stride()] = tdatav[i][j]-tdatav[i+1][j];
}
if (i<it_out.length())
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j]; data_out[p_o[j]] = tdatav[0][j];
} size_t i=1, i1=1, i2=it_out.length()-1;
#endif for (i=1; i<it_out.length()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j]+tdatav[i+1][j];
data_out[p_o[j]+i2*it_out.stride()] = tdatav[i][j]-tdatav[i+1][j];
}
if (i<it_out.length())
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j];
}
while (it_in.remaining()>0) while (it_in.remaining()>0)
{ {
for (size_t i=0; i<it_in.length(); ++i) for (size_t i=0; i<it_in.length(); ++i)
...@@ -2243,42 +2234,39 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape, ...@@ -2243,42 +2234,39 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape,
const T *data_in, cmplx<T> *data_out, T fct) const T *data_in, cmplx<T> *data_out, T fct)
{ {
auto storage = alloc_tmp<T>(shape, shape[axis], sizeof(T)); auto storage = alloc_tmp<T>(shape, shape[axis], sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type; using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data(); auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T); constexpr int vlen = VTYPE<T>::vlen;
#endif
auto tdata = (T *)storage.data(); auto tdata = (T *)storage.data();
multiarr a_in(shape, stride_in), a_out(shape, stride_out); multiarr a_in(shape, stride_in), a_out(shape, stride_out);
pocketfft_r<T> plan(shape[axis]); pocketfft_r<T> plan(shape[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis); multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape[axis], s_i=it_in.stride(), s_o=it_out.stride(); size_t len=shape[axis], s_i=it_in.stride(), s_o=it_out.stride();
#ifdef HAVE_VECSUPPORT if (vlen>1)
while (it_in.remaining()>=vlen) while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()];
plan.forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]].Set(tdatav[0][j]);
size_t i;
for (i=1; i<len-1; i+=2)
{
size_t io = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+io*it_out.stride()].Set(tdatav[i][j], tdatav[i+1][j]);
}
if (i<len)
{ {
size_t io = (i+1)/2; multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t i=0; i<it_in.length(); ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*it_in.stride()];
plan.forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+io*it_out.stride()].Set(tdatav[i][j]); data_out[p_o[j]].Set(tdatav[0][j]);
size_t i;
for (i=1; i<len-1; i+=2)
{
size_t io = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+io*it_out.stride()].Set(tdatav[i][j], tdatav[i+1][j]);
}
if (i<len)
{
size_t io = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+io*it_out.stride()].Set(tdatav[i][j]);
}
} }
}
#endif
while (it_in.remaining()>0) while (it_in.remaining()>0)
{ {
const T *d_i = data_in+it_in.offset(); const T *d_i = data_in+it_in.offset();
...@@ -2301,45 +2289,42 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out, ...@@ -2301,45 +2289,42 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
const cmplx<T> *data_in, T *data_out, T fct) const cmplx<T> *data_in, T *data_out, T fct)
{ {
auto storage = alloc_tmp<T>(shape_out, shape_out[axis], sizeof(T)); auto storage = alloc_tmp<T>(shape_out, shape_out[axis], sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type; using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data(); auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T); constexpr int vlen = VTYPE<T>::vlen;
#endif
auto tdata = (T *)storage.data(); auto tdata = (T *)storage.data();
multiarr a_in(shape_out, stride_in), a_out(shape_out, stride_out); multiarr a_in(shape_out, stride_in), a_out(shape_out, stride_out);
pocketfft_r<T> plan(shape_out[axis]); pocketfft_r<T> plan(shape_out[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis); multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape_out[axis], s_i=it_in.stride(), s_o=it_out.stride(); size_t len=shape_out[axis], s_i=it_in.stride(), s_o=it_out.stride();
#ifdef HAVE_VECSUPPORT if (vlen>1)
while (it_in.remaining()>=vlen) while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t j=0; j<vlen; ++j)
tdatav[0][j]=data_in[p_i[j]].r;
size_t i;
for (i=1; i<len-1; i+=2)
{ {
size_t ii = (i+1)/2; multioffset<vlen> p_i(it_in), p_o(it_out);
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
tdatav[0][j]=data_in[p_i[j]].r;
size_t i;
for (i=1; i<len-1; i+=2)
{
size_t ii = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
{
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
tdatav[i+1][j] = data_in[p_i[j]+ii*s_i].i;
}
}
if (i<len)
{ {
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r; size_t ii = (i+1)/2;
tdatav[i+1][j] = data_in[p_i[j]+ii*s_i].i; for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
} }
plan.backward(tdatav, fct);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o] = tdatav[i][j];
} }
if (i<len)
{
size_t ii = (i+1)/2;
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
}
plan.backward(tdatav, fct);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o] = tdatav[i][j];
}
#endif
while (!it_in.done()) while (!it_in.done())
{ {
const cmplx<T> *d_i = data_in+it_in.offset(); const cmplx<T> *d_i = data_in+it_in.offset();
......
...@@ -19,15 +19,15 @@ def test1D(len): ...@@ -19,15 +19,15 @@ def test1D(len):
a=np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j a=np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j
b=a.astype(np.complex64) b=a.astype(np.complex64)
assert(_l2error(a, pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./len))<1.5e-15) assert(_l2error(a, pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./len))<1.5e-15)
assert(_l2error(a.real, pypocketfft.irfftn(pypocketfft.rfftn(a.real),fct=1./len,lastsize=len))<1e-15) assert(_l2error(a.real, pypocketfft.irfftn(pypocketfft.rfftn(a.real),fct=1./len,lastsize=len))<1.5e-15)
tmp=a.copy() tmp=a.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp) assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
assert(_l2error(tmp, a)<1.5e-15) assert(_l2error(tmp, a)<1.5e-15)
assert(_l2error(b, pypocketfft.ifftn(pypocketfft.fftn(b),fct=1./len))<5e-7) assert(_l2error(b, pypocketfft.ifftn(pypocketfft.fftn(b),fct=1./len))<6e-7)
assert(_l2error(b.real, pypocketfft.irfftn(pypocketfft.rfftn(b.real),fct=1./len,lastsize=len))<5e-7) assert(_l2error(b.real, pypocketfft.irfftn(pypocketfft.rfftn(b.real),fct=1./len,lastsize=len))<6e-7)
tmp=b.copy() tmp=b.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp) assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
assert(_l2error(tmp, b)<5e-7) assert(_l2error(tmp, b)<6e-7)
@pmp("shp", shapes) @pmp("shp", shapes)
def test_fftn(shp): def test_fftn(shp):
...@@ -67,12 +67,12 @@ def test_rfftn2D(shp, axes): ...@@ -67,12 +67,12 @@ def test_rfftn2D(shp, axes):
def test_identity(shp): def test_identity(shp):
a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
vmax = np.max(np.abs(a)) vmax = np.max(np.abs(a))
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1e-15*vmax) assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15*vmax)
tmp=a.copy() tmp=a.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp) assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp)
assert(_l2error(tmp, a)<1.5e-15*vmax) assert(_l2error(tmp, a)<1.5e-15*vmax)
a=a.astype(np.complex64) a=a.astype(np.complex64)
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<5e-7*vmax) assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7*vmax)
@pmp("shp", shapes) @pmp("shp", shapes)
def test_identity_r(shp): def test_identity_r(shp):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment