Commit 09034339 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

nicer looking vector support

parent 09e1d7c9
......@@ -2035,57 +2035,56 @@ class multi_iter
#if (defined(__AVX512F__))
#include <x86intrin.h>
#define HAVE_VECSUPPORT
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{ using type = __m512d; };
template<> struct VTYPE<float>
{ using type = __m512; };
constexpr int VBYTELEN=64;
#elif (defined(__AVX__))
#include <x86intrin.h>
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=32;
#elif (defined(__SSE2__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=16;
#endif
#if defined(HAVE_VECSUPPORT)
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{ using type = __m256d; };
{
using type = double __attribute__ ((vector_size (VBYTELEN)));
static constexpr int vlen=VBYTELEN/8;
};
template<> struct VTYPE<float>
{ using type = __m256; };
#elif (defined(__SSE2__))
#include <x86intrin.h>
#define HAVE_VECSUPPORT
{
using type = float __attribute__ ((vector_size (VBYTELEN)));
static constexpr int vlen=VBYTELEN/4;
};
#else
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{ using type = __m128d; };
{
using type = double;
constexpr int vlen=1;
};
template<> struct VTYPE<float>
{ using type = __m128; };
{
using type = float;
constexpr int vlen=1;
};
#endif
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize)
{
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
#else
constexpr int vlen = 1;
#endif
size_t fullsize=1;
size_t ndim = shape.size();
for (size_t i=0; i<ndim; ++i)
fullsize*=shape[i];
auto othersize = fullsize/axsize;
auto tmpsize = axsize*((othersize>vlen) ? vlen : 1);
auto tmpsize = axsize*((othersize>VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1);
return arr<char>(tmpsize*elemsize);
}
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize)
{
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
constexpr int vlen = sizeof(vtype)/sizeof(T);
#else
constexpr int vlen = 1;
#endif
size_t fullsize=1;
size_t ndim = shape.size();
for (size_t i=0; i<ndim; ++i)
......@@ -2095,7 +2094,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
{
auto axsize = shape[axes[i]];
auto othersize = fullsize/axsize;
tmpsize = max(tmpsize, axsize*((othersize>vlen) ? vlen : 1));
tmpsize = max(tmpsize, axsize*((othersize>VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1));
}
return arr<char>(tmpsize*elemsize);
}
......@@ -2117,23 +2116,20 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
cmplx<T> *data_out, T fct)
{
auto storage = alloc_tmp<T>(shape, axes, sizeof(cmplx<T>));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
auto tdatav = (cmplx<vtype> *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T);
#endif
auto tdata = (cmplx<T> *)storage.data();
constexpr int vlen = VTYPE<T>::vlen;
multiarr a_in(shape, stride_in), a_out(shape, stride_out);
unique_ptr<pocketfft_c<T>> plan;
for (size_t iax=0; iax<axes.size(); ++iax)
{
int axis = axes[iax];
multi_iter it_in(a_in, axis), it_out(a_out, axis);
multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_c<T>(it_in.length()));
#ifdef HAVE_VECSUPPORT
if (vlen>1)
while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
......@@ -2149,7 +2145,6 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*it_out.stride()].Set(tdatav[i].r[j],tdatav[i].i[j]);
}
#endif
while (it_in.remaining()>0)
{
for (size_t i=0; i<it_in.length(); ++i)
......@@ -2174,11 +2169,9 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
const shape_t &axes, const T *data_in, T *data_out, T fct)
{
auto storage = alloc_tmp<T>(shape, axes, sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T);
#endif
constexpr int vlen = VTYPE<T>::vlen;
auto tdata = (T *)storage.data();
multiarr a_in(shape, stride_in), a_out(shape, stride_out);
......@@ -2186,11 +2179,10 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
for (size_t iax=0; iax<axes.size(); ++iax)
{
int axis = axes[iax];
multi_iter it_in(a_in, axis), it_out(a_out, axis);
multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
if ((!plan) || (it_in.length()!=plan->length()))
plan.reset(new pocketfft_r<T>(it_in.length()));
#ifdef HAVE_VECSUPPORT
if (vlen>1)
while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
......@@ -2211,7 +2203,6 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i1*it_out.stride()] = tdatav[i][j];
}
#endif
while (it_in.remaining()>0)
{
for (size_t i=0; i<it_in.length(); ++i)
......@@ -2243,18 +2234,16 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape,
const T *data_in, cmplx<T> *data_out, T fct)
{
auto storage = alloc_tmp<T>(shape, shape[axis], sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T);
#endif
constexpr int vlen = VTYPE<T>::vlen;
auto tdata = (T *)storage.data();
multiarr a_in(shape, stride_in), a_out(shape, stride_out);
pocketfft_r<T> plan(shape[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape[axis], s_i=it_in.stride(), s_o=it_out.stride();
#ifdef HAVE_VECSUPPORT
if (vlen>1)
while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
......@@ -2278,7 +2267,6 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape,
data_out[p_o[j]+io*it_out.stride()].Set(tdatav[i][j]);
}
}
#endif
while (it_in.remaining()>0)
{
const T *d_i = data_in+it_in.offset();
......@@ -2301,18 +2289,16 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
const cmplx<T> *data_in, T *data_out, T fct)
{
auto storage = alloc_tmp<T>(shape_out, shape_out[axis], sizeof(T));
#ifdef HAVE_VECSUPPORT
using vtype = typename VTYPE<T>::type;
auto tdatav = (vtype *)storage.data();
constexpr int vlen = sizeof(vtype)/sizeof(T);
#endif
constexpr int vlen = VTYPE<T>::vlen;
auto tdata = (T *)storage.data();
multiarr a_in(shape_out, stride_in), a_out(shape_out, stride_out);
pocketfft_r<T> plan(shape_out[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
size_t len=shape_out[axis], s_i=it_in.stride(), s_o=it_out.stride();
#ifdef HAVE_VECSUPPORT
if (vlen>1)
while (it_in.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
......@@ -2339,7 +2325,6 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o] = tdatav[i][j];
}
#endif
while (!it_in.done())
{
const cmplx<T> *d_i = data_in+it_in.offset();
......
......@@ -19,15 +19,15 @@ def test1D(len):
a=np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j
b=a.astype(np.complex64)
assert(_l2error(a, pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./len))<1.5e-15)
assert(_l2error(a.real, pypocketfft.irfftn(pypocketfft.rfftn(a.real),fct=1./len,lastsize=len))<1e-15)
assert(_l2error(a.real, pypocketfft.irfftn(pypocketfft.rfftn(a.real),fct=1./len,lastsize=len))<1.5e-15)
tmp=a.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
assert(_l2error(tmp, a)<1.5e-15)
assert(_l2error(b, pypocketfft.ifftn(pypocketfft.fftn(b),fct=1./len))<5e-7)
assert(_l2error(b.real, pypocketfft.irfftn(pypocketfft.rfftn(b.real),fct=1./len,lastsize=len))<5e-7)
assert(_l2error(b, pypocketfft.ifftn(pypocketfft.fftn(b),fct=1./len))<6e-7)
assert(_l2error(b.real, pypocketfft.irfftn(pypocketfft.rfftn(b.real),fct=1./len,lastsize=len))<6e-7)
tmp=b.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
assert(_l2error(tmp, b)<5e-7)
assert(_l2error(tmp, b)<6e-7)
@pmp("shp", shapes)
def test_fftn(shp):
......@@ -67,12 +67,12 @@ def test_rfftn2D(shp, axes):
def test_identity(shp):
a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
vmax = np.max(np.abs(a))
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1e-15*vmax)
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15*vmax)
tmp=a.copy()
assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp)
assert(_l2error(tmp, a)<1.5e-15*vmax)
a=a.astype(np.complex64)
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<5e-7*vmax)
assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7*vmax)
@pmp("shp", shapes)
def test_identity_r(shp):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment