Commit 1c168d5b authored by Martin Reinecke

current state

parent da0ac67a
@@ -118,25 +118,26 @@ def bench_nd(ndim, nmax, nthr, ntry, tp, funcs, nrepeat, ttl="", filename="",
             tmp = func(a, nrepeat, nthr)
             res.append(tmp[0])
             output.append(tmp[1])
-        print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
-    results = np.array(results)
-    plt.title("{}: {}D, {}, max_extent={}".format(
-        ttl, ndim, str(tp), nmax))
-    plt.xlabel("time ratio")
-    plt.ylabel("counts")
-    plt.hist(results[0, :]/results[1, :], bins="auto")
-    if filename != "":
-        plt.savefig(filename)
-    plt.show()
+#        print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
+#    results = np.array(results)
+#    plt.title("{}: {}D, {}, max_extent={}".format(
+#        ttl, ndim, str(tp), nmax))
+#    plt.xlabel("time ratio")
+#    plt.ylabel("counts")
+#    plt.hist(results[0, :]/results[1, :], bins="auto")
+#    if filename != "":
+#        plt.savefig(filename)
+#    plt.show()
 
-funcs = (measure_pypocketfft, measure_fftw)
+funcs = (measure_pypocketfft,)
 ttl = "pypocketfft/FFTW()"
+ntry=100
 nthr = 1
 nice_sizes = True
-bench_nd(1, 8192, nthr, 100, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
-bench_nd(2, 2048, nthr, 100, "c16", funcs, 2, ttl, "2d.png", nice_sizes)
-bench_nd(3, 256, nthr, 100, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
-bench_nd(1, 8192, nthr, 100, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
-bench_nd(2, 2048, nthr, 100, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
-bench_nd(3, 256, nthr, 100, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
+#bench_nd(1, 8192, nthr, ntry, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
+bench_nd(2, 2048, nthr, ntry, "c16", funcs, 2, ttl, "2d.png", nice_sizes)
+# bench_nd(3, 256, nthr, ntry, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
+# bench_nd(1, 8192, nthr, ntry, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
+# bench_nd(2, 2048, nthr, ntry, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
+# bench_nd(3, 256, nthr, ntry, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
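With measure_fftw dropped from funcs, results holds only one row, so the ratio and histogram code above would raise an IndexError; that is presumably why it is commented out rather than deleted. A minimal sketch of how the comparison path could be guarded instead (plot_ratio_histogram is a hypothetical helper, not part of the script):

    # Hypothetical guard for the disabled comparison code: only compute and
    # plot timing ratios when two measurement backends were actually run.
    import numpy as np
    import matplotlib.pyplot as plt

    def plot_ratio_histogram(results, title, filename=""):
        results = np.array(results)
        if results.shape[0] < 2:   # single backend: nothing to compare
            return
        plt.title(title)
        plt.xlabel("time ratio")
        plt.ylabel("counts")
        plt.hist(results[0, :]/results[1, :], bins="auto")
        if filename != "":
            plt.savefig(filename)
        plt.show()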
@@ -24,6 +24,8 @@
 #ifndef MRUTIL_COMMUNICATION_H
 #define MRUTIL_COMMUNICATION_H
 
+#define MRUTIL_USE_MPI
+
 #include <vector>
 #ifdef MRUTIL_USE_MPI
 #include <mpi.h>
@@ -95,8 +97,7 @@ class Communicator
     template<typename T> void sendrecvRaw (const T *sendbuf, size_t sendcnt,
       size_t dest, T *recvbuf, size_t recvcnt, size_t src) const
       {
-      sendrecvRawVoid(sendbuf, sendcnt, dest, recvbuf, recvcnt, src,
-        tidx<T>());
+      sendrecvRawVoid(sendbuf, sendcnt, dest, recvbuf, recvcnt, src, tidx<T>());
       }
     template<typename T> void sendrecv_replaceRaw (T *data, size_t num,
       size_t dest, size_t src) const
@@ -110,6 +111,12 @@ class Communicator
     template<typename T> void allgathervRaw (const T *in, int numin, T *out,
       const int *numout, const int *disout) const
       { allgathervRawVoid (in, numin, out, numout, disout, tidx<T>()); }
+    template<typename T> vector<T> allgatherVec (const T &in) const
+      {
+      vector<T> res(num_ranks_);
+      allgatherRaw(&in, res.data(), 1);
+      return res;
+      }
     template<typename T> T allreduce(const T &in, redOp op) const
       {
@@ -117,13 +124,18 @@ class Communicator
       allreduceRaw (&in, &out, 1, op);
       return out;
       }
-    template<typename T> std::vector<T> allreduce
+    template<typename T> std::vector<T> allreduceVec
       (const std::vector<T> &in, redOp op) const
       {
       std::vector<T> out(in.size());
       allreduceRaw (in.data(), out.data(), in.size(), op);
       return out;
       }
+    template<typename T> void sendrecvVec(const vector<T> &sendbuf, size_t dest,
+      vector<T> &recvbuf, size_t src) const
+      {
+      sendrecvRaw(sendbuf.data(), sendbuf.size(), dest, recvbuf.data(), recvbuf.size(), src);
+      }
     /*! NB: \a num refers to the <i>total</i> number of items in the arrays;
         the individual message size is \a num/num_ranks(). */
     template<typename T> void all2allRaw (const T *in, T *out, size_t num) const
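The new allgatherVec wrapper is the classic one-element-per-rank gather: every rank contributes a single value and receives the full vector. Since the Communicator constructor is not part of this diff, here is a self-contained sketch of the same pattern in plain MPI (illustrative only, not the class API):

    // Plain-MPI counterpart of allgatherVec: each rank contributes one
    // value; afterwards every rank holds the vector of all contributions.
    // Build with mpic++ and run under mpirun.
    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char **argv)
      {
      MPI_Init(&argc, &argv);
      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);
      int in = rank*rank;                  // local contribution
      std::vector<int> res(nranks);        // like vector<T> res(num_ranks_)
      MPI_Allgather(&in, 1, MPI_INT, res.data(), 1, MPI_INT, MPI_COMM_WORLD);
      if (rank==0)
        for (int i=0; i<nranks; ++i)
          std::printf("rank %d contributed %d\n", i, res[i]);
      MPI_Finalize();
      return 0;
      }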
...
@@ -52,6 +52,10 @@ namespace mr {
 namespace detail_simd {
 
+template<typename T> T myexp(T);// {return -42;}
+template<> inline double myexp(double v) {return std::exp(v);}
+template<> inline float myexp(float v) {return std::exp(v);}
+
 template<typename T> constexpr inline bool vectorizable = false;
 template<> constexpr inline bool vectorizable<float> = true;
 template<> constexpr inline bool vectorizable<double> = true;
@@ -114,6 +118,13 @@ template<typename T, size_t len> class vtp
     vtp &operator*=(vtp other) { v*=other.v; return *this; }
     vtp &operator/=(vtp other) { v/=other.v; return *this; }
     vtp abs() const { return hlp::abs(v); }
+    template<typename Func> vtp apply(Func func) const
+      {
+      vtp res;
+      for (size_t i=0; i<len; ++i)
+        res[i] = func(v[i]);
+      return res;
+      }
     inline vtp sqrt() const
       { return hlp::sqrt(v); }
     vtp max(const vtp &other) const
@@ -188,6 +199,8 @@ template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op
     res = op(res, v[i]);
   return res;
   }
+template<typename T, size_t len> vtp<T, len> exp(const vtp<T, len> &v)
+  { return v.apply(myexp<T>); }
 template<typename T> class pseudoscalar
   {
   private:
@@ -407,6 +420,7 @@ using detail_simd::native_simd;
 using detail_simd::reduce;
 using detail_simd::max;
 using detail_simd::abs;
+using detail_simd::exp;
 using detail_simd::sqrt;
 using detail_simd::any_of;
 using detail_simd::none_of;
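The new apply/exp pair adds lane-wise application of a scalar function to a SIMD vector: myexp dispatches to std::exp for float and double, and the free exp() maps it over all lanes. A self-contained sketch of the same technique with a toy stand-in for vtp<T,len> (Vec is illustrative, not the real class):

    // Lane-wise function application, mirroring vtp::apply and the free
    // exp() above; Vec is a toy stand-in for vtp<T,len>.
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    template<typename T, size_t len> struct Vec
      {
      std::array<T,len> v;
      template<typename Func> Vec apply(Func func) const
        {
        Vec res;
        for (size_t i=0; i<len; ++i)
          res.v[i] = func(v[i]);   // scalar fallback, one lane at a time
        return res;
        }
      };

    template<typename T, size_t len> Vec<T,len> exp(const Vec<T,len> &x)
      { return x.apply([](T s){ return std::exp(s); }); }

    int main()
      {
      Vec<double,4> x{{0., 1., 2., 3.}};
      auto y = exp(x);
      for (auto val : y.v) std::printf("%g\n", val);
      }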
...
@@ -38,7 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef MRUTIL_FFT_H
 #define MRUTIL_FFT_H
 
+#include <iostream>
 #include "mr_util/math/fft1d.h"
 
 #ifndef POCKETFFT_CACHE_SIZE
@@ -67,6 +67,7 @@ namespace mr {
 namespace detail_fft {
 
 using shape_t=fmav_info::shape_t;
+using stride_t=fmav_info::stride_t;
 
 constexpr bool FORWARD = true,
                BACKWARD = false;
@@ -435,34 +436,79 @@ template<typename T> std::shared_ptr<T> get_plan(size_t length)
 template<size_t N> class multi_iter
   {
   private:
-    shape_t pos;
-    fmav_info iarr, oarr;
-    ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
-    size_t idim, rem;
+    shape_t shp, pos;
+    stride_t str_i, str_o;
+    size_t cshp_i, cshp_o, rem;
+    ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N];
+    bool uni_i, uni_o;
 
     void advance_i()
       {
-      for (int i_=int(pos.size())-1; i_>=0; --i_)
+      for (size_t i=0; i<pos.size(); ++i)
         {
-        auto i = size_t(i_);
-        if (i==idim) continue;
-        p_ii += iarr.stride(i);
-        p_oi += oarr.stride(i);
-        if (++pos[i] < iarr.shape(i))
+        p_ii += str_i[i];
+        p_oi += str_o[i];
+        if (++pos[i] < shp[i])
           return;
         pos[i] = 0;
-        p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
-        p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
+        p_ii -= ptrdiff_t(shp[i])*str_i[i];
+        p_oi -= ptrdiff_t(shp[i])*str_o[i];
        }
      }
 
   public:
-    multi_iter(const fmav_info &iarr_, const fmav_info &oarr_, size_t idim_,
+    multi_iter(const fmav_info &iarr, const fmav_info &oarr, size_t idim,
       size_t nshares, size_t myshare)
-      : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
-        str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
-        idim(idim_), rem(iarr.size()/iarr.shape(idim))
+      : rem(iarr.size()/iarr.shape(idim)), sstr_i(0), sstr_o(0), p_ii(0), p_oi(0)
       {
+      MR_assert(oarr.ndim()==iarr.ndim(), "dimension mismatch");
+      MR_assert(iarr.ndim()>=1, "not enough dimensions");
+      // Sort the extraneous dimensions in order of ascending output stride;
+      // this should improve overall cache re-use and avoid clashes between
+      // threads as much as possible.
+      shape_t idx(iarr.ndim());
+      std::iota(idx.begin(), idx.end(), 0);
+      sort(idx.begin(), idx.end(),
+        [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);});
+      for (auto i: idx)
+        if (i!=idim)
+          {
+          pos.push_back(0);
+          MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch");
+          shp.push_back(iarr.shape(i));
+          str_i.push_back(iarr.stride(i));
+          str_o.push_back(oarr.stride(i));
+          }
+      MR_assert(idim<iarr.ndim(), "bad active dimension");
+      cstr_i = iarr.stride(idim);
+      cstr_o = oarr.stride(idim);
+      cshp_i = iarr.shape(idim);
+      cshp_o = oarr.shape(idim);
+      // collapse unneeded dimensions
+      bool done = false;
+      while(!done)
+        {
+        done=true;
+        for (size_t i=1; i<shp.size(); ++i)
+          if ((str_i[i] == str_i[i-1]*ptrdiff_t(shp[i-1]))
+              && (str_o[i] == str_o[i-1]*ptrdiff_t(shp[i-1])))
+            {
+            shp[i-1] *= shp[i];
+            str_i.erase(str_i.begin()+ptrdiff_t(i));
+            str_o.erase(str_o.begin()+ptrdiff_t(i));
+            shp.erase(shp.begin()+ptrdiff_t(i));
+            pos.pop_back();
+            done=false;
+            // std::cout << "reduced dims" << std::endl;
+            }
+        }
+      if (pos.size()>0)
+        {
+        sstr_i = str_i[0];
+        sstr_o = str_o[0];
+        }
       if (nshares==1) return;
       if (nshares==0) throw std::runtime_error("can't run with zero threads");
       if (myshare>=nshares) throw std::runtime_error("impossible share requested");
@@ -473,16 +519,16 @@ template<size_t N> class multi_iter
       size_t todo = hi-lo;
       size_t chunk = rem;
-      for (size_t i=0; i<pos.size(); ++i)
+      for (size_t i2=0, i=pos.size()-1; i2<pos.size(); ++i2,--i)
         {
-        if (i==idim) continue;
-        chunk /= iarr.shape(i);
+        chunk /= shp[i];
         size_t n_advance = lo/chunk;
         pos[i] += n_advance;
-        p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
-        p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
+        p_ii += ptrdiff_t(n_advance)*str_i[i];
+        p_oi += ptrdiff_t(n_advance)*str_o[i];
         lo -= n_advance*chunk;
         }
+      MR_assert(lo==0, "must not happen");
       rem = todo;
       }
 
     void advance(size_t n)
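The "collapse unneeded dimensions" loop in the constructor above fuses two adjacent axes whenever one step along the outer axis equals shape*stride of the inner axis in both arrays, so fewer loop levels survive into advance_i(). A standalone sketch of that merge rule on bare shape/stride vectors (hypothetical helper, toy values):

    // Axis fusion as in the multi_iter constructor: axes i-1 and i merge
    // when crossing the whole inner axis equals one outer step, for both
    // the input and the output strides.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    void collapse(std::vector<size_t> &shp,
      std::vector<ptrdiff_t> &str_i, std::vector<ptrdiff_t> &str_o)
      {
      bool done = false;
      while (!done)
        {
        done = true;
        for (size_t i=1; i<shp.size(); ++i)
          if ((str_i[i] == str_i[i-1]*ptrdiff_t(shp[i-1]))
              && (str_o[i] == str_o[i-1]*ptrdiff_t(shp[i-1])))
            {
            shp[i-1] *= shp[i];                      // fuse the two axes
            str_i.erase(str_i.begin()+ptrdiff_t(i));
            str_o.erase(str_o.begin()+ptrdiff_t(i));
            shp.erase(shp.begin()+ptrdiff_t(i));
            done = false;
            }
        }
      }

    int main()
      {
      // a contiguous 4x5 block with axes sorted by ascending stride:
      // shapes {5,4}, strides {1,5} fuse into one axis of length 20
      std::vector<size_t> shp{5,4};
      std::vector<ptrdiff_t> si{1,5}, so{1,5};
      collapse(shp, si, so);
      std::printf("%zu axis, length %zu, stride %td\n", shp.size(), shp[0], si[0]);
      }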
@@ -494,16 +540,30 @@
         p_o[i] = p_oi;
         advance_i();
         }
+      uni_i = uni_o = true;
+      for (size_t i=1; i<n; ++i)
+        {
+        // std::cout << (p_i[i]-p_i[i-1]) << " " << sstr_i << std::endl;
+        uni_i = uni_i && (p_i[i]-p_i[i-1] == sstr_i);
+        uni_o = uni_o && (p_o[i]-p_o[i-1] == sstr_o);
+        }
+      // for (size_t i=0; i<n; ++i)
       rem -= n;
       }
-    ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
-    ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
-    ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
-    ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
-    size_t length_in() const { return iarr.shape(idim); }
-    size_t length_out() const { return oarr.shape(idim); }
-    ptrdiff_t stride_in() const { return str_i; }
-    ptrdiff_t stride_out() const { return str_o; }
+    ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*cstr_i; }
+    ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*cstr_i; }
+    ptrdiff_t iofs_uni(size_t j, size_t i) const { return p_i[0] + ptrdiff_t(j)*sstr_i + ptrdiff_t(i)*cstr_i; }
+    ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*cstr_o; }
+    ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*cstr_o; }
+    ptrdiff_t oofs_uni(size_t j, size_t i) const { return p_o[0] + ptrdiff_t(j)*sstr_o + ptrdiff_t(i)*cstr_o; }
+    bool uniform_i() const { return uni_i; }
+    ptrdiff_t unistride_i() const { return sstr_i; }
+    bool uniform_o() const { return uni_o; }
+    ptrdiff_t unistride_o() const { return sstr_o; }
+    size_t length_in() const { return cshp_i; }
+    size_t length_out() const { return cshp_o; }
+    ptrdiff_t stride_in() const { return cstr_i; }
+    ptrdiff_t stride_out() const { return cstr_o; }
     size_t remaining() const { return rem; }
   };
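advance() now also classifies the batch of n transforms it just fetched: if consecutive p_i offsets all differ by the single stride sstr_i (and likewise for the output), the batch is "uniform" and callers may address it as p_i[0] + j*sstr_i + i*cstr_i via iofs_uni instead of going through the per-transform offset table. A minimal sketch of that uniformity test (hypothetical helper):

    // Uniformity test as in multi_iter::advance(): a batch of offsets is
    // uniform if consecutive entries differ by the same stride, so entry j
    // can be recomputed as base + j*stride.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    bool is_uniform(const std::vector<ptrdiff_t> &ofs, ptrdiff_t stride)
      {
      bool uni = true;
      for (size_t i=1; i<ofs.size(); ++i)
        uni = uni && (ofs[i]-ofs[i-1] == stride);
      return uni;
      }

    int main()
      {
      std::vector<ptrdiff_t> a{0, 8, 16, 24};   // uniform, stride 8
      std::vector<ptrdiff_t> b{0, 8, 40, 48};   // batch crosses an axis boundary
      std::printf("%d %d\n", is_uniform(a, 8), is_uniform(b, 8)); // prints 1 0
      }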
@@ -579,49 +639,106 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
   return aligned_array<T>(tmpsize);
   }
 
-//#define MRFFT_PREFETCH
+#define MRFFT_PREFETCH
+#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
+#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
 
-template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
+template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
   const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
   {
-  size_t i=0;
-#ifdef MRFFT_PREFETCH
-  constexpr size_t dist=16;
-  for (; i+dist<it.length_in(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      {
-      __builtin_prefetch(&src[it.iofs(j,i+dist)]);
-      dst[i].r[j] = src[it.iofs(j,i)].r;
-      dst[i].i[j] = src[it.iofs(j,i)].i;
-      }
-#endif
-  for (; i<it.length_in(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      {
-      auto tmp = src[it.iofs(j,i)];
-      dst[i].r[j] = tmp.r;
-      dst[i].i[j] = tmp.i;
-      }
+  if (it.uniform_i())
+    {
+    auto ptr = &src[it.iofs_uni(0,0)];
+    auto jstr = it.unistride_i();
+    auto istr = it.stride_in();
+    if (istr==1)
+      for (size_t i=0; i<it.length_in(); ++i)
+        {
+        Cmplx<native_simd<T>> stmp;
+        for (size_t j=0; j<vlen; ++j)
+          {
+          auto tmp = ptr[j*jstr+i];
+          stmp.r[j] = tmp.r;
+          stmp.i[j] = tmp.i;
+          }
+        dst[i] = stmp;
+        }
+    else if (jstr==1)
+      for (size_t i=0; i<it.length_in(); ++i)
+        {
+        Cmplx<native_simd<T>> stmp;
+        for (size_t j=0; j<vlen; ++j)
+          {
+          auto tmp = ptr[j+i*istr];
+          stmp.r[j] = tmp.r;
+          stmp.i[j] = tmp.i;
+          }
+        dst[i] = stmp;
+        }
+    else
+      for (size_t i=0; i<it.length_in(); ++i)
+        {
+        Cmplx<native_simd<T>> stmp;
+        for (size_t j=0; j<vlen; ++j)
+          {
+          auto tmp = src[it.iofs_uni(j,i)];
+          stmp.r[j] = tmp.r;
+          stmp.i[j] = tmp.i;
+          }
+        dst[i] = stmp;
+        }
+    }
+  else
+    for (size_t i=0; i<it.length_in(); ++i)
+      {
+      Cmplx<native_simd<T>> stmp;
+      for (size_t j=0; j<vlen; ++j)
+        {
+        auto tmp = src[it.iofs(j,i)];
+        stmp.r[j] = tmp.r;
+        stmp.i[j] = tmp.i;
+        }
+      dst[i] = stmp;
+      }
   }
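The complex copy_input above now branches on layout once per batch: for uniform batches, istr==1 means the FFT axis is contiguous and jstr==1 means the vlen transforms sit next to each other, so each SIMD lane block is filled from adjacent memory. A standalone sketch of the AoS-to-SoA deinterleave behind the jstr==1 branch, with toy types instead of fmav/Cmplx/native_simd:

    // Toy AoS->SoA deinterleave: vlen adjacent complex values become one
    // lane block with all real parts together and all imaginary parts
    // together, as in copy_input's jstr==1 fast path.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    constexpr size_t vlen = 4;
    struct Cpx  { float r, i; };
    struct CpxV { float r[vlen], i[vlen]; };

    // src holds interleaved complex numbers; dst receives n lane blocks,
    // block i starting at element i*istr.
    void deinterleave(const Cpx *src, CpxV *dst, size_t n, size_t istr)
      {
      for (size_t i=0; i<n; ++i)
        {
        CpxV stmp;                      // build in a register-friendly temp
        for (size_t j=0; j<vlen; ++j)
          {
          Cpx tmp = src[j + i*istr];    // j runs over adjacent transforms
          stmp.r[j] = tmp.r;
          stmp.i[j] = tmp.i;
          }
        dst[i] = stmp;
        }
      }

    int main()
      {
      std::vector<Cpx> src(8);
      for (size_t k=0; k<src.size(); ++k) src[k] = {float(k), -float(k)};
      std::vector<CpxV> dst(2);
      deinterleave(src.data(), dst.data(), 2, vlen);
      std::printf("%g %g\n", dst[1].r[0], dst[1].i[3]); // prints 4 -7
      }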
-template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
+template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
   const fmav<T> &src, native_simd<T> *MRUTIL_RESTRICT dst)
   {
   size_t i=0;
 #ifdef MRFFT_PREFETCH
-  constexpr size_t dist=16;
-  for (; i+dist<it.length_in(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      {
-      __builtin_prefetch(&src[it.oofs(j,i+dist)]);
-      dst[i][j] = src[it.iofs(j,i)];
-      }
+  constexpr size_t dist=32;
+  if (it.uniform_i())
+    for (; i+dist<it.length_in(); ++i)
+      {
+      native_simd<T> stmp;
+      MRUTIL_PREFETCH_W(&dst[i+dist]);
+      for (size_t j=0; j<vlen; ++j)
+        {
+        MRUTIL_PREFETCH_R(&src[it.iofs_uni(j,i+dist)]);
+        stmp[j] = src[it.iofs_uni(j,i)];
+        }
+      dst[i] = stmp;
+      }
+  else
+    for (; i+dist<it.length_in(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        {
+        MRUTIL_PREFETCH_R(&src[it.iofs(j,i+dist)]);
+        MRUTIL_PREFETCH_W(&dst[i+dist]);
+        dst[i][j] = src[it.iofs(j,i)];
+        }
 #endif
-  for (; i<it.length_in(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      dst[i][j] = src[it.iofs(j,i)];
+  if (it.uniform_i())
+    for (; i<it.length_in(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        dst[i][j] = src[it.iofs_uni(j,i)];
+  else
+    for (; i<it.length_in(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        dst[i][j] = src[it.iofs(j,i)];
   }
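Two details worth noting in the function above: the old code prefetched &src[it.oofs(j,i+dist)], i.e. input data through output offsets, which this commit fixes, and the raw __builtin_prefetch calls are now wrapped in MRUTIL_PREFETCH_R/W, whose second argument marks read (default 0) versus write (1) intent. A minimal self-contained illustration of the look-ahead pattern used by these loops:

    // Software prefetch as in the copy loops: while element i is being
    // processed, element i+dist is requested, hiding memory latency.
    // __builtin_prefetch is a GCC/Clang builtin; the second argument is
    // 0 for an expected read (default) and 1 for an expected write.
    #include <cstddef>

    #define PREFETCH_R(addr) __builtin_prefetch(addr)
    #define PREFETCH_W(addr) __builtin_prefetch(addr, 1)

    void scaled_copy(const double *src, double *dst, size_t n, double fct)
      {
      constexpr size_t dist = 32;        // look-ahead distance in elements
      size_t i = 0;
      for (; i+dist < n; ++i)
        {
        PREFETCH_R(&src[i+dist]);
        PREFETCH_W(&dst[i+dist]);
        dst[i] = fct*src[i];
        }
      for (; i < n; ++i)                 // remainder without look-ahead
        dst[i] = fct*src[i];
      }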
-template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
+template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
   const fmav<T> &src, T *MRUTIL_RESTRICT dst)
   {
   if (dst == &src[it.iofs(0)]) return; // in-place
@@ -629,45 +746,71 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
     dst[i] = src[it.iofs(i)];
   }
 
-template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
+template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_iter<vlen> &it,
   const Cmplx<native_simd<T>> *MRUTIL_RESTRICT src, fmav<Cmplx<T>> &dst)
   {
   auto ptr=dst.vdata();
   size_t i=0;
 #ifdef MRFFT_PREFETCH
-  constexpr size_t dist=16;
-  for (; i+dist<it.length_out(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      {
-      __builtin_prefetch(&ptr[it.oofs(j,i+dist)],1,3);
-      ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
-      }
+  constexpr size_t dist=32;
+  if (it.uniform_o())
+    for (; i+dist<it.length_out(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        {
+        MRUTIL_PREFETCH_W(&ptr[it.oofs_uni(j,i+dist)]);
+        ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
+        }
+  else
+    for (; i+dist<it.length_out(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        {
+        MRUTIL_PREFETCH_W(&ptr[it.oofs(j,i+dist)]);
+        ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
+        }
 #endif
-  for (; i<it.length_out(); ++i)
-    for (size_t j=0; j<vlen; ++j)
-      ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
+  if (it.uniform_o())
+    for (; i<it.length_out(); ++i)
+      for (size_t j=0; j<vlen; ++j)
+        ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
+  else