Commit 1c168d5b authored by Martin Reinecke's avatar Martin Reinecke
Browse files

current state

parent da0ac67a
......@@ -118,25 +118,26 @@ def bench_nd(ndim, nmax, nthr, ntry, tp, funcs, nrepeat, ttl="", filename="",
tmp = func(a, nrepeat, nthr)
res.append(tmp[0])
output.append(tmp[1])
print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
results = np.array(results)
plt.title("{}: {}D, {}, max_extent={}".format(
ttl, ndim, str(tp), nmax))
plt.xlabel("time ratio")
plt.ylabel("counts")
plt.hist(results[0, :]/results[1, :], bins="auto")
if filename != "":
plt.savefig(filename)
plt.show()
funcs = (measure_pypocketfft, measure_fftw)
# print("{0:5.2e}/{1:5.2e} = {2:5.2f} L2 error={3}".format(results[0][n],results[1][n],results[0][n]/results[1][n],_l2error(output[0],output[1])))
# results = np.array(results)
# plt.title("{}: {}D, {}, max_extent={}".format(
# ttl, ndim, str(tp), nmax))
# plt.xlabel("time ratio")
# plt.ylabel("counts")
# plt.hist(results[0, :]/results[1, :], bins="auto")
# if filename != "":
# plt.savefig(filename)
# plt.show()
funcs = (measure_pypocketfft,)
ttl = "pypocketfft/FFTW()"
ntry=100
nthr = 1
nice_sizes = True
bench_nd(1, 8192, nthr, 100, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
bench_nd(2, 2048, nthr, 100, "c16", funcs, 2, ttl, "2d.png", nice_sizes)
bench_nd(3, 256, nthr, 100, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
bench_nd(1, 8192, nthr, 100, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
bench_nd(2, 2048, nthr, 100, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
bench_nd(3, 256, nthr, 100, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
#bench_nd(1, 8192, nthr, ntry, "c16", funcs, 10, ttl, "1d.png", nice_sizes)
bench_nd(2, 2048, nthr, ntry, "c16", funcs, 2, ttl, "2d.png", nice_sizes)
# bench_nd(3, 256, nthr, ntry, "c16", funcs, 2, ttl, "3d.png", nice_sizes)
# bench_nd(1, 8192, nthr, ntry, "c8", funcs, 10, ttl, "1d_single.png", nice_sizes)
# bench_nd(2, 2048, nthr, ntry, "c8", funcs, 2, ttl, "2d_single.png", nice_sizes)
# bench_nd(3, 256, nthr, ntry, "c8", funcs, 2, ttl, "3d_single.png", nice_sizes)
......@@ -24,6 +24,8 @@
#ifndef MRUTIL_COMMUNICATION_H
#define MRUTIL_COMMUNICATION_H
#define MRUTIL_USE_MPI
#include <vector>
#ifdef MRUTIL_USE_MPI
#include <mpi.h>
......@@ -95,8 +97,7 @@ class Communicator
template<typename T> void sendrecvRaw (const T *sendbuf, size_t sendcnt,
size_t dest, T *recvbuf, size_t recvcnt, size_t src) const
{
sendrecvRawVoid(sendbuf, sendcnt, dest, recvbuf, recvcnt, src,
tidx<T>());
sendrecvRawVoid(sendbuf, sendcnt, dest, recvbuf, recvcnt, src, tidx<T>());
}
template<typename T> void sendrecv_replaceRaw (T *data, size_t num,
size_t dest, size_t src) const
......@@ -110,6 +111,12 @@ class Communicator
template<typename T> void allgathervRaw (const T *in, int numin, T *out,
const int *numout, const int *disout) const
{ allgathervRawVoid (in, numin, out, numout, disout, tidx<T>()); }
template<typename T> vector<T> allgatherVec (const T &in) const
{
vector<T> res(num_ranks_);
allgatherRaw(&in, res.data(), 1);
return res;
}
template<typename T> T allreduce(const T &in, redOp op) const
{
......@@ -117,13 +124,18 @@ class Communicator
allreduceRaw (&in, &out, 1, op);
return out;
}
template<typename T> std::vector<T> allreduce
template<typename T> std::vector<T> allreduceVec
(const std::vector<T> &in, redOp op) const
{
std::vector<T> out(in.size());
allreduceRaw (in.data(), out.data(), in.size(), op);
return out;
}
template<typename T> void sendrecvVec(const vector<T> &sendbuf, size_t dest,
vector<T> &recvbuf, size_t src) const
{
sendrecvRaw(sendbuf.data(), sendbuf.size(), dest, recvbuf.data(), recvbuf.size(), src);
}
/*! NB: \a num refers to the <i>total</i> number of items in the arrays;
the individual message size is \a num/num_ranks(). */
template<typename T> void all2allRaw (const T *in, T *out, size_t num) const
......
......@@ -52,6 +52,10 @@ namespace mr {
namespace detail_simd {
template<typename T> T myexp(T);// {return -42;}
template<> inline double myexp(double v) {return std::exp(v);}
template<> inline float myexp(float v) {return std::exp(v);}
template<typename T> constexpr inline bool vectorizable = false;
template<> constexpr inline bool vectorizable<float> = true;
template<> constexpr inline bool vectorizable<double> = true;
......@@ -114,6 +118,13 @@ template<typename T, size_t len> class vtp
vtp &operator*=(vtp other) { v*=other.v; return *this; }
vtp &operator/=(vtp other) { v/=other.v; return *this; }
vtp abs() const { return hlp::abs(v); }
template<typename Func> vtp apply(Func func) const
{
vtp res;
for (size_t i=0; i<len; ++i)
res[i] = func(v[i]);
return res;
}
inline vtp sqrt() const
{ return hlp::sqrt(v); }
vtp max(const vtp &other) const
......@@ -188,6 +199,8 @@ template<typename Op, typename T, size_t len> T reduce(const vtp<T, len> &v, Op
res = op(res, v[i]);
return res;
}
template<typename T, size_t len> vtp<T, len> exp(const vtp<T, len> &v)
{ return v.apply(myexp<T>); }
template<typename T> class pseudoscalar
{
private:
......@@ -407,6 +420,7 @@ using detail_simd::native_simd;
using detail_simd::reduce;
using detail_simd::max;
using detail_simd::abs;
using detail_simd::exp;
using detail_simd::sqrt;
using detail_simd::any_of;
using detail_simd::none_of;
......
......@@ -38,7 +38,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef MRUTIL_FFT_H
#define MRUTIL_FFT_H
#include <iostream>
#include "mr_util/math/fft1d.h"
#ifndef POCKETFFT_CACHE_SIZE
......@@ -67,6 +67,7 @@ namespace mr {
namespace detail_fft {
using shape_t=fmav_info::shape_t;
using stride_t=fmav_info::stride_t;
constexpr bool FORWARD = true,
BACKWARD = false;
......@@ -435,34 +436,79 @@ template<typename T> std::shared_ptr<T> get_plan(size_t length)
template<size_t N> class multi_iter
{
private:
shape_t pos;
fmav_info iarr, oarr;
ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
size_t idim, rem;
shape_t shp, pos;
stride_t str_i, str_o;
size_t cshp_i, cshp_o, rem;
ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N];
bool uni_i, uni_o;
void advance_i()
{
for (int i_=int(pos.size())-1; i_>=0; --i_)
for (size_t i=0; i<pos.size(); ++i)
{
auto i = size_t(i_);
if (i==idim) continue;
p_ii += iarr.stride(i);
p_oi += oarr.stride(i);
if (++pos[i] < iarr.shape(i))
p_ii += str_i[i];
p_oi += str_o[i];
if (++pos[i] < shp[i])
return;
pos[i] = 0;
p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
p_ii -= ptrdiff_t(shp[i])*str_i[i];
p_oi -= ptrdiff_t(shp[i])*str_o[i];
}
}
public:
multi_iter(const fmav_info &iarr_, const fmav_info &oarr_, size_t idim_,
size_t nshares, size_t myshare)
: pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
idim(idim_), rem(iarr.size()/iarr.shape(idim))
multi_iter(const fmav_info &iarr, const fmav_info &oarr, size_t idim,
size_t nshares, size_t myshare)
: rem(iarr.size()/iarr.shape(idim)), sstr_i(0), sstr_o(0), p_ii(0), p_oi(0)
{
MR_assert(oarr.ndim()==iarr.ndim(), "dimension mismatch");
MR_assert(iarr.ndim()>=1, "not enough dimensions");
// Sort the extraneous dimensions in order of ascending output stride;
// this should improve overall cache re-use and avoid clashes between
// threads as much as possible.
shape_t idx(iarr.ndim());
std::iota(idx.begin(), idx.end(), 0);
sort(idx.begin(), idx.end(),
[&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);});
for (auto i: idx)
if (i!=idim)
{
pos.push_back(0);
MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch");
shp.push_back(iarr.shape(i));
str_i.push_back(iarr.stride(i));
str_o.push_back(oarr.stride(i));
}
MR_assert(idim<iarr.ndim(), "bad active dimension");
cstr_i = iarr.stride(idim);
cstr_o = oarr.stride(idim);
cshp_i = iarr.shape(idim);
cshp_o = oarr.shape(idim);
// collapse unneeded dimensions
bool done = false;
while(!done)
{
done=true;
for (size_t i=1; i<shp.size(); ++i)
if ((str_i[i] == str_i[i-1]*ptrdiff_t(shp[i-1]))
&& (str_o[i] == str_o[i-1]*ptrdiff_t(shp[i-1])))
{
shp[i-1] *= shp[i];
str_i.erase(str_i.begin()+ptrdiff_t(i));
str_o.erase(str_o.begin()+ptrdiff_t(i));
shp.erase(shp.begin()+ptrdiff_t(i));
pos.pop_back();
done=false;
// std::cout << "reduced dims" << std::endl;
}
}
if (pos.size()>0)
{
sstr_i = str_i[0];
sstr_o = str_o[0];
}
if (nshares==1) return;
if (nshares==0) throw std::runtime_error("can't run with zero threads");
if (myshare>=nshares) throw std::runtime_error("impossible share requested");
......@@ -473,16 +519,16 @@ template<size_t N> class multi_iter
size_t todo = hi-lo;
size_t chunk = rem;
for (size_t i=0; i<pos.size(); ++i)
for (size_t i2=0, i=pos.size()-1; i2<pos.size(); ++i2,--i)
{
if (i==idim) continue;
chunk /= iarr.shape(i);
chunk /= shp[i];
size_t n_advance = lo/chunk;
pos[i] += n_advance;
p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
p_ii += ptrdiff_t(n_advance)*str_i[i];
p_oi += ptrdiff_t(n_advance)*str_o[i];
lo -= n_advance*chunk;
}
MR_assert(lo==0, "must not happen");
rem = todo;
}
void advance(size_t n)
......@@ -494,16 +540,30 @@ template<size_t N> class multi_iter
p_o[i] = p_oi;
advance_i();
}
uni_i = uni_o = true;
for (size_t i=1; i<n; ++i)
{
// std::cout << (p_i[i]-p_i[i-1]) << " " << sstr_i << std::endl;
uni_i = uni_i && (p_i[i]-p_i[i-1] == sstr_i);
uni_o = uni_o && (p_o[i]-p_o[i-1] == sstr_o);
}
// for (size_t i=0; i<n; ++i)
rem -= n;
}
ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
size_t length_in() const { return iarr.shape(idim); }
size_t length_out() const { return oarr.shape(idim); }
ptrdiff_t stride_in() const { return str_i; }
ptrdiff_t stride_out() const { return str_o; }
ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*cstr_i; }
ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*cstr_i; }
ptrdiff_t iofs_uni(size_t j, size_t i) const { return p_i[0] + ptrdiff_t(j)*sstr_i + ptrdiff_t(i)*cstr_i; }
ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*cstr_o; }
ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*cstr_o; }
ptrdiff_t oofs_uni(size_t j, size_t i) const { return p_o[0] + ptrdiff_t(j)*sstr_o + ptrdiff_t(i)*cstr_o; }
bool uniform_i() const { return uni_i; }
ptrdiff_t unistride_i() const { return sstr_i; }
bool uniform_o() const { return uni_o; }
ptrdiff_t unistride_o() const { return sstr_o; }
size_t length_in() const { return cshp_i; }
size_t length_out() const { return cshp_o; }
ptrdiff_t stride_in() const { return cstr_i; }
ptrdiff_t stride_out() const { return cstr_o; }
size_t remaining() const { return rem; }
};
......@@ -579,49 +639,106 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
return aligned_array<T>(tmpsize);
}
//#define MRFFT_PREFETCH
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
#define MRFFT_PREFETCH
#define MRUTIL_PREFETCH_R(addr) __builtin_prefetch(addr);
#define MRUTIL_PREFETCH_W(addr) __builtin_prefetch(addr,1);
template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
{
size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
{
__builtin_prefetch(&src[it.iofs(j,i+dist)]);
dst[i].r[j] = src[it.iofs(j,i)].r;
dst[i].i[j] = src[it.iofs(j,i)].i;
}
#endif
for (; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
if (it.uniform_i())
{
auto ptr = &src[it.iofs_uni(0,0)];
auto jstr = it.unistride_i();
auto istr = it.stride_in();
if (istr==1)
for (size_t i=0; i<it.length_in(); ++i)
{
Cmplx<native_simd<T>> stmp;
for (size_t j=0; j<vlen; ++j)
{
auto tmp = ptr[j*jstr+i];
stmp.r[j] = tmp.r;
stmp.i[j] = tmp.i;
}
dst[i] = stmp;
}
else if (jstr==1)
for (size_t i=0; i<it.length_in(); ++i)
{
Cmplx<native_simd<T>> stmp;
for (size_t j=0; j<vlen; ++j)
{
auto tmp = ptr[j+i*istr];
stmp.r[j] = tmp.r;
stmp.i[j] = tmp.i;
}
dst[i] = stmp;
}
else
for (size_t i=0; i<it.length_in(); ++i)
{
Cmplx<native_simd<T>> stmp;
for (size_t j=0; j<vlen; ++j)
{
auto tmp = src[it.iofs_uni(j,i)];
stmp.r[j] = tmp.r;
stmp.i[j] = tmp.i;
}
dst[i] = stmp;
}
}
else
for (size_t i=0; i<it.length_in(); ++i)
{
auto tmp = src[it.iofs(j,i)];
dst[i].r[j] = tmp.r;
dst[i].i[j] = tmp.i;
Cmplx<native_simd<T>> stmp;
for (size_t j=0; j<vlen; ++j)
{
auto tmp = src[it.iofs(j,i)];
stmp.r[j] = tmp.r;
stmp.i[j] = tmp.i;
}
dst[i] = stmp;
}
}
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
const fmav<T> &src, native_simd<T> *MRUTIL_RESTRICT dst)
{
size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
constexpr size_t dist=32;
if (it.uniform_i())
for (; i+dist<it.length_in(); ++i)
{
__builtin_prefetch(&src[it.oofs(j,i+dist)]);
dst[i][j] = src[it.iofs(j,i)];
native_simd<T> stmp;
MRUTIL_PREFETCH_W(&dst[i+dist]);
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_R(&src[it.iofs_uni(j,i+dist)]);
stmp[j] = src[it.iofs_uni(j,i)];
}
dst[i] = stmp;
}
else
for (; i+dist<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_R(&src[it.iofs(j,i+dist)]);
MRUTIL_PREFETCH_W(&dst[i+dist]);
dst[i][j] = src[it.iofs(j,i)];
}
#endif
for (; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[i][j] = src[it.iofs(j,i)];
if (it.uniform_i())
for (; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[i][j] = src[it.iofs_uni(j,i)];
else
for (; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[i][j] = src[it.iofs(j,i)];
}
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
template <typename T, size_t vlen> MRUTIL_NOINLINE void copy_input(const multi_iter<vlen> &it,
const fmav<T> &src, T *MRUTIL_RESTRICT dst)
{
if (dst == &src[it.iofs(0)]) return; // in-place
......@@ -629,45 +746,71 @@ template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
dst[i] = src[it.iofs(i)];
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_iter<vlen> &it,
const Cmplx<native_simd<T>> *MRUTIL_RESTRICT src, fmav<Cmplx<T>> &dst)
{
auto ptr=dst.vdata();
size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
__builtin_prefetch(&ptr[it.oofs(j,i+dist)],1,3);
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
}
constexpr size_t dist=32;
if (it.uniform_o())
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_W(&ptr[it.oofs_uni(j,i+dist)]);
ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
}
else
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_W(&ptr[it.oofs(j,i+dist)]);
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
}
#endif
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
if (it.uniform_o())
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs_uni(j,i)].Set(src[i].r[j],src[i].i[j]);
else
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_iter<vlen> &it,
const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst)
{
auto ptr=dst.vdata();
size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
__builtin_prefetch(&ptr[it.oofs(j,i+dist)],1,3);
ptr[it.oofs(j,i)] = src[i][j];
}
constexpr size_t dist=32;
if (it.uniform_o())
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_W(&ptr[it.oofs_uni(j,i+dist)]);
ptr[it.oofs_uni(j,i)] = src[i][j];
}
else
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
MRUTIL_PREFETCH_W(&ptr[it.oofs(j,i+dist)]);
ptr[it.oofs(j,i)] = src[i][j];
}
#endif
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)] = src[i][j];
if (it.uniform_o())
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs_uni(j,i)] = src[i][j];
else
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)] = src[i][j];
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
template<typename T, size_t vlen> MRUTIL_NOINLINE void copy_output(const multi_iter<vlen> &it,
const T *MRUTIL_RESTRICT src, fmav<T> &dst)
{
auto ptr=dst.vdata();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment