Commit 367ace56 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

evolution

parent b0b3677a
pypocketfft
===========
This package provides Fast Fourier and Hartley transforms with a simple
Python interface.
Features
--------
- supports fully complex and half-complex (i.e. complex-to-real and
real-to-complex) FFTs
- supports multidimensional arrays and selection of the axes to be transformed.
- supports single and double precision
- makes use of CPU vector instructions when performing 2D and higher-dimensional
transforms
- does not have persistent transform plans, which makes the interface simpler
- supports prime-length transforms without degrading to O(N**2) performance
......@@ -1601,7 +1601,7 @@ template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct)
public:
template<typename T> void forward(T c[], T0 fact)
template<typename T> NOINLINE void forward(T c[], T0 fact)
{
if (length==1) { c[0]*=fact; return; }
size_t n=length;
......@@ -1633,7 +1633,7 @@ template<typename T> void forward(T c[], T0 fact)
copy_and_norm(c,p1,n,fact);
}
template<typename T> void backward(T c[], T0 fact)
template<typename T> NOINLINE void backward(T c[], T0 fact)
{
if (length==1) { c[0]*=fact; return; }
size_t n=length;
......@@ -1834,13 +1834,13 @@ template<typename T0> class fftblue
plan.forward((cmplx<T0> *)bkf,1.);
}
template<typename T> void backward(T c[], T0 fct)
template<typename T> NOINLINE void backward(T c[], T0 fct)
{ fft<true>(c,fct); }
template<typename T> void forward(T c[], T0 fct)
template<typename T> NOINLINE void forward(T c[], T0 fct)
{ fft<false>(c,fct); }
template<typename T> void backward_r(T c[], T0 fct)
template<typename T> NOINLINE void backward_r(T c[], T0 fct)
{
arr<T> tmp(2*n);
tmp[0]=c[0];
......@@ -1857,7 +1857,7 @@ template<typename T0> class fftblue
c[m] = tmp[2*m];
}
template<typename T> void forward_r(T c[], T0 fct)
template<typename T> NOINLINE void forward_r(T c[], T0 fct)
{
arr<T> tmp(2*n);
for (size_t m=0; m<n; ++m)
......@@ -1901,13 +1901,13 @@ template<typename T0> class pocketfft_c
packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
}
template<typename T> void backward(T c[], T0 fct)
template<typename T> NOINLINE void backward(T c[], T0 fct)
{
packplan ? packplan->backward((cmplx<T> *)c,fct)
: blueplan->backward(c,fct);
}
template<typename T> void forward(T c[], T0 fct)
template<typename T> NOINLINE void forward(T c[], T0 fct)
{
packplan ? packplan->forward((cmplx<T> *)c,fct)
: blueplan->forward(c,fct);
......@@ -1946,13 +1946,13 @@ template<typename T0> class pocketfft_r
packplan=unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
}
template<typename T> void backward(T c[], T0 fct)
template<typename T> NOINLINE void backward(T c[], T0 fct)
{
packplan ? packplan->backward(c,fct)
: blueplan->backward_r(c,fct);
}
template<typename T> void forward(T c[], T0 fct)
template<typename T> NOINLINE void forward(T c[], T0 fct)
{
packplan ? packplan->forward(c,fct)
: blueplan->forward_r(c,fct);
......@@ -1970,6 +1970,8 @@ using stride_t = vector<int64_t>;
struct diminfo
{ size_t n; int64_t s; };
struct diminfo_io
{ size_t n; int64_t s_i, s_o; };
class multiarr
{
private:
......@@ -1987,49 +1989,51 @@ class multiarr
int64_t stride(size_t i) const { return dim[i].s; }
};
class multi_iter
class multi_iter_io
{
public:
vector<diminfo> dim;
vector<diminfo_io> dim;
shape_t pos;
int64_t ofs_;
size_t len;
int64_t str;
int64_t ofs_i, ofs_o;
size_t len_i, len_o;
int64_t str_i, str_o;
size_t rem;
bool done_;
public:
multi_iter(const multiarr &arr, size_t idim)
: pos(arr.ndim()-1, 0), ofs_(0), len(arr.size(idim)),
str(arr.stride(idim)), rem(1), done_(false)
multi_iter_io(const multiarr &arr_i, const multiarr &arr_o, size_t idim)
: pos(arr_i.ndim()-1, 0), ofs_i(0), ofs_o(0), len_i(arr_i.size(idim)),
len_o(arr_o.size(idim)), str_i(arr_i.stride(idim)),
str_o(arr_o.stride(idim)), rem(1)
{
dim.reserve(arr.ndim()-1);
for (size_t i=0; i<arr.ndim(); ++i)
dim.reserve(arr_i.ndim()-1);
for (size_t i=0; i<arr_i.ndim(); ++i)
if (i!=idim)
{
dim.push_back({arr.size(i), arr.stride(i)});
done_ = done_ || (arr.size(i)==0);
rem *= arr.size(i);
dim.push_back({arr_i.size(i), arr_i.stride(i), arr_o.stride(i)});
rem *= arr_i.size(i);
}
}
void advance()
{
if (--rem==0) {done_=true; return; }
if (--rem==0) return;
for (int i=pos.size()-1; i>=0; --i)
{
++pos[i];
ofs_ += dim[i].s;
ofs_i += dim[i].s_i;
ofs_o += dim[i].s_o;
if (pos[i] < dim[i].n)
return;
pos[i] = 0;
ofs_ -= dim[i].n*dim[i].s;
ofs_i -= dim[i].n*dim[i].s_i;
ofs_o -= dim[i].n*dim[i].s_o;
}
done_ = true;
}
bool done() const { return done_; }
int64_t offset() const { return ofs_; }
size_t length() const { return len; }
int64_t stride() const { return str; }
int64_t offset_in() const { return ofs_i; }
int64_t offset_out() const { return ofs_o; }
size_t length_in() const { return len_i; }
size_t length_out() const { return len_o; }
int64_t stride_in() const { return str_i; }
int64_t stride_out() const { return str_o; }
size_t remaining() const { return rem; }
};
......@@ -2071,22 +2075,24 @@ template<> struct VTYPE<float>
};
#endif
size_t prod(const shape_t &shape)
{
size_t res=1;
for (auto sz: shape)
res*=sz;
return res;
}
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize)
{
size_t fullsize=1;
for (auto sz: shape)
fullsize*=sz;
auto othersize = fullsize/axsize;
auto othersize = prod(shape)/axsize;
auto tmpsize = axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1);
return arr<char>(tmpsize*elemsize);
}
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize)
{
size_t fullsize=1;
for (auto sz:shape)
fullsize*=sz;
size_t fullsize=prod(shape);
size_t tmpsize=0;
for (size_t i=0; i<axes.size(); ++i)
{
......@@ -2097,18 +2103,19 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
return arr<char>(tmpsize*elemsize);
}
template<size_t vlen> struct multioffset
template<size_t vlen> struct multioffset_io
{
int64_t ofs[vlen];
multioffset(multi_iter &it)
int64_t ofs_i[vlen], ofs_o[vlen];
multioffset_io(multi_iter_io &it)
{
for (size_t i=0; i<vlen; ++i)
{ ofs[i] = it.offset(); it.advance(); }
{ ofs_i[i] = it.offset_in(); ofs_o[i] = it.offset_out(); it.advance(); }
}
int64_t operator[](size_t i) const { return ofs[i]; }
int64_t in(size_t i) const { return ofs_i[i]; }
int64_t out(size_t i) const { return ofs_o[i]; }
};
template<typename T> void pocketfft_general_c(const shape_t &shape,
template<typename T> NOINLINE void pocketfft_general_c(const shape_t &shape,
const stride_t &stride_in, const stride_t &stride_out,
const shape_t &axes, bool forward, const cmplx<T> *data_in,
cmplx<T> *data_out, T fct)
......@@ -2124,50 +2131,49 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
for (size_t iax=0; iax<axes.size(); ++iax)
{
multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
size_t len=it_in.length();
int64_t s_i=it_in.stride(), s_o=it_out.stride();
multi_iter_io it(a_in, a_out, axes[iax]);
size_t len=it.length_in();
int64_t s_i=it.stride_in(), s_o=it.stride_out();
if ((!plan) || (len!=plan->length()))
plan.reset(new pocketfft_c<T>(len));
if (vlen>1)
while (it_in.remaining()>=vlen)
while (it.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
multioffset_io<vlen> p(it);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
{
tdatav[i].r[j] = data_in[p_i[j]+i*s_i].r;
tdatav[i].i[j] = data_in[p_i[j]+i*s_i].i;
tdatav[i].r[j] = data_in[p.in(j)+i*s_i].r;
tdatav[i].i[j] = data_in[p.out(j)+i*s_i].i;
}
forward ? plan->forward((vtype *)tdatav, fct)
: plan->backward((vtype *)tdatav, fct);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o].Set(tdatav[i].r[j],tdatav[i].i[j]);
data_out[p.out(j)+i*s_o].Set(tdatav[i].r[j],tdatav[i].i[j]);
}
while (it_in.remaining()>0)
while (it.remaining()>0)
{
if ((data_in==data_out) && s_i==1) // fully in-place
forward ? plan->forward((T *)(data_in+it_in.offset()), fct)
: plan->backward((T *)(data_in+it_in.offset()), fct);
forward ? plan->forward((T *)(data_in+it.offset_in()), fct)
: plan->backward((T *)(data_in+it.offset_in()), fct);
else if (s_o==1) // compute FFT in output location
{
for (size_t i=0; i<len; ++i)
data_out[it_out.offset()+i] = data_in[it_in.offset()+i*s_i];
forward ? plan->forward((T *)(data_out+it_out.offset()), fct)
: plan->backward((T *)(data_out+it_out.offset()), fct);
data_out[it.offset_out()+i] = data_in[it.offset_in()+i*s_i];
forward ? plan->forward((T *)(data_out+it.offset_out()), fct)
: plan->backward((T *)(data_out+it.offset_out()), fct);
}
else
{
for (size_t i=0; i<len; ++i)
tdata[i] = data_in[it_in.offset() + i*s_i];
tdata[i] = data_in[it.offset_in() + i*s_i];
forward ? plan->forward((T *)tdata, fct)
: plan->backward((T *)tdata, fct);
for (size_t i=0; i<len; ++i)
data_out[it_out.offset()+i*s_o] = tdata[i];
data_out[it.offset_out()+i*s_o] = tdata[i];
}
it_in.advance();
it_out.advance();
it.advance();
}
// after the first dimension, take data from output array
a_in = a_out;
......@@ -2177,7 +2183,7 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
}
}
template<typename T> void pocketfft_general_hartley(const shape_t &shape,
template<typename T> NOINLINE void pocketfft_general_hartley(const shape_t &shape,
const stride_t &stride_in, const stride_t &stride_out,
const shape_t &axes, const T *data_in, T *data_out, T fct)
{
......@@ -2192,49 +2198,48 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
for (size_t iax=0; iax<axes.size(); ++iax)
{
multi_iter it_in(a_in, axes[iax]), it_out(a_out, axes[iax]);
size_t len=it_in.length();
int64_t s_i=it_in.stride(), s_o=it_out.stride();
multi_iter_io it(a_in, a_out, axes[iax]);
size_t len=it.length_in();
int64_t s_i=it.stride_in(), s_o=it.stride_out();
if ((!plan) || (len!=plan->length()))
plan.reset(new pocketfft_r<T>(len));
if (vlen>1)
while (it_in.remaining()>=vlen)
while (it.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
multioffset_io<vlen> p(it);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*s_i];
tdatav[i][j] = data_in[p.in(j)+i*s_i];
plan->forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]] = tdatav[0][j];
data_out[p.out(j)] = tdatav[0][j];
size_t i=1, i1=1, i2=len-1;
for (i=1; i<len-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
data_out[p_o[j]+i1*s_o] = tdatav[i][j]+tdatav[i+1][j];
data_out[p_o[j]+i2*s_o] = tdatav[i][j]-tdatav[i+1][j];
data_out[p.out(j)+i1*s_o] = tdatav[i][j]+tdatav[i+1][j];
data_out[p.out(j)+i2*s_o] = tdatav[i][j]-tdatav[i+1][j];
}
if (i<len)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i1*s_o] = tdatav[i][j];
data_out[p.out(j)+i1*s_o] = tdatav[i][j];
}
while (it_in.remaining()>0)
while (it.remaining()>0)
{
for (size_t i=0; i<len; ++i)
tdata[i] = data_in[it_in.offset()+i*s_i];
tdata[i] = data_in[it.offset_in()+i*s_i];
plan->forward((T *)tdata, fct);
// Hartley order
data_out[it_out.offset()] = tdata[0];
data_out[it.offset_out()] = tdata[0];
size_t i=1, i1=1, i2=len-1;
for (i=1; i<len-1; i+=2, ++i1, --i2)
{
data_out[it_out.offset()+i1*s_o] = tdata[i]+tdata[i+1];
data_out[it_out.offset()+i2*s_o] = tdata[i]-tdata[i+1];
data_out[it.offset_out()+i1*s_o] = tdata[i]+tdata[i+1];
data_out[it.offset_out()+i2*s_o] = tdata[i]-tdata[i+1];
}
if (i<len)
data_out[it_out.offset()+i1*s_o] = tdata[i];
it_in.advance();
it_out.advance();
data_out[it.offset_out()+i1*s_o] = tdata[i];
it.advance();
}
// after the first dimension, take data from output array
a_in = a_out;
......@@ -2244,7 +2249,7 @@ template<typename T> void pocketfft_general_hartley(const shape_t &shape,
}
}
template<typename T> void pocketfft_general_r2c(const shape_t &shape,
template<typename T> NOINLINE void pocketfft_general_r2c(const shape_t &shape,
const stride_t &stride_in, const stride_t &stride_out, size_t axis,
const T *data_in, cmplx<T> *data_out, T fct)
{
......@@ -2256,31 +2261,31 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape,
multiarr a_in(shape, stride_in), a_out(shape, stride_out);
pocketfft_r<T> plan(shape[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
multi_iter_io it(a_in, a_out, axis);
size_t len=shape[axis];
int64_t s_i=it_in.stride(), s_o=it_out.stride();
int64_t s_i=it.stride_in(), s_o=it.stride_out();
if (vlen>1)
while (it_in.remaining()>=vlen)
while (it.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
multioffset_io<vlen> p(it);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+i*s_i];
tdatav[i][j] = data_in[p.in(j)+i*s_i];
plan.forward((vtype *)tdatav, fct);
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]].Set(tdatav[0][j]);
data_out[p.out(j)].Set(tdatav[0][j]);
size_t i=1, ii=1;
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+ii*s_o].Set(tdatav[i][j], tdatav[i+1][j]);
data_out[p.out(j)+ii*s_o].Set(tdatav[i][j], tdatav[i+1][j]);
if (i<len)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+ii*s_o].Set(tdatav[i][j]);
data_out[p.out(j)+ii*s_o].Set(tdatav[i][j]);
}
while (it_in.remaining()>0)
while (it.remaining()>0)
{
const T *d_i = data_in+it_in.offset();
cmplx<T> *d_o = data_out+it_out.offset();
const T *d_i = data_in+it.offset_in();
cmplx<T> *d_o = data_out+it.offset_out();
for (size_t i=0; i<len; ++i)
tdata[i] = d_i[i*s_i];
plan.forward(tdata, fct);
......@@ -2290,11 +2295,10 @@ template<typename T> void pocketfft_general_r2c(const shape_t &shape,
d_o[ii*s_o].Set(tdata[i], tdata[i+1]);
if (i<len)
d_o[ii*s_o].Set(tdata[i]);
it_in.advance();
it_out.advance();
it.advance();
}
}
template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
template<typename T> NOINLINE void pocketfft_general_c2r(const shape_t &shape_out,
const stride_t &stride_in, const stride_t &stride_out, size_t axis,
const cmplx<T> *data_in, T *data_out, T fct)
{
......@@ -2306,34 +2310,34 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
multiarr a_in(shape_out, stride_in), a_out(shape_out, stride_out);
pocketfft_r<T> plan(shape_out[axis]);
multi_iter it_in(a_in, axis), it_out(a_out, axis);
multi_iter_io it(a_in, a_out, axis);
size_t len=shape_out[axis];
int64_t s_i=it_in.stride(), s_o=it_out.stride();
int64_t s_i=it.stride_in(), s_o=it.stride_out();
if (vlen>1)
while (it_in.remaining()>=vlen)
while (it.remaining()>=vlen)
{
multioffset<vlen> p_i(it_in), p_o(it_out);
multioffset_io<vlen> p(it);
for (size_t j=0; j<vlen; ++j)
tdatav[0][j]=data_in[p_i[j]].r;
tdatav[0][j]=data_in[p.in(j)].r;
size_t i=1, ii=1;
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
{
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
tdatav[i+1][j] = data_in[p_i[j]+ii*s_i].i;
tdatav[i][j] = data_in[p.in(j)+ii*s_i].r;
tdatav[i+1][j] = data_in[p.in(j)+ii*s_i].i;
}
if (i<len)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = data_in[p_i[j]+ii*s_i].r;
tdatav[i][j] = data_in[p.in(j)+ii*s_i].r;
plan.backward(tdatav, fct);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
data_out[p_o[j]+i*s_o] = tdatav[i][j];
data_out[p.out(j)+i*s_o] = tdatav[i][j];
}
while (!it_in.done())
while (it.remaining()>0)
{
const cmplx<T> *d_i = data_in+it_in.offset();
T *d_o = data_out+it_out.offset();
const cmplx<T> *d_i = data_in+it.offset_in();
T *d_o = data_out+it.offset_out();
tdata[0]=d_i[0].r;
size_t i=1, ii=1;
for (; i<len-1; i+=2, ++ii)
......@@ -2346,8 +2350,7 @@ template<typename T> void pocketfft_general_c2r(const shape_t &shape_out,
plan.backward(tdata, fct);
for (size_t i=0; i<len; ++i)
d_o[i*s_o] = tdata[i];
it_in.advance();
it_out.advance();
it.advance();
}
}
......@@ -2508,38 +2511,37 @@ template<typename T>py::array complex2hartley(const py::array &in,
size_t axis = axes.back();
multiarr a_tmp(dims_tmp, stride_tmp),
a_out(dims_out, stride_out);
multi_iter it_tmp(a_tmp, axis), it_out(a_out, axis);
multi_iter_io it(a_tmp, a_out, axis);
const cmplx<T> *tdata = (const cmplx<T> *)tmp.data();
T *odata = (T *)out.mutable_data();
vector<bool> swp(ndim-1,false);
for (auto i: axes)
if (i!=axis)
swp[i<axis ? i : i-1] = true;
while(!it_tmp.done())
while(it.remaining()>0)
{
auto tofs = it_tmp.offset();
auto ofs = it_out.offset();
auto tofs = it.offset_in();
auto ofs = it.offset_out();
int64_t rofs = 0;
for (size_t i=0; i<it_out.pos.size(); ++i)
for (size_t i=0; i<it.pos.size(); ++i)
{
if (!swp[i])
rofs += it_out.pos[i]*it_out.dim[i].s;
rofs += it.pos[i]*it.dim[i].s_o;
else
{
auto x = (it_out.pos[i]==0) ? 0 : it_out.dim[i].n-it_out.pos[i];
rofs += x*it_out.dim[i].s;
auto x = (it.pos[i]==0) ? 0 : it.dim[i].n-it.pos[i];
rofs += x*it.dim[i].s_o;
}
}
for (size_t i=0; i<it_tmp.length(); ++i)
for (size_t i=0; i<it.length_in(); ++i)
{
auto re = tdata[tofs + i*it_tmp.stride()].r;
auto im = tdata[tofs + i*it_tmp.stride()].i;
auto rev_i = (i==0) ? 0 : it_out.length()-i;
odata[ofs + i*it_out.stride()] = re+im;
odata[rofs + rev_i*it_out.stride()] = re-im;
auto re = tdata[tofs + i*it.stride_in()].r;
auto im = tdata[tofs + i*it.stride_in()].i;
auto rev_i = (i==0) ? 0 : it.length_out()-i;
odata[ofs + i*it.stride_out()] = re+im;
odata[rofs + rev_i*it.stride_out()] = re-im;
}
it_tmp.advance();
it_out.advance();
it.advance();
}
return out;
}
......
import numpy as np
import pypocketfft
def _l2error(a,b):
return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))
cmaxerr=0.
fmaxerr=0.
cmaxerrf=0.
fmaxerrf=0.
hmaxerr=0.
hmaxerrf=0.
def test():
global cmaxerr, fmaxerr, hmaxerr, cmaxerrf, fmaxerrf, hmaxerrf
ndim=np.random.randint(1,5)
axlen=int((2**20)**(1./ndim))
shape = np.random.randint(1,axlen,ndim)
axes = np.arange(ndim)
np.random.shuffle(axes)
nax = np.random.randint(1,ndim+1)
axes = axes[:nax]
lastsize = shape[axes[-1]]
fct = 1./np.prod(np.take(shape, axes))
a=np.random.rand(*shape)-0.5 + 1j*np.random.rand(*shape)-0.5j
b=pypocketfft.ifftn(pypocketfft.fftn(a,axes=axes),axes=axes,fct=fct)
err = _l2error(a,b)
if err > cmaxerr:
cmaxerr = err
print("cmaxerr:", cmaxerr, shape, axes)
b=pypocketfft.irfftn(pypocketfft.rfftn(a.real,axes=axes),axes=axes,fct=fct,lastsize=lastsize)
err = _l2error(a.real,b)
if err > fmaxerr: