Commit b87f51f5 authored by Martin Reinecke's avatar Martin Reinecke

use system sin/cos

parent 189bffbf
import numpy as np
import pypocketfft
import pyfftw
from time import time
import matplotlib.pyplot as plt
import math
......@@ -108,7 +107,7 @@ def bench_nd(ndim, nmax, ntry, tp, funcs, nrepeat, ttl="", filename=""):
plt.show()
funcs = (measure_pypocketfft, measure_fftw)
funcs = (measure_pypocketfft, measure_fftw_np_interface)
ttl = "pypocketfft/FFTW()"
bench_nd(1, 8192, 100, "c16", funcs, 10, ttl, "1d.png")
bench_nd(2, 2048, 100, "c16", funcs, 2, ttl, "2d.png")
......
......@@ -266,295 +266,69 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
//
// twiddle factor section
//
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline float sinpi0(float a)
{
// adapted from https://stackoverflow.com/questions/42792939/
float s = a * a;
float r = -5.957031250000000000e-01f;
r = fma (r, s, 2.550399541854858398e+00f);
r = fma (r, s, -5.167724132537841797e+00f);
r = (a * s) * r;
return fma (a, 3.141592741012573242e+00f, r);
}
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline double sinpi0(double a)
{
// adapted from https://stackoverflow.com/questions/42792939/
double s = a * a;
double r = 4.6151442520157035e-4;
r = fma (r, s, -7.3700183130883555e-3);
r = fma (r, s, 8.2145868949323936e-2);
r = fma (r, s, -5.9926452893214921e-1);
r = fma (r, s, 2.5501640398732688e+0);
r = fma (r, s, -5.1677127800499516e+0);
s = s * a;
r = r * s;
return fma (a, 3.1415926535897931e+0, r);
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline float cosm1pi0(float a)
{
// adapted from https://stackoverflow.com/questions/42792939/
float s = a * a;
float r = 2.313842773437500000e-01f;
r = fmaf (r, s, -1.335021972656250000e+00f);
r = fmaf (r, s, 4.058703899383544922e+00f);
r = fmaf (r, s, -4.934802055358886719e+00f);
return r*s;
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline double cosm1pi0(double a)
{
// adapted from https://stackoverflow.com/questions/42792939/
double s = a * a;
double r = -1.0369917389758117e-4;
r = fma (r, s, 1.9294935641298806e-3);
r = fma (r, s, -2.5806887942825395e-2);
r = fma (r, s, 2.3533063028328211e-1);
r = fma (r, s, -1.3352627688538006e+0);
r = fma (r, s, 4.0587121264167623e+0);
r = fma (r, s, -4.9348022005446790e+0);
return r*s;
}
template <typename T> void sincosm1pi0(T a_, T *POCKETFFT_RESTRICT res)
{
if (sizeof(T)>sizeof(double)) // don't have the code for long double
{
constexpr T pi = T(3.141592653589793238462643383279502884197L);
auto s = sin(pi*a_);
res[1] = s;
res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
return;
}
res[0] = T(cosm1pi0(double(a_)));
res[1] = T(sinpi0(double(a_)));
}
template <typename T> T sinpi(T a)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
switch (i%4)
{
case 0:
return sinpi0(t);
case 1: case -3:
return cosm1pi0(t) + T(1.);
case 2: case -2:
return T(0.) - sinpi0(t);
case 3: case -1:
return T(-1.) - cosm1pi0(t);
}
throw runtime_error("cannot happen");
}
template <typename T> T cospi(T a)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
switch (i%4)
{
case 0:
return cosm1pi0(t) + T(1.);
case 1: case -3:
return T(0.) - sinpi0(t);
case 2: case -2:
return T(-1.) - cosm1pi0(t);
case 3: case -1:
return sinpi0(t);
}
throw runtime_error("cannot happen");
}
inline long double cospi(long double a)
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
return sizeof(long double) > sizeof(double) ? cos(a * pi) :
static_cast<long double>(cospi(static_cast<double>(a)));
}
inline long double sinpi(long double a)
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
return sizeof(long double) > sizeof(double) ? sin(a * pi) :
static_cast<long double>(sinpi(static_cast<double>(a)));
}
template <typename T> void sincospi(T a, T *POCKETFFT_RESTRICT res)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
auto c=cosm1pi0(t)+T(1.);
auto s=sinpi0(t);
// map results according to quadrant
if (i & 2)
{
s = T(0.)-s;
c = T(0.)-c;
}
if (i & 1)
{
swap(s, c);
c = T(0.)-c;
}
res[0]=c;
res[1]=s;
}
inline void sincospi(long double a, long double *POCKETFFT_RESTRICT res)
{
if (sizeof(long double) > sizeof(double))
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
res[0] = cos(pi * a);
res[1] = sin(pi * a);
}
else
{
double sincos[2];
sincospi(static_cast<double>(a), sincos);
res[0] = static_cast<long double>(sincos[0]);
res[1] = static_cast<long double>(sincos[1]);
}
}
template<typename T> class sincos_2pibyn
{
private:
using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
size_t sz;
arr<cmplx<T>> data;
POCKETFFT_NOINLINE void calc_first_octant(size_t den,
cmplx<T> * POCKETFFT_RESTRICT res)
{
size_t n = (den+4)>>3;
if (n==0) return;
res[0].Set(1.,0.);
if (n==1) return;
size_t l1 = size_t(sqrt(n));
arr<cmplx<Thigh>> tmp(l1);
for (size_t i=1; i<l1; ++i)
{
sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[i].r);
res[i].Set(T(tmp[i].r)+1,T(tmp[i].i));
}
size_t start=l1;
while(start<n)
{
cmplx<Thigh> cs;
sincosm1pi0((Thigh(2*start))/Thigh(den),&cs.r);
res[start].Set(T(cs.r+1), T(cs.i));
size_t end = l1;
if (start+end>n) end = n-start;
for (size_t i=1; i<end; ++i)
{
cmplx<Thigh> csx=tmp[i];
res[start+i].Set(T(((cs.r*csx.r - cs.i*csx.i + cs.r) + csx.r) + 1),
T((cs.r*csx.i + cs.i*csx.r) + cs.i + csx.i));
}
start += l1;
}
}
size_t mask, shift;
arr<cmplx<Thigh>> v1, v2;
void calc_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
{
arr<cmplx<T>> p(n/2); // n is always even here
calc_first_octant(n<<1, p.data());
size_t ndone=(n+2)>>2;
size_t i=0, idx1=0, idx2=ndone-1;
for (; i+1<ndone; i+=2, ++idx1, --idx2)
x<<=3;
if (x<4*n) // first half
{
res[idx1] = p[i];
res[idx2].Set(p[i+1].i, p[i+1].r);
if (x<2*n) // first quadrant
{
if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), sin(Thigh(x)*ang));
return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), cos(Thigh(2*n-x)*ang));
}
else // second quadrant
{
x-=2*n;
if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), cos(Thigh(x)*ang));
return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), sin(Thigh(2*n-x)*ang));
}
}
if (i!=ndone)
res[idx1] = p[i];
}
void calc_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
int ndone=int(n+1)>>1;
arr<cmplx<T>> p((n+1)/2); // n is always odd here
calc_first_octant(n<<2, p.data());
int i4=0, in=int(n), i=0;
for (; i4<=in-i4; ++i, i4+=4) // octant 0
res[i] = p[i4];
for (; i4-in <= 0; ++i, i4+=4) // octant 1
{ auto xm = in-i4; res[i].Set(p[xm].i, p[xm].r); }
for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
{ auto xm = i4-in; res[i].Set(-p[xm].i, p[xm].r); }
for (; i<ndone; ++i, i4+=4) // octant 3
{ auto xm = 2*in-i4; res[i].Set(-p[xm].r, p[xm].i); }
}
void fill_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
size_t quart = n>>2;
if ((n&7)==0)
res[quart/2].Set(hsqt2, hsqt2);
for (size_t i=1, j=quart-1; 2*i<quart; ++i, --j)
{ res[j].Set(res[i].i, res[i].r); }
}
POCKETFFT_NOINLINE void fill_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
size_t half = n>>1;
if ((n&3)==0)
for (size_t i=0; i<half/2; ++i)
res[i+half/2].Set(-res[i].i, res[i].r);
else
for (size_t i=1, j=half-1; 2*i<half; ++i, --j)
res[j].Set(-res[i].r, res[i].i);
res[half].Set(T(-1),T(0));
}
POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
if ((n&3)==0)
{
calc_first_octant(n, res);
fill_first_quadrant(n, res);
fill_first_half(n, res);
}
else if ((n&1)==0)
{
calc_first_quadrant(n, res);
fill_first_half(n, res);
x=8*n-x;
if (x<2*n) // third quadrant
{
if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), -sin(Thigh(x)*ang));
return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), -cos(Thigh(2*n-x)*ang));
}
else // fourth quadrant
{
x-=2*n;
if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), -cos(Thigh(x)*ang));
return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), -sin(Thigh(2*n-x)*ang));
}
}
else
calc_first_half(n, res);
}
public:
POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
: sz(n), data(n/2+1)
{ sincos_2pibyn_half(n, data.data()); }
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
Thigh ang = Thigh(0.25L*pi/n);
shift = 1;
while((size_t(1)<<shift)*(size_t(1)<<shift) < n) ++shift;
mask = (size_t(1)<<shift)-1;
v1.resize(mask+1);
v1[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v1.size(); ++i)
v1[i]=calc(i,n,ang);
v2.resize((n+mask)/(mask+1));
v2[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v2.size(); ++i)
v2[i]=calc(i*(mask+1),n,ang);
}
cmplx<T> operator[](size_t idx) const
{
if (idx<data.size())
return data[idx];
auto c = data[sz-idx];
c.i = -c.i;
return c;
auto x1=v1[idx&mask], x2=v2[idx>>shift];
return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
}
};
......@@ -2710,9 +2484,10 @@ template<typename T0> class T_dcst23
POCKETFFT_NOINLINE T_dcst23(size_t length)
: fftplan(length), twiddle(length)
{
const auto oo2n = T0(0.5)/T0(length);
constexpr auto pi = T0(3.141592653589793238462643383279502884197L);
const auto oo2n = pi*T0(0.5)/T0(length);
for (size_t i=0; i<length; ++i)
twiddle[i] = cospi(oo2n*T0(i+1));
twiddle[i] = cos(oo2n*T0(i+1));
}
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
......@@ -2789,9 +2564,9 @@ template<typename T0> class T_dcst4
if ((N&1)==0)
for (size_t i=0; i<N/2; ++i)
{
T0 sincos[2];
sincospi(oon*(T0(i)+T0(0.125)), sincos);
C2[i].Set(sincos[0], sincos[1]);
constexpr auto pi = T0(3.141592653589793238462643383279502884197L);
T0 ang = pi*oon*(T0(i)+T0(0.125));
C2[i].Set(cos(ang), sin(ang));
}
}
......
......@@ -2,8 +2,10 @@ import numpy as np
import pypocketfft
def _l2error(a, b):
return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))
np.random.seed(42)
def _l2error(a, b, axes):
return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))/np.log2(np.max([2,np.prod(np.take(a.shape,axes))]))
def fftn(a, axes=None, inorm=0, out=None, nthreads=1):
......@@ -26,7 +28,7 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1):
inorm=inorm, nthreads=nthreads)
nthreads = 0
nthreads = 1
def update_err(err, name, value):
......@@ -40,9 +42,9 @@ def update_err(err, name, value):
def test(err):
ndim = np.random.randint(1, 5)
ndim = 3# np.random.randint(1, 5)
axlen = int((2**20)**(1./ndim))
shape = np.random.randint(1, axlen, ndim)
shape = (np.random.randint(1, axlen, ndim)//2)*2+1
axes = np.arange(ndim)
np.random.shuffle(axes)
nax = np.random.randint(1, ndim+1)
......@@ -52,85 +54,85 @@ def test(err):
a_32 = a.astype(np.complex64)
b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a, b))
err = update_err(err, "cmax", _l2error(a, b, axes))
b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b))
err = update_err(err, "cmax", _l2error(a.real, b, axes))
b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b))
err = update_err(err, "cmax", _l2error(a.real, b, axes))
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b))
err = update_err(err, "rmax", _l2error(a.real, b, axes))
b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b))
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes))
b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b))
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes))
b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b))
err = update_err(err, "hmax", _l2error(a.real, b, axes))
b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b))
err = update_err(err, "hmax", _l2error(a.real, b, axes))
b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(
a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b))
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes,
nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b))
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
if all(a.shape[i] > 1 for i in axes):
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1max", _l2error(a.real, b))
err = update_err(err, "c1max", _l2error(a.real, b, axes))
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1maxf", _l2error(a_32.real, b))
err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23max", _l2error(a.real, b))
err = update_err(err, "c23max", _l2error(a.real, b, axes))
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23maxf", _l2error(a_32.real, b))
err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4max", _l2error(a.real, b))
err = update_err(err, "c4max", _l2error(a.real, b, axes))
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4maxf", _l2error(a_32.real, b))
err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1maxf", _l2error(a_32.real, b))
err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23max", _l2error(a.real, b))
err = update_err(err, "s23max", _l2error(a.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23maxf", _l2error(a_32.real, b))
err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4max", _l2error(a.real, b))
err = update_err(err, "s4max", _l2error(a.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4maxf", _l2error(a_32.real, b))
err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes))
err = dict()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment