Commit b87f51f5 authored by Martin Reinecke's avatar Martin Reinecke

use system sin/cos

parent 189bffbf
import numpy as np import numpy as np
import pypocketfft import pypocketfft
import pyfftw
from time import time from time import time
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import math import math
...@@ -108,7 +107,7 @@ def bench_nd(ndim, nmax, ntry, tp, funcs, nrepeat, ttl="", filename=""): ...@@ -108,7 +107,7 @@ def bench_nd(ndim, nmax, ntry, tp, funcs, nrepeat, ttl="", filename=""):
plt.show() plt.show()
funcs = (measure_pypocketfft, measure_fftw) funcs = (measure_pypocketfft, measure_fftw_np_interface)
ttl = "pypocketfft/FFTW()" ttl = "pypocketfft/FFTW()"
bench_nd(1, 8192, 100, "c16", funcs, 10, ttl, "1d.png") bench_nd(1, 8192, 100, "c16", funcs, 10, ttl, "1d.png")
bench_nd(2, 2048, 100, "c16", funcs, 2, ttl, "2d.png") bench_nd(2, 2048, 100, "c16", funcs, 2, ttl, "2d.png")
......
...@@ -266,295 +266,69 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a) ...@@ -266,295 +266,69 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
// //
// twiddle factor section // twiddle factor section
// //
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline float sinpi0(float a)
{
// adapted from https://stackoverflow.com/questions/42792939/
float s = a * a;
float r = -5.957031250000000000e-01f;
r = fma (r, s, 2.550399541854858398e+00f);
r = fma (r, s, -5.167724132537841797e+00f);
r = (a * s) * r;
return fma (a, 3.141592741012573242e+00f, r);
}
/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
inline double sinpi0(double a)
{
// adapted from https://stackoverflow.com/questions/42792939/
double s = a * a;
double r = 4.6151442520157035e-4;
r = fma (r, s, -7.3700183130883555e-3);
r = fma (r, s, 8.2145868949323936e-2);
r = fma (r, s, -5.9926452893214921e-1);
r = fma (r, s, 2.5501640398732688e+0);
r = fma (r, s, -5.1677127800499516e+0);
s = s * a;
r = r * s;
return fma (a, 3.1415926535897931e+0, r);
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline float cosm1pi0(float a)
{
// adapted from https://stackoverflow.com/questions/42792939/
float s = a * a;
float r = 2.313842773437500000e-01f;
r = fmaf (r, s, -1.335021972656250000e+00f);
r = fmaf (r, s, 4.058703899383544922e+00f);
r = fmaf (r, s, -4.934802055358886719e+00f);
return r*s;
}
/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
inline double cosm1pi0(double a)
{
// adapted from https://stackoverflow.com/questions/42792939/
double s = a * a;
double r = -1.0369917389758117e-4;
r = fma (r, s, 1.9294935641298806e-3);
r = fma (r, s, -2.5806887942825395e-2);
r = fma (r, s, 2.3533063028328211e-1);
r = fma (r, s, -1.3352627688538006e+0);
r = fma (r, s, 4.0587121264167623e+0);
r = fma (r, s, -4.9348022005446790e+0);
return r*s;
}
template <typename T> void sincosm1pi0(T a_, T *POCKETFFT_RESTRICT res)
{
if (sizeof(T)>sizeof(double)) // don't have the code for long double
{
constexpr T pi = T(3.141592653589793238462643383279502884197L);
auto s = sin(pi*a_);
res[1] = s;
res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
return;
}
res[0] = T(cosm1pi0(double(a_)));
res[1] = T(sinpi0(double(a_)));
}
template <typename T> T sinpi(T a)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
switch (i%4)
{
case 0:
return sinpi0(t);
case 1: case -3:
return cosm1pi0(t) + T(1.);
case 2: case -2:
return T(0.) - sinpi0(t);
case 3: case -1:
return T(-1.) - cosm1pi0(t);
}
throw runtime_error("cannot happen");
}
template <typename T> T cospi(T a)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
switch (i%4)
{
case 0:
return cosm1pi0(t) + T(1.);
case 1: case -3:
return T(0.) - sinpi0(t);
case 2: case -2:
return T(-1.) - cosm1pi0(t);
case 3: case -1:
return sinpi0(t);
}
throw runtime_error("cannot happen");
}
inline long double cospi(long double a)
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
return sizeof(long double) > sizeof(double) ? cos(a * pi) :
static_cast<long double>(cospi(static_cast<double>(a)));
}
inline long double sinpi(long double a)
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
return sizeof(long double) > sizeof(double) ? sin(a * pi) :
static_cast<long double>(sinpi(static_cast<double>(a)));
}
template <typename T> void sincospi(T a, T *POCKETFFT_RESTRICT res)
{
// reduce argument to primary approximation interval (-0.25, 0.25)
auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
auto i = (int64_t)r;
auto t = fma (T(-0.5), r, a);
auto c=cosm1pi0(t)+T(1.);
auto s=sinpi0(t);
// map results according to quadrant
if (i & 2)
{
s = T(0.)-s;
c = T(0.)-c;
}
if (i & 1)
{
swap(s, c);
c = T(0.)-c;
}
res[0]=c;
res[1]=s;
}
inline void sincospi(long double a, long double *POCKETFFT_RESTRICT res)
{
if (sizeof(long double) > sizeof(double))
{
constexpr auto pi = 3.141592653589793238462643383279502884197L;
res[0] = cos(pi * a);
res[1] = sin(pi * a);
}
else
{
double sincos[2];
sincospi(static_cast<double>(a), sincos);
res[0] = static_cast<long double>(sincos[0]);
res[1] = static_cast<long double>(sincos[1]);
}
}
template<typename T> class sincos_2pibyn template<typename T> class sincos_2pibyn
{ {
private: private:
using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
size_t sz; size_t mask, shift;
arr<cmplx<T>> data; arr<cmplx<Thigh>> v1, v2;
POCKETFFT_NOINLINE void calc_first_octant(size_t den,
cmplx<T> * POCKETFFT_RESTRICT res)
{
size_t n = (den+4)>>3;
if (n==0) return;
res[0].Set(1.,0.);
if (n==1) return;
size_t l1 = size_t(sqrt(n));
arr<cmplx<Thigh>> tmp(l1);
for (size_t i=1; i<l1; ++i)
{
sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[i].r);
res[i].Set(T(tmp[i].r)+1,T(tmp[i].i));
}
size_t start=l1;
while(start<n)
{
cmplx<Thigh> cs;
sincosm1pi0((Thigh(2*start))/Thigh(den),&cs.r);
res[start].Set(T(cs.r+1), T(cs.i));
size_t end = l1;
if (start+end>n) end = n-start;
for (size_t i=1; i<end; ++i)
{
cmplx<Thigh> csx=tmp[i];
res[start+i].Set(T(((cs.r*csx.r - cs.i*csx.i + cs.r) + csx.r) + 1),
T((cs.r*csx.i + cs.i*csx.r) + cs.i + csx.i));
}
start += l1;
}
}
void calc_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
{ {
arr<cmplx<T>> p(n/2); // n is always even here x<<=3;
calc_first_octant(n<<1, p.data()); if (x<4*n) // first half
size_t ndone=(n+2)>>2;
size_t i=0, idx1=0, idx2=ndone-1;
for (; i+1<ndone; i+=2, ++idx1, --idx2)
{ {
res[idx1] = p[i]; if (x<2*n) // first quadrant
res[idx2].Set(p[i+1].i, p[i+1].r); {
if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), sin(Thigh(x)*ang));
return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), cos(Thigh(2*n-x)*ang));
}
else // second quadrant
{
x-=2*n;
if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), cos(Thigh(x)*ang));
return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), sin(Thigh(2*n-x)*ang));
}
} }
if (i!=ndone)
res[idx1] = p[i];
}
void calc_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
int ndone=int(n+1)>>1;
arr<cmplx<T>> p((n+1)/2); // n is always odd here
calc_first_octant(n<<2, p.data());
int i4=0, in=int(n), i=0;
for (; i4<=in-i4; ++i, i4+=4) // octant 0
res[i] = p[i4];
for (; i4-in <= 0; ++i, i4+=4) // octant 1
{ auto xm = in-i4; res[i].Set(p[xm].i, p[xm].r); }
for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
{ auto xm = i4-in; res[i].Set(-p[xm].i, p[xm].r); }
for (; i<ndone; ++i, i4+=4) // octant 3
{ auto xm = 2*in-i4; res[i].Set(-p[xm].r, p[xm].i); }
}
void fill_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
size_t quart = n>>2;
if ((n&7)==0)
res[quart/2].Set(hsqt2, hsqt2);
for (size_t i=1, j=quart-1; 2*i<quart; ++i, --j)
{ res[j].Set(res[i].i, res[i].r); }
}
POCKETFFT_NOINLINE void fill_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
size_t half = n>>1;
if ((n&3)==0)
for (size_t i=0; i<half/2; ++i)
res[i+half/2].Set(-res[i].i, res[i].r);
else else
for (size_t i=1, j=half-1; 2*i<half; ++i, --j)
res[j].Set(-res[i].r, res[i].i);
res[half].Set(T(-1),T(0));
}
POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
{
if ((n&3)==0)
{
calc_first_octant(n, res);
fill_first_quadrant(n, res);
fill_first_half(n, res);
}
else if ((n&1)==0)
{ {
calc_first_quadrant(n, res); x=8*n-x;
fill_first_half(n, res); if (x<2*n) // third quadrant
{
if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), -sin(Thigh(x)*ang));
return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), -cos(Thigh(2*n-x)*ang));
}
else // fourth quadrant
{
x-=2*n;
if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), -cos(Thigh(x)*ang));
return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), -sin(Thigh(2*n-x)*ang));
}
} }
else
calc_first_half(n, res);
} }
public: public:
POCKETFFT_NOINLINE sincos_2pibyn(size_t n) POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
: sz(n), data(n/2+1) {
{ sincos_2pibyn_half(n, data.data()); } constexpr auto pi = 3.141592653589793238462643383279502884197L;
Thigh ang = Thigh(0.25L*pi/n);
shift = 1;
while((size_t(1)<<shift)*(size_t(1)<<shift) < n) ++shift;
mask = (size_t(1)<<shift)-1;
v1.resize(mask+1);
v1[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v1.size(); ++i)
v1[i]=calc(i,n,ang);
v2.resize((n+mask)/(mask+1));
v2[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v2.size(); ++i)
v2[i]=calc(i*(mask+1),n,ang);
}
cmplx<T> operator[](size_t idx) const cmplx<T> operator[](size_t idx) const
{ {
if (idx<data.size()) auto x1=v1[idx&mask], x2=v2[idx>>shift];
return data[idx]; return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
auto c = data[sz-idx];
c.i = -c.i;
return c;
} }
}; };
...@@ -2710,9 +2484,10 @@ template<typename T0> class T_dcst23 ...@@ -2710,9 +2484,10 @@ template<typename T0> class T_dcst23
POCKETFFT_NOINLINE T_dcst23(size_t length) POCKETFFT_NOINLINE T_dcst23(size_t length)
: fftplan(length), twiddle(length) : fftplan(length), twiddle(length)
{ {
const auto oo2n = T0(0.5)/T0(length); constexpr auto pi = T0(3.141592653589793238462643383279502884197L);
const auto oo2n = pi*T0(0.5)/T0(length);
for (size_t i=0; i<length; ++i) for (size_t i=0; i<length; ++i)
twiddle[i] = cospi(oo2n*T0(i+1)); twiddle[i] = cos(oo2n*T0(i+1));
} }
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
...@@ -2789,9 +2564,9 @@ template<typename T0> class T_dcst4 ...@@ -2789,9 +2564,9 @@ template<typename T0> class T_dcst4
if ((N&1)==0) if ((N&1)==0)
for (size_t i=0; i<N/2; ++i) for (size_t i=0; i<N/2; ++i)
{ {
T0 sincos[2]; constexpr auto pi = T0(3.141592653589793238462643383279502884197L);
sincospi(oon*(T0(i)+T0(0.125)), sincos); T0 ang = pi*oon*(T0(i)+T0(0.125));
C2[i].Set(sincos[0], sincos[1]); C2[i].Set(cos(ang), sin(ang));
} }
} }
......
...@@ -2,8 +2,10 @@ import numpy as np ...@@ -2,8 +2,10 @@ import numpy as np
import pypocketfft import pypocketfft
def _l2error(a, b): np.random.seed(42)
return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))
def _l2error(a, b, axes):
return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))/np.log2(np.max([2,np.prod(np.take(a.shape,axes))]))
def fftn(a, axes=None, inorm=0, out=None, nthreads=1): def fftn(a, axes=None, inorm=0, out=None, nthreads=1):
...@@ -26,7 +28,7 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1): ...@@ -26,7 +28,7 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1):
inorm=inorm, nthreads=nthreads) inorm=inorm, nthreads=nthreads)
nthreads = 0 nthreads = 1
def update_err(err, name, value): def update_err(err, name, value):
...@@ -40,9 +42,9 @@ def update_err(err, name, value): ...@@ -40,9 +42,9 @@ def update_err(err, name, value):
def test(err): def test(err):
ndim = np.random.randint(1, 5) ndim = 3# np.random.randint(1, 5)
axlen = int((2**20)**(1./ndim)) axlen = int((2**20)**(1./ndim))
shape = np.random.randint(1, axlen, ndim) shape = (np.random.randint(1, axlen, ndim)//2)*2+1
axes = np.arange(ndim) axes = np.arange(ndim)
np.random.shuffle(axes) np.random.shuffle(axes)
nax = np.random.randint(1, ndim+1) nax = np.random.randint(1, ndim+1)
...@@ -52,85 +54,85 @@ def test(err): ...@@ -52,85 +54,85 @@ def test(err):
a_32 = a.astype(np.complex64) a_32 = a.astype(np.complex64)
b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a, b)) err = update_err(err, "cmax", _l2error(a, b, axes))
b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b)) err = update_err(err, "cmax", _l2error(a.real, b, axes))
b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b)) err = update_err(err, "cmax", _l2error(a.real, b, axes))
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads) lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b)) err = update_err(err, "rmax", _l2error(a.real, b, axes))
b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads), b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b)) err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes))
b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads), b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads) axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b)) err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes))
b = pypocketfft.separable_hartley( b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads), pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b)) err = update_err(err, "hmax", _l2error(a.real, b, axes))
b = pypocketfft.genuine_hartley( b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads), pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b)) err = update_err(err, "hmax", _l2error(a.real, b, axes))
b = pypocketfft.separable_hartley( b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley( pypocketfft.separable_hartley(
a.real.astype(np.float32), axes=axes, nthreads=nthreads), a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b)) err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
b = pypocketfft.genuine_hartley( b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes, pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes,
nthreads=nthreads), nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b)) err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
if all(a.shape[i] > 1 for i in axes): if all(a.shape[i] > 1 for i in axes):
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1max", _l2error(a.real, b)) err = update_err(err, "c1max", _l2error(a.real, b, axes))
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1maxf", _l2error(a_32.real, b)) err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23max", _l2error(a.real, b)) err = update_err(err, "c23max", _l2error(a.real, b, axes))
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23maxf", _l2error(a_32.real, b)) err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4max", _l2error(a.real, b)) err = update_err(err, "c4max", _l2error(a.real, b, axes))
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4maxf", _l2error(a_32.real, b)) err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1maxf", _l2error(a_32.real, b))