Commit 1a1f6477 authored by Martin Reinecke's avatar Martin Reinecke

more tweaks

parent b87f51f5
...@@ -270,7 +270,7 @@ template<typename T> class sincos_2pibyn ...@@ -270,7 +270,7 @@ template<typename T> class sincos_2pibyn
{ {
private: private:
using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
size_t mask, shift; size_t N, mask, shift;
arr<cmplx<Thigh>> v1, v2; arr<cmplx<Thigh>> v1, v2;
static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang) static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
...@@ -309,17 +309,19 @@ template<typename T> class sincos_2pibyn ...@@ -309,17 +309,19 @@ template<typename T> class sincos_2pibyn
public: public:
POCKETFFT_NOINLINE sincos_2pibyn(size_t n) POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
: N(n)
{ {
constexpr auto pi = 3.141592653589793238462643383279502884197L; constexpr auto pi = 3.141592653589793238462643383279502884197L;
Thigh ang = Thigh(0.25L*pi/n); Thigh ang = Thigh(0.25L*pi/n);
size_t nval = (n+2)/2;
shift = 1; shift = 1;
while((size_t(1)<<shift)*(size_t(1)<<shift) < n) ++shift; while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
mask = (size_t(1)<<shift)-1; mask = (size_t(1)<<shift)-1;
v1.resize(mask+1); v1.resize(mask+1);
v1[0].Set(Thigh(1), Thigh(0)); v1[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v1.size(); ++i) for (size_t i=1; i<v1.size(); ++i)
v1[i]=calc(i,n,ang); v1[i]=calc(i,n,ang);
v2.resize((n+mask)/(mask+1)); v2.resize((nval+mask)/(mask+1));
v2[0].Set(Thigh(1), Thigh(0)); v2[0].Set(Thigh(1), Thigh(0));
for (size_t i=1; i<v2.size(); ++i) for (size_t i=1; i<v2.size(); ++i)
v2[i]=calc(i*(mask+1),n,ang); v2[i]=calc(i*(mask+1),n,ang);
...@@ -327,8 +329,14 @@ template<typename T> class sincos_2pibyn ...@@ -327,8 +329,14 @@ template<typename T> class sincos_2pibyn
cmplx<T> operator[](size_t idx) const cmplx<T> operator[](size_t idx) const
{ {
if (2*idx<=N)
{
auto x1=v1[idx&mask], x2=v2[idx>>shift];
return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
}
idx = N-idx;
auto x1=v1[idx&mask], x2=v2[idx>>shift]; auto x1=v1[idx&mask], x2=v2[idx>>shift];
return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
} }
}; };
...@@ -2484,10 +2492,9 @@ template<typename T0> class T_dcst23 ...@@ -2484,10 +2492,9 @@ template<typename T0> class T_dcst23
POCKETFFT_NOINLINE T_dcst23(size_t length) POCKETFFT_NOINLINE T_dcst23(size_t length)
: fftplan(length), twiddle(length) : fftplan(length), twiddle(length)
{ {
constexpr auto pi = T0(3.141592653589793238462643383279502884197L); sincos_2pibyn<T0> tw(4*length);
const auto oo2n = pi*T0(0.5)/T0(length);
for (size_t i=0; i<length; ++i) for (size_t i=0; i<length; ++i)
twiddle[i] = cos(oo2n*T0(i+1)); twiddle[i] = tw[i+1].r;
} }
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
...@@ -2560,14 +2567,12 @@ template<typename T0> class T_dcst4 ...@@ -2560,14 +2567,12 @@ template<typename T0> class T_dcst4
rfft((N&1)? new pocketfft_r<T0>(N) : nullptr), rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
C2((N&1) ? 0 : N/2) C2((N&1) ? 0 : N/2)
{ {
const auto oon = -T0(1.)/T0(N);
if ((N&1)==0) if ((N&1)==0)
{
sincos_2pibyn<T0> tw(8*N);
for (size_t i=0; i<N/2; ++i) for (size_t i=0; i<N/2; ++i)
{ C2[i] = tw[8*N-8*i-1];
constexpr auto pi = T0(3.141592653589793238462643383279502884197L); }
T0 ang = pi*oon*(T0(i)+T0(0.125));
C2[i].Set(cos(ang), sin(ang));
}
} }
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
......
...@@ -61,12 +61,12 @@ def test(err): ...@@ -61,12 +61,12 @@ def test(err):
b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b, axes)) err = update_err(err, "cmax", _l2error(a.real, b, axes))
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b, axes))
b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads), b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes)) err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes))
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b, axes))
b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads), b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads) axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes)) err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes))
...@@ -113,6 +113,10 @@ def test(err): ...@@ -113,6 +113,10 @@ def test(err):
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes))
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1max", _l2error(a.real, b, axes))
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment