Commit 8a6495d1 authored by Martin Reinecke's avatar Martin Reinecke

use high-quality algorithm for odd-sized type IV DCTs (derived from FFTW with...

use high-quality algorithm for odd-sized type IV DCTs (derived from FFTW with friendly permission); update README
parent 9918b72b
pypocketfft pypocketfft
=========== ===========
This package provides Fast Fourier and Hartley transforms with a simple This package provides Fast Fourier, trigonometric and Hartley transforms with a
Python interface. simple Python interface.
The central algorithms are derived from Paul Swarztrauber's FFTPACK code The central algorithms are derived from Paul Swarztrauber's FFTPACK code
(http://www.netlib.org/fftpack). (http://www.netlib.org/fftpack).
...@@ -10,11 +10,11 @@ The central algorithms are derived from Paul Swarztrauber's FFTPACK code ...@@ -10,11 +10,11 @@ The central algorithms are derived from Paul Swarztrauber's FFTPACK code
Features Features
-------- --------
- supports fully complex and half-complex (i.e. complex-to-real and - supports fully complex and half-complex (i.e. complex-to-real and
real-to-complex) FFTs real-to-complex) FFTs, discrete sine/cosine transforms and Hartley transforms
- supports multidimensional arrays and selection of the axes to be transformed. - achieves very high accuracy for all transforms
- supports single and double precision - supports multidimensional arrays and selection of the axes to be transformed
- supports single, double, and long double precision
- makes use of CPU vector instructions when performing 2D and higher-dimensional - makes use of CPU vector instructions when performing 2D and higher-dimensional
transforms transforms
- does not have persistent transform plans, which makes the interface simpler
- supports prime-length transforms without degrading to O(N**2) performance - supports prime-length transforms without degrading to O(N**2) performance
- Has optional OpenMP support for multidimensional transforms - has optional OpenMP support for multidimensional transforms
...@@ -2442,43 +2442,72 @@ template<typename T0> class T_dct4 ...@@ -2442,43 +2442,72 @@ template<typename T0> class T_dct4
// https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
private: private:
size_t N; size_t N;
T_dct2<T0> dct2; unique_ptr<pocketfft_c<T0>> fft;
arr<T0> C; unique_ptr<pocketfft_r<T0>> rfft;
pocketfft_c<T0> fft;
arr<cmplx<T0>> C2; arr<cmplx<T0>> C2;
public: public:
POCKETFFT_NOINLINE T_dct4(size_t length) POCKETFFT_NOINLINE T_dct4(size_t length)
: N(length), dct2((N&1)?N:1), C((N&1)?N:0), : N(length),
fft((N&1)?1:N/2), C2((N&1) ? 0:N/2) fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
C2((N&1) ? 0 : N/2)
{ {
constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); constexpr T0 pi = T0(3.141592653589793238462643383279502884197L);
if (N&1) if ((N&1)==0)
{
for (size_t i=0; i<N; ++i)
C[i] = cos(T0(0.5)*pi*(T0(i)+T0(0.5))/T0(N));
}
else
{
for (size_t i=0; i<N/2; ++i) for (size_t i=0; i<N/2; ++i)
{ {
T0 ang = -pi/T0(N)*(T0(i)+T0(0.125)); T0 ang = -pi/T0(N)*(T0(i)+T0(0.125));
C2[i].Set(cos(ang), sin(ang)); C2[i].Set(cos(ang), sin(ang));
} }
} }
}
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) const template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) const
{ {
// FIXME: due to the recurrence below, the algorithm is not very accurate constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
// Trying to find a better one...
if (N&1) if (N&1)
{ {
for (size_t i=0; i<N; ++i) // The following code is derived from the FFTW3 function apply_re11()
c[i] *= C[i]; // and is released under the 3-clause BSD license with friendly
dct2.exec(c, fct, false); // permission of Matteo Frigo.
for (size_t i=1; i<N; ++i)
c[i] = 2*c[i] - c[i-1]; auto SGN_SET = [](T x, size_t i) {return (i%2) ? -x : x;};
arr<T> y(N);
size_t n2 = N/2;
size_t i;
{
size_t m;
for (i=0, m=n2; m<N; ++i, m+=4)
y[i] = c[m];
for (; m<2*N; ++i, m+=4)
y[i] = -c[2*N-m-1];
for (; m<3*N; ++i, m+=4)
y[i] = -c[m-2*N];
for (; m<4*N; ++i, m+=4)
y[i] = c[4*N-m-1];
m -= 4*N;
for (; i<N; ++i, m+=4)
y[i] = c[m];
}
rfft->forward(y.data(), fct);
for (i=0; i+i+1<n2; ++i)
{
size_t k = i+i+1;
T c1=y[2*k-1], s1=y[2*k], c2=y[2*k+1], s2=y[2*k+2];
c[i] = sqrt2 * (SGN_SET(c1, (i+1)/2) + SGN_SET(s1, i/2));
c[N-(i+1)] = sqrt2 * (SGN_SET(c1, (N-i)/2) - SGN_SET(s1, (N-(i+1))/2));
c[n2-(i+1)] = sqrt2 * (SGN_SET(c2, (n2-i)/2) - SGN_SET(s2, (n2-(i+1))/2));
c[n2+(i+1)] = sqrt2 * (SGN_SET(c2, (n2+i+2)/2) + SGN_SET(s2, (n2+(i+1))/2));
}
if (i+i+1 == n2)
{
T cx=y[2*n2-1], sx=y[2*n2];
c[i] = sqrt2 * (SGN_SET(cx, (i+1)/2) + SGN_SET(sx, i/2));
c[N-(i+1)] = sqrt2 * (SGN_SET(cx, (i+2)/2) + SGN_SET(sx, (i+1)/2));
}
c[n2] = sqrt2 * SGN_SET(y[0], (n2+1)/2);
// FFTW-derived code ends here
} }
else else
{ {
...@@ -2488,7 +2517,7 @@ template<typename T0> class T_dct4 ...@@ -2488,7 +2517,7 @@ template<typename T0> class T_dct4
y[i].Set(c[2*i],c[N-1-2*i]); y[i].Set(c[2*i],c[N-1-2*i]);
y[i] *= C2[i]; y[i] *= C2[i];
} }
fft.forward(y.data(), fct); fft->forward(y.data(), fct);
for(size_t i=0; i<N/2; ++i) for(size_t i=0; i<N/2; ++i)
y[i] *= C2[i]; y[i] *= C2[i];
for(size_t i=0; i<N/2; ++i) for(size_t i=0; i<N/2; ++i)
......
...@@ -204,9 +204,6 @@ def testdcst1D(len, inorm, type, dtype): ...@@ -204,9 +204,6 @@ def testdcst1D(len, inorm, type, dtype):
a = (np.random.rand(len)-0.5).astype(dtype) a = (np.random.rand(len)-0.5).astype(dtype)
eps = tol[dtype] eps = tol[dtype]
itp = (0, 1, 3, 2, 4) itp = (0, 1, 3, 2, 4)
if type==4 and len%2 == 1: # relaxed accuracies for odd-length type 4 transforms
special_tol = {np.float32: 4e-5, np.float64: 6e-14, np.longfloat: 4e-17}
eps = special_tol[dtype]
itype = itp[type] itype = itp[type]
if type != 1 or len > 1: # there are no length-1 type 1 DCTs if type != 1 or len > 1: # there are no length-1 type 1 DCTs
_assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), eps) _assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), eps)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment