Commit 8a6495d1 authored by Martin Reinecke's avatar Martin Reinecke

use high-quality algorithm for odd-sized type IV DCTs (derived from FFTW with...

use high-quality algorithm for odd-sized type IV DCTs (derived from FFTW with friendly permission); update README
parent 9918b72b
pypocketfft
===========
This package provides Fast Fourier and Hartley transforms with a simple
Python interface.
This package provides Fast Fourier, trigonometric and Hartley transforms with a
simple Python interface.
The central algorithms are derived from Paul Swarztrauber's FFTPACK code
(http://www.netlib.org/fftpack).
......@@ -10,11 +10,11 @@ The central algorithms are derived from Paul Swarztrauber's FFTPACK code
Features
--------
- supports fully complex and half-complex (i.e. complex-to-real and
real-to-complex) FFTs
- supports multidimensional arrays and selection of the axes to be transformed.
- supports single and double precision
real-to-complex) FFTs, discrete sine/cosine transforms and Hartley transforms
- achieves very high accuracy for all transforms
- supports multidimensional arrays and selection of the axes to be transformed
- supports single, double, and long double precision
- makes use of CPU vector instructions when performing 2D and higher-dimensional
transforms
- does not have persistent transform plans, which makes the interface simpler
- supports prime-length transforms without degrading to O(N**2) performance
- Has optional OpenMP support for multidimensional transforms
- has optional OpenMP support for multidimensional transforms
......@@ -2442,43 +2442,72 @@ template<typename T0> class T_dct4
// https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
private:
size_t N;
T_dct2<T0> dct2;
arr<T0> C;
pocketfft_c<T0> fft;
unique_ptr<pocketfft_c<T0>> fft;
unique_ptr<pocketfft_r<T0>> rfft;
arr<cmplx<T0>> C2;
public:
POCKETFFT_NOINLINE T_dct4(size_t length)
: N(length), dct2((N&1)?N:1), C((N&1)?N:0),
fft((N&1)?1:N/2), C2((N&1) ? 0:N/2)
: N(length),
fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
C2((N&1) ? 0 : N/2)
{
constexpr T0 pi = T0(3.141592653589793238462643383279502884197L);
if (N&1)
{
for (size_t i=0; i<N; ++i)
C[i] = cos(T0(0.5)*pi*(T0(i)+T0(0.5))/T0(N));
}
else
{
if ((N&1)==0)
for (size_t i=0; i<N/2; ++i)
{
T0 ang = -pi/T0(N)*(T0(i)+T0(0.125));
C2[i].Set(cos(ang), sin(ang));
}
}
}
template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) const
{
// FIXME: due to the recurrence below, the algorithm is not very accurate
// Trying to find a better one...
constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
if (N&1)
{
for (size_t i=0; i<N; ++i)
c[i] *= C[i];
dct2.exec(c, fct, false);
for (size_t i=1; i<N; ++i)
c[i] = 2*c[i] - c[i-1];
// The following code is derived from the FFTW3 function apply_re11()
// and is released under the 3-clause BSD license with friendly
// permission of Matteo Frigo.
auto SGN_SET = [](T x, size_t i) {return (i%2) ? -x : x;};
arr<T> y(N);
size_t n2 = N/2;
size_t i;
{
size_t m;
for (i=0, m=n2; m<N; ++i, m+=4)
y[i] = c[m];
for (; m<2*N; ++i, m+=4)
y[i] = -c[2*N-m-1];
for (; m<3*N; ++i, m+=4)
y[i] = -c[m-2*N];
for (; m<4*N; ++i, m+=4)
y[i] = c[4*N-m-1];
m -= 4*N;
for (; i<N; ++i, m+=4)
y[i] = c[m];
}
rfft->forward(y.data(), fct);
for (i=0; i+i+1<n2; ++i)
{
size_t k = i+i+1;
T c1=y[2*k-1], s1=y[2*k], c2=y[2*k+1], s2=y[2*k+2];
c[i] = sqrt2 * (SGN_SET(c1, (i+1)/2) + SGN_SET(s1, i/2));
c[N-(i+1)] = sqrt2 * (SGN_SET(c1, (N-i)/2) - SGN_SET(s1, (N-(i+1))/2));
c[n2-(i+1)] = sqrt2 * (SGN_SET(c2, (n2-i)/2) - SGN_SET(s2, (n2-(i+1))/2));
c[n2+(i+1)] = sqrt2 * (SGN_SET(c2, (n2+i+2)/2) + SGN_SET(s2, (n2+(i+1))/2));
}
if (i+i+1 == n2)
{
T cx=y[2*n2-1], sx=y[2*n2];
c[i] = sqrt2 * (SGN_SET(cx, (i+1)/2) + SGN_SET(sx, i/2));
c[N-(i+1)] = sqrt2 * (SGN_SET(cx, (i+2)/2) + SGN_SET(sx, (i+1)/2));
}
c[n2] = sqrt2 * SGN_SET(y[0], (n2+1)/2);
// FFTW-derived code ends here
}
else
{
......@@ -2488,7 +2517,7 @@ template<typename T0> class T_dct4
y[i].Set(c[2*i],c[N-1-2*i]);
y[i] *= C2[i];
}
fft.forward(y.data(), fct);
fft->forward(y.data(), fct);
for(size_t i=0; i<N/2; ++i)
y[i] *= C2[i];
for(size_t i=0; i<N/2; ++i)
......
......@@ -204,9 +204,6 @@ def testdcst1D(len, inorm, type, dtype):
a = (np.random.rand(len)-0.5).astype(dtype)
eps = tol[dtype]
itp = (0, 1, 3, 2, 4)
if type==4 and len%2 == 1: # relaxed accuracies for odd-length type 4 transforms
special_tol = {np.float32: 4e-5, np.float64: 6e-14, np.longfloat: 4e-17}
eps = special_tol[dtype]
itype = itp[type]
if type != 1 or len > 1: # there are no length-1 type 1 DCTs
_assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), eps)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment