Commit 7e4b3513 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

smaller memory consumption during twiddle generation

parent 7023b907
......@@ -72,14 +72,58 @@ template<typename T> struct arr
size_t size() const { return sz; }
};
template<typename T> struct cmplx {
T r, i;
cmplx() {}
cmplx(T r_, T i_) : r(r_), i(i_) {}
void Set(T r_, T i_) { r=r_; i=i_; }
void Set(T r_) { r=r_; i=T(0); }
cmplx &operator+= (const cmplx &other)
{ r+=other.r; i+=other.i; return *this; }
template<typename T2>cmplx &operator*= (T2 other)
{ r*=other; i*=other; return *this; }
cmplx operator+ (const cmplx &other) const
{ return cmplx(r+other.r, i+other.i); }
cmplx operator- (const cmplx &other) const
{ return cmplx(r-other.r, i-other.i); }
template<typename T2> auto operator* (const T2 &other) const
-> cmplx<decltype(r*other)>
{
return {r*other, i*other};
}
template<typename T2> auto operator* (const cmplx<T2> &other) const
-> cmplx<decltype(r+other.r)>
{
return {r*other.r-i*other.i, r*other.i + i*other.r};
}
template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const
-> cmplx<decltype(r+other.r)>
{
if (bwd)
return {r*other.r-i*other.i, r*other.i + i*other.r};
else
return {r*other.r+i*other.i, i*other.r - r*other.i};
}
};
template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
const cmplx<T> &c, const cmplx<T> &d)
{ a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
{ return {a.r, -a.i}; }
template<typename T> void ROT90(cmplx<T> &a)
{ auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
template<typename T> void ROTM90(cmplx<T> &a)
{ auto tmp_=-a.r; a.r=a.i; a.i=tmp_; }
//
// twiddle factor section
//
class sincos_2pibyn
template<typename T> class sincos_2pibyn
{
private:
arr<double> data;
arr<T> data;
// adapted from https://stackoverflow.com/questions/42792939/
// CAUTION: this function only works for arguments in the range
......@@ -110,15 +154,20 @@ class sincos_2pibyn
res[1] = s;
}
NOINLINE void calc_first_octant(size_t den, double * restrict res)
NOINLINE void calc_first_octant(size_t den, T * restrict res)
{
size_t n = (den+4)>>3;
if (n==0) return;
res[0]=1.; res[1]=0.;
if (n==1) return;
size_t l1=(size_t)sqrt(n);
arr<double> tmp(2*l1);
for (size_t i=1; i<l1; ++i)
my_sincosm1pi((2.*i)/den,&res[2*i]);
{
my_sincosm1pi((2.*i)/den,&tmp[2*i]);
res[2*i ] = tmp[2*i]+1.;
res[2*i+1] = tmp[2*i+1];
}
size_t start=l1;
while(start<n)
{
......@@ -130,19 +179,17 @@ class sincos_2pibyn
if (start+end>n) end = n-start;
for (size_t i=1; i<end; ++i)
{
double csx[2]={res[2*i], res[2*i+1]};
double csx[2]={tmp[2*i], tmp[2*i+1]};
res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
}
start += l1;
}
for (size_t i=1; i<l1; ++i)
res[2*i] += 1.;
}
NOINLINE void calc_first_quadrant(size_t n, double * restrict res)
NOINLINE void calc_first_quadrant(size_t n, T * restrict res)
{
double * restrict p = res+n;
T * restrict p = res+n;
calc_first_octant(n<<1, p);
size_t ndone=(n+2)>>2;
size_t i=0, idx1=0, idx2=2*ndone-2;
......@@ -160,10 +207,10 @@ class sincos_2pibyn
}
}
NOINLINE void calc_first_half(size_t n, double * restrict res)
NOINLINE void calc_first_half(size_t n, T * restrict res)
{
int ndone=(n+1)>>1;
double * p = res+n-1;
T * p = res+n-1;
calc_first_octant(n<<2, p);
int i4=0, in=n, i=0;
for (; i4<=in-i4; ++i, i4+=4) // octant 0
......@@ -187,7 +234,7 @@ class sincos_2pibyn
}
}
NOINLINE void fill_first_quadrant(size_t n, double * restrict res)
NOINLINE void fill_first_quadrant(size_t n, T * restrict res)
{
const double hsqt2 = 0.707106781186547524400844362104849;
size_t quart = n>>2;
......@@ -200,7 +247,7 @@ class sincos_2pibyn
}
}
NOINLINE void fill_first_half(size_t n, double * restrict res)
NOINLINE void fill_first_half(size_t n, T * restrict res)
{
size_t half = n>>1;
if ((n&3)==0)
......@@ -217,7 +264,7 @@ class sincos_2pibyn
}
}
NOINLINE void fill_second_half(size_t n, double * restrict res)
NOINLINE void fill_second_half(size_t n, T * restrict res)
{
if ((n&1)==0)
for (size_t i=0; i<n; ++i)
......@@ -230,7 +277,7 @@ class sincos_2pibyn
}
}
NOINLINE void sincos_2pibyn_half(size_t n, double * restrict res)
NOINLINE void sincos_2pibyn_half(size_t n, T * restrict res)
{
if ((n&3)==0)
{
......@@ -255,7 +302,7 @@ class sincos_2pibyn
if (!half) fill_second_half(n, data.data());
}
double operator[](size_t idx) const { return data[idx]; }
T operator[](size_t idx) const { return data[idx]; }
};
NOINLINE size_t largest_prime_factor (size_t n)
......@@ -313,50 +360,6 @@ NOINLINE size_t good_size(size_t n)
return bestfac;
}
template<typename T> struct cmplx {
T r, i;
cmplx() {}
cmplx(T r_, T i_) : r(r_), i(i_) {}
void Set(T r_, T i_) { r=r_; i=i_; }
void Set(T r_) { r=r_; i=T(0); }
cmplx &operator+= (const cmplx &other)
{ r+=other.r; i+=other.i; return *this; }
template<typename T2>cmplx &operator*= (T2 other)
{ r*=other; i*=other; return *this; }
cmplx operator+ (const cmplx &other) const
{ return cmplx(r+other.r, i+other.i); }
cmplx operator- (const cmplx &other) const
{ return cmplx(r-other.r, i-other.i); }
template<typename T2> auto operator* (const T2 &other) const
-> cmplx<decltype(r*other)>
{
return {r*other, i*other};
}
template<typename T2> auto operator* (const cmplx<T2> &other) const
-> cmplx<decltype(r+other.r)>
{
return {r*other.r-i*other.i, r*other.i + i*other.r};
}
template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const
-> cmplx<decltype(r+other.r)>
{
if (bwd)
return {r*other.r-i*other.i, r*other.i + i*other.r};
else
return {r*other.r+i*other.i, i*other.r - r*other.i};
}
};
template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
const cmplx<T> &c, const cmplx<T> &d)
{ a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
{ return {a.r, -a.i}; }
template<typename T> void ROT90(cmplx<T> &a)
{ auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
template<typename T> void ROTM90(cmplx<T> &a)
{ auto tmp_=-a.r; a.r=a.i; a.i=tmp_; }
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
......@@ -910,7 +913,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
NOINLINE void comp_twiddle()
{
sincos_2pibyn twid(length, false);
sincos_2pibyn<T0> twid(length, false);
size_t l1=1;
size_t memofs=0;
for (size_t k=0; k<nfct; ++k)
......@@ -1705,7 +1708,7 @@ size_t twsize() const
NOINLINE void comp_twiddle()
{
sincos_2pibyn twid(length, true);
sincos_2pibyn<T0> twid(length, true);
size_t l1=1;
T0 *ptr=mem.data();
for (size_t k=0; k<nfct; ++k)
......@@ -1807,7 +1810,7 @@ template<typename T0> class fftblue
bk(mem.data()), bkf(mem.data()+2*n)
{
/* initialize b_k */
sincos_2pibyn tmp(2*n, false);
sincos_2pibyn<T0> tmp(2*n, false);
bk[0] = 1;
bk[1] = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment