Commit 343ff57e authored by Peter Bell's avatar Peter Bell

Isolate copying into its own function

parent cb72d602
......@@ -2844,6 +2844,57 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
#define POCKETFFT_NTHREADS
#endif
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
{
for (size_t i=0; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
{
dst[i].r[j] = src[it.iofs(j,i)].r;
dst[i].i[j] = src[it.iofs(j,i)].i;
}
}
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst)
{
for (size_t i=0; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[i][j] = src[it.iofs(j,i)];
}
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
{
if (dst == &src[it.iofs(0)]) return; // in-place
for (size_t i=0; i<it.length_in(); ++i)
dst[i] = src[it.iofs(i)];
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst)
{
for (size_t i=0; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
{
for (size_t i=0; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
dst[it.oofs(j,i)] = src[i][j];
}
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
{
if (src == &dst[it.oofs(0)]) return; // in-place
for (size_t i=0; i<it.length_out(); ++i)
dst[it.oofs(i)] = src[i];
}
template<typename T> POCKETFFT_NOINLINE void general_c(
const cndarr<cmplx<T>> &in, ndarr<cmplx<T>> &out,
const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS)
......@@ -2870,16 +2921,9 @@ template<typename T> POCKETFFT_NOINLINE void general_c(
{
it.advance(vlen);
auto tdatav = reinterpret_cast<cmplx<vtype_t<T>> *>(storage.data());
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
{
tdatav[i].r[j] = tin[it.iofs(j,i)].r;
tdatav[i].i[j] = tin[it.iofs(j,i)].i;
}
copy_input(it, tin, tdatav);
plan->exec (tdatav, fct, forward);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,i)].Set(tdatav[i].r[j],tdatav[i].i[j]);
copy_output(it, tdatav, out);
}
#endif
while (it.remaining()>0)
......@@ -2888,19 +2932,46 @@ template<typename T> POCKETFFT_NOINLINE void general_c(
auto buf = it.stride_out() == sizeof(cmplx<T>) ?
&out[it.oofs(0)] : reinterpret_cast<cmplx<T> *>(storage.data());
if (buf != &tin[it.iofs(0)])
for (size_t i=0; i<len; ++i)
buf[i] = tin[it.iofs(i)];
copy_input(it, tin, buf);
plan->exec (buf, fct, forward);
if (buf != &out[it.oofs(0)])
for (size_t i=0; i<len; ++i)
out[it.oofs(i)] = buf[i];
copy_output(it, buf, out);
}
} // end of parallel region
fct = T(1); // factor has been applied, use 1 for remaining axes
}
}
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
{
for (size_t j=0; j<vlen; ++j)
dst[it.oofs(j,0)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
dst[it.oofs(j,i1)] = src[i][j];
}
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
{
dst[it.oofs(0)] = src[0];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
{
dst[it.oofs(i1)] = src[i]+src[i+1];
dst[it.oofs(i2)] = src[i]-src[i+1];
}
if (i<it.length_out())
dst[it.oofs(i1)] = src[i];
}
template<typename T> POCKETFFT_NOINLINE void general_hartley(
const cndarr<T> &in, ndarr<T> &out, const shape_t &axes, T fct,
size_t POCKETFFT_NTHREADS)
......@@ -2927,41 +2998,18 @@ template<typename T> POCKETFFT_NOINLINE void general_hartley(
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = tin[it.iofs(j,i)];
copy_input(it, tin, tdatav);
plan->exec(tdatav, fct, true);
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,0)] = tdatav[0][j];
size_t i=1, i1=1, i2=len-1;
for (i=1; i<len-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
out[it.oofs(j,i1)] = tdatav[i][j]+tdatav[i+1][j];
out[it.oofs(j,i2)] = tdatav[i][j]-tdatav[i+1][j];
}
if (i<len)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,i1)] = tdatav[i][j];
copy_hartley(it, tdatav, out);
}
#endif
while (it.remaining()>0)
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
for (size_t i=0; i<len; ++i)
tdata[i] = tin[it.iofs(i)];
copy_input(it, tin, tdata);
plan->exec(tdata, fct, true);
// Hartley order
out[it.oofs(0)] = tdata[0];
size_t i=1, i1=1, i2=len-1;
for (i=1; i<len-1; i+=2, ++i1, --i2)
{
out[it.oofs(i1)] = tdata[i]+tdata[i+1];
out[it.oofs(i2)] = tdata[i]-tdata[i+1];
}
if (i<len)
out[it.oofs(i1)] = tdata[i];
copy_hartley(it, tdata, out);
}
} // end of parallel region
fct = T(1); // factor has been applied, use 1 for remaining axes
......@@ -2994,13 +3042,9 @@ template<typename Trafo, typename T> POCKETFFT_NOINLINE void general_dcst(
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = tin[it.iofs(j,i)];
copy_input(it, tin, tdatav);
plan->exec(tdatav, fct, ortho, type, cosine);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,i)] = tdatav[i][j];
copy_output(it, tdatav, out);
}
#endif
while (it.remaining()>0)
......@@ -3009,13 +3053,9 @@ template<typename Trafo, typename T> POCKETFFT_NOINLINE void general_dcst(
auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)]
: reinterpret_cast<T *>(storage.data());
if (buf != &tin[it.iofs(0)])
for (size_t i=0; i<len; ++i)
buf[i] = tin[it.iofs(i)];
copy_input(it, tin, buf);
plan->exec(buf, fct, ortho, type, cosine);
if (buf != &out[it.oofs(0)])
for (size_t i=0; i<len; ++i)
out[it.oofs(i)] = buf[i];
copy_output(it, buf, out);
}
} // end of parallel region
fct = T(1); // factor has been applied, use 1 for remaining axes
......@@ -3041,9 +3081,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r2c(
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = in[it.iofs(j,i)];
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,0)].Set(tdatav[0][j]);
......@@ -3065,8 +3103,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r2c(
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
for (size_t i=0; i<len; ++i)
tdata[i] = in[it.iofs(i)];
copy_input(it, in, tdata);
plan->exec(tdata, fct, true);
out[it.oofs(0)].Set(tdata[0]);
size_t i=1, ii=1;
......@@ -3117,9 +3154,7 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
tdatav[i][j] = in[it.iofs(j,ii)].r;
}
plan->exec(tdatav, fct, false);
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,i)] = tdatav[i][j];
copy_output(it, tdatav, out);
}
#endif
while (it.remaining()>0)
......@@ -3139,8 +3174,7 @@ template<typename T> POCKETFFT_NOINLINE void general_c2r(
tdata[i] = in[it.iofs(ii)].r;
}
plan->exec(tdata, fct, false);
for (size_t i=0; i<len; ++i)
out[it.oofs(i)] = tdata[i];
copy_output(it, tdata, out);
}
} // end of parallel region
}
......@@ -3171,9 +3205,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r(
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = tin[it.iofs(j,i)];
copy_input(it, tin, tdatav);
if ((!r2c) && forward)
for (size_t i=2; i<len; i+=2)
tdatav[i] = -tdatav[i];
......@@ -3181,9 +3213,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r(
if (r2c && (!forward))
for (size_t i=2; i<len; i+=2)
tdatav[i] = -tdatav[i];
for (size_t i=0; i<len; ++i)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,i)] = tdatav[i][j];
copy_output(it, tdatav, out);
}
#endif
while (it.remaining()>0)
......@@ -3192,9 +3222,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r(
auto buf = it.stride_out() == sizeof(T) ?
&out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
if (buf != &tin[it.iofs(0)])
for (size_t i=0; i<len; ++i)
buf[i] = tin[it.iofs(i)];
copy_input(it, tin, buf);
if ((!r2c) && forward)
for (size_t i=2; i<len; i+=2)
buf[i] = -buf[i];
......@@ -3202,9 +3230,7 @@ template<typename T> POCKETFFT_NOINLINE void general_r(
if (r2c && (!forward))
for (size_t i=2; i<len; i+=2)
buf[i] = -buf[i];
if (buf != &out[it.oofs(0)])
for (size_t i=0; i<len; ++i)
out[it.oofs(i)] = buf[i];
copy_output(it, buf, out);
}
} // end of parallel region
fct = T(1); // factor has been applied, use 1 for remaining axes
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment