Commit 8796c200 authored by Martin Reinecke
Browse files

experimental prefetching code; disabled for now

parent 1aed03d6
...@@ -579,21 +579,44 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp ...@@ -579,21 +579,44 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
return aligned_array<T>(tmpsize); return aligned_array<T>(tmpsize);
} }
//#define MRFFT_PREFETCH
// Gathers complex samples from `src` into the SIMD work buffer `dst`.
// For every position along the input axis (0 .. it.length_in()-1) and every
// SIMD lane (0 .. vlen-1), the element at offset it.iofs(lane,pos) is split
// into dst[pos].r[lane] and dst[pos].i[lane].
// When MRFFT_PREFETCH is defined, all but the last `lookahead` positions also
// issue a prefetch for the element `lookahead` positions ahead; the remaining
// positions are handled by the plain loop at the end.
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
  const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
  {
  const size_t len = it.length_in();
  size_t pos = 0;
#ifdef MRFFT_PREFETCH
  constexpr size_t lookahead = 16;
  while (pos+lookahead < len)
    {
    for (size_t lane=0; lane<vlen; ++lane)
      {
      __builtin_prefetch(&src[it.iofs(lane,pos+lookahead)]);
      const auto &elem = src[it.iofs(lane,pos)];
      dst[pos].r[lane] = elem.r;
      dst[pos].i[lane] = elem.i;
      }
    ++pos;
    }
#endif
  // Tail (or the whole range when prefetching is compiled out).
  while (pos < len)
    {
    for (size_t lane=0; lane<vlen; ++lane)
      {
      const auto elem = src[it.iofs(lane,pos)];
      dst[pos].r[lane] = elem.r;
      dst[pos].i[lane] = elem.i;
      }
    ++pos;
    }
  }
// Gathers real-valued samples from `src` into the SIMD work buffer `dst`:
// for every position i in [0, it.length_in()) and every lane j in [0, vlen),
// dst[i][j] receives src[it.iofs(j,i)].
// When MRFFT_PREFETCH is defined, the first loop additionally prefetches the
// element `dist` positions ahead; it stops `dist` positions before the end so
// that the prefetch index stays below it.length_in(), and the second loop
// finishes the remaining positions without prefetching.
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
  const fmav<T> &src, native_simd<T> *MRUTIL_RESTRICT dst)
  {
  size_t i=0;
#ifdef MRFFT_PREFETCH
  constexpr size_t dist=16;
  for (; i+dist<it.length_in(); ++i)
    for (size_t j=0; j<vlen; ++j)
      {
      // src is addressed through the input offsets (iofs), so the prefetch
      // must use iofs as well; using the output offsets (oofs) here would
      // warm up unrelated addresses.
      __builtin_prefetch(&src[it.iofs(j,i+dist)]);
      dst[i][j] = src[it.iofs(j,i)];
      }
#endif
  for (; i<it.length_in(); ++i)
    for (size_t j=0; j<vlen; ++j)
      dst[i][j] = src[it.iofs(j,i)];
  }
...@@ -610,7 +633,17 @@ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it, ...@@ -610,7 +633,17 @@ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
const Cmplx<native_simd<T>> *MRUTIL_RESTRICT src, fmav<Cmplx<T>> &dst) const Cmplx<native_simd<T>> *MRUTIL_RESTRICT src, fmav<Cmplx<T>> &dst)
{ {
auto ptr=dst.vdata(); auto ptr=dst.vdata();
for (size_t i=0; i<it.length_out(); ++i) size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
__builtin_prefetch(&ptr[it.oofs(j,i+dist)],1,3);
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
}
#endif
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]); ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
} }
...@@ -619,7 +652,17 @@ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it, ...@@ -619,7 +652,17 @@ template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst) const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst)
{ {
auto ptr=dst.vdata(); auto ptr=dst.vdata();
for (size_t i=0; i<it.length_out(); ++i) size_t i=0;
#ifdef MRFFT_PREFETCH
constexpr size_t dist=16;
for (; i+dist<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j)
{
__builtin_prefetch(&ptr[it.oofs(j,i+dist)],1,3);
ptr[it.oofs(j,i)] = src[i][j];
}
#endif
for (; i<it.length_out(); ++i)
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i)] = src[i][j]; ptr[it.oofs(j,i)] = src[i][j];
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment