Commit 2779373f authored by Martin Reinecke's avatar Martin Reinecke
Browse files

cleanup

parent 9ca4bbb9
Pipeline #97023 passed with stages
in 17 minutes and 49 seconds
0.10.0:
- general:
- HTML documentation generation using Sphinx
- SIMD datatypes are now much more compatible with C++ upcoming SIMD types.
The code can be compiled with the types from <experimental/simd> if
available, with very small manual changes.
- reshuffling and renaming of files
- fft:
......
......@@ -96,9 +96,8 @@ template<typename T> class PointingProvider
omega[ii] = rangle[idx[ii]];
xsin[ii] = rxsin[idx[ii]];
}
auto mysin = [](double arg) { return sin(arg); };
w1 = apply((1.-frac)*omega, mysin)*xsin;
w2 = apply((frac*omega),mysin)*xsin;
w1 = sin((1.-frac)*omega)*xsin;
w2 = sin(frac*omega)*xsin;
for (size_t ii=0; ii<vlen; ++ii)
{
if (rotflip[idx[ii]]) w1[ii]=-w1[ii];
......
......@@ -22,7 +22,7 @@
#ifndef DUCC0_SIMD_H
#define DUCC0_SIMD_H
#if 0 //__has_include(<experimental/simd>)
#if __has_include(<experimental/simd>)
#include <cstdint>
#include <cstdlib>
#include <cmath>
......@@ -30,36 +30,39 @@
#include <experimental/simd>
namespace ducc0 {
namespace detail_simd {
namespace stdx=std::experimental;
using stdx::native_simd;
template<typename T, int len> using simd = stdx::simd<T, stdx::simd_abi::deduce_t<T, len>>;
using stdx::reduce;
using stdx::any_of;
using stdx::none_of;
using stdx::all_of;
template<typename T, int len> struct simd_select
{ using type = stdx::simd<T, stdx::simd_abi::deduce_t<T, len>>; };
using stdx::element_aligned_tag;
template<typename T> constexpr inline bool vectorizable = native_simd<T>::size()>1;
template<typename T, int N>
constexpr inline bool simd_exists = (N>1) && vectorizable<T> && (!std::is_same_v<stdx::simd<T, stdx::simd_abi::deduce_t<T, N>>, stdx::fixed_size_simd<T, N>>);
using std::abs;
using std::sqrt;
using std::max;
template<typename Func, typename T, int vlen> simd<T, vlen> apply(simd<T, vlen> in, Func func)
{
simd<T, vlen> res;
for (size_t i=0; i<in.size(); ++i)
res[i] = func(in[i]);
return res;
}
template<typename Func, typename T> native_simd<T> apply(native_simd<T> in, Func func)
template<typename Func, typename T, typename Abi> inline stdx::simd<T, Abi> apply(stdx::simd<T, Abi> in, Func func)
{
native_simd<T> res;
stdx::simd<T, Abi> res;
for (size_t i=0; i<in.size(); ++i)
res[i] = func(in[i]);
return res;
}
template<typename T, typename Abi> inline stdx::simd<T,Abi> sin(stdx::simd<T,Abi> in)
{ return apply(in,[](T v){return sin(v);}); }
template<typename T, typename Abi> inline stdx::simd<T,Abi> cos(stdx::simd<T,Abi> in)
{ return apply(in,[](T v){return cos(v);}); }
}
using detail_simd::element_aligned_tag;
using detail_simd::native_simd;
using detail_simd::simd_select;
using detail_simd::simd_exists;
using detail_simd::vectorizable;
}
......@@ -509,31 +512,21 @@ template<typename T> using native_simd = vtp<T,1>;
#else // DUCC0_NO_SIMD is defined
template<typename T> using native_simd = vtp<T,1>;
#endif
template<typename T, int len> struct simd_select
{ using type = vtp<T, len>; };
template<typename T, size_t len> inline vtp<T,len> sin(vtp<T,len> in)
{ return apply(in,[](T v){return std::sin(v);}); }
template<typename T, size_t len> inline vtp<T,len> cos(vtp<T,len> in)
{ return apply(in,[](T v){return std::cos(v);}); }
}
using detail_simd::element_aligned_tag;
using detail_simd::native_simd;
template<typename T, size_t len> using simd = detail_simd::vtp<T, len>;
using detail_simd::simd_select;
using detail_simd::simd_exists;
using detail_simd::reduce;
using detail_simd::apply;
using detail_simd::max;
using detail_simd::abs;
using detail_simd::sqrt;
using detail_simd::any_of;
using detail_simd::none_of;
using detail_simd::all_of;
using detail_simd::vectorizable;
// since we are explicitly introducing a few names that are also available in
// std::, we need to import them from std::as well, otherwise name resolution
// can fail in certain circumstances.
using std::abs;
using std::sqrt;
using std::max;
}
#endif
#endif
......@@ -771,9 +771,10 @@ template<typename T, size_t vlen> DUCC0_NOINLINE void copy_output(const multi_it
ptr[it.oofs(i)] = src[i];
}
template <typename T, size_t vlen> struct add_vec { using type = simd<T, vlen>; };
template <typename T, size_t vlen> struct add_vec
{ using type = typename simd_select<T, vlen>::type; };
template <typename T, size_t vlen> struct add_vec<Cmplx<T>, vlen>
{ using type = Cmplx<simd<T, vlen>>; };
{ using type = Cmplx<typename simd_select<T, vlen>::type>; };
template <typename T, size_t vlen> using add_vec_t = typename add_vec<T, vlen>::type;
template<typename Tplan, typename T, typename T0, typename Exec>
......@@ -981,7 +982,7 @@ template<typename T> DUCC0_NOINLINE void general_r2c(
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<simd<T,vlen> *>(storage.data());
auto tdatav = reinterpret_cast<native_simd<T> *>(storage.data());
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
auto vout = out.vdata();
......@@ -1005,7 +1006,7 @@ template<typename T> DUCC0_NOINLINE void general_r2c(
if (it.remaining()>=vlen/2)
{
it.advance(vlen/2);
auto tdatav = reinterpret_cast<simd<T,vlen/2> *>(storage.data());
auto tdatav = reinterpret_cast<typename simd_select<T,vlen/2>::type *>(storage.data());
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
auto vout = out.vdata();
......@@ -1029,7 +1030,7 @@ template<typename T> DUCC0_NOINLINE void general_r2c(
if (it.remaining()>=vlen/4)
{
it.advance(vlen/4);
auto tdatav = reinterpret_cast<simd<T,vlen/4> *>(storage.data());
auto tdatav = reinterpret_cast<typename simd_select<T,vlen/4>::type *>(storage.data());
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
auto vout = out.vdata();
......@@ -1117,7 +1118,7 @@ template<typename T> DUCC0_NOINLINE void general_c2r(
if (it.remaining()>=vlen/2)
{
it.advance(vlen/2);
auto tdatav = reinterpret_cast<simd<T,vlen/2> *>(storage.data());
auto tdatav = reinterpret_cast<typename simd_select<T,vlen/2>::type *>(storage.data());
for (size_t j=0; j<vlen/2; ++j)
tdatav[0][j]=in.craw(it.iofs(j,0)).r;
{
......@@ -1148,7 +1149,7 @@ template<typename T> DUCC0_NOINLINE void general_c2r(
if (it.remaining()>=vlen/4)
{
it.advance(vlen/4);
auto tdatav = reinterpret_cast<simd<T,vlen/4> *>(storage.data());
auto tdatav = reinterpret_cast<typename simd_select<T,vlen/4>::type *>(storage.data());
for (size_t j=0; j<vlen/4; ++j)
tdatav[0][j]=in.craw(it.iofs(j,0)).r;
{
......
......@@ -217,35 +217,41 @@ template <typename Tfs> class cfftpass
: exec_<false>(in1, copy1, buf1); \
} \
if constexpr (simd_exists<Tfs,8>) \
if (hcin==typeid(Cmplx<simd<Tfs,8>> *).hash_code()) \
{ \
using Tcv = Cmplx<typename simd_select<Tfs,8>::type>; \
if (hcin==typeid(Tcv *).hash_code()) \
{ \
using Tcv = Cmplx<simd<Tfs,8>>; \
auto in1 = any_cast<Tcv *>(in); \
auto copy1 = any_cast<Tcv *>(copy); \
auto buf1 = any_cast<Tcv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
if constexpr (simd_exists<Tfs,4>) \
if (hcin==typeid(Cmplx<simd<Tfs,4>> *).hash_code()) \
{ \
using Tcv = Cmplx<typename simd_select<Tfs,4>::type>; \
if (hcin==typeid(Tcv *).hash_code()) \
{ \
using Tcv = Cmplx<simd<Tfs,4>>; \
auto in1 = any_cast<Tcv *>(in); \
auto copy1 = any_cast<Tcv *>(copy); \
auto buf1 = any_cast<Tcv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
if constexpr (simd_exists<Tfs,2>) \
if (hcin==typeid(Cmplx<simd<Tfs,2>> *).hash_code()) \
{ \
using Tcv = Cmplx<typename simd_select<Tfs,2>::type>; \
if (hcin==typeid(Tcv *).hash_code()) \
{ \
using Tcv = Cmplx<simd<Tfs,2>>; \
auto in1 = any_cast<Tcv *>(in); \
auto copy1 = any_cast<Tcv *>(copy); \
auto buf1 = any_cast<Tcv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
MR_fail("impossible vector length requested"); \
}
......@@ -1457,7 +1463,7 @@ template <size_t vlen, typename Tfs> class cfftp_vecpass: public cfftpass<Tfs>
private:
static_assert(simd_exists<Tfs, vlen>, "bad vlen");
using typename cfftpass<Tfs>::Tcs;
using Tfv=simd<Tfs, vlen>;
using Tfv=typename simd_select<Tfs, vlen>::type;
using Tcv=Cmplx<Tfv>;
size_t ip;
......@@ -1661,35 +1667,41 @@ template <typename Tfs> class rfftpass
: exec_<false>(in1, copy1, buf1); \
} \
if constexpr (simd_exists<Tfs,8>) \
if (hcin==typeid(simd<Tfs,8> *).hash_code()) \
{ \
using Tfv = typename simd_select<Tfs,8>::type; \
if (hcin==typeid(Tfv *).hash_code()) \
{ \
using Tfv = simd<Tfs,8>; \
auto in1 = any_cast<Tfv *>(in); \
auto copy1 = any_cast<Tfv *>(copy); \
auto buf1 = any_cast<Tfv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
if constexpr (simd_exists<Tfs,4>) \
if (hcin==typeid(simd<Tfs,4> *).hash_code()) \
{ \
using Tfv = typename simd_select<Tfs,4>::type; \
if (hcin==typeid(Tfv *).hash_code()) \
{ \
using Tfv = simd<Tfs,4>; \
auto in1 = any_cast<Tfv *>(in); \
auto copy1 = any_cast<Tfv *>(copy); \
auto buf1 = any_cast<Tfv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
if constexpr (simd_exists<Tfs,2>) \
if (hcin==typeid(simd<Tfs,2> *).hash_code()) \
{ \
using Tfv = typename simd_select<Tfs,2>::type; \
if (hcin==typeid(Tfv *).hash_code()) \
{ \
using Tfv = simd<Tfs,2>; \
auto in1 = any_cast<Tfv *>(in); \
auto copy1 = any_cast<Tfv *>(copy); \
auto buf1 = any_cast<Tfv *>(buf); \
return fwd ? exec_<true>(in1, copy1, buf1) \
: exec_<false>(in1, copy1, buf1); \
} \
} \
MR_fail("impossible vector length requested"); \
}
......
......@@ -371,7 +371,7 @@ struct ft_partial_sph_isometry_plan
j = eval_helper<native_simd<double>,2>(j, x, y);
j = eval_helper<native_simd<double>,1>(j, x, y);
}
eval_helper<simd<double,1>,1>(j, x, y);
eval_helper<typename simd_select<double,1>::type,1>(j, x, y);
}
};
......
......@@ -152,7 +152,7 @@ template<typename T> class ConvolverPlan
{
protected:
constexpr static auto vlen = min<size_t>(8, native_simd<T>::size());
using Tsimd = simd<T, vlen>;
using Tsimd = typename simd_select<T, vlen>::type;
size_t nthreads;
size_t lmax, kmax;
......
......@@ -53,7 +53,7 @@ using namespace std;
template<typename T> constexpr inline int mysimdlen
= min<int>(8, native_simd<T>::size());
template<typename T> using mysimd = simd<T,mysimdlen<T>>;
template<typename T> using mysimd = typename simd_select<T,mysimdlen<T>>::type;
template<typename T> T sqr(T val) { return val*val; }
......@@ -94,8 +94,8 @@ template<typename T, typename F> [[gnu::hot]] void expi(vector<complex<T>> &res,
for (; i+vlen-1<n; i+=vlen)
{
auto vang = Tsimd(&buf[i],element_aligned_tag());
auto vcos = apply(vang, [](T arg) { return cos(arg); });
auto vsin = apply(vang, [](T arg) { return sin(arg); });
auto vcos = cos(vang);
auto vsin = sin(vang);
for (size_t ii=0; ii<vlen; ++ii)
res[i+ii] = complex<T>(vcos[ii], vsin[ii]);
}
......@@ -109,7 +109,7 @@ template<typename T> complex<T> hsum_cmplx(mysimd<T> vr, mysimd<T> vi)
#if (!defined(DUCC0_NO_SIMD))
#if (defined(__AVX__))
#if 1
inline complex<float> hsum_cmplx(mysimd<float> vr, mysimd<float> vi)
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
{
auto t1 = _mm256_hadd_ps(__m256(vr), __m256(vi));
auto t2 = _mm_hadd_ps(_mm256_extractf128_ps(t1, 0), _mm256_extractf128_ps(t1, 1));
......@@ -118,7 +118,7 @@ inline complex<float> hsum_cmplx(mysimd<float> vr, mysimd<float> vi)
}
#else
// this version may be slightly faster, but this needs more benchmarking
inline complex<float> hsum_cmplx(native_simd<float> vr, native_simd<float> vi)
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
{
auto t1 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(0,2,0,2));
auto t2 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(1,3,1,3));
......@@ -130,7 +130,7 @@ inline complex<float> hsum_cmplx(native_simd<float> vr, native_simd<float> vi)
}
#endif
#elif defined(__SSE3__)
inline complex<float> hsum_cmplx(mysimd<float> vr, mysimd<float> vi)
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
{
auto t1 = _mm_hadd_ps(__m128(vr), __m128(vi));
t1 += _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1));
......@@ -1097,7 +1097,7 @@ auto ix = ix_+ranges.size()/2; if (ix>=ranges.size()) ix -=ranges.size();
}
}
ri *= imflip;
auto r = hsum_cmplx(rr,ri);
auto r = hsum_cmplx<Tcalc>(rr,ri);
ms_out.v(row, ch) += r;
if (lastplane)
ms_out.v(row, ch) *= shifting ?
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment