Commit 656634b4 authored by Martin Reinecke
Browse files

more cleanups

parent edf30f93
Pipeline #77749 passed with stages
in 13 minutes and 2 seconds
...@@ -126,11 +126,16 @@ template<size_t W, size_t D, typename T> class HornerKernel ...@@ -126,11 +126,16 @@ template<size_t W, size_t D, typename T> class HornerKernel
template<typename T> class HornerKernelFlexible template<typename T> class HornerKernelFlexible
{ {
private: private:
static constexpr size_t MAXW=16, MINDEG=5, MAXDEG=12; static constexpr size_t MAXW=16, MINDEG=0, MAXDEG=12;
using Tsimd = native_simd<T>; using Tsimd = native_simd<T>;
static constexpr auto vlen = Tsimd::size(); static constexpr auto vlen = Tsimd::size();
size_t W, D, nvec; size_t W, D, nvec;
vector<T> res;
// Storage cell that overlays one SIMD vector (Tsimd) with a plain array of
// its vlen scalar lanes. The evaluation routines write results through
// `simd` and return a pointer to `scalar[0]` of the first cell.
// NOTE(review): reading `scalar` after writing `simd` relies on the compiler
// supporting union-based type punning -- confirm for all target compilers.
// NOTE(review): presumably callers index past the first cell, which assumes
// sizeof(Tu) == vlen*sizeof(T) with no padding -- TODO confirm.
union Tu {
Tsimd simd;    // vector view, assigned by the evaluation routines
T scalar[vlen];    // per-lane view handed out to callers as `const T *`
};
vector<Tu> res;
vector<Tsimd> coeff; vector<Tsimd> coeff;
const T *(HornerKernelFlexible<T>::* evalfunc) (T); const T *(HornerKernelFlexible<T>::* evalfunc) (T);
...@@ -143,9 +148,9 @@ template<typename T> class HornerKernelFlexible ...@@ -143,9 +148,9 @@ template<typename T> class HornerKernelFlexible
auto tval = coeff[i]; auto tval = coeff[i];
for (size_t j=1; j<=DEG; ++j) for (size_t j=1; j<=DEG; ++j)
tval = tval*x + coeff[j*NV+i]; tval = tval*x + coeff[j*NV+i];
tval.storeu(&res[vlen*i]); res[i].simd = tval;
} }
return res.data(); return &(res[0].scalar[0]);
} }
const T *eval_intern_general(T x) const T *eval_intern_general(T x)
...@@ -156,9 +161,9 @@ template<typename T> class HornerKernelFlexible ...@@ -156,9 +161,9 @@ template<typename T> class HornerKernelFlexible
auto tval = coeff[i]; auto tval = coeff[i];
for (size_t j=1; j<=D; ++j) for (size_t j=1; j<=D; ++j)
tval = tval*x+coeff[j*nvec+i]; tval = tval*x+coeff[j*nvec+i];
tval.storeu(&res[vlen*i]); res[i].simd = tval;
} }
return res.data(); return &(res[0].scalar[0]);
} }
template<size_t NV, size_t DEG> auto evfhelper2() const template<size_t NV, size_t DEG> auto evfhelper2() const
...@@ -177,16 +182,10 @@ template<typename T> class HornerKernelFlexible ...@@ -177,16 +182,10 @@ template<typename T> class HornerKernelFlexible
return evfhelper1<((NV*vlen>MAXW) ? NV : NV+1)>(); return evfhelper1<((NV*vlen>MAXW) ? NV : NV+1)>();
} }
// Returns whatever evfhelper1<1>() yields; the constructor's initialiser
// list stores this value in the `evalfunc` member.
// NOTE(review): presumably evfhelper1 selects a specialised evaluation
// routine for the current W and D -- its full definition is not visible here.
auto get_evalfunc() const
{
return evfhelper1<1>();
}
public: public:
template<typename Func> HornerKernelFlexible(size_t W_, size_t D_, Func func) template<typename Func> HornerKernelFlexible(size_t W_, size_t D_, Func func)
: W(W_), D(D_), nvec((W+vlen-1)/vlen), res(nvec*vlen), : W(W_), D(D_), nvec((W+vlen-1)/vlen), res(nvec),
coeff(nvec*(D+1), 0), evalfunc(get_evalfunc()) coeff(nvec*(D+1), 0), evalfunc(evfhelper1<1>())
{ {
vector<double> chebroot(D+1); vector<double> chebroot(D+1);
for (size_t i=0; i<=D; ++i) for (size_t i=0; i<=D; ++i)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment