Commit 3dd271fc authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'horner_kernel' into 'ducc0'

add code for fast polynomial representation of gridding kernels

See merge request !21
parents 64866dcd 06e33976
Pipeline #77989 passed with stages
in 13 minutes and 1 second
......@@ -41,6 +41,7 @@
#endif
#endif
#include <cstdint>
#include <cstdlib>
#include <cmath>
#include <algorithm>
......@@ -104,6 +105,10 @@ template<typename T, size_t len> class vtp
vtp(const vtp &other) = default;
vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; }
operator Tv() const { return v; }
static vtp loadu(const T *ptr) { return vtp(hlp::loadu(ptr)); }
void storeu(T *ptr) const { hlp::storeu(ptr, v); }
vtp operator-() const { return vtp(-v); }
vtp operator+(vtp other) const { return vtp(v+other.v); }
vtp operator-(vtp other) const { return vtp(v-other.v); }
......@@ -240,6 +245,9 @@ template<typename T> class helper_<T,1>
using Tv = pseudoscalar<T>;
using Tm = bool;
static Tv loadu(const T *ptr) { return *ptr; }
static void storeu(T *ptr, Tv v) { *ptr = v.v; }
static Tv from_scalar(T v) { return v; }
static Tv abs(Tv v) { return v.abs(); }
static Tv max(Tv v1, Tv v2) { return v1.max(v2); }
......@@ -265,6 +273,9 @@ template<> class helper_<double,8>
using Tv = __m512d;
using Tm = __mmask8;
static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); }
static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); }
static Tv from_scalar(T v) { return _mm512_set1_pd(v); }
static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); }
static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); }
......@@ -286,6 +297,9 @@ template<> class helper_<float,16>
using Tv = __m512;
using Tm = __mmask16;
static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); }
static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); }
static Tv from_scalar(T v) { return _mm512_set1_ps(v); }
static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); }
static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); }
......@@ -310,6 +324,9 @@ template<> class helper_<double,4>
using Tv = __m256d;
using Tm = __m256d;
static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); }
static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); }
static Tv from_scalar(T v) { return _mm256_set1_pd(v); }
static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); }
static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); }
......@@ -331,6 +348,9 @@ template<> class helper_<float,8>
using Tv = __m256;
using Tm = __m256;
static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); }
static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); }
static Tv from_scalar(T v) { return _mm256_set1_ps(v); }
static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); }
static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); }
......@@ -355,6 +375,9 @@ template<> class helper_<double,2>
using Tv = __m128d;
using Tm = __m128d;
static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); }
static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); }
static Tv from_scalar(T v) { return _mm_set1_pd(v); }
static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); }
static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); }
......@@ -383,6 +406,9 @@ template<> class helper_<float,4>
using Tv = __m128;
using Tm = __m128;
static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); }
static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); }
static Tv from_scalar(T v) { return _mm_set1_ps(v); }
static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); }
static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); }
......@@ -429,7 +455,6 @@ using std::abs;
using std::sqrt;
using std::max;
}
#endif
......@@ -39,6 +39,8 @@ class SimpleTimer
public:
SimpleTimer()
: starttime(clock::now()) {}
void reset()
{ starttime = clock::now(); }
double operator()() const
{
return std::chrono::duration<double>(clock::now() - starttime).count();
......
/*
* This file is part of the MR utility library.
*
* This code is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this code; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Copyright (C) 2020 Max-Planck-Society
Author: Martin Reinecke */
#ifndef DUCC0_HORNER_KERNEL_H
#define DUCC0_HORNER_KERNEL_H
#include <cmath>
#include <functional>
#include <array>
#include <vector>
#include "ducc0/infra/simd.h"
#include "ducc0/infra/useful_macros.h"
namespace ducc0 {
namespace detail_horner_kernel {
using namespace std;
constexpr double pi=3.141592653589793238462643383279502884197;
/*! Computes the coefficients for a piecewise polynomial approximation of
    \a func on the interval [-1;1].

    The interval is split into \a W equal sub-intervals; on each of them
    \a func is sampled at the Chebyshev nodes, expanded in Chebyshev
    polynomials of degree \a D, and the expansion is converted to monomial
    coefficients.

    \return vector of size W*(D+1); entry [j*W+i] holds the coefficient of
            x^(D-j) for sub-interval i (highest power first, as required by
            Horner evaluation). */
inline std::vector<double> getCoeffs(size_t W, size_t D, const std::function<double(double)> &func)
  {
  constexpr double pi=3.141592653589793238462643383279502884197;
  std::vector<double> coeff(W*(D+1));
  // Chebyshev nodes on [-1;1]
  std::vector<double> chebroot(D+1);
  for (size_t i=0; i<=D; ++i)
    chebroot[i] = std::cos((2*i+1.)*pi/(2*D+2));
  std::vector<double> y(D+1), lcf(D+1), C((D+1)*(D+1)), lcf2(D+1);
  for (size_t i=0; i<W; ++i)
    {
    // boundaries of sub-interval i
    double l = -1+2.*i/double(W);
    double r = -1+2.*(i+1)/double(W);
    // function values at the Chebyshev nodes mapped into [l;r]
    for (size_t j=0; j<=D; ++j)
      y[j] = func(chebroot[j]*(r-l)*0.5 + (r+l)*0.5);
    // Chebyshev coefficients via the discrete cosine transform
    for (size_t j=0; j<=D; ++j)
      {
      lcf[j] = 0;
      for (size_t k=0; k<=D; ++k)
        lcf[j] += 2./(D+1)*y[k]*std::cos(j*(2*k+1)*pi/(2*D+2));
      }
    lcf[0] *= 0.5;
    // convert from Chebyshev to monomial basis:
    // C[j*(D+1)+k] is the coefficient of x^k in the Chebyshev polynomial T_j
    std::fill(C.begin(), C.end(), 0.);
    C[0] = 1.;
    if (D>=1)  // for D==0, C has only one entry; writing C[2] would be OOB
      C[1*(D+1) + 1] = 1.;
    for (size_t j=2; j<=D; ++j)
      {
      // recurrence T_j = 2*x*T_{j-1} - T_{j-2}
      C[j*(D+1) + 0] = -C[(j-2)*(D+1) + 0];
      for (size_t k=1; k<=j; ++k)
        C[j*(D+1) + k] = 2*C[(j-1)*(D+1) + k-1] - C[(j-2)*(D+1) + k];
      }
    for (size_t j=0; j<=D; ++j) lcf2[j] = 0;
    for (size_t j=0; j<=D; ++j)
      for (size_t k=0; k<=D; ++k)
        lcf2[k] += C[j*(D+1) + k]*lcf[j];
    // store with the highest power first
    for (size_t j=0; j<=D; ++j)
      coeff[j*W + i] = lcf2[D-j];
    }
  return coeff;
  }
/*! Class providing fast piecewise polynomial approximation of a function
    which is defined on the interval [-1;1].

    W is the number of equal-length intervals into which [-1;1] is subdivided.
    D is the degree of the approximating polynomials.
    T is the type at which the approximation is calculated;
      should be float or double. */
template<size_t W, size_t D, typename T> class HornerKernel
  {
  private:
    using Tsimd = native_simd<T>;
    // number of scalar lanes in one SIMD vector
    static constexpr auto vlen = Tsimd::size();
    // number of SIMD vectors needed to hold one value per interval
    static constexpr auto nvec = (W+vlen-1)/vlen;

    // coeff[j] holds, SIMD-packed across the W intervals, the coefficients
    // of x^(D-j) (highest power first, as produced by getCoeffs)
    array<array<Tsimd,nvec>,D+1> coeff;
    // result buffer: written as SIMD vectors, read back as scalars.
    // NOTE(review): reading the inactive union member is technically UB in
    // C++ (it is well-defined in C and accepted by gcc/clang) -- confirm
    // this punning is intended for the targeted compilers.
    union {
      array<Tsimd,nvec> v;
      array<T,W> s;
      } res;

  public:
    /*! Builds the approximation from \a func, which must be callable with
        a double in [-1;1]. */
    template<typename Func> HornerKernel(Func func)
      {
      auto coeff_raw = getCoeffs(W,D,func);
      for (size_t j=0; j<=D; ++j)
        {
        for (size_t i=0; i<W; ++i)
          coeff[j][i/vlen][i%vlen] = T(coeff_raw[j*W+i]);
        // zero-pad the SIMD lanes beyond W so they evaluate harmlessly
        for (size_t i=W; i<vlen*nvec; ++i)
          coeff[j][i/vlen][i%vlen] = T(0);
        }
      }

    /*! Returns the function approximation at W different locations with the
        abscissas x, x+2./W, x+4./W, ..., x+(2.*W-2)/W.
        x must lie in [-1; -1+2./W].
        The returned pointer refers to an internal buffer that is overwritten
        by the next call. */
    const T *DUCC0_NOINLINE eval(T x)
      {
      // map x to [-1;1] relative to its sub-interval; the same local
      // coordinate applies to all W intervals simultaneously
      x = (x+1)*W-1;
      for (size_t i=0; i<nvec; ++i)
        {
        // Horner scheme over all intervals at once
        auto tval = coeff[0][i];
        for (size_t j=1; j<=D; ++j)
          tval = tval*x + coeff[j][i];
        res.v[i] = tval;
        }
      return &res.s[0];
      }

    /*! Returns the function approximation at location x.
        x must lie in [-1; 1]. */
    T DUCC0_NOINLINE eval_single(T x) const
      {
      // index of the sub-interval containing x, clamped to [0;W-1]
      auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5))));
      // map x to [-1;1] within that sub-interval
      x = (x+1)*W-2*nth-1;
      auto i = nth/vlen;
      auto imod = nth%vlen;
      // scalar Horner scheme using the lane belonging to interval nth
      auto tval = coeff[0][i][imod];
      for (size_t j=1; j<=D; ++j)
        tval = tval*x + coeff[j][i][imod];
      return tval;
      }
  };
/*! Variant of HornerKernel where the number of intervals W and the
    polynomial degree D are run-time quantities.

    For every combination with W<=MAXW and D<=MAXDEG a fully specialized
    (unrollable) evaluation routine is instantiated, and the one matching
    the actual (W,D) is selected at construction time; larger W or D fall
    back to a generic loop. */
template<typename T> class HornerKernelFlexible
  {
  private:
    // limits up to which specialized evaluators are instantiated
    static constexpr size_t MAXW=16, MAXDEG=20;
    using Tsimd = native_simd<T>;
    static constexpr auto vlen = Tsimd::size();
    size_t W, D, nvec;
    // polynomial coefficients, highest power first, SIMD-packed across the
    // W intervals: coeff[j*nvec+i] holds lanes of the x^(D-j) coefficients
    vector<Tsimd> coeff;
    // evaluation routine selected for the actual (W,D) at construction time
    void (HornerKernelFlexible<T>::* evalfunc) (T, native_simd<T> *) const;

    /*! Horner evaluation with compile-time constant vector count/degree,
        so the compiler can fully unroll both loops. */
    template<size_t NV, size_t DEG> void eval_intern(T x, native_simd<T> *res) const
      {
      x = (x+1)*W-1;  // map x to [-1;1] within its sub-interval
      for (size_t i=0; i<NV; ++i)
        {
        auto tval = coeff[i];
        for (size_t j=1; j<=DEG; ++j)
          tval = tval*x + coeff[j*NV+i];
        res[i] = tval;
        }
      }

    /*! Generic Horner evaluation for arbitrary W and D. */
    void eval_intern_general(T x, native_simd<T> *res) const
      {
      x = (x+1)*W-1;
      for (size_t i=0; i<nvec; ++i)
        {
        auto tval = coeff[i];
        for (size_t j=1; j<=D; ++j)
          tval = tval*x+coeff[j*nvec+i];
        res[i] = tval;
        }
      }

    /*! Walks the degrees 0..MAXDEG at compile time and returns the
        specialized evaluator matching the run-time D, or the generic one
        if D exceeds MAXDEG. */
    template<size_t NV, size_t DEG> auto evfhelper2() const
      {
      if constexpr (DEG>MAXDEG)
        return &HornerKernelFlexible::eval_intern_general;
      else
        {
        if (DEG==D)
          return &HornerKernelFlexible::eval_intern<NV,DEG>;
        return evfhelper2<NV,DEG+1>();
        }
      }

    /*! Walks the possible SIMD vector counts at compile time and dispatches
        to evfhelper2 once the run-time nvec is matched; falls back to the
        generic evaluator if W exceeds MAXW. */
    template<size_t NV> auto evfhelper1() const
      {
      if constexpr (NV*vlen>MAXW)
        return &HornerKernelFlexible::eval_intern_general;
      else
        {
        if (nvec==NV) return evfhelper2<NV,0>();
        return evfhelper1<NV+1>();
        }
      }

  public:
    /*! Builds the approximation of \a func (defined on [-1;1]) using
        \a W_ sub-intervals and polynomials of degree \a D_. */
    HornerKernelFlexible(size_t W_, size_t D_, const function<double(double)> &func)
      : W(W_), D(D_), nvec((W+vlen-1)/vlen),
        coeff(nvec*(D+1), 0), evalfunc(evfhelper1<1>())
      {
      auto coeff_raw = getCoeffs(W,D,func);
      for (size_t j=0; j<=D; ++j)
        {
        for (size_t i=0; i<W; ++i)
          coeff[j*nvec + i/vlen][i%vlen] = T(coeff_raw[j*W+i]);
        // zero-pad the SIMD lanes beyond W so they evaluate harmlessly
        for (size_t i=W; i<vlen*nvec; ++i)
          coeff[j*nvec + i/vlen][i%vlen] = T(0);
        }
      }

    /*! Returns the function approximation at W different locations with the
        abscissas x, x+2./W, x+4./W, ..., x+(2.*W-2)/W.
        x must lie in [-1; -1+2./W].
        NOTE: res must point to memory large enough to hold
        ((W+vlen-1)/vlen) objects of type native_simd<T>!
      */
    void eval(T x, native_simd<T> *res) const
      { (this->*evalfunc)(x, res); }

    /*! Returns the function approximation at location x.
        x must lie in [-1; 1]. */
    T DUCC0_NOINLINE eval_single(T x) const
      {
      // index of the sub-interval containing x, clamped to [0;W-1]
      auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5))));
      // map x to [-1;1] within that sub-interval
      x = (x+1)*W-2*nth-1;
      auto i = nth/vlen;
      auto imod = nth%vlen;
      auto tval = coeff[i][imod];
      for (size_t j=1; j<=D; ++j)
        tval = tval*x + coeff[j*nvec+i][imod];
      return tval;
      }
  };
#if 0
// Disabled: depends on GL_Integrator and execStatic, which are not
// available in this header yet.
class KernelCorrection
  {
  private:
    // Gauss-Legendre nodes and quadrature weights (pre-multiplied with the
    // kernel values at the nodes)
    vector<double> x, wgtpsi;
    size_t supp;  // kernel support in pixels

  public:
    /*! Precomputes kernel-weighted Gauss-Legendre quadrature data for the
        kernel \a func with support \a W. */
    KernelCorrection(size_t W, const function<double(double)> &func)
      : supp(W)
      {
      size_t p = 1.5*supp+2; // estimate; may need to be higher for arbitrary kernels
      GL_Integrator integ(2*p, 1);
      x = integ.coordsSymmetric();
      wgtpsi = integ.weightsSymmetric();
      for (size_t i=0; i<x.size(); ++i)
        wgtpsi[i] *= func(x[i]);
      }

    /* Compute correction factors for gridding kernel
       This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
    // NOTE(review): 'int i' against the size_t x.size() is a signed/unsigned
    // mismatch; change to size_t before enabling this code.
    double corfac(double v) const
      {
      double tmp=0;
      for (int i=0; i<x.size(); ++i)
        tmp += wgtpsi[i]*cos(pi*supp*v*x[i]);
      return 2./(supp*tmp);
      }

    /* Compute correction factors for gridding kernel
       This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
    vector<double> correction_factors(size_t n, size_t nval, int nthreads=1)
      {
      vector<double> res(nval);
      double xn = 1./n;  // grid spacing in normalized coordinates
      execStatic(nval, nthreads, 0, [&](auto &sched)
        {
        while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
          res[i] = corfac(i*xn);
        });
      return res;
      }
  };
#endif
}
using detail_horner_kernel::HornerKernel;
using detail_horner_kernel::HornerKernelFlexible;
//using detail_horner_kernel::KernelCorrection;
}
#endif
// compile with "g++ -O3 -ffast-math -std=c++17 -march=native -Isrc horner_test.cc"
#include "ducc0/math/horner_kernel.h"
#include "ducc0/infra/timers.h"
#include <iostream>
using namespace ducc0;
using namespace std;
// any kernel defined on [-1; 1] can be plugged in here
double es_kernel(size_t w, double x)
  {
  // "exponential of semicircle" gridding kernel exp(beta*(sqrt(1-x^2)-1)),
  // with the shape parameter beta tied to the support width w;
  // normalized so that es_kernel(w, 0) == 1
  const double beta = 2.3*double(w);
  return std::exp(beta*(std::sqrt((1.-x)*(1.+x))-1.));
  }
int main()
{
constexpr size_t W=8; // kernel support in pixels
constexpr size_t D=12; // degree of approximating polynomials
using FLT=double;
size_t Neval = 1000000000; // we will do a billion function evaluations altogether
size_t Ncall = Neval/W;
FLT delta = 2./W/double(Ncall); // small offsets between individual calls to prevent the compiler from optimizing too much
{
HornerKernel<W,D,FLT> hk([](double x){return es_kernel(W,x);});
double sum=0;
for (size_t i=0; i<Ncall; ++i)
{
FLT p0 = -FLT(1)+i*delta; // position of first sample
auto res = hk.eval(p0);
for (size_t j=0; j<W; ++j)
sum = max(sum,abs(res[j]- hk.eval_single(p0+j*2./W)));
}
cout << "Maximum deviation measured: " << sum << endl;
SimpleTimer timer;
sum = 0;
for (size_t i=0; i<Ncall; ++i)
{
FLT p0 = -FLT(1)+i*delta; // position of first sample
auto res = hk.eval(p0);
for (size_t j=0; j<W; ++j)
sum += res[j]; // needed to avoid over-optimization
}
cout << "HornerKernel: " << Neval/timer() << " function approximations per second" << endl;
cout << sum << endl;
}
{
HornerKernelFlexible<FLT> hk2(W,D,[](double x){return es_kernel(W,x);});
union {
FLT scalar[64];
native_simd<FLT> simd[64/native_simd<FLT>::size()];
} buf;
double sum=0;
for (size_t i=0; i<Ncall; ++i)
{
FLT p0 = -FLT(1)+i*delta; // position of first sample
hk2.eval(p0, buf.simd);
for (size_t j=0; j<W; ++j)
sum = max(sum,abs(buf.scalar[j]- hk2.eval_single(p0+j*2./W)));
}
cout << "HornerKernelFlexible: Maximum deviation measured: " << sum << endl;
SimpleTimer timer;
sum = 0;
for (size_t i=0; i<Ncall; ++i)
{
FLT p0 = -FLT(1)+i*delta; // position of first sample
hk2.eval(p0, buf.simd);
for (size_t j=0; j<W; ++j)
sum += buf.scalar[j]; // needed to avoid over-optimization
}
cout << "HornerKernelFlexible: " << Neval/timer() << " function approximations per second" << endl;
cout << sum << endl;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment