Commit e61cfd4d authored by Martin Reinecke's avatar Martin Reinecke

various improvements

parent 17c27fe4
...@@ -42,7 +42,7 @@ def measure_fftw_np_interface(a, nrepeat, nthr): ...@@ -42,7 +42,7 @@ def measure_fftw_np_interface(a, nrepeat, nthr):
def measure_pypocketfft(a, nrepeat, nthr): def measure_pypocketfft(a, nrepeat, nthr):
import pypocketfft as ppf import ducc_0_1.pypocketfft as ppf
tmin = 1e38 tmin = 1e38
b = a.copy() b = a.copy()
for i in range(nrepeat): for i in range(nrepeat):
......
...@@ -29,10 +29,11 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1): ...@@ -29,10 +29,11 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1):
nthreads = 0 nthreads = 0
def update_err(err, name, value): def update_err(err, name, value, shape):
if name in err and err[name] >= value: if name in err and err[name] >= value:
return err return err
err[name] = value err[name] = value
print(shape)
for (nm, v) in err.items(): for (nm, v) in err.items():
print("{}: {}".format(nm, v)) print("{}: {}".format(nm, v))
print() print()
...@@ -52,89 +53,89 @@ def test(err): ...@@ -52,89 +53,89 @@ def test(err):
a_32 = a.astype(np.complex64) a_32 = a.astype(np.complex64)
b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a, b, axes)) err = update_err(err, "cmax", _l2error(a, b, axes), shape)
b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b, axes)) err = update_err(err, "cmax", _l2error(a.real, b, axes), shape)
b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads) nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b, axes)) err = update_err(err, "cmax", _l2error(a.real, b, axes), shape)
b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads), b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes)) err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes), shape)
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads) lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b, axes)) err = update_err(err, "rmax", _l2error(a.real, b, axes), shape)
b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads), b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads) axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes)) err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
b = pypocketfft.separable_hartley( b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads), pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b, axes)) err = update_err(err, "hmax", _l2error(a.real, b, axes), shape)
b = pypocketfft.genuine_hartley( b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads), pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b, axes)) err = update_err(err, "hmax", _l2error(a.real, b, axes), shape)
b = pypocketfft.separable_hartley( b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley( pypocketfft.separable_hartley(
a.real.astype(np.float32), axes=axes, nthreads=nthreads), a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes)) err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
b = pypocketfft.genuine_hartley( b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes, pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes,
nthreads=nthreads), nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads) axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes)) err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
if all(a.shape[i] > 1 for i in axes): if all(a.shape[i] > 1 for i in axes):
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1max", _l2error(a.real, b, axes)) err = update_err(err, "c1max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23max", _l2error(a.real, b, axes)) err = update_err(err, "c23max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4max", _l2error(a.real, b, axes)) err = update_err(err, "c4max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct( b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1max", _l2error(a.real, b, axes)) err = update_err(err, "s1max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1), pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2) axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23max", _l2error(a.real, b, axes)) err = update_err(err, "s23max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2), pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2) axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4max", _l2error(a.real, b, axes)) err = update_err(err, "s4max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst( b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4), pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2) axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes)) err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes), shape)
err = dict() err = dict()
......
from setuptools import setup, Extension from setuptools import setup, Extension
import sys import sys
import os.path
import itertools
from glob import iglob
pkgname = 'ducc_0_1' pkgname = 'ducc_0_1'
...@@ -12,6 +15,13 @@ class _deferred_pybind11_include(object): ...@@ -12,6 +15,13 @@ class _deferred_pybind11_include(object):
return pybind11.get_include(self.user) return pybind11.get_include(self.user)
def _get_files_by_suffix(directory, suffix):
path = directory
iterable_sources = (iglob(os.path.join(root, '*.'+suffix))
for root, dirs, files in os.walk(path))
return list(itertools.chain.from_iterable(iterable_sources))
include_dirs = ['./src/', include_dirs = ['./src/',
_deferred_pybind11_include(True), _deferred_pybind11_include(True),
_deferred_pybind11_include()] _deferred_pybind11_include()]
...@@ -34,10 +44,11 @@ else: ...@@ -34,10 +44,11 @@ else:
# if you don't want debugging info, add "-s" to python_module_link_args # if you don't want debugging info, add "-s" to python_module_link_args
def get_extension_modules(): def get_extension_modules():
depfiles = _get_files_by_suffix('.', 'h') + _get_files_by_suffix('.', 'cc') + ['setup.py']
return [Extension(pkgname, return [Extension(pkgname,
language='c++', language='c++',
sources=['module.cc'], sources=['module.cc'],
depends=[], depends=depfiles,
include_dirs=include_dirs, include_dirs=include_dirs,
define_macros=define_macros, define_macros=define_macros,
extra_compile_args=extra_compile_args, extra_compile_args=extra_compile_args,
......
...@@ -862,19 +862,73 @@ struct ExecC2C ...@@ -862,19 +862,73 @@ struct ExecC2C
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it, template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst) const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst)
{ {
auto ptr = dst.vdata(); if (it.uniform_o())
for (size_t j=0; j<vlen; ++j) {
ptr[it.oofs(j,0)] = src[0][j]; auto ptr = &dst.vraw(it.oofs_uni(0,0));
size_t i=1, i1=1, i2=it.length_out()-1; auto jstr = it.unistride_o();
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2) auto istr = it.stride_out();
for (size_t j=0; j<vlen; ++j) if (istr==1)
{ {
ptr[it.oofs(j,i1)] = src[i][j]+src[i+1][j]; for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i2)] = src[i][j]-src[i+1][j]; ptr[ptrdiff_t(j)*jstr] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i2)] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)] = src[i][j];
} }
if (i<it.length_out()) else if (jstr==1)
{
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j) + ptrdiff_t(i1)*istr] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j) + ptrdiff_t(i2)*istr] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j) + ptrdiff_t(i1)*istr] = src[i][j];
}
else
{
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)*istr] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i2)*istr] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)*istr] = src[i][j];
}
}
else
{
auto ptr = dst.vdata();
for (size_t j=0; j<vlen; ++j) for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i1)] = src[i][j]; ptr[it.oofs(j,0)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
ptr[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i1)] = src[i][j];
}
} }
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it, template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment