Commit e61cfd4d authored by Martin Reinecke's avatar Martin Reinecke

various improvements

parent 17c27fe4
......@@ -42,7 +42,7 @@ def measure_fftw_np_interface(a, nrepeat, nthr):
def measure_pypocketfft(a, nrepeat, nthr):
import pypocketfft as ppf
import ducc_0_1.pypocketfft as ppf
tmin = 1e38
b = a.copy()
for i in range(nrepeat):
......
......@@ -29,10 +29,11 @@ def irfftn(a, axes=None, lastsize=0, inorm=0, nthreads=1):
nthreads = 0
def update_err(err, name, value):
def update_err(err, name, value, shape):
if name in err and err[name] >= value:
return err
err[name] = value
print(shape)
for (nm, v) in err.items():
print("{}: {}".format(nm, v))
print()
......@@ -52,89 +53,89 @@ def test(err):
a_32 = a.astype(np.complex64)
b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a, b, axes))
err = update_err(err, "cmax", _l2error(a, b, axes), shape)
b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b, axes))
err = update_err(err, "cmax", _l2error(a.real, b, axes), shape)
b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
nthreads=nthreads)
err = update_err(err, "cmax", _l2error(a.real, b, axes))
err = update_err(err, "cmax", _l2error(a.real, b, axes), shape)
b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes))
err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes), shape)
b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmax", _l2error(a.real, b, axes))
err = update_err(err, "rmax", _l2error(a.real, b, axes), shape)
b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes))
err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b, axes))
err = update_err(err, "hmax", _l2error(a.real, b, axes), shape)
b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmax", _l2error(a.real, b, axes))
err = update_err(err, "hmax", _l2error(a.real, b, axes), shape)
b = pypocketfft.separable_hartley(
pypocketfft.separable_hartley(
a.real.astype(np.float32), axes=axes, nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
b = pypocketfft.genuine_hartley(
pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes,
nthreads=nthreads),
axes=axes, inorm=2, nthreads=nthreads)
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes), shape)
if all(a.shape[i] > 1 for i in axes):
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1max", _l2error(a.real, b, axes))
err = update_err(err, "c1max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23max", _l2error(a.real, b, axes))
err = update_err(err, "c23max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dct(
pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4max", _l2error(a.real, b, axes))
err = update_err(err, "c4max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dct(
pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1max", _l2error(a.real, b, axes))
err = update_err(err, "s1max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
axes=axes, type=1, nthreads=nthreads, inorm=2)
err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23max", _l2error(a.real, b, axes))
err = update_err(err, "s23max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2),
axes=axes, type=3, nthreads=nthreads, inorm=2)
err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4max", _l2error(a.real, b, axes))
err = update_err(err, "s4max", _l2error(a.real, b, axes), shape)
b = pypocketfft.dst(
pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4),
axes=axes, type=4, nthreads=nthreads, inorm=2)
err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes))
err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes), shape)
err = dict()
......
from setuptools import setup, Extension
import sys
import os.path
import itertools
from glob import iglob
pkgname = 'ducc_0_1'
......@@ -12,6 +15,13 @@ class _deferred_pybind11_include(object):
return pybind11.get_include(self.user)
def _get_files_by_suffix(directory, suffix):
path = directory
iterable_sources = (iglob(os.path.join(root, '*.'+suffix))
for root, dirs, files in os.walk(path))
return list(itertools.chain.from_iterable(iterable_sources))
include_dirs = ['./src/',
_deferred_pybind11_include(True),
_deferred_pybind11_include()]
......@@ -34,10 +44,11 @@ else:
# if you don't want debugging info, add "-s" to python_module_link_args
def get_extension_modules():
depfiles = _get_files_by_suffix('.', 'h') + _get_files_by_suffix('.', 'cc') + ['setup.py']
return [Extension(pkgname,
language='c++',
sources=['module.cc'],
depends=[],
depends=depfiles,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
......
......@@ -862,19 +862,73 @@ struct ExecC2C
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
const native_simd<T> *MRUTIL_RESTRICT src, fmav<T> &dst)
{
auto ptr = dst.vdata();
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,0)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
if (it.uniform_o())
{
auto ptr = &dst.vraw(it.oofs_uni(0,0));
auto jstr = it.unistride_o();
auto istr = it.stride_out();
if (istr==1)
{
ptr[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
ptr[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i2)] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)] = src[i][j];
}
if (i<it.length_out())
else if (jstr==1)
{
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j) + ptrdiff_t(i1)*istr] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j) + ptrdiff_t(i2)*istr] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j) + ptrdiff_t(i1)*istr] = src[i][j];
}
else
{
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)*istr] = src[i][j]+src[i+1][j];
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i2)*istr] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i1)*istr] = src[i][j];
}
}
else
{
auto ptr = dst.vdata();
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i1)] = src[i][j];
ptr[it.oofs(j,0)] = src[0][j];
size_t i=1, i1=1, i2=it.length_out()-1;
for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
for (size_t j=0; j<vlen; ++j)
{
ptr[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
ptr[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
}
if (i<it.length_out())
for (size_t j=0; j<vlen; ++j)
ptr[it.oofs(j,i1)] = src[i][j];
}
}
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment