Commit 3578717d authored by Martin Reinecke's avatar Martin Reinecke

Merge remote-tracking branch 'origin/master' into sincospi

parents b96ae36a 553e4f1c
......@@ -62,8 +62,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <array>
#include <mutex>
#endif
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#include <mutex>
#include <condition_variable>
#include <thread>
#include <queue>
#include <atomic>
#include <functional>
#ifdef POCKETFFT_PTHREADS
# include <pthread.h>
#endif
......@@ -684,22 +692,206 @@ struct util // hack to avoid duplicate symbols
if (axis>=shape.size()) throw invalid_argument("bad axis number");
}
#ifdef POCKETFFT_OPENMP
static size_t nthreads() { return size_t(omp_get_num_threads()); }
static size_t thread_num() { return size_t(omp_get_thread_num()); }
static size_t thread_count (size_t nthreads, const shape_t &shape,
size_t axis)
size_t axis, size_t vlen)
{
if (nthreads==1) return 1;
if (prod(shape) < 20*shape[axis]) return 1;
return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
size_t size = prod(shape);
size_t parallel = size / (shape[axis] * vlen);
if (shape[axis] < 1000)
parallel /= 4;
size_t max_threads = nthreads == 0 ?
thread::hardware_concurrency() : nthreads;
return max(size_t(1), min(parallel, max_threads));
}
#else
static constexpr size_t nthreads() { return 1; }
static constexpr size_t thread_num() { return 0; }
#endif
};
namespace threading {
thread_local size_t thread_id = 0;
thread_local size_t num_threads = 1;
// One-shot countdown synchronization primitive (similar to C++20 std::latch):
// constructed with a count n; each worker calls count_down() once, and the
// thread blocked in wait() is released when the count reaches zero.
class latch
  {
  // Remaining count; atomic so is_ready() can read it without the mutex.
  atomic<size_t> num_left_;
  mutex mut_;                     // orders the decrement against wait()
  condition_variable completed_;  // signalled once the count hits zero
  using lock_t = unique_lock<mutex>;
  public:
  latch(size_t n): num_left_(n) {}
  void count_down()
    {
    {
    // Decrement under the lock so a concurrent wait() cannot miss the
    // transition to zero between its predicate check and its sleep.
    lock_t lock(mut_);
    if (--num_left_)
      return;  // not the last participant; nothing to signal yet
    }
    // Notify after releasing the lock so waiters don't wake into a held mutex.
    completed_.notify_all();
    }
  void wait()
    {
    lock_t lock(mut_);
    completed_.wait(lock, [this]{ return is_ready(); });
    }
  bool is_ready() { return num_left_ == 0; }
  };
// Multi-producer / multi-consumer FIFO work queue with cooperative shutdown.
// pop() blocks until an item is available or shutdown() has been called.
template <typename T> class concurrent_queue
  {
  queue<T> q_;
  mutex mut_;                      // guards q_ and shutdown_
  condition_variable item_added_;  // signalled on push() and shutdown()
  bool shutdown_;
  using lock_t = unique_lock<mutex>;
  public:
  concurrent_queue(): shutdown_(false) {}

  /// Enqueue a work item; throws runtime_error if the queue was shut down.
  void push(T val)
    {
    {
    lock_t lock(mut_);
    if (shutdown_)
      throw runtime_error("Item added to queue after shutdown");
    q_.push(std::move(val));
    }
    // Notify outside the lock so the woken consumer can acquire it at once.
    item_added_.notify_one();
    }

  /// Blocking dequeue: returns true and fills `val` with the next item,
  /// or false once the queue has been shut down and is empty.
  bool pop(T & val)
    {
    lock_t lock(mut_);
    item_added_.wait(lock, [this] { return (!q_.empty() || shutdown_); });
    if (q_.empty())
      return false; // We are shutting down
    val = std::move(q_.front());
    q_.pop();
    return true;
    }

  /// Wake all blocked consumers and reject further push()es.
  void shutdown()
    {
    {
    lock_t lock(mut_);
    shutdown_ = true;
    }
    item_added_.notify_all();
    }

  /// Re-enable the queue after shutdown().
  /// Bug fix: the flag is now written under the mutex — the previous
  /// unsynchronized write was a data race with concurrent push()/pop(),
  /// which read shutdown_ while holding the lock.
  void restart()
    {
    lock_t lock(mut_);
    shutdown_ = false;
    }
  };
// Fixed-size pool of worker threads consuming tasks from a shared queue.
// shutdown()/restart() allow the pool to be torn down and rebuilt — used
// around fork() by get_pool()'s pthread_atfork handlers.
class thread_pool
  {
  concurrent_queue<function<void()>> work_queue_;
  vector<thread> threads_;

  // Worker loop: execute queued tasks until the queue is shut down.
  void worker_main()
    {
    function<void()> work;
    while (work_queue_.pop(work))
      work();
    }

  void create_threads()
    {
    size_t nthreads = threads_.size();
    for (size_t i=0; i<nthreads; ++i)
      {
      try { threads_[i] = thread([this]{ worker_main(); }); }
      catch (...)
        {
        // Could not start all workers: stop and join the ones already
        // running before propagating the error.
        shutdown();
        throw;
        }
      }
    }

  public:
  // Pool with an explicit worker count.
  explicit thread_pool(size_t nthreads):
    threads_(nthreads)
    { create_threads(); }
  // Default: one worker per hardware thread.
  thread_pool(): thread_pool(thread::hardware_concurrency()) {}
  ~thread_pool() { shutdown(); }

  // Hand a task to the pool; it runs on an arbitrary worker thread.
  void submit(function<void()> work)
    {
    work_queue_.push(move(work));
    }

  // Wake all workers via queue shutdown, then join every thread.
  void shutdown()
    {
    work_queue_.shutdown();
    for (auto &thread : threads_)
      if (thread.joinable())
        thread.join();
    }

  // Re-arm the queue and spawn a fresh set of workers after shutdown().
  void restart()
    {
    work_queue_.restart();
    create_threads();
    }
  };
// Lazily-constructed global thread pool shared by all transforms.
thread_pool & get_pool()
  {
  static thread_pool pool;
#ifdef POCKETFFT_PTHREADS
  static once_flag f;
  call_once(f,
    []{
    // Worker threads do not survive fork(): quiesce the pool before the
    // fork and rebuild it afterwards in both parent and child.
    pthread_atfork(
      +[]{ get_pool().shutdown(); },  // prepare
      +[]{ get_pool().restart(); },   // parent
      +[]{ get_pool().restart(); }    // child
      );
    });
#endif
  return pool;
  }
/** Map a function f over nthreads: submit nthreads copies of f to the
 *  global pool, wait for all of them, and rethrow one captured exception
 *  (if any) on the calling thread. nthreads==0 means "use all hardware
 *  threads"; nthreads==1 runs f inline without touching the pool. */
template <typename Func>
void thread_map(size_t nthreads, Func f)
  {
  if (nthreads == 0)
    nthreads = thread::hardware_concurrency();
  // Single-threaded case: run inline, no pool or latch needed.
  if (nthreads == 1)
    { f(); return; }
  auto & pool = get_pool();
  latch counter(nthreads);
  exception_ptr ex;
  mutex ex_mut;
  for (size_t i=0; i<nthreads; ++i)
    {
    pool.submit(
      [&f, &counter, &ex, &ex_mut, i, nthreads] {
      // Publish this task's identity via the thread_local globals that
      // multi_iter reads to partition the iteration space.
      thread_id = i;
      num_threads = nthreads;
      try { f(); }
      catch (...)
        {
        // Capture an exception for rethrow on the caller's thread; if
        // several tasks throw, later captures overwrite earlier ones and
        // only one exception is reported.
        lock_guard<mutex> lock(ex_mut);
        ex = current_exception();
        }
      // Count down even on failure so the caller's wait() cannot hang.
      counter.count_down();
      });
    }
  counter.wait();
  if (ex)
    rethrow_exception(ex);
  }
}
//
// complex FFTPACK transforms
//
......@@ -2789,10 +2981,10 @@ template<size_t N> class multi_iter
str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
idim(idim_), rem(iarr.size()/iarr.shape(idim))
{
auto nshares = util::nthreads();
auto nshares = threading::num_threads;
if (nshares==1) return;
if (nshares==0) throw runtime_error("can't run with zero threads");
auto myshare = util::thread_num();
auto myshare = threading::thread_id;
if (myshare>=nshares) throw runtime_error("impossible share requested");
size_t nbase = rem/nshares;
size_t additional = rem%nshares;
......@@ -2926,8 +3118,10 @@ class rev_iter
size_t remaining() const { return rem; }
};
#ifndef POCKETFFT_NO_VECTORS
template<typename T> struct VTYPE {};
template <typename T> using vtype_t = typename VTYPE<T>::type;
#ifndef POCKETFFT_NO_VECTORS
template<> struct VTYPE<float>
{
using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
......@@ -2940,8 +3134,6 @@ template<> struct VTYPE<long double>
{
using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
};
template <typename T> using vtype_t = typename VTYPE<T>::type;
#endif
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
......@@ -2966,12 +3158,6 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
return arr<char>(tmpsize*elemsize);
}
#ifdef POCKETFFT_OPENMP
#define POCKETFFT_NTHREADS nthreads
#else
#define POCKETFFT_NTHREADS
#endif
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
{
......@@ -3030,42 +3216,41 @@ template <typename T> using add_vec_t = typename add_vec<T>::type;
template<typename Tplan, typename T, typename T0, typename Exec>
POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec,
const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
const bool allow_inplace=true)
{
shared_ptr<Tplan> plan;
for (size_t iax=0; iax<axes.size(); ++iax)
{
constexpr auto vlen = VLEN<T0>::val;
size_t len=in.shape(axes[iax]);
if ((!plan) || (len!=plan->length()))
plan = get_plan<Tplan>(len);
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax]))
#endif
{
auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
const auto &tin(iax==0? in : out);
multi_iter<vlen> it(tin, out, axes[iax]);
threading::thread_map(
util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
[&] {
constexpr auto vlen = VLEN<T0>::val;
auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
const auto &tin(iax==0? in : out);
multi_iter<vlen> it(tin, out, axes[iax]);
#ifndef POCKETFFT_NO_VECTORS
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
exec(it, tin, out, tdatav, *plan, fct);
}
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
exec(it, tin, out, tdatav, *plan, fct);
}
#endif
while (it.remaining()>0)
{
it.advance(1);
auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
&out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
exec(it, tin, out, buf, *plan, fct);
}
} // end of parallel region
while (it.remaining()>0)
{
it.advance(1);
auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
&out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
exec(it, tin, out, buf, *plan, fct);
}
}); // end of parallel region
fct = T0(1); // factor has been applied, use 1 for remaining axes
}
}
......@@ -3145,119 +3330,117 @@ struct ExecDcst
template<typename T> POCKETFFT_NOINLINE void general_r2c(
const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
size_t POCKETFFT_NTHREADS)
size_t nthreads)
{
auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
constexpr auto vlen = VLEN<T>::val;
size_t len=in.shape(axis);
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis))
#endif
{
auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
multi_iter<vlen> it(in, out, axis);
threading::thread_map(
util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
[&] {
constexpr auto vlen = VLEN<T>::val;
auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
multi_iter<vlen> it(in, out, axis);
#ifndef POCKETFFT_NO_VECTORS
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,0)].Set(tdatav[0][j]);
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
copy_input(it, in, tdatav);
plan->exec(tdatav, fct, true);
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,0)].Set(tdatav[0][j]);
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
else
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
if (i<len)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j]);
}
#endif
while (it.remaining()>0)
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
copy_input(it, in, tdata);
plan->exec(tdata, fct, true);
out[it.oofs(0)].Set(tdata[0]);
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
else
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
if (i<len)
for (size_t j=0; j<vlen; ++j)
out[it.oofs(j,ii)].Set(tdatav[i][j]);
out[it.oofs(ii)].Set(tdata[i]);
}
#endif
while (it.remaining()>0)
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
copy_input(it, in, tdata);
plan->exec(tdata, fct, true);
out[it.oofs(0)].Set(tdata[0]);
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
else
for (; i<len-1; i+=2, ++ii)
out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
if (i<len)
out[it.oofs(ii)].Set(tdata[i]);
}
} // end of parallel region
}); // end of parallel region
}
template<typename T> POCKETFFT_NOINLINE void general_c2r(
const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
size_t POCKETFFT_NTHREADS)
size_t nthreads)
{
auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
constexpr auto vlen = VLEN<T>::val;
size_t len=out.shape(axis);
#ifdef POCKETFFT_OPENMP
#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis))
#endif
{
auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
multi_iter<vlen> it(in, out, axis);
threading::thread_map(
util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
[&] {
constexpr auto vlen = VLEN<T>::val;
auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
multi_iter<vlen> it(in, out, axis);
#ifndef POCKETFFT_NO_VECTORS
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t j=0; j<vlen; ++j)
tdatav[0][j]=in[it.iofs(j,0)].r;
{
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].SplitConj(tdatav[i][j], tdatav[i+1][j]);
else
for (; i<len-1; i+=2, ++ii)
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].Split(tdatav[i][j], tdatav[i+1][j]);
if (i<len)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = in[it.iofs(j,ii)].r;
}
plan->exec(tdatav, fct, false);
copy_output(it, tdatav, out);
}
tdatav[0][j]=in[it.iofs(j,0)].r;
{
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].SplitConj(tdatav[i][j], tdatav[i+1][j]);
else
for (; i<len-1; i+=2, ++ii)
for (size_t j=0; j<vlen; ++j)
in[it.iofs(j,ii)].Split(tdatav[i][j], tdatav[i+1][j]);
if (i<len)
for (size_t j=0; j<vlen; ++j)
tdatav[i][j] = in[it.iofs(j,ii)].r;
}
plan->exec(tdatav, fct, false);
copy_output(it, tdatav, out);
}
#endif
while (it.remaining()>0)
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
tdata[0]=in[it.iofs(0)].r;
{
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].SplitConj(tdata[i], tdata[i+1]);
else
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].Split(tdata[i], tdata[i+1]);
if (i<len)
tdata[i] = in[it.iofs(ii)].r;
}
plan->exec(tdata, fct, false);
copy_output(it, tdata, out);
}
} // end of parallel region
while (it.remaining()>0)
{
it.advance(1);
auto tdata = reinterpret_cast<T *>(storage.data());
tdata[0]=in[it.iofs(0)].r;
{
size_t i=1, ii=1;
if (forward)
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].SplitConj(tdata[i], tdata[i+1]);
else
for (; i<len-1; i+=2, ++ii)
in[it.iofs(ii)].Split(tdata[i], tdata[i+1]);
if (i<len)
tdata[i] = in[it.iofs(ii)].r;
}
plan->exec(tdata, fct, false);
copy_output(it, tdata, out);
}
}); // end of parallel region
}
struct ExecR2R
......@@ -3280,8 +3463,6 @@ struct ExecR2R
}
};
#undef POCKETFFT_NTHREADS
template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
const stride_t &stride_out, const shape_t &axes, bool forward,
const complex<T> *data_in, complex<T> *data_out, T fct,
......
......@@ -15,6 +15,7 @@ include_dirs = ['./', _deferred_pybind11_include(True),
_deferred_pybind11_include()]
extra_compile_args = ['--std=c++11', '-march=native', '-O3']
python_module_link_args = []
define_macros = [('POCKETFFT_PTHREADS', None)]
if sys.platform == 'darwin':
import distutils.sysconfig
......@@ -22,9 +23,12 @@ if sys.platform == 'darwin':
vars = distutils.sysconfig.get_config_vars()
vars['LDSHARED'] = vars['LDSHARED'].replace('-bundle', '')
python_module_link_args += ['-bundle']
elif sys.platform == 'win32':
define_macros = []
extra_compile_args = ['/Ox', '/EHsc']
else:
extra_compile_args += ['-DPOCKETFFT_OPENMP', '-fopenmp', '-Wfatal-errors', '-Wfloat-conversion', '-Wsign-conversion', '-Wconversion' ,'-W', '-Wall', '-Wstrict-aliasing=2', '-Wwrite-strings', '-Wredundant-decls', '-Woverloaded-virtual', '-Wcast-qual', '-Wcast-align', '-Wpointer-arith']
python_module_link_args += ['-march=native', '-Wl,-rpath,$ORIGIN', '-fopenmp']
extra_compile_args += ['-Wfatal-errors', '-Wfloat-conversion', '-Wsign-conversion', '-Wconversion' ,'-W', '-Wall', '-Wstrict-aliasing=2', '-Wwrite-strings', '-Wredundant-decls', '-Woverloaded-virtual', '-Wcast-qual', '-Wcast-align', '-Wpointer-arith']
python_module_link_args += ['-march=native', '-Wl,-rpath,$ORIGIN']
# if you don't want debugging info, add "-s" to python_module_link_args
......@@ -35,6 +39,7 @@ def get_extension_modules():
sources=['pypocketfft.cc'],
depends=['pocketfft_hdronly.h'],
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
extra_link_args=python_module_link_args)]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment