diff --git a/bfps/_code.py b/bfps/_code.py
index b436a8b40ea2bac2229b459f6fea5dc61000a139..9285d021bfb8f66ddf8252b6323e4f222d659eec 100644
--- a/bfps/_code.py
+++ b/bfps/_code.py
@@ -61,6 +61,8 @@ class _code(_base):
                 #include <cstring>
                 #include <fftw3-mpi.h>
                 #include <omp.h>
+                #include <fenv.h>
+                #include <cstdlib>
+                #include <string>
+                #include <iostream>
                 //endcpp
                 """
         self.variables = 'int myrank, nprocs;\n'
@@ -69,63 +71,16 @@ class _code(_base):
         self.variables += ('hid_t parameter_file, stat_file, Cdset;\n')
         self.definitions = ''
         self.main_start = """
-
-
-#include <fenv.h>
-#include <iostream>
-#include <execinfo.h>
-#include <cstdio>
-#include <unistd.h>
-#include <csignal>
-#include <unistd.h>
-#include <iostream>
-#include <stdexcept>
-
-/**
- * @brief f_sig_handler catch the system signals.
- * @param signalReceived
- */
-void f_sig_handler(int signalReceived){
-    std::cerr << "[BFPS] Signal " << signalReceived << " has been intercepted." << std::endl;
-    const int maxStackCalls = 40;
-    void *stackArray[maxStackCalls];
-    const int callsToPrint = backtrace(stackArray, maxStackCalls);
-    backtrace_symbols_fd(stackArray, callsToPrint, STDERR_FILENO);
-    throw std::runtime_error("abort");
-}
-
-/**
- * @brief f_install_signal_handler
- * This install several handler for various signals.
- */
-void f_install_signal_handler(){
-    // The SIGINT signal is sent to a process by its controlling terminal when a user wishes to interrupt the process
-    if( signal(SIGINT, f_sig_handler) == SIG_ERR ){
-        std::cout << "Signal Handler: Cannot install signal SIGINT\\n";
-    }
-    // The SIGTERM signal is sent to a process to request its termination
-    if( signal(SIGTERM, f_sig_handler) == SIG_ERR ){
-        std::cout << "Signal Handler: Cannot install signal SIGTERM\\n";
-    }
-    // The SIGFPE signal is sent to a process when it executes an erroneous arithmetic operation, such as division by zero
-    if( signal(SIGFPE, f_sig_handler) == SIG_ERR ){
-        std::cout << "Signal Handler: Cannot install signal SIGFPE\\n";
-    }
-    // The SIGABRT signal is sent to a process to tell it to abort, i.e. to terminate
-    if( signal(SIGABRT, f_sig_handler) == SIG_ERR ){
-        std::cout << "Signal Handler: Cannot install signal SIGABRT\\n";
-    }
-    // The SIGSEGV signal is sent to a process when it makes an invalid virtual memory reference, or segmentation fault
-    if( signal(SIGSEGV, f_sig_handler) == SIG_ERR ){
-        std::cout << "Signal Handler: Cannot install signal SIGSEGV\\n";
-    }
-}
-
                 //begincpp
                 int main(int argc, char *argv[])
                 {
-feenableexcept(FE_INVALID | FE_OVERFLOW);
-f_install_signal_handler();
+                    if(getenv("BFPS_FPE_OFF") == nullptr || getenv("BFPS_FPE_OFF") != std::string("TRUE")){
+                        feenableexcept(FE_INVALID | FE_OVERFLOW);
+                    }
+                    else{
+                        std::cout << "FPEs have been turned OFF" << std::endl;
+                    }
+
                     int mpiprovided;
                     MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &mpiprovided);
                     assert(mpiprovided >= MPI_THREAD_FUNNELED);
@@ -228,11 +183,7 @@ f_install_signal_handler();
             raise IOError('header not there:\n' +
                           '{0}\n'.format(os.path.join(bfps.header_dir, 'base.hpp')) +
                           '{0}\n'.format(bfps.dist_loc))
-        libraries = ['bfps']
-        libraries += bfps.install_info['libraries']
-        libraries += ['fftw3_omp'] # Remove if enable mkl
-        libraries += ['fftw3f_omp']
-
+
         command_strings = [bfps.install_info['compiler']]
         command_strings += [self.name + '.cpp', '-o', self.name]
         command_strings += bfps.install_info['extra_compile_args']
@@ -241,19 +192,27 @@ f_install_signal_handler();
         command_strings += ['-L' + ldir
                             for ldir in bfps.install_info['library_dirs']]
         command_strings.append('-L' + bfps.lib_dir)
-        # try:
-        #     usemkl = int(os.environ['USEMKLFFTW'])
-        # except :
-        #     usemkl = False
-        #
-        # if usemkl :
-        #     command_strings.append('--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_gnu_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group')
-        # else :
-        #     libraries += ['fftw3_omp']
-        #     libraries += ['fftw3f_omp']
-
+        libraries = ['bfps']
+        libraries += bfps.install_info['libraries']
+        # libraries += ['fftw3_omp'] # Remove if enable mkl
+        # libraries += ['fftw3f_omp']
         for libname in libraries:
             command_strings += ['-l' + libname]
+
+        usemkl = (os.environ.get('USEMKLFFTW') == 'true')
+
+        if usemkl:
+            print('use MKL (USEMKLFFTW is true)')
+            command_strings.append('-Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a ${MKLROOT}/lib/intel64/libmkl_gnu_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group')
+            command_strings += ['-I${MKLROOT}/include/fftw']
+        else:
+            command_strings += ['-lfftw3_omp']
+            command_strings += ['-lfftw3f_omp']
+
+        command_strings += ['-fopenmp']
         self.write_src()
         print('compiling code with command\n' + ' '.join(command_strings))
@@ -329,14 +288,21 @@ f_install_signal_handler();
                         err_file = err_file + '_' + suffix)
                 os.chdir(self.work_dir)
                 qsub_atoms = ['sbatch']
-                if len(job_id_list) >= 1:
-                    qsub_atoms += ['--dependency=afterok:{0}'.format(job_id_list[-1])]
-                p = subprocess.Popen(
-                        qsub_atoms + [qsub_script_name],
-                        stdout = subprocess.PIPE)
-                out, err = p.communicate()
-                p.terminate()
-                job_id_list.append(int(out.split()[-1]))
+
+                submit = (os.environ.get('SUBMITJOB') == 'true')
+
+                if submit:
+                    if len(job_id_list) >= 1:
+                        qsub_atoms += ['--dependency=afterok:{0}'.format(job_id_list[-1])]
+                    p = subprocess.Popen(
+                            qsub_atoms + [qsub_script_name],
+                            stdout = subprocess.PIPE)
+                    out, err = p.communicate()
+                    p.terminate()
+                    job_id_list.append(int(out.split()[-1]))
                 os.chdir(current_dir)
         elif self.host_info['type'] == 'IBMLoadLeveler':
             suffix = self.simname + '_{0}'.format(iter0)
@@ -363,7 +329,15 @@ f_install_signal_handler();
                         err_file = err_file + '_' + suffix,
                         njobs = njobs)
                 submit_atoms = ['llsubmit']
-                subprocess.call(submit_atoms + [os.path.join(self.work_dir, job_script_name)])
+
+                submit = (os.environ.get('SUBMITJOB') == 'true')
+
+                if submit:
+                    subprocess.call(submit_atoms + [os.path.join(self.work_dir, job_script_name)])
+
         elif self.host_info['type'] == 'pc':
             os.chdir(self.work_dir)
             os.environ['LD_LIBRARY_PATH'] += ':{0}'.format(bfps.lib_dir)
@@ -386,6 +360,12 @@ f_install_signal_handler();
             minutes = None,
             out_file = None,
             err_file = None):
+
+        useibm = (os.environ.get('USEIBMMPI') == 'true')
+
         script_file = open(file_name, 'w')
         script_file.write('# @ shell=/bin/bash\n')
         # error file
@@ -396,7 +376,12 @@ f_install_signal_handler();
         if type(out_file) == type(None):
             out_file = 'out.job.$(jobid)'
         script_file.write('# @ output = ' + os.path.join(self.work_dir, out_file) + '\n')
-        script_file.write('# @ job_type = parallel\n')
+        if not useibm:
+            print('not ibm')
+            script_file.write('# @ job_type = MPICH\n')
+        else:
+            print('ibm')
+            script_file.write('# @ job_type = parallel\n')
         script_file.write('# @ node_usage = not_shared\n')
         script_file.write('# @ notification = complete\n')
         script_file.write('# @ notify_user = $(user)@rzg.mpg.de\n')
@@ -430,7 +415,11 @@ f_install_signal_handler();
             script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks))
         script_file.write('# @ queue\n')
 
+        if useibm:
+            script_file.write('export USEIBMMPI=true;\n')
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        script_file.write('module li\n')
         script_file.write('export OMP_NUM_THREADS={}\n'.format(nb_cpus_per_task))
         script_file.write('LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:' +
@@ -440,13 +429,30 @@ f_install_signal_handler();
         script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
         script_file.write('cd ' + self.work_dir + '\n')
         # script_file.write('cp -s ../*.h5 ./\n')
-        script_file.write('poe ' +
+
+        if not useibm:
+            print('use mpiexec')
+            script_file.write('export KMP_AFFINITY=compact,verbose\n')
+            script_file.write('export I_MPI_PIN_DOMAIN=omp\n')
+            script_file.write('mpiexec.hydra ' +
+                    ' -np {} '.format(int(nprocesses/nb_cpus_per_task)) +
+                    ' -ppn {} '.format(nb_process_per_node) +
+                    ' -ordered-output -prepend-rank ' +
+                    os.path.join(
+                        self.work_dir,
+                        command_atoms[0]) +
+                    ' ' +
+                    ' '.join(command_atoms[1:]) +
+                    '\n')
+        else:
+            script_file.write('poe ' +
                 os.path.join(
                     self.work_dir,
                     command_atoms[0]) +
                 ' ' +
                 ' '.join(command_atoms[1:]) +
                 '\n')
+
         script_file.write('echo "End time is `date`"\n')
         script_file.write('exit 0\n')
         script_file.close()
@@ -505,21 +511,45 @@ f_install_signal_handler();
         if (first_node_tasks > 0):
             script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks))
         script_file.write('# @ queue\n')
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        script_file.write('module li\n')
         script_file.write('LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:' +
-                ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                # ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                bfps.lib_dir +
                 '\n')
         script_file.write('echo "This is step $LOADL_STEP_ID out of {0}"\n'.format(njobs))
         script_file.write('echo "Start time is `date`"\n')
         script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
         # script_file.write('cp -s ../*.h5 ./\n')
         script_file.write('cd ' + self.work_dir + '\n')
-        script_file.write('poe ' +
+
+        useibm = (os.environ.get('USEIBMMPI') == 'true')
+
+        if not useibm:
+            print('use mpiexec')
+            script_file.write('export KMP_AFFINITY=compact,verbose\n')
+            script_file.write('export I_MPI_PIN_DOMAIN=omp\n')
+            script_file.write('mpiexec.hydra ' +
+                    ' -np {} '.format(int(nprocesses/nb_cpus_per_task)) +
+                    ' -ppn {} '.format(nb_process_per_node) +
+                    os.path.join(
+                        self.work_dir,
+                        command_atoms[0]) +
+                    ' ' +
+                    ' '.join(command_atoms[1:]) +
+                    '\n')
+        else:
+            script_file.write('poe ' +
                 os.path.join(
                     self.work_dir,
                     command_atoms[0]) +
                 ' ' +
                 ' '.join(command_atoms[1:]) +
                 '\n')
+
         script_file.write('echo "End time is `date`"\n')
         script_file.write('exit 0\n')
         script_file.close()
@@ -612,6 +642,7 @@ f_install_signal_handler();
             script_file.write('#SBATCH --mail-type=none\n')
             script_file.write('#SBATCH --time={0}:{1:0>2d}:00\n'.format(hours, minutes))
+            script_file.write('source ~/.config/bfps/bashrc\n')
             script_file.write('export OMP_NUM_THREADS={0}\n'.format(int(int(self.host_info['deltanprocs'])/nbprocesspernode)))
             script_file.write('export OMP_PLACES=cores\n')
             script_file.write('LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:' +
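Note on the FPE toggle above: the generated `main()` now enables floating-point exception traps unless the environment variable `BFPS_FPE_OFF` is set to `TRUE`. The sketch below is a hypothetical stand-alone demo of the same glibc-specific `feenableexcept` mechanism (the sample division and the printout are illustrative, not part of the patch):

```cpp
// Minimal sketch, assuming glibc: with traps enabled, an invalid
// operation such as 0.0/0.0 delivers SIGFPE at the faulty statement
// instead of silently producing NaN.
#include <fenv.h>
#include <cstdlib>
#include <cstdio>
#include <string>

int main()
{
    const char *fpe_off = std::getenv("BFPS_FPE_OFF");
    if (fpe_off == nullptr || fpe_off != std::string("TRUE"))
        feenableexcept(FE_INVALID | FE_OVERFLOW); // trap NaN/overflow
    volatile double zero = 0.0;
    double nan_value = zero / zero; // traps here unless BFPS_FPE_OFF=TRUE
    std::printf("only reached with traps off: %g\n", nan_value);
    return 0;
}
```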
diff --git a/bfps/cpp/field.cpp b/bfps/cpp/field.cpp
index 5d9255ac434311c61859fd138747f6297280e46d..2a975613aef28310445f95ad5fd27c6aebdf0698 100644
--- a/bfps/cpp/field.cpp
+++ b/bfps/cpp/field.cpp
@@ -201,6 +201,7 @@ int field<rnumber, be, fc>::io(
         else
             file_id = H5Fcreate(fname.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, plist_id);
     }
+    assert(file_id >= 0);
     H5Pclose(plist_id);
 
     /* check what kind of representation is being used */
@@ -210,7 +211,9 @@ int field<rnumber, be, fc>::io(
                 file_id,
                 dset_name.c_str(),
                 H5P_DEFAULT);
+        assert(dset_id >= 0);
         hid_t dset_type = H5Dget_type(dset_id);
+        assert(dset_type >= 0);
         bool io_for_real = (
                 H5Tequal(dset_type, H5T_IEEE_F32BE) ||
                 H5Tequal(dset_type, H5T_IEEE_F32LE) ||
@@ -300,7 +303,7 @@ int field<rnumber, be, fc>::io(
 
         /* check file space */
         int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL);
-        assert(ndims_fspace == ndim(fc));
+        assert(((unsigned int)(ndims_fspace)) == ndim(fc));
         if (this->real_space_representation)
         {
             for (unsigned int i=0; i<ndim(fc); i++)
@@ -486,7 +489,7 @@ int field<rnumber, be, fc>::io_database(
 
         /* check file space */
         int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL);
-        assert(ndims_fspace == ndim(fc) + 1);
+        assert(ndims_fspace == int(ndim(fc) + 1));
         offset[0] = toffset;
         if (this->real_space_representation)
         {
@@ -637,28 +640,31 @@ void field<rnumber, be, fc>::compute_rspace_stats(
         MPI_Bcast(&nbins, 1, MPI_INT, 0, this->comm);
     }
     assert(nvals == int(max_estimate.size()));
-    double *moments = new double[nmoments*nvals];
+    shared_array<double> local_moments_threaded(nmoments*nvals, [&](double* local_moments){
+        std::fill_n(local_moments, nmoments*nvals, 0);
+        if (nvals == 4) local_moments[3] = max_estimate[3];
+    });
-    shared_array<double> val_tmp_threaded(nvals);
-
-    // Unchanged by the threads
-    double *binsize = new double[nvals];
-    for (int i=0; i<nvals; i++)
-        binsize[i] = 2*max_estimate[i] / nbins;
-
+    shared_array<double> val_tmp_threaded(nvals,[&](double *val_tmp){
+        std::fill_n(val_tmp, nvals, 0);
+    });
     shared_array<ptrdiff_t> local_hist_threaded(nbins*nvals,[&](ptrdiff_t* local_hist){
         std::fill_n(local_hist, nbins*nvals, 0);
     });
+
+    // Unchanged by the threads
+    double *binsize = new double[nvals];
+    for (int i=0; i<nvals; i++)
+        binsize[i] = 2*max_estimate[i] / nbins;
+
     {
         TIMEZONE("field::RLOOP");
         this->RLOOP(
-                [&](hsize_t /*zindex*/, hsize_t /*yindex*/,
-                    ptrdiff_t rindex, hsize_t /*xindex*/){
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
                     double *local_moments = local_moments_threaded.getMine();
                     double *val_tmp = val_tmp_threaded.getMine();
                     ptrdiff_t *local_hist = local_hist_threaded.getMine();
@@ -677,8 +683,9 @@ void field<rnumber, be, fc>::compute_rspace_stats(
                         if (val_tmp[3] > local_moments[9*nvals+3])
                             local_moments[9*nvals+3] = val_tmp[3];
                         int bin = int(floor(val_tmp[3]*2/binsize[3]));
-                        if (bin >= 0 && bin < nbins)
+                        if (bin >= 0 && bin < nbins){
                             local_hist[bin*nvals+3]++;
+                        }
                     }
                     for (unsigned int i=0; i<ncomp(fc); i++)
                     {
@@ -694,16 +701,31 @@ void field<rnumber, be, fc>::compute_rspace_stats(
                         double pow_tmp = 1;
                         for (int i=0; i<nvals; i++){
                             local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp);
-                    }
-                }
-            });
+                        }
+                    }
+                });
+
+        TIMEZONE("FIELD_RLOOP::Merge");
+        local_moments_threaded.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+            if(nvals == int(4) && idx == 0*nvals+3){
+                return std::min(v1, v2);
+            }
+            if(nvals == int(4) && idx == 9*nvals+3){
+                return std::max(v1, v2);
+            }
+            if(idx < int(ncomp(fc))){
+                return std::min(v1, v2);
+            }
+            if((nmoments-1)*nvals <= idx && idx < int((nmoments-1)*nvals+ncomp(fc))){
+                return std::max(v1, v2);
+            }
+            return v1 + v2;
+        });
 
-        TIMEZONE("FIELD_RLOOP::Merge");
-        local_moments_threaded.mergeParallel();
         local_hist_threaded.mergeParallel();
     }
     ptrdiff_t *hist = new ptrdiff_t[nbins*nvals];
+    double *moments = new double[nmoments*nvals];
     {
         TIMEZONE("MPI_Allreduce");
         MPI_Allreduce(
@@ -799,7 +821,7 @@ void field<rnumber, be, fc>::symmetrize()
     {
         for (cc = 0; cc < ncomp(fc); cc++)
             data[cc][1] = 0.0;
-        for (ii = 1; ii < this->clayout->sizes[1]/2; ii++)
+        for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]/2); ii++)
             for (cc = 0; cc < ncomp(fc); cc++)
             {
                 ( *(data + cc + ncomp(fc)*(this->clayout->sizes[1] - ii)*this->clayout->sizes[2]))[0] =
                  (*(data + cc + ncomp(fc)*(                           ii)*this->clayout->sizes[2]))[0];
@@ -812,11 +834,11 @@ void field<rnumber, be, fc>::symmetrize()
     ptrdiff_t yy;
     /*ptrdiff_t tindex;*/
     int ranksrc, rankdst;
-    for (yy = 1; yy < this->clayout->sizes[0]/2; yy++) {
+    for (yy = 1; yy < ptrdiff_t(this->clayout->sizes[0]/2); yy++) {
         ranksrc = this->clayout->rank[0][yy];
         rankdst = this->clayout->rank[0][this->clayout->sizes[0] - yy];
         if (this->clayout->myrank == ranksrc)
-            for (ii = 0; ii < this->clayout->sizes[1]; ii++)
+            for (ii = 0; ii < ptrdiff_t(this->clayout->sizes[1]); ii++)
                 for (cc = 0; cc < ncomp(fc); cc++)
                     for (int imag_comp=0; imag_comp<2; imag_comp++)
                         (*(buffer + ncomp(fc)*ii+cc))[imag_comp] =
@@ -834,7 +856,7 @@ void field<rnumber, be, fc>::symmetrize()
         }
         if (this->clayout->myrank == rankdst)
         {
-            for (ii = 1; ii < this->clayout->sizes[1]; ii++)
+            for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]); ii++)
                 for (cc = 0; cc < ncomp(fc); cc++)
                 {
                     (*(data + ncomp(fc)*((this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[0] =
@@ -1041,3 +1063,11 @@ template void compute_gradient<double, FFTW, THREE, THREExTHREE, SMOOTH>(
         field<double, FFTW, THREE> *,
         field<double, FFTW, THREExTHREE> *);
 
+template void compute_gradient<float, FFTW, ONE, THREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<float, FFTW, ONE> *,
+        field<float, FFTW, THREE> *);
+template void compute_gradient<double, FFTW, ONE, THREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<double, FFTW, ONE> *,
+        field<double, FFTW, THREE> *);
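Note on the histogram logic in `compute_rspace_stats` above: with `binsize = 2*max_estimate/nbins`, a non-negative magnitude `val` lands in bin `floor(val*2/binsize) = floor(val*nbins/max_estimate)`, so `[0, max_estimate)` maps onto `[0, nbins)` and out-of-range values are dropped by the new braces-guarded check. A self-contained sketch of that arithmetic (the sample values are illustrative):

```cpp
// Hypothetical stand-alone check of the binning used in
// compute_rspace_stats: values beyond max_estimate fall outside
// [0, nbins) and are discarded rather than clamped.
#include <cmath>
#include <cstdio>

int main()
{
    const int nbins = 256;
    const double max_estimate = 10.0;
    const double binsize = 2*max_estimate / nbins;
    for (double val : {0.0, 5.0, 9.99, 12.5}) {
        int bin = int(std::floor(val*2/binsize));
        if (bin >= 0 && bin < nbins)
            std::printf("val %6.2f -> bin %d\n", val, bin);
        else
            std::printf("val %6.2f -> outside histogram, dropped\n", val);
    }
    return 0;
}
```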
diff --git a/bfps/cpp/field.hpp b/bfps/cpp/field.hpp
index ca341a33c2d8832f201147614876122db0cfb1b4..20dbbfbef570b08dab83942f0ca4d5a275240fc6 100644
--- a/bfps/cpp/field.hpp
+++ b/bfps/cpp/field.hpp
@@ -29,6 +29,7 @@
 #include <vector>
 #include <string>
 #include "kspace.hpp"
+#include "omputils.hpp"
 
 #ifndef FIELD_HPP
 
@@ -201,17 +202,22 @@ class field
             switch(be)
             {
                 case FFTW:
-                    #pragma omp parallel for schedule(static)
-                    for (hsize_t zindex = 0; zindex < this->rlayout->subsizes[0]; zindex++)
-                    for (hsize_t yindex = 0; yindex < this->rlayout->subsizes[1]; yindex++)
+                    #pragma omp parallel
                     {
-                        ptrdiff_t rindex = (
-                                zindex * this->rlayout->subsizes[1] + yindex)*(
-                                    this->rmemlayout->subsizes[2]);
-                        for (hsize_t xindex = 0; xindex < this->rlayout->subsizes[2]; xindex++)
+                        const hsize_t start = OmpUtils::ForIntervalStart(this->rlayout->subsizes[0]);
+                        const hsize_t end = OmpUtils::ForIntervalEnd(this->rlayout->subsizes[0]);
+
+                        for (hsize_t zindex = start; zindex < end; zindex++)
+                        for (hsize_t yindex = 0; yindex < this->rlayout->subsizes[1]; yindex++)
                         {
-                            expression(rindex, xindex, yindex, zindex);
-                            rindex++;
+                            ptrdiff_t rindex = (
+                                    zindex * this->rlayout->subsizes[1] + yindex)*(
+                                        this->rmemlayout->subsizes[2]);
+                            for (hsize_t xindex = 0; xindex < this->rlayout->subsizes[2]; xindex++)
+                            {
+                                expression(rindex, xindex, yindex, zindex);
+                                rindex++;
+                            }
                         }
                     }
                     break;
diff --git a/bfps/cpp/fluid_solver_base.cpp b/bfps/cpp/fluid_solver_base.cpp
index ba04072e1f4ca7beccb033500f57c3b4368a3bfc..bc224debf424511c476d5331ca6287d1208be162 100644
--- a/bfps/cpp/fluid_solver_base.cpp
+++ b/bfps/cpp/fluid_solver_base.cpp
@@ -34,7 +34,6 @@
 #include "fftw_tools.hpp"
 #include "scope_timer.hpp"
 #include "shared_array.hpp"
-#include "threadsafeupdate.hpp"
 
 template <class rnumber>
 void fluid_solver_base<rnumber>::fill_up_filename(const char *base_name, char *destination)
@@ -236,7 +235,21 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
             );
 
     threaded_local_hist.mergeParallel();
-    threaded_local_moments.mergeParallel();
+    threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+        if(nvals == int(4) && idx == 0*nvals+3){
+            return std::min(v1, v2);
+        }
+        if(nvals == int(4) && idx == 9*nvals+3){
+            return std::max(v1, v2);
+        }
+        if(idx < 3){
+            return std::min(v1, v2);
+        }
+        if((nmoments-1)*nvals <= idx && idx < (nmoments-1)*nvals+3){
+            return std::max(v1, v2);
+        }
+        return v1 + v2;
+    });
 
     double *moments = new double[nmoments*nvals];
 
@@ -367,7 +380,21 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
                 }
             );
 
-    threaded_local_moments.mergeParallel();
+    threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+        if(nvals == int(4) && idx == 0*nvals+3){
+            return std::min(v1, v2);
+        }
+        if(nvals == int(4) && idx == 9*nvals+3){
+            return std::max(v1, v2);
+        }
+        if(idx < 3){
+            return std::min(v1, v2);
+        }
+        if(9*nvals <= idx && idx < 9*nvals+3){
+            return std::max(v1, v2);
+        }
+        return v1 + v2;
+    });
     threaded_local_hist.mergeParallel();
 
     MPI_Allreduce(
@@ -510,10 +537,13 @@ fluid_solver_base<rnumber>::fluid_solver_base(
     std::fill_n(this->kshell, this->nshells, 0.0);
     this->nshell = new int64_t[this->nshells];
     std::fill_n(this->nshell, this->nshells, 0);
-    double *kshell_local = new double[this->nshells];
-    std::fill_n(kshell_local, this->nshells, 0.0);
-    int64_t *nshell_local = new int64_t[this->nshells];
-    std::fill_n(nshell_local, this->nshells, 0.0);
+
+    shared_array<double> kshell_local_threaded(this->nshells,[&](double* kshell_local){
+        std::fill_n(kshell_local, this->nshells, 0.0);
+    });
+    shared_array<int64_t> nshell_local_threaded(this->nshells,[&](int64_t* nshell_local){
+        std::fill_n(nshell_local, this->nshells, 0);
+    });
 
     std::vector<std::unordered_map<int, double>> Fourier_filter_threaded(omp_get_max_threads());
 
@@ -525,13 +555,15 @@ fluid_solver_base<rnumber>::fluid_solver_base(
             if (k2 < this->kM2)
             {
                 double knorm = sqrt(k2);
-                ThreadSafeUpdate(nshell_local[int(knorm/this->dk)]) += nxmodes;
-                ThreadSafeUpdate(kshell_local[int(knorm/this->dk)]) += nxmodes*knorm;
+                nshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes;
+                kshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes*knorm;
             }
             Fourier_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));}
             );
 
     // Merge results
+    nshell_local_threaded.mergeParallel();
+    kshell_local_threaded.mergeParallel();
     for(int idxMerge = 0 ; idxMerge < int(Fourier_filter_threaded.size()) ; ++idxMerge){
         for(const auto kv : Fourier_filter_threaded[idxMerge]){
             this->Fourier_filter[kv.first] = kv.second;
@@ -539,12 +571,12 @@ fluid_solver_base<rnumber>::fluid_solver_base(
     }
 
     MPI_Allreduce(
-            (void*)(nshell_local),
+            (void*)(nshell_local_threaded.getMasterData()),
             (void*)(this->nshell),
             this->nshells,
             MPI_INT64_T, MPI_SUM, this->cd->comm);
     MPI_Allreduce(
-            (void*)(kshell_local),
+            (void*)(kshell_local_threaded.getMasterData()),
            (void*)(this->kshell),
            this->nshells,
            MPI_DOUBLE, MPI_SUM, this->cd->comm);
@@ -552,8 +584,6 @@ fluid_solver_base<rnumber>::fluid_solver_base(
     {
         this->kshell[n] /= this->nshell[n];
    }
-    delete[] nshell_local;
-    delete[] kshell_local;
 }
 
 template <class rnumber>
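Note on the merge functors above: they encode the layout of the per-thread moments buffer — row 0 holds per-component minima, row `nmoments-1` holds maxima, and everything in between is a sum of powers — so the slot-wise reduction must be min, max, or `+` depending on the index. A small self-contained sketch of that slot-dependent reduction (the buffer sizes and values are illustrative):

```cpp
// Hypothetical two-buffer reduction mirroring the functor logic:
//   idx <  nvals                 -> minimum (first row of the table)
//   idx >= (nmoments-1)*nvals    -> maximum (last row)
//   otherwise                    -> sum (accumulated powers)
#include <algorithm>
#include <cstdio>

int main()
{
    const int nmoments = 10, nvals = 3;
    double t0[nmoments*nvals], t1[nmoments*nvals];
    for (int i = 0; i < nmoments*nvals; i++) { t0[i] = i; t1[i] = 2.0*i; }
    for (int idx = 0; idx < nmoments*nvals; idx++) {
        if (idx < nvals)
            t0[idx] = std::min(t0[idx], t1[idx]);       // global minima
        else if (idx >= (nmoments-1)*nvals)
            t0[idx] = std::max(t0[idx], t1[idx]);       // global maxima
        else
            t0[idx] += t1[idx];                         // moment sums
    }
    std::printf("min slot %g, sum slot %g, max slot %g\n",
                t0[0], t0[nvals], t0[(nmoments-1)*nvals]);
    return 0;
}
```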
diff --git a/bfps/cpp/fluid_solver_base.hpp b/bfps/cpp/fluid_solver_base.hpp
index 02fd8173e87678fe68501fb9677b348bae23a8c6..078c1943018e37d59c3e8a179e74846c64e82c96 100644
--- a/bfps/cpp/fluid_solver_base.hpp
+++ b/bfps/cpp/fluid_solver_base.hpp
@@ -31,6 +31,7 @@
 #include "base.hpp"
 #include "field_descriptor.hpp"
 #include "scope_timer.hpp"
+#include "omputils.hpp"
 
 #ifndef FLUID_SOLVER_BASE
 
@@ -140,15 +141,19 @@ template <class ObjectType, class FuncType>
 void CLOOP(ObjectType* obj, FuncType expression)
 {
     TIMEZONE("CLOOP");
-    #pragma omp parallel for schedule(static)
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
-        ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
-        for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
-        for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
-        {
-            expression(cindex, xindex, yindex, zindex);
-            cindex++;
-        }
+    #pragma omp parallel
+    {
+        const ptrdiff_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]);
+        const ptrdiff_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]);
+
+        for (ptrdiff_t yindex = start; yindex < end; yindex++){
+            ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
+            for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+            for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+            {
+                expression(cindex, xindex, yindex, zindex);
+                cindex++;
+            }
+        }
     }
 }
 
@@ -156,20 +161,24 @@ template <class ObjectType, class FuncType>
 void CLOOP_NXMODES(ObjectType* obj, FuncType expression)
 {
     TIMEZONE("CLOOP_NXMODES");
-    #pragma omp parallel for schedule(static)
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
-        ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
-        for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
-        {
-            int nxmodes = 1;
-            ptrdiff_t xindex = 0;
-            expression;
-            cindex++;
-            nxmodes = 2;
-            for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+    #pragma omp parallel
+    {
+        const ptrdiff_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]);
+        const ptrdiff_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]);
+
+        for (ptrdiff_t yindex = start; yindex < end; yindex++){
+            ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
+            for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
             {
+                int nxmodes = 1;
+                ptrdiff_t xindex = 0;
                 expression();
                 cindex++;
+                nxmodes = 2;
+                for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    expression();
+                    cindex++;
+                }
             }
         }
     }
 }
 
@@ -180,17 +189,21 @@ template <class ObjectType, class FuncType>
 void CLOOP_K2(ObjectType* obj, FuncType expression)
 {
     TIMEZONE("CLOOP_K2");
-    #pragma omp parallel for schedule(static)
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
-        ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
-        for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
-        for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
-        {
-            double k2 = (obj->kx[xindex]*obj->kx[xindex] +
-                         obj->ky[yindex]*obj->ky[yindex] +
-                         obj->kz[zindex]*obj->kz[zindex]);
-            expression(cindex, xindex, yindex, zindex, k2);
-            cindex++;
+    #pragma omp parallel
+    {
+        const ptrdiff_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]);
+        const ptrdiff_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]);
+
+        for (ptrdiff_t yindex = start; yindex < end; yindex++){
+            ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
+            for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+            for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+            {
+                double k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                             obj->ky[yindex]*obj->ky[yindex] +
+                             obj->kz[zindex]*obj->kz[zindex]);
+                expression(cindex, xindex, yindex, zindex, k2);
+                cindex++;
+            }
         }
     }
 }
 
@@ -199,26 +212,30 @@ void CLOOP_K2(ObjectType* obj, FuncType expression)
 template <class ObjectType, class FuncType>
 void CLOOP_K2_NXMODES(ObjectType* obj, FuncType expression)
 {
-    #pragma omp parallel for schedule(static)
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
-        ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
-        for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
-        {
-            int nxmodes = 1;
-            ptrdiff_t xindex = 0;
-            double k2 = (obj->kx[xindex]*obj->kx[xindex] +
-                         obj->ky[yindex]*obj->ky[yindex] +
-                         obj->kz[zindex]*obj->kz[zindex]);
-            expression(cindex, xindex, yindex, zindex, k2, nxmodes);
-            cindex++;
-            nxmodes = 2;
-            for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+    #pragma omp parallel
+    {
+        const ptrdiff_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]);
+        const ptrdiff_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]);
+
+        for (ptrdiff_t yindex = start; yindex < end; yindex++){
+            ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
+            for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
             {
+                int nxmodes = 1;
+                ptrdiff_t xindex = 0;
                 double k2 = (obj->kx[xindex]*obj->kx[xindex] +
                              obj->ky[yindex]*obj->ky[yindex] +
                              obj->kz[zindex]*obj->kz[zindex]);
                 expression(cindex, xindex, yindex, zindex, k2, nxmodes);
                 cindex++;
+                nxmodes = 2;
+                for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    double k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                                 obj->ky[yindex]*obj->ky[yindex] +
+                                 obj->kz[zindex]*obj->kz[zindex]);
+                    expression(cindex, xindex, yindex, zindex, k2, nxmodes);
+                    cindex++;
+                }
             }
         }
     }
 }
 
@@ -228,15 +245,19 @@ void CLOOP_K2_NXMODES(ObjectType* obj, FuncType expression)
 template <class ObjectType, class FuncType>
 void RLOOP(ObjectType* obj, FuncType expression)
 {
-    #pragma omp parallel for schedule(static)
-    for (int zindex = 0; zindex < obj->rd->subsizes[0]; zindex++)
-    for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++)
+    #pragma omp parallel
     {
-        ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2);
-        for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++)
+        const ptrdiff_t start = OmpUtils::ForIntervalStart(obj->rd->subsizes[0]);
+        const ptrdiff_t end = OmpUtils::ForIntervalEnd(obj->rd->subsizes[0]);
+
+        for (int zindex = start; zindex < end; zindex++)
+        for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++)
         {
-            expression(rindex, xindex, yindex, zindex);
-            rindex++;
+            ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2);
+            for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++)
+            {
+                expression(rindex, xindex, yindex, zindex);
+                rindex++;
+            }
         }
     }
 }
diff --git a/bfps/cpp/kspace.cpp b/bfps/cpp/kspace.cpp
index 9f76b2138eb0c4ae5e6cc2e2c7dc2fd974b66389..c00226486389f69aca7653ecf8309c59af4f453f 100644
--- a/bfps/cpp/kspace.cpp
+++ b/bfps/cpp/kspace.cpp
@@ -30,7 +30,6 @@
 #include "kspace.hpp"
 #include "scope_timer.hpp"
 #include "shared_array.hpp"
-#include "threadsafeupdate.hpp"
 
 
 template <field_backend be, kspace_dealias_type dt>
@@ -109,10 +108,13 @@ kspace<be, dt>::kspace(
     this->nshells = int(this->kM / this->dk) + 2;
     this->kshell.resize(this->nshells, 0);
     this->nshell.resize(this->nshells, 0);
-    std::vector<double> kshell_local;
-    kshell_local.resize(this->nshells, 0);
-    std::vector<int64_t> nshell_local;
-    nshell_local.resize(this->nshells, 0);
+
+    shared_array<double> kshell_local_thread(this->nshells,[&](double* kshell_local){
+        std::fill_n(kshell_local, this->nshells, 0);
+    });
+    shared_array<int64_t> nshell_local_thread(this->nshells,[&](int64_t* nshell_local){
+        std::fill_n(nshell_local, this->nshells, 0);
+    });
 
     std::vector<std::unordered_map<int, double>> dealias_filter_threaded(omp_get_max_threads());
 
@@ -126,8 +128,8 @@ kspace<be, dt>::kspace(
             if (k2 < this->kM2)
             {
                 double knorm = sqrt(k2);
-                ThreadSafeUpdate(nshell_local[int(knorm/this->dk)]) += nxmodes;
-                ThreadSafeUpdate(kshell_local[int(knorm/this->dk)]) += nxmodes*knorm;
+                nshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes;
+                kshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes*knorm;
             }
             if (dt == SMOOTH){
                 dealias_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));
@@ -135,6 +137,10 @@ kspace<be, dt>::kspace(
         });
 
     // Merge results
+
+    kshell_local_thread.mergeParallel();
+    nshell_local_thread.mergeParallel();
+
     if (dt == SMOOTH){
         for(int idxMerge = 0 ; idxMerge < int(dealias_filter_threaded.size()) ; ++idxMerge){
             for(const auto kv : dealias_filter_threaded[idxMerge]){
@@ -144,12 +150,12 @@ kspace<be, dt>::kspace(
     }
 
     MPI_Allreduce(
-            &nshell_local.front(),
+            nshell_local_thread.getMasterData(),
             &this->nshell.front(),
             this->nshells,
             MPI_INT64_T, MPI_SUM, this->layout->comm);
     MPI_Allreduce(
-            &kshell_local.front(),
+            kshell_local_thread.getMasterData(),
             &this->kshell.front(),
             this->nshells,
             MPI_DOUBLE, MPI_SUM, this->layout->comm);
@@ -206,7 +212,7 @@ void kspace<be, dt>::dealias(typename fftw_interface<rnumber>::complex *__restri
                 double tval = this->dealias_filter[int(round(k2 / this->dk2))];
                 for (int tcounter=0; tcounter<2*ncomp(fc); tcounter++)
                     ((rnumber*)a)[2*ncomp(fc)*cindex + tcounter] *= tval;
-            });
+                });
             break;
     }
 }
@@ -225,21 +231,21 @@ void kspace<be, dt>::force_divfree(typename fftw_interface<rnumber>::complex *__
             double k2){
                 if (k2 > 0)
                 {
-        typename fftw_interface<rnumber>::complex tval;
-        tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) +
-                   this->ky[yindex]*((*(a + cindex*3+1))[0]) +
-                   this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2;
-        tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) +
-                   this->ky[yindex]*((*(a + cindex*3+1))[1]) +
-                   this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2;
-        for (int imag_part=0; imag_part<2; imag_part++)
-        {
-            a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex];
-            a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex];
-            a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex];
-        }
+                    typename fftw_interface<rnumber>::complex tval;
+                    tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) +
+                               this->ky[yindex]*((*(a + cindex*3+1))[0]) +
+                               this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2;
+                    tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) +
+                               this->ky[yindex]*((*(a + cindex*3+1))[1]) +
+                               this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2;
+                    for (int imag_part=0; imag_part<2; imag_part++)
+                    {
+                        a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex];
+                        a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex];
+                        a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex];
+                    }
+                }
             }
-    }
             );
     if (this->layout->myrank == this->layout->rank[0][0])
         std::fill_n((rnumber*)(a), 6, 0.0);
@@ -273,10 +279,11 @@ void kspace<be, dt>::cospectrum(
             double* spec_local = spec_local_thread.getMine();
             int tmp_int = int(sqrt(k2) / this->dk)*ncomp(fc)*ncomp(fc);
             for (hsize_t i=0; i<ncomp(fc); i++)
-            for (hsize_t j=0; j<ncomp(fc); j++)
+            for (hsize_t j=0; j<ncomp(fc); j++){
                 spec_local[tmp_int + i*ncomp(fc)+j] += nxmodes * (
-                    (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) +
-                    (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1]));
+                        (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) +
+                        (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1]));
+            }
         }
     });
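Note on the shell accumulation in the `kspace` constructor above: the flattened patch text had the two `getMine()` updates swapped relative to the removed `ThreadSafeUpdate` lines (the mode count was going into `kshell` and the `|k|` sum into `nshell`); the rewrite above restores the intended assignment, matching `fluid_solver_base.cpp`. A hypothetical single-thread check of the invariant the code relies on — `kshell[n]/nshell[n]` is the mean wavenumber of shell `n`:

```cpp
// Hypothetical check: nshell counts modes per shell, kshell sums |k|,
// so after the final division each shell stores its mean wavenumber.
#include <cstdio>
#include <vector>

int main()
{
    const double dk = 1.0;
    const int nshells = 4;
    std::vector<long> nshell(nshells, 0);
    std::vector<double> kshell(nshells, 0.0);
    const double knorms[] = {0.5, 1.2, 1.4, 2.9};
    for (double knorm : knorms) {
        const int shell = int(knorm/dk);
        const int nxmodes = 2;            // weight of a kx > 0 mode
        nshell[shell] += nxmodes;         // mode count
        kshell[shell] += nxmodes*knorm;   // |k| sum, not the other way round
    }
    for (int n = 0; n < nshells; n++)
        if (nshell[n] > 0)
            std::printf("shell %d: mean |k| = %g\n", n, kshell[n]/nshell[n]);
    return 0;
}
```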
diff --git a/bfps/cpp/kspace.hpp b/bfps/cpp/kspace.hpp
index be9e312c4991556a4307ab426ddee30dea6e8051..efac5d9cc539b0e4f2de96bd3751a553e123d6b6 100644
--- a/bfps/cpp/kspace.hpp
+++ b/bfps/cpp/kspace.hpp
@@ -28,6 +28,7 @@
 #include <unordered_map>
 #include <vector>
 #include <string>
+#include "omputils.hpp"
 
 #include "fftw_interface.hpp"
 #include "field_layout.hpp"
@@ -88,14 +89,19 @@ class kspace
         template <class func_type>
         void CLOOP(func_type expression)
         {
-            #pragma omp parallel for schedule(static)
-            for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
-                ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
-                for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
-                    for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
-                    {
-                        expression(cindex, xindex, yindex, zindex);
-                        cindex++;
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[0]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[0]);
+
+                for (hsize_t yindex = start; yindex < end; yindex++){
+                    ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
+                    for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
+                        for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            expression(cindex, xindex, yindex, zindex);
+                            cindex++;
+                        }
                     }
                 }
             }
         }
@@ -103,17 +109,25 @@ class kspace
         template <class func_type>
         void CLOOP_K2(func_type expression)
         {
-            #pragma omp parallel for schedule(static)
-            for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
-                ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
-                for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
-                    for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
-                    {
-                        double k2 = (this->kx[xindex]*this->kx[xindex] +
-                                     this->ky[yindex]*this->ky[yindex] +
-                                     this->kz[zindex]*this->kz[zindex]);
-                        expression(cindex, xindex, yindex, zindex, k2);
-                        cindex++;
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[0]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[0]);
+
+                for (hsize_t yindex = start; yindex < end; yindex++){
+                    ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
+                    for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
+                        for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            double k2 = (this->kx[xindex]*this->kx[xindex] +
+                                         this->ky[yindex]*this->ky[yindex] +
+                                         this->kz[zindex]*this->kz[zindex]);
+                            expression(cindex, xindex, yindex, zindex, k2);
+                            cindex++;
+                        }
                     }
                 }
             }
         }
@@ -121,24 +135,32 @@ class kspace
         template <class func_type>
         void CLOOP_K2_NXMODES(func_type expression)
         {
-            #pragma omp parallel for schedule(static)
-            for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
-                ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
-                for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
-                    hsize_t xindex = 0;
-                    double k2 = (
-                            this->kx[xindex]*this->kx[xindex] +
-                            this->ky[yindex]*this->ky[yindex] +
-                            this->kz[zindex]*this->kz[zindex]);
-                    expression(cindex, xindex, yindex, zindex, k2, 1);
-                    cindex++;
-                    for (xindex = 1; xindex < this->layout->subsizes[2]; xindex++)
-                    {
-                        k2 = (this->kx[xindex]*this->kx[xindex] +
-                              this->ky[yindex]*this->ky[yindex] +
-                              this->kz[zindex]*this->kz[zindex]);
-                        expression(cindex, xindex, yindex, zindex, k2, 2);
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[0]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[0]);
+
+                for (hsize_t yindex = start; yindex < end; yindex++){
+                    ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2];
+                    for (hsize_t zindex = 0; zindex < this->layout->subsizes[1]; zindex++){
+                        hsize_t xindex = 0;
+                        double k2 = (
+                                this->kx[xindex]*this->kx[xindex] +
+                                this->ky[yindex]*this->ky[yindex] +
+                                this->kz[zindex]*this->kz[zindex]);
+                        expression(cindex, xindex, yindex, zindex, k2, 1);
                         cindex++;
+                        for (xindex = 1; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            k2 = (this->kx[xindex]*this->kx[xindex] +
+                                  this->ky[yindex]*this->ky[yindex] +
+                                  this->kz[zindex]*this->kz[zindex]);
+                            expression(cindex, xindex, yindex, zindex, k2, 2);
+                            cindex++;
+                        }
                     }
                 }
             }
         }
diff --git a/bfps/cpp/omputils.hpp b/bfps/cpp/omputils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cdd6c6c173c7cf002b72e0c1a7aebcf727f2d33e
--- /dev/null
+++ b/bfps/cpp/omputils.hpp
@@ -0,0 +1,27 @@
+#ifndef OMPUTILS_HPP
+#define OMPUTILS_HPP
+
+#include <omp.h>
+
+namespace OmpUtils{
+
+template <class IndexType>
+inline IndexType ForIntervalStart(const IndexType size){
+    const double chunk = double(size)/double(omp_get_num_threads());
+    const IndexType start = IndexType(chunk*double(omp_get_thread_num()));
+    return start;
+}
+
+template <class IndexType>
+inline IndexType ForIntervalEnd(const IndexType size){
+    const double chunk = double(size)/double(omp_get_num_threads());
+    const IndexType end = (omp_get_thread_num() == omp_get_num_threads()-1) ?
+                          size:
+                          IndexType(chunk*double(omp_get_thread_num()+1));
+    return end;
+}
+
+}
+
+
+#endif
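Note on `omputils.hpp` above: `ForIntervalStart`/`ForIntervalEnd` split `[0, size)` into one contiguous chunk per OpenMP thread, with the last thread absorbing the rounding remainder; the rewritten CLOOP/RLOOP helpers (and the kspace loops above, which now use the same helpers instead of duplicating the chunk arithmetic inline) call them inside a plain `#pragma omp parallel` region instead of `parallel for`. A minimal usage sketch, assuming `omputils.hpp` is on the include path:

```cpp
// Minimal sketch: each thread fills exactly its own contiguous slice,
// reproducing the slab decomposition used by the rewritten loops.
#include <omp.h>
#include <cstdio>
#include <vector>
#include "omputils.hpp"

int main()
{
    const long size = 1000;
    std::vector<int> owner(size, -1);
    #pragma omp parallel
    {
        const long start = OmpUtils::ForIntervalStart(size);
        const long end = OmpUtils::ForIntervalEnd(size);
        for (long i = start; i < end; i++)
            owner[i] = omp_get_thread_num(); // no two threads share a cell
    }
    long unassigned = 0;
    for (long i = 0; i < size; i++)
        if (owner[i] < 0) unassigned++;
    std::printf("unassigned cells: %ld (expect 0)\n", unassigned);
    return 0;
}
```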
diff --git a/bfps/cpp/shared_array.hpp b/bfps/cpp/shared_array.hpp
index e56389f83c407707f31410fc07eaacadbfc0fb24..1951e2f9838ccf37367d859206453d3db91e8e19 100644
--- a/bfps/cpp/shared_array.hpp
+++ b/bfps/cpp/shared_array.hpp
@@ -9,10 +9,9 @@ template <class ValueType>
 class shared_array{
     int currentNbThreads;
-    ValueType** values;
+    ValueType** __restrict__ values;
     size_t dim;
 
-    omp_lock_t locker;
     std::function<void(ValueType*)> initFunc;
 
     bool hasBeenMerged;
@@ -26,7 +25,6 @@ public:
         for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
             values[idxThread] = nullptr;
         }
-        omp_init_lock(&locker);
     }
 
     shared_array(const size_t inDim, std::function<void(ValueType*)> inInitFunc)
@@ -35,7 +33,6 @@ public:
     }
 
     ~shared_array(){
-        omp_destroy_lock(&locker);
         for(int idxThread = 0 ; idxThread < currentNbThreads ; ++idxThread){
             delete[] values[idxThread];
         }
@@ -64,70 +61,37 @@ public:
         }
         hasBeenMerged = true;
     }
-
-    void mergeParallel(){
-        /*#pragma omp parallel
-        {
-            int mergeFlag = 1;
-            while(mergeFlag < currentNbThreads) mergeFlag <<= 1;
-
-            for( ; mergeFlag != 0 ; mergeFlag >>= 1){
-                if(omp_get_thread_num() < mergeFlag){
-                    const int otherThread = mergeFlag + omp_get_thread_num();
-                    if(otherThread < currentNbThreads && values[otherThread]){
-                        if(!values[omp_get_thread_num()]){
-                            values[omp_get_thread_num()] = new ValueType[dim];
-                            ValueType* __restrict__ dest = values[omp_get_thread_num()];
-                            const ValueType* __restrict__ src = values[otherThread];
-                            for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
-                                dest[idxVal] = src[idxVal];
-                            }
-                        }
-                        else{
-                            ValueType* __restrict__ dest = values[omp_get_thread_num()];
-                            const ValueType* __restrict__ src = values[otherThread];
-                            for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
-                                dest[idxVal] += src[idxVal];
-                            }
-                        }
-                    }
-                }
-                #pragma omp barrier
-            }
-        }*/
-        for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
-            if(values[idxThread]){
-                ValueType* __restrict__ dest = values[0];
-                const ValueType* __restrict__ src = values[idxThread];
-                for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
-                    dest[idxVal] += src[idxVal];
-                }
-            }
-        }
+
+    template <class Func>
+    void merge(Func func){
+        ValueType* __restrict__ dest = values[0];
+        for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
+            if(values[idxThread]){
+                const ValueType* __restrict__ src = values[idxThread];
+                for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
+                    dest[idxVal] = func(idxVal, dest[idxVal], src[idxVal]);
+                }
+            }
+        }
         hasBeenMerged = true;
     }
 
+    void mergeParallel(){
+        merge(); // serial fallback; a truly parallel merge is not implemented yet
+    }
+
+    template <class Func>
+    void mergeParallel(Func func){
+        merge(func); // serial fallback; a truly parallel merge is not implemented yet
+    }
+
     void setInitFunction(std::function<void(ValueType*)> inInitFunc){
         initFunc = inInitFunc;
         initFunc(values[0]);
     }
 
     ValueType* getMine(){
-        if(omp_get_num_threads() > currentNbThreads){
-            omp_set_lock(&locker);
-            if(omp_get_num_threads() > currentNbThreads){
-                ValueType** newValues = new ValueType*[omp_get_num_threads()];
-                for(int idxThread = 0 ; idxThread < currentNbThreads ; ++idxThread){
-                    newValues[idxThread] = values[idxThread];
-                }
-                for(int idxThread = currentNbThreads ; idxThread < omp_get_num_threads() ; ++idxThread){
-                    newValues[idxThread] = nullptr;
-                }
-                values = newValues;
-                currentNbThreads = omp_get_num_threads();
-            }
-            omp_unset_lock(&locker);
-        }
+        assert(omp_get_thread_num() < currentNbThreads);
 
         if(values[omp_get_thread_num()] == nullptr){
             ValueType* myValue = new ValueType[dim];
@@ -135,16 +99,11 @@ public:
             if(initFunc){
                 initFunc(myValue);
             }
-            omp_set_lock(&locker);
             values[omp_get_thread_num()] = myValue;
-            omp_unset_lock(&locker);
-            return myValue;
+            return myValue;
         }
 
-        omp_set_lock(&locker);
-        ValueType* myValue = values[omp_get_thread_num()];
-        omp_unset_lock(&locker);
-        return myValue;
+        return values[omp_get_thread_num()];
     }
 };
diff --git a/bfps/cpp/threadsafeupdate.hpp b/bfps/cpp/threadsafeupdate.hpp
deleted file mode 100644
index 72177a4d9556e99ba08361a800f40d75fb31b919..0000000000000000000000000000000000000000
--- a/bfps/cpp/threadsafeupdate.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef THREADSAFEUPDATE_HPP
-#define THREADSAFEUPDATE_HPP
-
-template <class NumType>
-class ThreadSafeUpdateCore{
-    NumType& num;
-public:
-    ThreadSafeUpdateCore(NumType& inNum) : num(inNum){}
-
-    ThreadSafeUpdateCore& operator+=(const NumType& inNum){
-        #pragma omp atomic update
-        num += inNum;
-        return *this;
-    }
-
-    ThreadSafeUpdateCore& operator-=(const NumType& inNum){
-        #pragma omp atomic update
-        num -= inNum;
-        return *this;
-    }
-
-    ThreadSafeUpdateCore& operator*=(const NumType& inNum){
-        #pragma omp atomic update
-        num *= inNum;
-        return *this;
-    }
-
-    ThreadSafeUpdateCore& operator/=(const NumType& inNum){
-        #pragma omp atomic update
-        num /= inNum;
-        return *this;
-    }
-};
-
-template <class NumType>
-ThreadSafeUpdateCore<NumType> ThreadSafeUpdate(NumType& inNum){
-    return ThreadSafeUpdateCore<NumType>(inNum);
-}
-
-
-#endif
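Note on `shared_array` above: with the lock removed, the array of per-thread buffers is sized once at construction, each thread accumulates into its own buffer via `getMine()`, and `merge()` folds all per-thread buffers into buffer 0 with a caller-supplied slot-wise reduction. A minimal usage sketch, assuming construction happens outside the parallel region and using only the API visible in this patch (`getMine`, `merge`, `getMasterData`):

```cpp
// Minimal sketch: lock-free per-thread accumulation, then a serial
// slot-wise merge into the master buffer (plain sum here, like the
// no-functor mergeParallel()).
#include <omp.h>
#include <algorithm>
#include <cstdio>
#include "shared_array.hpp"

int main()
{
    const size_t dim = 8;
    shared_array<double> acc(dim, [dim](double *data){
        std::fill_n(data, dim, 0.0);   // init applied to each new buffer
    });
    #pragma omp parallel
    {
        double *mine = acc.getMine();  // this thread's private buffer
        for (size_t i = 0; i < dim; i++)
            mine[i] += 1.0;
    }
    acc.merge([](const int /*idx*/, const double &a, const double &b) -> double {
        return a + b;
    });
    std::printf("acc[0] = %g (expect the number of threads)\n",
                acc.getMasterData()[0]);
    return 0;
}
```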
diff --git a/bfps/cpp/vorticity_equation.cpp b/bfps/cpp/vorticity_equation.cpp
index 3a9b1750e86d73391df91863ccf4af93da10a3c6..9a0c0f29b32076bb222a94d162e6f191f9946813 100644
--- a/bfps/cpp/vorticity_equation.cpp
+++ b/bfps/cpp/vorticity_equation.cpp
@@ -302,7 +302,6 @@ void vorticity_equation<rnumber, be>::step(double dt)
 {
     DEBUG_MSG("vorticity_equation::step\n");
     TIMEZONE("vorticity_equation::step");
-    double factor0, factor1;
     *this->v[1] = 0.0;
     this->omega_nonlin(0);
     this->kk->CLOOP_K2(
@@ -313,6 +312,7 @@ void vorticity_equation<rnumber, be>::step(double dt)
                 double k2){
                 if (k2 <= this->kk->kM2)
                 {
+                    double factor0;
                     factor0 = exp(-this->nu * k2 * dt);
                     for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
                         this->v[1]->cval(cindex,cc,i) = (
@@ -334,6 +334,7 @@ void vorticity_equation<rnumber, be>::step(double dt)
                 double k2){
                 if (k2 <= this->kk->kM2)
                 {
+                    double factor0, factor1;
                     factor0 = exp(-this->nu * k2 * dt/2);
                     factor1 = exp( this->nu * k2 * dt/2);
                     for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
@@ -358,6 +359,7 @@ void vorticity_equation<rnumber, be>::step(double dt)
                 double k2){
                 if (k2 <= this->kk->kM2)
                 {
+                    double factor0;
                     factor0 = exp(-this->nu * k2 * dt * 0.5);
                     for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
                         this->v[3]->cval(cindex,cc,i) = (
diff --git a/setup.py b/setup.py
index f5d1c1c6cde76962de216a5d19ecb3478246ec87..bd7714f86f4604261377dc515ce100c5ac81614d 100644
--- a/setup.py
+++ b/setup.py
@@ -116,6 +116,8 @@ src_file_list = ['vorticity_equation',
 header_list = (['cpp/base.hpp'] +
                ['cpp/fftw_interface.hpp'] +
                ['cpp/bfps_timer.hpp'] +
+               ['cpp/omputils.hpp'] +
+               ['cpp/shared_array.hpp'] +
                ['cpp/' + fname + '.hpp'
                 for fname in src_file_list])
 
@@ -127,10 +129,23 @@ with open('MANIFEST.in', 'w') as manifest_in_file:
 
 ### libraries
-libraries = ['fftw3_mpi',
-             'fftw3',
-             'fftw3f_mpi',
-             'fftw3f']
+# libraries = ['fftw3_mpi',
+#              'fftw3',
+#              'fftw3f_mpi',
+#              'fftw3f']
+
+usemkl = (os.environ.get('USEMKLFFTW') == 'true')
+
+if usemkl:
+    print('Use mkl TODO!')
+    libraries = []  # MKL archives are added on the link line in _code.py instead
+else:
+    libraries = ['fftw3_mpi',
+                 'fftw3',
+                 'fftw3f_mpi',
+                 'fftw3f']
+
 libraries += extra_libraries