diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..921ffeda512e71d1a70c2797e5c676f80967aede --- /dev/null +++ b/AUTHORS @@ -0,0 +1,6 @@ +All people who contributed to bfps, in order of the date of their first +contribution. + +Cristian C Lalescu <Cristian.Lalescu@ds.mpg.de> +Dimitar Vlaykov +Berenger Bramas diff --git a/README.rst b/README.rst index 0379bc61d93b1a88baaf8c0d757c0092dbb6361a..ddb9f2447db919248100368a9a08b13297d5e3a4 100644 --- a/README.rst +++ b/README.rst @@ -12,11 +12,14 @@ Parameters and statistics are stored in HDF5 format, together with code information, so simulation data should be "future proof" --- suggestions of possible improvements to the current approach are always welcome. +The primary aim of bfps is to reduce the time spent on setting up and +babysitting DNS, as well as to simplify the analysis of the generated +data. The wish is that this Python package provides an easy and general way of constructing efficient specialized DNS C++ codes for different turbulence problems encountered in research. At the same time, the package should provide a unified way of -postprocessing data, and accessing the postprocessing results. +postprocessing, and accessing the postprocessing results. The code therefore consists of two main parts: the pure C++ code, a set of loosely related "building blocks", and the Python code, which can generate C++ code using the pure classes, but with a significant degree @@ -34,10 +37,10 @@ the user's machine, or submitted to a queue on a cluster. Installation ------------ -So far, the code has been run on an ubuntu 14.04 machine, an opensuse -13.2 desktop, and a reasonably standard linux cluster (biggest run so -far was 1344^3 on 16 nodes of 12 cores each, with about 24 seconds per -time step). +So far, the code has been run on laptops, desktops, and a couple of +clusters (biggest run so far was 1536^3 on 16 nodes of 32 cores each, +with about 11 seconds per time step, for a simple incompressible +Navier-Stokes problem). Postprocessing data may not be very computationally intensive, depending on the amount of data involved. @@ -55,21 +58,21 @@ Use a console; navigate to the ``bfps`` folder, and type: **Full installation** If you want to run simulations on the machine where you're installing, -you will need to call `build` before installing. +you will need to call `compile_library` before installing. Your machine needs to have an MPI compiler installed, the HDF5 C library and FFTW >= 3.4. The file `machine_settings_py.py` should be modified -appropriately for your machine (otherwise the `build` command will most +appropriately for your machine (otherwise the `compile_library` command will most likely fail). This file will be copied the first time you run `setup.py` into -`$HOME/.config/bfps/machine_settings.py`, where it will be imported from -afterwards. -You may, obviously, edit it afterwards and rerun the build command as +`$HOME/.config/bfps/machine_settings.py`, **where it will be imported from +afterwards** --- any future edits **must** be made to the new file. +You may, obviously, edit it afterwards and rerun the `compile_library` command as needed. .. code:: bash - python setup.py build + python setup.py compile_library python setup.py install ------------- @@ -99,9 +102,7 @@ Comments * particles: initialization of multistep solvers is done with lower order methods, so direct convergence tests will fail.
-* Code is used mainly with Python 3.4, but Python 2.7 - compatibility should be kept since mayavi (well, vtk actually) only - works on Python 2. - Until vtk becomes compatible with Python 3.x, any Python 2.7 - incompatibilites can be reported as bugs. +* Code is used mainly with Python 3.4 and 3.5. + In principle it should be easy to maintain compatibility with Python + 2.7.x, but as of `bfps 1.8` this is no longer a main concern. diff --git a/bfps/FluidConvert.py b/bfps/FluidConvert.py index 14be9b985139fabf3b7e1cda1b5f9ee9618a8307..d924f2a1d5ed411855ca13687aa716fa3aa31dc5 100644 --- a/bfps/FluidConvert.py +++ b/bfps/FluidConvert.py @@ -43,7 +43,7 @@ class FluidConvert(_fluid_particle_base): work_dir = './', simname = 'test', fluid_precision = 'single', - use_fftw_wisdom = True): + use_fftw_wisdom = False): _fluid_particle_base.__init__( self, name = name + '-' + fluid_precision, @@ -98,7 +98,7 @@ class FluidConvert(_fluid_particle_base): nx, ny, nz, dkx, dky, dkz, dealias_type, - FFTW_ESTIMATE); + DEFAULT_FFTW_FLAG); //endcpp """.format(self.C_dtype) self.fluid_loop += """ @@ -109,11 +109,13 @@ class FluidConvert(_fluid_particle_base): """ self.fluid_end += 'delete fs;\n' return None - def add_parser_arguments( + def specific_parser_arguments( self, parser): - _fluid_particle_base.add_parser_arguments(self, parser) - self.parameters_to_parser_arguments(parser, parameters = self.spec_parameters) + _fluid_particle_base.specific_parser_arguments(self, parser) + self.parameters_to_parser_arguments( + parser, + parameters = self.spec_parameters) return None def launch( self, @@ -125,13 +127,13 @@ class FluidConvert(_fluid_particle_base): self.pars_from_namespace( opt, parameters = self.spec_parameters) - self.set_host_info(bfps.host_info) self.rewrite_par( group = 'conversion_parameters', parameters = self.spec_parameters) - self.run( - ncpu = opt.ncpu, - err_file = 'err_convert', - out_file = 'out_convert') + self.run(ncpu = opt.ncpu, + hours = opt.minutes // 60, + minutes = opt.minutes % 60, + err_file = 'err_convert', + out_file = 'out_convert') return None diff --git a/bfps/FluidResize.py b/bfps/FluidResize.py index be0af1fe8228ffd31f42c08b5d0fca45dadbf8b2..fb5e26208f6960d447bc927bd9e207354620d188 100644 --- a/bfps/FluidResize.py +++ b/bfps/FluidResize.py @@ -136,6 +136,8 @@ class FluidResize(_fluid_particle_base): for k in ['dst_nx', 'dst_ny', 'dst_nz']: if type(cmd_line_pars[k]) == type(None): cmd_line_pars[k] = opt.m + # the 3 dst_ni have been updated in opt itself at this point + # I'm not sure if this code is future-proof... self.parameters['niter_todo'] = 0 self.pars_from_namespace(opt) src_file = os.path.join( @@ -144,10 +146,11 @@ class FluidResize(_fluid_particle_base): read_file = os.path.join( self.work_dir, opt.src_simname + '_cvorticity_i{0:0>5x}'.format(opt.src_iteration)) - self.set_host_info(bfps.host_info) self.write_par(iter0 = opt.src_iteration) if not os.path.exists(read_file): os.symlink(src_file, read_file) - self.run(ncpu = opt.ncpu) + self.run(ncpu = opt.ncpu, + hours = opt.minutes // 60, + minutes = opt.minutes % 60) return None diff --git a/bfps/NSVorticityEquation.py b/bfps/NSVorticityEquation.py new file mode 100644 index 0000000000000000000000000000000000000000..f67ba6aee16e93d1c4a8a9a710c3136eae678398 --- /dev/null +++ b/bfps/NSVorticityEquation.py @@ -0,0 +1,849 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. 
# +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +import sys +import os +import numpy as np +import h5py +import argparse + +import bfps +import bfps.tools +from bfps._code import _code +from bfps._fluid_base import _fluid_particle_base + +class NSVorticityEquation(_fluid_particle_base): + def __init__( + self, + name = 'NSVE-v' + bfps.__version__, + work_dir = './', + simname = 'test', + fluid_precision = 'single', + fftw_plan_rigor = 'FFTW_MEASURE', + use_fftw_wisdom = True): + """ + This code uses checkpoints for DNS restarts, and it can be stopped + by creating the file "stop_<simname>" in the working directory. + For postprocessing of field snapshots, consider creating a separate + HDF5 file (from the python wrapper) which contains links to all the + different snapshots. + """ + self.fftw_plan_rigor = fftw_plan_rigor + _fluid_particle_base.__init__( + self, + name = name + '-' + fluid_precision, + work_dir = work_dir, + simname = simname, + dtype = fluid_precision, + use_fftw_wisdom = use_fftw_wisdom) + self.parameters['nu'] = float(0.1) + self.parameters['fmode'] = 1 + self.parameters['famplitude'] = float(0.5) + self.parameters['fk0'] = float(2.0) + self.parameters['fk1'] = float(4.0) + self.parameters['forcing_type'] = 'linear' + self.parameters['histogram_bins'] = int(256) + self.parameters['max_velocity_estimate'] = float(1) + self.parameters['max_vorticity_estimate'] = float(1) + self.parameters['checkpoints_per_file'] = int(1) + self.file_datasets_grow = """ + //begincpp + hid_t group; + group = H5Gopen(stat_file, "/statistics", H5P_DEFAULT); + H5Ovisit(group, H5_INDEX_NAME, H5_ITER_NATIVE, grow_statistics_dataset, NULL); + H5Gclose(group); + //endcpp + """ + self.style = {} + self.statistics = {} + self.fluid_output = """ + fs->io_checkpoint(false); + """ + # vorticity_equation specific things + self.includes += '#include "vorticity_equation.hpp"\n' + self.store_kspace = """ + //begincpp + if (myrank == 0 && iteration == 0) + { + TIMEZONE("fluid_base::store_kspace"); + hsize_t dims[4]; + hid_t space, dset; + // store kspace information + dset = H5Dopen(stat_file, "/kspace/kshell", H5P_DEFAULT); + space = H5Dget_space(dset); + H5Sget_simple_extent_dims(space, dims, NULL); + H5Sclose(space); + if (fs->kk->nshells != dims[0]) + { + DEBUG_MSG( + "ERROR: computed nshells %d not equal to data file nshells %d\\n", + fs->kk->nshells, dims[0]); + } + H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->kshell.front()); + H5Dclose(dset); + dset = H5Dopen(stat_file, "/kspace/nshell", H5P_DEFAULT); + H5Dwrite(dset, H5T_NATIVE_INT64, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->nshell.front()); + H5Dclose(dset); + dset = H5Dopen(stat_file, "/kspace/kM", H5P_DEFAULT); + H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->kM); + 
H5Dclose(dset); + dset = H5Dopen(stat_file, "/kspace/dk", H5P_DEFAULT); + H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->dk); + H5Dclose(dset); + } + //endcpp + """ + return None + def add_particles( + self, + integration_steps = 2, + neighbours = 1, + smoothness = 1): + assert(integration_steps > 0 and integration_steps < 6) + self.particle_species = 1 + self.parameters['tracers0_integration_steps'] = int(integration_steps) + self.parameters['tracers0_neighbours'] = int(neighbours) + self.parameters['tracers0_smoothness'] = int(smoothness) + self.parameters['tracers0_interpolator'] = 'spline' + self.particle_includes += """ + #include "particles/particles_system_builder.hpp" + #include "particles/particles_output_hdf5.hpp" + """ + ## initialize + self.particle_start += """ + sprintf(fname, "%s_particles.h5", simname); + std::unique_ptr<abstract_particles_system<double>> ps = particles_system_builder( + fs->cvorticity, // (field object) + fs->kk, // (kspace object, contains dkx, dky, dkz) + tracers0_integration_steps, // to check coherency between parameters and hdf input file (nb rhs) + nparticles, // to check coherency between parameters and hdf input file + fname, // particles input filename + std::string("/tracers0/state/0"), // dataset name for initial input + std::string("/tracers0/rhs/0"), // dataset name for initial input + tracers0_neighbours, // parameter (interpolation no neighbours) + tracers0_smoothness, // parameter + MPI_COMM_WORLD); + particles_output_hdf5<double,3,3> particles_output_writer_mpi(MPI_COMM_WORLD, fname, nparticles, tracers0_integration_steps, + "/tracers0/state/", "/tracers0/rhs/"); + """ + self.particle_loop += """ + fs->compute_velocity(fs->cvorticity); + fs->cvelocity->ift(); + ps->completeLoop(dt); + """ + output_particles = """ + particles_output_writer_mpi.save(ps->getParticlesPositions(), + ps->getParticlesRhs(), + ps->getParticlesIndexes(), + ps->getLocalNbParticles(), + iteration+1); + """ + self.fluid_output += output_particles + self.particle_end += 'ps.release();\n' + return None + def create_stat_output( + self, + dset_name, + data_buffer, + data_type = 'H5T_NATIVE_DOUBLE', + size_setup = None, + close_spaces = True): + new_stat_output_txt = 'Cdset = H5Dopen(stat_file, "{0}", H5P_DEFAULT);\n'.format(dset_name) + if not type(size_setup) == type(None): + new_stat_output_txt += ( + size_setup + + 'wspace = H5Dget_space(Cdset);\n' + + 'ndims = H5Sget_simple_extent_dims(wspace, dims, NULL);\n' + + 'mspace = H5Screate_simple(ndims, count, NULL);\n' + + 'H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL);\n') + new_stat_output_txt += ('H5Dwrite(Cdset, {0}, mspace, wspace, H5P_DEFAULT, {1});\n' + + 'H5Dclose(Cdset);\n').format(data_type, data_buffer) + if close_spaces: + new_stat_output_txt += ('H5Sclose(mspace);\n' + + 'H5Sclose(wspace);\n') + return new_stat_output_txt + def write_fluid_stats(self): + self.fluid_includes += '#include <cmath>\n' + self.fluid_includes += '#include "fftw_tools.hpp"\n' + self.stat_src += """ + //begincpp + hid_t stat_group; + if (myrank == 0) + stat_group = H5Gopen(stat_file, "statistics", H5P_DEFAULT); + fs->compute_velocity(fs->cvorticity); + *tmp_vec_field = fs->cvelocity->get_cdata(); + tmp_vec_field->compute_stats( + fs->kk, + stat_group, + "velocity", + fs->iteration / niter_stat, + max_velocity_estimate/sqrt(3)); + //endcpp + """ + self.stat_src += """ + //begincpp + *tmp_vec_field = fs->cvorticity->get_cdata(); + tmp_vec_field->compute_stats( + fs->kk, + stat_group, + 
"vorticity", + fs->iteration / niter_stat, + max_vorticity_estimate/sqrt(3)); + //endcpp + """ + self.stat_src += """ + //begincpp + if (myrank == 0) + H5Gclose(stat_group); + if (myrank == 0) + {{ + hid_t Cdset, wspace, mspace; + int ndims; + hsize_t count[4], offset[4], dims[4]; + offset[0] = fs->iteration/niter_stat; + offset[1] = 0; + offset[2] = 0; + offset[3] = 0; + //endcpp + """.format(self.C_dtype) + if self.dtype == np.float32: + field_H5T = 'H5T_NATIVE_FLOAT' + elif self.dtype == np.float64: + field_H5T = 'H5T_NATIVE_DOUBLE' + self.stat_src += self.create_stat_output( + '/statistics/xlines/velocity', + 'fs->rvelocity->get_rdata()', + data_type = field_H5T, + size_setup = """ + count[0] = 1; + count[1] = nx; + count[2] = 3; + """, + close_spaces = False) + self.stat_src += self.create_stat_output( + '/statistics/xlines/vorticity', + 'fs->rvorticity->get_rdata()', + data_type = field_H5T) + self.stat_src += '}\n' + ## checkpoint + self.stat_src += """ + //begincpp + if (myrank == 0) + { + std::string fname = ( + std::string("stop_") + + std::string(simname)); + { + struct stat file_buffer; + stop_code_now = (stat(fname.c_str(), &file_buffer) == 0); + } + } + MPI_Bcast(&stop_code_now, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD); + //endcpp + """ + return None + def fill_up_fluid_code(self): + self.fluid_includes += '#include <cstring>\n' + self.fluid_variables += ( + 'vorticity_equation<{0}, FFTW> *fs;\n'.format(self.C_dtype) + + 'field<{0}, FFTW, THREE> *tmp_vec_field;\n'.format(self.C_dtype) + + 'field<{0}, FFTW, ONE> *tmp_scal_field;\n'.format(self.C_dtype)) + self.fluid_definitions += """ + typedef struct {{ + {0} re; + {0} im; + }} tmp_complex_type; + """.format(self.C_dtype) + self.write_fluid_stats() + if self.dtype == np.float32: + field_H5T = 'H5T_NATIVE_FLOAT' + elif self.dtype == np.float64: + field_H5T = 'H5T_NATIVE_DOUBLE' + self.variables += 'int checkpoint;\n' + self.variables += 'bool stop_code_now;\n' + self.read_checkpoint = """ + //begincpp + if (myrank == 0) + { + hid_t dset = H5Dopen(stat_file, "checkpoint", H5P_DEFAULT); + H5Dread( + dset, + H5T_NATIVE_INT, + H5S_ALL, + H5S_ALL, + H5P_DEFAULT, + &checkpoint); + H5Dclose(dset); + } + MPI_Bcast(&checkpoint, 1, MPI_INT, 0, MPI_COMM_WORLD); + fs->checkpoint = checkpoint; + //endcpp + """ + self.store_checkpoint = """ + //begincpp + checkpoint = fs->checkpoint; + if (myrank == 0) + { + hid_t dset = H5Dopen(stat_file, "checkpoint", H5P_DEFAULT); + H5Dwrite( + dset, + H5T_NATIVE_INT, + H5S_ALL, + H5S_ALL, + H5P_DEFAULT, + &checkpoint); + H5Dclose(dset); + } + //endcpp + """ + self.fluid_start += """ + //begincpp + char fname[512]; + fs = new vorticity_equation<{0}, FFTW>( + simname, + nx, ny, nz, + dkx, dky, dkz, + {1}); + tmp_vec_field = new field<{0}, FFTW, THREE>( + nx, ny, nz, + MPI_COMM_WORLD, + {1}); + tmp_scal_field = new field<{0}, FFTW, ONE>( + nx, ny, nz, + MPI_COMM_WORLD, + {1}); + fs->checkpoints_per_file = checkpoints_per_file; + fs->nu = nu; + fs->fmode = fmode; + fs->famplitude = famplitude; + fs->fk0 = fk0; + fs->fk1 = fk1; + strncpy(fs->forcing_type, forcing_type, 128); + fs->iteration = iteration; + {2} + fs->cvorticity->real_space_representation = false; + fs->io_checkpoint(); + //endcpp + """.format( + self.C_dtype, + self.fftw_plan_rigor, + self.read_checkpoint) + self.fluid_start += self.store_kspace + self.fluid_start += 'stop_code_now = false;\n' + self.fluid_loop = 'fs->step(dt);\n' + self.fluid_loop += ('if (fs->iteration % niter_out == 0)\n{\n' + + self.fluid_output + + self.store_checkpoint + + 
'\n}\n' + + 'if (stop_code_now){\n' + + 'iteration = fs->iteration;\n' + + 'break;\n}\n') + self.fluid_end = ('if (fs->iteration % niter_out != 0)\n{\n' + + self.fluid_output + + self.store_checkpoint + + 'DEBUG_MSG("checkpoint value is %d\\n", checkpoint);\n' + + '\n}\n' + + 'delete fs;\n' + + 'delete tmp_vec_field;\n' + + 'delete tmp_scal_field;\n') + return None + def get_postprocess_file_name(self): + return os.path.join(self.work_dir, self.simname + '_postprocess.h5') + def get_postprocess_file(self): + return h5py.File(self.get_postprocess_file_name(), 'r') + def compute_statistics(self, iter0 = 0, iter1 = None): + """Run basic postprocessing on raw data. + The energy spectrum :math:`E(t, k)` and the enstrophy spectrum + :math:`\\frac{1}{2}\omega^2(t, k)` are computed from the + + .. math:: + + \sum_{k \\leq \\|\\mathbf{k}\\| \\leq k+dk}\\hat{u_i} \\hat{u_j}^*, \\hskip .5cm + \sum_{k \\leq \\|\\mathbf{k}\\| \\leq k+dk}\\hat{\omega_i} \\hat{\\omega_j}^* + + tensors, and the enstrophy spectrum is also used to + compute the dissipation :math:`\\varepsilon(t)`. + These basic quantities are stored in a newly created HDF5 file, + ``simname_postprocess.h5``. + """ + if len(list(self.statistics.keys())) > 0: + return None + self.read_parameters() + with self.get_data_file() as data_file: + if 'moments' not in data_file['statistics'].keys(): + return None + iter0 = min((data_file['statistics/moments/velocity'].shape[0] * + self.parameters['niter_stat']-1), + iter0) + if type(iter1) == type(None): + iter1 = data_file['iteration'].value + else: + iter1 = min(data_file['iteration'].value, iter1) + ii0 = iter0 // self.parameters['niter_stat'] + ii1 = iter1 // self.parameters['niter_stat'] + self.statistics['kshell'] = data_file['kspace/kshell'].value + self.statistics['kM'] = data_file['kspace/kM'].value + self.statistics['dk'] = data_file['kspace/dk'].value + computation_needed = True + pp_file = h5py.File(self.get_postprocess_file_name(), 'a') + if 'ii0' in pp_file.keys(): + computation_needed = not (ii0 == pp_file['ii0'].value and + ii1 == pp_file['ii1'].value) + if computation_needed: + for k in pp_file.keys(): + del pp_file[k] + if computation_needed: + pp_file['iter0'] = iter0 + pp_file['iter1'] = iter1 + pp_file['ii0'] = ii0 + pp_file['ii1'] = ii1 + pp_file['t'] = (self.parameters['dt']* + self.parameters['niter_stat']* + (np.arange(ii0, ii1+1).astype(np.float))) + pp_file['energy(t, k)'] = ( + data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 0, 0] + + data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 1, 1] + + data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 2, 2])/2 + pp_file['enstrophy(t, k)'] = ( + data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 0, 0] + + data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 1, 1] + + data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 2, 2])/2 + pp_file['vel_max(t)'] = data_file['statistics/moments/velocity'] [ii0:ii1+1, 9, 3] + pp_file['renergy(t)'] = data_file['statistics/moments/velocity'][ii0:ii1+1, 2, 3]/2 + for k in ['t', + 'energy(t, k)', + 'enstrophy(t, k)', + 'vel_max(t)', + 'renergy(t)']: + if k in pp_file.keys(): + self.statistics[k] = pp_file[k].value + self.compute_time_averages() + return None + def compute_time_averages(self): + """Compute easy stats. + + Further computation of statistics based on the contents of + ``simname_postprocess.h5``. + Standard quantities are as follows + (consistent with [Ishihara]_): + + .. 
math:: + + U_{\\textrm{int}}(t) = \\sqrt{\\frac{2E(t)}{3}}, \\hskip .5cm + L_{\\textrm{int}}(t) = \\frac{\pi}{2U_{int}^2(t)} \\int \\frac{dk}{k} E(t, k), \\hskip .5cm + T_{\\textrm{int}}(t) = + \\frac{L_{\\textrm{int}}(t)}{U_{\\textrm{int}}(t)} + + \\eta_K = \\left(\\frac{\\nu^3}{\\varepsilon}\\right)^{1/4}, \\hskip .5cm + \\tau_K = \\left(\\frac{\\nu}{\\varepsilon}\\right)^{1/2}, \\hskip .5cm + \\lambda = \\sqrt{\\frac{15 \\nu U_{\\textrm{int}}^2}{\\varepsilon}} + + Re = \\frac{U_{\\textrm{int}} L_{\\textrm{int}}}{\\nu}, \\hskip + .5cm + R_{\\lambda} = \\frac{U_{\\textrm{int}} \\lambda}{\\nu} + + .. [Ishihara] T. Ishihara et al, + *Small-scale statistics in high-resolution direct numerical + simulation of turbulence: Reynolds number dependence of + one-point velocity gradient statistics*. + J. Fluid Mech., + **592**, 335-366, 2007 + """ + for key in ['energy', 'enstrophy']: + self.statistics[key + '(t)'] = (self.statistics['dk'] * + np.sum(self.statistics[key + '(t, k)'], axis = 1)) + self.statistics['Uint(t)'] = np.sqrt(2*self.statistics['energy(t)'] / 3) + self.statistics['Lint(t)'] = ((self.statistics['dk']*np.pi / + (2*self.statistics['Uint(t)']**2)) * + np.nansum(self.statistics['energy(t, k)'] / + self.statistics['kshell'][None, :], axis = 1)) + for key in ['energy', + 'enstrophy', + 'vel_max', + 'Uint', + 'Lint']: + if key + '(t)' in self.statistics.keys(): + self.statistics[key] = np.average(self.statistics[key + '(t)'], axis = 0) + for suffix in ['', '(t)']: + self.statistics['diss' + suffix] = (self.parameters['nu'] * + self.statistics['enstrophy' + suffix]*2) + self.statistics['etaK' + suffix] = (self.parameters['nu']**3 / + self.statistics['diss' + suffix])**.25 + self.statistics['tauK' + suffix] = (self.parameters['nu'] / + self.statistics['diss' + suffix])**.5 + self.statistics['Re' + suffix] = (self.statistics['Uint' + suffix] * + self.statistics['Lint' + suffix] / + self.parameters['nu']) + self.statistics['lambda' + suffix] = (15 * self.parameters['nu'] * + self.statistics['Uint' + suffix]**2 / + self.statistics['diss' + suffix])**.5 + self.statistics['Rlambda' + suffix] = (self.statistics['Uint' + suffix] * + self.statistics['lambda' + suffix] / + self.parameters['nu']) + self.statistics['kMeta' + suffix] = (self.statistics['kM'] * + self.statistics['etaK' + suffix]) + if self.parameters['dealias_type'] == 1: + self.statistics['kMeta' + suffix] *= 0.8 + self.statistics['Tint'] = self.statistics['Lint'] / self.statistics['Uint'] + self.statistics['Taylor_microscale'] = self.statistics['lambda'] + return None + def set_plt_style( + self, + style = {'dashes' : (None, None)}): + self.style.update(style) + return None + def convert_complex_from_binary( + self, + field_name = 'vorticity', + iteration = 0, + file_name = None): + """read the Fourier representation of a vector field. + + Read the binary file containing iteration ``iteration`` of the + field ``field_name``, and write it in a ``.h5`` file. 
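+ + A hypothetical call (the instance name ``c`` below is illustrative, + not part of this changeset): with ``simname == 'test'``, + ``c.convert_complex_from_binary('vorticity', iteration = 0)`` reads + the binary file ``test_cvorticity_i00000`` from ``work_dir`` and + creates ``test_cvorticity_i00000.h5`` next to it.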
+ """ + data = np.memmap( + os.path.join(self.work_dir, + self.simname + '_{0}_i{1:0>5x}'.format('c' + field_name, iteration)), + dtype = self.ctype, + mode = 'r', + shape = (self.parameters['ny'], + self.parameters['nz'], + self.parameters['nx']//2+1, + 3)) + if type(file_name) == type(None): + file_name = self.simname + '_{0}_i{1:0>5x}.h5'.format('c' + field_name, iteration) + file_name = os.path.join(self.work_dir, file_name) + f = h5py.File(file_name, 'a') + f[field_name + '/complex/{0}'.format(iteration)] = data + f.close() + return None + def write_par( + self, + iter0 = 0, + particle_ic = None): + _fluid_particle_base.write_par(self, iter0 = iter0) + with h5py.File(self.get_data_file_name(), 'r+') as ofile: + kspace = self.get_kspace() + nshells = kspace['nshell'].shape[0] + vec_stat_datasets = ['velocity', 'vorticity'] + scal_stat_datasets = [] + for k in vec_stat_datasets: + time_chunk = 2**20//(8*3*self.parameters['nx']) # FIXME: use proper size of self.dtype + time_chunk = max(time_chunk, 1) + ofile.create_dataset('statistics/xlines/' + k, + (1, self.parameters['nx'], 3), + chunks = (time_chunk, self.parameters['nx'], 3), + maxshape = (None, self.parameters['nx'], 3), + dtype = self.dtype) + for k in vec_stat_datasets: + time_chunk = 2**20//(8*3*3*nshells) + time_chunk = max(time_chunk, 1) + ofile.create_dataset('statistics/spectra/' + k + '_' + k, + (1, nshells, 3, 3), + chunks = (time_chunk, nshells, 3, 3), + maxshape = (None, nshells, 3, 3), + dtype = np.float64) + time_chunk = 2**20//(8*4*10) + time_chunk = max(time_chunk, 1) + a = ofile.create_dataset('statistics/moments/' + k, + (1, 10, 4), + chunks = (time_chunk, 10, 4), + maxshape = (None, 10, 4), + dtype = np.float64) + time_chunk = 2**20//(8*4*self.parameters['histogram_bins']) + time_chunk = max(time_chunk, 1) + ofile.create_dataset('statistics/histograms/' + k, + (1, + self.parameters['histogram_bins'], + 4), + chunks = (time_chunk, + self.parameters['histogram_bins'], + 4), + maxshape = (None, + self.parameters['histogram_bins'], + 4), + dtype = np.int64) + ofile['checkpoint'] = int(0) + if self.particle_species == 0: + return None + + if type(particle_ic) == type(None): + pbase_shape = (self.parameters['nparticles'],) + number_of_particles = self.parameters['nparticles'] + else: + pbase_shape = particle_ic.shape[:-1] + assert(particle_ic.shape[-1] == 3) + number_of_particles = 1 + for val in pbase_shape[1:]: + number_of_particles *= val + with h5py.File(self.get_particle_file_name(), 'a') as ofile: + s = 0 + ofile.create_group('tracers{0}'.format(s)) + ofile.create_group('tracers{0}/rhs'.format(s)) + ofile.create_group('tracers{0}/state'.format(s)) + ofile['tracers{0}/rhs'.format(s)].create_dataset( + '0', + shape = ( + (self.parameters['tracers{0}_integration_steps'.format(s)],) + + pbase_shape + + (3,)), + dtype = np.float) + ofile['tracers{0}/state'.format(s)].create_dataset( + '0', + shape = ( + pbase_shape + + (3,)), + dtype = np.float) + return None + def specific_parser_arguments( + self, + parser): + _fluid_particle_base.specific_parser_arguments(self, parser) + parser.add_argument( + '--src-wd', + type = str, + dest = 'src_work_dir', + default = '') + parser.add_argument( + '--src-simname', + type = str, + dest = 'src_simname', + default = '') + parser.add_argument( + '--src-iteration', + type = int, + dest = 'src_iteration', + default = 0) + parser.add_argument( + '--njobs', + type = int, dest = 'njobs', + default = 1) + parser.add_argument( + '--kMeta', + type = float, + dest = 'kMeta', + default = 2.0) 
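+ # note: kMeta is the requested value of k_M * eta_K (maximum resolved + # wavenumber times Kolmogorov scale); prepare_launch below derives the + # viscosity from it as nu = (opt.kMeta * 2 / opt.n)**(4./3)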
+ parser.add_argument( + '--dtfactor', + type = float, + dest = 'dtfactor', + default = 0.5, + help = 'dt is computed as DTFACTOR / N') + parser.add_argument( + '--particle-rand-seed', + type = int, + dest = 'particle_rand_seed', + default = None) + parser.add_argument( + '--pclouds', + type = int, + dest = 'pclouds', + default = 1, + help = ('number of particle clouds. Particle "clouds" ' + 'consist of particles distributed according to ' + 'pcloud-type.')) + parser.add_argument( + '--pcloud-type', + choices = ['random-cube', + 'regular-cube'], + dest = 'pcloud_type', + default = 'random-cube') + parser.add_argument( + '--particle-cloud-size', + type = float, + dest = 'particle_cloud_size', + default = 2*np.pi) + parser.add_argument( + '--neighbours', + type = int, + dest = 'neighbours', + default = 1) + parser.add_argument( + '--smoothness', + type = int, + dest = 'smoothness', + default = 1) + return None + def prepare_launch( + self, + args = []): + """Set up reasonable parameters. + + With the default Lundgren forcing applied in the band [2, 4], + we can estimate the dissipation, therefore we can estimate + :math:`k_M \\eta_K` and constrain the viscosity. + + In brief, the command line parameter :math:`k_M \\eta_K` is + used in the following formula for :math:`\\nu` (:math:`N` is the + number of real space grid points per coordinate): + + .. math:: + + \\nu = \\left(\\frac{2 k_M \\eta_K}{N} \\right)^{4/3} + + With this choice, the average dissipation :math:`\\varepsilon` + will be close to 0.4, and the integral scale velocity will be + close to 0.77, yielding the approximate value for the Taylor + microscale and corresponding Reynolds number: + + .. math:: + + \\lambda \\approx 4.75\\left(\\frac{2 k_M \\eta_K}{N} \\right)^{4/6}, \\hskip .5in + R_\\lambda \\approx 3.7 \\left(\\frac{N}{2 k_M \\eta_K} \\right)^{4/6} + + """ + opt = _code.prepare_launch(self, args = args) + self.parameters['nu'] = (opt.kMeta * 2 / opt.n)**(4./3) + self.parameters['dt'] = (opt.dtfactor / opt.n) + # custom famplitude for 288 and 576 + if opt.n == 288: + self.parameters['famplitude'] = 0.45 + elif opt.n == 576: + self.parameters['famplitude'] = 0.47 + if ((self.parameters['niter_todo'] % self.parameters['niter_out']) != 0): + self.parameters['niter_out'] = self.parameters['niter_todo'] + if len(opt.src_work_dir) == 0: + opt.src_work_dir = os.path.realpath(opt.work_dir) + self.pars_from_namespace(opt) + return opt + def launch( + self, + args = [], + **kwargs): + opt = self.prepare_launch(args = args) + if type(opt.nparticles) != type(None): + if opt.nparticles > 0: + self.name += '-particles' + self.add_particles( + integration_steps = 4, + neighbours = opt.neighbours, + smoothness = opt.smoothness) + self.fill_up_fluid_code() + self.finalize_code() + self.launch_jobs(opt = opt) + return None + def generate_tracer_state( + self, + rseed = None, + iteration = 0, + species = 0, + write_to_file = False, + ncomponents = 3, + testing = False, + data = None): + if (type(data) == type(None)): + if not type(rseed) == type(None): + np.random.seed(rseed) + #point with problems: 5.37632864e+00, 6.10414710e+00, 6.25256493e+00] + data = np.zeros(self.parameters['nparticles']*ncomponents).reshape(-1, ncomponents) + data[:, :3] = np.random.random((self.parameters['nparticles'], 3))*2*np.pi + if testing: + #data[0] = np.array([3.26434, 4.24418, 3.12157]) + data[:] = np.array([ 0.72086101, 2.59043666, 6.27501953]) + with h5py.File(self.get_particle_file_name(), 'r+') as data_file: + 
data_file['tracers{0}/state/0'.format(species)][:] = data + if write_to_file: + data.tofile( + os.path.join( + self.work_dir, + "tracers{0}_state_i{1:0>5x}".format(species, iteration))) + return data + def launch_jobs( + self, + opt = None): + if not os.path.exists(os.path.join(self.work_dir, self.simname + '.h5')): + particle_initial_condition = None + if opt.pclouds > 1: + np.random.seed(opt.particle_rand_seed) + if opt.pcloud_type == 'random-cube': + particle_initial_condition = ( + np.random.random((opt.pclouds, 1, 3))*2*np.pi + + np.random.random((1, self.parameters['nparticles'], 3))*opt.particle_cloud_size) + elif opt.pcloud_type == 'regular-cube': + onedarray = np.linspace( + -opt.particle_cloud_size/2, + opt.particle_cloud_size/2, + self.parameters['nparticles']) + particle_initial_condition = np.zeros( + (opt.pclouds, + self.parameters['nparticles'], + self.parameters['nparticles'], + self.parameters['nparticles'], 3), + dtype = np.float64) + particle_initial_condition[:] = \ + np.random.random((opt.pclouds, 1, 1, 1, 3))*2*np.pi + particle_initial_condition[..., 0] += onedarray[None, None, None, :] + particle_initial_condition[..., 1] += onedarray[None, None, :, None] + particle_initial_condition[..., 2] += onedarray[None, :, None, None] + self.write_par( + particle_ic = particle_initial_condition) + if self.parameters['nparticles'] > 0: + data = self.generate_tracer_state( + species = 0, + rseed = opt.particle_rand_seed, + data = particle_initial_condition) + for s in range(1, self.particle_species): + self.generate_tracer_state(species = s, data = data) + init_condition_file = os.path.join( + self.work_dir, + self.simname + '_checkpoint_0.h5') + if not os.path.exists(init_condition_file): + f = h5py.File(init_condition_file, 'w') + if len(opt.src_simname) > 0: + source_cp = 0 + src_file = 'not_a_file' + while True: + src_file = os.path.join( + os.path.realpath(opt.src_work_dir), + opt.src_simname + '_checkpoint_{0}.h5'.format(source_cp)) + f0 = h5py.File(src_file, 'r') + if '{0}'.format(opt.src_iteration) in f0['vorticity/complex'].keys(): + f0.close() + break + source_cp += 1 + f['vorticity/complex/{0}'.format(0)] = h5py.ExternalLink( + src_file, + 'vorticity/complex/{0}'.format(opt.src_iteration)) + else: + data = self.generate_vector_field( + write_to_file = False, + spectra_slope = 2.0, + amplitude = 0.05) + f['vorticity/complex/{0}'.format(0)] = data + f.close() + self.run( + nb_processes = opt.nb_processes, + nb_threads_per_process = opt.nb_threads_per_process, + njobs = opt.njobs, + hours = opt.minutes // 60, + minutes = opt.minutes % 60, + no_submit = opt.no_submit) + return None + +if __name__ == '__main__': + pass + diff --git a/bfps/NavierStokes.py b/bfps/NavierStokes.py index af1982a60b0c2f35c3d5d53f81e0ac6a1cb6a94b..7ff89ebb6599264dec802272222471000ec79161 100644 --- a/bfps/NavierStokes.py +++ b/bfps/NavierStokes.py @@ -31,6 +31,7 @@ import h5py import argparse import bfps +import bfps.tools from ._code import _code from ._fluid_base import _fluid_particle_base @@ -262,20 +263,6 @@ class NavierStokes(_fluid_particle_base): field_H5T = 'H5T_NATIVE_FLOAT' elif self.dtype == np.float64: field_H5T = 'H5T_NATIVE_DOUBLE' - self.stat_src += self.create_stat_output( - '/statistics/xlines/velocity', - 'fs->rvelocity', - data_type = field_H5T, - size_setup = """ - count[0] = 1; - count[1] = nx; - count[2] = 3; - """, - close_spaces = False) - self.stat_src += self.create_stat_output( - '/statistics/xlines/vorticity', - 'fs->rvorticity', - data_type = field_H5T) if 
self.QR_stats_on: self.stat_src += self.create_stat_output( '/statistics/moments/trS2_Q_R', @@ -615,7 +602,9 @@ class NavierStokes(_fluid_particle_base): computation_needed = not (ii0 == pp_file['ii0'].value and ii1 == pp_file['ii1'].value) if computation_needed: - for k in pp_file.keys(): + for k in ['t', 'vel_max(t)', 'renergy(t)', + 'energy(t, k)', 'enstrophy(t, k)', + 'ii0', 'ii1', 'iter0', 'iter1']: del pp_file[k] if computation_needed: pp_file['iter0'] = iter0 @@ -751,12 +740,14 @@ class NavierStokes(_fluid_particle_base): vec_stat_datasets = ['velocity', 'vorticity'] scal_stat_datasets = [] for k in vec_stat_datasets: - time_chunk = 2**20//(8*3*self.parameters['nx']) # FIXME: use proper size of self.dtype + time_chunk = 2**20 // ( + self.dtype.itemsize*3* + self.parameters['nx']*self.parameters['ny']) time_chunk = max(time_chunk, 1) - ofile.create_dataset('statistics/xlines/' + k, - (1, self.parameters['nx'], 3), - chunks = (time_chunk, self.parameters['nx'], 3), - maxshape = (None, self.parameters['nx'], 3), + ofile.create_dataset('statistics/0slices/' + k + '/real', + (1, self.parameters['ny'], self.parameters['nx'], 3), + chunks = (time_chunk, self.parameters['ny'], self.parameters['nx'], 3), + maxshape = (None, self.parameters['ny'], self.parameters['nx'], 3), dtype = self.dtype) if self.Lag_acc_stats_on: vec_stat_datasets += ['Lagrangian_acceleration'] @@ -873,33 +864,6 @@ class NavierStokes(_fluid_particle_base): dtype = np.int64) if self.particle_species == 0: return None - def create_particle_dataset( - data_file, - dset_name, - dset_shape, - dset_maxshape, - dset_chunks, - # maybe something more general can be used here - dset_dtype = h5py.h5t.IEEE_F64LE): - # create the dataspace. - space_id = h5py.h5s.create_simple( - dset_shape, - dset_maxshape) - # create the dataset creation property list. - dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE) - # set the allocation time to "early". - dcpl.set_alloc_time(h5py.h5d.ALLOC_TIME_EARLY) - dcpl.set_chunk(dset_chunks) - # and now create dataset - if sys.version_info[0] == 3: - dset_name = dset_name.encode() - return h5py.h5d.create( - data_file.id, - dset_name, - dset_dtype, - space_id, - dcpl, - h5py.h5p.DEFAULT) if type(particle_ic) == type(None): pbase_shape = (self.parameters['nparticles'],) @@ -920,30 +884,41 @@ class NavierStokes(_fluid_particle_base): self.parameters['tracers{0}_integration_steps'.format(s)]) + pbase_shape + (3,)) maxshape = (h5py.h5s.UNLIMITED,) + dims[1:] - chunks = (time_chunk, 1, 1) + dims[3:] - create_particle_dataset( + if len(pbase_shape) > 1: + chunks = (time_chunk, 1, 1) + dims[3:] + else: + chunks = (time_chunk, 1) + dims[2:] + bfps.tools.create_alloc_early_dataset( ofile, '/tracers{0}/rhs'.format(s), dims, maxshape, chunks) - create_particle_dataset( + if len(pbase_shape) > 1: + chunks = (time_chunk, 1) + pbase_shape[1:] + (3,) + else: + chunks = (time_chunk, pbase_shape[0], 3) + bfps.tools.create_alloc_early_dataset( ofile, '/tracers{0}/state'.format(s), (1,) + pbase_shape + (3,), (h5py.h5s.UNLIMITED,) + pbase_shape + (3,), - (time_chunk, 1) + pbase_shape[1:] + (3,)) - create_particle_dataset( + chunks) + # "velocity" is sampled, single precision is enough + # for the results we are interested in. 
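+ # (bfps.tools.create_alloc_early_dataset is assumed to mirror the + # removed create_particle_dataset helper: a chunked dataset whose + # allocation time is set to "early", so the C++ side can write into + # preallocated chunks without resizing)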
+ bfps.tools.create_alloc_early_dataset( ofile, '/tracers{0}/velocity'.format(s), (1,) + pbase_shape + (3,), (h5py.h5s.UNLIMITED,) + pbase_shape + (3,), - (time_chunk, 1) + pbase_shape[1:] + (3,)) + chunks, + dset_dtype = h5py.h5t.IEEE_F32LE) if self.parameters['tracers{0}_acc_on'.format(s)]: - create_particle_dataset( + bfps.tools.create_alloc_early_dataset( ofile, '/tracers{0}/acceleration'.format(s), (1,) + pbase_shape + (3,), (h5py.h5s.UNLIMITED,) + pbase_shape + (3,), - (time_chunk, 1) + pbase_shape[1:] + (3,)) + chunks, + dset_dtype = h5py.h5t.IEEE_F32LE) return None def add_particle_fields( self, @@ -1058,6 +1033,16 @@ class NavierStokes(_fluid_particle_base): type = float, dest = 'particle_cloud_size', default = 2*np.pi) + parser.add_argument( + '--neighbours', + type = int, + dest = 'neighbours', + default = 1) + parser.add_argument( + '--smoothness', + type = int, + dest = 'smoothness', + default = 1) return None def prepare_launch( self, @@ -1128,12 +1113,13 @@ class NavierStokes(_fluid_particle_base): opt.nparticles = 0 elif type(opt.nparticles) == int: if opt.nparticles > 0: + self.name += '-particles' self.add_3D_rFFTW_field( name = 'rFFTW_acc') self.add_interpolator( name = 'cubic_spline', - neighbours = 1, - smoothness = 1, + neighbours = opt.neighbours, + smoothness = opt.smoothness, class_name = 'rFFTW_interpolator') self.add_particles( integration_steps = [4], @@ -1193,8 +1179,12 @@ class NavierStokes(_fluid_particle_base): write_to_file = True, spectra_slope = 2.0, amplitude = 0.05) - self.run( - ncpu = opt.ncpu, - njobs = opt.njobs) + self.run( + nb_processes = opt.nb_processes, + nb_threads_per_process = opt.nb_threads_per_process, + njobs = opt.njobs, + hours = opt.minutes // 60, + minutes = opt.minutes % 60, + no_submit = opt.no_submit) return None diff --git a/bfps/__init__.py b/bfps/__init__.py index 4a90f95268cffe3b0c2e1d68d7f4763a4c142e84..09663e1da56539eb51d257032444b38ba7096bc9 100644 --- a/bfps/__init__.py +++ b/bfps/__init__.py @@ -49,4 +49,5 @@ from host_information import host_info from .FluidConvert import FluidConvert from .FluidResize import FluidResize from .NavierStokes import NavierStokes +from .NSVorticityEquation import NSVorticityEquation diff --git a/bfps/__main__.py b/bfps/__main__.py index a26d84d0e918cebe1a9351ca20b5249418d6a3c6..9db5e350340e67dfe99c5a40e3027b489399a42e 100644 --- a/bfps/__main__.py +++ b/bfps/__main__.py @@ -29,6 +29,7 @@ import argparse import bfps from .NavierStokes import NavierStokes +from .NSVorticityEquation import NSVorticityEquation from .FluidResize import FluidResize from .FluidConvert import FluidConvert from .NSManyParticles import NSManyParticles @@ -45,6 +46,12 @@ def main(): 'NS', 'NS-single', 'NS-double'] + NSVEoptions = ['NSVorticityEquation', + 'NSVorticityEquation-single', + 'NSVorticityEquation-double', + 'NSVE', + 'NSVE-single', + 'NSVE-double'] FRoptions = ['FluidResize', 'FluidResize-single', 'FluidResize-double', @@ -57,7 +64,7 @@ def main(): 'NSManyParticles-double'] parser.add_argument( 'base_class', - choices = NSoptions + FRoptions + FCoptions + NSMPopt, + choices = NSoptions + NSVEoptions + FRoptions + FCoptions + NSMPopt, type = str) # first option is the choice of base class or -h or -v # all other options are passed on to the base_class instance @@ -70,6 +77,8 @@ def main(): precision = 'single' if opt.base_class in NSoptions: base_class = NavierStokes + if opt.base_class in NSVEoptions: + base_class = NSVorticityEquation elif opt.base_class in FRoptions: base_class = FluidResize elif 
opt.base_class in FCoptions: diff --git a/bfps/_base.py b/bfps/_base.py index 2204fe666402eeccc4d815b6381d6b5060a0e7ac..1a112baa3775842f013640596768ad0597eaa187 100644 --- a/bfps/_base.py +++ b/bfps/_base.py @@ -94,11 +94,10 @@ class _base(object): elif type(parameters[key[i]]) == str: src_txt += ('space = H5Dget_space(dset);\n' + 'memtype = H5Dget_type(dset);\n' + - 'H5Sget_simple_extent_dims(space, dims, NULL);\n' + - 'string_data = (char*)malloc(dims[0]*sizeof(char));\n' + + 'string_data = (char*)malloc(256);\n' + 'H5Dread(dset, memtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &string_data);\n' + 'sprintf({0}, "%s", string_data);\n'.format(key[i]) + - 'free(string_data);\n' + + 'free(string_data);\n' 'H5Sclose(space);\n' + 'H5Tclose(memtype);\n') elif type(parameters[key[i]]) == np.ndarray: @@ -123,7 +122,7 @@ class _base(object): elif type(self.parameters[key[i]]) == str: src_txt += 'DEBUG_MSG("'+ key[i] + ' = %s\\n", ' + key[i] + ');\n' elif type(self.parameters[key[i]]) == np.ndarray: - src_txt += ('for (int array_counter=0; array_counter<' + + src_txt += ('for (unsigned int array_counter=0; array_counter<' + key[i] + '.size(); array_counter++)\n' + '{\n' + @@ -250,8 +249,27 @@ class _base(object): help = 'code is run by default in a grid of NxNxN') parser.add_argument( '--ncpu', - type = int, dest = 'ncpu', - default = 2) + type = int, + dest = 'ncpu', + default = -1) + parser.add_argument( + '--np', '--nprocesses', + metavar = 'NPROCESSES', + help = 'number of mpi processes to use', + type = int, + dest = 'nb_processes', + default = 4) + parser.add_argument( + '--ntpp', '--nthreads-per-process', + type = int, + dest = 'nb_threads_per_process', + metavar = 'NTHREADS_PER_PROCESS', + help = 'number of threads to use per MPI process', + default = 1) + parser.add_argument( + '--no-submit', + action = 'store_true', + dest = 'no_submit') parser.add_argument( '--simname', type = str, dest = 'simname', @@ -265,6 +283,13 @@ class _base(object): '--wd', type = str, dest = 'work_dir', default = './') + parser.add_argument( + '--minutes', + type = int, + dest = 'minutes', + default = 5, + help = 'If environment supports it, this is the requested wall-clock-limit.') + return None def parameters_to_parser_arguments( self, diff --git a/bfps/_code.py b/bfps/_code.py index 314681ada3bb81e5700fdb7f1307c9af96fa5011..faf151559f25078ffbb214659a81f0a2f418b177 100644 --- a/bfps/_code.py +++ b/bfps/_code.py @@ -32,6 +32,7 @@ import argparse import h5py from datetime import datetime import math +import warnings import bfps from ._base import _base @@ -45,19 +46,25 @@ class _code(_base): work_dir = './', simname = 'test'): _base.__init__(self, work_dir = work_dir, simname = simname) - self.version_message = ('/***********************************************************************\n' + - '* this code automatically generated by bfps\n' + - '* version {0}\n'.format(bfps.__version__) + - '***********************************************************************/\n\n\n') + self.version_message = ( + '/***********************************************************************\n' + + '* this code automatically generated by bfps\n' + + '* version {0}\n'.format(bfps.__version__) + + '***********************************************************************/\n\n\n') self.includes = """ //begincpp #include "base.hpp" #include "fluid_solver.hpp" + #include "scope_timer.hpp" + #include "fftw_interface.hpp" #include <iostream> #include <hdf5.h> #include <string> #include <cstring> #include <fftw3-mpi.h> + #include <omp.h> + #include 
<fenv.h> + #include <cstdlib> //endcpp """ self.variables = 'int myrank, nprocs;\n' @@ -69,23 +76,58 @@ class _code(_base): //begincpp int main(int argc, char *argv[]) { - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - fftw_mpi_init(); - fftwf_mpi_init(); + if(getenv("BFPS_FPE_OFF") == nullptr || getenv("BFPS_FPE_OFF") != std::string("TRUE")){ + feenableexcept(FE_INVALID | FE_OVERFLOW); + } + else{ + std::cout << "FPE have been turned OFF" << std::endl; + } if (argc != 2) { std::cerr << "Wrong number of command line arguments. Stopping." << std::endl; MPI_Finalize(); return EXIT_SUCCESS; } + #ifdef NO_FFTWOMP + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + fftw_mpi_init(); + fftwf_mpi_init(); + DEBUG_MSG("There are %d processes\\n", nprocs); + #else + int mpiprovided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &mpiprovided); + assert(mpiprovided >= MPI_THREAD_FUNNELED); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + const int nbThreads = omp_get_max_threads(); + DEBUG_MSG("Number of threads for the FFTW = %d\\n", nbThreads); + if (nbThreads > 1){ + fftw_init_threads(); + fftwf_init_threads(); + } + fftw_mpi_init(); + fftwf_mpi_init(); + DEBUG_MSG("There are %d processes and %d threads\\n", nprocs, nbThreads); + if (nbThreads > 1){ + fftw_plan_with_nthreads(nbThreads); + fftwf_plan_with_nthreads(nbThreads); + } + #endif strcpy(simname, argv[1]); sprintf(fname, "%s.h5", simname); parameter_file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT); Cdset = H5Dopen(parameter_file, "iteration", H5P_DEFAULT); - H5Dread(Cdset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &iteration); - DEBUG_MSG("simname is %s and iteration is %d\\n", simname, iteration); + H5Dread( + Cdset, + H5T_NATIVE_INT, + H5S_ALL, + H5S_ALL, + H5P_DEFAULT, + &iteration); + DEBUG_MSG("simname is %s and iteration is %d\\n", + simname, iteration); H5Dclose(Cdset); H5Fclose(parameter_file); read_parameters(); @@ -97,12 +139,16 @@ class _code(_base): DEBUG_MSG("when setting stat_file cache I got %d\\n", cache_err); stat_file = H5Fopen(fname, H5F_ACC_RDWR, fapl); } + { + TIMEZONE("code::main_start"); //endcpp """ for ostream in ['cout', 'cerr']: - self.main_start += 'if (myrank == 0) std::{1} << "{0}" << std::endl;'.format(self.version_message, ostream).replace('\n', '\\n') + '\n' + self.main_start += 'if (myrank == 0) std::{1} << "{0}" << std::endl;'.format( + self.version_message, ostream).replace('\n', '\\n') + '\n' self.main_end = """ //begincpp + } // clean up if (myrank == 0) { @@ -113,6 +159,17 @@ class _code(_base): } fftwf_mpi_cleanup(); fftw_mpi_cleanup(); + #ifndef NO_FFTWOMP + if (nbThreads > 1){ + fftw_cleanup_threads(); + fftwf_cleanup_threads(); + } + #endif + #ifdef USE_TIMINGOUTPUT + global_timer_manager.show(MPI_COMM_WORLD); + global_timer_manager.showMpi(MPI_COMM_WORLD); + global_timer_manager.showHtml(MPI_COMM_WORLD); + #endif MPI_Finalize(); return EXIT_SUCCESS; } @@ -147,15 +204,21 @@ class _code(_base): libraries = ['bfps'] libraries += bfps.install_info['libraries'] - command_strings = ['g++'] + command_strings = [bfps.install_info['compiler']] command_strings += [self.name + '.cpp', '-o', self.name] command_strings += bfps.install_info['extra_compile_args'] command_strings += ['-I' + idir for idir in bfps.install_info['include_dirs']] command_strings.append('-I' + bfps.header_dir) command_strings += ['-L' + ldir for ldir in 
bfps.install_info['library_dirs']] + command_strings += ['-Wl,-rpath=' + ldir for ldir in bfps.install_info['library_dirs']] command_strings.append('-L' + bfps.lib_dir) + command_strings.append('-Wl,-rpath=' + bfps.lib_dir) + for libname in libraries: command_strings += ['-l' + libname] + + command_strings += ['-fopenmp'] + self.write_src() print('compiling code with command\n' + ' '.join(command_strings)) return subprocess.call(command_strings) @@ -165,12 +228,14 @@ class _code(_base): self.host_info.update(host_info) return None def run(self, - ncpu = 2, + nb_processes, + nb_threads_per_process, out_file = 'out_file', err_file = 'err_file', - hours = 1, - minutes = 0, - njobs = 1): + hours = 0, + minutes = 10, + njobs = 1, + no_submit = False): self.read_parameters() with h5py.File(os.path.join(self.work_dir, self.simname + '.h5'), 'r') as data_file: iter0 = data_file['iteration'].value @@ -190,7 +255,9 @@ class _code(_base): os.chdir(current_dir) command_atoms = ['mpirun', '-np', - '{0}'.format(ncpu), + '{0}'.format(nb_processes), + '-x', + 'OMP_NUM_THREADS={0}'.format(nb_threads_per_process), './' + self.name, self.simname] if self.host_info['type'] == 'cluster': @@ -200,9 +267,9 @@ class _code(_base): qsub_script_name = 'run_' + suffix + '.sh' self.write_sge_file( file_name = os.path.join(self.work_dir, qsub_script_name), - nprocesses = ncpu, + nprocesses = nb_processes*nb_threads_per_process, name_of_run = suffix, - command_atoms = command_atoms[3:], + command_atoms = command_atoms[5:], hours = hours, minutes = minutes, out_file = out_file + '_' + suffix, @@ -214,6 +281,65 @@ class _code(_base): subprocess.call(qsub_atoms + [qsub_script_name]) os.chdir(current_dir) job_name_list.append(suffix) + if self.host_info['type'] == 'SLURM': + job_id_list = [] + for j in range(njobs): + suffix = self.simname + '_{0}'.format(iter0 + j*self.parameters['niter_todo']) + qsub_script_name = 'run_' + suffix + '.sh' + self.write_slurm_file( + file_name = os.path.join(self.work_dir, qsub_script_name), + name_of_run = suffix, + command_atoms = command_atoms[5:], + hours = hours, + minutes = minutes, + out_file = out_file + '_' + suffix, + err_file = err_file + '_' + suffix, + nb_mpi_processes = nb_processes, + nb_threads_per_process = nb_threads_per_process) + os.chdir(self.work_dir) + qsub_atoms = ['sbatch'] + + if not no_submit: + if len(job_id_list) >= 1: + qsub_atoms += ['--dependency=afterok:{0}'.format(job_id_list[-1])] + p = subprocess.Popen( + qsub_atoms + [qsub_script_name], + stdout = subprocess.PIPE) + out, err = p.communicate() + p.terminate() + job_id_list.append(int(out.split()[-1])) + os.chdir(current_dir) + elif self.host_info['type'] == 'IBMLoadLeveler': + suffix = self.simname + '_{0}'.format(iter0) + job_script_name = 'run_' + suffix + '.sh' + if (njobs == 1): + self.write_IBMLoadLeveler_file_single_job( + file_name = os.path.join(self.work_dir, job_script_name), + name_of_run = suffix, + command_atoms = command_atoms[5:], + hours = hours, + minutes = minutes, + out_file = out_file + '_' + suffix, + err_file = err_file + '_' + suffix, + nb_mpi_processes = nb_processes, + nb_threads_per_process = nb_threads_per_process) + else: + self.write_IBMLoadLeveler_file_many_job( + file_name = os.path.join(self.work_dir, job_script_name), + name_of_run = suffix, + command_atoms = command_atoms[5:], + hours = hours, + minutes = minutes, + out_file = out_file + '_' + suffix, + err_file = err_file + '_' + suffix, + njobs = njobs, + nb_mpi_processes = nb_processes, + nb_threads_per_process = 
nb_threads_per_process) + submit_atoms = ['llsubmit'] + + if not no_submit: + subprocess.call(submit_atoms + [os.path.join(self.work_dir, job_script_name)]) + elif self.host_info['type'] == 'pc': os.chdir(self.work_dir) os.environ['LD_LIBRARY_PATH'] += ':{0}'.format(bfps.lib_dir) @@ -226,6 +352,195 @@ class _code(_base): stderr = open(err_file + '_' + suffix, 'w')) os.chdir(current_dir) return None + def write_IBMLoadLeveler_file_single_job( + self, + file_name = None, + nprocesses = None, + name_of_run = None, + command_atoms = [], + hours = None, + minutes = None, + out_file = None, + err_file = None, + nb_mpi_processes = None, + nb_threads_per_process = None): + + script_file = open(file_name, 'w') + script_file.write('# @ shell=/bin/bash\n') + # error file + if type(err_file) == type(None): + err_file = 'err.job.$(jobid)' + script_file.write('# @ error = ' + os.path.join(self.work_dir, err_file) + '\n') + # output file + if type(out_file) == type(None): + out_file = 'out.job.$(jobid)' + script_file.write('# @ output = ' + os.path.join(self.work_dir, out_file) + '\n') + + # If IBM MPI is used, this should be: script_file.write('# @ job_type = parallel\n') + script_file.write('# @ job_type = MPICH\n') + + script_file.write('# @ node_usage = not_shared\n') + script_file.write('# @ notification = complete\n') + script_file.write('# @ notify_user = $(user)@rzg.mpg.de\n') + + nb_cpus_per_node = self.host_info['deltanprocs'] + assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, \ + 'nb_cpus_per_node is {}'.format(nb_cpus_per_node) + + # No more threads than the number of cores + assert nb_threads_per_process <= nb_cpus_per_node, \ + "Cannot use more threads ({} asked) than the number of cores ({})".format( + nb_threads_per_process, nb_cpus_per_node) + # Warn if some cores will not be used + if nb_cpus_per_node%nb_threads_per_process != 0: + warnings.warn("The number of threads per process does not divide the number of cores per node (machine will be underused)", + UserWarning) + + nb_cpus = nb_mpi_processes*nb_threads_per_process + if (nb_cpus < nb_cpus_per_node): + # in case we use only a few processes on a single node + nb_nodes = 1 + nb_processes_per_node = nb_mpi_processes + first_node_tasks = nb_mpi_processes + else: + nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node) + # with more than one node we require a multiple of deltanprocs + nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process) + first_node_tasks = int(nb_mpi_processes - (nb_nodes-1)*nb_processes_per_node) + + script_file.write('# @ resources = ConsumableCpus({})\n'.format(nb_threads_per_process)) + script_file.write('# @ network.MPI = sn_all,not_shared,us\n') + script_file.write('# @ wall_clock_limit = {0}:{1:0>2d}:00\n'.format(hours, minutes)) + assert(type(self.host_info['environment']) != type(None)) + script_file.write('# @ node = {0}\n'.format(nb_nodes)) + script_file.write('# @ tasks_per_node = {0}\n'.format(nb_processes_per_node)) + if (first_node_tasks > 0): + script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks)) + script_file.write('# @ queue\n') + + + script_file.write('source ~/.config/bfps/bashrc\n') + script_file.write('module li\n') + script_file.write('export OMP_NUM_THREADS={}\n'.format(nb_threads_per_process)) + + script_file.write('LD_LIBRARY_PATH=' + + ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) + + ':${LD_LIBRARY_PATH}\n') + script_file.write('echo "Start time is `date`"\n') + script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
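+ # HTMLOUTPUT presumably names the report written by the scope_timer + # machinery (global_timer_manager.showHtml, compiled in when + # USE_TIMINGOUTPUT is defined); this link is an assumption, not + # something this diff makes explicit.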
+ script_file.write('cd ' + self.work_dir + '\n') + + script_file.write('export KMP_AFFINITY=compact,verbose\n') + script_file.write('export I_MPI_PIN_DOMAIN=omp\n') + script_file.write('mpiexec.hydra ' + + ' -np {} '.format(nb_mpi_processes) + + ' -ppn {} '.format(nb_processes_per_node) + + ' -ordered-output -prepend-rank ' + + os.path.join( + self.work_dir, + command_atoms[0]) + + ' ' + + ' '.join(command_atoms[1:]) + + '\n') + + script_file.write('echo "End time is `date`"\n') + script_file.write('exit 0\n') + script_file.close() + return None + def write_IBMLoadLeveler_file_many_job( + self, + file_name = None, + nprocesses = None, + name_of_run = None, + command_atoms = [], + hours = None, + minutes = None, + out_file = None, + err_file = None, + njobs = 2, + nb_mpi_processes = None, + nb_threads_per_process = None): + assert(type(self.host_info['environment']) != type(None)) + script_file = open(file_name, 'w') + script_file.write('# @ shell=/bin/bash\n') + # error file + if type(err_file) == type(None): + err_file = 'err.job.$(jobid).$(stepid)' + script_file.write('# @ error = ' + os.path.join(self.work_dir, err_file) + '\n') + # output file + if type(out_file) == type(None): + out_file = 'out.job.$(jobid).$(stepid)' + script_file.write('# @ output = ' + os.path.join(self.work_dir, out_file) + '\n') + # If IBM MPI is used, this should be: script_file.write('# @ job_type = parallel\n') + script_file.write('# @ job_type = MPICH\n') + script_file.write('# @ node_usage = not_shared\n') + script_file.write('#\n') + + nb_cpus_per_node = self.host_info['deltanprocs'] + assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, 'nb_cpus_per_node is {}'.format(nb_cpus_per_node) + + # No more threads than the number of cores + assert nb_threads_per_process <= nb_cpus_per_node, \ + "Cannot use more threads ({} asked) than the number of cores ({})".format( + nb_threads_per_process, nb_cpus_per_node) + # Warn if some cores will not be used + if nb_cpus_per_node%nb_threads_per_process != 0: + warnings.warn("The number of threads per process does not divide the number of cores per node (machine will be underused)", + UserWarning) + + nb_cpus = nb_mpi_processes*nb_threads_per_process + if (nb_cpus < nb_cpus_per_node): + # in case we use only a few processes on a single node + nb_nodes = 1 + nb_processes_per_node = nb_mpi_processes + first_node_tasks = nb_mpi_processes + else: + nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node) + # with more than one node we require a multiple of deltanprocs + nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process) + first_node_tasks = int(nb_mpi_processes - (nb_nodes-1)*nb_processes_per_node) + + for job in range(njobs): + script_file.write('# @ step_name = {0}.$(stepid)\n'.format(self.simname)) + script_file.write('# @ resources = ConsumableCpus({})\n'.format(nb_threads_per_process)) + script_file.write('# @ network.MPI = sn_all,not_shared,us\n') + script_file.write('# @ wall_clock_limit = {0}:{1:0>2d}:00\n'.format(hours, minutes)) + assert(type(self.host_info['environment']) != type(None)) + script_file.write('# @ node = {0}\n'.format(nb_nodes)) + script_file.write('# @ tasks_per_node = {0}\n'.format(nb_processes_per_node)) + if (first_node_tasks > 0): + script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks)) + script_file.write('# @ queue\n') + + script_file.write('source ~/.config/bfps/bashrc\n') + script_file.write('module li\n') + script_file.write('export OMP_NUM_THREADS={}\n'.format(nb_threads_per_process)) + 
+            script_file.write('LD_LIBRARY_PATH=' +
+                              ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                              ':${LD_LIBRARY_PATH}\n')
+            script_file.write('echo "Start time is `date`"\n')
+            script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
+            script_file.write('cd ' + self.work_dir + '\n')
+
+            script_file.write('export KMP_AFFINITY=compact,verbose\n')
+            script_file.write('export I_MPI_PIN_DOMAIN=omp\n')
+
+            script_file.write('mpiexec.hydra ' +
+                              ' -np {} '.format(nb_mpi_processes) +
+                              ' -ppn {} '.format(nb_processes_per_node) +
+                              ' -ordered-output -prepend-rank ' +
+                              os.path.join(
+                                  self.work_dir,
+                                  command_atoms[0]) +
+                              ' ' +
+                              ' '.join(command_atoms[1:]) +
+                              '\n')
+
+            script_file.write('echo "End time is `date`"\n')
+        script_file.write('exit 0\n')
+        script_file.close()
+        return None
     def write_sge_file(
             self,
             file_name = None,
@@ -267,6 +582,79 @@ class _code(_base):
         script_file.write('exit 0\n')
         script_file.close()
         return None
+    def write_slurm_file(
+            self,
+            file_name = None,
+            name_of_run = None,
+            command_atoms = [],
+            hours = None,
+            minutes = None,
+            out_file = None,
+            err_file = None,
+            nb_mpi_processes = None,
+            nb_threads_per_process = None):
+        script_file = open(file_name, 'w')
+        script_file.write('#!/bin/bash -l\n')
+        # job name
+        script_file.write('#SBATCH -J {0}\n'.format(name_of_run))
+        # use current working directory
+        script_file.write('#SBATCH -D ./\n')
+        # error file
+        if not type(err_file) == type(None):
+            script_file.write('#SBATCH -e ' + err_file + '\n')
+        # output file
+        if not type(out_file) == type(None):
+            script_file.write('#SBATCH -o ' + out_file + '\n')
+        script_file.write('#SBATCH --partition={0}\n'.format(
+                self.host_info['environment']))
+
+        nb_cpus_per_node = self.host_info['deltanprocs']
+        assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, \
+            'nb_cpus_per_node is {}'.format(nb_cpus_per_node)
+
+        # No more threads than the number of cores
+        assert nb_threads_per_process <= nb_cpus_per_node, \
+            "Cannot use more threads ({} asked) than the number of cores ({})".format(
+                nb_threads_per_process, nb_cpus_per_node)
+        # Warn if some cores will not be used
+        if nb_cpus_per_node%nb_threads_per_process != 0:
+            warnings.warn(
+                    "The number of threads does not divide the number of cores (machine will be underused)",
+                    UserWarning)
+
+        nb_cpus = nb_mpi_processes*nb_threads_per_process
+        if (nb_cpus < nb_cpus_per_node):
+            # in case we use only a few processes on a single node
+            nb_nodes = 1
+            nb_processes_per_node = nb_mpi_processes
+        else:
+            nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node)
+            # if more than one node is used, we require a multiple of deltanprocs
+            nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process)
+
+        script_file.write('#SBATCH --nodes={0}\n'.format(nb_nodes))
+        script_file.write('#SBATCH --ntasks-per-node={0}\n'.format(nb_processes_per_node))
+        script_file.write('#SBATCH --cpus-per-task={0}\n'.format(nb_threads_per_process))
+
+        script_file.write('#SBATCH --mail-type=none\n')
+        script_file.write('#SBATCH --time={0}:{1:0>2d}:00\n'.format(hours, minutes))
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        if nb_threads_per_process > 1:
+            script_file.write('export OMP_NUM_THREADS={0}\n'.format(nb_threads_per_process))
+            script_file.write('export OMP_PLACES=cores\n')
+
+        script_file.write('LD_LIBRARY_PATH=' +
+                          ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                          ':${LD_LIBRARY_PATH}\n')
+        script_file.write('echo "Start time is `date`"\n')
+        script_file.write('cd ' + 
self.work_dir + '\n') + script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1])) + script_file.write('srun {0}\n'.format(' '.join(command_atoms))) + script_file.write('echo "End time is `date`"\n') + script_file.write('exit 0\n') + script_file.close() + return None def prepare_launch( self, args = [], @@ -274,6 +662,14 @@ class _code(_base): parser = argparse.ArgumentParser('bfps ' + type(self).__name__) self.add_parser_arguments(parser) opt = parser.parse_args(args) + + if opt.ncpu != -1: + warnings.warn( + 'ncpu should be replaced by np/ntpp', + DeprecationWarning) + opt.nb_processes = opt.ncpu + opt.nb_threads_per_process = 1 + self.set_host_info(bfps.host_info) if type(opt.environment) != type(None): self.host_info['environment'] = opt.environment diff --git a/bfps/_fluid_base.py b/bfps/_fluid_base.py index 7eef1e1569cf3f5f66b5adcf52494be8de2fbe49..dac5a581c73bc456adcbd82112a29b7353635075 100644 --- a/bfps/_fluid_base.py +++ b/bfps/_fluid_base.py @@ -95,6 +95,7 @@ class _fluid_particle_base(_code): //begincpp if (myrank == 0 && iteration == 0) { + TIMEZONE("fuild_base::store_kspace"); hsize_t dims[4]; hid_t space, dset; // store kspace information @@ -142,7 +143,7 @@ class _fluid_particle_base(_code): self.includes += self.fluid_includes self.includes += '#include <ctime>\n' self.variables += (self.fluid_variables + - 'hid_t particle_file;\n') + '//hid_t particle_file;\n') self.definitions += ('int grow_single_dataset(hid_t dset, int tincrement)\n{\n' + 'int ndims;\n' + 'hsize_t space;\n' + @@ -217,7 +218,7 @@ class _fluid_particle_base(_code): """.format(fftw_prefix) + self.main_end if self.particle_species > 0: self.main_start += """ - if (myrank == 0) + /*if (myrank == 0) { // set caching parameters hid_t fapl = H5Pcreate(H5P_FILE_ACCESS); @@ -225,12 +226,12 @@ class _fluid_particle_base(_code): DEBUG_MSG("when setting cache for particles I got %d\\n", cache_err); sprintf(fname, "%s_particles.h5", simname); particle_file = H5Fopen(fname, H5F_ACC_RDWR, fapl); - } + }*/ """ - self.main_end = ('if (myrank == 0)\n' + + self.main_end = ('/*if (myrank == 0)\n' + '{\n' + 'H5Fclose(particle_file);\n' + - '}\n') + self.main_end + '}*/\n') + self.main_end self.main = """ //begincpp int data_file_problem; @@ -263,8 +264,15 @@ class _fluid_particle_base(_code): '<< time_difference/nprocs << " seconds" << std::endl;\n' + 'time0 = time1;\n') if not postprocess_mode: - self.main += 'for (int max_iter = iteration+niter_todo; iteration < max_iter; iteration++)\n' + self.main += 'for (int max_iter = iteration+niter_todo-iteration%niter_todo; iteration < max_iter; iteration++)\n' self.main += '{\n' + + self.main += """ + #ifdef USE_TIMINGOUTPUT + const std::string loopLabel = "code::main_start::loop-" + std::to_string(iteration); + TIMEZONE(loopLabel.c_str()); + #endif + """ self.main += 'if (iteration % niter_stat == 0) do_stats();\n' if self.particle_species > 0: self.main += 'if (iteration % niter_part == 0) do_particle_stats();\n' @@ -278,6 +286,12 @@ class _fluid_particle_base(_code): else: self.main += 'for (int frame_index = iter0; frame_index <= iter1; frame_index += niter_out)\n' self.main += '{\n' + self.main += """ + #ifdef USE_TIMINGOUTPUT + const std::string loopLabel = "code::main_start::loop-" + std::to_string(frame_index); + TIMEZONE(loopLabel.c_str()); + #endif + """ if self.particle_species > 0: self.main += self.particle_loop self.main += self.fluid_loop @@ -292,6 +306,9 @@ class _fluid_particle_base(_code): field = 'velocity', iteration = 0, filename = None): + 
""" + :note: assumes field is a vector field + """ if type(filename) == type(None): filename = os.path.join( self.work_dir, @@ -299,6 +316,7 @@ class _fluid_particle_base(_code): return np.memmap( filename, dtype = self.dtype, + mode = 'r', shape = (self.parameters['nz'], self.parameters['ny'], self.parameters['nx'], 3)) @@ -437,7 +455,7 @@ class _fluid_particle_base(_code): #data[0] = np.array([3.26434, 4.24418, 3.12157]) data[0] = np.array([ 0.72086101, 2.59043666, 6.27501953]) with h5py.File(self.get_particle_file_name(), 'r+') as data_file: - data_file['tracers{0}/state'.format(species)][0] = data + data_file['tracers{0}/state/0'.format(species)][0] = data if write_to_file: data.tofile( os.path.join( diff --git a/bfps/cpp/base.hpp b/bfps/cpp/base.hpp index ee2d74d5b751451e9bb34600a0e2b09891a73d1f..adfdd62f772795269cbcc5241dcb881677e38e72 100644 --- a/bfps/cpp/base.hpp +++ b/bfps/cpp/base.hpp @@ -42,6 +42,9 @@ inline int MOD(int a, int n) return ((a%n) + n) % n; } +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + #ifdef OMPI_MPI_H #define BFPS_MPICXX_DOUBLE_COMPLEX MPI_DOUBLE_COMPLEX @@ -52,6 +55,37 @@ inline int MOD(int a, int n) #endif//OMPI_MPI_H +template <class realtype> +class mpi_real_type; + +template <> +class mpi_real_type<float> +{ +public: + static constexpr MPI_Datatype real(){ + return MPI_FLOAT; + } + + static constexpr MPI_Datatype complex(){ + return MPI_COMPLEX; + } +}; + +template <> +class mpi_real_type<double> +{ +public: + static constexpr MPI_Datatype real(){ + return MPI_DOUBLE; + } + + static constexpr MPI_Datatype complex(){ + return BFPS_MPICXX_DOUBLE_COMPLEX; + } +}; + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// #ifndef NDEBUG @@ -99,5 +133,7 @@ inline void DEBUG_MSG_WAIT(MPI_Comm communicator, const char * format, ...) #endif//NDEBUG +#define variable_used_only_in_assert(x) ((void)(x)) + #endif//BASE diff --git a/bfps/cpp/bfps_timer.hpp b/bfps/cpp/bfps_timer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bec3cb681aed06d04f789bbe6e335f59958266be --- /dev/null +++ b/bfps/cpp/bfps_timer.hpp @@ -0,0 +1,104 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ +#ifndef BFPS_TIMER_HPP +#define BFPS_TIMER_HPP + +#include <chrono> + +/** + * @file + * + * Each section to measure should be embraced by start/stop. + * The measured time is given by "getElapsed". + * The total time measured by a timer is given by "getCumulated". 
+ * Example:
+ * @code bfps_timer tm; // implicit start
+ * @code ...
+ * @code tm.stop(); // stop the timer
+ * @code tm.getElapsed(); // return the duration in s [A]
+ * @code tm.start(); // restart the timer
+ * @code ...
+ * @code tm.stopAndGetElapsed(); // stop the timer and return the duration in s [B]
+ * @code tm.getCumulated(); // equal to [A] + [B]
+ */
+class bfps_timer {
+    using double_second_time = std::chrono::duration<double, std::ratio<1, 1>>;
+
+    std::chrono::high_resolution_clock::time_point
+        m_start; ///< start time (set by start())
+    std::chrono::high_resolution_clock::time_point m_end; ///< stop time (set by stop())
+    std::chrono::nanoseconds m_cumulate; ///< the cumulated time
+
+public:
+    /// Constructor
+    bfps_timer() { start(); }
+
+    /// Copying a timer is disabled
+    bfps_timer(const bfps_timer& other) = delete;
+    /// Copy-assigning a timer is disabled
+    bfps_timer& operator=(const bfps_timer& other) = delete;
+    /// Moving a timer is disabled
+    bfps_timer(bfps_timer&& other) = delete;
+    /// Move-assigning a timer is disabled
+    bfps_timer& operator=(bfps_timer&& other) = delete;
+
+    /** Reset all the values and restart the timer */
+    void reset() {
+        m_start = std::chrono::high_resolution_clock::time_point();
+        m_end = std::chrono::high_resolution_clock::time_point();
+        m_cumulate = std::chrono::nanoseconds();
+        start();
+    }
+
+    /** Start the timer */
+    void start() {
+        m_start = std::chrono::high_resolution_clock::now();
+    }
+
+    /** Stop the current timer */
+    void stop() {
+        m_end = std::chrono::high_resolution_clock::now();
+        m_cumulate += std::chrono::duration_cast<std::chrono::nanoseconds>(m_end - m_start);
+    }
+
+    /** Return the elapsed time between start and stop (in seconds) */
+    double getElapsed() const {
+        return std::chrono::duration_cast<double_second_time>(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(m_end - m_start)).count();
+    }
+
+    /** Return the total counted time (in seconds) */
+    double getCumulated() const {
+        return std::chrono::duration_cast<double_second_time>(m_cumulate).count();
+    }
+
+    /** End the current counter (stop) and return the elapsed time */
+    double stopAndGetElapsed() {
+        stop();
+        return getElapsed();
+    }
+};
+
+#endif
diff --git a/bfps/cpp/distributed_particles.cpp b/bfps/cpp/distributed_particles.cpp
index 7d0808419cc0c7c001e37f38e25395fe3fd559b1..73fd0275d8138d41bb4ee7fbc28e2d41e8017661 100644
--- a/bfps/cpp/distributed_particles.cpp
+++ b/bfps/cpp/distributed_particles.cpp
@@ -24,17 +24,19 @@
 
 
 
-#define NDEBUG
+//#define NDEBUG
 
 #include <cmath>
 #include <cassert>
 #include <cstring>
 #include <string>
 #include <sstream>
+#include <array>
 
 #include "base.hpp"
 #include "distributed_particles.hpp"
 #include "fftw_tools.hpp"
+#include "scope_timer.hpp"
 
 extern int myrank, nprocs;
 
@@ -43,17 +45,17 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 distributed_particles<particle_type, rnumber, interp_neighbours>::distributed_particles(
         const char *NAME,
         const hid_t data_file_id,
-        interpolator<rnumber, interp_neighbours> *FIELD,
+        interpolator<rnumber, interp_neighbours> *VEL,
         const int TRAJ_SKIP,
         const int INTEGRATION_STEPS) : particles_io_base<particle_type>(
             NAME,
             TRAJ_SKIP,
             data_file_id,
-            FIELD->descriptor->comm)
+            VEL->descriptor->comm)
 {
     assert((INTEGRATION_STEPS <= 6) &&
            (INTEGRATION_STEPS >= 1));
-    this->vel = FIELD;
+    this->vel = VEL;
     this->rhs.resize(INTEGRATION_STEPS);
     this->integration_steps = INTEGRATION_STEPS;
     this->state.reserve(2*this->nparticles / this->nprocs);
@@ -72,14 +74,13 @@ void distributed_particles<particle_type, rnumber, 
interp_neighbours>::sample( const std::unordered_map<int, single_particle_state<particle_type>> &x, std::unordered_map<int, single_particle_state<POINT3D>> &y) { - double *yy = new double[3]; + std::array<double, 3> yy; y.clear(); for (auto &pp: x) { - (*field)(pp.second.data, yy); - y[pp.first] = yy; + (*field)(pp.second.data, &yy.front()); + y[pp.first] = &yy.front(); } - delete[] yy; } template <particle_types particle_type, class rnumber, int interp_neighbours> @@ -121,6 +122,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::redistrib std::unordered_map<int, single_particle_state<particle_type>> &x, std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals) { + TIMEZONE("distributed_particles::redistribute"); //DEBUG_MSG("entered redistribute\n"); /* neighbouring rank offsets */ int ro[2]; @@ -312,6 +314,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::AdamsBash template <particle_types particle_type, class rnumber, int interp_neighbours> void distributed_particles<particle_type, rnumber, interp_neighbours>::step() { + TIMEZONE("distributed_particles::step"); this->AdamsBashforth((this->iteration < this->integration_steps) ? this->iteration+1 : this->integration_steps); @@ -368,6 +371,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write( const char *dset_name, std::unordered_map<int, single_particle_state<POINT3D>> &y) { + TIMEZONE("distributed_particles::write"); double *data = new double[this->nparticles*3]; double *yy = new double[this->nparticles*3]; for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) @@ -399,6 +403,7 @@ template <particle_types particle_type, class rnumber, int interp_neighbours> void distributed_particles<particle_type, rnumber, interp_neighbours>::write( const bool write_rhs) { + TIMEZONE("distributed_particles::write2"); double *temp0 = new double[this->chunk_size*state_dimension(particle_type)]; double *temp1 = new double[this->chunk_size*state_dimension(particle_type)]; for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) @@ -411,7 +416,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write( if (pp != this->state.end()) std::copy(pp->second.data, pp->second.data + state_dimension(particle_type), - temp0 + pp->first*state_dimension(particle_type)); + temp0 + p*state_dimension(particle_type)); } MPI_Allreduce( temp0, @@ -433,7 +438,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write( if (pp != this->rhs[i].end()) std::copy(pp->second.data, pp->second.data + state_dimension(particle_type), - temp0 + pp->first*state_dimension(particle_type)); + temp0 + p*state_dimension(particle_type)); } MPI_Allreduce( temp0, diff --git a/bfps/cpp/fftw_interface.hpp b/bfps/cpp/fftw_interface.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2b2e5074b2dc346b00dcfab0090598b486234bb5 --- /dev/null +++ b/bfps/cpp/fftw_interface.hpp @@ -0,0 +1,170 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. 
* +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + +#ifndef FFTW_INTERFACE_HPP +#define FFTW_INTERFACE_HPP + +#include <fftw3-mpi.h> + +#ifdef USE_FFTWESTIMATE +#define DEFAULT_FFTW_FLAG FFTW_ESTIMATE +#warning You are using FFTW estimate +#else +#define DEFAULT_FFTW_FLAG FFTW_PATIENT +#endif + +template <class realtype> +class fftw_interface; + +template <> +class fftw_interface<float> +{ +public: + using real = float; + using complex = fftwf_complex; + using plan = fftwf_plan; + using iodim = fftwf_iodim; + + static complex* alloc_complex(const size_t in_size){ + return fftwf_alloc_complex(in_size); + } + + static real* alloc_real(const size_t in_size){ + return fftwf_alloc_real(in_size); + } + + static void free(void* ptr){ + fftwf_free(ptr); + } + + static void execute(plan in_plan){ + fftwf_execute(in_plan); + } + + static void destroy_plan(plan in_plan){ + fftwf_destroy_plan(in_plan); + } + + template <class ... Params> + static plan mpi_plan_transpose(Params ... params){ + return fftwf_mpi_plan_transpose(params...); + } + + template <class ... Params> + static plan mpi_plan_many_transpose(Params ... params){ + return fftwf_mpi_plan_many_transpose(params...); + } + + template <class ... Params> + static plan plan_guru_r2r(Params ... params){ + return fftwf_plan_guru_r2r(params...); + } + + template <class ... Params> + static plan plan_guru_dft(Params ... params){ + return fftwf_plan_guru_dft(params...); + } + + template <class ... Params> + static plan mpi_plan_many_dft_c2r(Params ... params){ + return fftwf_mpi_plan_many_dft_c2r(params...); + } + + template <class ... Params> + static plan mpi_plan_many_dft_r2c(Params ... params){ + return fftwf_mpi_plan_many_dft_r2c(params...); + } + + template <class ... Params> + static plan mpi_plan_dft_c2r_3d(Params ... params){ + return fftwf_mpi_plan_dft_c2r_3d(params...); + } +}; + +template <> +class fftw_interface<double> +{ +public: + using real = double; + using complex = fftw_complex; + using plan = fftw_plan; + using iodim = fftw_iodim; + + static complex* alloc_complex(const size_t in_size){ + return fftw_alloc_complex(in_size); + } + + static real* alloc_real(const size_t in_size){ + return fftw_alloc_real(in_size); + } + + static void free(void* ptr){ + fftw_free(ptr); + } + + static void execute(plan in_plan){ + fftw_execute(in_plan); + } + + static void destroy_plan(plan in_plan){ + fftw_destroy_plan(in_plan); + } + + template <class ... Params> + static plan mpi_plan_transpose(Params ... params){ + return fftw_mpi_plan_transpose(params...); + } + + template <class ... Params> + static plan mpi_plan_many_transpose(Params ... params){ + return fftw_mpi_plan_many_transpose(params...); + } + + template <class ... Params> + static plan plan_guru_r2r(Params ... params){ + return fftw_plan_guru_r2r(params...); + } + + template <class ... Params> + static plan plan_guru_dft(Params ... params){ + return fftw_plan_guru_dft(params...); + } + + template <class ... Params> + static plan mpi_plan_many_dft_c2r(Params ... 
params){ + return fftw_mpi_plan_many_dft_c2r(params...); + } + + template <class ... Params> + static plan mpi_plan_many_dft_r2c(Params ... params){ + return fftw_mpi_plan_many_dft_r2c(params...); + } + + template <class ... Params> + static plan mpi_plan_dft_c2r_3d(Params ... params){ + return fftw_mpi_plan_dft_c2r_3d(params...); + } +}; + +#endif // FFTW_INTERFACE_HPP diff --git a/bfps/cpp/fftw_tools.cpp b/bfps/cpp/fftw_tools.cpp index f6eacbf1dfe2dfe31e603e9239c42d4639327d3d..61e03d292f81aed1fa4b2dfcab880fb7105b676e 100644 --- a/bfps/cpp/fftw_tools.cpp +++ b/bfps/cpp/fftw_tools.cpp @@ -27,6 +27,7 @@ #include <iostream> #include "base.hpp" #include "fftw_tools.hpp" +#include "fftw_interface.hpp" #define NDEBUG @@ -51,150 +52,171 @@ int clip_zero_padding( return EXIT_SUCCESS; } +template +int clip_zero_padding<float>( + field_descriptor<float> *f, + float *a, + int howmany); +template +int clip_zero_padding<double>( + field_descriptor<double> *f, + double *a, + int howmany); + + + +template <class rnumber> +int copy_complex_array( + field_descriptor<rnumber> *fi, + rnumber (*ai)[2], +field_descriptor<rnumber> *fo, +rnumber (*ao)[2], +int howmany) +{ + DEBUG_MSG("entered copy_complex_array\n"); + typename fftw_interface<rnumber>::complex *buffer; + buffer = fftw_interface<rnumber>::alloc_complex(fi->slice_size*howmany); + + int min_fast_dim; + min_fast_dim = + (fi->sizes[2] > fo->sizes[2]) ? + fo->sizes[2] : fi->sizes[2]; -#define TOOLS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \ -template <> \ -int copy_complex_array<R>( \ - field_descriptor<R> *fi, \ - R (*ai)[2], \ - field_descriptor<R> *fo, \ - R (*ao)[2], \ - int howmany) \ -{ \ - DEBUG_MSG("entered copy_complex_array\n"); \ - FFTW(complex) *buffer; \ - buffer = FFTW(alloc_complex)(fi->slice_size*howmany); \ - \ - int min_fast_dim; \ - min_fast_dim = \ - (fi->sizes[2] > fo->sizes[2]) ? 
\ - fo->sizes[2] : fi->sizes[2]; \ - \ /* clean up destination, in case we're padding with zeros - (even if only for one dimension) */ \ - std::fill_n((R*)ao, fo->local_size*2, 0.0); \ - \ - int64_t ii0, ii1; \ - int64_t oi0, oi1; \ - int64_t delta1, delta0; \ - int irank, orank; \ - delta0 = (fo->sizes[0] - fi->sizes[0]); \ - delta1 = (fo->sizes[1] - fi->sizes[1]); \ - for (ii0=0; ii0 < fi->sizes[0]; ii0++) \ - { \ - if (ii0 <= fi->sizes[0]/2) \ - { \ - oi0 = ii0; \ - if (oi0 > fo->sizes[0]/2) \ - continue; \ - } \ - else \ - { \ - oi0 = ii0 + delta0; \ - if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2)) \ - continue; \ - } \ - irank = fi->rank[ii0]; \ - orank = fo->rank[oi0]; \ - if ((irank == orank) && \ - (irank == fi->myrank)) \ - { \ - std::copy( \ - (R*)(ai + (ii0 - fi->starts[0] )*fi->slice_size), \ - (R*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size), \ - (R*)buffer); \ - } \ - else \ - { \ - if (fi->myrank == irank) \ - { \ - MPI_Send( \ - (void*)(ai + (ii0-fi->starts[0])*fi->slice_size), \ - fi->slice_size, \ - MPI_CNUM, \ - orank, \ - ii0, \ - fi->comm); \ - } \ - if (fi->myrank == orank) \ - { \ - MPI_Recv( \ - (void*)(buffer), \ - fi->slice_size, \ - MPI_CNUM, \ - irank, \ - ii0, \ - fi->comm, \ - MPI_STATUS_IGNORE); \ - } \ - } \ - if (fi->myrank == orank) \ - { \ - for (ii1 = 0; ii1 < fi->sizes[1]; ii1++) \ - { \ - if (ii1 <= fi->sizes[1]/2) \ - { \ - oi1 = ii1; \ - if (oi1 > fo->sizes[1]/2) \ - continue; \ - } \ - else \ - { \ - oi1 = ii1 + delta1; \ - if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2)) \ - continue; \ - } \ - std::copy( \ - (R*)(buffer + (ii1*fi->sizes[2]*howmany)), \ - (R*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany), \ - (R*)(ao + \ - ((oi0 - fo->starts[0])*fo->sizes[1] + \ - oi1)*fo->sizes[2]*howmany)); \ - } \ - } \ - } \ - fftw_free(buffer); \ - MPI_Barrier(fi->comm); \ - \ - DEBUG_MSG("exiting copy_complex_array\n"); \ - return EXIT_SUCCESS; \ -} \ - \ -template <> \ -int get_descriptors_3D<R>( \ - int n0, int n1, int n2, \ - field_descriptor<R> **fr, \ - field_descriptor<R> **fc) \ -{ \ - int ntmp[3]; \ - ntmp[0] = n0; \ - ntmp[1] = n1; \ - ntmp[2] = n2; \ - *fr = new field_descriptor<R>(3, ntmp, MPI_RNUM, MPI_COMM_WORLD); \ - ntmp[0] = n0; \ - ntmp[1] = n1; \ - ntmp[2] = n2/2+1; \ - *fc = new field_descriptor<R>(3, ntmp, MPI_CNUM, MPI_COMM_WORLD); \ - return EXIT_SUCCESS; \ -} \ - \ -template \ -int clip_zero_padding<R>( \ - field_descriptor<R> *f, \ - R *a, \ - int howmany); \ - - - -TOOLS_IMPLEMENTATION( - FFTW_MANGLE_FLOAT, - float, - MPI_FLOAT, - MPI_COMPLEX) -TOOLS_IMPLEMENTATION( - FFTW_MANGLE_DOUBLE, - double, - MPI_DOUBLE, - BFPS_MPICXX_DOUBLE_COMPLEX) + (even if only for one dimension) */ + std::fill_n((rnumber*)ao, fo->local_size*2, 0.0); + + int64_t ii0, ii1; + int64_t oi0, oi1; + int64_t delta1, delta0; + int irank, orank; + delta0 = (fo->sizes[0] - fi->sizes[0]); + delta1 = (fo->sizes[1] - fi->sizes[1]); + for (ii0=0; ii0 < fi->sizes[0]; ii0++) + { + if (ii0 <= fi->sizes[0]/2) + { + oi0 = ii0; + if (oi0 > fo->sizes[0]/2) + continue; + } + else + { + oi0 = ii0 + delta0; + if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2)) + continue; + } + irank = fi->rank[ii0]; + orank = fo->rank[oi0]; + if ((irank == orank) && + (irank == fi->myrank)) + { + std::copy( + (rnumber*)(ai + (ii0 - fi->starts[0] )*fi->slice_size), + (rnumber*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size), + (rnumber*)buffer); + } + else + { + if (fi->myrank == irank) + { + MPI_Send( + (void*)(ai + (ii0-fi->starts[0])*fi->slice_size), + 
fi->slice_size, + mpi_real_type<rnumber>::complex(), + orank, + ii0, + fi->comm); + } + if (fi->myrank == orank) + { + MPI_Recv( + (void*)(buffer), + fi->slice_size, + mpi_real_type<rnumber>::complex(), + irank, + ii0, + fi->comm, + MPI_STATUS_IGNORE); + } + } + if (fi->myrank == orank) + { + for (ii1 = 0; ii1 < fi->sizes[1]; ii1++) + { + if (ii1 <= fi->sizes[1]/2) + { + oi1 = ii1; + if (oi1 > fo->sizes[1]/2) + continue; + } + else + { + oi1 = ii1 + delta1; + if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2)) + continue; + } + std::copy( + (rnumber*)(buffer + (ii1*fi->sizes[2]*howmany)), + (rnumber*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany), + (rnumber*)(ao + + ((oi0 - fo->starts[0])*fo->sizes[1] + + oi1)*fo->sizes[2]*howmany)); + } + } + } + fftw_interface<rnumber>::free(buffer); + MPI_Barrier(fi->comm); + + DEBUG_MSG("exiting copy_complex_array\n"); + return EXIT_SUCCESS; +} + +template +int copy_complex_array<float>( + field_descriptor<float> *fi, + float (*ai)[2], + field_descriptor<float> *fo, + float (*ao)[2], + int howmany); + +template +int copy_complex_array<double>( + field_descriptor<double> *fi, + double (*ai)[2], + field_descriptor<double> *fo, + double (*ao)[2], + int howmany); + + +template <class rnumber> +int get_descriptors_3D( + int n0, int n1, int n2, + field_descriptor<rnumber> **fr, + field_descriptor<rnumber> **fc) +{ + int ntmp[3]; + ntmp[0] = n0; + ntmp[1] = n1; + ntmp[2] = n2; + *fr = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD); + ntmp[0] = n0; + ntmp[1] = n1; + ntmp[2] = n2/2+1; + *fc = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::complex(), MPI_COMM_WORLD); + return EXIT_SUCCESS; +} + +template +int get_descriptors_3D<float>( + int n0, int n1, int n2, + field_descriptor<float> **fr, + field_descriptor<float> **fc); + +template +int get_descriptors_3D<double>( + int n0, int n1, int n2, + field_descriptor<double> **fr, + field_descriptor<double> **fc); diff --git a/bfps/cpp/field.cpp b/bfps/cpp/field.cpp index ad1e77f107113952bd84a5a3f72f2a5d64064f9a..768e723da343cb344fb1e9583e777a621a10e864 100644 --- a/bfps/cpp/field.cpp +++ b/bfps/cpp/field.cpp @@ -23,87 +23,16 @@ **********************************************************************/ +#include <sys/stat.h> +#include <cmath> #include <cstdlib> #include <algorithm> #include <cassert> #include "field.hpp" +#include "scope_timer.hpp" +#include "shared_array.hpp" -template <field_components fc> -field_layout<fc>::field_layout( - const hsize_t *SIZES, - const hsize_t *SUBSIZES, - const hsize_t *STARTS, - const MPI_Comm COMM_TO_USE) -{ - this->comm = COMM_TO_USE; - MPI_Comm_rank(this->comm, &this->myrank); - MPI_Comm_size(this->comm, &this->nprocs); - - std::copy(SIZES, SIZES + 3, this->sizes); - std::copy(SUBSIZES, SUBSIZES + 3, this->subsizes); - std::copy(STARTS, STARTS + 3, this->starts); - if (fc == THREE || fc == THREExTHREE) - { - this->sizes[3] = 3; - this->subsizes[3] = 3; - this->starts[3] = 0; - } - if (fc == THREExTHREE) - { - this->sizes[4] = 3; - this->subsizes[4] = 3; - this->starts[4] = 0; - } - this->local_size = 1; - this->full_size = 1; - for (unsigned int i=0; i<ndim(fc); i++) - { - this->local_size *= this->subsizes[i]; - this->full_size *= this->sizes[i]; - } - /*field will at most be distributed in 2D*/ - this->rank.resize(2); - this->all_start.resize(2); - this->all_size.resize(2); - for (int i=0; i<2; i++) - { - this->rank[i].resize(this->sizes[i]); - std::vector<int> local_rank; - 
local_rank.resize(this->sizes[i], 0); - for (unsigned int ii=this->starts[i]; ii<this->starts[i]+this->subsizes[i]; ii++) - local_rank[ii] = this->myrank; - MPI_Allreduce( - &local_rank.front(), - &this->rank[i].front(), - this->sizes[i], - MPI_INT, - MPI_SUM, - this->comm); - this->all_start[i].resize(this->nprocs); - std::vector<int> local_start; - local_start.resize(this->nprocs, 0); - local_start[this->myrank] = this->starts[i]; - MPI_Allreduce( - &local_start.front(), - &this->all_start[i].front(), - this->nprocs, - MPI_INT, - MPI_SUM, - this->comm); - this->all_size[i].resize(this->nprocs); - std::vector<int> local_subsize; - local_subsize.resize(this->nprocs, 0); - local_subsize[this->myrank] = this->subsizes[i]; - MPI_Allreduce( - &local_subsize.front(), - &this->all_size[i].front(), - this->nprocs, - MPI_INT, - MPI_SUM, - this->comm); - } -} template <typename rnumber, field_backend be, @@ -115,6 +44,7 @@ field<rnumber, be, fc>::field( const MPI_Comm COMM_TO_USE, const unsigned FFTW_PLAN_RIGOR) { + TIMEZONE("field::field"); this->comm = COMM_TO_USE; MPI_Comm_rank(this->comm, &this->myrank); MPI_Comm_size(this->comm, &this->nprocs); @@ -164,47 +94,27 @@ field<rnumber, be, fc>::field( starts[0] = local_0_start; starts[1] = 0; starts[2] = 0; this->rmemlayout = new field_layout<fc>( sizes, subsizes, starts, this->comm); - sizes[0] = nz; sizes[1] = ny; sizes[2] = nx/2+1; - subsizes[0] = local_n1; subsizes[1] = ny; subsizes[2] = nx/2+1; + sizes[0] = ny; sizes[1] = nz; sizes[2] = nx/2+1; + subsizes[0] = local_n1; subsizes[1] = nz; subsizes[2] = nx/2+1; starts[0] = local_1_start; starts[1] = 0; starts[2] = 0; this->clayout = new field_layout<fc>( sizes, subsizes, starts, this->comm); - this->data = (rnumber*)fftw_malloc( - sizeof(rnumber)*this->rmemlayout->local_size); - if(typeid(rnumber) == typeid(float)) - { - this->c2r_plan = new fftwf_plan; - this->r2c_plan = new fftwf_plan; - *((fftwf_plan*)this->c2r_plan) = fftwf_mpi_plan_many_dft_c2r( - 3, nfftw, ncomp(fc), - FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, - (fftwf_complex*)this->data, (float*)this->data, - this->comm, - this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); - *((fftwf_plan*)this->r2c_plan) = fftwf_mpi_plan_many_dft_r2c( - 3, nfftw, ncomp(fc), - FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, - (float*)this->data, (fftwf_complex*)this->data, - this->comm, - this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); - } - if (typeid(rnumber) == typeid(double)) - { - this->c2r_plan = new fftw_plan; - this->r2c_plan = new fftw_plan; - *((fftw_plan*)this->c2r_plan) = fftw_mpi_plan_many_dft_c2r( - 3, nfftw, ncomp(fc), - FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, - (fftw_complex*)this->data, (double*)this->data, - this->comm, - this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); - *((fftw_plan*)this->r2c_plan) = fftw_mpi_plan_many_dft_r2c( - 3, nfftw, ncomp(fc), - FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, - (double*)this->data, (fftw_complex*)this->data, - this->comm, - this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); - } + this->data = fftw_interface<rnumber>::alloc_real( + this->rmemlayout->local_size); + this->c2r_plan = fftw_interface<rnumber>::mpi_plan_many_dft_c2r( + 3, nfftw, ncomp(fc), + FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + (typename fftw_interface<rnumber>::complex*)this->data, + this->data, + this->comm, + this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + this->r2c_plan = fftw_interface<rnumber>::mpi_plan_many_dft_r2c( + 3, nfftw, ncomp(fc), + FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + 
this->data, + (typename fftw_interface<rnumber>::complex*)this->data, + this->comm, + this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); break; } } @@ -223,21 +133,9 @@ field<rnumber, be, fc>::~field() delete this->rlayout; delete this->rmemlayout; delete this->clayout; - fftw_free(this->data); - if (typeid(rnumber) == typeid(float)) - { - fftwf_destroy_plan(*(fftwf_plan*)this->c2r_plan); - delete (fftwf_plan*)this->c2r_plan; - fftwf_destroy_plan(*(fftwf_plan*)this->r2c_plan); - delete (fftwf_plan*)this->r2c_plan; - } - else if (typeid(rnumber) == typeid(double)) - { - fftw_destroy_plan(*(fftw_plan*)this->c2r_plan); - delete (fftw_plan*)this->c2r_plan; - fftw_destroy_plan(*(fftw_plan*)this->r2c_plan); - delete (fftw_plan*)this->r2c_plan; - } + fftw_interface<rnumber>::free(this->data); + fftw_interface<rnumber>::destroy_plan(this->c2r_plan); + fftw_interface<rnumber>::destroy_plan(this->r2c_plan); break; } } @@ -247,10 +145,8 @@ template <typename rnumber, field_components fc> void field<rnumber, be, fc>::ift() { - if (typeid(rnumber) == typeid(float)) - fftwf_execute(*((fftwf_plan*)this->c2r_plan)); - else if (typeid(rnumber) == typeid(double)) - fftw_execute(*((fftw_plan*)this->c2r_plan)); + TIMEZONE("field::ift"); + fftw_interface<rnumber>::execute(this->c2r_plan); this->real_space_representation = true; } @@ -259,10 +155,8 @@ template <typename rnumber, field_components fc> void field<rnumber, be, fc>::dft() { - if (typeid(rnumber) == typeid(float)) - fftwf_execute(*((fftwf_plan*)this->r2c_plan)); - else if (typeid(rnumber) == typeid(double)) - fftw_execute(*((fftw_plan*)this->r2c_plan)); + TIMEZONE("field::dft"); + fftw_interface<rnumber>::execute(this->r2c_plan); this->real_space_representation = false; } @@ -271,59 +165,340 @@ template <typename rnumber, field_components fc> int field<rnumber, be, fc>::io( const std::string fname, - const std::string dset_name, + const std::string field_name, + const int iteration, + const bool read) +{ + /* file dataset has same dimensions as field */ + TIMEZONE("field::io"); + hid_t file_id, dset_id, plist_id; + dset_id = H5I_BADID; + std::string representation = std::string( + this->real_space_representation ? 
+ "real" : "complex"); + std::string dset_name = ( + "/" + field_name + + "/" + representation + + "/" + std::to_string(iteration)); + + /* open/create file */ + plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, this->comm, MPI_INFO_NULL); + bool file_exists = false; + { + struct stat file_buffer; + file_exists = (stat(fname.c_str(), &file_buffer) == 0); + } + if (read) + { + assert(file_exists); + file_id = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, plist_id); + } + else + { + if (file_exists) + file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id); + else + file_id = H5Fcreate(fname.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, plist_id); + } + assert(file_id >= 0); + H5Pclose(plist_id); + + /* check what kind of representation is being used */ + if (read) + { + dset_id = H5Dopen( + file_id, + dset_name.c_str(), + H5P_DEFAULT); + assert(dset_id >= 0); + hid_t dset_type = H5Dget_type(dset_id); + assert(dset_type >= 0); + bool io_for_real = ( + H5Tequal(dset_type, H5T_IEEE_F32BE) || + H5Tequal(dset_type, H5T_IEEE_F32LE) || + H5Tequal(dset_type, H5T_INTEL_F32) || + H5Tequal(dset_type, H5T_NATIVE_FLOAT) || + H5Tequal(dset_type, H5T_IEEE_F64BE) || + H5Tequal(dset_type, H5T_IEEE_F64LE) || + H5Tequal(dset_type, H5T_INTEL_F64) || + H5Tequal(dset_type, H5T_NATIVE_DOUBLE)); + H5Tclose(dset_type); + assert(this->real_space_representation == io_for_real); + } + + /* generic space initialization */ + hid_t fspace, mspace; + hsize_t count[ndim(fc)], offset[ndim(fc)], dims[ndim(fc)]; + hsize_t memoffset[ndim(fc)], memshape[ndim(fc)]; + + if (this->real_space_representation) + { + for (unsigned int i=0; i<ndim(fc); i++) + { + count[i] = this->rlayout->subsizes[i]; + offset[i] = this->rlayout->starts[i]; + dims[i] = this->rlayout->sizes[i]; + memshape[i] = this->rmemlayout->subsizes[i]; + memoffset[i] = 0; + } + } + else + { + for (unsigned int i=0; i<ndim(fc); i++) + { + count [i] = this->clayout->subsizes[i]; + offset[i] = this->clayout->starts[i]; + dims [i] = this->clayout->sizes[i]; + memshape [i] = count[i]; + memoffset[i] = 0; + } + } + mspace = H5Screate_simple(ndim(fc), memshape, NULL); + H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL); + + /* open/create data set */ + if (read) + fspace = H5Dget_space(dset_id); + else + { + if (!H5Lexists(file_id, field_name.c_str(), H5P_DEFAULT)) + { + hid_t gid_tmp = H5Gcreate( + file_id, field_name.c_str(), + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Gclose(gid_tmp); + } + + if (!H5Lexists(file_id, (field_name + "/" + representation).c_str(), H5P_DEFAULT)) + { + hid_t gid_tmp = H5Gcreate( + file_id, ("/" + field_name + "/" + representation).c_str(), + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Gclose(gid_tmp); + } + if (H5Lexists(file_id, dset_name.c_str(), H5P_DEFAULT)) + { + dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT); + fspace = H5Dget_space(dset_id); + } + else + { + fspace = H5Screate_simple( + ndim(fc), + dims, + NULL); + /* chunking needs to go in here */ + dset_id = H5Dcreate( + file_id, + dset_name.c_str(), + (this->real_space_representation ? 
this->rnumber_H5T : this->cnumber_H5T), + fspace, + H5P_DEFAULT, + H5P_DEFAULT, + H5P_DEFAULT); + } + } + /* both dset_id and fspace should now have sane values */ + + /* check file space */ + int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL); + assert(((unsigned int)(ndims_fspace)) == ndim(fc)); + if (this->real_space_representation) + { + for (unsigned int i=0; i<ndim(fc); i++) + { + offset[i] = this->rlayout->starts[i]; + assert(dims[i] == this->rlayout->sizes[i]); + } + H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL); + if (read) + { + std::fill_n(this->data, this->rmemlayout->local_size, 0); + H5Dread(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); + } + else + { + assert(this->real_space_representation); + H5Dwrite(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); + } + H5Sclose(mspace); + } + else + { + for (unsigned int i=0; i<ndim(fc); i++) + { + offset[i] = this->clayout->starts[i]; + assert(dims[i] == this->clayout->sizes[i]); + } + H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL); + if (read) + { + std::fill_n(this->data, this->clayout->local_size*2, 0); + H5Dread(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); + this->symmetrize(); + } + else + { + assert(!this->real_space_representation); + H5Dwrite(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); + } + H5Sclose(mspace); + } + + H5Sclose(fspace); + /* close data set */ + H5Dclose(dset_id); + /* close file */ + H5Fclose(file_id); + return EXIT_SUCCESS; +} + +template <typename rnumber, + field_backend be, + field_components fc> +int field<rnumber, be, fc>::io_database( + const std::string fname, + const std::string field_name, const int toffset, const bool read) { + /* file dataset is has a time dimension as well */ + TIMEZONE("field::io_database"); hid_t file_id, dset_id, plist_id; - hid_t dset_type; - bool io_for_real = false; + dset_id = H5I_BADID; + std::string representation = std::string( + this->real_space_representation ? 
+ "real" : "complex"); + std::string dset_name = ( + "/" + field_name + + "/" + representation); - /* open file */ + /* open/create file */ plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, this->comm, MPI_INFO_NULL); + bool file_exists = false; + { + struct stat file_buffer; + file_exists = (stat(fname.c_str(), &file_buffer) == 0); + } if (read) + { + assert(file_exists); file_id = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, plist_id); + } else - file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id); + { + if (file_exists) + file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id); + else + file_id = H5Fcreate(fname.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, plist_id); + } H5Pclose(plist_id); - /* open data set */ - dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT); - dset_type = H5Dget_type(dset_id); - io_for_real = ( - H5Tequal(dset_type, H5T_IEEE_F32BE) || - H5Tequal(dset_type, H5T_IEEE_F32LE) || - H5Tequal(dset_type, H5T_INTEL_F32) || - H5Tequal(dset_type, H5T_NATIVE_FLOAT) || - H5Tequal(dset_type, H5T_IEEE_F64BE) || - H5Tequal(dset_type, H5T_IEEE_F64LE) || - H5Tequal(dset_type, H5T_INTEL_F64) || - H5Tequal(dset_type, H5T_NATIVE_DOUBLE)); + /* check what kind of representation is being used */ + if (read) + { + dset_id = H5Dopen( + file_id, + dset_name.c_str(), + H5P_DEFAULT); + hid_t dset_type = H5Dget_type(dset_id); + bool io_for_real = ( + H5Tequal(dset_type, H5T_IEEE_F32BE) || + H5Tequal(dset_type, H5T_IEEE_F32LE) || + H5Tequal(dset_type, H5T_INTEL_F32) || + H5Tequal(dset_type, H5T_NATIVE_FLOAT) || + H5Tequal(dset_type, H5T_IEEE_F64BE) || + H5Tequal(dset_type, H5T_IEEE_F64LE) || + H5Tequal(dset_type, H5T_INTEL_F64) || + H5Tequal(dset_type, H5T_NATIVE_DOUBLE)); + H5Tclose(dset_type); + assert(this->real_space_representation == io_for_real); + } /* generic space initialization */ hid_t fspace, mspace; - fspace = H5Dget_space(dset_id); hsize_t count[ndim(fc)+1], offset[ndim(fc)+1], dims[ndim(fc)+1]; hsize_t memoffset[ndim(fc)+1], memshape[ndim(fc)+1]; - H5Sget_simple_extent_dims(fspace, dims, NULL); + + int dim_counter_offset = 1; + dim_counter_offset = 1; count[0] = 1; - offset[0] = toffset; memshape[0] = 1; memoffset[0] = 0; - if (io_for_real) + if (this->real_space_representation) { for (unsigned int i=0; i<ndim(fc); i++) { - count[i+1] = this->rlayout->subsizes[i]; - offset[i+1] = this->rlayout->starts[i]; - assert(dims[i+1] == this->rlayout->sizes[i]); - memshape[i+1] = this->rmemlayout->subsizes[i]; - memoffset[i+1] = 0; + count[i+dim_counter_offset] = this->rlayout->subsizes[i]; + offset[i+dim_counter_offset] = this->rlayout->starts[i]; + dims[i+dim_counter_offset] = this->rlayout->sizes[i]; + memshape[i+dim_counter_offset] = this->rmemlayout->subsizes[i]; + memoffset[i+dim_counter_offset] = 0; } - mspace = H5Screate_simple(ndim(fc)+1, memshape, NULL); - H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL); + mspace = H5Screate_simple(dim_counter_offset + ndim(fc), memshape, NULL); + H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL); + } + else + { + for (unsigned int i=0; i<ndim(fc); i++) + { + count[i+dim_counter_offset] = this->clayout->subsizes[i]; + offset[i+dim_counter_offset] = this->clayout->starts[i]; + dims[i+dim_counter_offset] = this->clayout->sizes[i]; + memshape[i+dim_counter_offset] = count[i+dim_counter_offset]; + memoffset[i+dim_counter_offset] = 0; + } + mspace = H5Screate_simple(dim_counter_offset + ndim(fc), memshape, NULL); H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL); + 
} + + /* open/create data set */ + if (read) + fspace = H5Dget_space(dset_id); + else + { + if (!H5Lexists(file_id, field_name.c_str(), H5P_DEFAULT)) + H5Gcreate( + file_id, field_name.c_str(), + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + if (H5Lexists(file_id, dset_name.c_str(), H5P_DEFAULT)) + { + dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT); + fspace = H5Dget_space(dset_id); + } + else + { + fspace = H5Screate_simple( + ndim(fc), + dims, + NULL); + /* chunking needs to go in here */ + dset_id = H5Dcreate( + file_id, + dset_name.c_str(), + (this->real_space_representation ? this->rnumber_H5T : this->cnumber_H5T), + fspace, + H5P_DEFAULT, + H5P_DEFAULT, + H5P_DEFAULT); + } + } + /* both dset_id and fspace should now have sane values */ + + /* check file space */ + int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL); + assert(ndims_fspace == int(ndim(fc) + 1)); + offset[0] = toffset; + if (this->real_space_representation) + { + for (unsigned int i=0; i<ndim(fc); i++) + { + offset[i+dim_counter_offset] = this->rlayout->starts[i]; + assert(dims[i+dim_counter_offset] == this->rlayout->sizes[i]); + } + H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL); if (read) { std::fill_n(this->data, this->rmemlayout->local_size, 0); @@ -332,13 +507,8 @@ int field<rnumber, be, fc>::io( } else { + assert(this->real_space_representation); H5Dwrite(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); - if (!this->real_space_representation) - /* in principle we could do an inverse Fourier transform in here, - * however that would be unsafe since we wouldn't know whether we'd need to - * normalize or not. - * */ - DEBUG_MSG("I just wrote complex field into real space dataset. It's probably nonsense.\n"); } H5Sclose(mspace); } @@ -346,30 +516,24 @@ int field<rnumber, be, fc>::io( { for (unsigned int i=0; i<ndim(fc); i++) { - count[i+1] = this->clayout->subsizes[i]; - offset[i+1] = this->clayout->starts[i]; - assert(dims[i+1] == this->clayout->sizes[i]); - memshape[i+1] = count[i+1]; - memoffset[i+1] = 0; + offset[i+dim_counter_offset] = this->clayout->starts[i]; + assert(dims[i+dim_counter_offset] == this->clayout->sizes[i]); } - mspace = H5Screate_simple(ndim(fc)+1, memshape, NULL); H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL); - H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL); if (read) { H5Dread(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); this->real_space_representation = false; + this->symmetrize(); } else { + assert(!this->real_space_representation); H5Dwrite(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data); - if (this->real_space_representation) - DEBUG_MSG("I just wrote real space field into complex dataset. 
It's probably nonsense.\n"); } H5Sclose(mspace); } - H5Tclose(dset_type); H5Sclose(fspace); /* close data set */ H5Dclose(dset_id); @@ -378,17 +542,127 @@ int field<rnumber, be, fc>::io( return EXIT_SUCCESS; } + template <typename rnumber, field_backend be, field_components fc> -void field<rnumber, be, fc>::compute_rspace_stats( +int field<rnumber, be, fc>::write_0slice( + const hid_t group, + const std::string field_name, + const int iteration) +{ + TIMEZONE("field::write_0slice"); + assert(this->real_space_representation); + assert(fc == THREE); + if (this->myrank == 0) + { + hid_t dset, wspace, mspace; + int ndims; + hsize_t count[4], offset[4], dims[4]; + offset[0] = iteration; + offset[1] = 0; + offset[2] = 0; + offset[3] = 0; + dset = H5Dopen( + group, + ("0slices/" + field_name + "/real").c_str(), + H5P_DEFAULT); + wspace = H5Dget_space(dset); + ndims = H5Sget_simple_extent_dims(wspace, dims, NULL); + // array in memory has 2 extra x points, because FFTW + count[0] = 1; + count[1] = this->rmemlayout->sizes[1]; + count[2] = this->rmemlayout->sizes[2]; + count[3] = 3; + mspace = H5Screate_simple(ndims, count, NULL); + // array in file should not have the extra 2 points + count[1] = this->rlayout->sizes[1]; + count[2] = this->rlayout->sizes[2]; + // select right slice in file + H5Sselect_hyperslab( + wspace, + H5S_SELECT_SET, + offset, + NULL, + count, + NULL); + offset[0] = 0; + // select proper regions of memory + H5Sselect_hyperslab( + mspace, + H5S_SELECT_SET, + offset, + NULL, + count, + NULL); + H5Dwrite( + dset, + this->rnumber_H5T, + mspace, + wspace, + H5P_DEFAULT, + this->data); + H5Dclose(dset); + H5Sclose(mspace); + H5Sclose(wspace); + } + return EXIT_SUCCESS; +} + + +template <typename rnumber, + field_backend be, + field_components fc> +void field<rnumber, be, fc>::compute_rspace_xincrement_stats( + const int xcells, const hid_t group, const std::string dset_name, const hsize_t toffset, const std::vector<double> max_estimate) { + TIMEZONE("field::compute_rspace_xincrement_stats"); assert(this->real_space_representation); assert(fc == ONE || fc == THREE); + field<rnumber, be, fc> *tmp_field = new field<rnumber, be, fc>( + this->rlayout->sizes[2], + this->rlayout->sizes[1], + this->rlayout->sizes[0], + this->rlayout->comm); + tmp_field->real_space_representation = true; + this->RLOOP( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + hsize_t rrindex = (xindex + xcells)%this->rlayout->sizes[2] + ( + zindex * this->rlayout->subsizes[1] + yindex)*( + this->rmemlayout->subsizes[2]); + for (unsigned int component=0; component < ncomp(fc); component++) + tmp_field->data[rindex*ncomp(fc) + component] = + this->data[rrindex*ncomp(fc) + component] - + this->data[rindex*ncomp(fc) + component]; + }); + tmp_field->compute_rspace_stats( + group, + dset_name, + toffset, + max_estimate); + delete tmp_field; +} + + + +template <typename rnumber, + field_backend be, + field_components fc> +void field<rnumber, be, fc>::compute_rspace_stats( + const hid_t group, + const std::string dset_name, + const hsize_t toffset, + const std::vector<double> max_estimate) +{ + TIMEZONE("field::compute_rspace_stats"); + assert(this->real_space_representation); const unsigned int nmoments = 10; int nvals, nbins; if (this->myrank == 0) @@ -427,25 +701,41 @@ void field<rnumber, be, fc>::compute_rspace_stats( H5Sclose(wspace); H5Dclose(dset); } - MPI_Bcast(&nvals, 1, MPI_INT, 0, this->comm); - MPI_Bcast(&nbins, 1, MPI_INT, 0, this->comm); + { + TIMEZONE("MPI_Bcast"); + 
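+        // nvals and nbins were read from the HDF5 file by rank 0 alone, so
+        // they must be broadcast before the other ranks can size their
+        // local histogram and moment buffers.
+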
MPI_Bcast(&nvals, 1, MPI_INT, 0, this->comm); + MPI_Bcast(&nbins, 1, MPI_INT, 0, this->comm); + } assert(nvals == int(max_estimate.size())); - double *moments = new double[nmoments*nvals]; - double *local_moments = new double[nmoments*nvals]; - double *val_tmp = new double[nvals]; + + shared_array<double> local_moments_threaded(nmoments*nvals, [&](double* local_moments){ + std::fill_n(local_moments, nmoments*nvals, 0); + if (nvals == 4) local_moments[3] = max_estimate[3]; + }); + + shared_array<double> val_tmp_threaded(nvals,[&](double *val_tmp){ + std::fill_n(val_tmp, nvals, 0); + }); + + shared_array<ptrdiff_t> local_hist_threaded(nbins*nvals,[&](ptrdiff_t* local_hist){ + std::fill_n(local_hist, nbins*nvals, 0); + }); + double *binsize = new double[nvals]; - double *pow_tmp = new double[nvals]; - ptrdiff_t *hist = new ptrdiff_t[nbins*nvals]; - ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals]; - int bin; for (int i=0; i<nvals; i++) binsize[i] = 2*max_estimate[i] / nbins; - std::fill_n(local_hist, nbins*nvals, 0); - std::fill_n(local_moments, nmoments*nvals, 0); - if (nvals == 4) local_moments[3] = max_estimate[3]; - FIELD_RLOOP( - this, - std::fill_n(pow_tmp, nvals, 1.0); + + { + TIMEZONE("field::RLOOP"); + this->RLOOP( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + double *local_moments = local_moments_threaded.getMine(); + double *val_tmp = val_tmp_threaded.getMine(); + ptrdiff_t *local_hist = local_hist_threaded.getMine(); + if (nvals == int(4)) val_tmp[3] = 0.0; for (unsigned int i=0; i<ncomp(fc); i++) { @@ -459,9 +749,10 @@ void field<rnumber, be, fc>::compute_rspace_stats( local_moments[0*nvals+3] = val_tmp[3]; if (val_tmp[3] > local_moments[9*nvals+3]) local_moments[9*nvals+3] = val_tmp[3]; - bin = int(floor(val_tmp[3]*2/binsize[3])); - if (bin >= 0 && bin < nbins) + int bin = int(floor(val_tmp[3]*2/binsize[3])); + if (bin >= 0 && bin < nbins){ local_hist[bin*nvals+3]++; + } } for (unsigned int i=0; i<ncomp(fc); i++) { @@ -469,44 +760,70 @@ void field<rnumber, be, fc>::compute_rspace_stats( local_moments[0*nvals+i] = val_tmp[i]; if (val_tmp[i] > local_moments[(nmoments-1)*nvals+i]) local_moments[(nmoments-1)*nvals+i] = val_tmp[i]; - bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); + int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); if (bin >= 0 && bin < nbins) local_hist[bin*nvals+i]++; } - for (int n=1; n < int(nmoments)-1; n++) - for (int i=0; i<nvals; i++) - local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]); - ); - MPI_Allreduce( - (void*)local_moments, - (void*)moments, - nvals, - MPI_DOUBLE, MPI_MIN, this->comm); - MPI_Allreduce( - (void*)(local_moments + nvals), - (void*)(moments+nvals), - (nmoments-2)*nvals, - MPI_DOUBLE, MPI_SUM, this->comm); - MPI_Allreduce( - (void*)(local_moments + (nmoments-1)*nvals), - (void*)(moments+(nmoments-1)*nvals), - nvals, - MPI_DOUBLE, MPI_MAX, this->comm); - MPI_Allreduce( - (void*)local_hist, - (void*)hist, - nbins*nvals, - MPI_INT64_T, MPI_SUM, this->comm); + for (int n=1; n < int(nmoments)-1; n++){ + double pow_tmp = 1; + for (int i=0; i<nvals; i++){ + local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp); + } + } + }); + + TIMEZONE("FIELD_RLOOP::Merge"); + local_moments_threaded.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double { + if(nvals == int(4) && idx == 0*nvals+3){ + return std::min(v1, v2); + } + if(nvals == int(4) && idx == 9*nvals+3){ + return std::max(v1, v2); + } + if(idx < int(ncomp(fc))){ + return 
std::min(v1, v2); + } + if(int(nmoments-1)*nvals <= idx && idx < int(int(nmoments-1)*nvals+ncomp(fc))){ + return std::max(v1, v2); + } + return v1 + v2; + }); + + local_hist_threaded.mergeParallel(); + } + ptrdiff_t *hist = new ptrdiff_t[nbins*nvals]; + double *moments = new double[nmoments*nvals]; + { + TIMEZONE("MPI_Allreduce"); + MPI_Allreduce( + (void*)local_moments_threaded.getMasterData(), + (void*)moments, + nvals, + MPI_DOUBLE, MPI_MIN, this->comm); + MPI_Allreduce( + (void*)(local_moments_threaded.getMasterData() + nvals), + (void*)(moments+nvals), + (nmoments-2)*nvals, + MPI_DOUBLE, MPI_SUM, this->comm); + MPI_Allreduce( + (void*)(local_moments_threaded.getMasterData() + (nmoments-1)*nvals), + (void*)(moments+(nmoments-1)*nvals), + nvals, + MPI_DOUBLE, MPI_MAX, this->comm); + MPI_Allreduce( + (void*)local_hist_threaded.getMasterData(), + (void*)hist, + nbins*nvals, + MPI_INT64_T, MPI_SUM, this->comm); + } for (int n=1; n < int(nmoments)-1; n++) for (int i=0; i<nvals; i++) moments[n*nvals + i] /= this->npoints; - delete[] local_moments; - delete[] local_hist; - delete[] val_tmp; + delete[] binsize; - delete[] pow_tmp; if (this->myrank == 0) { + TIMEZONE("root-work"); hid_t dset, wspace, mspace; hsize_t count[ndim(fc)-1], offset[ndim(fc)-1], dims[ndim(fc)-1]; dset = H5Dopen(group, ("moments/" + dset_name).c_str(), H5P_DEFAULT); @@ -543,6 +860,11 @@ void field<rnumber, be, fc>::compute_rspace_stats( H5Sclose(wspace); H5Sclose(mspace); H5Dclose(dset); + if (H5Lexists(group, "0slices", H5P_DEFAULT)) + this->write_0slice( + group, + dset_name, + toffset); } delete[] moments; delete[] hist; @@ -557,6 +879,86 @@ void field<rnumber, be, fc>::normalize() this->data[tmp_index] /= this->npoints; } +template <typename rnumber, + field_backend be, + field_components fc> +void field<rnumber, be, fc>::symmetrize() +{ + TIMEZONE("field::symmetrize"); + assert(!this->real_space_representation); + ptrdiff_t ii, cc; + typename fftw_interface<rnumber>::complex *data = this->get_cdata(); + MPI_Status *mpistatus = new MPI_Status; + if (this->myrank == this->clayout->rank[0][0]) + { + for (cc = 0; cc < ncomp(fc); cc++) + data[cc][1] = 0.0; + for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]/2); ii++) + for (cc = 0; cc < ncomp(fc); cc++) { + ( *(data + cc + ncomp(fc)*(this->clayout->sizes[1] - ii)*this->clayout->sizes[2]))[0] = + (*(data + cc + ncomp(fc)*( ii)*this->clayout->sizes[2]))[0]; + ( *(data + cc + ncomp(fc)*(this->clayout->sizes[1] - ii)*this->clayout->sizes[2]))[1] = + -(*(data + cc + ncomp(fc)*( ii)*this->clayout->sizes[2]))[1]; + } + } + typename fftw_interface<rnumber>::complex *buffer; + buffer = fftw_interface<rnumber>::alloc_complex(ncomp(fc)*this->clayout->sizes[1]); + ptrdiff_t yy; + /*ptrdiff_t tindex;*/ + int ranksrc, rankdst; + for (yy = 1; yy < ptrdiff_t(this->clayout->sizes[0]/2); yy++) { + ranksrc = this->clayout->rank[0][yy]; + rankdst = this->clayout->rank[0][this->clayout->sizes[0] - yy]; + if (this->clayout->myrank == ranksrc) + for (ii = 0; ii < ptrdiff_t(this->clayout->sizes[1]); ii++) + for (cc = 0; cc < ncomp(fc); cc++) + for (int imag_comp=0; imag_comp<2; imag_comp++) + (*(buffer + ncomp(fc)*ii+cc))[imag_comp] = + (*(data + ncomp(fc)*((yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[imag_comp]; + if (ranksrc != rankdst) + { + if (this->clayout->myrank == ranksrc) + MPI_Send((void*)buffer, + ncomp(fc)*this->clayout->sizes[1], mpi_real_type<rnumber>::complex(), rankdst, yy, + this->clayout->comm); + if (this->clayout->myrank == 
rankdst) + MPI_Recv((void*)buffer, + ncomp(fc)*this->clayout->sizes[1], mpi_real_type<rnumber>::complex(), ranksrc, yy, + this->clayout->comm, mpistatus); + } + if (this->clayout->myrank == rankdst) + { + for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]); ii++) + for (cc = 0; cc < ncomp(fc); cc++) + { + (*(data + ncomp(fc)*((this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[0] = + (*(buffer + ncomp(fc)*(this->clayout->sizes[1]-ii)+cc))[0]; + (*(data + ncomp(fc)*((this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[1] = + -(*(buffer + ncomp(fc)*(this->clayout->sizes[1]-ii)+cc))[1]; + } + for (cc = 0; cc < ncomp(fc); cc++) + { + (*((data + cc + ncomp(fc)*(this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2])))[0] = (*(buffer + cc))[0]; + (*((data + cc + ncomp(fc)*(this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2])))[1] = -(*(buffer + cc))[1]; + } + } + } + fftw_interface<rnumber>::free(buffer); + delete mpistatus; + /* put asymmetric data to 0 */ + /*if (this->clayout->myrank == this->clayout->rank[0][this->clayout->sizes[0]/2]) + { + tindex = ncomp(fc)*(this->clayout->sizes[0]/2 - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2]; + for (ii = 0; ii < this->clayout->sizes[1]; ii++) + { + std::fill_n((rnumber*)(data + tindex), ncomp(fc)*2*this->clayout->sizes[2], 0.0); + tindex += ncomp(fc)*this->clayout->sizes[2]; + } + } + tindex = ncomp(fc)*(); + std::fill_n((rnumber*)(data + tindex), ncomp(fc)*2, 0.0);*/ +} + template <typename rnumber, field_backend be, field_components fc> @@ -568,6 +970,7 @@ void field<rnumber, be, fc>::compute_stats( const hsize_t toffset, const double max_estimate) { + TIMEZONE("field::compute_stats"); std::vector<double> max_estimate_vector; bool did_rspace = false; switch(fc) @@ -585,6 +988,7 @@ void field<rnumber, be, fc>::compute_stats( } if (this->real_space_representation) { + TIMEZONE("field::compute_stats::compute_rspace_stats"); this->compute_rspace_stats( group, dset_name, @@ -593,14 +997,15 @@ void field<rnumber, be, fc>::compute_stats( did_rspace = true; this->dft(); // normalize + TIMEZONE("field::normalize"); for (hsize_t tmp_index=0; tmp_index<this->rmemlayout->local_size; tmp_index++) this->data[tmp_index] /= this->npoints; } // what follows gave me a headache until I found this link: // http://stackoverflow.com/questions/8256636/expected-primary-expression-error-on-template-method-using kk->template cospectrum<rnumber, fc>( - (cnumber*)this->data, - (cnumber*)this->data, + (typename fftw_interface<rnumber>::complex*)this->data, + (typename fftw_interface<rnumber>::complex*)this->data, group, dset_name + "_" + dset_name, toffset); @@ -616,218 +1021,62 @@ void field<rnumber, be, fc>::compute_stats( } } -template <field_backend be, - kspace_dealias_type dt> -template <field_components fc> -kspace<be, dt>::kspace( - const field_layout<fc> *source_layout, - const double DKX, - const double DKY, - const double DKZ) -{ - /* get layout */ - this->layout = new field_layout<ONE>( - source_layout->sizes, - source_layout->subsizes, - source_layout->starts, - source_layout->comm); - - /* store dk values */ - this->dkx = DKX; - this->dky = DKY; - this->dkz = DKZ; - - /* compute kx, ky, kz and compute kM values */ - switch(be) - { - case FFTW: - this->kx.resize(this->layout->sizes[2]); - 
this->ky.resize(this->layout->subsizes[0]); - this->kz.resize(this->layout->sizes[1]); - int i, ii; - for (i = 0; i<int(this->layout->sizes[2]); i++) - this->kx[i] = i*this->dkx; - for (i = 0; i<int(this->layout->subsizes[0]); i++) - { - ii = i + this->layout->starts[0]; - if (ii <= int(this->layout->sizes[1]/2)) - this->ky[i] = this->dky*ii; - else - this->ky[i] = this->dky*(ii - int(this->layout->sizes[1])); - } - for (i = 0; i<int(this->layout->sizes[1]); i++) - { - if (i <= int(this->layout->sizes[0]/2)) - this->kz[i] = this->dkz*i; - else - this->kz[i] = this->dkz*(i - int(this->layout->sizes[0])); - } - switch(dt) - { - case TWO_THIRDS: - this->kMx = this->dkx*(int(2*(int(this->layout->sizes[2])-1)/3)-1); - this->kMy = this->dky*(int(this->layout->sizes[0] / 3)-1); - this->kMz = this->dkz*(int(this->layout->sizes[1] / 3)-1); - break; - case SMOOTH: - this->kMx = this->dkx*(int(this->layout->sizes[2])-2); - this->kMy = this->dky*(int(this->layout->sizes[0] / 2)-1); - this->kMz = this->dkz*(int(this->layout->sizes[1] / 2)-1); - break; - } - break; - } - - /* get global kM and dk */ - this->kM = this->kMx; - if (this->kM < this->kMy) this->kM = this->kMy; - if (this->kM < this->kMz) this->kM = this->kMz; - this->kM2 = this->kM * this->kM; - this->dk = this->dkx; - if (this->dk > this->dky) this->dk = this->dky; - if (this->dk > this->dkz) this->dk = this->dkz; - this->dk2 = this->dk*this->dk; - - /* spectra stuff */ - this->nshells = int(this->kM / this->dk) + 2; - this->kshell.resize(this->nshells, 0); - this->nshell.resize(this->nshells, 0); - std::vector<double> kshell_local; - kshell_local.resize(this->nshells, 0); - std::vector<int64_t> nshell_local; - nshell_local.resize(this->nshells, 0); - double knorm; - KSPACE_CLOOP_K2_NXMODES( - this, - if (k2 < this->kM2) - { - knorm = sqrt(k2); - nshell_local[int(knorm/this->dk)] += nxmodes; - kshell_local[int(knorm/this->dk)] += nxmodes*knorm; - } - if (dt == TWO_THIRDS) - this->dealias_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.)); - ); - MPI_Allreduce( - &nshell_local.front(), - &this->nshell.front(), - this->nshells, - MPI_INT64_T, MPI_SUM, this->layout->comm); - MPI_Allreduce( - &kshell_local.front(), - &this->kshell.front(), - this->nshells, - MPI_DOUBLE, MPI_SUM, this->layout->comm); - for (int n=0; n<this->nshells; n++) - this->kshell[n] /= this->nshell[n]; -} - -template <field_backend be, - kspace_dealias_type dt> -kspace<be, dt>::~kspace() -{ - delete this->layout; -} - -template <field_backend be, - kspace_dealias_type dt> -template <typename rnumber, - field_components fc> -void kspace<be, dt>::low_pass(rnumber *__restrict__ a, const double kmax) -{ - const double km2 = kmax*kmax; - KSPACE_CLOOP_K2( - this, - if (k2 >= km2) - std::fill_n(a + 2*ncomp(fc)*cindex, 2*ncomp(fc), 0); - ); -} - -template <field_backend be, - kspace_dealias_type dt> template <typename rnumber, - field_components fc> -void kspace<be, dt>::dealias(rnumber *__restrict__ a) -{ - switch(be) - { - case TWO_THIRDS: - this->low_pass<rnumber, fc>(a, this->kM); - break; - case SMOOTH: - KSPACE_CLOOP_K2( - this, - double tval = this->dealias_filter[int(round(k2 / this->dk2))]; - for (int tcounter=0; tcounter<2*ncomp(fc); tcounter++) - a[2*ncomp(fc)*cindex + tcounter] *= tval; - ); - break; - } -} - -template <field_backend be, + field_backend be, + field_components fc1, + field_components fc2, kspace_dealias_type dt> -template <typename rnumber, - field_components fc> -void kspace<be, dt>::cospectrum( - const rnumber(* __restrict a)[2], 
- const rnumber(* __restrict b)[2], - const hid_t group, - const std::string dset_name, - const hsize_t toffset) +void compute_gradient( + kspace<be, dt> *kk, + field<rnumber, be, fc1> *src, + field<rnumber, be, fc2> *dst) { - std::vector<double> spec, spec_local; - spec.resize(this->nshells*ncomp(fc)*ncomp(fc), 0); - spec_local.resize(this->nshells*ncomp(fc)*ncomp(fc), 0); - KSPACE_CLOOP_K2_NXMODES( - this, - if (k2 <= this->kM2) + TIMEZONE("compute_gradient"); + assert(!src->real_space_representation); + assert((fc1 == ONE && fc2 == THREE) || + (fc1 == THREE && fc2 == THREExTHREE)); + kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 < kk->kM2) switch(fc1) { - int tmp_int = int(sqrt(k2) / this->dk)*ncomp(fc)*ncomp(fc); - for (hsize_t i=0; i<ncomp(fc); i++) - for (hsize_t j=0; j<ncomp(fc); j++) - spec_local[tmp_int + i*ncomp(fc)+j] += nxmodes * ( - (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) + - (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1])); + case ONE: + dst->cval(cindex, 0, 0) = -kk->kx[xindex]*src->cval(cindex, 1); + dst->cval(cindex, 0, 1) = kk->kx[xindex]*src->cval(cindex, 0); + dst->cval(cindex, 1, 0) = -kk->ky[yindex]*src->cval(cindex, 1); + dst->cval(cindex, 1, 1) = kk->ky[yindex]*src->cval(cindex, 0); + dst->cval(cindex, 2, 0) = -kk->kz[zindex]*src->cval(cindex, 1); + dst->cval(cindex, 2, 1) = kk->kz[zindex]*src->cval(cindex, 0); + break; + case THREE: + for (unsigned int field_component = 0; + field_component < ncomp(fc1); + field_component++) + { + dst->cval(cindex, 0, field_component, 0) = -kk->kx[xindex]*src->cval(cindex, field_component, 1); + dst->cval(cindex, 0, field_component, 1) = kk->kx[xindex]*src->cval(cindex, field_component, 0); + dst->cval(cindex, 1, field_component, 0) = -kk->ky[yindex]*src->cval(cindex, field_component, 1); + dst->cval(cindex, 1, field_component, 1) = kk->ky[yindex]*src->cval(cindex, field_component, 0); + dst->cval(cindex, 2, field_component, 0) = -kk->kz[zindex]*src->cval(cindex, field_component, 1); + dst->cval(cindex, 2, field_component, 1) = kk->kz[zindex]*src->cval(cindex, field_component, 0); + } + //dst->get_cdata()[(cindex*3+0)*ncomp(fc1)+field_component][0] = + // - kk->kx[xindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][1]; + //dst->get_cdata()[(cindex*3+0)*ncomp(fc1)+field_component][1] = + // kk->kx[xindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][0]; + //dst->get_cdata()[(cindex*3+1)*ncomp(fc1)+field_component][0] = + // - kk->ky[yindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][1]; + //dst->get_cdata()[(cindex*3+1)*ncomp(fc1)+field_component][1] = + // kk->ky[yindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][0]; + //dst->get_cdata()[(cindex*3+2)*ncomp(fc1)+field_component][0] = + // - kk->kz[zindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][1]; + //dst->get_cdata()[(cindex*3+2)*ncomp(fc1)+field_component][1] = + // kk->kz[zindex]*src->get_cdata()[cindex*ncomp(fc1)+field_component][0]; } - ); - MPI_Allreduce( - &spec_local.front(), - &spec.front(), - spec.size(), - MPI_DOUBLE, MPI_SUM, this->layout->comm); - if (this->layout->myrank == 0) - { - hid_t dset, wspace, mspace; - hsize_t count[(ndim(fc)-2)*2], offset[(ndim(fc)-2)*2], dims[(ndim(fc)-2)*2]; - dset = H5Dopen(group, ("spectra/" + dset_name).c_str(), H5P_DEFAULT); - wspace = H5Dget_space(dset); - H5Sget_simple_extent_dims(wspace, dims, NULL); - switch (fc) - { - case THREExTHREE: - offset[4] = 0; - offset[5] = 0; - count[4] = 
ncomp(fc); - count[5] = ncomp(fc); - case THREE: - offset[2] = 0; - offset[3] = 0; - count[2] = ncomp(fc); - count[3] = ncomp(fc); - default: - offset[0] = toffset; - offset[1] = 0; - count[0] = 1; - count[1] = this->nshells; - } - mspace = H5Screate_simple((ndim(fc)-2)*2, count, NULL); - H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL); - H5Dwrite(dset, H5T_NATIVE_DOUBLE, mspace, wspace, H5P_DEFAULT, &spec.front()); - H5Sclose(wspace); - H5Sclose(mspace); - H5Dclose(dset); - } + }); } template class field<float, FFTW, ONE>; @@ -837,49 +1086,6 @@ template class field<double, FFTW, ONE>; template class field<double, FFTW, THREE>; template class field<double, FFTW, THREExTHREE>; -template class kspace<FFTW, TWO_THIRDS>; -template class kspace<FFTW, SMOOTH>; - -template kspace<FFTW, TWO_THIRDS>::kspace<>( - const field_layout<ONE> *, - const double, const double, const double); -template kspace<FFTW, TWO_THIRDS>::kspace<>( - const field_layout<THREE> *, - const double, const double, const double); -template kspace<FFTW, TWO_THIRDS>::kspace<>( - const field_layout<THREExTHREE> *, - const double, const double, const double); - -template kspace<FFTW, SMOOTH>::kspace<>( - const field_layout<ONE> *, - const double, const double, const double); -template kspace<FFTW, SMOOTH>::kspace<>( - const field_layout<THREE> *, - const double, const double, const double); -template kspace<FFTW, SMOOTH>::kspace<>( - const field_layout<THREExTHREE> *, - const double, const double, const double); - -template void kspace<FFTW, SMOOTH>::low_pass<float, ONE>( - float *__restrict__ a, - const double kmax); -template void kspace<FFTW, SMOOTH>::low_pass<float, THREE>( - float *__restrict__ a, - const double kmax); -template void kspace<FFTW, SMOOTH>::low_pass<float, THREExTHREE>( - float *__restrict__ a, - const double kmax); - -template void kspace<FFTW, SMOOTH>::low_pass<double, ONE>( - double *__restrict__ a, - const double kmax); -template void kspace<FFTW, SMOOTH>::low_pass<double, THREE>( - double *__restrict__ a, - const double kmax); -template void kspace<FFTW, SMOOTH>::low_pass<double, THREExTHREE>( - double *__restrict__ a, - const double kmax); - template void field<float, FFTW, ONE>::compute_stats<TWO_THIRDS>( kspace<FFTW, TWO_THIRDS> *, const hid_t, const std::string, const hsize_t, const double); @@ -920,3 +1126,20 @@ template void field<double, FFTW, THREExTHREE>::compute_stats<SMOOTH>( kspace<FFTW, SMOOTH> *, const hid_t, const std::string, const hsize_t, const double); +template void compute_gradient<float, FFTW, THREE, THREExTHREE, SMOOTH>( + kspace<FFTW, SMOOTH> *, + field<float, FFTW, THREE> *, + field<float, FFTW, THREExTHREE> *); +template void compute_gradient<double, FFTW, THREE, THREExTHREE, SMOOTH>( + kspace<FFTW, SMOOTH> *, + field<double, FFTW, THREE> *, + field<double, FFTW, THREExTHREE> *); + +template void compute_gradient<float, FFTW, ONE, THREE, SMOOTH>( + kspace<FFTW, SMOOTH> *, + field<float, FFTW, ONE> *, + field<float, FFTW, THREE> *); +template void compute_gradient<double, FFTW, ONE, THREE, SMOOTH>( + kspace<FFTW, SMOOTH> *, + field<double, FFTW, ONE> *, + field<double, FFTW, THREE> *); diff --git a/bfps/cpp/field.hpp b/bfps/cpp/field.hpp index 6ebd4090e38795b2209fffcb3b6d7aab2642a8f2..360d37e668130fe1d0e0c415fa98d34fc6b13de3 100644 --- a/bfps/cpp/field.hpp +++ b/bfps/cpp/field.hpp @@ -24,110 +24,17 @@ -#include <mpi.h> #include <hdf5.h> -#include <fftw3-mpi.h> #include <unordered_map> #include <vector> #include <string> -#include "base.hpp" +#include 
"kspace.hpp" +#include "omputils.hpp" -#ifndef FIELD +#ifndef FIELD_HPP -#define FIELD +#define FIELD_HPP -enum field_backend {FFTW}; -enum field_components {ONE, THREE, THREExTHREE}; -enum kspace_dealias_type {TWO_THIRDS, SMOOTH}; - -constexpr unsigned int ncomp( - field_components fc) - /* return actual number of field components for each enum value */ -{ - return ((fc == THREE) ? 3 : ( - (fc == THREExTHREE) ? 9 : 1)); -} - -constexpr unsigned int ndim( - field_components fc) - /* return actual number of field dimensions for each enum value */ -{ - return ((fc == THREE) ? 4 : ( - (fc == THREExTHREE) ? 5 : 3)); -} - -template <field_components fc> -class field_layout -{ - public: - /* description */ - hsize_t sizes[ndim(fc)]; - hsize_t subsizes[ndim(fc)]; - hsize_t starts[ndim(fc)]; - hsize_t local_size, full_size; - - int myrank, nprocs; - MPI_Comm comm; - - std::vector<std::vector<int>> rank; - std::vector<std::vector<int>> all_start; - std::vector<std::vector<int>> all_size; - - /* methods */ - field_layout( - const hsize_t *SIZES, - const hsize_t *SUBSIZES, - const hsize_t *STARTS, - const MPI_Comm COMM_TO_USE); - ~field_layout(){} -}; - -template <field_backend be, - kspace_dealias_type dt> -class kspace -{ - public: - /* relevant field layout */ - field_layout<ONE> *layout; - - /* physical parameters */ - double dkx, dky, dkz, dk, dk2; - - /* mode and dealiasing information */ - double kMx, kMy, kMz, kM, kM2; - double kMspec, kMspec2; - std::vector<double> kx, ky, kz; - std::unordered_map<int, double> dealias_filter; - std::vector<double> kshell; - std::vector<int64_t> nshell; - int nshells; - - /* methods */ - template <field_components fc> - kspace( - const field_layout<fc> *source_layout, - const double DKX = 1.0, - const double DKY = 1.0, - const double DKZ = 1.0); - ~kspace(); - - template <typename rnumber, - field_components fc> - void low_pass(rnumber *__restrict__ a, const double kmax); - - template <typename rnumber, - field_components fc> - void dealias(rnumber *__restrict__ a); - - template <typename rnumber, - field_components fc> - void cospectrum( - const rnumber(* __restrict__ a)[2], - const rnumber(* __restrict__ b)[2], - const hid_t group, - const std::string dset_name, - const hsize_t toffset); -}; template <typename rnumber, field_backend be, @@ -136,10 +43,9 @@ class field { private: /* data arrays */ - rnumber *data; - typedef rnumber cnumber[2]; - hsize_t npoints; + rnumber *__restrict__ data; public: + hsize_t npoints; bool real_space_representation; /* basic MPI information */ int myrank, nprocs; @@ -153,8 +59,8 @@ class field field_layout<fc> *clayout, *rlayout, *rmemlayout; /* FFT plans */ - void *c2r_plan; - void *r2c_plan; + typename fftw_interface<rnumber>::plan c2r_plan; + typename fftw_interface<rnumber>::plan r2c_plan; unsigned fftw_plan_rigor; /* HDF5 data types for arrays */ @@ -166,34 +72,100 @@ class field const int ny, const int nz, const MPI_Comm COMM_TO_USE, - const unsigned FFTW_PLAN_RIGOR = FFTW_ESTIMATE); + const unsigned FFTW_PLAN_RIGOR = DEFAULT_FFTW_FLAG); ~field(); int io( const std::string fname, - const std::string dset_name, + const std::string field_name, + const int iteration, + const bool read = true); + int io_database( + const std::string fname, + const std::string field_name, const int toffset, const bool read = true); + int write_0slice( + const hid_t group, + const std::string field_name, + const int iteration); + + /* essential FFT stuff */ void dft(); void ift(); void normalize(); + void symmetrize(); + + /* stats */ + void 
compute_rspace_xincrement_stats( + const int xcells, + const hid_t group, + const std::string dset_name, + const hsize_t toffset, + const std::vector<double> max_estimate); void compute_rspace_stats( const hid_t group, const std::string dset_name, const hsize_t toffset, const std::vector<double> max_estimate); - inline rnumber *get_rdata() + + /* access data */ + inline rnumber *__restrict__ get_rdata() + { + return this->data; + } + + inline const rnumber *__restrict__ get_rdata() const { return this->data; } - inline cnumber *get_cdata() + + inline typename fftw_interface<rnumber>::complex *__restrict__ get_cdata() + { + return (typename fftw_interface<rnumber>::complex*__restrict__)this->data; + } + + inline rnumber &rval(ptrdiff_t rindex, unsigned int component = 0) + { + assert(fc == ONE || fc == THREE); + assert(component < ncomp(fc)); + return *(this->data + rindex*ncomp(fc) + component); + } + + inline rnumber &rval(ptrdiff_t rindex, int comp1, int comp0) + { + assert(fc == THREExTHREE); + assert(comp1 >= 0 && comp1 < 3); + assert(comp0 >= 0 && comp0 < 3); + return *(this->data + ((rindex*3 + comp1)*3 + comp0)); + } + + inline rnumber &cval(ptrdiff_t cindex, int imag) + { + assert(fc == ONE); + assert(imag == 0 || imag == 1); + return *(this->data + cindex*2 + imag); + } + + inline rnumber &cval(ptrdiff_t cindex, int component, int imag) + { + assert(fc == THREE); + assert(imag == 0 || imag == 1); + return *(this->data + (cindex*ncomp(fc) + component)*2 + imag); + } + + inline rnumber &cval(ptrdiff_t cindex, int comp1, int comp0, int imag) { - return (cnumber*)this->data; + assert(fc == THREExTHREE); + assert(comp1 >= 0 && comp1 < 3); + assert(comp0 >= 0 && comp0 < 3); + assert(imag == 0 || imag == 1); + return *(this->data + ((cindex*3 + comp1)*3+comp0)*2 + imag); } - inline field<rnumber, be, fc>& operator=(const cnumber *__restrict__ source) + inline field<rnumber, be, fc>& operator=(const typename fftw_interface<rnumber>::complex *__restrict__ source) { std::copy((rnumber*)source, (rnumber*)(source + this->clayout->local_size), @@ -210,6 +182,15 @@ class field this->real_space_representation = true; return *this; } + + inline field<rnumber, be, fc>& operator=(const rnumber value) + { + std::fill_n(this->data, + this->rmemlayout->local_size, + value); + return *this; + } + template <kspace_dealias_type dt> void compute_stats( kspace<be, dt> *kk, @@ -217,74 +198,61 @@ class field const std::string dset_name, const hsize_t toffset, const double max_estimate); + /* set the k=0 (mean) mode to zero, on the rank that holds it */ + inline void impose_zero_mode() + { + if (this->clayout->myrank == this->clayout->rank[0][0] && + this->real_space_representation == false) + { + std::fill_n(this->data, 2*ncomp(fc), 0.0); + } + } + /* apply expression at each local real-space grid point; OpenMP threads split the y range */ + template <class func_type> + void RLOOP(func_type expression) + { + switch(be) + { + case FFTW: + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(this->rlayout->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(this->rlayout->subsizes[1]); + + for (hsize_t zindex = 0; zindex < this->rlayout->subsizes[0]; zindex++) + for (hsize_t yindex = start; yindex < end; yindex++) + { + ptrdiff_t rindex = ( + zindex * this->rlayout->subsizes[1] + yindex)*( + this->rmemlayout->subsizes[2]); + for (hsize_t xindex = 0; xindex < this->rlayout->subsizes[2]; xindex++) + { + expression(rindex, xindex, yindex, zindex); + rindex++; + } + } + } + break; + } + } + ptrdiff_t get_cindex( + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex) + { + return ((yindex*this->clayout->subsizes[1] + + 
zindex)*this->clayout->subsizes[2] + + xindex); + } }; -/* real space loop */ -#define FIELD_RLOOP(obj, expression) \ - \ -{ \ - switch (be) \ - { \ - case FFTW: \ - for (hsize_t zindex = 0; zindex < obj->rlayout->subsizes[0]; zindex++) \ - for (hsize_t yindex = 0; yindex < obj->rlayout->subsizes[1]; yindex++) \ - { \ - ptrdiff_t rindex = ( \ - zindex * obj->rlayout->subsizes[1] + yindex)*( \ - obj->rmemlayout->subsizes[2]); \ - for (hsize_t xindex = 0; xindex < obj->rlayout->subsizes[2]; xindex++) \ - { \ - expression; \ - rindex++; \ - } \ - } \ - break; \ - } \ -} - -#define KSPACE_CLOOP_K2(obj, expression) \ - \ -{ \ - double k2; \ - ptrdiff_t cindex = 0; \ - for (hsize_t yindex = 0; yindex < obj->layout->subsizes[0]; yindex++) \ - for (hsize_t zindex = 0; zindex < obj->layout->subsizes[1]; zindex++) \ - for (hsize_t xindex = 0; xindex < obj->layout->subsizes[2]; xindex++) \ - { \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - } \ -} - -#define KSPACE_CLOOP_K2_NXMODES(obj, expression) \ - \ -{ \ - double k2; \ - ptrdiff_t cindex = 0; \ - for (hsize_t yindex = 0; yindex < obj->layout->subsizes[0]; yindex++) \ - for (hsize_t zindex = 0; zindex < obj->layout->subsizes[1]; zindex++) \ - { \ - int nxmodes = 1; \ - hsize_t xindex = 0; \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - nxmodes = 2; \ - for (xindex = 1; xindex < obj->layout->subsizes[2]; xindex++) \ - { \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - } \ - } \ -} - -#endif//FIELD +template <typename rnumber, + field_backend be, + field_components fc1, + field_components fc2, + kspace_dealias_type dt> +void compute_gradient( + kspace<be, dt> *kk, + field<rnumber, be, fc1> *source, + field<rnumber, be, fc2> *destination); + +#endif//FIELD_HPP diff --git a/bfps/cpp/field_descriptor.cpp b/bfps/cpp/field_descriptor.cpp index b5025835903a37ea5384cb4102c716f527aabfe5..20c634262dbb45ad4c2bb5a1b5640b6df23d4d2c 100644 --- a/bfps/cpp/field_descriptor.cpp +++ b/bfps/cpp/field_descriptor.cpp @@ -31,476 +31,470 @@ #include <iostream> #include "base.hpp" #include "field_descriptor.hpp" - +#include "fftw_interface.hpp" +#include "scope_timer.hpp" /*****************************************************************************/ /* macro for specializations to numeric types compatible with FFTW */ -#define CLASS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \ - \ -template<> \ -field_descriptor<R>::field_descriptor( \ - int ndims, \ - int *n, \ - MPI_Datatype element_type, \ - MPI_Comm COMM_TO_USE) \ -{ \ - DEBUG_MSG("entered field_descriptor::field_descriptor\n"); \ - this->comm = COMM_TO_USE; \ - MPI_Comm_rank(this->comm, &this->myrank); \ - MPI_Comm_size(this->comm, &this->nprocs); \ - this->ndims = ndims; \ - this->sizes = new int[ndims]; \ - this->subsizes = new int[ndims]; \ - this->starts = new int[ndims]; \ - int tsizes [ndims]; \ - int tsubsizes[ndims]; \ - int tstarts [ndims]; \ - ptrdiff_t *nfftw = new ptrdiff_t[ndims]; \ - ptrdiff_t local_n0, local_0_start; \ - for (int i = 0; i < this->ndims; i++) \ - nfftw[i] = n[i]; \ - this->local_size = fftw_mpi_local_size_many( \ - this->ndims, \ - nfftw, \ - 1, \ - FFTW_MPI_DEFAULT_BLOCK, \ - this->comm, \ - &local_n0, \ - &local_0_start); \ - this->sizes[0] = n[0]; \ - this->subsizes[0] 
= (int)local_n0; \ - this->starts[0] = (int)local_0_start; \ - DEBUG_MSG_WAIT( \ - this->comm, \ - "first subsizes[0] = %d %d %d\n", \ - this->subsizes[0], \ - tsubsizes[0], \ - (int)local_n0); \ - tsizes[0] = n[0]; \ - tsubsizes[0] = (int)local_n0; \ - tstarts[0] = (int)local_0_start; \ - DEBUG_MSG_WAIT( \ - this->comm, \ - "second subsizes[0] = %d %d %d\n", \ - this->subsizes[0], \ - tsubsizes[0], \ - (int)local_n0); \ - this->mpi_dtype = element_type; \ - this->slice_size = 1; \ - this->full_size = this->sizes[0]; \ - for (int i = 1; i < this->ndims; i++) \ - { \ - this->sizes[i] = n[i]; \ - this->subsizes[i] = n[i]; \ - this->starts[i] = 0; \ - this->slice_size *= this->subsizes[i]; \ - this->full_size *= this->sizes[i]; \ - tsizes[i] = this->sizes[i]; \ - tsubsizes[i] = this->subsizes[i]; \ - tstarts[i] = this->starts[i]; \ - } \ - tsizes[ndims-1] *= sizeof(R); \ - tsubsizes[ndims-1] *= sizeof(R); \ - tstarts[ndims-1] *= sizeof(R); \ - if (this->mpi_dtype == MPI_CNUM) \ - { \ - tsizes[ndims-1] *= 2; \ - tsubsizes[ndims-1] *= 2; \ - tstarts[ndims-1] *= 2; \ - } \ - int local_zero_array[this->nprocs], zero_array[this->nprocs]; \ - for (int i=0; i<this->nprocs; i++) \ - local_zero_array[i] = 0; \ - local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 1 : 0; \ - MPI_Allreduce( \ - local_zero_array, \ - zero_array, \ - this->nprocs, \ - MPI_INT, \ - MPI_SUM, \ - this->comm); \ - int no_of_excluded_ranks = 0; \ - for (int i = 0; i<this->nprocs; i++) \ - no_of_excluded_ranks += zero_array[i]; \ - DEBUG_MSG_WAIT( \ - this->comm, \ - "subsizes[0] = %d %d\n", \ - this->subsizes[0], \ - tsubsizes[0]); \ - if (no_of_excluded_ranks == 0) \ - { \ - this->io_comm = this->comm; \ - this->io_nprocs = this->nprocs; \ - this->io_myrank = this->myrank; \ - } \ - else \ - { \ - int excluded_rank[no_of_excluded_ranks]; \ - for (int i=0, j=0; i<this->nprocs; i++) \ - if (zero_array[i]) \ - { \ - excluded_rank[j] = i; \ - j++; \ - } \ - MPI_Group tgroup0, tgroup; \ - MPI_Comm_group(this->comm, &tgroup0); \ - MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup); \ - MPI_Comm_create(this->comm, tgroup, &this->io_comm); \ - MPI_Group_free(&tgroup0); \ - MPI_Group_free(&tgroup); \ - if (this->subsizes[0] > 0) \ - { \ - MPI_Comm_rank(this->io_comm, &this->io_myrank); \ - MPI_Comm_size(this->io_comm, &this->io_nprocs); \ - } \ - else \ - { \ - this->io_myrank = MPI_PROC_NULL; \ - this->io_nprocs = -1; \ - } \ - } \ - DEBUG_MSG_WAIT( \ - this->comm, \ - "inside field_descriptor constructor, about to call " \ - "MPI_Type_create_subarray " \ - "%d %d %d\n", \ - this->sizes[0], \ - this->subsizes[0], \ - this->starts[0]); \ - for (int i=0; i<this->ndims; i++) \ - DEBUG_MSG_WAIT( \ - this->comm, \ - "tsizes " \ - "%d %d %d\n", \ - tsizes[i], \ - tsubsizes[i], \ - tstarts[i]); \ - if (this->subsizes[0] > 0) \ - { \ - DEBUG_MSG("creating subarray\n"); \ - MPI_Type_create_subarray( \ - ndims, \ - tsizes, \ - tsubsizes, \ - tstarts, \ - MPI_ORDER_C, \ - MPI_UNSIGNED_CHAR, \ - &this->mpi_array_dtype); \ - MPI_Type_commit(&this->mpi_array_dtype); \ - } \ - this->rank = new int[this->sizes[0]]; \ - int *local_rank = new int[this->sizes[0]]; \ - std::fill_n(local_rank, this->sizes[0], 0); \ - for (int i = 0; i < this->sizes[0]; i++) \ - if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0]) \ - local_rank[i] = this->myrank; \ - MPI_Allreduce( \ - local_rank, \ - this->rank, \ - this->sizes[0], \ - MPI_INT, \ - MPI_SUM, \ - this->comm); \ - delete[] local_rank; \ - this->all_start0 = new 
int[this->nprocs]; \ - int *local_start0 = new int[this->nprocs]; \ - std::fill_n(local_start0, this->nprocs, 0); \ - for (int i = 0; i < this->nprocs; i++) \ - if (this->myrank == i) \ - local_start0[i] = this->starts[0]; \ - MPI_Allreduce( \ - local_start0, \ - this->all_start0, \ - this->nprocs, \ - MPI_INT, \ - MPI_SUM, \ - this->comm); \ - delete[] local_start0; \ - this->all_size0 = new int[this->nprocs]; \ - int *local_size0 = new int[this->nprocs]; \ - std::fill_n(local_size0, this->nprocs, 0); \ - for (int i = 0; i < this->nprocs; i++) \ - if (this->myrank == i) \ - local_size0[i] = this->subsizes[0]; \ - MPI_Allreduce( \ - local_size0, \ - this->all_size0, \ - this->nprocs, \ - MPI_INT, \ - MPI_SUM, \ - this->comm); \ - delete[] local_size0; \ - DEBUG_MSG("exiting field_descriptor constructor\n"); \ -} \ - \ -template <> \ -int field_descriptor<R>::read( \ - const char *fname, \ - void *buffer) \ -{ \ - DEBUG_MSG("entered field_descriptor::read\n"); \ - char representation[] = "native"; \ - if (this->subsizes[0] > 0) \ - { \ - MPI_Info info; \ - MPI_Info_create(&info); \ - MPI_File f; \ - ptrdiff_t read_size = this->local_size*sizeof(R); \ - DEBUG_MSG("read size is %ld\n", read_size); \ - char ffname[200]; \ - if (this->mpi_dtype == MPI_CNUM) \ - read_size *= 2; \ - DEBUG_MSG("read size is %ld\n", read_size); \ - sprintf(ffname, "%s", fname); \ - \ - MPI_File_open( \ - this->io_comm, \ - ffname, \ - MPI_MODE_RDONLY, \ - info, \ - &f); \ - DEBUG_MSG("opened file\n"); \ - MPI_File_set_view( \ - f, \ - 0, \ - MPI_UNSIGNED_CHAR, \ - this->mpi_array_dtype, \ - representation, \ - info); \ - DEBUG_MSG("view is set\n"); \ - MPI_File_read_all( \ - f, \ - buffer, \ - read_size, \ - MPI_UNSIGNED_CHAR, \ - MPI_STATUS_IGNORE); \ - DEBUG_MSG("info is read\n"); \ - MPI_File_close(&f); \ - } \ - DEBUG_MSG("finished with field_descriptor::read\n"); \ - return EXIT_SUCCESS; \ -} \ - \ -template <> \ -int field_descriptor<R>::write( \ - const char *fname, \ - void *buffer) \ -{ \ - char representation[] = "native"; \ - if (this->subsizes[0] > 0) \ - { \ - MPI_Info info; \ - MPI_Info_create(&info); \ - MPI_File f; \ - ptrdiff_t read_size = this->local_size*sizeof(R); \ - char ffname[200]; \ - if (this->mpi_dtype == MPI_CNUM) \ - read_size *= 2; \ - sprintf(ffname, "%s", fname); \ - \ - MPI_File_open( \ - this->io_comm, \ - ffname, \ - MPI_MODE_CREATE | MPI_MODE_WRONLY, \ - info, \ - &f); \ - MPI_File_set_view( \ - f, \ - 0, \ - MPI_UNSIGNED_CHAR, \ - this->mpi_array_dtype, \ - representation, \ - info); \ - MPI_File_write_all( \ - f, \ - buffer, \ - read_size, \ - MPI_UNSIGNED_CHAR, \ - MPI_STATUS_IGNORE); \ - MPI_File_close(&f); \ - } \ - \ - return EXIT_SUCCESS; \ -} \ - \ -template <> \ -int field_descriptor<R>::transpose( \ - R *input, \ - R *output) \ -{ \ - /* IMPORTANT NOTE: \ - for 3D transposition, the input data is messed up */ \ - FFTW(plan) tplan; \ - if (this->ndims == 3) \ - { \ - /* transpose the two local dimensions 1 and 2 */ \ - R *atmp; \ - atmp = FFTW(alloc_real)(this->slice_size); \ - for (int k = 0; k < this->subsizes[0]; k++) \ - { \ - /* put transposed slice in atmp */ \ - for (int j = 0; j < this->sizes[1]; j++) \ - for (int i = 0; i < this->sizes[2]; i++) \ - atmp[i*this->sizes[1] + j] = \ - input[(k*this->sizes[1] + j)*this->sizes[2] + i]; \ - /* copy back transposed slice */ \ - std::copy( \ - atmp, \ - atmp + this->slice_size, \ - input + k*this->slice_size); \ - } \ - FFTW(free)(atmp); \ - } \ - tplan = FFTW(mpi_plan_transpose)( \ - this->sizes[0], this->slice_size, \ 
- input, output, \ - this->comm, \ - FFTW_ESTIMATE); \ - FFTW(execute)(tplan); \ - FFTW(destroy_plan)(tplan); \ - return EXIT_SUCCESS; \ -} \ - \ -template<> \ -int field_descriptor<R>::transpose( \ - FFTW(complex) *input, \ - FFTW(complex) *output) \ -{ \ - switch (this->ndims) \ - { \ - case 2: \ - /* do a global transpose over the 2 dimensions */ \ - if (output == NULL) \ - { \ - std::cerr << "bad arguments for transpose.\n" << std::endl; \ - return EXIT_FAILURE; \ - } \ - FFTW(plan) tplan; \ - tplan = FFTW(mpi_plan_many_transpose)( \ - this->sizes[0], this->sizes[1], 2, \ - FFTW_MPI_DEFAULT_BLOCK, \ - FFTW_MPI_DEFAULT_BLOCK, \ - (R*)input, (R*)output, \ - this->comm, \ - FFTW_ESTIMATE); \ - FFTW(execute)(tplan); \ - FFTW(destroy_plan)(tplan); \ - break; \ - case 3: \ - /* transpose the two local dimensions 1 and 2 */ \ - FFTW(complex) *atmp; \ - atmp = FFTW(alloc_complex)(this->slice_size); \ - for (int k = 0; k < this->subsizes[0]; k++) \ - { \ - /* put transposed slice in atmp */ \ - for (int j = 0; j < this->sizes[1]; j++) \ - for (int i = 0; i < this->sizes[2]; i++) \ - { \ - atmp[i*this->sizes[1] + j][0] = \ - input[(k*this->sizes[1] + j)*this->sizes[2] + i][0]; \ - atmp[i*this->sizes[1] + j][1] = \ - input[(k*this->sizes[1] + j)*this->sizes[2] + i][1]; \ - } \ - /* copy back transposed slice */ \ - std::copy( \ - (R*)(atmp), \ - (R*)(atmp + this->slice_size), \ - (R*)(input + k*this->slice_size)); \ - } \ - FFTW(free)(atmp); \ - break; \ - default: \ - return EXIT_FAILURE; \ - break; \ - } \ - return EXIT_SUCCESS; \ -} \ - \ -template<> \ -int field_descriptor<R>::interleave( \ - R *a, \ - int dim) \ -{ \ -/* the following is copied from \ - * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html \ - * */ \ - FFTW(iodim) howmany_dims[2]; \ - howmany_dims[0].n = dim; \ - howmany_dims[0].is = this->local_size; \ - howmany_dims[0].os = 1; \ - howmany_dims[1].n = this->local_size; \ - howmany_dims[1].is = 1; \ - howmany_dims[1].os = dim; \ - const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \ - \ - FFTW(plan) tmp = FFTW(plan_guru_r2r)( \ - /*rank*/0, \ - /*dims*/NULL, \ - howmany_rank, \ - howmany_dims, \ - a, \ - a, \ - /*kind*/NULL, \ - FFTW_ESTIMATE); \ - FFTW(execute)(tmp); \ - FFTW(destroy_plan)(tmp); \ - return EXIT_SUCCESS; \ -} \ - \ -template<> \ -int field_descriptor<R>::interleave( \ - FFTW(complex) *a, \ - int dim) \ -{ \ - FFTW(iodim) howmany_dims[2]; \ - howmany_dims[0].n = dim; \ - howmany_dims[0].is = this->local_size; \ - howmany_dims[0].os = 1; \ - howmany_dims[1].n = this->local_size; \ - howmany_dims[1].is = 1; \ - howmany_dims[1].os = dim; \ - const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \ - \ - FFTW(plan) tmp = FFTW(plan_guru_dft)( \ - /*rank*/0, \ - /*dims*/NULL, \ - howmany_rank, \ - howmany_dims, \ - a, \ - a, \ - +1, \ - FFTW_ESTIMATE); \ - FFTW(execute)(tmp); \ - FFTW(destroy_plan)(tmp); \ - return EXIT_SUCCESS; \ -} \ - \ -template<> \ -field_descriptor<R>* field_descriptor<R>::get_transpose() \ -{ \ - int n[this->ndims]; \ - for (int i=0; i<this->ndims; i++) \ - n[i] = this->sizes[this->ndims - i - 1]; \ - return new field_descriptor<R>(this->ndims, n, this->mpi_dtype, this->comm); \ -} \ -/*****************************************************************************/ +template <class rnumber> +field_descriptor<rnumber>::field_descriptor( + int ndims, + int *n, + MPI_Datatype element_type, + MPI_Comm COMM_TO_USE) +{ + TIMEZONE("field_descriptor"); + DEBUG_MSG("entered 
field_descriptor::field_descriptor\n"); + this->comm = COMM_TO_USE; + MPI_Comm_rank(this->comm, &this->myrank); + MPI_Comm_size(this->comm, &this->nprocs); + this->ndims = ndims; + this->sizes = new int[ndims]; + this->subsizes = new int[ndims]; + this->starts = new int[ndims]; + int tsizes [ndims]; + int tsubsizes[ndims]; + int tstarts [ndims]; + std::vector<ptrdiff_t> nfftw; + nfftw.resize(ndims); + ptrdiff_t local_n0, local_0_start; + for (int i = 0; i < this->ndims; i++) + nfftw[i] = n[i]; + this->local_size = fftw_mpi_local_size_many( + this->ndims, + &nfftw.front(), + 1, + FFTW_MPI_DEFAULT_BLOCK, + this->comm, + &local_n0, + &local_0_start); + this->sizes[0] = n[0]; + this->subsizes[0] = (int)local_n0; + this->starts[0] = (int)local_0_start; + DEBUG_MSG_WAIT( + this->comm, + "first subsizes[0] = %d %d %d\n", + this->subsizes[0], + tsubsizes[0], + (int)local_n0); + tsizes[0] = n[0]; + tsubsizes[0] = (int)local_n0; + tstarts[0] = (int)local_0_start; + DEBUG_MSG_WAIT( + this->comm, + "second subsizes[0] = %d %d %d\n", + this->subsizes[0], + tsubsizes[0], + (int)local_n0); + this->mpi_dtype = element_type; + this->slice_size = 1; + this->full_size = this->sizes[0]; + for (int i = 1; i < this->ndims; i++) + { + this->sizes[i] = n[i]; + this->subsizes[i] = n[i]; + this->starts[i] = 0; + this->slice_size *= this->subsizes[i]; + this->full_size *= this->sizes[i]; + tsizes[i] = this->sizes[i]; + tsubsizes[i] = this->subsizes[i]; + tstarts[i] = this->starts[i]; + } + tsizes[ndims-1] *= sizeof(rnumber); + tsubsizes[ndims-1] *= sizeof(rnumber); + tstarts[ndims-1] *= sizeof(rnumber); + if (this->mpi_dtype == mpi_real_type<rnumber>::complex()) + { + tsizes[ndims-1] *= 2; + tsubsizes[ndims-1] *= 2; + tstarts[ndims-1] *= 2; + } + int local_zero_array[this->nprocs], zero_array[this->nprocs]; + for (int i=0; i<this->nprocs; i++) + local_zero_array[i] = 0; + local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 
1 : 0; + MPI_Allreduce( + local_zero_array, + zero_array, + this->nprocs, + MPI_INT, + MPI_SUM, + this->comm); + int no_of_excluded_ranks = 0; + for (int i = 0; i<this->nprocs; i++) + no_of_excluded_ranks += zero_array[i]; + DEBUG_MSG_WAIT( + this->comm, + "subsizes[0] = %d %d\n", + this->subsizes[0], + tsubsizes[0]); + if (no_of_excluded_ranks == 0) + { + this->io_comm = this->comm; + this->io_nprocs = this->nprocs; + this->io_myrank = this->myrank; + } + else + { + int excluded_rank[no_of_excluded_ranks]; + for (int i=0, j=0; i<this->nprocs; i++) + if (zero_array[i]) + { + excluded_rank[j] = i; + j++; + } + MPI_Group tgroup0, tgroup; + MPI_Comm_group(this->comm, &tgroup0); + MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup); + MPI_Comm_create(this->comm, tgroup, &this->io_comm); + MPI_Group_free(&tgroup0); + MPI_Group_free(&tgroup); + if (this->subsizes[0] > 0) + { + MPI_Comm_rank(this->io_comm, &this->io_myrank); + MPI_Comm_size(this->io_comm, &this->io_nprocs); + } + else + { + this->io_myrank = MPI_PROC_NULL; + this->io_nprocs = -1; + } + } + DEBUG_MSG_WAIT( + this->comm, + "inside field_descriptor constructor, about to call " + "MPI_Type_create_subarray " + "%d %d %d\n", + this->sizes[0], + this->subsizes[0], + this->starts[0]); + for (int i=0; i<this->ndims; i++) + DEBUG_MSG_WAIT( + this->comm, + "tsizes " + "%d %d %d\n", + tsizes[i], + tsubsizes[i], + tstarts[i]); + if (this->subsizes[0] > 0) + { + DEBUG_MSG("creating subarray\n"); + MPI_Type_create_subarray( + ndims, + tsizes, + tsubsizes, + tstarts, + MPI_ORDER_C, + MPI_UNSIGNED_CHAR, + &this->mpi_array_dtype); + MPI_Type_commit(&this->mpi_array_dtype); + } + this->rank = new int[this->sizes[0]]; + int *local_rank = new int[this->sizes[0]]; + std::fill_n(local_rank, this->sizes[0], 0); + for (int i = 0; i < this->sizes[0]; i++) + if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0]) + local_rank[i] = this->myrank; + MPI_Allreduce( + local_rank, + this->rank, + this->sizes[0], + MPI_INT, + MPI_SUM, + this->comm); + delete[] local_rank; + this->all_start0 = new int[this->nprocs]; + int *local_start0 = new int[this->nprocs]; + std::fill_n(local_start0, this->nprocs, 0); + for (int i = 0; i < this->nprocs; i++) + if (this->myrank == i) + local_start0[i] = this->starts[0]; + MPI_Allreduce( + local_start0, + this->all_start0, + this->nprocs, + MPI_INT, + MPI_SUM, + this->comm); + delete[] local_start0; + this->all_size0 = new int[this->nprocs]; + int *local_size0 = new int[this->nprocs]; + std::fill_n(local_size0, this->nprocs, 0); + for (int i = 0; i < this->nprocs; i++) + if (this->myrank == i) + local_size0[i] = this->subsizes[0]; + MPI_Allreduce( + local_size0, + this->all_size0, + this->nprocs, + MPI_INT, + MPI_SUM, + this->comm); + delete[] local_size0; + DEBUG_MSG("exiting field_descriptor constructor\n"); +} + +template <class rnumber> +int field_descriptor<rnumber>::read( + const char *fname, + void *buffer) +{ + TIMEZONE("field_descriptor::read"); + DEBUG_MSG("entered field_descriptor::read\n"); + char representation[] = "native"; + if (this->subsizes[0] > 0) + { + MPI_Info info; + MPI_Info_create(&info); + MPI_File f; + ptrdiff_t read_size = this->local_size*sizeof(rnumber); + DEBUG_MSG("read size is %ld\n", read_size); + char ffname[200]; + if (this->mpi_dtype == mpi_real_type<rnumber>::complex()) + read_size *= 2; + DEBUG_MSG("read size is %ld\n", read_size); + sprintf(ffname, "%s", fname); + + MPI_File_open( + this->io_comm, + ffname, + MPI_MODE_RDONLY, + info, + &f); + DEBUG_MSG("opened 
file\n"); + MPI_File_set_view( + f, + 0, + MPI_UNSIGNED_CHAR, + this->mpi_array_dtype, + representation, + info); + DEBUG_MSG("view is set\n"); + MPI_File_read_all( + f, + buffer, + read_size, + MPI_UNSIGNED_CHAR, + MPI_STATUS_IGNORE); + DEBUG_MSG("info is read\n"); + MPI_File_close(&f); + } + DEBUG_MSG("finished with field_descriptor::read\n"); + return EXIT_SUCCESS; +} + +template <class rnumber> +int field_descriptor<rnumber>::write( + const char *fname, + void *buffer) +{ + TIMEZONE("field_descriptor::write"); + char representation[] = "native"; + if (this->subsizes[0] > 0) + { + MPI_Info info; + MPI_Info_create(&info); + MPI_File f; + ptrdiff_t read_size = this->local_size*sizeof(rnumber); + char ffname[200]; + if (this->mpi_dtype == mpi_real_type<rnumber>::complex()) + read_size *= 2; + sprintf(ffname, "%s", fname); + + MPI_File_open( + this->io_comm, + ffname, + MPI_MODE_CREATE | MPI_MODE_WRONLY, + info, + &f); + MPI_File_set_view( + f, + 0, + MPI_UNSIGNED_CHAR, + this->mpi_array_dtype, + representation, + info); + MPI_File_write_all( + f, + buffer, + read_size, + MPI_UNSIGNED_CHAR, + MPI_STATUS_IGNORE); + MPI_File_close(&f); + } + + return EXIT_SUCCESS; +} +template <class rnumber> +int field_descriptor<rnumber>::transpose( + rnumber *input, + rnumber *output) +{ + TIMEZONE("field_descriptor::transpose"); + /* IMPORTANT NOTE: + for 3D transposition, the input data is messed up */ + typename fftw_interface<rnumber>::plan tplan; + if (this->ndims == 3) + { + /* transpose the two local dimensions 1 and 2 */ + rnumber *atmp; + atmp = fftw_interface<rnumber>::alloc_real(this->slice_size); + for (int k = 0; k < this->subsizes[0]; k++) + { + /* put transposed slice in atmp */ + for (int j = 0; j < this->sizes[1]; j++) + for (int i = 0; i < this->sizes[2]; i++) + atmp[i*this->sizes[1] + j] = + input[(k*this->sizes[1] + j)*this->sizes[2] + i]; + /* copy back transposed slice */ + std::copy( + atmp, + atmp + this->slice_size, + input + k*this->slice_size); + } + fftw_interface<rnumber>::free(atmp); + } + tplan = fftw_interface<rnumber>::mpi_plan_transpose( + this->sizes[0], this->slice_size, + input, output, + this->comm, + DEFAULT_FFTW_FLAG); + fftw_interface<rnumber>::execute(tplan); + fftw_interface<rnumber>::destroy_plan(tplan); + return EXIT_SUCCESS; +} +template <class rnumber> +int field_descriptor<rnumber>::transpose( + typename fftw_interface<rnumber>::complex *input, + typename fftw_interface<rnumber>::complex *output) +{ + TIMEZONE("field_descriptor::transpose2"); + switch (this->ndims) + { + case 2: + /* do a global transpose over the 2 dimensions */ + if (output == NULL) + { + std::cerr << "bad arguments for transpose.\n" << std::endl; + return EXIT_FAILURE; + } + typename fftw_interface<rnumber>::plan tplan; + tplan = fftw_interface<rnumber>::mpi_plan_many_transpose( + this->sizes[0], this->sizes[1], 2, + FFTW_MPI_DEFAULT_BLOCK, + FFTW_MPI_DEFAULT_BLOCK, + (rnumber*)input, (rnumber*)output, + this->comm, + DEFAULT_FFTW_FLAG); + fftw_interface<rnumber>::execute(tplan); + fftw_interface<rnumber>::destroy_plan(tplan); + break; + case 3: + /* transpose the two local dimensions 1 and 2 */ + typename fftw_interface<rnumber>::complex *atmp; + atmp = fftw_interface<rnumber>::alloc_complex(this->slice_size); + for (int k = 0; k < this->subsizes[0]; k++) + { + /* put transposed slice in atmp */ + for (int j = 0; j < this->sizes[1]; j++) + for (int i = 0; i < this->sizes[2]; i++) + { + atmp[i*this->sizes[1] + j][0] = + input[(k*this->sizes[1] + j)*this->sizes[2] + i][0]; + 
atmp[i*this->sizes[1] + j][1] = + input[(k*this->sizes[1] + j)*this->sizes[2] + i][1]; + } + /* copy back transposed slice */ + std::copy( + (rnumber*)(atmp), + (rnumber*)(atmp + this->slice_size), + (rnumber*)(input + k*this->slice_size)); + } + fftw_interface<rnumber>::free(atmp); + break; + default: + return EXIT_FAILURE; + break; + } + return EXIT_SUCCESS; +} + +template <class rnumber> +int field_descriptor<rnumber>::interleave( + rnumber *a, + int dim) +{ + TIMEZONE("field_descriptor::interleave"); + /* the following is copied from + * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html + * */ + typename fftw_interface<rnumber>::iodim howmany_dims[2]; + howmany_dims[0].n = dim; + howmany_dims[0].is = this->local_size; + howmany_dims[0].os = 1; + howmany_dims[1].n = this->local_size; + howmany_dims[1].is = 1; + howmany_dims[1].os = dim; + const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); + + typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_r2r( + /*rank*/0, + /*dims*/nullptr, + howmany_rank, + howmany_dims, + a, + a, + /*kind*/nullptr, + DEFAULT_FFTW_FLAG); + fftw_interface<rnumber>::execute(tmp); + fftw_interface<rnumber>::destroy_plan(tmp); + return EXIT_SUCCESS; +} + +template <class rnumber> +int field_descriptor<rnumber>::interleave( + typename fftw_interface<rnumber>::complex *a, + int dim) +{ + TIMEZONE("field_descriptor::interleave2"); + typename fftw_interface<rnumber>::iodim howmany_dims[2]; + howmany_dims[0].n = dim; + howmany_dims[0].is = this->local_size; + howmany_dims[0].os = 1; + howmany_dims[1].n = this->local_size; + howmany_dims[1].is = 1; + howmany_dims[1].os = dim; + const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); + + typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_dft( + /*rank*/0, + /*dims*/nullptr, + howmany_rank, + howmany_dims, + a, + a, + +1, + DEFAULT_FFTW_FLAG); + fftw_interface<rnumber>::execute(tmp); + fftw_interface<rnumber>::destroy_plan(tmp); + return EXIT_SUCCESS; +} + +template <class rnumber> +field_descriptor<rnumber>* field_descriptor<rnumber>::get_transpose() +{ + TIMEZONE("field_descriptor::get_transpose"); + int n[this->ndims]; + for (int i=0; i<this->ndims; i++) + n[i] = this->sizes[this->ndims - i - 1]; + return new field_descriptor<rnumber>(this->ndims, n, this->mpi_dtype, this->comm); +} /*****************************************************************************/ -/* now actually use the macro defined above */ -CLASS_IMPLEMENTATION( - FFTW_MANGLE_FLOAT, - float, - MPI_FLOAT, - MPI_COMPLEX) -CLASS_IMPLEMENTATION( - FFTW_MANGLE_DOUBLE, - double, - MPI_DOUBLE, - BFPS_MPICXX_DOUBLE_COMPLEX) /*****************************************************************************/ @@ -511,23 +505,23 @@ template <class rnumber> field_descriptor<rnumber>::~field_descriptor() { DEBUG_MSG_WAIT( - MPI_COMM_WORLD, - this->io_comm == MPI_COMM_NULL ? "null\n" : "not null\n"); + MPI_COMM_WORLD, + this->io_comm == MPI_COMM_NULL ? 
"null\n" : "not null\n"); DEBUG_MSG_WAIT( - MPI_COMM_WORLD, - "subsizes[0] = %d \n", this->subsizes[0]); + MPI_COMM_WORLD, + "subsizes[0] = %d \n", this->subsizes[0]); if (this->subsizes[0] > 0) { DEBUG_MSG_WAIT( - this->io_comm, - "deallocating mpi_array_dtype\n"); + this->io_comm, + "deallocating mpi_array_dtype\n"); MPI_Type_free(&this->mpi_array_dtype); } if (this->nprocs != this->io_nprocs && this->io_myrank != MPI_PROC_NULL) { DEBUG_MSG_WAIT( - this->io_comm, - "freeing io_comm\n"); + this->io_comm, + "freeing io_comm\n"); MPI_Comm_free(&this->io_comm); } delete[] this->sizes; diff --git a/bfps/cpp/field_descriptor.hpp b/bfps/cpp/field_descriptor.hpp index bfcf52ed415ddb90bd77a6c6793974aea6a94734..2fb491bca7c130704fc5de5d22c3393cb196eec7 100644 --- a/bfps/cpp/field_descriptor.hpp +++ b/bfps/cpp/field_descriptor.hpp @@ -26,6 +26,7 @@ #include <mpi.h> #include <fftw3-mpi.h> +#include "fftw_interface.hpp" #ifndef FIELD_DESCRIPTOR @@ -85,14 +86,14 @@ class field_descriptor rnumber *input, rnumber *output); int transpose( - cnumber *input, - cnumber *output = NULL); + typename fftw_interface<rnumber>::complex *input, + typename fftw_interface<rnumber>::complex *output = NULL); int interleave( rnumber *input, int dim); int interleave( - cnumber *input, + typename fftw_interface<rnumber>::complex *input, int dim); }; diff --git a/bfps/cpp/field_layout.cpp b/bfps/cpp/field_layout.cpp new file mode 100644 index 0000000000000000000000000000000000000000..908904991d5d95b0c89ba679b402d8d5727b8c85 --- /dev/null +++ b/bfps/cpp/field_layout.cpp @@ -0,0 +1,111 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. 
If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + + +#include <cassert> +#include "field_layout.hpp" +#include "scope_timer.hpp" + +template <field_components fc> +field_layout<fc>::field_layout( + const hsize_t *SIZES, + const hsize_t *SUBSIZES, + const hsize_t *STARTS, + const MPI_Comm COMM_TO_USE) +{ + TIMEZONE("field_layout::field_layout"); + this->comm = COMM_TO_USE; + MPI_Comm_rank(this->comm, &this->myrank); + MPI_Comm_size(this->comm, &this->nprocs); + + std::copy(SIZES, SIZES + 3, this->sizes); + std::copy(SUBSIZES, SUBSIZES + 3, this->subsizes); + std::copy(STARTS, STARTS + 3, this->starts); + if (fc == THREE || fc == THREExTHREE) + { + this->sizes[3] = 3; + this->subsizes[3] = 3; + this->starts[3] = 0; + } + if (fc == THREExTHREE) + { + this->sizes[4] = 3; + this->subsizes[4] = 3; + this->starts[4] = 0; + } + this->local_size = 1; + this->full_size = 1; + for (unsigned int i=0; i<ndim(fc); i++) + { + this->local_size *= this->subsizes[i]; + this->full_size *= this->sizes[i]; + } + + /*field will at most be distributed in 2D*/ + this->rank.resize(2); + this->all_start.resize(2); + this->all_size.resize(2); + for (int i=0; i<2; i++) + { + this->rank[i].resize(this->sizes[i]); + std::vector<int> local_rank; + local_rank.resize(this->sizes[i], 0); + for (unsigned int ii=this->starts[i]; ii<this->starts[i]+this->subsizes[i]; ii++) + local_rank[ii] = this->myrank; + MPI_Allreduce( + &local_rank.front(), + &this->rank[i].front(), + this->sizes[i], + MPI_INT, + MPI_SUM, + this->comm); + this->all_start[i].resize(this->nprocs); + std::vector<int> local_start; + local_start.resize(this->nprocs, 0); + local_start[this->myrank] = this->starts[i]; + MPI_Allreduce( + &local_start.front(), + &this->all_start[i].front(), + this->nprocs, + MPI_INT, + MPI_SUM, + this->comm); + this->all_size[i].resize(this->nprocs); + std::vector<int> local_subsize; + local_subsize.resize(this->nprocs, 0); + local_subsize[this->myrank] = this->subsizes[i]; + MPI_Allreduce( + &local_subsize.front(), + &this->all_size[i].front(), + this->nprocs, + MPI_INT, + MPI_SUM, + this->comm); + } +} + +template class field_layout<ONE>; +template class field_layout<THREE>; +template class field_layout<THREExTHREE>; + diff --git a/bfps/cpp/field_layout.hpp b/bfps/cpp/field_layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..770119c2dcb05017d495b62559f050646872dc84 --- /dev/null +++ b/bfps/cpp/field_layout.hpp @@ -0,0 +1,79 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. 
If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + + + +#include <vector> +#include "base.hpp" + +#ifndef FIELD_LAYOUT_HPP + +#define FIELD_LAYOUT_HPP + +enum field_components {ONE, THREE, THREExTHREE}; + +constexpr unsigned int ncomp( + field_components fc) + /* return actual number of field components for each enum value */ +{ + return ((fc == THREE) ? 3 : ( + (fc == THREExTHREE) ? 9 : 1)); +} + +constexpr unsigned int ndim( + field_components fc) + /* return actual number of field dimensions for each enum value */ +{ + return ((fc == THREE) ? 4 : ( + (fc == THREExTHREE) ? 5 : 3)); +} + +template <field_components fc> +class field_layout +{ + public: + /* description */ + hsize_t sizes[ndim(fc)]; + hsize_t subsizes[ndim(fc)]; + hsize_t starts[ndim(fc)]; + hsize_t local_size, full_size; + + int myrank, nprocs; + MPI_Comm comm; + + std::vector<std::vector<int>> rank; + std::vector<std::vector<int>> all_start; + std::vector<std::vector<int>> all_size; + + /* methods */ + field_layout( + const hsize_t *SIZES, + const hsize_t *SUBSIZES, + const hsize_t *STARTS, + const MPI_Comm COMM_TO_USE); + ~field_layout(){} +}; + +#endif//FIELD_LAYOUT_HPP + diff --git a/bfps/cpp/fluid_solver.cpp b/bfps/cpp/fluid_solver.cpp index a634117bc43075db475be47256f1579b39bc1193..319186103797f8135d4d3e2244ed5e3a8f271b00 100644 --- a/bfps/cpp/fluid_solver.cpp +++ b/bfps/cpp/fluid_solver.cpp @@ -31,7 +31,8 @@ #include <cstring> #include "fluid_solver.hpp" #include "fftw_tools.hpp" - +#include "scope_timer.hpp" +#include "shared_array.hpp" template <class rnumber> @@ -48,911 +49,1003 @@ void fluid_solver<rnumber>::impose_zero_modes() /*****************************************************************************/ /* macro for specializations to numeric types compatible with FFTW */ -#define FLUID_SOLVER_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \ - \ -template<> \ -fluid_solver<R>::fluid_solver( \ - const char *NAME, \ - int nx, \ - int ny, \ - int nz, \ - double DKX, \ - double DKY, \ - double DKZ, \ - int DEALIAS_TYPE, \ - unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<R>( \ - NAME, \ - nx , ny , nz, \ - DKX, DKY, DKZ, \ - DEALIAS_TYPE, \ - FFTW_PLAN_RIGOR) \ -{ \ - this->cvorticity = FFTW(alloc_complex)(this->cd->local_size);\ - this->cvelocity = FFTW(alloc_complex)(this->cd->local_size);\ - this->rvorticity = FFTW(alloc_real)(this->cd->local_size*2);\ - /*this->rvelocity = (R*)(this->cvelocity);*/\ - this->rvelocity = FFTW(alloc_real)(this->cd->local_size*2);\ - \ - this->ru = this->rvelocity;\ - this->cu = this->cvelocity;\ - \ - this->rv[0] = this->rvorticity;\ - this->rv[3] = this->rvorticity;\ - this->cv[0] = this->cvorticity;\ - this->cv[3] = this->cvorticity;\ - \ - this->cv[1] = FFTW(alloc_complex)(this->cd->local_size);\ - this->cv[2] = this->cv[1];\ - this->rv[1] = FFTW(alloc_real)(this->cd->local_size*2);\ - this->rv[2] = this->rv[1];\ - \ - this->c2r_vorticity = new FFTW(plan);\ - this->r2c_vorticity = new FFTW(plan);\ - this->c2r_velocity = new FFTW(plan);\ - this->r2c_velocity = new FFTW(plan);\ - \ - ptrdiff_t sizes[] = {nz, \ - ny, \ - nx};\ - \ - *(FFTW(plan)*)this->c2r_vorticity = FFTW(mpi_plan_many_dft_c2r)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->cvorticity, this->rvorticity, \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \ - \ - *(FFTW(plan)*)this->r2c_vorticity = FFTW(mpi_plan_many_dft_r2c)( \ - 3, sizes, 3, 
FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->rvorticity, this->cvorticity, \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \ - \ - *(FFTW(plan)*)this->c2r_velocity = FFTW(mpi_plan_many_dft_c2r)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->cvelocity, this->rvelocity, \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \ - \ - *(FFTW(plan)*)this->r2c_velocity = FFTW(mpi_plan_many_dft_r2c)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->rvelocity, this->cvelocity, \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \ - \ - this->uc2r = this->c2r_velocity;\ - this->ur2c = this->r2c_velocity;\ - this->vc2r[0] = this->c2r_vorticity;\ - this->vr2c[0] = this->r2c_vorticity;\ - \ - this->vc2r[1] = new FFTW(plan);\ - this->vr2c[1] = new FFTW(plan);\ - this->vc2r[2] = new FFTW(plan);\ - this->vr2c[2] = new FFTW(plan);\ - \ - *(FFTW(plan)*)(this->vc2r[1]) = FFTW(mpi_plan_many_dft_c2r)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->cv[1], this->rv[1], \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \ - \ - *(FFTW(plan)*)this->vc2r[2] = FFTW(mpi_plan_many_dft_c2r)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->cv[2], this->rv[2], \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \ - \ - *(FFTW(plan)*)this->vr2c[1] = FFTW(mpi_plan_many_dft_r2c)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->rv[1], this->cv[1], \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \ - \ - *(FFTW(plan)*)this->vr2c[2] = FFTW(mpi_plan_many_dft_r2c)( \ - 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \ - this->rv[2], this->cv[2], \ - MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \ - \ - /* ``physical'' parameters etc, initialized here just in case */ \ - \ - this->nu = 0.1; \ - this->fmode = 1; \ - this->famplitude = 1.0; \ - this->fk0 = 0; \ - this->fk1 = 3.0; \ - /* initialization of fields must be done AFTER planning */ \ - std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \ - std::fill_n((R*)this->cvelocity, this->cd->local_size*2, 0.0); \ - std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0); \ - std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \ - std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \ - std::fill_n(this->rv[1], this->cd->local_size*2, 0.0); \ - std::fill_n(this->rv[2], this->cd->local_size*2, 0.0); \ -} \ - \ -template<> \ -fluid_solver<R>::~fluid_solver() \ -{ \ - FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_vorticity);\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_vorticity);\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_velocity );\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_velocity );\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[1]);\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[1]);\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[2]);\ - FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[2]);\ - \ - delete (FFTW(plan)*)this->c2r_vorticity;\ - delete (FFTW(plan)*)this->r2c_vorticity;\ - delete (FFTW(plan)*)this->c2r_velocity ;\ - delete (FFTW(plan)*)this->r2c_velocity ;\ - delete (FFTW(plan)*)this->vc2r[1];\ - delete (FFTW(plan)*)this->vr2c[1];\ - delete (FFTW(plan)*)this->vc2r[2];\ - delete (FFTW(plan)*)this->vr2c[2];\ - \ - FFTW(free)(this->cv[1]);\ - FFTW(free)(this->rv[1]);\ - FFTW(free)(this->cvorticity);\ - FFTW(free)(this->rvorticity);\ - 
FFTW(free)(this->cvelocity);\ - FFTW(free)(this->rvelocity);\ -} \ - \ -template<> \ -void fluid_solver<R>::compute_vorticity() \ -{ \ - ptrdiff_t tindex; \ - CLOOP_K2( \ - this, \ - tindex = 3*cindex; \ - if (k2 <= this->kM2) \ - { \ - this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \ - this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \ - this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \ - this->cvorticity[tindex+0][1] = (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \ - this->cvorticity[tindex+1][1] = (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \ - this->cvorticity[tindex+2][1] = (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \ - } \ - else \ - std::fill_n((R*)(this->cvorticity+tindex), 6, 0.0); \ - ); \ - this->symmetrize(this->cvorticity, 3); \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_velocity(FFTW(complex) *vorticity) \ -{ \ - ptrdiff_t tindex; \ - CLOOP_K2( \ - this, \ - tindex = 3*cindex; \ - if (k2 <= this->kM2 && k2 > 0) \ - { \ - this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2; \ - this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2; \ - this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2; \ - this->cu[tindex+0][1] = (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2; \ - this->cu[tindex+1][1] = (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2; \ - this->cu[tindex+2][1] = (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2; \ - } \ - else \ - std::fill_n((R*)(this->cu+tindex), 6, 0.0); \ - ); \ - /*this->symmetrize(this->cu, 3);*/ \ -} \ - \ -template<> \ -void fluid_solver<R>::ift_velocity() \ -{ \ - FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity )); \ -} \ - \ -template<> \ -void fluid_solver<R>::ift_vorticity() \ -{ \ - std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \ - FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \ -} \ - \ -template<> \ -void fluid_solver<R>::dft_velocity() \ -{ \ - FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \ -} \ - \ -template<> \ -void fluid_solver<R>::dft_vorticity() \ -{ \ - std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \ - FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \ -} \ - \ -template<> \ -void fluid_solver<R>::add_forcing(\ - FFTW(complex) *acc_field, FFTW(complex) *vort_field, R factor) \ -{ \ - if (strcmp(this->forcing_type, "none") == 0) \ - return; \ - if (strcmp(this->forcing_type, "Kolmogorov") == 0) \ - { \ - ptrdiff_t cindex; \ - if (this->cd->myrank == this->cd->rank[this->fmode]) \ - { \ - cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \ - acc_field[cindex+2][0] -= this->famplitude*factor/2; \ - } \ - if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode]) \ - { \ - cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \ - acc_field[cindex+2][0] -= this->famplitude*factor/2; \ - } \ - return; \ - } \ - if (strcmp(this->forcing_type, "linear") == 
0) \ - { \ - double knorm; \ - CLOOP( \ - this, \ - knorm = sqrt(this->kx[xindex]*this->kx[xindex] + \ - this->ky[yindex]*this->ky[yindex] + \ - this->kz[zindex]*this->kz[zindex]); \ - if ((this->fk0 <= knorm) && \ - (this->fk1 >= knorm)) \ - for (int c=0; c<3; c++) \ - for (int i=0; i<2; i++) \ - acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor; \ - ); \ - return; \ - } \ -} \ - \ -template<> \ -void fluid_solver<R>::omega_nonlin( \ - int src) \ -{ \ - assert(src >= 0 && src < 3); \ - this->compute_velocity(this->cv[src]); \ - /* get fields from Fourier space to real space */ \ - FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity )); \ - FFTW(execute)(*((FFTW(plan)*)this->vc2r[src])); \ - /* compute cross product $u \times \omega$, and normalize */ \ - R tmp[3][2]; \ - ptrdiff_t tindex; \ - RLOOP ( \ - this, \ - tindex = 3*rindex; \ - for (int cc=0; cc<3; cc++) \ - tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] - \ - this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]); \ - for (int cc=0; cc<3; cc++) \ - this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor; \ - ); \ - /* go back to Fourier space */ \ - this->clean_up_real_space(this->ru, 3); \ - FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \ - this->dealias(this->cu, 3); \ - /* $\imath k \times Fourier(u \times \omega)$ */ \ - CLOOP( \ - this, \ - tindex = 3*cindex; \ - { \ - tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \ - tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \ - tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \ - tmp[0][1] = (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \ - tmp[1][1] = (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \ - tmp[2][1] = (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \ - } \ - for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \ - this->cu[tindex+cc][i] = tmp[cc][i]; \ - ); \ - this->add_forcing(this->cu, this->cv[src], 1.0); \ - this->force_divfree(this->cu); \ -} \ - \ -template<> \ -void fluid_solver<R>::step(double dt) \ -{ \ - double factor0, factor1; \ - std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \ - this->omega_nonlin(0); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - factor0 = exp(-this->nu * k2 * dt); \ - for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \ - this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] + \ - dt*this->cu[3*cindex+cc][i])*factor0; \ - } \ - ); \ - \ - this->omega_nonlin(1); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - factor0 = exp(-this->nu * k2 * dt/2); \ - factor1 = exp( this->nu * k2 * dt/2); \ - for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \ - this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 + \ - (this->cv[1][3*cindex+cc][i] + \ - dt*this->cu[3*cindex+cc][i])*factor1)*0.25; \ - } \ - ); \ - \ - this->omega_nonlin(2); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - factor0 = exp(-this->nu * k2 * dt * 0.5); \ - for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \ - this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 + \ - 2*(this->cv[2][3*cindex+cc][i] + \ - dt*this->cu[3*cindex+cc][i]))*factor0/3; \ - } \ - ); \ - \ - this->force_divfree(this->cvorticity); \ - this->symmetrize(this->cvorticity, 3); \ - this->iteration++; \ -} \ - \ 
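/* Note on the integrator: the three CLOOP_K2 stages of step() above
 * (kept, with identical arithmetic, in the templated rewrite further down)
 * implement the third-order strong-stability-preserving Runge-Kutta scheme
 * of Shu and Osher, with the viscous term integrated exactly through
 * exponential integrating factors. Writing N(w) for the dealiased nonlinear
 * term computed by omega_nonlin, and E(s) = exp(-nu k^2 s) for the viscous
 * factor (this notation is ours, for exposition only), the stages read:
 *
 *   w1 = E(dt) (w0 + dt N(w0))
 *   w2 = (3 E(dt/2) w0 + E(-dt/2) (w1 + dt N(w1))) / 4
 *   w3 = (E(dt) w0 + 2 E(dt/2) (w2 + dt N(w2))) / 3
 *
 * cv[3] aliases cvorticity (see the constructor), so the last stage writes
 * the updated vorticity in place; with E = 1 these are the classical
 * Shu-Osher convex combinations.
 */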
-template<> \ -int fluid_solver<R>::read(char field, char representation) \ -{ \ - char fname[512]; \ - int read_result; \ - if (field == 'v') \ - { \ - if (representation == 'c') \ - { \ - this->fill_up_filename("cvorticity", fname); \ - read_result = this->cd->read(fname, (void*)this->cvorticity); \ - if (read_result != EXIT_SUCCESS) \ - return read_result; \ - } \ - if (representation == 'r') \ - { \ - read_result = this->read_base("rvorticity", this->rvorticity); \ - if (read_result != EXIT_SUCCESS) \ - return read_result; \ - else \ - FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \ - } \ - this->low_pass_Fourier(this->cvorticity, 3, this->kM); \ - this->force_divfree(this->cvorticity); \ - this->symmetrize(this->cvorticity, 3); \ - return EXIT_SUCCESS; \ - } \ - if ((field == 'u') && (representation == 'c')) \ - { \ - read_result = this->read_base("cvelocity", this->cvelocity); \ - this->low_pass_Fourier(this->cvelocity, 3, this->kM); \ - this->force_divfree(this->cvorticity); \ - this->symmetrize(this->cvorticity, 3); \ - return read_result; \ - } \ - if ((field == 'u') && (representation == 'r')) \ - return this->read_base("rvelocity", this->rvelocity); \ - return EXIT_FAILURE; \ -} \ - \ -template<> \ -int fluid_solver<R>::write(char field, char representation) \ -{ \ - char fname[512]; \ - if ((field == 'v') && (representation == 'c')) \ - { \ - this->fill_up_filename("cvorticity", fname); \ - return this->cd->write(fname, (void*)this->cvorticity); \ - } \ - if ((field == 'v') && (representation == 'r')) \ - { \ - FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \ - clip_zero_padding<R>(this->rd, this->rvorticity, 3); \ - this->fill_up_filename("rvorticity", fname); \ - return this->rd->write(fname, this->rvorticity); \ - } \ - this->compute_velocity(this->cvorticity); \ - if ((field == 'u') && (representation == 'c')) \ - { \ - this->fill_up_filename("cvelocity", fname); \ - return this->cd->write(fname, this->cvelocity); \ - } \ - if ((field == 'u') && (representation == 'r')) \ - { \ - this->ift_velocity(); \ - clip_zero_padding<R>(this->rd, this->rvelocity, 3); \ - this->fill_up_filename("rvelocity", fname); \ - return this->rd->write(fname, this->rvelocity); \ - } \ - return EXIT_FAILURE; \ -} \ - \ -template<> \ -int fluid_solver<R>::write_rTrS2() \ -{ \ - char fname[512]; \ - this->fill_up_filename("rTrS2", fname); \ - FFTW(complex) *ca; \ - R *ra; \ - ca = FFTW(alloc_complex)(this->cd->local_size*3); \ - ra = (R*)(ca); \ - this->compute_velocity(this->cvorticity); \ - this->compute_vector_gradient(ca, this->cvelocity); \ - for (int cc=0; cc<3; cc++) \ - { \ - std::copy( \ - (R*)(ca + cc*this->cd->local_size), \ - (R*)(ca + (cc+1)*this->cd->local_size), \ - (R*)this->cv[1]); \ - FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \ - std::copy( \ - this->rv[1], \ - this->rv[1] + this->cd->local_size*2, \ - ra + cc*this->cd->local_size*2); \ - } \ - /* velocity gradient is now stored, in real space, in ra */ \ - R *dx_u, *dy_u, *dz_u; \ - dx_u = ra; \ - dy_u = ra + 2*this->cd->local_size; \ - dz_u = ra + 4*this->cd->local_size; \ - R *trS2 = FFTW(alloc_real)((this->cd->local_size/3)*2); \ - double average_local = 0; \ - RLOOP( \ - this, \ - R AxxAxx; \ - R AyyAyy; \ - R AzzAzz; \ - R Sxy; \ - R Syz; \ - R Szx; \ - ptrdiff_t tindex = 3*rindex; \ - AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \ - AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \ - AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \ - Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \ - Syz = dy_u[tindex+2]+dz_u[tindex+1]; \ - Szx = 
dz_u[tindex+0]+dx_u[tindex+2]; \ - trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz + \ - (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \ - average_local += trS2[rindex]; \ - ); \ - double average; \ - MPI_Allreduce( \ - &average_local, \ - &average, \ - 1, \ - MPI_DOUBLE, MPI_SUM, this->cd->comm); \ - DEBUG_MSG("average TrS2 is %g\n", average); \ - FFTW(free)(ca); \ - /* output goes here */ \ - int ntmp[3]; \ - ntmp[0] = this->rd->sizes[0]; \ - ntmp[1] = this->rd->sizes[1]; \ - ntmp[2] = this->rd->sizes[2]; \ - field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \ - clip_zero_padding<R>(scalar_descriptor, trS2, 1); \ - int return_value = scalar_descriptor->write(fname, trS2); \ - delete scalar_descriptor; \ - FFTW(free)(trS2); \ - return return_value; \ -} \ - \ -template<> \ -int fluid_solver<R>::write_renstrophy() \ -{ \ - char fname[512]; \ - this->fill_up_filename("renstrophy", fname); \ - R *enstrophy = FFTW(alloc_real)((this->cd->local_size/3)*2); \ - this->ift_vorticity(); \ - double average_local = 0; \ - RLOOP( \ - this, \ - ptrdiff_t tindex = 3*rindex; \ - enstrophy[rindex] = ( \ - this->rvorticity[tindex+0]*this->rvorticity[tindex+0] + \ - this->rvorticity[tindex+1]*this->rvorticity[tindex+1] + \ - this->rvorticity[tindex+2]*this->rvorticity[tindex+2] \ - )/2; \ - average_local += enstrophy[rindex]; \ - ); \ - double average; \ - MPI_Allreduce( \ - &average_local, \ - &average, \ - 1, \ - MPI_DOUBLE, MPI_SUM, this->cd->comm); \ - DEBUG_MSG("average enstrophy is %g\n", average); \ - /* output goes here */ \ - int ntmp[3]; \ - ntmp[0] = this->rd->sizes[0]; \ - ntmp[1] = this->rd->sizes[1]; \ - ntmp[2] = this->rd->sizes[2]; \ - field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \ - clip_zero_padding<R>(scalar_descriptor, enstrophy, 1); \ - int return_value = scalar_descriptor->write(fname, enstrophy); \ - delete scalar_descriptor; \ - FFTW(free)(enstrophy); \ - return return_value; \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_pressure(FFTW(complex) *pressure) \ -{ \ - /* assume velocity is already in real space representation */ \ - ptrdiff_t tindex; \ - \ - /* diagonal terms 11 22 33 */\ - RLOOP ( \ - this, \ - tindex = 3*rindex; \ - for (int cc=0; cc<3; cc++) \ - this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc]; \ - ); \ - this->clean_up_real_space(this->rv[1], 3); \ - FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \ - this->dealias(this->cv[1], 3); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2 && k2 > 0) \ - { \ - tindex = 3*cindex; \ - for (int i=0; i<2; i++) \ - { \ - pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] + \ - this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] + \ - this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]); \ - } \ - } \ - else \ - std::fill_n((R*)(pressure+cindex), 2, 0.0); \ - ); \ - /* off-diagonal terms 12 23 31 */\ - RLOOP ( \ - this, \ - tindex = 3*rindex; \ - for (int cc=0; cc<3; cc++) \ - this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3]; \ - ); \ - this->clean_up_real_space(this->rv[1], 3); \ - FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \ - this->dealias(this->cv[1], 3); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2 && k2 > 0) \ - { \ - tindex = 3*cindex; \ - for (int i=0; i<2; i++) \ - { \ - pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] + \ - this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] + \ - 
this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]); \ - pressure[cindex][i] /= this->normalization_factor*k2; \ - } \ - } \ - ); \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_gradient_statistics( \ - FFTW(complex) *vec, \ - double *gradu_moments, \ - double *trS2QR_moments, \ - ptrdiff_t *gradu_hist, \ - ptrdiff_t *trS2QR_hist, \ - ptrdiff_t *QR2D_hist, \ - double trS2QR_max_estimates[], \ - double gradu_max_estimates[], \ - int nbins, \ - int QR2D_nbins) \ -{ \ - FFTW(complex) *ca; \ - R *ra; \ - ca = FFTW(alloc_complex)(this->cd->local_size*3); \ - ra = (R*)(ca); \ - this->compute_vector_gradient(ca, vec); \ - for (int cc=0; cc<3; cc++) \ - { \ - std::copy( \ - (R*)(ca + cc*this->cd->local_size), \ - (R*)(ca + (cc+1)*this->cd->local_size), \ - (R*)this->cv[1]); \ - FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \ - std::copy( \ - this->rv[1], \ - this->rv[1] + this->cd->local_size*2, \ - ra + cc*this->cd->local_size*2); \ - } \ - /* velocity gradient is now stored, in real space, in ra */ \ - std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0); \ - R *dx_u, *dy_u, *dz_u; \ - dx_u = ra; \ - dy_u = ra + 2*this->cd->local_size; \ - dz_u = ra + 4*this->cd->local_size; \ - double binsize[2]; \ - double tmp_max_estimate[3]; \ - tmp_max_estimate[0] = trS2QR_max_estimates[0]; \ - tmp_max_estimate[1] = trS2QR_max_estimates[1]; \ - tmp_max_estimate[2] = trS2QR_max_estimates[2]; \ - binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins; \ - binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins; \ - ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins]; \ - std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0); \ - RLOOP( \ - this, \ - R AxxAxx; \ - R AyyAyy; \ - R AzzAzz; \ - R AxyAyx; \ - R AyzAzy; \ - R AzxAxz; \ - R Sxy; \ - R Syz; \ - R Szx; \ - ptrdiff_t tindex = 3*rindex; \ - AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \ - AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \ - AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \ - AxyAyx = dx_u[tindex+1]*dy_u[tindex+0]; \ - AyzAzy = dy_u[tindex+2]*dz_u[tindex+1]; \ - AzxAxz = dz_u[tindex+0]*dx_u[tindex+2]; \ - this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz; \ - this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) + \ - dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) + \ - dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) + \ - dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] + \ - dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]); \ - int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0])); \ - int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1])); \ - if ((bin0 >= 0 && bin0 < QR2D_nbins) && \ - (bin1 >= 0 && bin1 < QR2D_nbins)) \ - local_hist[bin1*QR2D_nbins + bin0]++; \ - Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \ - Syz = dy_u[tindex+2]+dz_u[tindex+1]; \ - Szx = dz_u[tindex+0]+dx_u[tindex+2]; \ - this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz + \ - (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \ - ); \ - MPI_Allreduce( \ - local_hist, \ - QR2D_hist, \ - QR2D_nbins * QR2D_nbins, \ - MPI_INT64_T, MPI_SUM, this->cd->comm); \ - delete[] local_hist; \ - this->compute_rspace_stats3( \ - this->rv[1], \ - trS2QR_moments, \ - trS2QR_hist, \ - tmp_max_estimate, \ - nbins); \ - double *tmp_moments = new double[10*3]; \ - ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3]; \ - for (int cc=0; cc<3; cc++) \ - { \ - tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0]; \ - tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1]; \ - tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2]; \ - 
this->compute_rspace_stats3( \ - dx_u, \ - tmp_moments, \ - tmp_hist, \ - tmp_max_estimate, \ - nbins); \ - for (int n = 0; n < 10; n++) \ - for (int i = 0; i < 3 ; i++) \ - { \ - gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i]; \ - } \ - for (int n = 0; n < nbins; n++) \ - for (int i = 0; i < 3; i++) \ - { \ - gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i]; \ - } \ - } \ - delete[] tmp_moments; \ - delete[] tmp_hist; \ - FFTW(free)(ca); \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_Lagrangian_acceleration(R (*acceleration)[2]) \ -{ \ - ptrdiff_t tindex; \ - FFTW(complex) *pressure; \ - pressure = FFTW(alloc_complex)(this->cd->local_size/3); \ - this->compute_velocity(this->cvorticity); \ - this->ift_velocity(); \ - this->compute_pressure(pressure); \ - this->compute_velocity(this->cvorticity); \ - std::fill_n((R*)this->cv[1], 2*this->cd->local_size, 0.0); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - tindex = 3*cindex; \ - for (int cc=0; cc<3; cc++) \ - for (int i=0; i<2; i++) \ - this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \ - if (strcmp(this->forcing_type, "linear") == 0) \ - { \ - double knorm = sqrt(k2); \ - if ((this->fk0 <= knorm) && \ - (this->fk1 >= knorm)) \ - for (int c=0; c<3; c++) \ - for (int i=0; i<2; i++) \ - this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \ - } \ - this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1]; \ - this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1]; \ - this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1]; \ - this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0]; \ - this->cv[1][tindex+1][1] -= this->ky[yindex]*pressure[cindex][0]; \ - this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0]; \ - } \ - ); \ - std::copy( \ - (R*)this->cv[1], \ - (R*)(this->cv[1] + this->cd->local_size), \ - (R*)acceleration); \ - FFTW(free)(pressure); \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_Eulerian_acceleration(FFTW(complex) *acceleration) \ -{ \ - std::fill_n((R*)(acceleration), 2*this->cd->local_size, 0.0); \ - ptrdiff_t tindex; \ - this->compute_velocity(this->cvorticity); \ - /* put in linear terms */ \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - tindex = 3*cindex; \ - for (int cc=0; cc<3; cc++) \ - for (int i=0; i<2; i++) \ - acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \ - if (strcmp(this->forcing_type, "linear") == 0) \ - { \ - double knorm = sqrt(k2); \ - if ((this->fk0 <= knorm) && \ - (this->fk1 >= knorm)) \ - { \ - for (int c=0; c<3; c++) \ - for (int i=0; i<2; i++) \ - acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \ - } \ - } \ - } \ - ); \ - this->ift_velocity(); \ - /* compute uu */ \ - /* 11 22 33 */ \ - RLOOP ( \ - this, \ - tindex = 3*rindex; \ - for (int cc=0; cc<3; cc++) \ - this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor; \ - ); \ - this->clean_up_real_space(this->rv[1], 3); \ - FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \ - this->dealias(this->cv[1], 3); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - tindex = 3*cindex; \ - acceleration[tindex+0][0] += \ - this->kx[xindex]*this->cv[1][tindex+0][1]; \ - acceleration[tindex+0][1] += \ - -this->kx[xindex]*this->cv[1][tindex+0][0]; \ - acceleration[tindex+1][0] += \ - this->ky[yindex]*this->cv[1][tindex+1][1]; \ - acceleration[tindex+1][1] += \ - -this->ky[yindex]*this->cv[1][tindex+1][0]; \ - acceleration[tindex+2][0] += \ - 
this->kz[zindex]*this->cv[1][tindex+2][1]; \ - acceleration[tindex+2][1] += \ - -this->kz[zindex]*this->cv[1][tindex+2][0]; \ - } \ - ); \ - /* 12 23 31 */ \ - RLOOP ( \ - this, \ - tindex = 3*rindex; \ - for (int cc=0; cc<3; cc++) \ - this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor; \ - ); \ - this->clean_up_real_space(this->rv[1], 3); \ - FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \ - this->dealias(this->cv[1], 3); \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - tindex = 3*cindex; \ - acceleration[tindex+0][0] += \ - (this->ky[yindex]*this->cv[1][tindex+0][1] + \ - this->kz[zindex]*this->cv[1][tindex+2][1]); \ - acceleration[tindex+0][1] += \ - - (this->ky[yindex]*this->cv[1][tindex+0][0] + \ - this->kz[zindex]*this->cv[1][tindex+2][0]); \ - acceleration[tindex+1][0] += \ - (this->kz[zindex]*this->cv[1][tindex+1][1] + \ - this->kx[xindex]*this->cv[1][tindex+0][1]); \ - acceleration[tindex+1][1] += \ - - (this->kz[zindex]*this->cv[1][tindex+1][0] + \ - this->kx[xindex]*this->cv[1][tindex+0][0]); \ - acceleration[tindex+2][0] += \ - (this->kx[xindex]*this->cv[1][tindex+2][1] + \ - this->ky[yindex]*this->cv[1][tindex+1][1]); \ - acceleration[tindex+2][1] += \ - - (this->kx[xindex]*this->cv[1][tindex+2][0] + \ - this->ky[yindex]*this->cv[1][tindex+1][0]); \ - } \ - ); \ - if (this->cd->myrank == this->cd->rank[0]) \ - std::fill_n((R*)(acceleration), 6, 0.0); \ - this->force_divfree(acceleration); \ -} \ - \ -template<> \ -void fluid_solver<R>::compute_Lagrangian_acceleration(R *acceleration) \ -{ \ - this->compute_Lagrangian_acceleration((FFTW(complex)*)acceleration); \ - FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \ - std::copy( \ - this->rv[1], \ - this->rv[1] + 2*this->cd->local_size, \ - acceleration); \ -} \ - \ -template<> \ -int fluid_solver<R>::write_rpressure() \ -{ \ - char fname[512]; \ - FFTW(complex) *pressure; \ - pressure = FFTW(alloc_complex)(this->cd->local_size/3); \ - this->compute_velocity(this->cvorticity); \ - this->ift_velocity(); \ - this->compute_pressure(pressure); \ - this->fill_up_filename("rpressure", fname); \ - R *rpressure = FFTW(alloc_real)((this->cd->local_size/3)*2); \ - FFTW(plan) c2r; \ - c2r = FFTW(mpi_plan_dft_c2r_3d)( \ - this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2], \ - pressure, rpressure, this->cd->comm, \ - this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \ - FFTW(execute)(c2r); \ - /* output goes here */ \ - int ntmp[3]; \ - ntmp[0] = this->rd->sizes[0]; \ - ntmp[1] = this->rd->sizes[1]; \ - ntmp[2] = this->rd->sizes[2]; \ - field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \ - clip_zero_padding<R>(scalar_descriptor, rpressure, 1); \ - int return_value = scalar_descriptor->write(fname, rpressure); \ - delete scalar_descriptor; \ - FFTW(destroy_plan)(c2r); \ - FFTW(free)(pressure); \ - FFTW(free)(rpressure); \ - return return_value; \ -} \ +template <class rnumber> +fluid_solver<rnumber>::fluid_solver( + const char *NAME, + int nx, + int ny, + int nz, + double DKX, + double DKY, + double DKZ, + int DEALIAS_TYPE, + unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<rnumber>( + NAME, + nx , ny , nz, + DKX, DKY, DKZ, + DEALIAS_TYPE, + FFTW_PLAN_RIGOR) +{ + TIMEZONE("fluid_solver::fluid_solver"); + this->cvorticity = fftw_interface<rnumber>::alloc_complex(this->cd->local_size); + this->cvelocity = fftw_interface<rnumber>::alloc_complex(this->cd->local_size); + this->rvorticity = 
fftw_interface<rnumber>::alloc_real(this->cd->local_size*2); + /*this->rvelocity = (rnumber*)(this->cvelocity);*/ + this->rvelocity = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2); + + this->ru = this->rvelocity; + this->cu = this->cvelocity; -/*****************************************************************************/ + this->rv[0] = this->rvorticity; + this->rv[3] = this->rvorticity; + this->cv[0] = this->cvorticity; + this->cv[3] = this->cvorticity; + this->cv[1] = fftw_interface<rnumber>::alloc_complex(this->cd->local_size); + this->cv[2] = this->cv[1]; + this->rv[1] = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2); + this->rv[2] = this->rv[1]; + this->c2r_vorticity = new typename fftw_interface<rnumber>::plan; + this->r2c_vorticity = new typename fftw_interface<rnumber>::plan; + this->c2r_velocity = new typename fftw_interface<rnumber>::plan; + this->r2c_velocity = new typename fftw_interface<rnumber>::plan; + + ptrdiff_t sizes[] = {nz, + ny, + nx}; + + *this->c2r_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->cvorticity, this->rvorticity, + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + + *this->r2c_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->rvorticity, this->cvorticity, + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); + + *this->c2r_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->cvelocity, this->rvelocity, + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + + *this->r2c_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->rvelocity, this->cvelocity, + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); + + this->uc2r = this->c2r_velocity; + this->ur2c = this->r2c_velocity; + this->vc2r[0] = this->c2r_vorticity; + this->vr2c[0] = this->r2c_vorticity; + + this->vc2r[1] = new typename fftw_interface<rnumber>::plan; + this->vr2c[1] = new typename fftw_interface<rnumber>::plan; + this->vc2r[2] = new typename fftw_interface<rnumber>::plan; + this->vr2c[2] = new typename fftw_interface<rnumber>::plan; + + *(this->vc2r[1]) = fftw_interface<rnumber>::mpi_plan_many_dft_c2r( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->cv[1], this->rv[1], + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + + *this->vc2r[2] = fftw_interface<rnumber>::mpi_plan_many_dft_c2r( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->cv[2], this->rv[2], + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + + *this->vr2c[1] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->rv[1], this->cv[1], + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); + + *this->vr2c[2] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c( + 3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, + this->rv[2], this->cv[2], + MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); + + /* ``physical'' parameters etc, initialized here just in case */ + + this->nu = 0.1; + this->fmode = 1; + this->famplitude = 1.0; + this->fk0 = 0; + this->fk1 = 3.0; + /* initialization of fields must be done AFTER planning */ + std::fill_n((rnumber*)this->cvorticity, 
this->cd->local_size*2, 0.0); + std::fill_n((rnumber*)this->cvelocity, this->cd->local_size*2, 0.0); + std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0); + std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); + std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0); + std::fill_n(this->rv[1], this->cd->local_size*2, 0.0); + std::fill_n(this->rv[2], this->cd->local_size*2, 0.0); +} + +template <class rnumber> +fluid_solver<rnumber>::~fluid_solver() +{ + fftw_interface<rnumber>::destroy_plan(*this->c2r_vorticity); + fftw_interface<rnumber>::destroy_plan(*this->r2c_vorticity); + fftw_interface<rnumber>::destroy_plan(*this->c2r_velocity ); + fftw_interface<rnumber>::destroy_plan(*this->r2c_velocity ); + fftw_interface<rnumber>::destroy_plan(*this->vc2r[1]); + fftw_interface<rnumber>::destroy_plan(*this->vr2c[1]); + fftw_interface<rnumber>::destroy_plan(*this->vc2r[2]); + fftw_interface<rnumber>::destroy_plan(*this->vr2c[2]); + + delete this->c2r_vorticity; + delete this->r2c_vorticity; + delete this->c2r_velocity ; + delete this->r2c_velocity ; + delete this->vc2r[1]; + delete this->vr2c[1]; + delete this->vc2r[2]; + delete this->vr2c[2]; + + fftw_interface<rnumber>::free(this->cv[1]); + fftw_interface<rnumber>::free(this->rv[1]); + fftw_interface<rnumber>::free(this->cvorticity); + fftw_interface<rnumber>::free(this->rvorticity); + fftw_interface<rnumber>::free(this->cvelocity); + fftw_interface<rnumber>::free(this->rvelocity); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_vorticity() +{ + TIMEZONE("fluid_solver::compute_vorticity"); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + // cindex indexing is thread safe (and tindex too) + it is a write + ptrdiff_t tindex = 3*cindex; + if (k2 <= this->kM2) + { + this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); + this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); + this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); + this->cvorticity[tindex+0][1] = (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); + this->cvorticity[tindex+1][1] = (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); + this->cvorticity[tindex+2][1] = (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); + } + else{ + std::fill_n((rnumber*)(this->cvorticity+tindex), 6, 0.0); + } + } + ); + this->symmetrize(this->cvorticity, 3); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_velocity(rnumber (*__restrict__ vorticity)[2]) +{ + TIMEZONE("fluid_solver::compute_velocity"); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + // cindex indexing is thread safe (and tindex too) + it is a write + ptrdiff_t tindex = 3*cindex; + if (k2 <= this->kM2 && k2 > 0) + { + this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2; + this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2; + this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2; + this->cu[tindex+0][1] = (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2; + 
this->cu[tindex+1][1] = (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2; + this->cu[tindex+2][1] = (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2; + } + else + std::fill_n((rnumber*)(this->cu+tindex), 6, 0.0); + } + ); + /*this->symmetrize(this->cu, 3);*/ +} + +template <class rnumber> +void fluid_solver<rnumber>::ift_velocity() +{ + TIMEZONE("fluid_solver::ift_velocity"); + fftw_interface<rnumber>::execute(*(this->c2r_velocity )); +} + +template <class rnumber> +void fluid_solver<rnumber>::ift_vorticity() +{ + TIMEZONE("fluid_solver::ift_vorticity"); + std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); + fftw_interface<rnumber>::execute(*(this->c2r_vorticity )); +} + +template <class rnumber> +void fluid_solver<rnumber>::dft_velocity() +{ + TIMEZONE("fluid_solver::dft_velocity"); + fftw_interface<rnumber>::execute(*(this->r2c_velocity )); +} + +template <class rnumber> +void fluid_solver<rnumber>::dft_vorticity() +{ + TIMEZONE("fluid_solver::dft_vorticity"); + std::fill_n((rnumber*)this->cvorticity, this->cd->local_size*2, 0.0); + fftw_interface<rnumber>::execute(*(this->r2c_vorticity )); +} + +template <class rnumber> +void fluid_solver<rnumber>::add_forcing( + rnumber (*__restrict__ acc_field)[2], rnumber (*__restrict__ vort_field)[2], rnumber factor) +{ + TIMEZONE("fluid_solver::add_forcing"); + if (strcmp(this->forcing_type, "none") == 0) + return; + if (strcmp(this->forcing_type, "Kolmogorov") == 0) + { + ptrdiff_t cindex; + if (this->cd->myrank == this->cd->rank[this->fmode]) + { + cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; + acc_field[cindex+2][0] -= this->famplitude*factor/2; + } + if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode]) + { + cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; + acc_field[cindex+2][0] -= this->famplitude*factor/2; + } + return; + } + if (strcmp(this->forcing_type, "linear") == 0) + { + CLOOP( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){ + // cindex indexing is thread safe (and cindex*3+c too) + double knorm = sqrt(this->kx[xindex]*this->kx[xindex] + + this->ky[yindex]*this->ky[yindex] + + this->kz[zindex]*this->kz[zindex]); + if ((this->fk0 <= knorm) && (this->fk1 >= knorm)) + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor; + } + ); + return; + } +} + +template <class rnumber> +void fluid_solver<rnumber>::omega_nonlin( + int src) +{ + TIMEZONE("fluid_solver::omega_nonlin"); + assert(src >= 0 && src < 3); + this->compute_velocity(this->cv[src]); + /* get fields from Fourier space to real space */ + { + TIMEZONE("fluid_solver::omega_nonlin::fftw"); + fftw_interface<rnumber>::execute(*(this->c2r_velocity )); + fftw_interface<rnumber>::execute(*(this->vc2r[src])); + } + /* compute cross product $u \times \omega$, and normalize */ + { + TIMEZONE("fluid_solver::omega_nonlin::RLOOP"); + RLOOP ( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + ptrdiff_t tindex = 3*rindex; + rnumber tmp[3][2]; + for (int cc=0; cc<3; cc++) + tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] - + this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]); + // Access to rindex is thread safe so there is no overlap between threads + for (int cc=0; cc<3; cc++) + 
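+                        // dividing by normalization_factor keeps the result correctly
+                        // scaled through the unnormalized FFTW transform pair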
this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor; + } + ); + } + /* go back to Fourier space */ + this->clean_up_real_space(this->ru, 3); + { + TIMEZONE("fluid_solver::omega_nonlin::fftw-2"); + fftw_interface<rnumber>::execute(*(this->r2c_velocity )); + } + this->dealias(this->cu, 3); + /* $\imath k \times Fourier(u \times \omega)$ */ + { + TIMEZONE("fluid_solver::omega_nonlin::CLOOP"); + CLOOP( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){ + rnumber tmp[3][2]; + ptrdiff_t tindex = 3*cindex; + { + tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); + tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); + tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); + tmp[0][1] = (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); + tmp[1][1] = (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); + tmp[2][1] = (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); + } + // cindex indexing is thread safe so it is 3*cindex so there is no overlap between threads + for (int cc=0; cc<3; cc++) + for (int i=0; i<2; i++) + this->cu[tindex+cc][i] = tmp[cc][i]; + } + ); + } + { + TIMEZONE("fluid_solver::omega_nonlin::add_forcing"); + this->add_forcing(this->cu, this->cv[src], 1.0); + } + { + TIMEZONE("fluid_solver::omega_nonlin::force_divfree"); + this->force_divfree(this->cu); + } +} + +template <class rnumber> +void fluid_solver<rnumber>::step(double dt) +{ + TIMEZONE("fluid_solver::step"); + std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0); + this->omega_nonlin(0); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){ + if (k2 <= this->kM2) + { + double factor0 = exp(-this->nu * k2 * dt); + // cindex indexing is thread safe so there is no overlap between threads + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] + + dt*this->cu[3*cindex+cc][i])*factor0; + } + } + ); + + this->omega_nonlin(1); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){ + if (k2 <= this->kM2) + { + double factor0 = exp(-this->nu * k2 * dt/2); + double factor1 = exp( this->nu * k2 * dt/2); + // cindex indexing is thread safe so there is no overlap between threads + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 + + (this->cv[1][3*cindex+cc][i] + + dt*this->cu[3*cindex+cc][i])*factor1)*0.25; + } + } + ); + + this->omega_nonlin(2); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){ + if (k2 <= this->kM2) + { + double factor0 = exp(-this->nu * k2 * dt * 0.5); + // cindex indexing is thread safe so there is no overlap between threads + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 + + 2*(this->cv[2][3*cindex+cc][i] + + dt*this->cu[3*cindex+cc][i]))*factor0/3; + } + } + ); + + this->force_divfree(this->cvorticity); + this->symmetrize(this->cvorticity, 3); + this->iteration++; +} + +template <class rnumber> +int fluid_solver<rnumber>::read(char field, char representation) +{ + TIMEZONE("fluid_solver::read"); + char fname[512]; + int 
read_result; + if (field == 'v') + { + if (representation == 'c') + { + this->fill_up_filename("cvorticity", fname); + read_result = this->cd->read(fname, (void*)this->cvorticity); + if (read_result != EXIT_SUCCESS) + return read_result; + } + if (representation == 'r') + { + read_result = this->read_base("rvorticity", this->rvorticity); + if (read_result != EXIT_SUCCESS) + return read_result; + else + fftw_interface<rnumber>::execute(*(this->r2c_vorticity )); + } + this->low_pass_Fourier(this->cvorticity, 3, this->kM); + this->force_divfree(this->cvorticity); + this->symmetrize(this->cvorticity, 3); + return EXIT_SUCCESS; + } + if ((field == 'u') && (representation == 'c')) + { + read_result = this->read_base("cvelocity", this->cvelocity); + this->low_pass_Fourier(this->cvelocity, 3, this->kM); + this->force_divfree(this->cvorticity); + this->symmetrize(this->cvorticity, 3); + return read_result; + } + if ((field == 'u') && (representation == 'r')) + return this->read_base("rvelocity", this->rvelocity); + return EXIT_FAILURE; +} + +template <class rnumber> +int fluid_solver<rnumber>::write(char field, char representation) +{ + TIMEZONE("fluid_solver::write"); + char fname[512]; + if ((field == 'v') && (representation == 'c')) + { + this->fill_up_filename("cvorticity", fname); + return this->cd->write(fname, (void*)this->cvorticity); + } + if ((field == 'v') && (representation == 'r')) + { + fftw_interface<rnumber>::execute(*(this->c2r_vorticity )); + clip_zero_padding<rnumber>(this->rd, this->rvorticity, 3); + this->fill_up_filename("rvorticity", fname); + return this->rd->write(fname, this->rvorticity); + } + this->compute_velocity(this->cvorticity); + if ((field == 'u') && (representation == 'c')) + { + this->fill_up_filename("cvelocity", fname); + return this->cd->write(fname, this->cvelocity); + } + if ((field == 'u') && (representation == 'r')) + { + this->ift_velocity(); + clip_zero_padding<rnumber>(this->rd, this->rvelocity, 3); + this->fill_up_filename("rvelocity", fname); + return this->rd->write(fname, this->rvelocity); + } + return EXIT_FAILURE; +} + +template <class rnumber> +int fluid_solver<rnumber>::write_rTrS2() +{ + TIMEZONE("fluid_solver::write_rTrS2"); + char fname[512]; + this->fill_up_filename("rTrS2", fname); + typename fftw_interface<rnumber>::complex *ca; + rnumber *ra; + ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3); + ra = (rnumber*)(ca); + this->compute_velocity(this->cvorticity); + this->compute_vector_gradient(ca, this->cvelocity); + for (int cc=0; cc<3; cc++) + { + std::copy( + (rnumber*)(ca + cc*this->cd->local_size), + (rnumber*)(ca + (cc+1)*this->cd->local_size), + (rnumber*)this->cv[1]); + fftw_interface<rnumber>::execute(*(this->vc2r[1])); + std::copy( + this->rv[1], + this->rv[1] + this->cd->local_size*2, + ra + cc*this->cd->local_size*2); + } + /* velocity gradient is now stored, in real space, in ra */ + rnumber *dx_u, *dy_u, *dz_u; + dx_u = ra; + dy_u = ra + 2*this->cd->local_size; + dz_u = ra + 4*this->cd->local_size; + rnumber *trS2 = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2); + shared_array<double> average_local(1, [&](double* data){ + data[0] = 0; + }); + + RLOOP( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + rnumber AxxAxx; + rnumber AyyAyy; + rnumber AzzAzz; + rnumber Sxy; + rnumber Syz; + rnumber Szx; + ptrdiff_t tindex = 3*rindex; + AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; + AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; + AzzAzz = 
dz_u[tindex+2]*dz_u[tindex+2]; + Sxy = dx_u[tindex+1]+dy_u[tindex+0]; + Syz = dy_u[tindex+2]+dz_u[tindex+1]; + Szx = dz_u[tindex+0]+dx_u[tindex+2]; + // rindex indexing is thread safe; each thread writes to its own entries + trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz + + (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); + average_local.getMine()[0] += trS2[rindex]; + } + ); + average_local.mergeParallel(); + double average; + MPI_Allreduce( + average_local.getMasterData(), + &average, + 1, + MPI_DOUBLE, MPI_SUM, this->cd->comm); + DEBUG_MSG("average TrS2 is %g\n", average); + fftw_interface<rnumber>::free(ca); + /* output goes here */ + int ntmp[3]; + ntmp[0] = this->rd->sizes[0]; + ntmp[1] = this->rd->sizes[1]; + ntmp[2] = this->rd->sizes[2]; + field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm); + clip_zero_padding<rnumber>(scalar_descriptor, trS2, 1); + int return_value = scalar_descriptor->write(fname, trS2); + delete scalar_descriptor; + fftw_interface<rnumber>::free(trS2); + return return_value; +} + +template <class rnumber> +int fluid_solver<rnumber>::write_renstrophy() +{ + TIMEZONE("fluid_solver::write_renstrophy"); + char fname[512]; + this->fill_up_filename("renstrophy", fname); + rnumber *enstrophy = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2); + this->ift_vorticity(); + shared_array<double> average_local(1, [&](double* data){ + data[0] = 0; + }); + + RLOOP( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + ptrdiff_t tindex = 3*rindex; + // rindex indexing is thread safe so there is no overlap between threads + enstrophy[rindex] = ( + this->rvorticity[tindex+0]*this->rvorticity[tindex+0] + + this->rvorticity[tindex+1]*this->rvorticity[tindex+1] + + this->rvorticity[tindex+2]*this->rvorticity[tindex+2] + )/2; + average_local.getMine()[0] += enstrophy[rindex]; + } + ); + average_local.mergeParallel(); + double average; + MPI_Allreduce( + average_local.getMasterData(), + &average, + 1, + MPI_DOUBLE, MPI_SUM, this->cd->comm); + DEBUG_MSG("average enstrophy is %g\n", average); + /* output goes here */ + int ntmp[3]; + ntmp[0] = this->rd->sizes[0]; + ntmp[1] = this->rd->sizes[1]; + ntmp[2] = this->rd->sizes[2]; + field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm); + clip_zero_padding<rnumber>(scalar_descriptor, enstrophy, 1); + int return_value = scalar_descriptor->write(fname, enstrophy); + delete scalar_descriptor; + fftw_interface<rnumber>::free(enstrophy); + return return_value; +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_pressure(rnumber (*__restrict__ pressure)[2]) +{ + TIMEZONE("fluid_solver::compute_pressure"); + /* assume velocity is already in real space representation */ + /* diagonal terms 11 22 33 */ + RLOOP ( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + // rindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc]; + } + ); + this->clean_up_real_space(this->rv[1], 3); + { + TIMEZONE("fftw_interface<rnumber>::execute"); + fftw_interface<rnumber>::execute(*(this->vr2c[1])); + } + this->dealias(this->cv[1], 3); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2 && k2 > 
0) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + for (int i=0; i<2; i++) + { + pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] + + this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] + + this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]); + } + } + else + std::fill_n((rnumber*)(pressure+cindex), 2, 0.0); + } + ); + /* off-diagonal terms 12 23 31 */ + RLOOP ( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + // rindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3]; + } + ); + this->clean_up_real_space(this->rv[1], 3); + { + TIMEZONE("fftw_interface<rnumber>::execute"); + fftw_interface<rnumber>::execute(*(this->vr2c[1])); + } + this->dealias(this->cv[1], 3); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2 && k2 > 0) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + for (int i=0; i<2; i++) + { + pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] + + this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] + + this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]); + pressure[cindex][i] /= this->normalization_factor*k2; + } + } + } + ); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_gradient_statistics( + rnumber (*__restrict__ vec)[2], +double *gradu_moments, +double *trS2QR_moments, +ptrdiff_t *gradu_hist, +ptrdiff_t *trS2QR_hist, +ptrdiff_t *QR2D_hist, +double trS2QR_max_estimates[], +double gradu_max_estimates[], +int nbins, +int QR2D_nbins) +{ + TIMEZONE("fluid_solver::compute_gradient_statistics"); + typename fftw_interface<rnumber>::complex *ca; + rnumber *ra; + ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3); + ra = (rnumber*)(ca); + this->compute_vector_gradient(ca, vec); + for (int cc=0; cc<3; cc++) + { + std::copy( + (rnumber*)(ca + cc*this->cd->local_size), + (rnumber*)(ca + (cc+1)*this->cd->local_size), + (rnumber*)this->cv[1]); + fftw_interface<rnumber>::execute(*(this->vc2r[1])); + std::copy( + this->rv[1], + this->rv[1] + this->cd->local_size*2, + ra + cc*this->cd->local_size*2); + } + /* velocity gradient is now stored, in real space, in ra */ + std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0); + rnumber *dx_u, *dy_u, *dz_u; + dx_u = ra; + dy_u = ra + 2*this->cd->local_size; + dz_u = ra + 4*this->cd->local_size; + double binsize[2]; + double tmp_max_estimate[3]; + tmp_max_estimate[0] = trS2QR_max_estimates[0]; + tmp_max_estimate[1] = trS2QR_max_estimates[1]; + tmp_max_estimate[2] = trS2QR_max_estimates[2]; + binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins; + binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins; + ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins]; + std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0); + RLOOP( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + rnumber AxxAxx; + rnumber AyyAyy; + rnumber AzzAzz; + rnumber AxyAyx; + rnumber AyzAzy; + rnumber AzxAxz; + rnumber Sxy; + rnumber Syz; + rnumber Szx; + // rindex indexing is thread safe so there is no overlap between threads + // tindex[0:2] is thread safe too + ptrdiff_t tindex = 3*rindex; + AxxAxx = 
dx_u[tindex+0]*dx_u[tindex+0]; + AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; + AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; + AxyAyx = dx_u[tindex+1]*dy_u[tindex+0]; + AyzAzy = dy_u[tindex+2]*dz_u[tindex+1]; + AzxAxz = dz_u[tindex+0]*dx_u[tindex+2]; + this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz; + this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) + + dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) + + dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) + + dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] + + dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]); + int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0])); + int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1])); + if ((bin0 >= 0 && bin0 < QR2D_nbins) && + (bin1 >= 0 && bin1 < QR2D_nbins)) + local_hist[bin1*QR2D_nbins + bin0]++; + Sxy = dx_u[tindex+1]+dy_u[tindex+0]; + Syz = dy_u[tindex+2]+dz_u[tindex+1]; + Szx = dz_u[tindex+0]+dx_u[tindex+2]; + this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz + + (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); + } + ); + MPI_Allreduce( + local_hist, + QR2D_hist, + QR2D_nbins * QR2D_nbins, + MPI_INT64_T, MPI_SUM, this->cd->comm); + delete[] local_hist; + this->compute_rspace_stats3( + this->rv[1], + trS2QR_moments, + trS2QR_hist, + tmp_max_estimate, + nbins); + double *tmp_moments = new double[10*3]; + ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3]; + for (int cc=0; cc<3; cc++) + { + tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0]; + tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1]; + tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2]; + this->compute_rspace_stats3( + dx_u + cc*2*this->cd->local_size, + tmp_moments, + tmp_hist, + tmp_max_estimate, + nbins); + for (int n = 0; n < 10; n++) + for (int i = 0; i < 3 ; i++) + { + gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i]; + } + for (int n = 0; n < nbins; n++) + for (int i = 0; i < 3; i++) + { + gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i]; + } + } + delete[] tmp_moments; + delete[] tmp_hist; + fftw_interface<rnumber>::free(ca); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber (*acceleration)[2]) +{ + TIMEZONE("fluid_solver::compute_Lagrangian_acceleration"); + typename fftw_interface<rnumber>::complex *pressure; + pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3); + this->compute_velocity(this->cvorticity); + this->ift_velocity(); + this->compute_pressure(pressure); + this->compute_velocity(this->cvorticity); + std::fill_n((rnumber*)this->cv[1], 2*this->cd->local_size, 0.0); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + for (int cc=0; cc<3; cc++) + for (int i=0; i<2; i++) + this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; + if (strcmp(this->forcing_type, "linear") == 0) + { + double knorm = sqrt(k2); + if ((this->fk0 <= knorm) && + (this->fk1 >= knorm)) + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; + } + this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1]; + this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1]; + this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1]; + this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0]; + this->cv[1][tindex+1][1] -= 
this->ky[yindex]*pressure[cindex][0]; + this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0]; + } + } + ); + std::copy( + (rnumber*)this->cv[1], + (rnumber*)(this->cv[1] + this->cd->local_size), + (rnumber*)acceleration); + fftw_interface<rnumber>::free(pressure); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_Eulerian_acceleration(rnumber (*__restrict__ acceleration)[2]) +{ + TIMEZONE("fluid_solver::compute_Eulerian_acceleration"); + std::fill_n((rnumber*)(acceleration), 2*this->cd->local_size, 0.0); + this->compute_velocity(this->cvorticity); + /* put in linear terms */ + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){ + if (k2 <= this->kM2) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + for (int cc=0; cc<3; cc++) + for (int i=0; i<2; i++) + acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; + if (strcmp(this->forcing_type, "linear") == 0) + { + double knorm = sqrt(k2); + if ((this->fk0 <= knorm) && + (this->fk1 >= knorm)) + { + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; + } + } + } + } + ); + this->ift_velocity(); + /* compute uu */ + /* 11 22 33 */ + RLOOP ( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor; + } + ); + this->clean_up_real_space(this->rv[1], 3); + fftw_interface<rnumber>::execute(*(this->vr2c[1])); + this->dealias(this->cv[1], 3); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + acceleration[tindex+0][0] += + this->kx[xindex]*this->cv[1][tindex+0][1]; + acceleration[tindex+0][1] += + -this->kx[xindex]*this->cv[1][tindex+0][0]; + acceleration[tindex+1][0] += + this->ky[yindex]*this->cv[1][tindex+1][1]; + acceleration[tindex+1][1] += + -this->ky[yindex]*this->cv[1][tindex+1][0]; + acceleration[tindex+2][0] += + this->kz[zindex]*this->cv[1][tindex+2][1]; + acceleration[tindex+2][1] += + -this->kz[zindex]*this->cv[1][tindex+2][0]; + } + } + ); + /* 12 23 31 */ + RLOOP ( + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor; + } + ); + this->clean_up_real_space(this->rv[1], 3); + fftw_interface<rnumber>::execute(*(this->vr2c[1])); + this->dealias(this->cv[1], 3); + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2) + { + // cindex indexing is thread safe so there is no overlap between threads + ptrdiff_t tindex = 3*cindex; + acceleration[tindex+0][0] += + (this->ky[yindex]*this->cv[1][tindex+0][1] + + this->kz[zindex]*this->cv[1][tindex+2][1]); + acceleration[tindex+0][1] += + - (this->ky[yindex]*this->cv[1][tindex+0][0] + + this->kz[zindex]*this->cv[1][tindex+2][0]); + 
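+                // y and z components accumulate the remaining i k_j (u_i u_j)
+                // cross terms in the same pattern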
acceleration[tindex+1][0] += + (this->kz[zindex]*this->cv[1][tindex+1][1] + + this->kx[xindex]*this->cv[1][tindex+0][1]); + acceleration[tindex+1][1] += + - (this->kz[zindex]*this->cv[1][tindex+1][0] + + this->kx[xindex]*this->cv[1][tindex+0][0]); + acceleration[tindex+2][0] += + (this->kx[xindex]*this->cv[1][tindex+2][1] + + this->ky[yindex]*this->cv[1][tindex+1][1]); + acceleration[tindex+2][1] += + - (this->kx[xindex]*this->cv[1][tindex+2][0] + + this->ky[yindex]*this->cv[1][tindex+1][0]); + } + } + ); + if (this->cd->myrank == this->cd->rank[0]) + std::fill_n((rnumber*)(acceleration), 6, 0.0); + this->force_divfree(acceleration); +} + +template <class rnumber> +void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber *__restrict__ acceleration) +{ + TIMEZONE("fluid_solver::compute_Lagrangian_acceleration"); + this->compute_Lagrangian_acceleration((typename fftw_interface<rnumber>::complex*)acceleration); + fftw_interface<rnumber>::execute(*(this->vc2r[1])); + std::copy( + this->rv[1], + this->rv[1] + 2*this->cd->local_size, + acceleration); +} + +template <class rnumber> +int fluid_solver<rnumber>::write_rpressure() +{ + TIMEZONE("fluid_solver::write_rpressure"); + char fname[512]; + typename fftw_interface<rnumber>::complex *pressure; + pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3); + this->compute_velocity(this->cvorticity); + this->ift_velocity(); + this->compute_pressure(pressure); + this->fill_up_filename("rpressure", fname); + rnumber *rpressure = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2); + typename fftw_interface<rnumber>::plan c2r; + c2r = fftw_interface<rnumber>::mpi_plan_dft_c2r_3d( + this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2], + pressure, rpressure, this->cd->comm, + this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); + fftw_interface<rnumber>::execute(c2r); + /* output goes here */ + int ntmp[3]; + ntmp[0] = this->rd->sizes[0]; + ntmp[1] = this->rd->sizes[1]; + ntmp[2] = this->rd->sizes[2]; + field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm); + clip_zero_padding<rnumber>(scalar_descriptor, rpressure, 1); + int return_value = scalar_descriptor->write(fname, rpressure); + delete scalar_descriptor; + fftw_interface<rnumber>::destroy_plan(c2r); + fftw_interface<rnumber>::free(pressure); + fftw_interface<rnumber>::free(rpressure); + return return_value; +} /*****************************************************************************/ -/* now actually use the macro defined above */ -FLUID_SOLVER_DEFINITIONS( - FFTW_MANGLE_FLOAT, - float, - MPI_FLOAT, - MPI_COMPLEX) -FLUID_SOLVER_DEFINITIONS( - FFTW_MANGLE_DOUBLE, - double, - MPI_DOUBLE, - BFPS_MPICXX_DOUBLE_COMPLEX) -/*****************************************************************************/ + diff --git a/bfps/cpp/fluid_solver.hpp b/bfps/cpp/fluid_solver.hpp index 2b6ec64de12cc133687074c83c71696ffc507509..4cc75cee4385353f64dc9bc9e7d34c6efba9ad48 100644 --- a/bfps/cpp/fluid_solver.hpp +++ b/bfps/cpp/fluid_solver.hpp @@ -55,12 +55,12 @@ class fluid_solver:public fluid_solver_base<rnumber> typename fluid_solver_base<rnumber>::cnumber *cu, *cv[4]; /* plans */ - void *c2r_vorticity; - void *r2c_vorticity; - void *c2r_velocity; - void *r2c_velocity; - void *uc2r, *ur2c; - void *vr2c[3], *vc2r[3]; + typename fftw_interface<rnumber>::plan *c2r_vorticity; + typename fftw_interface<rnumber>::plan *r2c_vorticity; + typename fftw_interface<rnumber>::plan *c2r_velocity; + typename 
fftw_interface<rnumber>::plan *r2c_velocity; + typename fftw_interface<rnumber>::plan *uc2r, *ur2c; + typename fftw_interface<rnumber>::plan *vr2c[3], *vc2r[3]; /* physical parameters */ double nu; diff --git a/bfps/cpp/fluid_solver_base.cpp b/bfps/cpp/fluid_solver_base.cpp index 2f2aeee9a8ae699b7863c90dcffb550bc905390a..1ac50f29c8c5d58a7efb064302055430901ab24a 100644 --- a/bfps/cpp/fluid_solver_base.cpp +++ b/bfps/cpp/fluid_solver_base.cpp @@ -32,7 +32,8 @@ #include "base.hpp" #include "fluid_solver_base.hpp" #include "fftw_tools.hpp" - +#include "scope_timer.hpp" +#include "shared_array.hpp" template <class rnumber> void fluid_solver_base<rnumber>::fill_up_filename(const char *base_name, char *destination) @@ -43,6 +44,7 @@ void fluid_solver_base<rnumber>::fill_up_filename(const char *base_name, char *d template <class rnumber> void fluid_solver_base<rnumber>::clean_up_real_space(rnumber *a, int howmany) { + TIMEZONE("fluid_solver_base::clean_up_real_space"); for (ptrdiff_t rindex = 0; rindex < this->cd->local_size*2; rindex += howmany*(this->rd->subsizes[2]+2)) std::fill_n(a+rindex+this->rd->subsizes[2]*howmany, 2*howmany, 0.0); } @@ -65,65 +67,76 @@ double fluid_solver_base<rnumber>::autocorrel(cnumber *a) template <class rnumber> void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec) { - double *cospec_local = fftw_alloc_real(this->nshells*9); - std::fill_n(cospec_local, this->nshells*9, 0); - int tmp_int; + TIMEZONE("fluid_solver_base::cospectrum"); + shared_array<double> cospec_local_thread(this->nshells*9,[&](double* cospec_local){ + std::fill_n(cospec_local, this->nshells*9, 0); + }); + CLOOP_K2_NXMODES( - this, - if (k2 <= this->kMspec2) - { - tmp_int = int(sqrt(k2)/this->dk)*9; - for (int i=0; i<3; i++) - for (int j=0; j<3; j++) - { - cospec_local[tmp_int+i*3+j] += nxmodes * ( - (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] + - (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]); - } - } - ); + this, + + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, + ptrdiff_t /*zindex*/, double k2, int nxmodes){ + if (k2 <= this->kMspec2) + { + int tmp_int = int(sqrt(k2)/this->dk)*9; + double* cospec_local = cospec_local_thread.getMine(); + for (int i=0; i<3; i++) + for (int j=0; j<3; j++) + { + cospec_local[tmp_int+i*3+j] += nxmodes * ( + (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] + + (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]); + } + }} + ); + cospec_local_thread.mergeParallel(); MPI_Allreduce( - (void*)cospec_local, - (void*)spec, - this->nshells*9, - MPI_DOUBLE, MPI_SUM, this->cd->comm); - fftw_free(cospec_local); + cospec_local_thread.getMasterData(), + (void*)spec, + this->nshells*9, + MPI_DOUBLE, MPI_SUM, this->cd->comm); } template <class rnumber> void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec, const double k2exponent) { - double *cospec_local = fftw_alloc_real(this->nshells*9); - std::fill_n(cospec_local, this->nshells*9, 0); - double factor = 1; - int tmp_int; + TIMEZONE("fluid_solver_base::cospectrum2"); + shared_array<double> cospec_local_thread(this->nshells*9,[&](double* cospec_local){ + std::fill_n(cospec_local, this->nshells*9, 0); + }); + CLOOP_K2_NXMODES( - this, - if (k2 <= this->kMspec2) - { - factor = nxmodes*pow(k2, k2exponent); - tmp_int = int(sqrt(k2)/this->dk)*9; - for (int i=0; i<3; i++) - for (int j=0; j<3; j++) - { - cospec_local[tmp_int+i*3+j] += factor * ( - (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] + - (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]); - } - } - ); + 
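/* NOTE: these cospectrum overloads introduce the shared_array<T> pattern used
 * throughout this patch: getMine() returns the calling thread's private buffer,
 * mergeParallel() folds all per-thread buffers into the master copy (elementwise
 * sum unless a custom merge functor is supplied), and getMasterData() exposes the
 * merged result, e.g. as an MPI send buffer. A minimal usage sketch, assuming only
 * the interface visible in this file (acc, nbins, bin, contribution, spec and comm
 * are placeholder names):
 *
 *     shared_array<double> acc(nbins, [&](double* p){
 *         std::fill_n(p, nbins, 0.0); });      // per-thread initializer
 *     // inside an omp-parallel loop body:
 *     acc.getMine()[bin] += contribution;      // thread-private, no locking
 *     // after the loop:
 *     acc.mergeParallel();                     // default merge: sum
 *     MPI_Allreduce(acc.getMasterData(), spec, nbins,
 *                   MPI_DOUBLE, MPI_SUM, comm);
 */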
this, + + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, + ptrdiff_t /*zindex*/, double k2, int nxmodes){ + if (k2 <= this->kMspec2) + { + double factor = nxmodes*pow(k2, k2exponent); + int tmp_int = int(sqrt(k2)/this->dk)*9; + double* cospec_local = cospec_local_thread.getMine(); + for (int i=0; i<3; i++) + for (int j=0; j<3; j++) + { + cospec_local[tmp_int+i*3+j] += factor * ( + (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] + + (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]); + } + }} + ); + cospec_local_thread.mergeParallel(); MPI_Allreduce( - (void*)cospec_local, - (void*)spec, - this->nshells*9, - MPI_DOUBLE, MPI_SUM, this->cd->comm); + cospec_local_thread.getMasterData(), + (void*)spec, + this->nshells*9, + MPI_DOUBLE, MPI_SUM, this->cd->comm); //for (int n=0; n<this->nshells; n++) //{ // spec[n] *= 12.5663706144*pow(this->kshell[n], 2) / this->nshell[n]; // /*is normalization needed? // * spec[n] /= this->normalization_factor*/ //} - fftw_free(cospec_local); } template <class rnumber> @@ -134,6 +147,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( const hsize_t toffset, const std::vector<double> max_estimate) { + TIMEZONE("fluid_solver_base::compute_rspace_stats"); const int nmoments = 10; int nvals, nbins; if (this->rd->myrank == 0) @@ -145,6 +159,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( wspace = H5Dget_space(dset); ndims = H5Sget_simple_extent_dims(wspace, dims, NULL); assert(ndims == 3); + variable_used_only_in_assert(ndims); assert(dims[1] == nmoments); nvals = dims[2]; H5Sclose(wspace); @@ -161,22 +176,29 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( MPI_Bcast(&nvals, 1, MPI_INT, 0, this->rd->comm); MPI_Bcast(&nbins, 1, MPI_INT, 0, this->rd->comm); assert(nvals == max_estimate.size()); - double *moments = new double[nmoments*nvals]; - double *local_moments = new double[nmoments*nvals]; - double *val_tmp = new double[nvals]; + shared_array<double> threaded_local_moments(nmoments*nvals, [&](double* local_moments){ + std::fill_n(local_moments, nmoments*nvals, 0); + if (nvals == 4) local_moments[3] = max_estimate[3]; + }); + + shared_array<double> threaded_val_tmp(nvals); + + shared_array<ptrdiff_t> threaded_local_hist(nbins*nvals, [&](ptrdiff_t* local_hist){ + std::fill_n(local_hist, nbins*nvals, 0); + }); + + // Not written by threads double *binsize = new double[nvals]; - double *pow_tmp = new double[nvals]; - ptrdiff_t *hist = new ptrdiff_t[nbins*nvals]; - ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals]; - int bin; for (int i=0; i<nvals; i++) binsize[i] = 2*max_estimate[i] / nbins; - std::fill_n(local_hist, nbins*nvals, 0); - std::fill_n(local_moments, nmoments*nvals, 0); - if (nvals == 4) local_moments[3] = max_estimate[3]; + RLOOP( - this, - std::fill_n(pow_tmp, nvals, 1.0); + this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + double *val_tmp = threaded_val_tmp.getMine(); + ptrdiff_t* local_hist = threaded_local_hist.getMine(); + double *local_moments = threaded_local_moments.getMine(); + if (nvals == 4) val_tmp[3] = 0.0; for (int i=0; i<3; i++) { @@ -190,7 +212,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( local_moments[0*nvals+3] = val_tmp[3]; if (val_tmp[3] > local_moments[9*nvals+3]) local_moments[9*nvals+3] = val_tmp[3]; - bin = int(floor(val_tmp[3]*2/binsize[3])); + int bin = int(floor(val_tmp[3]*2/binsize[3])); if (bin >= 0 && bin < nbins) local_hist[bin*nvals+3]++; } @@ -200,42 +222,63 @@ void 
fluid_solver_base<rnumber>::compute_rspace_stats( local_moments[0*nvals+i] = val_tmp[i]; if (val_tmp[i] > local_moments[(nmoments-1)*nvals+i]) local_moments[(nmoments-1)*nvals+i] = val_tmp[i]; - bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); + int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); if (bin >= 0 && bin < nbins) local_hist[bin*nvals+i]++; } - for (int n=1; n < nmoments-1; n++) - for (int i=0; i<nvals; i++) - local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]); - ); + for (int n=1; n < nmoments-1; n++){ + double pow_tmp = 1.; + for (int i=0; i<nvals; i++){ + local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp); + } + } + } + ); + + threaded_local_hist.mergeParallel(); + threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double { + if(nvals == int(4) && idx == 0*nvals+3){ + return std::min(v1, v2); + } + if(nvals == int(4) && idx == 9*nvals+3){ + return std::max(v1, v2); + } + if(idx < 3){ + return std::min(v1, v2); + } + if((nmoments-1)*nvals <= idx && idx < (nmoments-1)*nvals+3){ + return std::max(v1, v2); + } + return v1 + v2; + }); + + + double *moments = new double[nmoments*nvals]; MPI_Allreduce( - (void*)local_moments, - (void*)moments, - nvals, - MPI_DOUBLE, MPI_MIN, this->cd->comm); + threaded_local_moments.getMasterData(), + (void*)moments, + nvals, + MPI_DOUBLE, MPI_MIN, this->cd->comm); MPI_Allreduce( - (void*)(local_moments + nvals), - (void*)(moments+nvals), - (nmoments-2)*nvals, - MPI_DOUBLE, MPI_SUM, this->cd->comm); + (threaded_local_moments.getMasterData() + nvals), + (void*)(moments+nvals), + (nmoments-2)*nvals, + MPI_DOUBLE, MPI_SUM, this->cd->comm); MPI_Allreduce( - (void*)(local_moments + (nmoments-1)*nvals), - (void*)(moments+(nmoments-1)*nvals), - nvals, - MPI_DOUBLE, MPI_MAX, this->cd->comm); + (threaded_local_moments.getMasterData() + (nmoments-1)*nvals), + (void*)(moments+(nmoments-1)*nvals), + nvals, + MPI_DOUBLE, MPI_MAX, this->cd->comm); + ptrdiff_t *hist = new ptrdiff_t[nbins*nvals]; MPI_Allreduce( - (void*)local_hist, - (void*)hist, - nbins*nvals, - MPI_INT64_T, MPI_SUM, this->cd->comm); + threaded_local_hist.getMasterData(), + (void*)hist, + nbins*nvals, + MPI_INT64_T, MPI_SUM, this->cd->comm); for (int n=1; n < nmoments-1; n++) for (int i=0; i<nvals; i++) moments[n*nvals + i] /= this->normalization_factor; - delete[] local_moments; - delete[] local_hist; - delete[] val_tmp; delete[] binsize; - delete[] pow_tmp; if (this->rd->myrank == 0) { hid_t dset, wspace, mspace; @@ -280,18 +323,28 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( double max_estimate[], const int nbins) { - double *local_moments = fftw_alloc_real(10*nvals); - double val_tmp[nvals], binsize[nvals], pow_tmp[nvals]; - ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals]; - int bin; + TIMEZONE("fluid_solver_base::compute_rspace_stats"); + shared_array<double> threaded_local_moments(10*nvals,[&](double* local_moments){ + std::fill_n(local_moments, 10*nvals, 0); + if (nvals == 4) local_moments[3] = max_estimate[3]; + }); + + shared_array<ptrdiff_t> threaded_local_hist(nbins*nvals, [&](ptrdiff_t* local_hist){ + std::fill_n(local_hist, nbins*nvals, 0); + }); + + // Will not be modified by the threads + double binsize[nvals]; for (int i=0; i<nvals; i++) binsize[i] = 2*max_estimate[i] / nbins; - std::fill_n(local_hist, nbins*nvals, 0); - std::fill_n(local_moments, 10*nvals, 0); - if (nvals == 4) local_moments[3] = max_estimate[3]; + RLOOP( - this, - std::fill_n(pow_tmp, nvals, 1.0); + 
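/* NOTE: local_moments below is a (10 x nvals) row-major table: row 0 holds the
 * per-thread minima, rows 1..8 the running sums of val^n, and row 9 the maxima;
 * when nvals == 4, column 3 carries the vector magnitude. This layout is why the
 * mergeParallel() call that follows combines row 0 with min, row 9 with max, and
 * everything else with a plain sum, mirroring the three MPI_Allreduce calls
 * (MPI_MIN, MPI_SUM, MPI_MAX) further down.
 */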
this, + [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){ + ptrdiff_t *local_hist = threaded_local_hist.getMine(); + double *local_moments = threaded_local_moments.getMine(); + + double val_tmp[nvals]; if (nvals == 4) val_tmp[3] = 0.0; for (int i=0; i<3; i++) { @@ -305,7 +358,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( local_moments[0*nvals+3] = val_tmp[3]; if (val_tmp[3] > local_moments[9*nvals+3]) local_moments[9*nvals+3] = val_tmp[3]; - bin = int(floor(val_tmp[3]*2/binsize[3])); + int bin = int(floor(val_tmp[3]*2/binsize[3])); if (bin >= 0 && bin < nbins) local_hist[bin*nvals+3]++; } @@ -315,44 +368,65 @@ void fluid_solver_base<rnumber>::compute_rspace_stats( local_moments[0*nvals+i] = val_tmp[i]; if (val_tmp[i] > local_moments[9*nvals+i]) local_moments[9*nvals+i] = val_tmp[i]; - bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); + int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i])); if (bin >= 0 && bin < nbins) local_hist[bin*nvals+i]++; } - for (int n=1; n<9; n++) - for (int i=0; i<nvals; i++) - local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]); - ); + for (int n=1; n<9; n++){ + double pow_tmp = 1; + for (int i=0; i<nvals; i++){ + local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp); + } + } + } + ); + + threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double { + if(nvals == int(4) && idx == 0*nvals+3){ + return std::min(v1, v2); + } + if(nvals == int(4) && idx == 9*nvals+3){ + return std::max(v1, v2); + } + if(idx < 3){ + return std::min(v1, v2); + } + if(9*nvals <= idx && idx < 9*nvals+3){ + return std::max(v1, v2); + } + return v1 + v2; + }); + threaded_local_hist.mergeParallel(); + MPI_Allreduce( - (void*)local_moments, - (void*)moments, - nvals, - MPI_DOUBLE, MPI_MIN, this->cd->comm); + threaded_local_moments.getMasterData(), + (void*)moments, + nvals, + MPI_DOUBLE, MPI_MIN, this->cd->comm); MPI_Allreduce( - (void*)(local_moments + nvals), - (void*)(moments+nvals), - 8*nvals, - MPI_DOUBLE, MPI_SUM, this->cd->comm); + (threaded_local_moments.getMasterData() + nvals), + (void*)(moments+nvals), + 8*nvals, + MPI_DOUBLE, MPI_SUM, this->cd->comm); MPI_Allreduce( - (void*)(local_moments + 9*nvals), - (void*)(moments+9*nvals), - nvals, - MPI_DOUBLE, MPI_MAX, this->cd->comm); + (threaded_local_moments.getMasterData() + 9*nvals), + (void*)(moments+9*nvals), + nvals, + MPI_DOUBLE, MPI_MAX, this->cd->comm); MPI_Allreduce( - (void*)local_hist, - (void*)hist, - nbins*nvals, - MPI_INT64_T, MPI_SUM, this->cd->comm); + (void*)threaded_local_hist.getMasterData(), + (void*)hist, + nbins*nvals, + MPI_INT64_T, MPI_SUM, this->cd->comm); for (int n=1; n<9; n++) for (int i=0; i<nvals; i++) moments[n*nvals + i] /= this->normalization_factor; - fftw_free(local_moments); - delete[] local_hist; } template <class rnumber> void fluid_solver_base<rnumber>::write_spectrum(const char *fname, cnumber *a, const double k2exponent) { + TIMEZONE("fluid_solver_base::write_spectrum"); double *spec = fftw_alloc_real(this->nshells); this->cospectrum(a, a, spec, k2exponent); if (this->cd->myrank == 0) @@ -371,362 +445,383 @@ void fluid_solver_base<rnumber>::write_spectrum(const char *fname, cnumber *a, c /*****************************************************************************/ /* macro for specializations to numeric types compatible with FFTW */ -#define FLUID_SOLVER_BASE_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \ - \ -template<> \ 
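/* NOTE: the FLUID_SOLVER_BASE_DEFINITIONS X-macro deleted below stamped out the
 * float and double specializations through token pasting; the code added further
 * down obtains the same object code from ordinary templates plus explicit
 * instantiation (see "template class fluid_solver_base<float>;" near the end of
 * this file). The pattern, reduced to a minimal sketch with a hypothetical class
 * name:
 *
 *     template <class rnumber>
 *     struct solver { void step(); };
 *
 *     template <class rnumber>
 *     void solver<rnumber>::step() { }  // one shared definition
 *
 *     template class solver<float>;   // forces code generation,
 *     template class solver<double>;  // replacing the macro invocations
 */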
-fluid_solver_base<R>::fluid_solver_base( \ - const char *NAME, \ - int nx, \ - int ny, \ - int nz, \ - double DKX, \ - double DKY, \ - double DKZ, \ - int DEALIAS_TYPE, \ - unsigned FFTW_PLAN_RIGOR) \ -{ \ - strncpy(this->name, NAME, 256); \ - this->name[255] = '\0'; \ - this->iteration = 0; \ - this->fftw_plan_rigor = FFTW_PLAN_RIGOR; \ - \ - int ntmp[4]; \ - ntmp[0] = nz; \ - ntmp[1] = ny; \ - ntmp[2] = nx; \ - ntmp[3] = 3; \ - this->rd = new field_descriptor<R>( \ - 4, ntmp, MPI_RNUM, MPI_COMM_WORLD);\ - this->normalization_factor = (this->rd->full_size/3); \ - ntmp[0] = ny; \ - ntmp[1] = nz; \ - ntmp[2] = nx/2 + 1; \ - ntmp[3] = 3; \ - this->cd = new field_descriptor<R>( \ - 4, ntmp, MPI_CNUM, this->rd->comm);\ - \ - this->dkx = DKX; \ - this->dky = DKY; \ - this->dkz = DKZ; \ - this->kx = new double[this->cd->sizes[2]]; \ - this->ky = new double[this->cd->subsizes[0]]; \ - this->kz = new double[this->cd->sizes[1]]; \ - this->dealias_type = DEALIAS_TYPE; \ - switch(this->dealias_type) \ - { \ - /* HL07 smooth filter */ \ - case 1: \ - this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1); \ - this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1); \ - this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1); \ - break; \ - default: \ - this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1); \ - this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1); \ - this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1); \ - } \ - int i, ii; \ - for (i = 0; i<this->cd->sizes[2]; i++) \ - this->kx[i] = i*this->dkx; \ - for (i = 0; i<this->cd->subsizes[0]; i++) \ - { \ - ii = i + this->cd->starts[0]; \ - if (ii <= this->rd->sizes[1]/2) \ - this->ky[i] = this->dky*ii; \ - else \ - this->ky[i] = this->dky*(ii - this->rd->sizes[1]); \ - } \ - for (i = 0; i<this->cd->sizes[1]; i++) \ - { \ - if (i <= this->rd->sizes[0]/2) \ - this->kz[i] = this->dkz*i; \ - else \ - this->kz[i] = this->dkz*(i - this->rd->sizes[0]); \ - } \ - this->kM = this->kMx; \ - if (this->kM < this->kMy) this->kM = this->kMy; \ - if (this->kM < this->kMz) this->kM = this->kMz; \ - this->kM2 = this->kM * this->kM; \ - this->kMspec = this->kM; \ - this->kMspec2 = this->kM2; \ - this->dk = this->dkx; \ - if (this->dk > this->dky) this->dk = this->dky; \ - if (this->dk > this->dkz) this->dk = this->dkz; \ - this->dk2 = this->dk*this->dk; \ - DEBUG_MSG( \ - "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n", \ - this->kM, this->kM2, this->dk, this->dk2); \ - /* spectra stuff */ \ - this->nshells = int(this->kMspec / this->dk) + 2; \ - DEBUG_MSG( \ - "kMspec = %g, kMspec2 = %g, nshells = %ld\n", \ - this->kMspec, this->kMspec2, this->nshells); \ - this->kshell = new double[this->nshells]; \ - std::fill_n(this->kshell, this->nshells, 0.0); \ - this->nshell = new int64_t[this->nshells]; \ - std::fill_n(this->nshell, this->nshells, 0); \ - double *kshell_local = new double[this->nshells]; \ - std::fill_n(kshell_local, this->nshells, 0.0); \ - int64_t *nshell_local = new int64_t[this->nshells]; \ - std::fill_n(nshell_local, this->nshells, 0.0); \ - double knorm; \ - CLOOP_K2_NXMODES( \ - this, \ - if (k2 < this->kM2) \ - { \ - knorm = sqrt(k2); \ - nshell_local[int(knorm/this->dk)] += nxmodes; \ - kshell_local[int(knorm/this->dk)] += nxmodes*knorm; \ - } \ - this->Fourier_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.)); \ - ); \ - \ - MPI_Allreduce( \ - (void*)(nshell_local), \ - (void*)(this->nshell), \ - this->nshells, \ - MPI_INT64_T, MPI_SUM, this->cd->comm); \ - MPI_Allreduce( \ - (void*)(kshell_local), \ - (void*)(this->kshell), 
\ - this->nshells, \ - MPI_DOUBLE, MPI_SUM, this->cd->comm); \ - for (unsigned int n=0; n<this->nshells; n++) \ - { \ - this->kshell[n] /= this->nshell[n]; \ - } \ - delete[] nshell_local; \ - delete[] kshell_local; \ -} \ - \ -template<> \ -fluid_solver_base<R>::~fluid_solver_base() \ -{ \ - delete[] this->kshell; \ - delete[] this->nshell; \ - \ - delete[] this->kx; \ - delete[] this->ky; \ - delete[] this->kz; \ - \ - delete this->cd; \ - delete this->rd; \ -} \ - \ -template<> \ -void fluid_solver_base<R>::low_pass_Fourier(FFTW(complex) *a, const int howmany, const double kmax) \ -{ \ - const double km2 = kmax*kmax; \ - const int howmany2 = 2*howmany; \ - /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/ \ - CLOOP_K2( \ - this, \ - /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n", \ - this->kx[xindex], \ - this->ky[yindex], \ - this->kz[zindex], \ - k2);*/ \ - if (k2 >= km2) \ - std::fill_n((R*)(a + howmany*cindex), howmany2, 0.0); \ - );\ -} \ - \ -template<> \ -void fluid_solver_base<R>::dealias(FFTW(complex) *a, const int howmany) \ -{ \ - if (this->dealias_type == 0) \ - { \ - this->low_pass_Fourier(a, howmany, this->kM); \ - return; \ - } \ - double tval; \ - CLOOP_K2( \ - this, \ - tval = this->Fourier_filter[int(round(k2/this->dk2))]; \ - for (int tcounter = 0; tcounter < howmany; tcounter++) \ - for (int i=0; i<2; i++) \ - a[howmany*cindex+tcounter][i] *= tval; \ - ); \ -} \ - \ -template<> \ -void fluid_solver_base<R>::force_divfree(FFTW(complex) *a) \ -{ \ - FFTW(complex) tval; \ - CLOOP_K2( \ - this, \ - if (k2 > 0) \ - { \ - tval[0] = (this->kx[xindex]*((*(a + cindex*3 ))[0]) + \ - this->ky[yindex]*((*(a + cindex*3+1))[0]) + \ - this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2; \ - tval[1] = (this->kx[xindex]*((*(a + cindex*3 ))[1]) + \ - this->ky[yindex]*((*(a + cindex*3+1))[1]) + \ - this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2; \ - for (int imag_part=0; imag_part<2; imag_part++) \ - { \ - a[cindex*3 ][imag_part] -= tval[imag_part]*this->kx[xindex]; \ - a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex]; \ - a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex]; \ - } \ - } \ - );\ - if (this->cd->myrank == this->cd->rank[0]) \ - std::fill_n((R*)(a), 6, 0.0); \ -} \ - \ -template<> \ -void fluid_solver_base<R>::compute_vector_gradient(FFTW(complex) *A, FFTW(complex) *cvec) \ -{ \ - ptrdiff_t tindex; \ - std::fill_n((R*)A, 3*2*this->cd->local_size, 0.0); \ - FFTW(complex) *dx_u, *dy_u, *dz_u; \ - dx_u = A; \ - dy_u = A + this->cd->local_size; \ - dz_u = A + 2*this->cd->local_size; \ - CLOOP_K2( \ - this, \ - if (k2 <= this->kM2) \ - { \ - tindex = 3*cindex; \ - for (int cc=0; cc<3; cc++) \ - { \ - dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1]; \ - dx_u[tindex + cc][1] = this->kx[xindex]*cvec[tindex+cc][0]; \ - dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1]; \ - dy_u[tindex + cc][1] = this->ky[yindex]*cvec[tindex+cc][0]; \ - dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1]; \ - dz_u[tindex + cc][1] = this->kz[zindex]*cvec[tindex+cc][0]; \ - } \ - } \ - ); \ -} \ - \ -template<> \ -void fluid_solver_base<R>::symmetrize(FFTW(complex) *data, const int howmany) \ -{ \ - ptrdiff_t ii, cc; \ - MPI_Status *mpistatus = new MPI_Status; \ - if (this->cd->myrank == this->cd->rank[0]) \ - { \ - for (cc = 0; cc < howmany; cc++) \ - data[cc][1] = 0.0; \ - for (ii = 1; ii < this->cd->sizes[1]/2; ii++) \ - for (cc = 0; cc < howmany; cc++) { \ - ( *(data + cc + howmany*(this->cd->sizes[1] - 
ii)*this->cd->sizes[2]))[0] = \ - (*(data + cc + howmany*( ii)*this->cd->sizes[2]))[0]; \ - ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] = \ - -(*(data + cc + howmany*( ii)*this->cd->sizes[2]))[1]; \ - } \ - } \ - FFTW(complex) *buffer; \ - buffer = FFTW(alloc_complex)(howmany*this->cd->sizes[1]); \ - ptrdiff_t yy; \ - /*ptrdiff_t tindex;*/ \ - int ranksrc, rankdst; \ - for (yy = 1; yy < this->cd->sizes[0]/2; yy++) { \ - ranksrc = this->cd->rank[yy]; \ - rankdst = this->cd->rank[this->cd->sizes[0] - yy]; \ - if (this->cd->myrank == ranksrc) \ - for (ii = 0; ii < this->cd->sizes[1]; ii++) \ - for (cc = 0; cc < howmany; cc++) \ - for (int imag_comp=0; imag_comp<2; imag_comp++) \ - (*(buffer + howmany*ii+cc))[imag_comp] = \ - (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp]; \ - if (ranksrc != rankdst) \ - { \ - if (this->cd->myrank == ranksrc) \ - MPI_Send((void*)buffer, \ - howmany*this->cd->sizes[1], MPI_CNUM, rankdst, yy, \ - this->cd->comm); \ - if (this->cd->myrank == rankdst) \ - MPI_Recv((void*)buffer, \ - howmany*this->cd->sizes[1], MPI_CNUM, ranksrc, yy, \ - this->cd->comm, mpistatus); \ - } \ - if (this->cd->myrank == rankdst) \ - { \ - for (ii = 1; ii < this->cd->sizes[1]; ii++) \ - for (cc = 0; cc < howmany; cc++) \ - { \ - (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] = \ - (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0]; \ - (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] = \ - -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1]; \ - } \ - for (cc = 0; cc < howmany; cc++) \ - { \ - (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] = (*(buffer + cc))[0]; \ - (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1]; \ - } \ - } \ - } \ - FFTW(free)(buffer); \ - delete mpistatus; \ - /* put asymmetric data to 0 */\ - /*if (this->cd->myrank == this->cd->rank[this->cd->sizes[0]/2]) \ - { \ - tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2]; \ - for (ii = 0; ii < this->cd->sizes[1]; ii++) \ - { \ - std::fill_n((R*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0); \ - tindex += howmany*this->cd->sizes[2]; \ - } \ - } \ - tindex = howmany*(); \ - std::fill_n((R*)(data + tindex), howmany*2, 0.0);*/ \ -} \ - \ -template<> \ -int fluid_solver_base<R>::read_base(const char *fname, R *data) \ -{ \ - char full_name[512]; \ - sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \ - return this->rd->read(full_name, (void*)data); \ -} \ - \ -template<> \ -int fluid_solver_base<R>::read_base(const char *fname, FFTW(complex) *data) \ -{ \ - char full_name[512]; \ - sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \ - return this->cd->read(full_name, (void*)data); \ -} \ - \ -template<> \ -int fluid_solver_base<R>::write_base(const char *fname, R *data) \ -{ \ - char full_name[512]; \ - sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \ - return this->rd->write(full_name, (void*)data); \ -} \ - \ -template<> \ -int fluid_solver_base<R>::write_base(const char *fname, FFTW(complex) *data) \ -{ \ - char full_name[512]; \ - sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \ - return 
this->cd->write(full_name, (void*)data); \ -} \ - \ -/* finally, force generation of code */ \ -template class fluid_solver_base<R>; \ +template <class rnumber> +fluid_solver_base<rnumber>::fluid_solver_base( + const char *NAME, + int nx, + int ny, + int nz, + double DKX, + double DKY, + double DKZ, + int DEALIAS_TYPE, + unsigned FFTW_PLAN_RIGOR) +{ + TIMEZONE("fluid_solver_base::fluid_solver_base"); + strncpy(this->name, NAME, 256); + this->name[255] = '\0'; + this->iteration = 0; + this->fftw_plan_rigor = FFTW_PLAN_RIGOR; -/*****************************************************************************/ + int ntmp[4]; + ntmp[0] = nz; + ntmp[1] = ny; + ntmp[2] = nx; + ntmp[3] = 3; + this->rd = new field_descriptor<rnumber>( + 4, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD); + this->normalization_factor = (this->rd->full_size/3); + ntmp[0] = ny; + ntmp[1] = nz; + ntmp[2] = nx/2 + 1; + ntmp[3] = 3; + this->cd = new field_descriptor<rnumber>( + 4, ntmp, mpi_real_type<rnumber>::complex(), this->rd->comm); + this->dkx = DKX; + this->dky = DKY; + this->dkz = DKZ; + this->kx = new double[this->cd->sizes[2]]; + this->ky = new double[this->cd->subsizes[0]]; + this->kz = new double[this->cd->sizes[1]]; + this->dealias_type = DEALIAS_TYPE; + switch(this->dealias_type) + { + /* HL07 smooth filter */ + case 1: + this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1); + this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1); + this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1); + break; + default: + this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1); + this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1); + this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1); + } + int i, ii; + for (i = 0; i<this->cd->sizes[2]; i++) + this->kx[i] = i*this->dkx; + for (i = 0; i<this->cd->subsizes[0]; i++) + { + ii = i + this->cd->starts[0]; + if (ii <= this->rd->sizes[1]/2) + this->ky[i] = this->dky*ii; + else + this->ky[i] = this->dky*(ii - this->rd->sizes[1]); + } + for (i = 0; i<this->cd->sizes[1]; i++) + { + if (i <= this->rd->sizes[0]/2) + this->kz[i] = this->dkz*i; + else + this->kz[i] = this->dkz*(i - this->rd->sizes[0]); + } + this->kM = this->kMx; + if (this->kM < this->kMy) this->kM = this->kMy; + if (this->kM < this->kMz) this->kM = this->kMz; + this->kM2 = this->kM * this->kM; + this->kMspec = this->kM; + this->kMspec2 = this->kM2; + this->dk = this->dkx; + if (this->dk > this->dky) this->dk = this->dky; + if (this->dk > this->dkz) this->dk = this->dkz; + this->dk2 = this->dk*this->dk; + DEBUG_MSG( + "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n", + this->kM, this->kM2, this->dk, this->dk2); + /* spectra stuff */ + this->nshells = int(this->kMspec / this->dk) + 2; + DEBUG_MSG( + "kMspec = %g, kMspec2 = %g, nshells = %ld\n", + this->kMspec, this->kMspec2, this->nshells); + this->kshell = new double[this->nshells]; + std::fill_n(this->kshell, this->nshells, 0.0); + this->nshell = new int64_t[this->nshells]; + std::fill_n(this->nshell, this->nshells, 0); + shared_array<double> kshell_local_threaded(this->nshells,[&](double* kshell_local){ + std::fill_n(kshell_local, this->nshells, 0.0); + }); + /* int64_t so the local buffer matches this->nshell and the MPI_INT64_T reduction below */ + shared_array<int64_t> nshell_local_threaded(this->nshells,[&](int64_t* nshell_local){ + std::fill_n(nshell_local, this->nshells, 0); + }); + + std::vector<std::unordered_map<int, double>> Fourier_filter_threaded(omp_get_max_threads()); + + CLOOP_K2_NXMODES( + this, + + [&](ptrdiff_t /*cindex*/, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, + ptrdiff_t /*zindex*/, double k2, int nxmodes){ + if (k2 < this->kM2) + { + 
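// NOTE: shell binning: each retained mode adds nxmodes counts to its shell in
// nshell_local_threaded (modes with xindex > 0 stand in for their Hermitian
// conjugates, hence the factor of 2), and nxmodes*|k| to kshell_local_threaded,
// so that kshell[n] /= nshell[n] after the MPI reduction yields the mean
// wavenumber of shell n.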
double knorm = sqrt(k2); + nshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes; + kshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes*knorm; + } + Fourier_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));} + ); + + // Merge results + nshell_local_threaded.mergeParallel(); + kshell_local_threaded.mergeParallel(); + for(int idxMerge = 0 ; idxMerge < int(Fourier_filter_threaded.size()) ; ++idxMerge){ + for(const auto kv : Fourier_filter_threaded[idxMerge]){ + this->Fourier_filter[kv.first] = kv.second; + } + } + + MPI_Allreduce( + (void*)(nshell_local_threaded.getMasterData()), + (void*)(this->nshell), + this->nshells, + MPI_INT64_T, MPI_SUM, this->cd->comm); + MPI_Allreduce( + (void*)(kshell_local_threaded.getMasterData()), + (void*)(this->kshell), + this->nshells, + MPI_DOUBLE, MPI_SUM, this->cd->comm); + for (unsigned int n=0; n<this->nshells; n++) + { + this->kshell[n] /= this->nshell[n]; + } +} + +template <class rnumber> +fluid_solver_base<rnumber>::~fluid_solver_base() +{ + delete[] this->kshell; + delete[] this->nshell; + + delete[] this->kx; + delete[] this->ky; + delete[] this->kz; + + delete this->cd; + delete this->rd; +} + +template <class rnumber> +void fluid_solver_base<rnumber>::low_pass_Fourier(cnumber *a, const int howmany, const double kmax) +{ + TIMEZONE("fluid_solver_base::low_pass_Fourier"); + const double km2 = kmax*kmax; + const int howmany2 = 2*howmany; + /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/ + CLOOP_K2( + this, + /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n", + this->kx[xindex], + this->ky[yindex], + this->kz[zindex], + k2);*/ + + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, + ptrdiff_t zindex, double k2){ + if (k2 >= km2) + std::fill_n((rnumber*)(a + howmany*cindex), howmany2, 0.0);} + ); +} + +template <class rnumber> +void fluid_solver_base<rnumber>::dealias(cnumber *a, const int howmany) +{ + TIMEZONE("fluid_solver_base::dealias"); + if (this->dealias_type == 0) + { + this->low_pass_Fourier(a, howmany, this->kM); + return; + } + + CLOOP_K2( + this, + [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, + ptrdiff_t /*zindex*/, double k2){ + double tval = this->Fourier_filter[int(round(k2/this->dk2))]; + // It is thread safe on the index cindex + for (int tcounter = 0; tcounter < howmany; tcounter++) + for (int i=0; i<2; i++) + a[howmany*cindex+tcounter][i] *= tval; + } + ); +} + +template <class rnumber> +void fluid_solver_base<rnumber>::force_divfree(cnumber *a) +{ + TIMEZONE("fluid_solver_base::force_divfree"); + CLOOP_K2( + this, + + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, + ptrdiff_t zindex, double k2){ + if (k2 > 0) + { + // It is thread safe on index cindex + cnumber tval; + tval[0] = (this->kx[xindex]*((*(a + cindex*3 ))[0]) + + this->ky[yindex]*((*(a + cindex*3+1))[0]) + + this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2; + tval[1] = (this->kx[xindex]*((*(a + cindex*3 ))[1]) + + this->ky[yindex]*((*(a + cindex*3+1))[1]) + + this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2; + for (int imag_part=0; imag_part<2; imag_part++) + { + a[cindex*3 ][imag_part] -= tval[imag_part]*this->kx[xindex]; + a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex]; + a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex]; + } + }} + ); + if (this->cd->myrank == this->cd->rank[0]) + std::fill_n((rnumber*)(a), 6, 0.0); +} + +template <class rnumber> +void 
fluid_solver_base<rnumber>::compute_vector_gradient(cnumber *A, cnumber *cvec) +{ + TIMEZONE("fluid_solver_base::compute_vector_gradient"); + std::fill_n((rnumber*)A, 3*2*this->cd->local_size, 0.0); + cnumber *dx_u, *dy_u, *dz_u; + dx_u = A; + dy_u = A + this->cd->local_size; + dz_u = A + 2*this->cd->local_size; + CLOOP_K2( + this, + + [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, + ptrdiff_t zindex, double k2){ + if (k2 <= this->kM2) + { + // It is thread safe on cindex + ptrdiff_t tindex = 3*cindex; + for (int cc=0; cc<3; cc++) + { + dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1]; + dx_u[tindex + cc][1] = this->kx[xindex]*cvec[tindex+cc][0]; + dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1]; + dy_u[tindex + cc][1] = this->ky[yindex]*cvec[tindex+cc][0]; + dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1]; + dz_u[tindex + cc][1] = this->kz[zindex]*cvec[tindex+cc][0]; + } + }} + ); +} + +template <class rnumber> +void fluid_solver_base<rnumber>::symmetrize(cnumber *data, const int howmany) +{ + TIMEZONE("fluid_solver_base::symmetrize"); + ptrdiff_t ii, cc; + MPI_Status *mpistatus = new MPI_Status; + if (this->cd->myrank == this->cd->rank[0]) + { + for (cc = 0; cc < howmany; cc++) + data[cc][1] = 0.0; + for (ii = 1; ii < this->cd->sizes[1]/2; ii++) + for (cc = 0; cc < howmany; cc++) { + ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[0] = + (*(data + cc + howmany*( ii)*this->cd->sizes[2]))[0]; + ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] = + -(*(data + cc + howmany*( ii)*this->cd->sizes[2]))[1]; + } + } + cnumber *buffer; + buffer = fftw_interface<rnumber>::alloc_complex(howmany*this->cd->sizes[1]); + ptrdiff_t yy; + /*ptrdiff_t tindex;*/ + int ranksrc, rankdst; + for (yy = 1; yy < this->cd->sizes[0]/2; yy++) { + ranksrc = this->cd->rank[yy]; + rankdst = this->cd->rank[this->cd->sizes[0] - yy]; + if (this->cd->myrank == ranksrc) + for (ii = 0; ii < this->cd->sizes[1]; ii++) + for (cc = 0; cc < howmany; cc++) + for (int imag_comp=0; imag_comp<2; imag_comp++) + (*(buffer + howmany*ii+cc))[imag_comp] = + (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp]; + if (ranksrc != rankdst) + { + if (this->cd->myrank == ranksrc) + MPI_Send((void*)buffer, + howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), rankdst, yy, + this->cd->comm); + if (this->cd->myrank == rankdst) + MPI_Recv((void*)buffer, + howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), ranksrc, yy, + this->cd->comm, mpistatus); + } + if (this->cd->myrank == rankdst) + { + for (ii = 1; ii < this->cd->sizes[1]; ii++) + for (cc = 0; cc < howmany; cc++) + { + (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] = + (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0]; + (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] = + -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1]; + } + for (cc = 0; cc < howmany; cc++) + { + (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] = (*(buffer + cc))[0]; + (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1]; + } + } + } + fftw_interface<rnumber>::free(buffer); + delete mpistatus; + /* put asymmetric data to 0 */ + /*if (this->cd->myrank == 
this->cd->rank[this->cd->sizes[0]/2]) + { + tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2]; + for (ii = 0; ii < this->cd->sizes[1]; ii++) + { + std::fill_n((rnumber*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0); + tindex += howmany*this->cd->sizes[2]; + } + } + tindex = howmany*(); + std::fill_n((rnumber*)(data + tindex), howmany*2, 0.0);*/ +} + +template <class rnumber> +int fluid_solver_base<rnumber>::read_base(const char *fname, rnumber *data) +{ + char full_name[512]; + sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); + return this->rd->read(full_name, (void*)data); +} + +template <class rnumber> +int fluid_solver_base<rnumber>::read_base(const char *fname, cnumber *data) +{ + char full_name[512]; + sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); + return this->cd->read(full_name, (void*)data); +} + +template <class rnumber> +int fluid_solver_base<rnumber>::write_base(const char *fname, rnumber *data) +{ + char full_name[512]; + sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); + return this->rd->write(full_name, (void*)data); +} + +template <class rnumber> +int fluid_solver_base<rnumber>::write_base(const char *fname, cnumber *data) +{ + char full_name[512]; + sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); + return this->cd->write(full_name, (void*)data); +} + +/* finally, force generation of code */ +template class fluid_solver_base<float>; +template class fluid_solver_base<double>; /*****************************************************************************/ -/* now actually use the macro defined above */ -FLUID_SOLVER_BASE_DEFINITIONS( - FFTW_MANGLE_FLOAT, - float, - MPI_FLOAT, - MPI_COMPLEX) -FLUID_SOLVER_BASE_DEFINITIONS( - FFTW_MANGLE_DOUBLE, - double, - MPI_DOUBLE, - BFPS_MPICXX_DOUBLE_COMPLEX) -/*****************************************************************************/ + + + diff --git a/bfps/cpp/fluid_solver_base.hpp b/bfps/cpp/fluid_solver_base.hpp index 62deb597b4a6a3f4fc87198099d15778e7a2a255..e446956001a08fdbf0d3b11da8552e1cb6c61a45 100644 --- a/bfps/cpp/fluid_solver_base.hpp +++ b/bfps/cpp/fluid_solver_base.hpp @@ -30,6 +30,8 @@ #include <vector> #include "base.hpp" #include "field_descriptor.hpp" +#include "scope_timer.hpp" +#include "omputils.hpp" #ifndef FLUID_SOLVER_BASE @@ -81,7 +83,7 @@ class fluid_solver_base double DKY = 1.0, double DKZ = 1.0, int DEALIAS_TYPE = 0, - unsigned FFTW_PLAN_RIGOR = FFTW_ESTIMATE); + unsigned FFTW_PLAN_RIGOR = DEFAULT_FFTW_FLAG); ~fluid_solver_base(); void low_pass_Fourier(cnumber *__restrict__ a, int howmany, double kmax); @@ -135,97 +137,133 @@ class fluid_solver_base /* macros for loops */ /* Fourier space loop */ -#define CLOOP(obj, expression) \ - \ -{ \ - ptrdiff_t cindex = 0; \ - for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \ - for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \ - for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \ - { \ - expression; \ - cindex++; \ - } \ +template <class ObjectType, class FuncType> +void CLOOP(ObjectType* obj, FuncType expression) +{ + TIMEZONE("CLOOP"); + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]); + const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]); + for (ptrdiff_t yindex = start; yindex < ptrdiff_t(end); yindex++){ + ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2]; + for (ptrdiff_t zindex 
= 0; zindex < obj->cd->subsizes[1]; zindex++) + for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) + { + expression(cindex, xindex, yindex, zindex); + cindex++; + } + } + } } -#define CLOOP_NXMODES(obj, expression) \ - \ -{ \ - ptrdiff_t cindex = 0; \ - for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \ - for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \ - { \ - int nxmodes = 1; \ - ptrdiff_t xindex = 0; \ - expression; \ - cindex++; \ - nxmodes = 2; \ - for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \ - { \ - expression; \ - cindex++; \ - } \ - } \ +template <class ObjectType, class FuncType> +void CLOOP_NXMODES(ObjectType* obj, FuncType expression) +{ + TIMEZONE("CLOOP_NXMODES"); + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]); + for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){ + for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++) + { + ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2] + + zindex*obj->cd->subsizes[2]; + int nxmodes = 1; + ptrdiff_t xindex = 0; + /* pass the loop state through, matching the callback shape of the other CLOOP helpers (no k2 here) */ + expression(cindex, xindex, yindex, zindex, nxmodes); + cindex++; + nxmodes = 2; + for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) + { + expression(cindex, xindex, yindex, zindex, nxmodes); + cindex++; + } + } + } + } } -#define CLOOP_K2(obj, expression) \ - \ -{ \ - double k2; \ - ptrdiff_t cindex = 0; \ - for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \ - for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \ - for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \ - { \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - } \ + +template <class ObjectType, class FuncType> +void CLOOP_K2(ObjectType* obj, FuncType expression) +{ + TIMEZONE("CLOOP_K2"); + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]); + for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){ + for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++){ + ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2] + + zindex*obj->cd->subsizes[2]; + for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) + { + double k2 = (obj->kx[xindex]*obj->kx[xindex] + + obj->ky[yindex]*obj->ky[yindex] + + obj->kz[zindex]*obj->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2); + cindex++; + } + } + } + } } -#define CLOOP_K2_NXMODES(obj, expression) \ - \ -{ \ - double k2; \ - ptrdiff_t cindex = 0; \ - for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \ - for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \ - { \ - int nxmodes = 1; \ - ptrdiff_t xindex = 0; \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - nxmodes = 2; \ - for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \ - { \ - k2 = (obj->kx[xindex]*obj->kx[xindex] + \ - obj->ky[yindex]*obj->ky[yindex] + \ - obj->kz[zindex]*obj->kz[zindex]); \ - expression; \ - cindex++; \ - } \ - } \ + +template <class ObjectType, class FuncType> +void CLOOP_K2_NXMODES(ObjectType* obj, FuncType expression) +{ + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]); + 
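// NOTE (assumed semantics of omputils.hpp): ForIntervalStart/End partition
// [0, n) into one contiguous chunk per OpenMP thread, roughly
// [t*n/T, (t+1)*n/T) for thread t of T threads; each thread thus owns a
// distinct set of z values, so the cindex ranges touched by different
// threads never overlap and no synchronization is needed.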
const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]); + for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){ + for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++) + { + ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2] + + zindex*obj->cd->subsizes[2]; + int nxmodes = 1; + ptrdiff_t xindex = 0; + double k2 = (obj->kx[xindex]*obj->kx[xindex] + + obj->ky[yindex]*obj->ky[yindex] + + obj->kz[zindex]*obj->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2, nxmodes); + cindex++; + nxmodes = 2; + for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) + { + double k2 = (obj->kx[xindex]*obj->kx[xindex] + + obj->ky[yindex]*obj->ky[yindex] + + obj->kz[zindex]*obj->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2, nxmodes); + cindex++; + } + } + } + } } -/* real space loop */ -#define RLOOP(obj, expression) \ - \ -{ \ - for (int zindex = 0; zindex < obj->rd->subsizes[0]; zindex++) \ - for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++) \ - { \ - ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2); \ - for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++) \ - { \ - expression; \ - rindex++; \ - } \ - } \ + +template <class ObjectType, class FuncType> +void RLOOP(ObjectType* obj, FuncType expression) +{ + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(obj->rd->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(obj->rd->subsizes[1]); + for (int zindex = 0; zindex < obj->rd->subsizes[0] ; zindex++) + for (int yindex = start; yindex < ptrdiff_t(end); yindex++) + { + ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2); + for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++) + { + expression(rindex, xindex, yindex, zindex); + rindex++; + } + } + } } /*****************************************************************************/ diff --git a/bfps/cpp/interpolator.cpp b/bfps/cpp/interpolator.cpp index ef53742a4fdeb2545f02954f10c47d2bcb3f6538..b088f86df95d6d0166e8a95923bf0d1cc062c073 100644 --- a/bfps/cpp/interpolator.cpp +++ b/bfps/cpp/interpolator.cpp @@ -150,7 +150,7 @@ template <class rnumber, int interp_neighbours> void interpolator<rnumber, interp_neighbours>::operator()( const int *xg, const double *xx, - double *dest, + double *__restrict__ dest, const int *deriv) { double bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2]; diff --git a/bfps/cpp/interpolator_base.cpp b/bfps/cpp/interpolator_base.cpp index 58bf57cf13382f0704da4537dae9d21bb4a841da..db81dcb329070e14897e432e82a2fee95810e169 100644 --- a/bfps/cpp/interpolator_base.cpp +++ b/bfps/cpp/interpolator_base.cpp @@ -43,6 +43,20 @@ interpolator_base<rnumber, interp_neighbours>::interpolator_base( this->dz = 4*acos(0) / (fs->dkz*this->descriptor->sizes[0]); } +template <class rnumber, int interp_neighbours> +interpolator_base<rnumber, interp_neighbours>::interpolator_base( + vorticity_equation<rnumber, FFTW> *fs, + base_polynomial_values BETA_POLYS) +{ +// this->descriptor = fs->rd; +// this->compute_beta = BETA_POLYS; +// +// // compute dx, dy, dz; +// this->dx = 4*acos(0) / (fs->kk->dkx*this->descriptor->sizes[2]); +// this->dy = 4*acos(0) / (fs->kk->dky*this->descriptor->sizes[1]); +// this->dz = 4*acos(0) / (fs->kk->dkz*this->descriptor->sizes[0]); +} + template <class rnumber, int interp_neighbours> void interpolator_base<rnumber, interp_neighbours>::get_grid_coordinates( const int nparticles, diff --git 
a/bfps/cpp/interpolator_base.hpp b/bfps/cpp/interpolator_base.hpp index 7dda7fb08319bf2a044bcc220e204b748d6336d6..f4b793342d9b5b38e39c717ad30ee88e106958aa 100644 --- a/bfps/cpp/interpolator_base.hpp +++ b/bfps/cpp/interpolator_base.hpp @@ -25,6 +25,7 @@ #include "fluid_solver_base.hpp" +#include "vorticity_equation.hpp" #include "spline_n1.hpp" #include "spline_n2.hpp" #include "spline_n3.hpp" @@ -58,6 +59,10 @@ class interpolator_base interpolator_base( fluid_solver_base<rnumber> *FSOLVER, base_polynomial_values BETA_POLYS); + + interpolator_base( + vorticity_equation<rnumber, FFTW> *FSOLVER, + base_polynomial_values BETA_POLYS); virtual ~interpolator_base(){} /* may not destroy input */ diff --git a/bfps/cpp/kspace.cpp b/bfps/cpp/kspace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..70581f081790ba7c114a8abbe5e113eabf38dd54 --- /dev/null +++ b/bfps/cpp/kspace.cpp @@ -0,0 +1,492 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + + +#include <cmath> +#include <cstdlib> +#include <algorithm> +#include <cassert> +#include "kspace.hpp" +#include "scope_timer.hpp" +#include "shared_array.hpp" + +template <field_backend be, + kspace_dealias_type dt> +template <field_components fc> +kspace<be, dt>::kspace( + const field_layout<fc> *source_layout, + const double DKX, + const double DKY, + const double DKZ) +{ + TIMEZONE("field::kspace"); + /* get layout */ + this->layout = new field_layout<ONE>( + source_layout->sizes, + source_layout->subsizes, + source_layout->starts, + source_layout->comm); + + /* store dk values */ + this->dkx = DKX; + this->dky = DKY; + this->dkz = DKZ; + + /* compute kx, ky, kz and compute kM values */ + switch(be) + { + case FFTW: + this->kx.resize(this->layout->sizes[2]); + this->ky.resize(this->layout->subsizes[0]); + this->kz.resize(this->layout->sizes[1]); + int i, ii; + for (i = 0; i<int(this->layout->sizes[2]); i++) + this->kx[i] = i*this->dkx; + for (i = 0; i<int(this->layout->subsizes[0]); i++) + { + ii = i + this->layout->starts[0]; + if (ii <= int(this->layout->sizes[1]/2)) + this->ky[i] = this->dky*ii; + else + this->ky[i] = this->dky*(ii - int(this->layout->sizes[1])); + } + for (i = 0; i<int(this->layout->sizes[1]); i++) + { + if (i <= int(this->layout->sizes[0]/2)) + this->kz[i] = this->dkz*i; + else + this->kz[i] = this->dkz*(i - int(this->layout->sizes[0])); + } + switch(dt) + { + case TWO_THIRDS: + this->kMx = this->dkx*(int(2*(int(this->layout->sizes[2])-1)/3)-1); + this->kMy = this->dky*(int(this->layout->sizes[0] / 3)-1); + this->kMz = this->dkz*(int(this->layout->sizes[1] / 3)-1); + break; + case SMOOTH: + this->kMx = 
this->dkx*(int(this->layout->sizes[2])-2); + this->kMy = this->dky*(int(this->layout->sizes[0] / 2)-1); + this->kMz = this->dkz*(int(this->layout->sizes[1] / 2)-1); + break; + } + break; + } + + /* get global kM and dk */ + this->kM = this->kMx; + if (this->kM < this->kMy) this->kM = this->kMy; + if (this->kM < this->kMz) this->kM = this->kMz; + this->kM2 = this->kM * this->kM; + this->dk = this->dkx; + if (this->dk > this->dky) this->dk = this->dky; + if (this->dk > this->dkz) this->dk = this->dkz; + this->dk2 = this->dk*this->dk; + + /* spectra stuff */ + this->nshells = int(this->kM / this->dk) + 2; + this->kshell.resize(this->nshells, 0); + this->nshell.resize(this->nshells, 0); + + shared_array<double> kshell_local_thread(this->nshells,[&](double* kshell_local){ + std::fill_n(kshell_local, this->nshells, 0); + }); + shared_array<int64_t> nshell_local_thread(this->nshells,[&](int64_t* nshell_local){ + std::fill_n(nshell_local, this->nshells, 0); + }); + + std::vector<std::unordered_map<int, double>> dealias_filter_threaded(omp_get_max_threads()); + + this->CLOOP_K2_NXMODES( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2, + int nxmodes){ + if (k2 < this->kM2) + { + double knorm = sqrt(k2); + nshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes; + kshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes*knorm; + } + if (dt == SMOOTH){ + dealias_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.)); + } + }); + + // Merge results + + kshell_local_thread.mergeParallel(); + nshell_local_thread.mergeParallel(); + + if (dt == SMOOTH){ + for(int idxMerge = 0 ; idxMerge < int(dealias_filter_threaded.size()) ; ++idxMerge){ + for(const auto kv : dealias_filter_threaded[idxMerge]){ + this->dealias_filter[kv.first] = kv.second; + } + } + } + + MPI_Allreduce( + nshell_local_thread.getMasterData(), + &this->nshell.front(), + this->nshells, + MPI_INT64_T, MPI_SUM, this->layout->comm); + MPI_Allreduce( + kshell_local_thread.getMasterData(), + &this->kshell.front(), + this->nshells, + MPI_DOUBLE, MPI_SUM, this->layout->comm); + for (int n=0; n<this->nshells; n++){ + if(this->nshell[n] != 0){ + this->kshell[n] /= this->nshell[n]; + } + } +} + +template <field_backend be, + kspace_dealias_type dt> +kspace<be, dt>::~kspace() +{ + delete this->layout; +} + +template <field_backend be, + kspace_dealias_type dt> +template <typename rnumber, + field_components fc> +void kspace<be, dt>::low_pass(typename fftw_interface<rnumber>::complex *__restrict__ a, const double kmax) +{ + const double km2 = kmax*kmax; + this->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 >= km2) + std::fill_n((rnumber*)(a + ncomp(fc)*cindex), 2*ncomp(fc), 0); + }); +} + +template <field_backend be, + kspace_dealias_type dt> +template <typename rnumber, + field_components fc> +void kspace<be, dt>::Gauss_filter( + typename fftw_interface<rnumber>::complex *__restrict__ a, + const double sigma) +{ + const double prefactor = - sigma*sigma/2; + this->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kM2) + { + for (unsigned int tcounter=0; tcounter<2*ncomp(fc); tcounter++) + ((rnumber*)a)[2*ncomp(fc)*cindex + tcounter] *= exp(prefactor*k2); + } + }); +} + +template <field_backend be, + kspace_dealias_type dt> +template <typename rnumber, + field_components fc> +void kspace<be, 
dt>::dealias(typename fftw_interface<rnumber>::complex *__restrict__ a) +{ + switch(dt) + { + case TWO_THIRDS: + this->low_pass<rnumber, fc>(a, this->kM); + break; + case SMOOTH: + this->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + double tval = this->dealias_filter[int(round(k2 / this->dk2))]; + for (unsigned int tcounter=0; tcounter<2*ncomp(fc); tcounter++) + ((rnumber*)a)[2*ncomp(fc)*cindex + tcounter] *= tval; + }); + break; + } +} + +template <field_backend be, + kspace_dealias_type dt> +template <typename rnumber> +void kspace<be, dt>::force_divfree(typename fftw_interface<rnumber>::complex *__restrict__ a) +{ + TIMEZONE("kspace::force_divfree"); + this->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 > 0) + { + typename fftw_interface<rnumber>::complex tval; + tval[0] = (this->kx[xindex]*((*(a + cindex*3 ))[0]) + + this->ky[yindex]*((*(a + cindex*3+1))[0]) + + this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2; + tval[1] = (this->kx[xindex]*((*(a + cindex*3 ))[1]) + + this->ky[yindex]*((*(a + cindex*3+1))[1]) + + this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2; + for (int imag_part=0; imag_part<2; imag_part++) + { + a[cindex*3 ][imag_part] -= tval[imag_part]*this->kx[xindex]; + a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex]; + a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex]; + } + } + } + ); + if (this->layout->myrank == this->layout->rank[0][0]) + std::fill_n((rnumber*)(a), 6, 0.0); +} + +template <field_backend be, + kspace_dealias_type dt> +template <typename rnumber, + field_components fc> +void kspace<be, dt>::cospectrum( + const rnumber(* __restrict a)[2], + const rnumber(* __restrict b)[2], + const hid_t group, + const std::string dset_name, + const hsize_t toffset) +{ + TIMEZONE("field::cospectrum"); + shared_array<double> spec_local_thread(this->nshells*ncomp(fc)*ncomp(fc),[&](double* spec_local){ + std::fill_n(spec_local, this->nshells*ncomp(fc)*ncomp(fc), 0); + }); + + this->CLOOP_K2_NXMODES( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2, + int nxmodes){ + if (k2 <= this->kM2) + { + double* spec_local = spec_local_thread.getMine(); + int tmp_int = int(sqrt(k2) / this->dk)*ncomp(fc)*ncomp(fc); + for (hsize_t i=0; i<ncomp(fc); i++) + for (hsize_t j=0; j<ncomp(fc); j++){ + spec_local[tmp_int + i*ncomp(fc)+j] += nxmodes * ( + (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) + + (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1])); + } + } + }); + + spec_local_thread.mergeParallel(); + + std::vector<double> spec; + spec.resize(this->nshells*ncomp(fc)*ncomp(fc), 0); + MPI_Allreduce( + spec_local_thread.getMasterData(), + &spec.front(), + spec.size(), + MPI_DOUBLE, MPI_SUM, this->layout->comm); + if (this->layout->myrank == 0) + { + hid_t dset, wspace, mspace; + hsize_t count[(ndim(fc)-2)*2], offset[(ndim(fc)-2)*2], dims[(ndim(fc)-2)*2]; + dset = H5Dopen(group, ("spectra/" + dset_name).c_str(), H5P_DEFAULT); + wspace = H5Dget_space(dset); + H5Sget_simple_extent_dims(wspace, dims, NULL); + switch (fc) + { + case THREExTHREE: + offset[4] = 0; + offset[5] = 0; + count[4] = ncomp(fc); + count[5] = ncomp(fc); + case THREE: + offset[2] = 0; + offset[3] = 0; + count[2] = ncomp(fc); + count[3] = ncomp(fc); + default: + offset[0] = toffset; + offset[1] = 0; + count[0] = 1; + count[1] = this->nshells; + } + mspace = H5Screate_simple((ndim(fc)-2)*2, count, NULL); + 
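// NOTE: standard HDF5 hyperslab append: offset[0] = toffset selects the time
// slot in the file dataspace, count = (1, nshells[, 3, 3]) describes one time
// sample, and the switch above falls through deliberately so that THREExTHREE
// fills the extra tensor dimensions before the common shell dimensions are set.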
H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL); + H5Dwrite(dset, H5T_NATIVE_DOUBLE, mspace, wspace, H5P_DEFAULT, &spec.front()); + H5Sclose(wspace); + H5Sclose(mspace); + H5Dclose(dset); + } +} + + +template class kspace<FFTW, TWO_THIRDS>; +template class kspace<FFTW, SMOOTH>; + +template kspace<FFTW, TWO_THIRDS>::kspace<>( + const field_layout<ONE> *, + const double, const double, const double); +template kspace<FFTW, TWO_THIRDS>::kspace<>( + const field_layout<THREE> *, + const double, const double, const double); +template kspace<FFTW, TWO_THIRDS>::kspace<>( + const field_layout<THREExTHREE> *, + const double, const double, const double); + +template kspace<FFTW, SMOOTH>::kspace<>( + const field_layout<ONE> *, + const double, const double, const double); +template kspace<FFTW, SMOOTH>::kspace<>( + const field_layout<THREE> *, + const double, const double, const double); +template kspace<FFTW, SMOOTH>::kspace<>( + const field_layout<THREExTHREE> *, + const double, const double, const double); + +template void kspace<FFTW, SMOOTH>::low_pass<float, ONE>( + typename fftw_interface<float>::complex *__restrict__ a, + const double kmax); +template void kspace<FFTW, SMOOTH>::low_pass<float, THREE>( + typename fftw_interface<float>::complex *__restrict__ a, + const double kmax); +template void kspace<FFTW, SMOOTH>::low_pass<float, THREExTHREE>( + typename fftw_interface<float>::complex *__restrict__ a, + const double kmax); + +template void kspace<FFTW, SMOOTH>::low_pass<double, ONE>( + typename fftw_interface<double>::complex *__restrict__ a, + const double kmax); +template void kspace<FFTW, SMOOTH>::low_pass<double, THREE>( + typename fftw_interface<double>::complex *__restrict__ a, + const double kmax); +template void kspace<FFTW, SMOOTH>::low_pass<double, THREExTHREE>( + typename fftw_interface<double>::complex *__restrict__ a, + const double kmax); + +template void kspace<FFTW, SMOOTH>::dealias<float, ONE>( + typename fftw_interface<float>::complex *__restrict__ a); +template void kspace<FFTW, SMOOTH>::dealias<float, THREE>( + typename fftw_interface<float>::complex *__restrict__ a); +template void kspace<FFTW, SMOOTH>::dealias<float, THREExTHREE>( + typename fftw_interface<float>::complex *__restrict__ a); + +template void kspace<FFTW, SMOOTH>::dealias<double, ONE>( + typename fftw_interface<double>::complex *__restrict__ a); +template void kspace<FFTW, SMOOTH>::dealias<double, THREE>( + typename fftw_interface<double>::complex *__restrict__ a); +template void kspace<FFTW, SMOOTH>::dealias<double, THREExTHREE>( + typename fftw_interface<double>::complex *__restrict__ a); + +template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, ONE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, THREE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, THREExTHREE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, ONE>( + const 
typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, THREE>( + const typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, THREExTHREE>( + const typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); + +template void kspace<FFTW, SMOOTH>::cospectrum<float, ONE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, SMOOTH>::cospectrum<float, THREE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, SMOOTH>::cospectrum<float, THREExTHREE>( + const typename fftw_interface<float>::complex *__restrict__ a, + const typename fftw_interface<float>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, SMOOTH>::cospectrum<double, ONE>( + const typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, SMOOTH>::cospectrum<double, THREE>( + const typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); +template void kspace<FFTW, SMOOTH>::cospectrum<double, THREExTHREE>( + const typename fftw_interface<double>::complex *__restrict__ a, + const typename fftw_interface<double>::complex *__restrict__ b, + const hid_t group, + const std::string dset_name, + const hsize_t toffset); + +template void kspace<FFTW, SMOOTH>::force_divfree<float>( + typename fftw_interface<float>::complex *__restrict__ a); +template void kspace<FFTW, SMOOTH>::force_divfree<double>( + typename fftw_interface<double>::complex *__restrict__ a); + diff --git a/bfps/cpp/kspace.hpp b/bfps/cpp/kspace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e6f0f67a09a39d355480f94683e1d40d68b12cce --- /dev/null +++ b/bfps/cpp/kspace.hpp @@ -0,0 +1,176 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + + + +#include <hdf5.h> +#include <unordered_map> +#include <vector> +#include <string> +#include "omputils.hpp" +#include "fftw_interface.hpp" +#include "field_layout.hpp" + +#ifndef KSPACE_HPP + +#define KSPACE_HPP + +enum field_backend {FFTW}; +enum kspace_dealias_type {TWO_THIRDS, SMOOTH}; + + +template <field_backend be, + kspace_dealias_type dt> +class kspace +{ + public: + /* relevant field layout */ + field_layout<ONE> *layout; + + /* physical parameters */ + double dkx, dky, dkz, dk, dk2; + + /* mode and dealiasing information */ + double kMx, kMy, kMz, kM, kM2; + std::vector<double> kx, ky, kz; + std::unordered_map<int, double> dealias_filter; + std::vector<double> kshell; + std::vector<int64_t> nshell; + int nshells; + + /* methods */ + template <field_components fc> + kspace( + const field_layout<fc> *source_layout, + const double DKX = 1.0, + const double DKY = 1.0, + const double DKZ = 1.0); + ~kspace(); + + template <typename rnumber, + field_components fc> + void low_pass( + typename fftw_interface<rnumber>::complex *__restrict__ a, + const double kmax); + + template <typename rnumber, + field_components fc> + void Gauss_filter( + typename fftw_interface<rnumber>::complex *__restrict__ a, + const double sigma); + + template <typename rnumber, + field_components fc> + void dealias(typename fftw_interface<rnumber>::complex *__restrict__ a); + + template <typename rnumber, + field_components fc> + void cospectrum( + const rnumber(* __restrict__ a)[2], + const rnumber(* __restrict__ b)[2], + const hid_t group, + const std::string dset_name, + const hsize_t toffset); + template <class func_type> + void CLOOP(func_type expression) + { + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]); + + for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){ + for (hsize_t zindex = start; zindex < end; zindex++){ + ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2] + + zindex*this->layout->subsizes[2]; + for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++) + { + expression(cindex, xindex, yindex, zindex); + cindex++; + } + } + } + } + } + template <class func_type> + void CLOOP_K2(func_type expression) + { + #pragma omp parallel + { + const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]); + + for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){ + for (hsize_t zindex = start; zindex < end; zindex++){ + ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2] + + zindex*this->layout->subsizes[2]; + for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++) + { + double k2 = (this->kx[xindex]*this->kx[xindex] + + this->ky[yindex]*this->ky[yindex] + + this->kz[zindex]*this->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2); + cindex++; + } + } + } + } + } + template <class func_type> + void CLOOP_K2_NXMODES(func_type expression) + { + #pragma omp parallel + { + const hsize_t start = 
OmpUtils::ForIntervalStart(this->layout->subsizes[1]); + const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]); + + for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){ + for (hsize_t zindex = start; zindex < end; zindex++){ + ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2] + + zindex*this->layout->subsizes[2]; + hsize_t xindex = 0; + double k2 = ( + this->kx[xindex]*this->kx[xindex] + + this->ky[yindex]*this->ky[yindex] + + this->kz[zindex]*this->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2, 1); + cindex++; + for (xindex = 1; xindex < this->layout->subsizes[2]; xindex++) + { + k2 = (this->kx[xindex]*this->kx[xindex] + + this->ky[yindex]*this->ky[yindex] + + this->kz[zindex]*this->kz[zindex]); + expression(cindex, xindex, yindex, zindex, k2, 2); + cindex++; + } + } + } + } + } + template <typename rnumber> + void force_divfree(typename fftw_interface<rnumber>::complex *__restrict__ a); +}; + +#endif//KSPACE_HPP + diff --git a/bfps/cpp/omputils.hpp b/bfps/cpp/omputils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cdd6c6c173c7cf002b72e0c1a7aebcf727f2d33e --- /dev/null +++ b/bfps/cpp/omputils.hpp @@ -0,0 +1,27 @@ +#ifndef OMPUTILS_HPP +#define OMPUTILS_HPP + +#include <omp.h> + +namespace OmpUtils{ + +template <class IndexType> +inline IndexType ForIntervalStart(const IndexType size){ + const double chunk = double(size)/double(omp_get_num_threads()); + const IndexType start = IndexType(chunk*double(omp_get_thread_num())); + return start; +} + +template <class IndexType> +inline IndexType ForIntervalEnd(const IndexType size){ + const double chunk = double(size)/double(omp_get_num_threads()); + const IndexType end = (omp_get_thread_num() == omp_get_num_threads()-1) ? 
+ size: + IndexType(chunk*double(omp_get_thread_num()+1)); + return end; +} + +} + + +#endif diff --git a/bfps/cpp/particles.cpp b/bfps/cpp/particles.cpp index 847f065d49299b559162060876402101fe48d9d4..cdaf157cb912c3074faf84bfecf1d9b3752c78a7 100644 --- a/bfps/cpp/particles.cpp +++ b/bfps/cpp/particles.cpp @@ -43,17 +43,17 @@ template <particle_types particle_type, class rnumber, int interp_neighbours> particles<particle_type, rnumber, interp_neighbours>::particles( const char *NAME, const hid_t data_file_id, - interpolator_base<rnumber, interp_neighbours> *FIELD, + interpolator_base<rnumber, interp_neighbours> *VEL, const int TRAJ_SKIP, const int INTEGRATION_STEPS) : particles_io_base<particle_type>( NAME, TRAJ_SKIP, data_file_id, - FIELD->descriptor->comm) + VEL->descriptor->comm) { assert((INTEGRATION_STEPS <= 6) && (INTEGRATION_STEPS >= 1)); - this->vel = FIELD; + this->vel = VEL; this->integration_steps = INTEGRATION_STEPS; this->array_size = this->nparticles * state_dimension(particle_type); this->state = new double[this->array_size]; diff --git a/bfps/cpp/particles/abstract_particles_distr.hpp b/bfps/cpp/particles/abstract_particles_distr.hpp new file mode 100644 index 0000000000000000000000000000000000000000..28837b5cdc69e711ca90f6b62d2fb72128564dbe --- /dev/null +++ b/bfps/cpp/particles/abstract_particles_distr.hpp @@ -0,0 +1,849 @@ +#ifndef ABSTRACT_PARTICLES_DISTR_HPP +#define ABSTRACT_PARTICLES_DISTR_HPP + +#include <mpi.h> + +#include <vector> +#include <memory> +#include <cassert> + +#include <type_traits> +#include <omp.h> + +#include "scope_timer.hpp" +#include "particles_utils.hpp" + + +template <class real_number, int size_particle_positions, int size_particle_rhs, int size_particle_index> +class abstract_particles_distr { +protected: + static const int MaxNbRhs = 100; + + enum MpiTag{ + TAG_LOW_UP_NB_PARTICLES, + TAG_UP_LOW_NB_PARTICLES, + TAG_LOW_UP_PARTICLES, + TAG_UP_LOW_PARTICLES, + TAG_LOW_UP_RESULTS, + TAG_UP_LOW_RESULTS, + + TAG_LOW_UP_MOVED_NB_PARTICLES, + TAG_UP_LOW_MOVED_NB_PARTICLES, + TAG_LOW_UP_MOVED_PARTICLES, + TAG_UP_LOW_MOVED_PARTICLES, + + TAG_LOW_UP_MOVED_PARTICLES_INDEXES, + TAG_UP_LOW_MOVED_PARTICLES_INDEXES, + + TAG_LOW_UP_MOVED_PARTICLES_RHS, + TAG_LOW_UP_MOVED_PARTICLES_RHS_MAX = TAG_LOW_UP_MOVED_PARTICLES_RHS+MaxNbRhs, + + TAG_UP_LOW_MOVED_PARTICLES_RHS = TAG_LOW_UP_MOVED_PARTICLES_RHS_MAX, + TAG_UP_LOW_MOVED_PARTICLES_RHS_MAX = TAG_UP_LOW_MOVED_PARTICLES_RHS+MaxNbRhs, + }; + + struct NeighborDescriptor{ + int nbPartitionsToSend; + int nbPartitionsToRecv; + int nbParticlesToSend; + int nbParticlesToRecv; + int destProc; + int rankDiff; + bool isLower; + int idxLowerUpper; + + std::unique_ptr<real_number[]> toRecvAndMerge; + std::unique_ptr<real_number[]> toCompute; + std::unique_ptr<real_number[]> results; + }; + + enum Action{ + NOTHING_TODO, + RECV_PARTICLES, + COMPUTE_PARTICLES, + RELEASE_BUFFER_PARTICLES, + MERGE_PARTICLES, + + RECV_MOVE_NB_LOW, + RECV_MOVE_NB_UP, + RECV_MOVE_LOW, + RECV_MOVE_UP + }; + + MPI_Comm current_com; + + int my_rank; + int nb_processes; + int nb_processes_involved; + + const std::pair<int,int> current_partition_interval; + const int current_partition_size; + + std::unique_ptr<int[]> partition_interval_size_per_proc; + std::unique_ptr<int[]> partition_interval_offset_per_proc; + + std::unique_ptr<int[]> current_offset_particles_for_partition; + + std::vector<std::pair<Action,int>> whatNext; + std::vector<MPI_Request> mpiRequests; + std::vector<NeighborDescriptor> neigDescriptors; + +public: + 
////////////////////////////////////////////////////////////////////////////
+
+    abstract_particles_distr(MPI_Comm in_current_com,
+                             const std::pair<int,int>& in_current_partitions)
+        : current_com(in_current_com),
+          my_rank(-1), nb_processes(-1), nb_processes_involved(-1),
+          current_partition_interval(in_current_partitions),
+          current_partition_size(current_partition_interval.second-current_partition_interval.first){
+
+        AssertMpi(MPI_Comm_rank(current_com, &my_rank));
+        AssertMpi(MPI_Comm_size(current_com, &nb_processes));
+
+        // every rank learns the slab-partition size of all other ranks
+        partition_interval_size_per_proc.reset(new int[nb_processes]);
+        AssertMpi( MPI_Allgather( const_cast<int*>(&current_partition_size), 1, MPI_INT,
+                                  partition_interval_size_per_proc.get(), 1, MPI_INT,
+                                  current_com) );
+        assert(partition_interval_size_per_proc[my_rank] == current_partition_size);
+
+        partition_interval_offset_per_proc.reset(new int[nb_processes+1]);
+        partition_interval_offset_per_proc[0] = 0;
+        for(int idxProc = 0 ; idxProc < nb_processes ; ++idxProc){
+            partition_interval_offset_per_proc[idxProc+1] = partition_interval_offset_per_proc[idxProc] + partition_interval_size_per_proc[idxProc];
+        }
+
+        current_offset_particles_for_partition.reset(new int[current_partition_size+1]);
+
+        // trailing ranks that own no slab take no part in the exchanges
+        nb_processes_involved = nb_processes;
+        while(nb_processes_involved != 0 && partition_interval_size_per_proc[nb_processes_involved-1] == 0){
+            nb_processes_involved -= 1;
+        }
+        assert(nb_processes_involved != 0);
+        for(int idx_proc_involved = 0 ; idx_proc_involved < nb_processes_involved ; ++idx_proc_involved){
+            assert(partition_interval_size_per_proc[idx_proc_involved] != 0);
+        }
+    }
+
+    virtual ~abstract_particles_distr(){}
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    void compute_distr(const int current_my_nb_particles_per_partition[],
+                       const real_number particles_positions[],
+                       real_number particles_current_rhs[],
+                       const int interpolation_size){
+        TIMEZONE("compute_distr");
+
+        // Some processes might not be involved
+        if(nb_processes_involved <= my_rank){
+            return;
+        }
+
+        current_offset_particles_for_partition[0] = 0;
+        int myTotalNbParticles = 0;
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition];
+            current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition];
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        /// Exchange the number of particles held in each partition.
+        /// This could be restricted to the processes actually involved,
+        /// but a global exchange is not expected to be a problem here.
+        //////////////////////////////////////////////////////////////////////
+
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+
+        neigDescriptors.clear();
+
+        int nbProcToRecvLower;
+        {
+            int nextDestProc = my_rank;
+            for(int idxLower = 1 ; idxLower <= interpolation_size ; idxLower += partition_interval_size_per_proc[nextDestProc]){
+                nextDestProc = (nextDestProc-1+nb_processes_involved)%nb_processes_involved;
+                const int destProc = nextDestProc;
+                const int lowerRankDiff = (nextDestProc < my_rank ? 
my_rank - nextDestProc : nb_processes_involved-nextDestProc+my_rank); + + const int nbPartitionsToSend = std::min(current_partition_size, interpolation_size-(idxLower-1)); + const int nbParticlesToSend = current_offset_particles_for_partition[nbPartitionsToSend] - current_offset_particles_for_partition[0]; + + const int nbPartitionsToRecv = std::min(partition_interval_size_per_proc[destProc], (interpolation_size+1)-(idxLower-1)); + const int nbParticlesToRecv = -1; + + NeighborDescriptor descriptor; + descriptor.destProc = destProc; + descriptor.rankDiff = lowerRankDiff; + descriptor.nbPartitionsToSend = nbPartitionsToSend; + descriptor.nbParticlesToSend = nbParticlesToSend; + descriptor.nbPartitionsToRecv = nbPartitionsToRecv; + descriptor.nbParticlesToRecv = nbParticlesToRecv; + descriptor.isLower = true; + descriptor.idxLowerUpper = idxLower; + + neigDescriptors.emplace_back(std::move(descriptor)); + } + nbProcToRecvLower = neigDescriptors.size(); + + nextDestProc = my_rank; + for(int idxUpper = 1 ; idxUpper <= interpolation_size ; idxUpper += partition_interval_size_per_proc[nextDestProc]){ + nextDestProc = (nextDestProc+1+nb_processes_involved)%nb_processes_involved; + const int destProc = nextDestProc; + const int upperRankDiff = (nextDestProc > my_rank ? nextDestProc - my_rank: nb_processes_involved-my_rank+nextDestProc); + + const int nbPartitionsToSend = std::min(current_partition_size, (interpolation_size+1)-(idxUpper-1)); + const int nbParticlesToSend = current_offset_particles_for_partition[current_partition_size] - current_offset_particles_for_partition[current_partition_size-nbPartitionsToSend]; + + const int nbPartitionsToRecv = std::min(partition_interval_size_per_proc[destProc], interpolation_size-(idxUpper-1)); + const int nbParticlesToRecv = -1; + + NeighborDescriptor descriptor; + descriptor.destProc = destProc; + descriptor.rankDiff = upperRankDiff; + descriptor.nbPartitionsToSend = nbPartitionsToSend; + descriptor.nbParticlesToSend = nbParticlesToSend; + descriptor.nbPartitionsToRecv = nbPartitionsToRecv; + descriptor.nbParticlesToRecv = nbParticlesToRecv; + descriptor.isLower = false; + descriptor.idxLowerUpper = idxUpper; + + neigDescriptors.emplace_back(std::move(descriptor)); + } + } + const int nbProcToRecvUpper = neigDescriptors.size()-nbProcToRecvLower; + const int nbProcToRecv = nbProcToRecvUpper + nbProcToRecvLower; + assert(int(neigDescriptors.size()) == nbProcToRecv); + + for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){ + NeighborDescriptor& descriptor = neigDescriptors[idxDescr]; + + if(descriptor.isLower){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<int*>(&descriptor.nbParticlesToSend), 1, MPI_INT, descriptor.destProc, TAG_LOW_UP_NB_PARTICLES, + current_com, &mpiRequests.back())); + + if(descriptor.nbParticlesToSend){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[0]), descriptor.nbParticlesToSend*size_particle_positions, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_LOW_UP_PARTICLES, + current_com, &mpiRequests.back())); + + assert(descriptor.toRecvAndMerge == nullptr); + descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToSend*size_particle_rhs]); + whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, idxDescr}); + mpiRequests.emplace_back(); + 
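// post the receive for the interpolated results that this
+                    // neighbor will send back (merged on MERGE_PARTICLES)
+                    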
AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend*size_particle_rhs, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_UP_LOW_RESULTS, + current_com, &mpiRequests.back())); + } + + whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&descriptor.nbParticlesToRecv, + 1, MPI_INT, descriptor.destProc, TAG_UP_LOW_NB_PARTICLES, + current_com, &mpiRequests.back())); + } + else{ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<int*>(&descriptor.nbParticlesToSend), 1, MPI_INT, descriptor.destProc, TAG_UP_LOW_NB_PARTICLES, + current_com, &mpiRequests.back())); + + if(descriptor.nbParticlesToSend){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[(current_offset_particles_for_partition[current_partition_size-descriptor.nbPartitionsToSend])*size_particle_positions]), descriptor.nbParticlesToSend*size_particle_positions, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_UP_LOW_PARTICLES, + current_com, &mpiRequests.back())); + + assert(descriptor.toRecvAndMerge == nullptr); + descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToSend*size_particle_rhs]); + whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, idxDescr}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend*size_particle_rhs, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_LOW_UP_RESULTS, + current_com, &mpiRequests.back())); + } + + whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&descriptor.nbParticlesToRecv, + 1, MPI_INT, descriptor.destProc, TAG_LOW_UP_NB_PARTICLES, + current_com, &mpiRequests.back())); + } + } + + const bool more_than_one_thread = (omp_get_max_threads() > 1); + + TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads()) + #pragma omp parallel default(shared) + { + #pragma omp master + { + while(mpiRequests.size()){ + assert(mpiRequests.size() == whatNext.size()); + + int idxDone = mpiRequests.size(); + { + TIMEZONE("wait"); + AssertMpi(MPI_Waitany(mpiRequests.size(), mpiRequests.data(), &idxDone, MPI_STATUSES_IGNORE)); + } + const std::pair<Action, int> releasedAction = whatNext[idxDone]; + std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]); + std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]); + mpiRequests.pop_back(); + whatNext.pop_back(); + + ////////////////////////////////////////////////////////////////////// + /// Data to exchange particles + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == RECV_PARTICLES){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + + if(descriptor.isLower){ + //const int idxLower = descriptor.idxLowerUpper; + const int destProc = descriptor.destProc; + //const int nbPartitionsToRecv = descriptor.nbPartitionsToRecv; + const int NbParticlesToReceive = descriptor.nbParticlesToRecv; + assert(NbParticlesToReceive != -1); + assert(descriptor.toCompute == nullptr); + if(NbParticlesToReceive){ + descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]); + whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second}); + 
mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(descriptor.toCompute.get(), NbParticlesToReceive*size_particle_positions, particles_utils::GetMpiType(real_number()), destProc, TAG_UP_LOW_PARTICLES, + current_com, &mpiRequests.back())); + } + } + else{ + //const int idxUpper = descriptor.idxLowerUpper; + const int destProc = descriptor.destProc; + //const int nbPartitionsToRecv = descriptor.nbPartitionsToRecv; + const int NbParticlesToReceive = descriptor.nbParticlesToRecv; + assert(NbParticlesToReceive != -1); + assert(descriptor.toCompute == nullptr); + if(NbParticlesToReceive){ + descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]); + whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(descriptor.toCompute.get(), NbParticlesToReceive*size_particle_positions, particles_utils::GetMpiType(real_number()), destProc, TAG_LOW_UP_PARTICLES, + current_com, &mpiRequests.back())); + } + } + } + + ////////////////////////////////////////////////////////////////////// + /// Computation + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == COMPUTE_PARTICLES){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + const int NbParticlesToReceive = descriptor.nbParticlesToRecv; + + assert(descriptor.toCompute != nullptr); + descriptor.results.reset(new real_number[NbParticlesToReceive*size_particle_rhs]); + init_result_array(descriptor.results.get(), NbParticlesToReceive); + + if(more_than_one_thread == false){ + apply_computation(descriptor.toCompute.get(), descriptor.results.get(), NbParticlesToReceive); + } + else{ + TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey) + NeighborDescriptor* ptr_descriptor = &descriptor; + #pragma omp taskgroup + { + for(int idxPart = 0 ; idxPart < NbParticlesToReceive ; idxPart += 300){ + const int sizeToDo = std::min(300, NbParticlesToReceive-idxPart); + #pragma omp task default(shared) firstprivate(ptr_descriptor, idxPart, sizeToDo) priority(10) \ + TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey) + { + TIMEZONE_OMP_TASK("apply_computation", timeZoneTaskKey); + apply_computation(&ptr_descriptor->toCompute[idxPart*size_particle_positions], + &ptr_descriptor->results[idxPart*size_particle_rhs], sizeToDo); + } + } + } + } + + const int destProc = descriptor.destProc; + whatNext.emplace_back(std::pair<Action,int>{RELEASE_BUFFER_PARTICLES, releasedAction.second}); + mpiRequests.emplace_back(); + const int tag = descriptor.isLower? 
TAG_LOW_UP_RESULTS : TAG_UP_LOW_RESULTS;
+                    AssertMpi(MPI_Isend(descriptor.results.get(), NbParticlesToReceive*size_particle_rhs, particles_utils::GetMpiType(real_number()), destProc, tag,
+                              current_com, &mpiRequests.back()));
+                }
+                //////////////////////////////////////////////////////////////////////
+                /// Release the compute buffer
+                //////////////////////////////////////////////////////////////////////
+                if(releasedAction.first == RELEASE_BUFFER_PARTICLES){
+                    NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                    assert(descriptor.toCompute != nullptr);
+                    // reset() deallocates the buffer; release() would leak it
+                    descriptor.toCompute.reset();
+                }
+                //////////////////////////////////////////////////////////////////////
+                /// Merge
+                //////////////////////////////////////////////////////////////////////
+                if(releasedAction.first == MERGE_PARTICLES && more_than_one_thread == false){
+                    NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+
+                    if(descriptor.isLower){
+                        TIMEZONE("reduce");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                    else {
+                        TIMEZONE("reduce");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[(current_offset_particles_for_partition[current_partition_size]-descriptor.nbParticlesToSend)*size_particle_rhs],
+                                             descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                }
+            }
+        }
+        if(more_than_one_thread && omp_get_thread_num() == 1){
+            TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey)
+            #pragma omp taskgroup
+            {
+                // Process all the local partitions in chunks of 300 particles
+                for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                    for(int idxPart = current_offset_particles_for_partition[idxPartition] ;
+                            idxPart < current_offset_particles_for_partition[idxPartition+1] ; idxPart += 300){
+
+                        const int sizeToDo = std::min(300, current_offset_particles_for_partition[idxPartition+1]-idxPart);
+
+                        // Low priority to help master thread when possible
+                        #pragma omp task default(shared) firstprivate(idxPart, sizeToDo) priority(0) TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                        {
+                            TIMEZONE_OMP_TASK("apply_computation", timeZoneTaskKey);
+                            apply_computation(&particles_positions[idxPart*size_particle_positions],
+                                              &particles_current_rhs[idxPart*size_particle_rhs],
+                                              sizeToDo);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+        if(more_than_one_thread == true){
+            for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){
+                NeighborDescriptor& descriptor = neigDescriptors[idxDescr];
+                if(descriptor.nbParticlesToSend){
+                    if(descriptor.isLower){
+                        TIMEZONE("reduce_later");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                    else {
+                        TIMEZONE("reduce_later");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[(current_offset_particles_for_partition[current_partition_size]-descriptor.nbParticlesToSend)*size_particle_rhs],
+                                             descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                }
+            }
+        }
+
+        // Do my own computation if not threaded
+        if(more_than_one_thread == false){
+            TIMEZONE("compute-my_compute");
+            // Compute my particles
+            if(myTotalNbParticles){
+                apply_computation(particles_positions, particles_current_rhs, 
myTotalNbParticles); + } + } + + assert(whatNext.size() == 0); + assert(mpiRequests.size() == 0); + } + + //////////////////////////////////////////////////////////////////////////// + + virtual void init_result_array(real_number particles_current_rhs[], + const int nb_particles) const = 0; + virtual void apply_computation(const real_number particles_positions[], + real_number particles_current_rhs[], + const int nb_particles) const = 0; + virtual void reduce_particles_rhs(real_number particles_current_rhs[], + const real_number extra_particles_current_rhs[], + const int nb_particles) const = 0; + + //////////////////////////////////////////////////////////////////////////// + + void redistribute(int current_my_nb_particles_per_partition[], + int* nb_particles, + std::unique_ptr<real_number[]>* inout_positions_particles, + std::unique_ptr<real_number[]> inout_rhs_particles[], const int in_nb_rhs, + std::unique_ptr<int[]>* inout_index_particles, + const real_number mySpatialLowLimit, + const real_number mySpatialUpLimit, + const real_number spatialPartitionWidth){ + TIMEZONE("redistribute"); + + // Some latest processes might not be involved + if(nb_processes_involved <= my_rank){ + return; + } + + current_offset_particles_for_partition[0] = 0; + int myTotalNbParticles = 0; + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition]; + current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition]; + } + assert((*nb_particles) == myTotalNbParticles); + + // Find particles outside my interval + const int nbOutLower = particles_utils::partition_extra<size_particle_positions>(&(*inout_positions_particles)[0], current_my_nb_particles_per_partition[0], + [&](const real_number val[]){ + const bool isLower = val[IDX_Z] < mySpatialLowLimit; + return isLower; + }, + [&](const int idx1, const int idx2){ + for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){ + std::swap((*inout_index_particles)[idx1], (*inout_index_particles)[idx2]); + } + + for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){ + for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){ + std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val], + inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]); + } + } + }); + const int offesetOutLow = (current_partition_size==1? 
nbOutLower : 0); + + const int nbOutUpper = current_my_nb_particles_per_partition[current_partition_size-1] - offesetOutLow - particles_utils::partition_extra<size_particle_positions>( + &(*inout_positions_particles)[(current_offset_particles_for_partition[current_partition_size-1]+offesetOutLow)*size_particle_positions], + myTotalNbParticles - (current_offset_particles_for_partition[current_partition_size-1]+offesetOutLow), + [&](const real_number val[]){ + const bool isUpper = mySpatialUpLimit <= val[IDX_Z]; + return !isUpper; + }, + [&](const int idx1, const int idx2){ + for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){ + std::swap((*inout_index_particles)[idx1], (*inout_index_particles)[idx2]); + } + + for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){ + for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){ + std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val], + inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]); + } + } + }, (current_offset_particles_for_partition[current_partition_size-1]+offesetOutLow)); + + // Exchange number + int eventsBeforeWaitall = 0; + int nbNewFromLow = 0; + int nbNewFromUp = 0; + std::unique_ptr<real_number[]> newParticlesLow; + std::unique_ptr<real_number[]> newParticlesUp; + std::unique_ptr<int[]> newParticlesLowIndexes; + std::unique_ptr<int[]> newParticlesUpIndexes; + std::vector<std::unique_ptr<real_number[]>> newParticlesLowRhs(in_nb_rhs); + std::vector<std::unique_ptr<real_number[]>> newParticlesUpRhs(in_nb_rhs); + + const bool more_than_one_thread = (omp_get_max_threads() > 1); + + TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads()) + #pragma omp parallel default(shared) + { + #pragma omp master + { + assert(whatNext.size() == 0); + assert(mpiRequests.size() == 0); + + whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_NB_LOW, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&nbNewFromLow, 1, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_NB_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + eventsBeforeWaitall += 1; + + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<int*>(&nbOutLower), 1, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_NB_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + + if(nbOutLower){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&(*inout_positions_particles)[0], nbOutLower*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&(*inout_index_particles)[0], nbOutLower, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_INDEXES, + MPI_COMM_WORLD, &mpiRequests.back())); + + for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&inout_rhs_particles[idx_rhs][0], nbOutLower*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_RHS+idx_rhs, + MPI_COMM_WORLD, &mpiRequests.back())); + } + } + + 
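// symmetric exchange with the upper neighbor: post the receive
+            // for its outgoing particle count, then send our own
+            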
whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_NB_UP, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&nbNewFromUp, 1, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_NB_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + eventsBeforeWaitall += 1; + + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<int*>(&nbOutUpper), 1, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_NB_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + + if(nbOutUpper){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&(*inout_positions_particles)[(myTotalNbParticles-nbOutUpper)*size_particle_positions], nbOutUpper*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&(*inout_index_particles)[(myTotalNbParticles-nbOutUpper)], nbOutUpper, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_INDEXES, + MPI_COMM_WORLD, &mpiRequests.back())); + + + for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(&inout_rhs_particles[idx_rhs][(myTotalNbParticles-nbOutUpper)*size_particle_rhs], nbOutUpper*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_RHS+idx_rhs, + MPI_COMM_WORLD, &mpiRequests.back())); + } + } + + while(mpiRequests.size() && eventsBeforeWaitall){ + int idxDone = mpiRequests.size(); + { + TIMEZONE("waitany_move"); + AssertMpi(MPI_Waitany(mpiRequests.size(), mpiRequests.data(), &idxDone, MPI_STATUSES_IGNORE)); + } + const std::pair<Action, int> releasedAction = whatNext[idxDone]; + std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]); + std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]); + mpiRequests.pop_back(); + whatNext.pop_back(); + + if(releasedAction.first == RECV_MOVE_NB_LOW){ + if(nbNewFromLow){ + assert(newParticlesLow == nullptr); + newParticlesLow.reset(new real_number[nbNewFromLow*size_particle_positions]); + whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_LOW, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&newParticlesLow[0], nbNewFromLow*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES, + MPI_COMM_WORLD, &mpiRequests.back())); + + newParticlesLowIndexes.reset(new int[nbNewFromLow]); + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&newParticlesLowIndexes[0], nbNewFromLow, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_INDEXES, + MPI_COMM_WORLD, &mpiRequests.back())); + + for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){ + newParticlesLowRhs[idx_rhs].reset(new real_number[nbNewFromLow*size_particle_rhs]); + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&newParticlesLowRhs[idx_rhs][0], nbNewFromLow*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, 
TAG_UP_LOW_MOVED_PARTICLES_RHS+idx_rhs,
+                                      MPI_COMM_WORLD, &mpiRequests.back()));
+                        }
+                    }
+                    eventsBeforeWaitall -= 1;
+                }
+                else if(releasedAction.first == RECV_MOVE_NB_UP){
+                    if(nbNewFromUp){
+                        assert(newParticlesUp == nullptr);
+                        newParticlesUp.reset(new real_number[nbNewFromUp*size_particle_positions]);
+                        whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_UP, -1});
+                        mpiRequests.emplace_back();
+                        AssertMpi(MPI_Irecv(&newParticlesUp[0], nbNewFromUp*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES,
+                                  MPI_COMM_WORLD, &mpiRequests.back()));
+
+                        newParticlesUpIndexes.reset(new int[nbNewFromUp]);
+                        whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                        mpiRequests.emplace_back();
+                        AssertMpi(MPI_Irecv(&newParticlesUpIndexes[0], nbNewFromUp, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_INDEXES,
+                                  MPI_COMM_WORLD, &mpiRequests.back()));
+
+                        for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                            newParticlesUpRhs[idx_rhs].reset(new real_number[nbNewFromUp*size_particle_rhs]);
+                            whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                            mpiRequests.emplace_back();
+                            AssertMpi(MPI_Irecv(&newParticlesUpRhs[idx_rhs][0], nbNewFromUp*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_RHS+idx_rhs,
+                                      MPI_COMM_WORLD, &mpiRequests.back()));
+                        }
+                    }
+                    eventsBeforeWaitall -= 1;
+                }
+            }
+
+            if(mpiRequests.size()){
+                // TODO: process each message as soon as it is received
+                TIMEZONE("waitall-move");
+                AssertMpi(MPI_Waitall(mpiRequests.size(), mpiRequests.data(), MPI_STATUSES_IGNORE));
+                mpiRequests.clear();
+                whatNext.clear();
+            }
+
+            // If threads are used, insert tasks to process the received data
+            if(more_than_one_thread == true){
+                TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey)
+                #pragma omp taskgroup
+                {
+                    if(nbNewFromLow){
+                        assert(newParticlesLow.get() != nullptr);
+                        #pragma omp task TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                        {
+                            TIMEZONE_OMP_TASK("task-pbc", timeZoneTaskKey);
+                            apply_pbc_z_new_particles(newParticlesLow.get(), nbNewFromLow);
+                            apply_pbc_xy(newParticlesLow.get(), nbNewFromLow);
+                        }
+                    }
+                    if(nbNewFromUp){
+                        assert(newParticlesUp.get() != nullptr);
+                        #pragma omp task TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                        {
+                            TIMEZONE_OMP_TASK("task-pbc", timeZoneTaskKey);
+                            apply_pbc_z_new_particles(newParticlesUp.get(), nbNewFromUp);
+                            apply_pbc_xy(newParticlesUp.get(), nbNewFromUp);
+                        }
+                    }
+                }
+            }
+        }
+        // If threads are used and this is not the master thread, then
+        // process the local data (no send/recv involved here)
+        if(more_than_one_thread == true && omp_get_thread_num() > 0){
+            TIMEZONE("apply_pbc_xy");
+            const int nbOldParticles = myTotalNbParticles - nbOutLower - nbOutUpper;
+            particles_utils::IntervalSplitter<int> interval(nbOldParticles,
+                                                            omp_get_num_threads()-1,
+                                                            omp_get_thread_num()-1);
+
+            apply_pbc_xy(&(*inout_positions_particles)[(nbOutLower+interval.getMyOffset())*size_particle_positions], interval.getMySize());
+        }
+    }
+
+        // If we do not use threads, process all the data sequentially
+        if(more_than_one_thread == false){
+            TIMEZONE("apply_pbc_z_new_particles");
+            if(nbNewFromLow){
+                assert(newParticlesLow.get() != nullptr);
+                apply_pbc_z_new_particles(newParticlesLow.get(), nbNewFromLow);
+                apply_pbc_xy(newParticlesLow.get(), nbNewFromLow);
+            }
+            if(nbNewFromUp){
+                assert(newParticlesUp.get() != nullptr);
+                apply_pbc_z_new_particles(newParticlesUp.get(), nbNewFromUp);
+                apply_pbc_xy(newParticlesUp.get(), nbNewFromUp);
+            }
+
+            // wrap the particles that stayed on this rank
+            apply_pbc_xy(&(*inout_positions_particles)[nbOutLower*size_particle_positions], myTotalNbParticles - nbOutLower - nbOutUpper);
+        }
+
+        // Reallocate and merge
+        {
+            TIMEZONE("realloc_copy");
+            const int nbOldParticlesInside = myTotalNbParticles - nbOutLower - nbOutUpper;
+            const int myTotalNewNbParticles = nbOldParticlesInside + nbNewFromLow + nbNewFromUp;
+
+            std::unique_ptr<real_number[]> newArray(new real_number[myTotalNewNbParticles*size_particle_positions]);
+            std::unique_ptr<int[]> newArrayIndexes(new int[myTotalNewNbParticles]);
+            std::vector<std::unique_ptr<real_number[]>> newArrayRhs(in_nb_rhs);
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                newArrayRhs[idx_rhs].reset(new real_number[myTotalNewNbParticles*size_particle_rhs]);
+            }
+
+            // Copy the new particles received from the lower neighbor first
+            if(nbNewFromLow){
+                const particles_utils::fixed_copy fcp(0, 0, nbNewFromLow);
+                fcp.copy(newArray, newParticlesLow, size_particle_positions);
+                fcp.copy(newArrayIndexes, newParticlesLowIndexes);
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], newParticlesLowRhs[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            // Copy my own particles
+            {
+                const particles_utils::fixed_copy fcp(nbNewFromLow, nbOutLower, nbOldParticlesInside);
+                fcp.copy(newArray, (*inout_positions_particles), size_particle_positions);
+                fcp.copy(newArrayIndexes, (*inout_index_particles));
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], inout_rhs_particles[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            // Copy the new particles received from the upper neighbor at the back
+            if(nbNewFromUp){
+                const particles_utils::fixed_copy fcp(nbNewFromLow+nbOldParticlesInside, 0, nbNewFromUp);
+                fcp.copy(newArray, newParticlesUp, size_particle_positions);
+                fcp.copy(newArrayIndexes, newParticlesUpIndexes);
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], newParticlesUpRhs[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            (*inout_positions_particles) = std::move(newArray);
+            (*inout_index_particles) = std::move(newArrayIndexes);
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                inout_rhs_particles[idx_rhs] = std::move(newArrayRhs[idx_rhs]);
+            }
+
+            myTotalNbParticles = myTotalNewNbParticles;
+        }
+
+        // Partition all particles into the z slabs owned by this rank
+        {
+            TIMEZONE("repartition");
+            particles_utils::partition_extra_z<size_particle_positions>(&(*inout_positions_particles)[0],
+                                                                        myTotalNbParticles,current_partition_size,
+                                                                        current_my_nb_particles_per_partition, current_offset_particles_for_partition.get(),
+                                                                        [&](const int idxPartition){
+                                                                            return (idxPartition+1)*spatialPartitionWidth + mySpatialLowLimit;
+                                                                        },
+                                                                        [&](const int idx1, const int idx2){
+                                                                            for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){
+                                                                                std::swap((*inout_index_particles)[idx1], (*inout_index_particles)[idx2]);
+                                                                            }
+
+                                                                            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                                                                                for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){
+                                                                                    std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val],
+                                                                                              inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]);
+                                                                                }
+                                                                            }
+                                                                        });
+
+            {// TODO remove (debug check of the repartitioning)
+                for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                    assert(current_my_nb_particles_per_partition[idxPartition] ==
+                           current_offset_particles_for_partition[idxPartition+1] - current_offset_particles_for_partition[idxPartition]);
+                    const real_number limitPartition = (idxPartition+1)*spatialPartitionWidth + mySpatialLowLimit;
+                    for(int idx = 0 ; idx < 
current_offset_particles_for_partition[idxPartition+1] ; ++idx){ + assert((*inout_positions_particles)[idx*3+IDX_Z] < limitPartition); + } + for(int idx = current_offset_particles_for_partition[idxPartition+1] ; idx < myTotalNbParticles ; ++idx){ + assert((*inout_positions_particles)[idx*3+IDX_Z] >= limitPartition); + } + } + } + } + (*nb_particles) = myTotalNbParticles; + + assert(mpiRequests.size() == 0); + } + + virtual void apply_pbc_z_new_particles(real_number* newParticlesLow, const int nbNewFromLow) const = 0; + virtual void apply_pbc_xy(real_number* inout_positions_particles, const int nbNew) const = 0; + + //////////////////////////////////////////////////////////////////////////// + + virtual void move_particles(real_number particles_positions[], + const int nb_particles, + const std::unique_ptr<real_number[]> particles_current_rhs[], + const int nb_rhs, const real_number dt) const = 0; +}; + +#endif diff --git a/bfps/cpp/particles/abstract_particles_input.hpp b/bfps/cpp/particles/abstract_particles_input.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bb295c40717085e58126530e8403c3ff5b71a014 --- /dev/null +++ b/bfps/cpp/particles/abstract_particles_input.hpp @@ -0,0 +1,21 @@ +#ifndef ABSTRACT_PARTICLES_INPUT_HPP +#define ABSTRACT_PARTICLES_INPUT_HPP + +#include <tuple> + +template <class real_number> +class abstract_particles_input { +public: + virtual ~abstract_particles_input(){} + + virtual int getTotalNbParticles() = 0; + virtual int getLocalNbParticles() = 0; + virtual int getNbRhs() = 0; + + virtual std::unique_ptr<real_number[]> getMyParticles() = 0; + virtual std::unique_ptr<int[]> getMyParticlesIndexes() = 0; + virtual std::vector<std::unique_ptr<real_number[]>> getMyRhs() = 0; +}; + + +#endif diff --git a/bfps/cpp/particles/abstract_particles_output.hpp b/bfps/cpp/particles/abstract_particles_output.hpp new file mode 100644 index 0000000000000000000000000000000000000000..955f1e6fd07f98421837bd9bf359026ea9535b74 --- /dev/null +++ b/bfps/cpp/particles/abstract_particles_output.hpp @@ -0,0 +1,192 @@ +#ifndef ABSTRACT_PARTICLES_OUTPUT +#define ABSTRACT_PARTICLES_OUTPUT + +#include <memory> +#include <vector> +#include <cassert> +#include <algorithm> +#include <cstddef> + +#include "base.hpp" +#include "particles_utils.hpp" +#include "alltoall_exchanger.hpp" +#include "scope_timer.hpp" + + +template <class real_number, int size_particle_positions, int size_particle_rhs> +class abstract_particles_output { + MPI_Comm mpi_com; + + int my_rank; + int nb_processes; + + const int total_nb_particles; + const int nb_rhs; + + std::unique_ptr<std::pair<int,int>[]> buffer_indexes_send; + std::unique_ptr<real_number[]> buffer_particles_positions_send; + std::vector<std::unique_ptr<real_number[]>> buffer_particles_rhs_send; + int size_buffers_send; + + std::unique_ptr<real_number[]> buffer_particles_positions_recv; + std::vector<std::unique_ptr<real_number[]>> buffer_particles_rhs_recv; + std::unique_ptr<int[]> buffer_indexes_recv; + int size_buffers_recv; + + +protected: + MPI_Comm& getCom(){ + return mpi_com; + } + + int getTotalNbParticles() const { + return total_nb_particles; + } + + int getNbRhs() const { + return nb_rhs; + } + +public: + abstract_particles_output(MPI_Comm in_mpi_com, const int inTotalNbParticles, const int in_nb_rhs) + : mpi_com(in_mpi_com), my_rank(-1), nb_processes(-1), + total_nb_particles(inTotalNbParticles), nb_rhs(in_nb_rhs), + buffer_particles_rhs_send(in_nb_rhs), size_buffers_send(-1), + buffer_particles_rhs_recv(in_nb_rhs), 
size_buffers_recv(-1){
+
+        AssertMpi(MPI_Comm_rank(mpi_com, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_com, &nb_processes));
+    }
+
+    virtual ~abstract_particles_output(){
+    }
+
+    void releaseMemory(){
+        // reset() deallocates the buffers; release() would leak them
+        buffer_indexes_send.reset();
+        buffer_particles_positions_send.reset();
+        size_buffers_send = -1;
+        buffer_indexes_recv.reset();
+        buffer_particles_positions_recv.reset();
+        size_buffers_recv = -1;
+        for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+            buffer_particles_rhs_send[idx_rhs].reset();
+            buffer_particles_rhs_recv[idx_rhs].reset();
+        }
+    }
+
+    void save(const real_number input_particles_positions[], const std::unique_ptr<real_number[]> input_particles_rhs[],
+              const int index_particles[], const int nb_particles, const int idx_time_step){
+        TIMEZONE("abstract_particles_output::save");
+        assert(total_nb_particles != -1);
+
+        {
+            TIMEZONE("sort-to-distribute");
+
+            if(size_buffers_send < nb_particles && nb_particles){
+                buffer_indexes_send.reset(new std::pair<int,int>[nb_particles]);
+                buffer_particles_positions_send.reset(new real_number[nb_particles*size_particle_positions]);
+                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                    buffer_particles_rhs_send[idx_rhs].reset(new real_number[nb_particles*size_particle_rhs]);
+                }
+                size_buffers_send = nb_particles;
+            }
+
+            for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+                buffer_indexes_send[idx_part].first = idx_part;
+                buffer_indexes_send[idx_part].second = index_particles[idx_part];
+            }
+
+            std::sort(&buffer_indexes_send[0], &buffer_indexes_send[nb_particles], [](const std::pair<int,int>& p1, const std::pair<int,int>& p2){
+                return p1.second < p2.second;
+            });
+
+            for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+                const int src_idx = buffer_indexes_send[idx_part].first;
+                const int dst_idx = idx_part;
+
+                for(int idx_val = 0 ; idx_val < size_particle_positions ; ++idx_val){
+                    buffer_particles_positions_send[dst_idx*size_particle_positions + idx_val]
+                            = input_particles_positions[src_idx*size_particle_positions + idx_val];
+                }
+                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                    for(int idx_val = 0 ; idx_val < int(size_particle_rhs) ; ++idx_val){
+                        buffer_particles_rhs_send[idx_rhs][dst_idx*size_particle_rhs + idx_val]
+                                = input_particles_rhs[idx_rhs][src_idx*size_particle_rhs + idx_val];
+                    }
+                }
+            }
+        }
+
+        const particles_utils::IntervalSplitter<int> particles_splitter(total_nb_particles, nb_processes, my_rank);
+
+        int* buffer_indexes_send_tmp = reinterpret_cast<int*>(buffer_indexes_send.get());// trick: re-use the buffer_indexes_send memory
+        std::vector<int> nb_particles_to_send(nb_processes, 0);
+        for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+            nb_particles_to_send[particles_splitter.getOwner(buffer_indexes_send[idx_part].second)] += 1;
+            buffer_indexes_send_tmp[idx_part] = buffer_indexes_send[idx_part].second;
+        }
+
+        alltoall_exchanger exchanger(mpi_com, std::move(nb_particles_to_send));
+        // nb_particles_to_send has been moved from and is invalid after here
+
+        const int nb_to_receive = exchanger.getTotalToRecv();
+        assert(nb_to_receive == particles_splitter.getMySize());
+
+        if(size_buffers_recv < nb_to_receive && nb_to_receive){
+            buffer_indexes_recv.reset(new int[nb_to_receive]);
+            buffer_particles_positions_recv.reset(new real_number[nb_to_receive*size_particle_positions]);
+            for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                buffer_particles_rhs_recv[idx_rhs].reset(new real_number[nb_to_receive*size_particle_rhs]);
+            }
+            size_buffers_recv = nb_to_receive;
+        }
+
+        {
+            
TIMEZONE("exchange"); + // Could be done with multiple asynchronous coms + exchanger.alltoallv<int>(buffer_indexes_send_tmp, buffer_indexes_recv.get()); + exchanger.alltoallv<real_number>(buffer_particles_positions_send.get(), buffer_particles_positions_recv.get(), size_particle_positions); + for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){ + exchanger.alltoallv<real_number>(buffer_particles_rhs_send[idx_rhs].get(), buffer_particles_rhs_recv[idx_rhs].get(), size_particle_rhs); + } + } + + if(size_buffers_send < nb_to_receive && nb_to_receive){ + buffer_indexes_send.reset(new std::pair<int,int>[nb_to_receive]); + buffer_particles_positions_send.reset(new real_number[nb_to_receive*size_particle_positions]); + for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){ + buffer_particles_rhs_send[idx_rhs].reset(new real_number[nb_to_receive*size_particle_rhs]); + } + size_buffers_send = nb_to_receive; + } + + { + TIMEZONE("copy-local-order"); + for(int idx_part = 0 ; idx_part < nb_to_receive ; ++idx_part){ + const int src_idx = idx_part; + const int dst_idx = buffer_indexes_recv[idx_part]-particles_splitter.getMyOffset(); + assert(0 <= dst_idx); + assert(dst_idx < particles_splitter.getMySize()); + + for(int idx_val = 0 ; idx_val < size_particle_positions ; ++idx_val){ + buffer_particles_positions_send[dst_idx*size_particle_positions + idx_val] + = buffer_particles_positions_recv[src_idx*size_particle_positions + idx_val]; + } + for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){ + for(int idx_val = 0 ; idx_val < int(size_particle_rhs) ; ++idx_val){ + buffer_particles_rhs_send[idx_rhs][dst_idx*size_particle_rhs + idx_val] + = buffer_particles_rhs_recv[idx_rhs][src_idx*size_particle_rhs + idx_val]; + } + } + } + } + + write(idx_time_step, buffer_particles_positions_send.get(), buffer_particles_rhs_send.data(), + nb_to_receive, particles_splitter.getMyOffset()); + } + + virtual void write(const int idx_time_step, const real_number* positions, const std::unique_ptr<real_number[]>* rhs, + const int nb_particles, const int particles_idx_offset) = 0; +}; + +#endif diff --git a/bfps/cpp/particles/abstract_particles_system.hpp b/bfps/cpp/particles/abstract_particles_system.hpp new file mode 100644 index 0000000000000000000000000000000000000000..32510404b4fa69596a53385b470aea0d4136b08b --- /dev/null +++ b/bfps/cpp/particles/abstract_particles_system.hpp @@ -0,0 +1,32 @@ +#ifndef ABSTRACT_PARTICLES_SYSTEM_HPP +#define ABSTRACT_PARTICLES_SYSTEM_HPP + +#include <memory> + +template <class real_number> +class abstract_particles_system { +public: + virtual void compute() = 0; + + virtual void move(const real_number dt) = 0; + + virtual void redistribute() = 0; + + virtual void inc_step_idx() = 0; + + virtual void shift_rhs_vectors() = 0; + + virtual void completeLoop(const real_number dt) = 0; + + virtual const real_number* getParticlesPositions() const = 0; + + virtual const std::unique_ptr<real_number[]>* getParticlesRhs() const = 0; + + virtual const int* getParticlesIndexes() const = 0; + + virtual int getLocalNbParticles() const = 0; + + virtual int getNbRhs() const = 0; +}; + +#endif diff --git a/bfps/cpp/particles/alltoall_exchanger.hpp b/bfps/cpp/particles/alltoall_exchanger.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c592011c9772afad1ab68555b179754f868624e --- /dev/null +++ b/bfps/cpp/particles/alltoall_exchanger.hpp @@ -0,0 +1,109 @@ +#ifndef ALLTOALL_EXCHANGER_HPP +#define ALLTOALL_EXCHANGER_HPP + +#include <mpi.h> +#include <cassert> + +#include "base.hpp" +#include 
"particles_utils.hpp" +#include "scope_timer.hpp" + +class alltoall_exchanger { + const MPI_Comm mpi_com; + + int my_rank; + int nb_processes; + + const std::vector<int> nb_items_to_send; + + std::vector<int> offset_items_to_send; + + std::vector<int> nb_items_to_sendrecv_all; + std::vector<int> nb_items_to_recv; + std::vector<int> offset_items_to_recv; + + int total_to_recv; + +public: + alltoall_exchanger(const MPI_Comm& in_mpi_com, std::vector<int>/*no ref to move here*/ in_nb_items_to_send) + :mpi_com(in_mpi_com), nb_items_to_send(std::move(in_nb_items_to_send)), total_to_recv(0){ + TIMEZONE("alltoall_exchanger::constructor"); + + AssertMpi(MPI_Comm_rank(mpi_com, &my_rank)); + AssertMpi(MPI_Comm_size(mpi_com, &nb_processes)); + + assert(int(nb_items_to_send.size()) == nb_processes); + + offset_items_to_send.resize(nb_processes+1, 0); + for(int idx_proc = 0 ; idx_proc < nb_processes ; ++idx_proc){ + offset_items_to_send[idx_proc+1] = offset_items_to_send[idx_proc] + + nb_items_to_send[idx_proc]; + } + + nb_items_to_sendrecv_all.resize(nb_processes*nb_processes); + AssertMpi(MPI_Allgather(const_cast<int*>(nb_items_to_send.data()), nb_processes, MPI_INT, + nb_items_to_sendrecv_all.data(), nb_processes, MPI_INT, + mpi_com)); + + nb_items_to_recv.resize(nb_processes, 0); + offset_items_to_recv.resize(nb_processes+1, 0); + for(int idx_proc = 0 ; idx_proc < nb_processes ; ++idx_proc){ + const int nbrecv = nb_items_to_sendrecv_all[idx_proc*nb_processes + my_rank]; + total_to_recv += nbrecv; + nb_items_to_recv[idx_proc] = nbrecv; + offset_items_to_recv[idx_proc+1] = nb_items_to_recv[idx_proc] + + offset_items_to_recv[idx_proc]; + } + } + + int getTotalToRecv() const{ + return total_to_recv; + } + + template <class ItemType> + void alltoallv_dt(const ItemType in_to_send[], + ItemType out_to_recv[], const MPI_Datatype& in_type) const { + TIMEZONE("alltoallv"); + AssertMpi(MPI_Alltoallv(const_cast<ItemType*>(in_to_send), const_cast<int*>(nb_items_to_send.data()), + const_cast<int*>(offset_items_to_send.data()), in_type, out_to_recv, + const_cast<int*>(nb_items_to_recv.data()), const_cast<int*>(offset_items_to_recv.data()), in_type, + mpi_com)); + } + + template <class ItemType> + void alltoallv(const ItemType in_to_send[], + ItemType out_to_recv[]) const { + alltoallv_dt<ItemType>(in_to_send, out_to_recv, particles_utils::GetMpiType(ItemType())); + } + + template <class ItemType> + void alltoallv_dt(const ItemType in_to_send[], + ItemType out_to_recv[], const MPI_Datatype& in_type, const int in_nb_values_per_item) const { + TIMEZONE("alltoallv"); + std::vector<int> nb_items_to_send_tmp = nb_items_to_send; + particles_utils::transform(nb_items_to_send_tmp.begin(), nb_items_to_send_tmp.end(), nb_items_to_send_tmp.begin(), + [&](const int val) -> int { return val * in_nb_values_per_item ;}); + std::vector<int> offset_items_to_send_tmp = offset_items_to_send; + particles_utils::transform(offset_items_to_send_tmp.begin(), offset_items_to_send_tmp.end(), offset_items_to_send_tmp.begin(), + [&](const int val) -> int { return val * in_nb_values_per_item ;}); + std::vector<int> nb_items_to_recv_tmp = nb_items_to_recv; + particles_utils::transform(nb_items_to_recv_tmp.begin(), nb_items_to_recv_tmp.end(), nb_items_to_recv_tmp.begin(), + [&](const int val) -> int { return val * in_nb_values_per_item ;}); + std::vector<int> offset_items_to_recv_tmp = offset_items_to_recv; + particles_utils::transform(offset_items_to_recv_tmp.begin(), offset_items_to_recv_tmp.end(), offset_items_to_recv_tmp.begin(), + [&](const 
int val) -> int { return val * in_nb_values_per_item ;}); + + AssertMpi(MPI_Alltoallv(const_cast<ItemType*>(in_to_send), const_cast<int*>(nb_items_to_send_tmp.data()), + const_cast<int*>(offset_items_to_send_tmp.data()), in_type, out_to_recv, + const_cast<int*>(nb_items_to_recv_tmp.data()), const_cast<int*>(offset_items_to_recv_tmp.data()), in_type, + mpi_com)); + } + + template <class ItemType> + void alltoallv(const ItemType in_to_send[], + ItemType out_to_recv[], const int in_nb_values_per_item) const { + alltoallv_dt<ItemType>(in_to_send, out_to_recv, particles_utils::GetMpiType(ItemType()), in_nb_values_per_item); + } +}; + +#endif diff --git a/bfps/cpp/particles/field_accessor.hpp b/bfps/cpp/particles/field_accessor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bb4fae763a6a65f7d17fb88d85b9d31a45d48a18 --- /dev/null +++ b/bfps/cpp/particles/field_accessor.hpp @@ -0,0 +1,54 @@ +#ifndef FIELD_ACCESSOR_HPP +#define FIELD_ACCESSOR_HPP + +#include <algorithm> +#include <array> + +#include "particles_utils.hpp" + +template <class real_number> +class field_accessor { + static const int nb_dim = 3; + + const real_number* field_data; + std::array<size_t,3> local_field_dims; + std::array<size_t,3> local_field_offset; + std::array<size_t,3> field_memory_dims; + +public: + field_accessor(const real_number* in_field_data, const std::array<size_t,3>& in_dims, + const std::array<size_t,3>& in_local_field_offset, + const std::array<size_t,3>& in_field_memory_dims) + : field_data(in_field_data), local_field_dims(in_dims), + local_field_offset(in_local_field_offset), + field_memory_dims(in_field_memory_dims){ + } + + ~field_accessor(){} + + const real_number& getValue(const size_t in_index, const int in_dim) const { + assert(in_index < field_memory_dims[0]*field_memory_dims[1]*field_memory_dims[2]); + return field_data[in_index*nb_dim + in_dim]; + } + + size_t getIndexFromGlobalPosition(const size_t in_global_x, const size_t in_global_y, const size_t in_global_z) const { + return getIndexFromLocalPosition(in_global_x - local_field_offset[IDX_X], + in_global_y - local_field_offset[IDX_Y], + in_global_z - local_field_offset[IDX_Z]); + } + + size_t getIndexFromLocalPosition(const size_t in_local_x, const size_t in_local_y, const size_t in_local_z) const { + assert(in_local_x < local_field_dims[IDX_X]); + assert(in_local_y < local_field_dims[IDX_Y]); + assert(in_local_z < local_field_dims[IDX_Z]); + static_assert(IDX_X == 2 && IDX_Y == 1 && IDX_Z == 0, + "Dimension idx does not match, please ensure getIndexFromLocalPosition " + "is correct before commenting this assert"); + return (((in_local_z)*field_memory_dims[1] + + in_local_y)*(field_memory_dims[2]) + + in_local_x); + } +}; + + +#endif diff --git a/bfps/cpp/particles/particles_adams_bashforth.hpp b/bfps/cpp/particles/particles_adams_bashforth.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aaa81e03515c4dad9e7468d3435a3bee0adc8487 --- /dev/null +++ b/bfps/cpp/particles/particles_adams_bashforth.hpp @@ -0,0 +1,113 @@ +#ifndef PARTICLES_ADAMS_BASHFORTH_HPP +#define PARTICLES_ADAMS_BASHFORTH_HPP + +#include <stdexcept> +#include <omp.h> + +#include "scope_timer.hpp" +#include "particles_utils.hpp" + +template <class real_number, int size_particle_positions = 3, int size_particle_rhs = 3> +class particles_adams_bashforth { +public: + static const int Max_steps = 6; + + void move_particles(real_number particles_positions[], + const int nb_particles, +
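// particles_rhs: history of the RHS, index [0] = most recent evaluation (ordering inferred from the Adams-Bashforth coefficients below) +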
const std::unique_ptr<real_number[]> particles_rhs[], + const int nb_rhs, const real_number dt) const{ + TIMEZONE("particles_adams_bashforth::move_particles"); + + if(Max_steps < nb_rhs){ + throw std::runtime_error("Error in bfps particles_adams_bashforth.\n" + "The requested number of steps is larger than the maximum implemented; " + "you must add the corresponding formula or limit the number of steps."); + } + + // Not needed: TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads()) + #pragma omp parallel default(shared) + { + particles_utils::IntervalSplitter<int> interval(nb_particles, + omp_get_num_threads(), + omp_get_thread_num()); + const int last_idx = interval.getMyOffset()+interval.getMySize(); + + // TODO full unroll + blocking + switch (nb_rhs){ + case 1: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × [0] + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * particles_rhs[0][idx_part*size_particle_rhs + idx_dim]; + } + } + break; + case 2: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × (3[0] - [1])/2 + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * (3.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim] + - particles_rhs[1][idx_part*size_particle_rhs + idx_dim])/2.; + } + } + break; + case 3: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × (23[0] - 16[1] + 5[2])/12 + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * (23.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim] + - 16.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim] + + 5.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim])/12.; + } + } + break; + case 4: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × (55[0] - 59[1] + 37[2] - 9[3])/24 + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * (55.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim] + - 59.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim] + + 37.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim] + - 9.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim])/24.; + } + } + break; + case 5: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × (1901[0] - 2774[1] + 2616[2] - 1274[3] + 251[4])/720 + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * (1901.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim] + - 2774.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim] + + 2616.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim] + - 1274.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim] + + 251.*particles_rhs[4][idx_part*size_particle_rhs + idx_dim])/720.; + } + } + break; + case 6: + for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){ + for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){ + // dt × (4277[0] - 7923[1] + 9982[2] - 7298[3] + 2877[4] - 475[5])/1440 + particles_positions[idx_part*size_particle_positions + idx_dim] + += dt * (4277.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim] + -
7923.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim] + + 9982.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim] + - 7298.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim] + + 2877.*particles_rhs[4][idx_part*size_particle_rhs + idx_dim] + - 475.*particles_rhs[5][idx_part*size_particle_rhs + idx_dim])/1440.; + } + } + break; + } + } + } +}; + + + +#endif diff --git a/bfps/cpp/particles/particles_field_computer.hpp b/bfps/cpp/particles/particles_field_computer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..80f4745070953509efee5af587230d6941287d14 --- /dev/null +++ b/bfps/cpp/particles/particles_field_computer.hpp @@ -0,0 +1,234 @@ +#ifndef PARTICLES_FIELD_COMPUTER_HPP +#define PARTICLES_FIELD_COMPUTER_HPP + +#include <array> +#include <utility> + +#include "abstract_particles_distr.hpp" +#include "scope_timer.hpp" +#include "particles_utils.hpp" + +template <class real_number, class interpolator_class, class field_class, int interp_neighbours, class positions_updater_class > +class particles_field_computer : public abstract_particles_distr<real_number, 3,3,1> { + using Parent = abstract_particles_distr<real_number, 3,3,1>; + + const std::array<size_t,3> field_grid_dim; + const std::pair<int,int> current_partition_interval; + + const interpolator_class& interpolator; + const field_class& field; + + const positions_updater_class positions_updater; + + const std::array<real_number,3> spatial_box_width; + const std::array<real_number,3> box_step_width; + const real_number my_spatial_low_limit_z; + const real_number my_spatial_up_limit_z; + + int deriv[3]; + + //////////////////////////////////////////////////////////////////////// + /// Computation related + //////////////////////////////////////////////////////////////////////// + + virtual void init_result_array(real_number particles_current_rhs[], + const int nb_particles) const final{ + // Set values to zero initially + std::fill(particles_current_rhs, particles_current_rhs+nb_particles*3, 0); + } + + real_number get_norm_pos_in_cell(const real_number in_pos, const int idx_pos) const { + const real_number cell_idx = floor(in_pos/box_step_width[idx_pos]); + const real_number pos_in_cell = (in_pos - cell_idx*box_step_width[idx_pos]) / box_step_width[idx_pos]; + assert(0 <= pos_in_cell && pos_in_cell < 1); + return pos_in_cell; + } + + virtual void apply_computation(const real_number particles_positions[], + real_number particles_current_rhs[], + const int nb_particles) const final{ + TIMEZONE("particles_field_computer::apply_computation"); + for(int idxPart = 0 ; idxPart < nb_particles ; ++idxPart){ + const real_number reltv_x = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_X], IDX_X); + const real_number reltv_y = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_Y], IDX_Y); + const real_number reltv_z = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_Z], IDX_Z); + + typename interpolator_class::real_number bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2]; + interpolator.compute_beta(deriv[IDX_X], reltv_x, bx); + interpolator.compute_beta(deriv[IDX_Y], reltv_y, by); + interpolator.compute_beta(deriv[IDX_Z], reltv_z, bz); + + const int partGridIdx_x = int(particles_positions[idxPart*3+IDX_X]/box_step_width[IDX_X]); + const int partGridIdx_y = int(particles_positions[idxPart*3+IDX_Y]/box_step_width[IDX_Y]); + const int partGridIdx_z = int(particles_positions[idxPart*3+IDX_Z]/box_step_width[IDX_Z]); + + assert(0 <= partGridIdx_x && partGridIdx_x < int(field_grid_dim[IDX_X])); + assert(0 <= partGridIdx_y && partGridIdx_y < int(field_grid_dim[IDX_Y])); + assert(0 <= partGridIdx_z && partGridIdx_z < int(field_grid_dim[IDX_Z])); + + const int interp_limit_mx = partGridIdx_x-interp_neighbours; + const int interp_limit_x = partGridIdx_x+interp_neighbours+1; + const int interp_limit_my = partGridIdx_y-interp_neighbours; + const int interp_limit_y = partGridIdx_y+interp_neighbours+1; + const int interp_limit_mz_bz = partGridIdx_z-interp_neighbours; + + int interp_limit_mz[2]; + int interp_limit_z[2]; + int nb_z_intervals; + + if((partGridIdx_z-interp_neighbours) < 0){ + assert(partGridIdx_z+interp_neighbours+1 < int(field_grid_dim[IDX_Z])); + interp_limit_mz[0] = ((partGridIdx_z-interp_neighbours)+field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z]; + interp_limit_z[0] = current_partition_interval.second-1; + + interp_limit_mz[1] = std::max(0, current_partition_interval.first);// max is not really needed here + interp_limit_z[1] = std::min(partGridIdx_z+interp_neighbours+1, current_partition_interval.second-1); + + nb_z_intervals = 2; + } + else if(int(field_grid_dim[IDX_Z]) <= (partGridIdx_z+interp_neighbours+1)){ + interp_limit_mz[0] = std::max(current_partition_interval.first, partGridIdx_z-interp_neighbours); + interp_limit_z[0] = std::min(int(field_grid_dim[IDX_Z])-1,current_partition_interval.second-1);// min is not really needed here + + interp_limit_mz[1] = std::max(0, current_partition_interval.first); + interp_limit_z[1] = std::min(int((partGridIdx_z+interp_neighbours+1+field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z]), current_partition_interval.second-1); + + nb_z_intervals = 2; + } + else{ + interp_limit_mz[0] = std::max(partGridIdx_z-interp_neighbours, current_partition_interval.first); + interp_limit_z[0] = std::min(partGridIdx_z+interp_neighbours+1, current_partition_interval.second-1); + nb_z_intervals = 1; + } + + for(int idx_inter = 0 ; idx_inter < nb_z_intervals ; ++idx_inter){ + for(int idx_z = interp_limit_mz[idx_inter] ; idx_z <= interp_limit_z[idx_inter] ; ++idx_z ){ + const int idx_z_pbc = (idx_z + field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z]; + assert(current_partition_interval.first <= idx_z_pbc && idx_z_pbc < current_partition_interval.second); + assert(((idx_z+field_grid_dim[IDX_Z]-interp_limit_mz_bz)%field_grid_dim[IDX_Z]) < interp_neighbours*2+2); + + for(int idx_x = interp_limit_mx ; idx_x <= interp_limit_x ; ++idx_x ){ + const int idx_x_pbc = (idx_x + field_grid_dim[IDX_X])%field_grid_dim[IDX_X]; + assert(idx_x-interp_limit_mx < interp_neighbours*2+2); + + for(int idx_y = interp_limit_my ; idx_y <= interp_limit_y ; ++idx_y ){ + const int idx_y_pbc = (idx_y + field_grid_dim[IDX_Y])%field_grid_dim[IDX_Y]; + assert(idx_y-interp_limit_my < interp_neighbours*2+2); + + const real_number coef = (bz[((idx_z+field_grid_dim[IDX_Z]-interp_limit_mz_bz)%field_grid_dim[IDX_Z])] + * by[idx_y-interp_limit_my] + * bx[idx_x-interp_limit_mx]); + + const ptrdiff_t tindex = field.getIndexFromGlobalPosition(idx_x_pbc, idx_y_pbc, idx_z_pbc); + + // getValue does not necessarily return real_number + particles_current_rhs[idxPart*3+IDX_X] += real_number(field.getValue(tindex,IDX_X))*coef; + particles_current_rhs[idxPart*3+IDX_Y] += real_number(field.getValue(tindex,IDX_Y))*coef; + particles_current_rhs[idxPart*3+IDX_Z] += real_number(field.getValue(tindex,IDX_Z))*coef; + } + } + } + } + } + } + + virtual void reduce_particles_rhs(real_number particles_current_rhs[], + const real_number extra_particles_current_rhs[], + const int nb_particles) const 
final{ + TIMEZONE("particles_field_computer::reduce_particles_rhs"); + // Simply sum values + for(int idxPart = 0 ; idxPart < nb_particles ; ++idxPart){ + particles_current_rhs[idxPart*3+IDX_X] += extra_particles_current_rhs[idxPart*3+IDX_X]; + particles_current_rhs[idxPart*3+IDX_Y] += extra_particles_current_rhs[idxPart*3+IDX_Y]; + particles_current_rhs[idxPart*3+IDX_Z] += extra_particles_current_rhs[idxPart*3+IDX_Z]; + } + } + + + //////////////////////////////////////////////////////////////////////// + /// Re-distribution related + //////////////////////////////////////////////////////////////////////// + + void apply_pbc_xy(real_number* inout_particles, const int size) const final { + TIMEZONE("particles_field_computer::apply_pbc_xy"); + const std::array<int, 2> dims_xy={IDX_X, IDX_Y}; + for(int idxPart = 0 ; idxPart < size ; ++idxPart){ + // Assume the particle never moves by more than one box repetition + for(const int idxDim : dims_xy){ + if(inout_particles[idxPart*3+idxDim] < 0) inout_particles[idxPart*3+idxDim] += spatial_box_width[idxDim]; + else if(spatial_box_width[idxDim] <= inout_particles[idxPart*3+idxDim]) inout_particles[idxPart*3+idxDim] -= spatial_box_width[idxDim]; + assert(0 <= inout_particles[idxPart*3+idxDim] && inout_particles[idxPart*3+idxDim] < spatial_box_width[idxDim]); + } + } + } + + void apply_pbc_z_new_particles(real_number* values, const int size) const final { + TIMEZONE("particles_field_computer::apply_pbc_z_new_particles"); + if(Parent::my_rank == 0){ + const int idxDim = IDX_Z; + for(int idxPart = 0 ; idxPart < size ; ++idxPart){ + assert(values[idxPart*3+idxDim] < my_spatial_up_limit_z || spatial_box_width[idxDim] <= values[idxPart*3+idxDim]); + assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim]); + + if(spatial_box_width[idxDim] <= values[idxPart*3+idxDim]) values[idxPart*3+idxDim] -= spatial_box_width[idxDim]; + + assert(0 <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < spatial_box_width[idxDim]); + assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z); + } + } + else if(Parent::my_rank == Parent::nb_processes_involved - 1){ + const int idxDim = IDX_Z; + for(int idxPart = 0 ; idxPart < size ; ++idxPart){ + assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] || values[idxPart*3+idxDim] < 0); + assert(values[idxPart*3+idxDim] < spatial_box_width[idxDim]); + + if(values[idxPart*3+idxDim] < 0) values[idxPart*3+idxDim] += spatial_box_width[idxDim]; + + assert(0 <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < spatial_box_width[idxDim]); + assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z); + } + } + else{ + const int idxDim = IDX_Z; + for(int idxPart = 0 ; idxPart < size ; ++idxPart){ + assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z); + } + } + } + +public: + + particles_field_computer(MPI_Comm in_current_com, const std::array<size_t,3>& in_field_grid_dim, + const std::pair<int,int>& in_current_partitions, + const interpolator_class& in_interpolator, + const field_class& in_field, + const std::array<real_number,3>& in_spatial_box_width, + const std::array<real_number,3>& in_box_step_width, const real_number in_my_spatial_low_limit_z, + const real_number in_my_spatial_up_limit_z) + : abstract_particles_distr<real_number, 3,3,1>(in_current_com, in_current_partitions), + field_grid_dim(in_field_grid_dim), current_partition_interval(in_current_partitions), 
+ interpolator(in_interpolator), field(in_field), positions_updater(), + spatial_box_width(in_spatial_box_width), box_step_width(in_box_step_width), + my_spatial_low_limit_z(in_my_spatial_low_limit_z), my_spatial_up_limit_z(in_my_spatial_up_limit_z){ + deriv[IDX_X] = 0; + deriv[IDX_Y] = 0; + deriv[IDX_Z] = 0; + } + + //////////////////////////////////////////////////////////////////////// + /// Update position + //////////////////////////////////////////////////////////////////////// + + void move_particles(real_number particles_positions[], + const int nb_particles, + const std::unique_ptr<real_number[]> particles_current_rhs[], + const int nb_rhs, const real_number dt) const final{ + TIMEZONE("particles_field_computer::move_particles"); + positions_updater.move_particles(particles_positions, nb_particles, + particles_current_rhs, nb_rhs, dt); + } + +}; + + +#endif diff --git a/bfps/cpp/particles/particles_input_hdf5.hpp b/bfps/cpp/particles/particles_input_hdf5.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ce1bddc152f459576d3e539ed1a73fde69a651e3 --- /dev/null +++ b/bfps/cpp/particles/particles_input_hdf5.hpp @@ -0,0 +1,305 @@ +#ifndef PARTICLES_INPUT_HDF5_HPP +#define PARTICLES_INPUT_HDF5_HPP + +#include <tuple> +#include <mpi.h> +#include <hdf5.h> +#include <cassert> +#include <vector> + +#include "abstract_particles_input.hpp" +#include "base.hpp" +#include "alltoall_exchanger.hpp" +#include "particles_utils.hpp" +#include "scope_timer.hpp" + + +// why is "size_particle_rhs" a template parameter? +// I think it's safe to assume this will always be 3. +template <class real_number, int size_particle_positions, int size_particle_rhs> +class particles_input_hdf5 : public abstract_particles_input<real_number> { + const std::string filename; + + MPI_Comm mpi_comm; + int my_rank; + int nb_processes; + + hsize_t nb_total_particles; + hsize_t nb_rhs; + int nb_particles_for_me; + + std::unique_ptr<real_number[]> my_particles_positions; + std::unique_ptr<int[]> my_particles_indexes; + std::vector<std::unique_ptr<real_number[]>> my_particles_rhs; + + static std::vector<real_number> BuildLimitsAllProcesses(MPI_Comm mpi_comm, + const real_number my_spatial_low_limit, const real_number my_spatial_up_limit){ + int my_rank; + int nb_processes; + + AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank)); + AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes)); + + std::vector<real_number> spatial_limit_per_proc(nb_processes*2); + + real_number intervalToSend[2] = {my_spatial_low_limit, my_spatial_up_limit}; + AssertMpi(MPI_Allgather(intervalToSend, 2, particles_utils::GetMpiType(real_number()), + spatial_limit_per_proc.data(), 2, particles_utils::GetMpiType(real_number()), mpi_comm)); + + for(int idx_proc = 0; idx_proc < nb_processes-1 ; ++idx_proc){ + assert(spatial_limit_per_proc[idx_proc*2] <= spatial_limit_per_proc[idx_proc*2+1]); + assert(spatial_limit_per_proc[idx_proc*2+1] == spatial_limit_per_proc[(idx_proc+1)*2]); + spatial_limit_per_proc[idx_proc+1] = spatial_limit_per_proc[idx_proc*2+1]; + } + spatial_limit_per_proc[nb_processes] = spatial_limit_per_proc[(nb_processes-1)*2+1]; + spatial_limit_per_proc.resize(nb_processes+1); + + return spatial_limit_per_proc; + } + +public: + particles_input_hdf5(const MPI_Comm in_mpi_comm,const std::string& inFilename, + const std::string& inDatanameState, const std::string& inDatanameRhs, + const real_number my_spatial_low_limit, const real_number my_spatial_up_limit) + : particles_input_hdf5(in_mpi_comm, inFilename, inDatanameState, 
inDatanameRhs, + BuildLimitsAllProcesses(in_mpi_comm, my_spatial_low_limit, my_spatial_up_limit)){ + } + + particles_input_hdf5(const MPI_Comm in_mpi_comm,const std::string& inFilename, + const std::string& inDatanameState, const std::string& inDatanameRhs, + const std::vector<real_number>& in_spatial_limit_per_proc) + : filename(inFilename), + mpi_comm(in_mpi_comm), my_rank(-1), nb_processes(-1), nb_total_particles(0), + nb_particles_for_me(-1){ + TIMEZONE("particles_input_hdf5"); + + AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank)); + AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes)); + assert(int(in_spatial_limit_per_proc.size()) == nb_processes+1); + + hid_t plist_id_par = H5Pcreate(H5P_FILE_ACCESS); + assert(plist_id_par >= 0); + { + int retTest = H5Pset_fapl_mpio(plist_id_par, mpi_comm, MPI_INFO_NULL); + assert(retTest >= 0); + } + + hid_t particle_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, plist_id_par); + assert(particle_file >= 0); + + { + TIMEZONE("state"); + hid_t dset = H5Dopen(particle_file, inDatanameState.c_str(), H5P_DEFAULT); + assert(dset >= 0); + + hid_t dspace = H5Dget_space(dset); // copy? + assert(dspace >= 0); + + hid_t space_dim = H5Sget_simple_extent_ndims(dspace); + assert(space_dim >= 2); + + std::vector<hsize_t> state_dim_array(space_dim); + int hdfret = H5Sget_simple_extent_dims(dspace, &state_dim_array[0], NULL); + assert(hdfret >= 0); + // Last value is the position dim of the particles + assert(state_dim_array.back() == size_particle_positions); + + nb_total_particles = 1; + for (size_t idx_dim = 0; idx_dim < state_dim_array.size()-1; ++idx_dim){ + nb_total_particles *= state_dim_array[idx_dim]; + } + + hdfret = H5Sclose(dspace); + assert(hdfret >= 0); + hdfret = H5Dclose(dset); + assert(hdfret >= 0); + } + { + TIMEZONE("rhs"); + hid_t dset = H5Dopen(particle_file, inDatanameRhs.c_str(), H5P_DEFAULT); + assert(dset >= 0); + hid_t dspace = H5Dget_space(dset); // copy? + assert(dspace >= 0); + + hid_t rhs_dim = H5Sget_simple_extent_ndims(dspace); + // Chichi comment: this assertion will fail in general, there's no reason for it. + //assert(rhs_dim == 4); + std::vector<hsize_t> rhs_dim_array(rhs_dim); + + // Chichi comment: wouldn't &rhs_dim_array.front() be safer? 
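+ // Expected dataset layout (inferred from the reads below): [nb_rhs][nparticles][size_particle_rhs], with nb_rhs taken from the first dimension.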
+ int hdfret = H5Sget_simple_extent_dims(dspace, &rhs_dim_array[0], NULL); + assert(hdfret >= 0); + assert(rhs_dim_array.back() == size_particle_rhs); + // Chichi comment: this assertion will fail in general + //assert(rhs_dim_array.front() == 1); + nb_rhs = rhs_dim_array[0]; + + hdfret = H5Sclose(dspace); + assert(hdfret >= 0); + hdfret = H5Dclose(dset); + assert(hdfret >= 0); + } + + particles_utils::IntervalSplitter<hsize_t> load_splitter(nb_total_particles, nb_processes, my_rank); + + static_assert(std::is_same<real_number, double>::value + || std::is_same<real_number, float>::value, "real_number must be double or float"); + const hid_t type_id = (sizeof(real_number) == 8?H5T_NATIVE_DOUBLE:H5T_NATIVE_FLOAT); + + /// Load the data + std::unique_ptr<real_number[]> split_particles_positions(new real_number[load_splitter.getMySize()*size_particle_positions]); + { + TIMEZONE("state-read"); + hid_t dset = H5Dopen(particle_file, inDatanameState.c_str(), H5P_DEFAULT); + assert(dset >= 0); + + hid_t rspace = H5Dget_space(dset); + assert(rspace >= 0); + + hsize_t offset[2] = {load_splitter.getMyOffset(), 0}; + hsize_t mem_dims[2] = {load_splitter.getMySize(), 3}; + + hid_t mspace = H5Screate_simple(2, &mem_dims[0], NULL); + assert(mspace >= 0); + + int rethdf = H5Sselect_hyperslab(rspace, H5S_SELECT_SET, offset, + NULL, mem_dims, NULL); + assert(rethdf >= 0); + rethdf = H5Dread(dset, type_id, mspace, rspace, H5P_DEFAULT, split_particles_positions.get()); + assert(rethdf >= 0); + + rethdf = H5Sclose(rspace); + assert(rethdf >= 0); + rethdf = H5Dclose(dset); + assert(rethdf >= 0); + } + std::vector<std::unique_ptr<real_number[]>> split_particles_rhs(nb_rhs); + { + TIMEZONE("rhs-read"); + hid_t dset = H5Dopen(particle_file, inDatanameRhs.c_str(), H5P_DEFAULT); + assert(dset >= 0); + + for(hsize_t idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){ + hid_t rspace = H5Dget_space(dset); + assert(rspace >= 0); + + split_particles_rhs[idx_rhs].reset(new real_number[load_splitter.getMySize()*size_particle_rhs]); + + hsize_t offset[3] = {idx_rhs, load_splitter.getMyOffset(), 0}; + hsize_t mem_dims[3] = {1, load_splitter.getMySize(), size_particle_rhs}; + + hid_t mspace = H5Screate_simple( 3, &mem_dims[0], NULL); + assert(mspace >= 0); + + int rethdf = H5Sselect_hyperslab( rspace, H5S_SELECT_SET, offset, + NULL, mem_dims, NULL); + assert(rethdf >= 0); + rethdf = H5Dread(dset, type_id, mspace, rspace, H5P_DEFAULT, split_particles_rhs[idx_rhs].get()); + assert(rethdf >= 0); + + rethdf = H5Sclose(mspace); + assert(rethdf >= 0); + + rethdf = H5Sclose(rspace); + assert(rethdf >= 0); + } + int rethdf = H5Dclose(dset); + assert(rethdf >= 0); + } + + std::unique_ptr<int[]> split_particles_indexes(new int[load_splitter.getMySize()]); + for(int idx_part = 0 ; idx_part < int(load_splitter.getMySize()) ; ++idx_part){ + split_particles_indexes[idx_part] = idx_part + load_splitter.getMyOffset(); + } + + // Permute + std::vector<int> nb_particles_per_proc(nb_processes); + { + TIMEZONE("partition"); + int previousOffset = 0; + for(int idx_proc = 0 ; idx_proc < nb_processes-1 ; ++idx_proc){ + const real_number limitPartition = in_spatial_limit_per_proc[idx_proc+1]; + const int localOffset = particles_utils::partition_extra<size_particle_positions>( + &split_particles_positions[previousOffset*size_particle_positions], + load_splitter.getMySize()-previousOffset, + [&](const real_number val[]){ + return val[IDX_Z] < limitPartition; + }, + [&](const int idx1, const int idx2){ + std::swap(split_particles_indexes[idx1], 
split_particles_indexes[idx2]); + for(int idx_rhs = 0 ; idx_rhs < int(nb_rhs) ; ++idx_rhs){ + for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){ + std::swap(split_particles_rhs[idx_rhs][idx1*size_particle_rhs + idx_val], + split_particles_rhs[idx_rhs][idx2*size_particle_rhs + idx_val]); + } + } + }, previousOffset); + + nb_particles_per_proc[idx_proc] = localOffset; + previousOffset += localOffset; + } + nb_particles_per_proc[nb_processes-1] = load_splitter.getMySize() - previousOffset; + } + + { + TIMEZONE("exchanger"); + alltoall_exchanger exchanger(mpi_com, std::move(nb_particles_per_proc)); + // nb_particles_per_proc cannot be used after due to the move + nb_particles_for_me = exchanger.getTotalToRecv(); + + my_particles_positions.reset(new real_number[exchanger.getTotalToRecv()*size_particle_positions]); + exchanger.alltoallv<real_number>(split_particles_positions.get(), my_particles_positions.get(), size_particle_positions); + split_particles_positions.reset(); // reset (not release) so the temporary buffer is actually freed + + my_particles_indexes.reset(new int[exchanger.getTotalToRecv()]); + exchanger.alltoallv<int>(split_particles_indexes.get(), my_particles_indexes.get()); + split_particles_indexes.reset(); + + my_particles_rhs.resize(nb_rhs); + for(int idx_rhs = 0 ; idx_rhs < int(nb_rhs) ; ++idx_rhs){ + my_particles_rhs[idx_rhs].reset(new real_number[exchanger.getTotalToRecv()*size_particle_rhs]); + exchanger.alltoallv<real_number>(split_particles_rhs[idx_rhs].get(), my_particles_rhs[idx_rhs].get(), size_particle_rhs); + } + } + + { + TIMEZONE("close"); + int hdfret = H5Fclose(particle_file); + assert(hdfret >= 0); + hdfret = H5Pclose(plist_id_par); + assert(hdfret >= 0); + } + } + + ~particles_input_hdf5(){ + } + + int getTotalNbParticles() final{ + return int(nb_total_particles); + } + + int getLocalNbParticles() final{ + return int(nb_particles_for_me); + } + + int getNbRhs() final{ + return int(nb_rhs); + } + + std::unique_ptr<real_number[]> getMyParticles() final { + assert(my_particles_positions != nullptr); + return std::move(my_particles_positions); + } + + std::vector<std::unique_ptr<real_number[]>> getMyRhs() final { + assert(my_particles_rhs.size() == nb_rhs); + return std::move(my_particles_rhs); + } + + std::unique_ptr<int[]> getMyParticlesIndexes() final { + assert(my_particles_indexes != nullptr); + return std::move(my_particles_indexes); + } +}; + +#endif diff --git a/bfps/cpp/particles/particles_interp_spline.hpp b/bfps/cpp/particles/particles_interp_spline.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d1a9f67da0cb0f360711b889363c05286ffc3009 --- /dev/null +++ b/bfps/cpp/particles/particles_interp_spline.hpp @@ -0,0 +1,201 @@ +#ifndef PARTICLES_INTER_SPLINE_HPP +#define PARTICLES_INTER_SPLINE_HPP + +template <class real_number, int interp_neighbours, int mode> +class particles_interp_spline; + +#include "spline_n1.hpp" + +template <> +class particles_interp_spline<double, 1,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n1_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 1,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n1_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 1,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double 
in_part_val, double poly_val[]) const { + beta_n1_m2(in_derivative, in_part_val, poly_val); + } +}; + +#include "spline_n2.hpp" + +template <> +class particles_interp_spline<double, 2,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n2_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 2,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n2_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 2,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n2_m2(in_derivative, in_part_val, poly_val); + } +}; + +#include "spline_n3.hpp" + +template <> +class particles_interp_spline<double, 3,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n3_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 3,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n3_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 3,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n3_m2(in_derivative, in_part_val, poly_val); + } +}; + +#include "spline_n4.hpp" + +template <> +class particles_interp_spline<double, 4,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n4_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 4,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n4_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 4,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n4_m2(in_derivative, in_part_val, poly_val); + } +}; + +#include "spline_n5.hpp" + +template <> +class particles_interp_spline<double, 5,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n5_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 5,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n5_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 5,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n5_m2(in_derivative, in_part_val, poly_val); + } +}; + +#include "spline_n6.hpp" + +template <> +class particles_interp_spline<double, 6,0>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + 
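// naming convention (inferred from the specializations in this file): beta_n{N}_m{M} gives the basis polynomials for N neighbour points per side and smoothness mode M +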
beta_n6_m0(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 6,1>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n6_m1(in_derivative, in_part_val, poly_val); + } +}; + +template <> +class particles_interp_spline<double, 6,2>{ +public: + using real_number = double; + + void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const { + beta_n6_m2(in_derivative, in_part_val, poly_val); + } +}; + + + +#endif diff --git a/bfps/cpp/particles/particles_output_hdf5.hpp b/bfps/cpp/particles/particles_output_hdf5.hpp new file mode 100644 index 0000000000000000000000000000000000000000..49a0b69e3d0c0b888a7c685570d4ba7296a230f5 --- /dev/null +++ b/bfps/cpp/particles/particles_output_hdf5.hpp @@ -0,0 +1,162 @@ +#ifndef PARTICLES_OUTPUT_HDF5_HPP +#define PARTICLES_OUTPUT_HDF5_HPP + +#include <memory> +#include <vector> +#include <hdf5.h> + +#include "abstract_particles_output.hpp" +#include "scope_timer.hpp" + +template <class real_number, int size_particle_positions, int size_particle_rhs> +class particles_output_hdf5 : public abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>{ + using Parent = abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>; + + const std::string filename; + + hid_t file_id; + const int total_nb_particles; + + const std::string datagroup_basename_state; + const std::string datagroup_basename_rhs; + + hid_t dset_id_state; + hid_t dset_id_rhs; + +public: + particles_output_hdf5(MPI_Comm in_mpi_com, const std::string in_filename, const int inTotalNbParticles, + const int in_nb_rhs, const std::string in_datagroup_basename_state, + const std::string in_datagroup_basename_rhs) + : abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>(in_mpi_com, inTotalNbParticles, in_nb_rhs), + filename(in_filename), + file_id(0), total_nb_particles(inTotalNbParticles), datagroup_basename_state(in_datagroup_basename_state), + datagroup_basename_rhs(in_datagroup_basename_rhs), dset_id_state(0), dset_id_rhs(0){ + if(datagroup_basename_state == datagroup_basename_rhs){ + DEBUG_MSG("The same dataset names have been passed to particles_output_hdf5 for the state and the rhs\n" + "This will result in undefined behavior.\n" + "Dataset name = %s\n", datagroup_basename_state.c_str()); + } + + TIMEZONE("particles_output_hdf5::H5Pcreate"); + hid_t plist_id_par = H5Pcreate(H5P_FILE_ACCESS); + assert(plist_id_par >= 0); + int retTest = H5Pset_fapl_mpio(plist_id_par, Parent::getCom(), MPI_INFO_NULL); + assert(retTest >= 0); + + // Parallel HDF5 write + file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR | H5F_ACC_DEBUG, plist_id_par); + // file_id = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC | H5F_ACC_DEBUG/*H5F_ACC_EXCL*/, H5P_DEFAULT/*H5F_ACC_RDWR*/, plist_id_par); + assert(file_id >= 0); + H5Pclose(plist_id_par); + + dset_id_state = H5Gopen(file_id, datagroup_basename_state.c_str(), H5P_DEFAULT); + assert(dset_id_state >= 0); + dset_id_rhs = H5Gopen(file_id, datagroup_basename_rhs.c_str(), H5P_DEFAULT); + assert(dset_id_rhs >= 0); + } + + ~particles_output_hdf5(){ + TIMEZONE("particles_output_hdf5::H5Dclose"); + + int rethdf = H5Gclose(dset_id_state); + assert(rethdf >= 0); + + rethdf = H5Gclose(dset_id_rhs); + assert(rethdf >= 0); + + rethdf = H5Fclose(file_id); + assert(rethdf >= 0); + } + 
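// Layout note (inferred from the H5Dcreate calls below): each call to write() creates + // a dataset named after the iteration number in each group, with shape + // [total_nb_particles][size_particle_positions] for the state and + // [nb_rhs][total_nb_particles][size_particle_rhs] for the rhs. +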
void write(const int idx_time_step, const real_number* particles_positions, const std::unique_ptr<real_number[]>* particles_rhs, + const int nb_particles, const int particles_idx_offset) final{ + TIMEZONE("particles_output_hdf5::write"); + + assert(particles_idx_offset < Parent::getTotalNbParticles()); + assert(particles_idx_offset+nb_particles <= Parent::getTotalNbParticles()); + + static_assert(std::is_same<real_number, double>::value + || std::is_same<real_number, float>::value, "real_number must be double or float"); + const hid_t type_id = (sizeof(real_number) == 8?H5T_NATIVE_DOUBLE:H5T_NATIVE_FLOAT); + + hid_t plist_id = H5Pcreate(H5P_DATASET_XFER); + assert(plist_id >= 0); + { + int rethdf = H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); + assert(rethdf >= 0); + } + + { + assert(total_nb_particles >= 0); + assert(size_particle_positions >= 0); + const hsize_t datacount[2] = {hsize_t(total_nb_particles), hsize_t(size_particle_positions)}; + hid_t dataspace = H5Screate_simple(2, datacount, NULL); + assert(dataspace >= 0); + + hid_t dataset_id = H5Dcreate( dset_id_state, std::to_string(idx_time_step).c_str(), type_id, dataspace, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + assert(dataset_id >= 0); + + assert(nb_particles >= 0); + assert(particles_idx_offset >= 0); + const hsize_t count[2] = {hsize_t(nb_particles), hsize_t(size_particle_positions)}; + const hsize_t offset[2] = {hsize_t(particles_idx_offset), 0}; + hid_t memspace = H5Screate_simple(2, count, NULL); + assert(memspace >= 0); + + hid_t filespace = H5Dget_space(dataset_id); + int rethdf = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL); + assert(rethdf >= 0); + + herr_t status = H5Dwrite(dataset_id, type_id, memspace, filespace, + plist_id, particles_positions); + assert(status >= 0); + rethdf = H5Sclose(memspace); + assert(rethdf >= 0); + rethdf = H5Dclose(dataset_id); + assert(rethdf >= 0); + rethdf = H5Sclose(filespace); + assert(rethdf >= 0); + rethdf = H5Sclose(dataspace); + assert(rethdf >= 0); + } + { + assert(size_particle_rhs >= 0); + const hsize_t datacount[3] = {hsize_t(Parent::getNbRhs()), hsize_t(total_nb_particles), hsize_t(size_particle_rhs)}; + hid_t dataspace = H5Screate_simple(3, datacount, NULL); + assert(dataspace >= 0); + + hid_t dataset_id = H5Dcreate( dset_id_rhs, std::to_string(idx_time_step).c_str(), type_id, dataspace, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + assert(dataset_id >= 0); + + assert(particles_idx_offset >= 0); + for(int idx_rhs = 0 ; idx_rhs < Parent::getNbRhs() ; ++idx_rhs){ + const hsize_t count[3] = {1, hsize_t(nb_particles), hsize_t(size_particle_rhs)}; + const hsize_t offset[3] = {hsize_t(idx_rhs), hsize_t(particles_idx_offset), 0}; + hid_t memspace = H5Screate_simple(3, count, NULL); + assert(memspace >= 0); + + hid_t filespace = H5Dget_space(dataset_id); + assert(filespace >= 0); + int rethdf = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL); + assert(rethdf >= 0); + + herr_t status = H5Dwrite(dataset_id, type_id, memspace, filespace, + plist_id, particles_rhs[idx_rhs].get()); + assert(status >= 0); + rethdf = H5Sclose(filespace); + assert(rethdf >= 0); + rethdf = H5Sclose(memspace); + assert(rethdf >= 0); + } + int rethdf = H5Dclose(dataset_id); + assert(rethdf >= 0); + rethdf = H5Sclose(dataspace); + assert(rethdf >= 0); + } + + { + int rethdf = H5Pclose(plist_id); + assert(rethdf >= 0); + } + } +}; + +#endif diff --git a/bfps/cpp/particles/particles_output_mpiio.hpp b/bfps/cpp/particles/particles_output_mpiio.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4a034c74b90f4a121a52308a460a9942b9dd1d29 --- /dev/null +++ b/bfps/cpp/particles/particles_output_mpiio.hpp @@ 
-0,0 +1,86 @@ +#ifndef PARTICLES_OUTPUT_MPIIO +#define PARTICLES_OUTPUT_MPIIO + +#include <memory> +#include <vector> +#include <string> +#include <cassert> + +#include "abstract_particles_output.hpp" +#include "scope_timer.hpp" +#include "particles_utils.hpp" + +template <class real_number, int size_particle_positions, int size_particle_rhs> +class particles_output_mpiio : public abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>{ + using Parent = abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>; + + const std::string filename; + const int nb_step_prealloc; + + int current_step_in_file; + + MPI_File mpi_file; + +public: + particles_output_mpiio(MPI_Comm in_mpi_com, const std::string in_filename, const int inTotalNbParticles, + const int in_nb_rhs, const int in_nb_step_prealloc = -1) + : abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>(in_mpi_com, inTotalNbParticles, in_nb_rhs), + filename(in_filename), nb_step_prealloc(in_nb_step_prealloc), current_step_in_file(0){ + { + TIMEZONE("particles_output_mpiio::MPI_File_open"); + AssertMpi(MPI_File_open(Parent::getCom(), const_cast<char*>(filename.c_str()), + MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &mpi_file)); + } + if(nb_step_prealloc != -1){ + TIMEZONE("particles_output_mpiio::MPI_File_set_size"); + AssertMpi(MPI_File_set_size(mpi_file, + nb_step_prealloc*Parent::getTotalNbParticles()*sizeof(real_number)*(size_particle_positions+size_particle_rhs*Parent::getNbRhs()))); + } + } + + ~particles_output_mpiio(){ + TIMEZONE("particles_output_mpiio::MPI_File_close"); + AssertMpi(MPI_File_close(&mpi_file)); + } + + void write(const int /*time_step*/, const real_number* particles_positions, const std::unique_ptr<real_number[]>* particles_rhs, + const int nb_particles, const int particles_idx_offset) final{ + TIMEZONE("particles_output_mpiio::write"); + + assert(nb_step_prealloc == -1 || current_step_in_file < nb_step_prealloc); + assert(particles_idx_offset < Parent::getTotalNbParticles()); + assert(particles_idx_offset+nb_particles <= Parent::getTotalNbParticles()); + + if(nb_step_prealloc == -1){ + TIMEZONE("particles_output_mpiio::write::MPI_File_set_size"); + AssertMpi(MPI_File_set_size(mpi_file, + (current_step_in_file+1)*Parent::getTotalNbParticles()*sizeof(real_number)*(size_particle_positions+size_particle_rhs*Parent::getNbRhs()))); + } + + const MPI_Offset globalParticlesOffset = current_step_in_file*Parent::getTotalNbParticles()*(size_particle_positions+size_particle_rhs*Parent::getNbRhs()) + + particles_idx_offset*size_particle_positions; + + const MPI_Offset writingOffset = globalParticlesOffset * sizeof(real_number); + + AssertMpi(MPI_File_write_at(mpi_file, writingOffset, + const_cast<real_number*>(particles_positions), nb_particles*size_particle_positions, particles_utils::GetMpiType(real_number()), + MPI_STATUS_IGNORE)); + + for(int idx_rhs = 0 ; idx_rhs < Parent::getNbRhs() ; ++idx_rhs){ + const MPI_Offset globalParticlesOffsetOutput = current_step_in_file*Parent::getTotalNbParticles()*(size_particle_positions+size_particle_rhs*Parent::getNbRhs()) + + Parent::getTotalNbParticles()*size_particle_positions + + idx_rhs*Parent::getTotalNbParticles()*size_particle_rhs + + particles_idx_offset*size_particle_rhs; + + const MPI_Offset writingOffsetOutput = globalParticlesOffsetOutput * sizeof(real_number); + + AssertMpi(MPI_File_write_at(mpi_file, writingOffsetOutput, + const_cast<real_number*>(particles_rhs[idx_rhs].get()), nb_particles*size_particle_rhs, 
particles_utils::GetMpiType(real_number()), + MPI_STATUS_IGNORE)); + } + + current_step_in_file += 1; + } +}; + +#endif diff --git a/bfps/cpp/particles/particles_system.hpp b/bfps/cpp/particles/particles_system.hpp new file mode 100644 index 0000000000000000000000000000000000000000..472ca95d86ff14448f56092c9122229abfeebd1e --- /dev/null +++ b/bfps/cpp/particles/particles_system.hpp @@ -0,0 +1,202 @@ +#ifndef PARTICLES_SYSTEM_HPP +#define PARTICLES_SYSTEM_HPP + +#include <array> + +#include "abstract_particles_system.hpp" +#include "particles_output_hdf5.hpp" +#include "particles_output_mpiio.hpp" +#include "particles_field_computer.hpp" +#include "field_accessor.hpp" +#include "abstract_particles_input.hpp" +#include "particles_adams_bashforth.hpp" +#include "scope_timer.hpp" + +template <class real_number, class field_rnumber, class interpolator_class, int interp_neighbours> +class particles_system : public abstract_particles_system<real_number> { + MPI_Comm mpi_com; + + const std::pair<int,int> current_partition_interval; + const int partition_interval_size; + + field_accessor<field_rnumber> field; + + interpolator_class interpolator; + + particles_field_computer<real_number, interpolator_class, field_accessor<field_rnumber>, interp_neighbours, particles_adams_bashforth<real_number, 3,3>> computer; + + std::unique_ptr<int[]> current_my_nb_particles_per_partition; + std::unique_ptr<int[]> current_offset_particles_for_partition; + + const std::array<real_number,3> spatial_box_width; + const std::array<real_number,3> spatial_partition_width; + const real_number my_spatial_low_limit; + const real_number my_spatial_up_limit; + + std::unique_ptr<real_number[]> my_particles_positions; + std::unique_ptr<int[]> my_particles_positions_indexes; + int my_nb_particles; + std::vector<std::unique_ptr<real_number[]>> my_particles_rhs; + + int step_idx; + +public: + particles_system(const std::array<size_t,3>& field_grid_dim, const std::array<real_number,3>& in_spatial_box_width, + const std::array<real_number,3>& in_spatial_partition_width, + const real_number in_my_spatial_low_limit, const real_number in_my_spatial_up_limit, + const field_rnumber* in_field_data, const std::array<size_t,3>& in_local_field_dims, + const std::array<size_t,3>& in_local_field_offset, + const std::array<size_t,3>& in_field_memory_dims, + MPI_Comm in_mpi_com) + : mpi_com(in_mpi_com), + current_partition_interval({in_local_field_offset[IDX_Z], in_local_field_offset[IDX_Z] + in_local_field_dims[IDX_Z]}), + partition_interval_size(current_partition_interval.second - current_partition_interval.first), + field(in_field_data, in_local_field_dims, in_local_field_offset, in_field_memory_dims), + interpolator(), + computer(in_mpi_com, field_grid_dim, current_partition_interval, + interpolator, field, in_spatial_box_width, in_spatial_partition_width, + in_my_spatial_low_limit, in_my_spatial_up_limit), + spatial_box_width(in_spatial_box_width), spatial_partition_width(in_spatial_partition_width), + my_spatial_low_limit(in_my_spatial_low_limit), my_spatial_up_limit(in_my_spatial_up_limit), + my_nb_particles(0), step_idx(1){ + + current_my_nb_particles_per_partition.reset(new int[partition_interval_size]); + current_offset_particles_for_partition.reset(new int[partition_interval_size+1]); + } + + ~particles_system(){ + } + + void init(abstract_particles_input<real_number>& particles_input){ + TIMEZONE("particles_system::init"); + + my_particles_positions = particles_input.getMyParticles(); + my_particles_positions_indexes = 
particles_input.getMyParticlesIndexes(); + my_particles_rhs = particles_input.getMyRhs(); + my_nb_particles = particles_input.getLocalNbParticles(); + + for(int idx_part = 0 ; idx_part < my_nb_particles ; ++idx_part){ // TODO remove me + assert(my_particles_positions[idx_part*3+IDX_Z] >= my_spatial_low_limit); + assert(my_particles_positions[idx_part*3+IDX_Z] < my_spatial_up_limit); + } + + particles_utils::partition_extra_z<3>(&my_particles_positions[0], my_nb_particles, partition_interval_size, + current_my_nb_particles_per_partition.get(), current_offset_particles_for_partition.get(), + [&](const int idxPartition){ + const real_number limitPartition = (idxPartition+1)*spatial_partition_width[IDX_Z] + my_spatial_low_limit; + return limitPartition; + }, + [&](const int idx1, const int idx2){ + std::swap(my_particles_positions_indexes[idx1], my_particles_positions_indexes[idx2]); + for(int idx_rhs = 0 ; idx_rhs < int(my_particles_rhs.size()) ; ++idx_rhs){ + for(int idx_val = 0 ; idx_val < 3 ; ++idx_val){ + std::swap(my_particles_rhs[idx_rhs][idx1*3 + idx_val], + my_particles_rhs[idx_rhs][idx2*3 + idx_val]); + } + } + }); + + {// TODO remove + for(int idxPartition = 0 ; idxPartition < partition_interval_size ; ++idxPartition){ + assert(current_my_nb_particles_per_partition[idxPartition] == + current_offset_particles_for_partition[idxPartition+1] - current_offset_particles_for_partition[idxPartition]); + const real_number limitPartition = (idxPartition+1)*spatial_partition_width[IDX_Z] + my_spatial_low_limit; + for(int idx = 0 ; idx < current_offset_particles_for_partition[idxPartition+1] ; ++idx){ + assert(my_particles_positions[idx*3+IDX_Z] < limitPartition); + } + for(int idx = current_offset_particles_for_partition[idxPartition+1] ; idx < my_nb_particles ; ++idx){ + assert(my_particles_positions[idx*3+IDX_Z] >= limitPartition); + } + } + } + } + + + void compute() final { + TIMEZONE("particles_system::compute"); + computer.compute_distr(current_my_nb_particles_per_partition.get(), + my_particles_positions.get(), + my_particles_rhs.front().get(), + interp_neighbours); + } + + void move(const real_number dt) final { + TIMEZONE("particles_system::move"); + computer.move_particles(my_particles_positions.get(), my_nb_particles, + my_particles_rhs.data(), std::min(step_idx+1,int(my_particles_rhs.size())), + dt); + } + + void redistribute() final { + TIMEZONE("particles_system::redistribute"); + computer.redistribute(current_my_nb_particles_per_partition.get(), + &my_nb_particles, + &my_particles_positions, + my_particles_rhs.data(), my_particles_rhs.size(), + &my_particles_positions_indexes, + my_spatial_low_limit, + my_spatial_up_limit, + spatial_partition_width[IDX_Z]); + } + + void inc_step_idx() final { + step_idx += 1; + } + + void shift_rhs_vectors() final { + if(my_particles_rhs.size()){ + std::unique_ptr<real_number[]> next_current(std::move(my_particles_rhs.back())); + for(int idx_rhs = my_particles_rhs.size()-1 ; idx_rhs > 0 ; --idx_rhs){ + my_particles_rhs[idx_rhs] = std::move(my_particles_rhs[idx_rhs-1]); + } + my_particles_rhs[0] = std::move(next_current); + particles_utils::memzero(my_particles_rhs[0], 3*my_nb_particles); + } + } + + void completeLoop(const real_number dt) final { + TIMEZONE("particles_system::completeLoop"); + compute(); + move(dt); + redistribute(); + inc_step_idx(); + shift_rhs_vectors(); + } + + const real_number* getParticlesPositions() const final { + return my_particles_positions.get(); + } + + const std::unique_ptr<real_number[]>* getParticlesRhs() const 
final { + return my_particles_rhs.data(); + } + + const int* getParticlesIndexes() const final { + return my_particles_positions_indexes.get(); + } + + int getLocalNbParticles() const final { + return my_nb_particles; + } + + int getNbRhs() const final { + return int(my_particles_rhs.size()); + } + + void checkNan() const { // TODO remove + for(int idx_part = 0 ; idx_part < my_nb_particles ; ++idx_part){ // TODO remove me + assert(std::isnan(my_particles_positions[idx_part*3+IDX_X]) == false); + assert(std::isnan(my_particles_positions[idx_part*3+IDX_Y]) == false); + assert(std::isnan(my_particles_positions[idx_part*3+IDX_Z]) == false); + + for(int idx_rhs = 0 ; idx_rhs < int(my_particles_rhs.size()) ; ++idx_rhs){ + assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_X]) == false); + assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_Y]) == false); + assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_Z]) == false); + } + } + } +}; + + +#endif diff --git a/bfps/cpp/particles/particles_system_builder.hpp b/bfps/cpp/particles/particles_system_builder.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d314ab5001410ae0c2529395e7910614b432819a --- /dev/null +++ b/bfps/cpp/particles/particles_system_builder.hpp @@ -0,0 +1,253 @@ +#ifndef PARTICLES_SYSTEM_BUILDER_HPP +#define PARTICLES_SYSTEM_BUILDER_HPP + +#include <string> + +#include "abstract_particles_system.hpp" +#include "particles_system.hpp" +#include "particles_input_hdf5.hpp" +#include "particles_interp_spline.hpp" + +#include "field.hpp" +#include "kspace.hpp" + + + +////////////////////////////////////////////////////////////////////////////// +/// +/// Double template "for" +/// +////////////////////////////////////////////////////////////////////////////// + +namespace Template_double_for_if{ + +template <class RetType, + class IterType1, IterType1 CurrentIter1, + class IterType2, const IterType2 CurrentIter2, const IterType2 iterTo2, const IterType2 IterStep2, + class Func, bool IsNotOver, typename... Args> +struct For2{ + static RetType evaluate(IterType2 value2, Args... args){ + if(CurrentIter2 == value2){ + return std::move(Func::template instanciate<CurrentIter1, CurrentIter2>(args...)); + } + else{ + return std::move(For2<RetType, + IterType1, CurrentIter1, + IterType2, CurrentIter2+IterStep2, iterTo2, IterStep2, + Func, (CurrentIter2+IterStep2 < iterTo2), Args...>::evaluate(value2, args...)); + } + } +}; + +template <class RetType, + class IterType1, IterType1 CurrentIter1, + class IterType2, const IterType2 CurrentIter2, const IterType2 iterTo2, const IterType2 IterStep2, + class Func, typename... Args> +struct For2<RetType, + IterType1, CurrentIter1, + IterType2, CurrentIter2, iterTo2, IterStep2, + Func, false, Args...>{ + static RetType evaluate(IterType2 value2, Args... args){ + std::cout << __FUNCTION__ << "[ERROR] template value for loop 2 " << value2 << " does not exist\n"; + return RetType(); + } +}; + 
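// Compile-time double loop (sketch of the intent): evaluate(value1, value2, ...) walks the + // template counters until they match the runtime values, then calls + // Func::template instanciate<value1, value2>(args...); the "false" specializations + // are the out-of-range terminators that report an error. +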
template <class RetType, + class IterType1, const IterType1 CurrentIter1, const IterType1 iterTo1, const IterType1 IterStep1, + class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2, + class Func, bool IsNotOver, typename... Args> +struct For1{ + static RetType evaluate(IterType1 value1, IterType2 value2, Args... args){ + if(CurrentIter1 == value1){ + return std::move(For2<RetType, + IterType1, CurrentIter1, + IterType2, IterFrom2, iterTo2, IterStep2, + Func, (IterFrom2<iterTo2), Args...>::evaluate(value2, args...)); + } + else{ + return std::move(For1<RetType, + IterType1, CurrentIter1+IterStep1, iterTo1, IterStep1, + IterType2, IterFrom2, iterTo2, IterStep2, + Func, (CurrentIter1+IterStep1 < iterTo1), Args...>::evaluate(value1, value2, args...)); + } + } +}; + +template <class RetType, + class IterType1, const IterType1 IterFrom1, const IterType1 iterTo1, const IterType1 IterStep1, + class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2, + class Func, typename... Args> +struct For1<RetType, + IterType1, IterFrom1, iterTo1, IterStep1, + IterType2, IterFrom2, iterTo2, IterStep2, + Func, false, Args...>{ + static RetType evaluate(IterType1 value1, IterType2 value2, Args... args){ + std::cout << __FUNCTION__ << "[ERROR] template value for loop 1 " << value1 << " does not exist\n"; + return RetType(); + } +}; + +template <class RetType, + class IterType1, const IterType1 IterFrom1, const IterType1 iterTo1, const IterType1 IterStep1, + class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2, + class Func, typename... Args> +inline RetType evaluate(IterType1 value1, IterType2 value2, Args... args){ + return std::move(For1<RetType, + IterType1, IterFrom1, iterTo1, IterStep1, + IterType2, IterFrom2, iterTo2, IterStep2, + Func, (IterFrom1<iterTo1), Args...>::evaluate(value1, value2, args...)); +} + +} + + +////////////////////////////////////////////////////////////////////////////// +/// +/// Builder Functions +/// +////////////////////////////////////////////////////////////////////////////// + +template <class field_rnumber, field_backend be, class particles_rnumber> +struct particles_system_build_container { + template <const int interpolation_size, const int spline_mode> + static std::unique_ptr<abstract_particles_system<particles_rnumber>> instanciate( + const field<field_rnumber, be, THREE>* fs_field, // (field object) + const kspace<be, SMOOTH>* fs_kk, // (kspace object, contains dkx, dky, dkz) + const int nsteps, // to check coherency between parameters and hdf input file (nb rhs) + const int nparticles, // to check coherency between parameters and hdf input file + const std::string& fname_input, // particles input filename + const std::string& inDatanameState, const std::string& inDatanameRhs, // input dataset names + MPI_Comm mpi_comm){ + + // The size of the field grid (global size) + std::array<size_t,3> field_grid_dim; + field_grid_dim[IDX_X] = fs_field->rlayout->sizes[IDX_X];// nx + field_grid_dim[IDX_Y] = fs_field->rlayout->sizes[IDX_Y];// ny + field_grid_dim[IDX_Z] = fs_field->rlayout->sizes[IDX_Z];// nz + + // The size of the local field grid (the field nodes that belong to current process) + std::array<size_t,3> local_field_dims; + local_field_dims[IDX_X] = fs_field->rlayout->subsizes[IDX_X]; + local_field_dims[IDX_Y] = fs_field->rlayout->subsizes[IDX_Y]; + local_field_dims[IDX_Z] = fs_field->rlayout->subsizes[IDX_Z]; + + // The offset of the local field grid + std::array<size_t,3> local_field_offset; + local_field_offset[IDX_X] = fs_field->rlayout->starts[IDX_X]; + local_field_offset[IDX_Y] = fs_field->rlayout->starts[IDX_Y]; + local_field_offset[IDX_Z] = fs_field->rlayout->starts[IDX_Z]; + + + // Retrieve the split from FFTW to know which processes have no work + int my_rank, nb_processes; + 
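// Assumed FFTW-style block split: z-planes are distributed in blocks of ceil(nz/nb_processes), + // so trailing ranks may hold no planes at all (checked by the asserts below). +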
AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank)); + AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes)); + + const int split_step = (int(field_grid_dim[IDX_Z])+nb_processes-1)/nb_processes; + const int nb_processes_involved = (int(field_grid_dim[IDX_Z])+split_step-1)/split_step; + + assert((my_rank < nb_processes_involved && local_field_dims[IDX_Z] != 0) + || (nb_processes_involved <= my_rank && local_field_dims[IDX_Z] == 0)); + assert(nb_processes_involved <= int(field_grid_dim[IDX_Z])); + + // Make the idle processes start from the upper limit (and not from 0, as set by fftw) + if(nb_processes_involved <= my_rank){ + local_field_offset[IDX_Z] = field_grid_dim[IDX_Z]; + } + + // Ensure that 1D partitioning is used + { + assert(local_field_offset[IDX_X] == 0); + assert(local_field_offset[IDX_Y] == 0); + assert(local_field_dims[IDX_X] == field_grid_dim[IDX_X]); + assert(local_field_dims[IDX_Y] == field_grid_dim[IDX_Y]); + + assert(my_rank >= nb_processes_involved || ((my_rank == 0 && local_field_offset[IDX_Z] == 0) + || (my_rank != 0 && local_field_offset[IDX_Z] != 0))); + assert(my_rank >= nb_processes_involved || ((my_rank == nb_processes_involved-1 && local_field_offset[IDX_Z]+local_field_dims[IDX_Z] == field_grid_dim[IDX_Z]) + || (my_rank != nb_processes_involved-1 && local_field_offset[IDX_Z]+local_field_dims[IDX_Z] != field_grid_dim[IDX_Z]))); + } + + // The padded memory dimensions of the local field grid + std::array<size_t,3> local_field_mem_size; + local_field_mem_size[IDX_X] = fs_field->rmemlayout->subsizes[IDX_X]; + local_field_mem_size[IDX_Y] = fs_field->rmemlayout->subsizes[IDX_Y]; + local_field_mem_size[IDX_Z] = fs_field->rmemlayout->subsizes[IDX_Z]; + + // The spatial box size (all particles should be included inside) + std::array<particles_rnumber,3> spatial_box_width; + spatial_box_width[IDX_X] = 4 * acos(0) / (fs_kk->dkx); // 4*acos(0) == 2*pi + spatial_box_width[IDX_Y] = 4 * acos(0) / (fs_kk->dky); + spatial_box_width[IDX_Z] = 4 * acos(0) / (fs_kk->dkz); + + // The distance between two field nodes in each direction + std::array<particles_rnumber,3> spatial_partition_width; + spatial_partition_width[IDX_X] = spatial_box_width[IDX_X]/particles_rnumber(field_grid_dim[IDX_X]); + spatial_partition_width[IDX_Y] = spatial_box_width[IDX_Y]/particles_rnumber(field_grid_dim[IDX_Y]); + spatial_partition_width[IDX_Z] = spatial_box_width[IDX_Z]/particles_rnumber(field_grid_dim[IDX_Z]); + // The spatial interval of the current process + const particles_rnumber my_spatial_low_limit_z = particles_rnumber(local_field_offset[IDX_Z])*spatial_partition_width[IDX_Z]; + const particles_rnumber my_spatial_up_limit_z = particles_rnumber(local_field_offset[IDX_Z]+local_field_dims[IDX_Z])*spatial_partition_width[IDX_Z]; + + // Create the particles system + particles_system<particles_rnumber, field_rnumber, particles_interp_spline<particles_rnumber, interpolation_size,spline_mode>, interpolation_size>* part_sys + = new particles_system<particles_rnumber, field_rnumber, particles_interp_spline<particles_rnumber, interpolation_size,spline_mode>, interpolation_size>(field_grid_dim, + spatial_box_width, + spatial_partition_width, + my_spatial_low_limit_z, + my_spatial_up_limit_z, + fs_field->get_rdata(), + local_field_dims, + local_field_offset, + local_field_mem_size, + mpi_comm); + + // Load particles from hdf5 + particles_input_hdf5<particles_rnumber, 3,3> generator(mpi_comm, fname_input, + inDatanameState, inDatanameRhs, my_spatial_low_limit_z, my_spatial_up_limit_z); + + // Ensure parameters match the input file + if(generator.getNbRhs() != nsteps){ + 
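// a mismatch here means the input file stores a different Adams-Bashforth + // history depth than this run expects; restarting silently would integrate + // with inconsistent rhs data, so this is treated as a fatal error + 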
std::runtime_error(std::string("Nb steps is ") + std::to_string(nsteps) + + " in the parameters but " + std::to_string(generator.getNbRhs()) + " in the particles file."); + } + // Ensure parameters match the input file + if(generator.getTotalNbParticles() != nparticles){ + std::runtime_error(std::string("Nb particles is ") + std::to_string(nparticles) + + " in the parameters but " + std::to_string(generator.getTotalNbParticles()) + " in the particles file."); + } + + // Load the particles and move them to the particles system + part_sys->init(generator); + + assert(part_sys->getNbRhs() == nsteps); + + // Return the created particles system + return std::unique_ptr<abstract_particles_system<particles_rnumber>>(part_sys); + } +}; + + +template <class field_rnumber, field_backend be, class particles_rnumber = double> +inline std::unique_ptr<abstract_particles_system<particles_rnumber>> particles_system_builder( + const field<field_rnumber, be, THREE>* fs_field, // (field object) + const kspace<be, SMOOTH>* fs_kk, // (kspace object, contains dkx, dky, dkz) + const int nsteps, // to check coherency between parameters and hdf input file (nb rhs) + const int nparticles, // to check coherency between parameters and hdf input file + const std::string& fname_input, // particles input filename + const std::string& inDatanameState, const std::string& inDatanameRhs, // input dataset names + const int interpolation_size, + const int spline_mode, + MPI_Comm mpi_comm){ + return Template_double_for_if::evaluate<std::unique_ptr<abstract_particles_system<particles_rnumber>>, + int, 1, 7, 1, // interpolation_size + int, 0, 3, 1, // spline_mode + particles_system_build_container<field_rnumber,be,particles_rnumber>>( + interpolation_size, // template iterator 1 + spline_mode, // template iterator 2 + fs_field,fs_kk, nsteps, nparticles, fname_input, inDatanameState, inDatanameRhs, mpi_comm); +} + + +#endif diff --git a/bfps/cpp/particles/particles_utils.hpp b/bfps/cpp/particles/particles_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0ebd79641bec71671fbcfa5788cf2134dad61b0e --- /dev/null +++ b/bfps/cpp/particles/particles_utils.hpp @@ -0,0 +1,300 @@ +#ifndef PARTICLES_UTILS_HPP +#define PARTICLES_UTILS_HPP + +#include <mpi.h> + +#include <cassert> +#include <stack> +#include <vector> +#include <memory> +#include <cstring> + +#if _OPENMP < 201511 +#warning Openmp priority is not supported here +#define priority(x) +#endif + + +#ifndef AssertMpi +#define AssertMpi(X) if(MPI_SUCCESS != (X)) { printf("MPI Error at line %d\n",__LINE__); fflush(stdout) ; throw std::runtime_error("Stop from from mpi erro"); } +#endif + +enum IDXS_3D { + IDX_X = 2, + IDX_Y = 1, + IDX_Z = 0 +}; + +namespace particles_utils { + +class GetMpiType{ + const MPI_Datatype type; +public: + explicit GetMpiType(const int&) : type(MPI_INT){} + explicit GetMpiType(const double&) : type(MPI_DOUBLE){} + explicit GetMpiType(const float&) : type(MPI_FLOAT){} + explicit GetMpiType(const char&) : type(MPI_CHAR){} + explicit GetMpiType(const long&) : type(MPI_LONG){} + + /*do not make it explicit*/ operator MPI_Datatype() const { return type; } +}; + + +template <int nb_values, class real_number, class Predicate> +inline int partition(real_number* array, const int size, Predicate pdc) +{ + if(size == 0) return 0; + if(size == 1) return (pdc(&array[0])?1:0); + + int idxInsert = 0; + + for(int idx = 0 ; idx < size && pdc(&array[idx*nb_values]); ++idx){ + idxInsert += 1; + } + + for(int idx = idxInsert ; idx < size ; ++idx){ + 
if(pdc(&array[idx*nb_values])){ + for(int idxVal = 0 ; idxVal < nb_values ; ++idxVal){ + std::swap(array[idx*nb_values + idxVal], array[idxInsert*nb_values + idxVal]); + } + idxInsert += 1; + } + } + + return idxInsert; +} + + +template <int nb_values, class real_number, class Predicate1, class Predicate2> +inline int partition_extra(real_number* array, const int size, Predicate1 pdc, Predicate2 pdcswap, const int offset_idx_swap = 0) +{ + if(size == 0) return 0; + if(size == 1) return (pdc(&array[0])?1:0); + + int idxInsert = 0; + + for(int idx = 0 ; idx < size && pdc(&array[idx*nb_values]); ++idx){ + idxInsert += 1; + } + + for(int idx = idxInsert ; idx < size ; ++idx){ + if(pdc(&array[idx*nb_values])){ + for(int idxVal = 0 ; idxVal < nb_values ; ++idxVal){ + std::swap(array[idx*nb_values + idxVal], array[idxInsert*nb_values + idxVal]); + } + pdcswap(idx+offset_idx_swap, idxInsert+offset_idx_swap); + idxInsert += 1; + } + } + + return idxInsert; +} + +template <int nb_values, class real_number, class Predicate1, class Predicate2> +inline void partition_extra_z(real_number* array, const int size, const int nb_partitions, + int partitions_size[], int partitions_offset[], + Predicate1 partitions_limits, Predicate2 pdcswap) +{ + if(nb_partitions == 0){ + return ; + } + + partitions_offset[0] = 0; + partitions_offset[nb_partitions] = size; + + if(nb_partitions == 1){ + partitions_size[0] = size; + return; + } + + if(nb_partitions == 2){ + const real_number limit = partitions_limits(0); + const int size_current = partition_extra<nb_values>(array, size, + [&](const real_number inval[]){ + return inval[IDX_Z] < limit; + }, pdcswap); + partitions_size[0] = size_current; + partitions_size[1] = size-size_current; + partitions_offset[1] = size_current; + return; + } + + std::stack<std::pair<int,int>> toproceed; + + toproceed.push({0, nb_partitions}); + + while(toproceed.size()){ + const std::pair<int,int> current_part = toproceed.top(); + toproceed.pop(); + + assert(current_part.second-current_part.first >= 1); + + if(current_part.second-current_part.first == 1){ + partitions_size[current_part.first] = partitions_offset[current_part.first+1] - partitions_offset[current_part.first]; + } + else{ + const int idx_middle = (current_part.second-current_part.first)/2 + current_part.first - 1; + + const int size_unpart = partitions_offset[current_part.second]- partitions_offset[current_part.first]; + + const real_number limit = partitions_limits(idx_middle); + const int size_current = partition_extra<nb_values>(&array[partitions_offset[current_part.first]*nb_values], + size_unpart, + [&](const real_number inval[]){ + return inval[IDX_Z] < limit; + }, pdcswap, partitions_offset[current_part.first]); + + partitions_offset[idx_middle+1] = size_current + partitions_offset[current_part.first]; + + toproceed.push({current_part.first, idx_middle+1}); + + toproceed.push({idx_middle+1, current_part.second}); + } + } +} + +template <int nb_values, class real_number, class Predicate1, class Predicate2> +inline std::pair<std::vector<int>,std::vector<int>> partition_extra_z(real_number* array, const int size, + const int nb_partitions, Predicate1 partitions_limits, + Predicate2 pdcswap){ + + std::vector<int> partitions_size(nb_partitions); + std::vector<int> partitions_offset(nb_partitions+1); + partition_extra_z<nb_values, real_number, Predicate1, Predicate2>(array, size, nb_partitions, + partitions_size.data(), partitions_offset.data(), + partitions_limits, pdcswap); + return {std::move(partitions_size), 
std::move(partitions_offset)}; +} + + +template <class NumType = int> +class IntervalSplitter { + const NumType nb_items; + const NumType nb_intervals; + const NumType my_idx; + + double step_split; + NumType offset_mine; + NumType size_mine; +public: + IntervalSplitter(const NumType in_nb_items, + const NumType in_nb_intervals, + const NumType in_my_idx) + : nb_items(in_nb_items), nb_intervals(in_nb_intervals), my_idx(in_my_idx), + step_split(0), offset_mine(0), size_mine(0){ + if(nb_items <= nb_intervals){ + step_split = 1; + if(my_idx < nb_items){ + offset_mine = my_idx; + size_mine = 1; + } + else{ + offset_mine = nb_intervals; + size_mine = 0; + } + } + else{ + step_split = double(nb_items)/double(nb_intervals); + offset_mine = NumType(step_split*double(my_idx)); + size_mine = (my_idx != nb_intervals-1 ? NumType(step_split*double(my_idx+1)) : nb_items) -offset_mine; + } + } + + NumType getMySize() const { + return size_mine; + } + + NumType getMyOffset() const { + return offset_mine; + } + + NumType getSizeOther(const NumType in_idx_other) const { + return IntervalSplitter<NumType>(nb_items, nb_intervals, in_idx_other).getMySize(); + } + + NumType getOffsetOther(const NumType in_idx_other) const { + return IntervalSplitter<NumType>(nb_items, nb_intervals, in_idx_other).getMyOffset(); + } + + NumType getOwner(const NumType in_item_idx) const { + NumType owner = NumType(double(in_item_idx)/step_split); + if(owner != nb_intervals-1 && NumType(step_split*double(owner+1)) <= in_item_idx){ + owner += 1; + } + assert(owner < nb_intervals); + assert(IntervalSplitter(nb_items, nb_intervals, owner).getMyOffset() <= in_item_idx); + assert(in_item_idx < IntervalSplitter(nb_items, nb_intervals, owner).getMySize()+IntervalSplitter(nb_items, nb_intervals, owner).getMyOffset()); + return owner; + } +}; + +// http://en.cppreference.com/w/cpp/algorithm/transform +template<class InputIt, class OutputIt, class UnaryOperation> +OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first, + UnaryOperation unary_op) +{ + while (first1 != last1) { + *d_first++ = unary_op(*first1++); + } + return d_first; +} + + +template <class NumType> +void memzero(NumType* array, size_t size){ + memset(array, 0, size*sizeof(NumType)); +} + +template <class NumType> +void memzero(std::unique_ptr<NumType[]>& array, size_t size){ + memset(array.get(), 0, size*sizeof(NumType)); +} + + +class fixed_copy { + const size_t to_idx; + const size_t from_idx; + const size_t nb_elements_to_copy; + +public: + fixed_copy(const size_t in_to_idx, const size_t in_from_idx, const size_t in_nb_elements_to_copy) + : to_idx(in_to_idx), from_idx(in_from_idx), nb_elements_to_copy(in_nb_elements_to_copy){ + } + + fixed_copy(const size_t in_to_idx, const size_t in_nb_elements_to_copy) + : fixed_copy(in_to_idx, 0, in_nb_elements_to_copy){ + } + + fixed_copy(const size_t in_nb_elements_to_copy) + : fixed_copy(0, in_nb_elements_to_copy){ + } + + template <class ItemType> + const fixed_copy& copy(ItemType dest[], const ItemType source[]) const { + memcpy(&dest[to_idx], &source[from_idx], sizeof(ItemType)*nb_elements_to_copy); + return *this; + } + + template <class ItemType> + const fixed_copy& copy(ItemType dest[], const ItemType source[], const size_t nb_values_per_element) const { + memcpy(&dest[to_idx*nb_values_per_element], &source[from_idx*nb_values_per_element], sizeof(ItemType)*nb_elements_to_copy*nb_values_per_element); + return *this; + } + + template <class ItemType> + const fixed_copy& copy(std::unique_ptr<ItemType[]>& dest, const 
std::unique_ptr<ItemType[]>& source) const { + memcpy(&dest[to_idx], &source[from_idx], sizeof(ItemType)*nb_elements_to_copy); + return *this; + } + + template <class ItemType> + const fixed_copy& copy(std::unique_ptr<ItemType[]>& dest, const std::unique_ptr<ItemType[]>& source, const size_t nb_values_per_element) const { + memcpy(&dest[to_idx*nb_values_per_element], &source[from_idx*nb_values_per_element], sizeof(ItemType)*nb_elements_to_copy*nb_values_per_element); + return *this; + } +}; + + +} + +#endif diff --git a/bfps/cpp/particles_base.cpp b/bfps/cpp/particles_base.cpp index ff0fec32d4f0493814351788ca25081adfb27a12..1410488410a429ff463a1751e86f78cc2157679b 100644 --- a/bfps/cpp/particles_base.cpp +++ b/bfps/cpp/particles_base.cpp @@ -29,6 +29,7 @@ #include <algorithm> #include <cassert> #include "particles_base.hpp" +#include "scope_timer.hpp" template <particle_types particle_type> single_particle_state<particle_type>::single_particle_state() @@ -88,6 +89,7 @@ int get_chunk_offsets( std::vector<hsize_t> chnk_dims, std::vector<std::vector<hsize_t>> &co) { + TIMEZONE("get_chunk_offsets"); std::vector<hsize_t> nchunks(data_dims); int total_number_of_chunks = 1; for (unsigned i=0; i<nchunks.size(); i++) @@ -121,6 +123,7 @@ particles_io_base<particle_type>::particles_io_base( const hid_t data_file_id, MPI_Comm COMM) { + TIMEZONE("particles_io_base::particles_io_base"); this->name = std::string(NAME); this->traj_skip = TRAJ_SKIP; this->comm = COMM; @@ -233,6 +236,7 @@ void particles_io_base<particle_type>::read_state_chunk( const int cindex, double *data) { + TIMEZONE("particles_io_base::read_state_chunk"); DEBUG_MSG("entered read_state_chunk\n"); hid_t dset = H5Dopen(this->hdf5_group_id, "state", H5P_DEFAULT); hid_t rspace = H5Dget_space(dset); @@ -267,6 +271,7 @@ void particles_io_base<particle_type>::write_state_chunk( const int cindex, const double *data) { + TIMEZONE("particles_io_base::write_state_chunk"); hid_t dset = H5Dopen(this->hdf5_group_id, "state", H5P_DEFAULT); hid_t rspace = H5Dget_space(dset); std::vector<hsize_t> mem_dims(this->hdf5_state_chunks); @@ -300,6 +305,7 @@ void particles_io_base<particle_type>::read_rhs_chunk( const int rhsindex, double *data) { + TIMEZONE("particles_io_base::read_rhs_chunk"); //DEBUG_MSG("entered read_rhs_chunk\n"); hid_t dset = H5Dopen(this->hdf5_group_id, "rhs", H5P_DEFAULT); hid_t rspace = H5Dget_space(dset); @@ -342,6 +348,7 @@ void particles_io_base<particle_type>::write_rhs_chunk( const int rhsindex, const double *data) { + TIMEZONE("particles_io_base::write_rhs_chunk"); hid_t dset = H5Dopen(this->hdf5_group_id, "rhs", H5P_DEFAULT); hid_t rspace = H5Dget_space(dset); std::vector<hsize_t> mem_dims(this->hdf5_rhs_chunks); @@ -379,6 +386,7 @@ void particles_io_base<particle_type>::write_point3D_chunk( const int cindex, const double *data) { + TIMEZONE("particles_io_base::write_point3D_chunk"); hid_t dset = H5Dopen(this->hdf5_group_id, dset_name.c_str(), H5P_DEFAULT); hid_t rspace = H5Dget_space(dset); std::vector<hsize_t> mem_dims(this->hdf5_state_chunks); diff --git a/bfps/cpp/rFFTW_distributed_particles.cpp b/bfps/cpp/rFFTW_distributed_particles.cpp index ab694ab3cc226c4690970cf3959bb2c480207c61..265975f8c817a1b40942e076bd016c2921618bbc 100644 --- a/bfps/cpp/rFFTW_distributed_particles.cpp +++ b/bfps/cpp/rFFTW_distributed_particles.cpp @@ -32,10 +32,13 @@ #include <string> #include <sstream> #include <set> +#include <algorithm> +#include <ctime> #include "base.hpp" #include "rFFTW_distributed_particles.hpp" #include "fftw_tools.hpp" 
+#include "scope_timer.hpp" extern int myrank, nprocs; @@ -44,14 +47,15 @@ template <particle_types particle_type, class rnumber, int interp_neighbours> rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rFFTW_distributed_particles( const char *NAME, const hid_t data_file_id, - rFFTW_interpolator<rnumber, interp_neighbours> *FIELD, + rFFTW_interpolator<rnumber, interp_neighbours> *VEL, const int TRAJ_SKIP, const int INTEGRATION_STEPS) : particles_io_base<particle_type>( NAME, TRAJ_SKIP, data_file_id, - FIELD->descriptor->comm) + VEL->descriptor->comm) { + TIMEZONE("rFFTW_distributed_particles::rFFTW_distributed_particles"); /* check that integration_steps has a valid value. * If NDEBUG is defined, "assert" doesn't do anything. * With NDEBUG defined, and an invalid INTEGRATION_STEPS, @@ -65,18 +69,21 @@ rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rFFTW_di * therefore I prefer to just kill the code at this point, * no matter whether or not NDEBUG is present. * */ - if (interp_neighbours*2+2 > FIELD->descriptor->subsizes[0]) + if (interp_neighbours*2+2 > VEL->descriptor->subsizes[0]) { DEBUG_MSG("parameters incompatible with rFFTW_distributed_particles.\n" "interp kernel size is %d, local_z_size is %d\n", - interp_neighbours*2+2, FIELD->descriptor->subsizes[0]); - if (FIELD->descriptor->myrank == 0) + interp_neighbours*2+2, VEL->descriptor->subsizes[0]); + if (VEL->descriptor->myrank == 0) std::cerr << "parameters incompatible with rFFTW_distributed_particles." << std::endl; exit(0); } - this->vel = FIELD; + this->vel = VEL; this->rhs.resize(INTEGRATION_STEPS); this->integration_steps = INTEGRATION_STEPS; + /* the particles are expected to be evenly distributed among processes. + * therefore allocating twice that amount of memory seems enough. + * */ this->state.reserve(2*this->nparticles / this->nprocs); for (unsigned int i=0; i<this->rhs.size(); i++) this->rhs[i].reserve(2*this->nparticles / this->nprocs); @@ -157,6 +164,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sam const std::unordered_map<int, std::unordered_set<int>> &dp, std::unordered_map<int, single_particle_state<POINT3D>> &y) { + TIMEZONE("rFFTW_distributed_particles::sample"); double *yyy; double *yy; y.clear(); @@ -184,24 +192,35 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sam int tindex; tindex = 0; // can this sorting be done more efficiently? 
- std::set<int> ordered_dp; + std::vector<int> ordered_dp; + { + TIMEZONE("rFFTW_distributed_particles::sample::ordered_dp"); + ordered_dp.reserve(dp.at(domain_index).size()); for (auto p: dp.at(domain_index)) - ordered_dp.insert(p); + ordered_dp.push_back(p); + //std::set<int> ordered_dp(dp.at(domain_index)); + std::sort(ordered_dp.begin(), ordered_dp.end()); + } for (auto p: ordered_dp) + //for (auto p: dp.at(domain_index)) { (*field)(x.at(p).data, yy + tindex*3); tindex++; } - MPI_Allreduce( + { + TIMEZONE("rFFTW_distributed_particles::sample::MPI_Allreduce"); + MPI_Allreduce( yy, yyy, 3*dp.at(domain_index).size(), MPI_DOUBLE, MPI_SUM, this->domain_comm[domain_index]); + } tindex = 0; for (auto p: ordered_dp) + //for (auto p: dp.at(domain_index)) { y[p] = yyy + tindex*3; tindex++; @@ -224,8 +243,10 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::get case VELOCITY_TRACER: this->sample(this->vel, x, dp, yy); y.clear(); - for (auto &pp: x) - y[pp.first] = yy[pp.first].data; + y.reserve(yy.size()); + y.rehash(this->nparticles); + for (auto &pp: yy) + y[pp.first] = pp.second.data; break; } } @@ -253,31 +274,38 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals, std::unordered_map<int, std::unordered_set<int>> &dp) { + TIMEZONE("rFFTW_distributed_particles::redistribute"); //DEBUG_MSG("entered redistribute\n"); /* get new distribution of particles */ std::unordered_map<int, std::unordered_set<int>> newdp; - this->sort_into_domains(x, newdp); + { + TIMEZONE("sort_into_domains"); + this->sort_into_domains(x, newdp); + } /* take care of particles that are leaving the shared domains */ int dindex[2] = {-1, 1}; // for each D of the 2 shared domains - for (int di=0; di<2; di++) - // for all particles previously in D - for (auto p: dp[dindex[di]]) - { - // if the particle is no longer in D - if (newdp[dindex[di]].find(p) == newdp[dindex[di]].end()) + { + TIMEZONE("Loop1"); + for (int di=0; di<2; di++) + // for all particles previously in D + for (auto p: dp[dindex[di]]) { - // and the particle is not in the local domain - if (newdp[0].find(p) == newdp[0].end()) + // if the particle is no longer in D + if (newdp[dindex[di]].find(p) == newdp[dindex[di]].end()) { - // remove the particle from the local list - x.erase(p); - for (unsigned int i=0; i<vals.size(); i++) - vals[i].erase(p); + // and the particle is not in the local domain + if (newdp[0].find(p) == newdp[0].end()) + { + // remove the particle from the local list + x.erase(p); + for (unsigned int i=0; i<vals.size(); i++) + vals[i].erase(p); + } + // if the particle is in the local domain, do nothing } - // if the particle is in the local domain, do nothing } - } + } /* take care of particles that are entering the shared domains */ /* neighbouring rank offsets */ int ro[2]; @@ -285,16 +313,23 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red ro[1] = 1; /* particles to send, particles to receive */ std::vector<int> ps[2], pr[2]; + for (int tcounter = 0; tcounter < 2; tcounter++) + { + ps[tcounter].reserve(newdp[dindex[tcounter]].size()); + } /* number of particles to send, number of particles to receive */ int nps[2], npr[2]; int rsrc, rdst; /* get list of id-s to send */ - for (auto &p: dp[0]) { - for (int di=0; di<2; di++) + TIMEZONE("Loop2"); + for (auto &p: dp[0]) { - if (newdp[dindex[di]].find(p) != newdp[dindex[di]].end()) - ps[di].push_back(p); + for (int di=0; 
di<2; di++) + { + if (newdp[dindex[di]].find(p) != newdp[dindex[di]].end()) + ps[di].push_back(p); + } } } /* prepare data for send recv */ @@ -304,7 +339,8 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red for (int i=0; i<2; i++) { rdst = MOD(rsrc+ro[i], this->nprocs); - if (this->myrank == rsrc) + if (this->myrank == rsrc){ + TIMEZONE("MPI_Send"); MPI_Send( nps+i, 1, @@ -312,7 +348,9 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red rdst, 2*(rsrc*this->nprocs + rdst)+i, this->comm); - if (this->myrank == rdst) + } + if (this->myrank == rdst){ + TIMEZONE("MPI_Recv"); MPI_Recv( npr+1-i, 1, @@ -321,6 +359,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red 2*(rsrc*this->nprocs + rdst)+i, this->comm, MPI_STATUS_IGNORE); + } } //DEBUG_MSG("I have to send %d %d particles\n", nps[0], nps[1]); //DEBUG_MSG("I have to recv %d %d particles\n", npr[0], npr[1]); @@ -338,6 +377,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red rdst = MOD(rsrc+ro[i], this->nprocs); if (this->myrank == rsrc && nps[i] > 0) { + TIMEZONE("this->myrank == rsrc && nps[i] > 0"); MPI_Send( &ps[i].front(), nps[i], @@ -369,6 +409,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red } if (this->myrank == rdst && npr[1-i] > 0) { + TIMEZONE("this->myrank == rdst && npr[1-i] > 0"); MPI_Recv( &pr[1-i].front(), npr[1-i], @@ -401,8 +442,10 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red delete[] buffer; // x has been changed, so newdp is obsolete // we need to sort into domains again - this->sort_into_domains(x, dp); - + { + TIMEZONE("sort_into_domains2"); + this->sort_into_domains(x, dp); + } #ifndef NDEBUG /* check that all particles at x are local */ @@ -425,44 +468,51 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::Ada const int nsteps) { this->get_rhs(this->state, this->domain_particles, this->rhs[0]); - for (auto &pp: this->state) +#define AdamsBashforth_LOOP_PREAMBLE \ + for (auto &pp: this->state) \ for (unsigned int i=0; i<state_dimension(particle_type); i++) - switch(nsteps) - { - case 1: - pp.second[i] += this->dt*this->rhs[0][pp.first][i]; - break; - case 2: - pp.second[i] += this->dt*(3*this->rhs[0][pp.first][i] - - this->rhs[1][pp.first][i])/2; - break; - case 3: - pp.second[i] += this->dt*(23*this->rhs[0][pp.first][i] - - 16*this->rhs[1][pp.first][i] - + 5*this->rhs[2][pp.first][i])/12; - break; - case 4: - pp.second[i] += this->dt*(55*this->rhs[0][pp.first][i] - - 59*this->rhs[1][pp.first][i] - + 37*this->rhs[2][pp.first][i] - - 9*this->rhs[3][pp.first][i])/24; - break; - case 5: - pp.second[i] += this->dt*(1901*this->rhs[0][pp.first][i] - - 2774*this->rhs[1][pp.first][i] - + 2616*this->rhs[2][pp.first][i] - - 1274*this->rhs[3][pp.first][i] - + 251*this->rhs[4][pp.first][i])/720; - break; - case 6: - pp.second[i] += this->dt*(4277*this->rhs[0][pp.first][i] - - 7923*this->rhs[1][pp.first][i] - + 9982*this->rhs[2][pp.first][i] - - 7298*this->rhs[3][pp.first][i] - + 2877*this->rhs[4][pp.first][i] - - 475*this->rhs[5][pp.first][i])/1440; - break; - } + switch(nsteps) + { + case 1: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += this->dt*this->rhs[0][pp.first][i]; + break; + case 2: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += this->dt*(3*this->rhs[0][pp.first][i] + - this->rhs[1][pp.first][i])/2; + break; + case 3: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += 
this->dt*(23*this->rhs[0][pp.first][i] + - 16*this->rhs[1][pp.first][i] + + 5*this->rhs[2][pp.first][i])/12; + break; + case 4: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += this->dt*(55*this->rhs[0][pp.first][i] + - 59*this->rhs[1][pp.first][i] + + 37*this->rhs[2][pp.first][i] + - 9*this->rhs[3][pp.first][i])/24; + break; + case 5: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += this->dt*(1901*this->rhs[0][pp.first][i] + - 2774*this->rhs[1][pp.first][i] + + 2616*this->rhs[2][pp.first][i] + - 1274*this->rhs[3][pp.first][i] + + 251*this->rhs[4][pp.first][i])/720; + break; + case 6: + AdamsBashforth_LOOP_PREAMBLE + pp.second[i] += this->dt*(4277*this->rhs[0][pp.first][i] + - 7923*this->rhs[1][pp.first][i] + + 9982*this->rhs[2][pp.first][i] + - 7298*this->rhs[3][pp.first][i] + + 2877*this->rhs[4][pp.first][i] + - 475*this->rhs[5][pp.first][i])/1440; + break; + } this->redistribute(this->state, this->rhs, this->domain_particles); this->roll_rhs(); } @@ -471,6 +521,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::Ada template <particle_types particle_type, class rnumber, int interp_neighbours> void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::step() { + TIMEZONE("rFFTW_distributed_particles::step"); this->AdamsBashforth((this->iteration < this->integration_steps) ? this->iteration+1 : this->integration_steps); @@ -483,6 +534,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sor const std::unordered_map<int, single_particle_state<particle_type>> &x, std::unordered_map<int, std::unordered_set<int>> &dp) { + TIMEZONE("rFFTW_distributed_particles::sort_into_domains"); int tmpint1, tmpint2; dp.clear(); dp[-1] = std::unordered_set<int>(); @@ -521,19 +573,25 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sor template <particle_types particle_type, class rnumber, int interp_neighbours> void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::read() { + TIMEZONE("rFFTW_distributed_particles::read"); double *temp = new double[this->chunk_size*state_dimension(particle_type)]; int tmpint1, tmpint2; for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) { //read state - if (this->myrank == 0) + if (this->myrank == 0){ + TIMEZONE("read_state_chunk"); this->read_state_chunk(cindex, temp); - MPI_Bcast( + } + { + TIMEZONE("MPI_Bcast"); + MPI_Bcast( temp, this->chunk_size*state_dimension(particle_type), MPI_DOUBLE, 0, this->comm); + } for (unsigned int p=0; p<this->chunk_size; p++) { if (this->vel->get_rank_info(temp[state_dimension(particle_type)*p+2], tmpint1, tmpint2)) @@ -542,17 +600,23 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rea } } //read rhs - if (this->iteration > 0) + if (this->iteration > 0){ + TIMEZONE("this->iteration > 0"); for (int i=0; i<this->integration_steps; i++) { - if (this->myrank == 0) + if (this->myrank == 0){ + TIMEZONE("read_rhs_chunk"); this->read_rhs_chunk(cindex, i, temp); - MPI_Bcast( + } + { + TIMEZONE("MPI_Bcast"); + MPI_Bcast( temp, this->chunk_size*state_dimension(particle_type), MPI_DOUBLE, 0, this->comm); + } for (unsigned int p=0; p<this->chunk_size; p++) { auto pp = this->state.find(p+cindex*this->chunk_size); @@ -560,6 +624,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rea this->rhs[i][p+cindex*this->chunk_size] = temp + state_dimension(particle_type)*p; } } + } } this->sort_into_domains(this->state, this->domain_particles); 
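+ // at this point every locally known particle has been filed into one of the + // local domains (-1 and 1 are the shared boundary slabs, 0 the interior), + // so the size reported below is the post-sort local population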
DEBUG_MSG("%s->state.size = %ld\n", this->name.c_str(), this->state.size()); @@ -575,31 +640,48 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::wri const char *dset_name, std::unordered_map<int, single_particle_state<POINT3D>> &y) { - double *data = new double[this->nparticles*3]; - double *yy = new double[this->nparticles*3]; - int pindex = 0; - for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) + TIMEZONE("rFFTW_distributed_particles::write"); + double *data = new double[this->chunk_size*3]; + double *yy = new double[this->chunk_size*3]; + //int pindex = 0; + for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) { std::fill_n(yy, this->chunk_size*3, 0); - for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //{ + // if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || + // this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) + // { + // std::copy(y[pindex].data, + // y[pindex].data + 3, + // yy + p*3); + // } + //} + for (int s = -1; s <= 0; s++) + for (auto &pp: this->domain_particles[s]) + { + if (pp >= int(cindex*this->chunk_size) && + pp < int((cindex+1)*this->chunk_size)) + { + std::copy(y[pp].data, + y[pp].data + 3, + yy + (pp-cindex*this->chunk_size)*3); + } + } { - if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || - this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) - { - std::copy(y[pindex].data, - y[pindex].data + 3, - yy + p*3); - } - } - MPI_Allreduce( + TIMEZONE("MPI_Allreduce"); + MPI_Allreduce( yy, data, 3*this->chunk_size, MPI_DOUBLE, MPI_SUM, this->comm); - if (this->myrank == 0) + } + if (this->myrank == 0){ + TIMEZONE("write_point3D_chunk"); this->write_point3D_chunk(dset_name, cindex, data); + } } delete[] yy; delete[] data; @@ -609,59 +691,96 @@ template <particle_types particle_type, class rnumber, int interp_neighbours> void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::write( const bool write_rhs) { + TIMEZONE("rFFTW_distributed_particles::write2"); double *temp0 = new double[this->chunk_size*state_dimension(particle_type)]; double *temp1 = new double[this->chunk_size*state_dimension(particle_type)]; - int pindex = 0; + //int pindex = 0; for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++) { //write state std::fill_n(temp0, state_dimension(particle_type)*this->chunk_size, 0); - pindex = cindex*this->chunk_size; - for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //pindex = cindex*this->chunk_size; + //for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //{ + // if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || + // this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) + // { + // TIMEZONE("std::copy"); + // std::copy(this->state[pindex].data, + // this->state[pindex].data + state_dimension(particle_type), + // temp0 + p*state_dimension(particle_type)); + // } + //} + for (int s = -1; s <= 0; s++) + for (auto &pp: this->domain_particles[s]) + { + if (pp >= int(cindex*this->chunk_size) && + pp < int((cindex+1)*this->chunk_size)) + { + std::copy(this->state[pp].data, + this->state[pp].data + state_dimension(particle_type), + temp0 + (pp-cindex*this->chunk_size)*state_dimension(particle_type)); + } + } { - if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || - 
this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) - { - std::copy(this->state[pindex].data, - this->state[pindex].data + state_dimension(particle_type), - temp0 + p*state_dimension(particle_type)); - } + TIMEZONE("MPI_Allreduce"); + MPI_Allreduce( + temp0, + temp1, + state_dimension(particle_type)*this->chunk_size, + MPI_DOUBLE, + MPI_SUM, + this->comm); } - MPI_Allreduce( - temp0, - temp1, - state_dimension(particle_type)*this->chunk_size, - MPI_DOUBLE, - MPI_SUM, - this->comm); - if (this->myrank == 0) + if (this->myrank == 0){ + TIMEZONE("write_state_chunk"); this->write_state_chunk(cindex, temp1); + } //write rhs - if (write_rhs) + if (write_rhs){ + TIMEZONE("write_rhs"); for (int i=0; i<this->integration_steps; i++) { std::fill_n(temp0, state_dimension(particle_type)*this->chunk_size, 0); - pindex = cindex*this->chunk_size; - for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //pindex = cindex*this->chunk_size; + //for (unsigned int p=0; p<this->chunk_size; p++, pindex++) + //{ + // if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || + // this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) + // { + // TIMEZONE("std::copy"); + // std::copy(this->rhs[i][pindex].data, + // this->rhs[i][pindex].data + state_dimension(particle_type), + // temp0 + p*state_dimension(particle_type)); + // } + //} + for (int s = -1; s <= 0; s++) + for (auto &pp: this->domain_particles[s]) + { + if (pp >= int(cindex*this->chunk_size) && + pp < int((cindex+1)*this->chunk_size)) + { + std::copy(this->rhs[i][pp].data, + this->rhs[i][pp].data + state_dimension(particle_type), + temp0 + (pp-cindex*this->chunk_size)*state_dimension(particle_type)); + } + } { - if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() || - this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end()) - { - std::copy(this->rhs[i][pindex].data, - this->rhs[i][pindex].data + state_dimension(particle_type), - temp0 + p*state_dimension(particle_type)); - } - } - MPI_Allreduce( + TIMEZONE("MPI_Allreduce"); + MPI_Allreduce( temp0, temp1, state_dimension(particle_type)*this->chunk_size, MPI_DOUBLE, MPI_SUM, this->comm); - if (this->myrank == 0) + } + if (this->myrank == 0){ + TIMEZONE("write_rhs_chunk"); this->write_rhs_chunk(cindex, i, temp1); + } } + } } delete[] temp0; delete[] temp1; diff --git a/bfps/cpp/rFFTW_distributed_particles.hpp b/bfps/cpp/rFFTW_distributed_particles.hpp index e271bbfae56c0d49bf66cebcb5e8e8158f81940b..400411d5f1fd6e597714be494a72272a76e01206 100644 --- a/bfps/cpp/rFFTW_distributed_particles.hpp +++ b/bfps/cpp/rFFTW_distributed_particles.hpp @@ -44,12 +44,25 @@ template <particle_types particle_type, class rnumber, int interp_neighbours> class rFFTW_distributed_particles: public particles_io_base<particle_type> { private: - std::unordered_map<int, single_particle_state<particle_type>> state; - std::vector<std::unordered_map<int, single_particle_state<particle_type>>> rhs; + // a "domain" corresponds to a region in 3D real space where a fixed set + // of MPI processes are required to participate in the interpolation + // formula (i.e. they all contain required information). + // we need to know how many processes there are for each of the domains + // to which the local process belongs. 
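+ // for example (hypothetical layout): a process owning the z slab [z0, z1) + // sees domain 0 for particles well inside the slab, and domains -1 / 1 for + // particles near the lower / upper boundary, where neighbouring slabs also + // hold grid nodes required by the interpolation kernel +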
std::unordered_map<int, int> domain_nprocs; + // each domain has an associated communicator, and we keep a list of the + // communicators to which the local process belongs std::unordered_map<int, MPI_Comm> domain_comm; + // for each domain, we need a list of the IDs of the particles located + // in that domain std::unordered_map<int, std::unordered_set<int>> domain_particles; + // for each domain, we need the state of each particle + std::unordered_map<int, single_particle_state<particle_type>> state; + // for each domain, we also need the last few values of the right hand + // side of the ODE, since we use Adams-Bashforth integration + std::vector<std::unordered_map<int, single_particle_state<particle_type>>> rhs; + public: int integration_steps; // this class only works with rFFTW interpolator @@ -87,9 +100,24 @@ class rFFTW_distributed_particles: public particles_io_base<particle_type> std::unordered_map<int, single_particle_state<particle_type>> &y); + /* given a list of particle positions, + * figure out which go into what local domain, and construct the relevant + * map of ID lists "dp" (for domain particles). + * */ void sort_into_domains( const std::unordered_map<int, single_particle_state<particle_type>> &x, std::unordered_map<int, std::unordered_set<int>> &dp); + /* suppose the particles are currently badly distributed, and some + * arbitrary quantities (stored in "vals") are associated to the particles, + * and we need to properly distribute them among processes. + * that's what this function does. + * In practice it's only used to redistribute the rhs values (and it + * automatically redistributes the state x being passed). + * Some more comments are present in the .cpp file, but, in brief: the + * particles are simply moved from one domain to another. + * If it turns out that the new domain contains a process which does not + * know about a particle, that information is sent from the closest process. 
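+ * A typical call, as done by "AdamsBashforth" in the .cpp file, is: + * this->redistribute(this->state, this->rhs, this->domain_particles); + * after which both the state and the stored rhs history are consistent with + * the new particle positions.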
+ * */ void redistribute( std::unordered_map<int, single_particle_state<particle_type>> &x, std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals, diff --git a/bfps/cpp/rFFTW_interpolator.cpp b/bfps/cpp/rFFTW_interpolator.cpp index bffae44f5986f9873a231442e92cba6cf005d3a4..55388e4e6800b86ed71291508b74e4595b24845c 100644 --- a/bfps/cpp/rFFTW_interpolator.cpp +++ b/bfps/cpp/rFFTW_interpolator.cpp @@ -28,15 +28,15 @@ #include <cmath> #include "rFFTW_interpolator.hpp" +#include "scope_timer.hpp" template <class rnumber, int interp_neighbours> rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator( fluid_solver_base<rnumber> *fs, base_polynomial_values BETA_POLYS, - rnumber *FIELD) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS) + rnumber *FIELD_POINTER) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS) { - this->field_size = 2*fs->cd->local_size; - this->field = FIELD; + this->field = FIELD_POINTER; // generate compute array @@ -48,6 +48,24 @@ rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator( this->compute[((iz + this->descriptor->sizes[0]) % this->descriptor->sizes[0])] = true; } +template <class rnumber, int interp_neighbours> +rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator( + vorticity_equation<rnumber, FFTW> *fs, + base_polynomial_values BETA_POLYS, + rnumber *FIELD_POINTER) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS) +{ +// this->field = FIELD_POINTER; +// +// +// // generate compute array +// this->compute = new bool[this->descriptor->sizes[0]]; +// std::fill_n(this->compute, this->descriptor->sizes[0], false); +// for (int iz = this->descriptor->starts[0]-interp_neighbours-1; +// iz <= this->descriptor->starts[0]+this->descriptor->subsizes[0]+interp_neighbours; +// iz++) +// this->compute[((iz + this->descriptor->sizes[0]) % this->descriptor->sizes[0])] = true; +} + template <class rnumber, int interp_neighbours> rFFTW_interpolator<rnumber, interp_neighbours>::~rFFTW_interpolator() { @@ -80,6 +98,7 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::sample( double *__restrict__ y, const int *deriv) { + TIMEZONE("rFFTW_interpolator::sample"); /* get grid coordinates */ int *xg = new int[3*nparticles]; double *xx = new double[3*nparticles]; @@ -109,7 +128,14 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::operator()( double *dest, const int *deriv) { + TIMEZONE("rFFTW_interpolator::operator()"); double bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2]; + /* please note that the polynomials in z are computed for all the different + * iz values, independently of whether or not "myrank" will perform the + * computation for all the different iz slices. + * I don't know how big a deal this really is, but it is something that we can + * optimize. + * */ if (deriv == NULL) { this->compute_beta(0, xx[0], bx); @@ -124,17 +150,30 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::operator()( } std::fill_n(dest, 3, 0); ptrdiff_t bigiz, bigiy, bigix; + // loop over the 2*interp_neighbours + 2 z slices for (int iz = -interp_neighbours; iz <= interp_neighbours+1; iz++) { + // bigiz is the z index of the cell containing the particles + // this->descriptor->sizes[0] is added before taking the modulo + // because we want to be sure that "bigiz" is a positive number. + // I'm no longer sure why I don't use the MOD function here. 
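+ // worked example (made-up numbers): with sizes[0] = 64, xg[2] = 0 and + // iz = -2, bigiz = (0 - 2 + 64) % 64 = 62, i.e. the stencil wraps + // periodically to the top of the domain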
bigiz = ptrdiff_t(((xg[2]+iz) + this->descriptor->sizes[0]) % this->descriptor->sizes[0]); + // once we know bigiz, we know whether "myrank" has the relevant slice. + // if not, go to next value of bigiz if (this->descriptor->myrank == this->descriptor->rank[bigiz]) { for (int iy = -interp_neighbours; iy <= interp_neighbours+1; iy++) { + // bigiy is the y index of the cell + // since we have all the y indices in myrank, we can safely use the + // modulo value bigiy = ptrdiff_t(MOD(xg[1]+iy, this->descriptor->sizes[1])); for (int ix = -interp_neighbours; ix <= interp_neighbours+1; ix++) { + // bigix is the x index of the cell bigix = ptrdiff_t(MOD(xg[0]+ix, this->descriptor->sizes[2])); + // here we create the index to the current grid node + // note the removal of local_z_start from bigiz. ptrdiff_t tindex = (((bigiz-this->descriptor->starts[0])*this->descriptor->sizes[1] + bigiy)*(this->descriptor->sizes[2]+2) + bigix)*3; diff --git a/bfps/cpp/rFFTW_interpolator.hpp b/bfps/cpp/rFFTW_interpolator.hpp index 795257d2744e432d9c346b93848cadfbd8cc85dc..5088be8b2f3094fd96332af0c923d7cc905e4f3f 100644 --- a/bfps/cpp/rFFTW_interpolator.hpp +++ b/bfps/cpp/rFFTW_interpolator.hpp @@ -27,6 +27,7 @@ #include "field_descriptor.hpp" #include "fftw_tools.hpp" #include "fluid_solver_base.hpp" +#include "vorticity_equation.hpp" #include "interpolator_base.hpp" #ifndef RFFTW_INTERPOLATOR @@ -38,41 +39,74 @@ class rFFTW_interpolator:public interpolator_base<rnumber, interp_neighbours> { public: using interpolator_base<rnumber, interp_neighbours>::operator(); - /* size of field to interpolate */ - ptrdiff_t field_size; - /* pointers to fields that are to be interpolated + /* pointer to field that has to be interpolated + * The reason this is a member variable is that I want this class + * to be consistent with the "interpolator" class, where a member + * variable is absolutely required (since that class uses padding). * */ rnumber *field; - /* compute[iz] is true if . + /* compute[iz] is an array that says whether or not the current MPI + * process is involved in the interpolation formula for a particle + * located in cell "iz". + * It is mostly used in the formula itself. + * This translates as the following condition: * local_zstart - neighbours <= iz <= local_zend + 1 + neighbours + * I think it's cleaner to keep things in an array, especially since + * "local_zend" is shorthand for another arithmetic operation anyway. * */ bool *compute; + + /* Constructors */ rFFTW_interpolator( fluid_solver_base<rnumber> *FSOLVER, base_polynomial_values BETA_POLYS, rnumber *FIELD_DATA); + + /* this constructor is empty, I just needed it for a quick hack of the + * "vorticity_equation" class. + * It should be removed soon. + * */ + rFFTW_interpolator( + vorticity_equation<rnumber, FFTW> *FSOLVER, + base_polynomial_values BETA_POLYS, + rnumber *FIELD_DATA); ~rFFTW_interpolator(); - /* does not destroy input */ + /* This method is provided for consistency with "interpolator", and it + * does not destroy input */ inline int read_rFFTW(const void *src) { this->field = (rnumber*)src; return EXIT_SUCCESS; } + /* This is used when "compute" is not enough. + * For a given z location, it gives the outermost ranks that are relevant + * for the interpolation formula. + * */ bool get_rank_info(double z, int &maxz_rank, int &minz_rank); - /* interpolate field at an array of locations */ + /* interpolate field at an array of locations. 
+ * After interpolation is performed, call Allreduce for "y", over + * this->descriptor->comm --- generally MPI_COMM_WORLD. + * This is useful for the simple "particles" class, where particle + * information is synchronized across all processes. + * */ void sample( const int nparticles, const int pdimension, const double *__restrict__ x, double *__restrict__ y, const int *deriv = NULL); - /* interpolate 1 point */ + /* interpolate 1 point. + * Result is kept local. + * This is used in the "rFFTW_distributed_particles" class, with the + * result being synchronized across the relevant "local particle + * communicator". + * */ void operator()( const int *__restrict__ xg, const double *__restrict__ xx, diff --git a/bfps/cpp/scope_timer.cpp b/bfps/cpp/scope_timer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..61ddd89583fe8d53cee328c4267df603e128d417 --- /dev/null +++ b/bfps/cpp/scope_timer.cpp @@ -0,0 +1,8 @@ + + +#include "scope_timer.hpp" + + +#ifdef USE_TIMINGOUTPUT +EventManager global_timer_manager("BFPS", std::cout); +#endif diff --git a/bfps/cpp/scope_timer.hpp b/bfps/cpp/scope_timer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e513e8e6e47a14d69fc0c695894fc2114a9b6058 --- /dev/null +++ b/bfps/cpp/scope_timer.hpp @@ -0,0 +1,821 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. 
If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + +#ifndef SCOPE_TIMER_HPP +#define SCOPE_TIMER_HPP + +#include <memory> +#include <iostream> +#include <vector> +#include <stack> +#include <string> +#include <limits> +#include <cassert> +#include <sstream> +#include <unordered_map> +#include <mpi.h> +#include <cstring> +#include <stdexcept> +#include <omp.h> +#include <iomanip> +#include <fstream> + +#include "base.hpp" +#include "bfps_timer.hpp" + +//< To add it as friend of EventManager +class ScopeEvent; + +class EventManager { +protected: + + class CoreEvent { + protected: + //< Name of the event (from the user) + const std::string m_name; + //< Previous events (stack of parents) + std::stack<CoreEvent*> m_parentStack; + //< Current event children + std::vector<CoreEvent*> m_children; + + //< Total execution time + double m_totalTime; + //< Minimum execution time + double m_minTime; + //< Maximum execution time + double m_maxTime; + //< Number of occurrence for this event + int m_occurrence; + //< Number of occurrence that are tasks for this event + int m_nbTasks; + //< Children lock + omp_lock_t m_childrenLock; + //< Children lock + omp_lock_t m_updateLock; + + public: + /** Create a core-event from the name and the current stack */ + CoreEvent(const std::string& inName, + const std::stack<CoreEvent*>& inParentStack) + : m_name(inName), + m_parentStack(inParentStack), + m_totalTime(0), + m_minTime(std::numeric_limits<double>::max()), + m_maxTime(std::numeric_limits<double>::min()), + m_occurrence(0), + m_nbTasks(0) { + omp_init_lock(&m_childrenLock); + omp_init_lock(&m_updateLock); + } + + ~CoreEvent() { + omp_destroy_lock(&m_childrenLock); + omp_destroy_lock(&m_updateLock); + } + + /** Add a record */ + void addRecord(const double inDuration, const bool isTask) { + #pragma omp atomic update + m_totalTime += inDuration; + #pragma omp atomic update + m_occurrence += 1; + #pragma omp flush // (m_minTime, m_maxTime) + if (inDuration < m_minTime || m_maxTime < inDuration) { + omp_set_lock(&m_updateLock); + m_minTime = std::min(m_minTime, inDuration); + m_maxTime = std::max(m_maxTime, inDuration); + omp_unset_lock(&m_updateLock); + } + if (isTask) { + #pragma omp atomic update + m_nbTasks += 1; + } + } + + const std::stack<CoreEvent*>& getParents() const { return m_parentStack; } + + std::stack<CoreEvent*>& getParents() { return m_parentStack; } + + void addChild(CoreEvent* inChild) { + omp_set_lock(&m_childrenLock); + m_children.push_back(inChild); + omp_unset_lock(&m_childrenLock); + } + + //! 
Must not be called during a parallel execution + const std::vector<CoreEvent*>& getChildren() const { + assert(omp_in_parallel() == 0); + return m_children; + } + + const std::string& getName() const { return m_name; } + + double getMin() const { return m_minTime; } + + double getMax() const { return m_maxTime; } + + int getOccurrence() const { return m_occurrence; } + + double getAverage() const { + return m_totalTime / static_cast<double>(m_occurrence); + } + + double getDuration() const { return m_totalTime; } + + int getNbTasks() const { return m_nbTasks; } + }; + + /////////////////////////////////////////////////////////////// + + //< The main node + std::unique_ptr<CoreEvent> m_root; + //< Output stream to print out + std::ostream& m_outputStream; + + //< Current stack; there is one stack of stacks per thread + std::vector<std::stack<std::stack<CoreEvent*>>> m_currentEventsStackPerThread; + //< All recorded events (that will then be deleted at the end) + std::unordered_multimap<std::string, CoreEvent*> m_records; + //< Lock for m_records + omp_lock_t m_recordsLock; + + /** Find an event from its name. If such an event does not exist, + * the function creates one. If an event with the same name exists + * but with a different stack, a new one is created. + * It pushes the returned event onto the stack. + */ + CoreEvent* getEvent(const std::string& inName, + const std::string& inUniqueKey) { + const std::string completeName = inName + inUniqueKey; + CoreEvent* foundEvent = nullptr; + + omp_set_lock(&m_recordsLock); + // find all events with this name + auto range = m_records.equal_range(completeName); + for (auto iter = range.first; iter != range.second; ++iter) { + // events are equal if same name and same parents + if ((*iter).second->getParents() == + m_currentEventsStackPerThread[omp_get_thread_num()].top()) { + foundEvent = (*iter).second; + break; + } + } + + // Keep the lock to ensure that no two threads create the same event + + if (!foundEvent) { + // create this event + foundEvent = new CoreEvent( + inName, m_currentEventsStackPerThread[omp_get_thread_num()].top()); + m_currentEventsStackPerThread[omp_get_thread_num()].top().top()->addChild( + foundEvent); + m_records.insert({completeName, foundEvent}); + } + omp_unset_lock(&m_recordsLock); + + m_currentEventsStackPerThread[omp_get_thread_num()].top().push(foundEvent); + return foundEvent; + } + + CoreEvent* getEventFromContext(const std::string& inName, + const std::string& inUniqueKey, + const std::stack<CoreEvent*>& inParentStack) { + m_currentEventsStackPerThread[omp_get_thread_num()].push(inParentStack); + return getEvent(inName, inUniqueKey); + } + + /** Pop current event */ + void popEvent(const CoreEvent* eventToRemove) { + assert(m_currentEventsStackPerThread[omp_get_thread_num()].top().size() > 1); + // Comparing address is cheaper + if (m_currentEventsStackPerThread[omp_get_thread_num()].top().top() != + eventToRemove) { + throw std::runtime_error( + "You must end events (ScopeEvent/TIMEZONE) in order.\n" + "Please make sure that you only ask the last event to finish."); + } + m_currentEventsStackPerThread[omp_get_thread_num()].top().pop(); + } + + /** Pop current context */ + void popContext(const CoreEvent* eventToRemove) { + assert(m_currentEventsStackPerThread[omp_get_thread_num()].size() > 1); + assert(m_currentEventsStackPerThread[omp_get_thread_num()].top().size() > 1); + // Comparing address is cheaper + if (m_currentEventsStackPerThread[omp_get_thread_num()].top().top() != + eventToRemove) { + throw 
std::runtime_error( + "You must end events (ScopeEvent/TIMEZONE) in order.\n" + "Please make sure that you only ask the last event to finish."); + } + m_currentEventsStackPerThread[omp_get_thread_num()].pop(); + } + +public: + /** Create an event manager */ + EventManager(const std::string& inAppName, std::ostream& inOutputStream) + : m_root(new CoreEvent(inAppName, std::stack<CoreEvent*>())), + m_outputStream(inOutputStream), + m_currentEventsStackPerThread(1) { + m_currentEventsStackPerThread[0].emplace(); + m_currentEventsStackPerThread[0].top().push(m_root.get()); + omp_init_lock(&m_recordsLock); + } + + ~EventManager() throw() { + assert(m_currentEventsStackPerThread[0].size() == 1); + + assert(m_currentEventsStackPerThread[0].top().size() == 1); + + omp_destroy_lock(&m_recordsLock); + + for (auto event : m_records) { + delete event.second; + } + } + + void startParallelRegion(const int inNbThreads) { + m_currentEventsStackPerThread.resize(1); + m_currentEventsStackPerThread.resize(inNbThreads, + m_currentEventsStackPerThread[0]); + } + + void showDistributed(const MPI_Comm inComm) const { + int myRank, nbProcess; + int retMpi = MPI_Comm_rank( inComm, &myRank); + variable_used_only_in_assert(retMpi); + assert(retMpi == MPI_SUCCESS); + retMpi = MPI_Comm_size( inComm, &nbProcess); + assert(retMpi == MPI_SUCCESS); + + if((&m_outputStream == &std::cout || &m_outputStream == &std::clog) && myRank != nbProcess-1){ + // Print in reverse order: wait until the next rank has printed + char tmp; + retMpi = MPI_Recv(&tmp, 1, MPI_BYTE, myRank+1, 99, inComm, MPI_STATUS_IGNORE); + assert(retMpi == MPI_SUCCESS); + } + m_outputStream.flush(); + + std::stack<std::pair<int, const CoreEvent*>> events; + + for (int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) { + events.push({0, m_root->getChildren()[idx]}); + } + + m_outputStream << "[TIMING-" << myRank<< "] Local times.\n"; + m_outputStream << "[TIMING-" << myRank<< "] :" << m_root->getName() << "\n"; + + while (events.size()) { + const std::pair<int, const CoreEvent*> eventToShow = + events.top(); + events.pop(); + + m_outputStream << "[TIMING-" << myRank<< "] "; + + int offsetTab = eventToShow.first; + while (offsetTab--) { + m_outputStream << "\t"; + } + m_outputStream << "@" << eventToShow.second->getName() << " = " << eventToShow.second->getDuration() << "s"; + if (eventToShow.second->getOccurrence() != 1) { + m_outputStream << " (Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax() + << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = " + << eventToShow.second->getOccurrence() << ")"; + } + + m_outputStream << "\n"; + for (int idx = + static_cast<int>(eventToShow.second->getChildren().size()) - 1; + idx >= 0; --idx) { + events.push( + {eventToShow.first + 1, eventToShow.second->getChildren()[idx]}); + } + } + m_outputStream.flush(); + + if((&m_outputStream == &std::cout || &m_outputStream == &std::clog) && myRank != 0){ + // Let the previous rank print now + char tmp; + retMpi = MPI_Send(&tmp, 1, MPI_BYTE, myRank-1, 99, inComm); + assert(retMpi == MPI_SUCCESS); + } + } + + void show(const MPI_Comm inComm, const bool onlyP0 = true) const { + int myRank, nbProcess; + int retMpi = MPI_Comm_rank( inComm, &myRank); + variable_used_only_in_assert(retMpi); + assert(retMpi == MPI_SUCCESS); + retMpi = MPI_Comm_size( inComm, &nbProcess); + assert(retMpi == MPI_SUCCESS); + + if(onlyP0 && myRank != 0){ + return; + } + + std::stringstream myResults; + + std::stack<std::pair<int, const CoreEvent*>> events; + + for 
(int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) { + events.push({0, m_root->getChildren()[idx]}); + } + + myResults << "[TIMING-" << myRank << "] Local times.\n"; + myResults << "[TIMING-" << myRank << "] :" << m_root->getName() << "\n"; + + while (events.size()) { + const std::pair<int, const CoreEvent*> eventToShow = + events.top(); + events.pop(); + + myResults << "[TIMING-" << myRank << "] "; + + int offsetTab = eventToShow.first; + while (offsetTab--) { + myResults << "\t"; + } + myResults << "@" << eventToShow.second->getName() << " = " << eventToShow.second->getDuration() << "s"; + if (eventToShow.second->getOccurrence() != 1) { + myResults << " (Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax() + << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = " + << eventToShow.second->getOccurrence() << ")"; + } + + myResults << "\n"; + for (int idx = + static_cast<int>(eventToShow.second->getChildren().size()) - 1; + idx >= 0; --idx) { + events.push( + {eventToShow.first + 1, eventToShow.second->getChildren()[idx]}); + } + } + + if(myRank != 0){ + const std::string strOutput = myResults.str(); + int sizeOutput = strOutput.length(); + retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm); + assert(retMpi == MPI_SUCCESS); + retMpi = MPI_Send((void*)strOutput.data(), sizeOutput, MPI_CHAR, 0, 100, inComm); + assert(retMpi == MPI_SUCCESS); + } + else{ + if(onlyP0 == false){ + std::vector<char> buffer; + for(int idxProc = nbProcess-1 ; idxProc > 0 ; --idxProc){ + int sizeRecv; + retMpi = MPI_Recv(&sizeRecv, 1, MPI_INT, idxProc, 99, inComm, MPI_STATUS_IGNORE); + assert(retMpi == MPI_SUCCESS); + buffer.resize(sizeRecv+1); + retMpi = MPI_Recv(buffer.data(), sizeRecv, MPI_CHAR, idxProc, 100, inComm, MPI_STATUS_IGNORE); + assert(retMpi == MPI_SUCCESS); + buffer[sizeRecv]='\0'; + m_outputStream << buffer.data(); + } + } + m_outputStream << myResults.str(); + m_outputStream.flush(); + } + } + + void showMpi(const MPI_Comm inComm) const { + struct SerializedEvent { + char path[512]; + char name[128]; + double totalTime; + double minTime; + double maxTime; + int occurrence; + }; + + // Convert my events into sendable objects + + std::vector<SerializedEvent> myEvents; + myEvents.reserve(m_records.size()); + + for(const std::pair<std::string, const CoreEvent*>& event : m_records){ + myEvents.emplace_back(); + SerializedEvent& current_event = myEvents.back(); + + current_event.totalTime = event.second->getDuration(); + current_event.minTime = event.second->getMin(); + current_event.maxTime = event.second->getMax(); + current_event.occurrence = event.second->getOccurrence(); + + strncpy(current_event.name, event.second->getName().c_str(), 128); + std::stringstream path; + std::stack<CoreEvent*> parents = event.second->getParents(); + while(parents.size()){ + path << parents.top()->getName() << " << "; + parents.pop(); + } + + strncpy(current_event.path, path.str().c_str(), 512); + } + + // Send to process 0 + int myRank, nbProcess; + int retMpi = MPI_Comm_rank( inComm, &myRank); + variable_used_only_in_assert(retMpi); + assert(retMpi == MPI_SUCCESS); + retMpi = MPI_Comm_size( inComm, &nbProcess); + assert(retMpi == MPI_SUCCESS); + std::unique_ptr<int[]> nbEventsPerProc; + if(myRank == 0){ + nbEventsPerProc.reset(new int[nbProcess]); + } + const int myNbEvents = myEvents.size(); + retMpi = MPI_Gather(const_cast<int*>(&myNbEvents), 1, MPI_INT, + nbEventsPerProc.get(), 1, MPI_INT, + 0, inComm); + assert(retMpi ==
MPI_SUCCESS); + // Process 0 merges and prints the results + std::unique_ptr<int[]> dipls; + std::unique_ptr<SerializedEvent[]> allEvents; + std::unique_ptr<int[]> nbEventsPerProcByte; + std::unique_ptr<int[]> diplsByte; + if(myRank == 0){ + dipls.reset(new int[nbProcess+1]); + diplsByte.reset(new int[nbProcess+1]); + nbEventsPerProcByte.reset(new int[nbProcess]); + dipls[0] = 0; + diplsByte[0] = 0; + for(int idx = 1 ; idx <= nbProcess ; ++idx){ + dipls[idx] = dipls[idx-1] + nbEventsPerProc[idx-1]; + diplsByte[idx] = dipls[idx] * sizeof(SerializedEvent); + nbEventsPerProcByte[idx-1] = nbEventsPerProc[idx-1] * sizeof(SerializedEvent); + } + allEvents.reset(new SerializedEvent[dipls[nbProcess]]); + } + + retMpi = MPI_Gatherv(myEvents.data(), myNbEvents * sizeof(SerializedEvent), MPI_BYTE, + allEvents.get(), nbEventsPerProcByte.get(), diplsByte.get(), + MPI_BYTE, 0, inComm); + assert(retMpi == MPI_SUCCESS); + + if(myRank == 0){ + struct GlobalEvent { + char path[512]; + char name[128]; + double totalTime; + double minTime; + double maxTime; + int occurrence; + int nbProcess; + double minTimeProcess; + double maxTimeProcess; + }; + + std::unordered_map<std::string, GlobalEvent> mapEvents; + for(int idxEvent = 0 ; idxEvent < dipls[nbProcess] ; ++idxEvent){ + const std::string key = std::string(allEvents[idxEvent].path) + std::string(allEvents[idxEvent].name); + if(mapEvents.find(key) == mapEvents.end()){ + GlobalEvent& newEvent = mapEvents[key]; + strncpy(newEvent.path, allEvents[idxEvent].path, 512); + strncpy(newEvent.name, allEvents[idxEvent].name, 128); + newEvent.totalTime = allEvents[idxEvent].totalTime; + newEvent.minTime = allEvents[idxEvent].minTime; + newEvent.maxTime = allEvents[idxEvent].maxTime; + newEvent.occurrence = allEvents[idxEvent].occurrence; + newEvent.nbProcess = 1; + newEvent.minTimeProcess = allEvents[idxEvent].totalTime; + newEvent.maxTimeProcess = allEvents[idxEvent].totalTime; + } + else{ + GlobalEvent& newEvent = mapEvents[key]; + assert(strcmp(newEvent.path, allEvents[idxEvent].path) == 0); + assert(strcmp(newEvent.name, allEvents[idxEvent].name) == 0); + newEvent.totalTime += allEvents[idxEvent].totalTime; + newEvent.minTime = std::min(newEvent.minTime, allEvents[idxEvent].minTime); + newEvent.maxTime = std::max(newEvent.maxTime, allEvents[idxEvent].maxTime); + newEvent.occurrence += allEvents[idxEvent].occurrence; + newEvent.nbProcess += 1; + newEvent.minTimeProcess = std::min(newEvent.minTimeProcess, + allEvents[idxEvent].totalTime); + newEvent.maxTimeProcess = std::max(newEvent.maxTimeProcess, + allEvents[idxEvent].totalTime); + } + } + + m_outputStream << "[MPI-TIMING] MPI times.\n"; + for(const auto& iter : mapEvents){ + const GlobalEvent& gevent = iter.second; + m_outputStream << "[MPI-TIMING] @" << gevent.name << "\n"; + m_outputStream << "[MPI-TIMING] Stack => " << gevent.path << "\n"; + m_outputStream << "[MPI-TIMING] \t Done by " << gevent.nbProcess << " processes\n"; + m_outputStream << "[MPI-TIMING] \t Total time for all processes " << gevent.totalTime + << "s (average per process " << gevent.totalTime/gevent.nbProcess << "s)\n"; + m_outputStream << "[MPI-TIMING] \t Min time for a process " << gevent.minTimeProcess + << "s Max time for a process " << gevent.maxTimeProcess << "s\n"; + m_outputStream << "[MPI-TIMING] \t The same call has been done " << gevent.occurrence + << " times by all processes (duration min " << gevent.minTime << "s max " << gevent.maxTime << "s avg " + << gevent.totalTime/gevent.occurrence << "s)\n"; + } + } + m_outputStream.flush(); + } + + void
showHtml(const MPI_Comm inComm, const bool onlyP0 = true) const { + int myRank, nbProcess; + int retMpi = MPI_Comm_rank( inComm, &myRank); + assert(retMpi == MPI_SUCCESS); + variable_used_only_in_assert(retMpi); + retMpi = MPI_Comm_size( inComm, &nbProcess); + assert(retMpi == MPI_SUCCESS); + + if(onlyP0 && myRank != 0){ + return; + } + + std::stringstream myResults; + + std::stack<std::pair<int, const CoreEvent*>> events; + + for (int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) { + events.push({0, m_root->getChildren()[idx]}); + } + + myResults << "<h1>Process : " << myRank << "</h1>\n"; + + double totalDuration = 0; + for (int idx = + static_cast<int>(m_root->getChildren().size()) - 1; + idx >= 0; --idx) { + totalDuration += m_root->getChildren()[idx]->getDuration(); + } + + myResults << "<h2> " << m_root->getName() << " (" << totalDuration << "s)</h2>\n"; + myResults << "<ul>\n"; + int idxBox = myRank*100000; + + while (events.size()) { + const std::pair<int, const CoreEvent*> eventToShow = + events.top(); + events.pop(); + + if(eventToShow.first == -1){ + myResults << "</ul>\n"; + myResults << "</li>\n"; + } + else if(eventToShow.second->getChildren().size() == 0){ + myResults << "<li>● <span title=\""; + if (eventToShow.second->getOccurrence() != 1) { + myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax() + << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = " + << eventToShow.second->getOccurrence(); + } + myResults << "\">" << eventToShow.second->getName(); + const double percentage = 100*eventToShow.second->getDuration()/totalDuration; + if( percentage < 0.001 ){ + myResults << " (< 0.001% -- " ; + } + else{ + myResults << " (" << std::fixed << std::setprecision(3) << percentage << "% -- " ; + } + if(eventToShow.second->getParents().size()){ + const double percentageParent = 100*eventToShow.second->getDuration()/eventToShow.second->getParents().top()->getDuration(); + myResults << "[" << std::fixed << std::setprecision(3) << percentageParent << "%] -- " ; + } + myResults << eventToShow.second->getDuration() <<"s)</span></li>\n"; + } + else{ + myResults << "<li><input type=\"checkbox\" id=\"c" << idxBox << "\" />\n"; + myResults << " <i class=\"fa fa-angle-double-right\">→ </i>\n"; + myResults << " <i class=\"fa fa-angle-double-down\">↓ </i>\n"; + myResults << " <label for=\"c" << idxBox++ << "\"><span title=\""; + if (eventToShow.second->getOccurrence() != 1) { + myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax() + << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = " + << eventToShow.second->getOccurrence(); + } + myResults << "\">" << eventToShow.second->getName(); + const double percentage = 100*eventToShow.second->getDuration()/totalDuration; + if( percentage < 0.001 ){ + myResults << " (< 0.001% -- " ; + } + else{ + myResults << " (" << std::fixed << std::setprecision(3) << percentage << "% -- " ; + } + if(eventToShow.second->getParents().size()){ + const double percentageParent = 100*eventToShow.second->getDuration()/eventToShow.second->getParents().top()->getDuration(); + myResults << "[" << std::fixed << std::setprecision(3) << percentageParent << "%] -- " ; + } + myResults << eventToShow.second->getDuration() <<"s)</span></label>\n"; + myResults << "<ul>\n"; + events.push({-1, nullptr}); + + for (int idx = + static_cast<int>(eventToShow.second->getChildren().size()) - 1; + idx >= 0; --idx) { + 
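+ // note: children are pushed in reverse order, so that the stack-based + // depth-first traversal pops (and therefore renders) them in their + // original order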
events.push( + {eventToShow.first + 1, eventToShow.second->getChildren()[idx]}); + } + } + + myResults << "</ul>\n"; + + if(myRank != 0){ + const std::string strOutput = myResults.str(); + int sizeOutput = strOutput.length(); + retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm); + assert(retMpi == MPI_SUCCESS); + retMpi = MPI_Send((void*)strOutput.data(), sizeOutput, MPI_CHAR, 0, 100, inComm); + assert(retMpi == MPI_SUCCESS); + } + else{ + const std::string htmlOutput = (getenv("HTMLOUTPUT")?getenv("HTMLOUTPUT"):"timings.html"); + + std::cout << "Timing output html set to : " << htmlOutput << std::endl; + + std::ofstream htmlfile(htmlOutput); + + htmlfile << "<html>\ + <head>\ + <style>\ + input {\ + display: none;\ + }\ + input ~ ul {\ + display: none;\ + }\ + input:checked ~ ul {\ + display: block;\ + }\ + input ~ .fa-angle-double-down {\ + display: none;\ + }\ + input:checked ~ .fa-angle-double-right {\ + display: none;\ + }\ + input:checked ~ .fa-angle-double-down {\ + display: inline;\ + }\ + li {\ + display: block;\ + font-family: 'Arial';\ + font-size: 15px;\ + padding: 0.2em;\ + border: 1px solid transparent;\ + }\ + li:hover {\ + border: 1px solid grey;\ + border-radius: 3px;\ + background-color: lightgrey;\ + }\ + span:hover {\ + color: blue;\ + }\ + </style>\ + </head>\ + <body>"; + + if(onlyP0 == false){ + std::vector<char> buffer; + for(int idxProc = nbProcess-1 ; idxProc > 0 ; --idxProc){ + int sizeRecv; + retMpi = MPI_Recv(&sizeRecv, 1, MPI_INT, idxProc, 99, inComm, MPI_STATUS_IGNORE); + assert(retMpi == MPI_SUCCESS); + buffer.resize(sizeRecv+1); + retMpi = MPI_Recv(buffer.data(), sizeRecv, MPI_CHAR, idxProc, 100, inComm, MPI_STATUS_IGNORE); + assert(retMpi == MPI_SUCCESS); + buffer[sizeRecv]='\0'; + htmlfile << buffer.data(); + } + } + htmlfile << myResults.str(); + htmlfile << "</body>\ + </html>"; + } + } + + + std::stack<CoreEvent*> getCurrentThreadEvent() const { + return m_currentEventsStackPerThread[omp_get_thread_num()].top(); + } + + friend ScopeEvent; +}; + +/////////////////////////////////////////////////////////////// + +/** A scope event should be used + * to record the duration of a part of the code + * (section, scope, etc.). + * The timer is stopped automatically when the object is destroyed. + * The object cannot be copied/moved to ensure coherency in the + * events hierarchy.
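+ * + * Minimal usage sketch (illustration only; do_work is a hypothetical + * function, and the TIMEZONE macros are defined further down, for + * builds with USE_TIMINGOUTPUT): + * + * void do_work() { + * TIMEZONE("do_work"); // opens an event for this scope + * { + * TIMEZONE("inner"); // nested event, child of "do_work" + * // ... work to be timed ... + * } // "inner" event ends here + * } // "do_work" event ends here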
+ */ +class ScopeEvent { +protected: + //< The manager to refer to + EventManager& m_manager; + //< The core event + EventManager::CoreEvent* m_event; + //< Time to get elapsed time + bfps_timer m_timer; + //< Is true if it has been created for a task + bool m_isTask; + +public: + ScopeEvent(const std::string& inName, EventManager& inManager, + const std::string& inUniqueKey) + : m_manager(inManager), + m_event(inManager.getEvent(inName, inUniqueKey)), + m_isTask(false) { + m_timer.start(); + } + + ScopeEvent(const std::string& inName, EventManager& inManager, + const std::string& inUniqueKey, + const std::stack<EventManager::CoreEvent*>& inParentStack) + : m_manager(inManager), + m_event( + inManager.getEventFromContext(inName, inUniqueKey, inParentStack)), + m_isTask(true) { + m_timer.start(); + } + + ~ScopeEvent() { + m_event->addRecord(m_timer.stopAndGetElapsed(), m_isTask); + if (m_isTask == false) { + m_manager.popEvent(m_event); + } else { + m_manager.popContext(m_event); + } + } + + ScopeEvent(const ScopeEvent&) = delete; + ScopeEvent& operator=(const ScopeEvent&) = delete; + ScopeEvent(ScopeEvent&&) = delete; + ScopeEvent& operator=(ScopeEvent&&) = delete; +}; + +#define ScopeEventUniqueKey_Core_To_Str_Ext(X) #X +#define ScopeEventUniqueKey_Core_To_Str(X) \ + ScopeEventUniqueKey_Core_To_Str_Ext(X) +#define ScopeEventUniqueKey __FILE__ ScopeEventUniqueKey_Core_To_Str(__LINE__) + +#define ScopeEventMultiRefKey std::string("-- multiref event --") + +#ifdef USE_TIMINGOUTPUT + +extern EventManager global_timer_manager; + +#define TIMEZONE_Core_Merge(x, y) x##y +#define TIMEZONE_Core_Pre_Merge(x, y) TIMEZONE_Core_Merge(x, y) + +#define TIMEZONE(NAME) \ + ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \ + NAME, global_timer_manager, ScopeEventUniqueKey); +#define TIMEZONE_MULTI_REF(NAME) \ + ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \ + NAME, global_timer_manager, ScopeEventMultiRefKey); + +#define TIMEZONE_OMP_INIT_PRETASK(VARNAME) \ + auto VARNAME##core = global_timer_manager.getCurrentThreadEvent(); \ + auto VARNAME = &VARNAME##core; +#define TIMEZONE_OMP_TASK(NAME, VARNAME) \ + ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \ + NAME, global_timer_manager, ScopeEventUniqueKey, *VARNAME); +#define TIMEZONE_OMP_PRAGMA_TASK_KEY(VARNAME) \ + shared(global_timer_manager) firstprivate(VARNAME) + +#define TIMEZONE_OMP_INIT_PREPARALLEL(NBTHREADS) \ + global_timer_manager.startParallelRegion(NBTHREADS); + +#else + +#define TIMEZONE(NAME) +#define TIMEZONE_MULTI_REF(NAME) +#define TIMEZONE_OMP_INIT_PRETASK(VARNAME) +#define TIMEZONE_OMP_TASK(NAME, VARNAME) +#define TIMEZONE_OMP_PRAGMA_TASK_KEY(VARNAME) +#define TIMEZONE_OMP_INIT_PREPARALLEL(NBTHREADS) + +#endif + + +#endif diff --git a/bfps/cpp/shared_array.hpp b/bfps/cpp/shared_array.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1951e2f9838ccf37367d859206453d3db91e8e19 --- /dev/null +++ b/bfps/cpp/shared_array.hpp @@ -0,0 +1,110 @@ +#ifndef SHAREDARRAY_HPP +#define SHAREDARRAY_HPP + +#include <omp.h> +#include <cassert> +#include <functional> +#include <iostream> + +// Cannot be used by different parallel sections at the same time
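+// Usage sketch (illustration only; the array size and the loop body are +// hypothetical): each thread accumulates into its own copy of the array, +// obtained with getMine(), and the per-thread copies are then summed into +// the master copy: +// +// shared_array<double> acc(100, [](double* data){ +// std::fill_n(data, 100, 0.0); +// }); +// #pragma omp parallel +// { +// double* my_acc = acc.getMine(); +// // ... accumulate contributions into my_acc ... +// } +// acc.merge(); // sums every thread's copy into acc.getMasterData()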
+template <class ValueType> +class shared_array{ + int currentNbThreads; + ValueType** __restrict__ values; + size_t dim; + + std::function<void(ValueType*)> initFunc; + + bool hasBeenMerged; + +public: + shared_array(const size_t inDim) + : currentNbThreads(omp_get_max_threads()), + values(nullptr), dim(inDim), hasBeenMerged(false){ + values = new ValueType*[currentNbThreads]; + values[0] = new ValueType[dim]; + for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){ + values[idxThread] = nullptr; + } + } + + shared_array(const size_t inDim, std::function<void(ValueType*)> inInitFunc) + : shared_array(inDim){ + setInitFunction(inInitFunc); + } + + ~shared_array(){ + for(int idxThread = 0 ; idxThread < currentNbThreads ; ++idxThread){ + delete[] values[idxThread]; + } + delete[] values; + } + + ValueType* getMasterData(){ + return values[0]; + } + + const ValueType* getMasterData() const{ + return values[0]; + } + + void merge(){ + ValueType* __restrict__ dest = values[0]; + for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){ + if(values[idxThread]){ + const ValueType* __restrict__ src = values[idxThread]; + for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){ + dest[idxVal] += src[idxVal]; + } + } + } + hasBeenMerged = true; + } + + template <class Func> + void merge(Func func){ + ValueType* __restrict__ dest = values[0]; + for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){ + if(values[idxThread]){ + const ValueType* __restrict__ src = values[idxThread]; + for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){ + dest[idxVal] = func(idxVal, dest[idxVal], src[idxVal]); + } + } + } + hasBeenMerged = true; + } + + void mergeParallel(){ + merge(); // not done yet + } + + template <class Func> + void mergeParallel(Func func){ + merge(func); // not done yet + } + + void setInitFunction(std::function<void(ValueType*)> inInitFunc){ + initFunc = inInitFunc; + initFunc(values[0]); + } + + ValueType* getMine(){ + assert(omp_get_thread_num() < currentNbThreads); + + if(values[omp_get_thread_num()] == nullptr){ + ValueType* myValue = new ValueType[dim]; + if(initFunc){ + initFunc(myValue); + } + + values[omp_get_thread_num()] = myValue; + return myValue; + } + + return values[omp_get_thread_num()]; + } +}; + +#endif diff --git a/bfps/cpp/vorticity_equation.cpp b/bfps/cpp/vorticity_equation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a84e8a30aceefa9943c982b46389a3245aba2b34 --- /dev/null +++ b/bfps/cpp/vorticity_equation.cpp @@ -0,0 +1,716 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps.
If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + + + +#define NDEBUG + +#include <cassert> +#include <cmath> +#include <cstring> +#include "fftw_tools.hpp" +#include "vorticity_equation.hpp" +#include "scope_timer.hpp" + + + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::impose_zero_modes() +{ + TIMEZONE("vorticity_equation::impose_zero_modes"); + this->u->impose_zero_mode(); + this->v[0]->impose_zero_mode(); + this->v[1]->impose_zero_mode(); + this->v[2]->impose_zero_mode(); +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::update_checkpoint() +{ + std::string fname = this->get_current_fname(); + if (this->kk->layout->myrank == 0) + { + bool file_exists = false; + { + struct stat file_buffer; + file_exists = (stat(fname.c_str(), &file_buffer) == 0); + } + if (file_exists) + { + // check how many fields there are in the checkpoint file + // increment checkpoint if needed + hsize_t fields_stored; + hid_t fid, group_id; + fid = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + group_id = H5Gopen(fid, "vorticity/complex", H5P_DEFAULT); + H5Gget_num_objs( + group_id, + &fields_stored); + bool dset_exists = H5Lexists( + group_id, + std::to_string(this->iteration).c_str(), + H5P_DEFAULT); + H5Gclose(group_id); + H5Fclose(fid); + if ((fields_stored >= this->checkpoints_per_file) && + !dset_exists) + this->checkpoint++; + } + else + { + // create file, create fields_stored dset + hid_t fid = H5Fcreate( + fname.c_str(), + H5F_ACC_EXCL, + H5P_DEFAULT, + H5P_DEFAULT); + hid_t gg = H5Gcreate( + fid, + "vorticity", + H5P_DEFAULT, + H5P_DEFAULT, + H5P_DEFAULT); + hid_t ggg = H5Gcreate( + gg, + "complex", + H5P_DEFAULT, + H5P_DEFAULT, + H5P_DEFAULT); + H5Gclose(ggg); + H5Gclose(gg); + H5Fclose(fid); + } + } + MPI_Bcast(&this->checkpoint, 1, MPI_INT, 0, this->kk->layout->comm); +} + +template <class rnumber, + field_backend be> +vorticity_equation<rnumber, be>::vorticity_equation( + const char *NAME, + int nx, + int ny, + int nz, + double DKX, + double DKY, + double DKZ, + unsigned FFTW_PLAN_RIGOR) +{ + TIMEZONE("vorticity_equation::vorticity_equation"); + /* initialize name and basic stuff */ + strncpy(this->name, NAME, 256); + this->name[255] = '\0'; + this->iteration = 0; + this->checkpoint = 0; + + /* initialize fields */ + this->cvorticity = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->rvorticity = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->v[1] = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->v[2] = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->v[0] = this->cvorticity; + this->v[3] = this->cvorticity; + + this->cvelocity = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->rvelocity = new field<rnumber, be, THREE>( + nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR); + this->u = this->cvelocity; + + /* initialize kspace */ + this->kk = new kspace<be, SMOOTH>( + this->cvorticity->clayout, DKX, DKY, DKZ); + + /* ``physical'' parameters etc, initialized here just in case */ + + this->nu = 0.1; + this->fmode = 1; + this->famplitude = 1.0; + this->fk0 = 2.0; + this->fk1 = 4.0; +} + +template <class rnumber, + field_backend be> +vorticity_equation<rnumber, be>::~vorticity_equation() +{ + 
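+ /* note: v[0] and v[3] alias cvorticity, and u aliases cvelocity + * (these aliases are set in the constructor), so only v[1] and v[2] + * are separately allocated and deleted here; deleting the aliases + * as well would be a double free */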
TIMEZONE("vorticity_equation::~vorticity_equation"); + delete this->kk; + delete this->cvorticity; + delete this->rvorticity; + delete this->v[1]; + delete this->v[2]; + delete this->cvelocity; + delete this->rvelocity; +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::compute_vorticity() +{ + TIMEZONE("vorticity_equation::compute_vorticity"); + this->cvorticity->real_space_representation = false; + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + this->cvorticity->cval(cindex,0,0) = -(this->kk->ky[yindex]*this->u->cval(cindex,2,1) - this->kk->kz[zindex]*this->u->cval(cindex,1,1)); + this->cvorticity->cval(cindex,0,1) = (this->kk->ky[yindex]*this->u->cval(cindex,2,0) - this->kk->kz[zindex]*this->u->cval(cindex,1,0)); + this->cvorticity->cval(cindex,1,0) = -(this->kk->kz[zindex]*this->u->cval(cindex,0,1) - this->kk->kx[xindex]*this->u->cval(cindex,2,1)); + this->cvorticity->cval(cindex,1,1) = (this->kk->kz[zindex]*this->u->cval(cindex,0,0) - this->kk->kx[xindex]*this->u->cval(cindex,2,0)); + this->cvorticity->cval(cindex,2,0) = -(this->kk->kx[xindex]*this->u->cval(cindex,1,1) - this->kk->ky[yindex]*this->u->cval(cindex,0,1)); + this->cvorticity->cval(cindex,2,1) = (this->kk->kx[xindex]*this->u->cval(cindex,1,0) - this->kk->ky[yindex]*this->u->cval(cindex,0,0)); + //ptrdiff_t tindex = 3*cindex; + //this->cvorticity->get_cdata()[tindex+0][0] = -(this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][1]); + //this->cvorticity->get_cdata()[tindex+1][0] = -(this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][1]); + //this->cvorticity->get_cdata()[tindex+2][0] = -(this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][1]); + //this->cvorticity->get_cdata()[tindex+0][1] = (this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][0]); + //this->cvorticity->get_cdata()[tindex+1][1] = (this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][0]); + //this->cvorticity->get_cdata()[tindex+2][1] = (this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][0]); + } + else + std::fill_n((rnumber*)(this->cvorticity->get_cdata()+3*cindex), 6, 0.0); + } + ); + this->cvorticity->symmetrize(); +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::compute_velocity(field<rnumber, be, THREE> *vorticity) +{ + TIMEZONE("vorticity_equation::compute_velocity"); + this->u->real_space_representation = false; + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2 && k2 > 0) + { + this->u->cval(cindex,0,0) = -(this->kk->ky[yindex]*vorticity->cval(cindex,2,1) - this->kk->kz[zindex]*vorticity->cval(cindex,1,1)) / k2; + this->u->cval(cindex,0,1) = (this->kk->ky[yindex]*vorticity->cval(cindex,2,0) - this->kk->kz[zindex]*vorticity->cval(cindex,1,0)) / k2; + this->u->cval(cindex,1,0) = -(this->kk->kz[zindex]*vorticity->cval(cindex,0,1) - this->kk->kx[xindex]*vorticity->cval(cindex,2,1)) / k2; + this->u->cval(cindex,1,1) = (this->kk->kz[zindex]*vorticity->cval(cindex,0,0) - this->kk->kx[xindex]*vorticity->cval(cindex,2,0)) / k2; + 
this->u->cval(cindex,2,0) = -(this->kk->kx[xindex]*vorticity->cval(cindex,1,1) - this->kk->ky[yindex]*vorticity->cval(cindex,0,1)) / k2; + this->u->cval(cindex,2,1) = (this->kk->kx[xindex]*vorticity->cval(cindex,1,0) - this->kk->ky[yindex]*vorticity->cval(cindex,0,0)) / k2; + //ptrdiff_t tindex = 3*cindex; + //this->u->get_cdata()[tindex+0][0] = -(this->kk->ky[yindex]*vorticity->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*vorticity->get_cdata()[tindex+1][1]) / k2; + //this->u->get_cdata()[tindex+0][1] = (this->kk->ky[yindex]*vorticity->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*vorticity->get_cdata()[tindex+1][0]) / k2; + //this->u->get_cdata()[tindex+1][0] = -(this->kk->kz[zindex]*vorticity->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*vorticity->get_cdata()[tindex+2][1]) / k2; + //this->u->get_cdata()[tindex+1][1] = (this->kk->kz[zindex]*vorticity->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*vorticity->get_cdata()[tindex+2][0]) / k2; + //this->u->get_cdata()[tindex+2][0] = -(this->kk->kx[xindex]*vorticity->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*vorticity->get_cdata()[tindex+0][1]) / k2; + //this->u->get_cdata()[tindex+2][1] = (this->kk->kx[xindex]*vorticity->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*vorticity->get_cdata()[tindex+0][0]) / k2; + } + else + std::fill_n((rnumber*)(this->u->get_cdata()+3*cindex), 6, 0.0); + } + ); + this->u->symmetrize(); +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::add_forcing( + field<rnumber, be, THREE> *dst, + field<rnumber, be, THREE> *vort_field, + rnumber factor) +{ + TIMEZONE("vorticity_equation::add_forcing"); + if (strcmp(this->forcing_type, "none") == 0) + return; + if (strcmp(this->forcing_type, "Kolmogorov") == 0) + { + ptrdiff_t cindex; + if (this->cvorticity->clayout->myrank == this->cvorticity->clayout->rank[0][this->fmode]) + { + cindex = ((this->fmode - this->cvorticity->clayout->starts[0]) * this->cvorticity->clayout->sizes[1])*this->cvorticity->clayout->sizes[2]; + dst->cval(cindex,2, 0) -= this->famplitude*factor/2; + //dst->get_cdata()[cindex*3+2][0] -= this->famplitude*factor/2; + } + if (this->cvorticity->clayout->myrank == this->cvorticity->clayout->rank[0][this->cvorticity->clayout->sizes[0] - this->fmode]) + { + cindex = ((this->cvorticity->clayout->sizes[0] - this->fmode - this->cvorticity->clayout->starts[0]) * this->cvorticity->clayout->sizes[1])*this->cvorticity->clayout->sizes[2]; + dst->cval(cindex, 2, 0) -= this->famplitude*factor/2; + //dst->get_cdata()[cindex*3+2][0] -= this->famplitude*factor/2; + } + return; + } + if (strcmp(this->forcing_type, "linear") == 0) + { + this->kk->CLOOP( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + double knorm = sqrt(this->kk->kx[xindex]*this->kk->kx[xindex] + + this->kk->ky[yindex]*this->kk->ky[yindex] + + this->kk->kz[zindex]*this->kk->kz[zindex]); + if ((this->fk0 <= knorm) && + (this->fk1 >= knorm)) + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + dst->cval(cindex,c,i) += this->famplitude*vort_field->cval(cindex,c,i)*factor; + //dst->get_cdata()[cindex*3+c][i] += this->famplitude*vort_field->get_cdata()[cindex*3+c][i]*factor; + } + ); + return; + } +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::omega_nonlin( + int src) +{ + DEBUG_MSG("vorticity_equation::omega_nonlin(%d)\n", src); + assert(src >= 0 && src < 3); + this->compute_velocity(this->v[src]); + /* get fields from Fourier space to real space */ + this->u->ift(); + 
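+ /* the nonlinear term is computed pseudo-spectrally: velocity and + * vorticity are transformed to real space, the cross product + * u x omega is evaluated pointwise (normalized by npoints, because + * the FFTW transforms are unnormalized), and the product is then + * transformed back, curled and dealiased in Fourier space */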
this->rvorticity->real_space_representation = false; + *this->rvorticity = this->v[src]->get_cdata(); + this->rvorticity->ift(); + /* compute cross product $u \times \omega$, and normalize */ + this->u->RLOOP( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + //ptrdiff_t tindex = 3*rindex; + rnumber tmp[3]; + for (int cc=0; cc<3; cc++) + tmp[cc] = (this->u->rval(rindex,(cc+1)%3)*this->rvorticity->rval(rindex,(cc+2)%3) - + this->u->rval(rindex,(cc+2)%3)*this->rvorticity->rval(rindex,(cc+1)%3)); + //tmp[cc][0] = (this->u->get_rdata()[tindex+(cc+1)%3]*this->rvorticity->get_rdata()[tindex+(cc+2)%3] - + // this->u->get_rdata()[tindex+(cc+2)%3]*this->rvorticity->get_rdata()[tindex+(cc+1)%3]); + for (int cc=0; cc<3; cc++) + this->u->rval(rindex,cc) = tmp[cc] / this->u->npoints; + //this->u->get_rdata()[(3*rindex)+cc] = tmp[cc][0] / this->u->npoints; + } + ); + /* go back to Fourier space */ + //this->clean_up_real_space(this->ru, 3); + this->u->dft(); + this->kk->template dealias<rnumber, THREE>(this->u->get_cdata()); + /* $\imath k \times Fourier(u \times \omega)$ */ + this->kk->CLOOP( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + rnumber tmp[3][2]; + { + tmp[0][0] = -(this->kk->ky[yindex]*this->u->cval(cindex,2,1) - this->kk->kz[zindex]*this->u->cval(cindex,1,1)); + tmp[1][0] = -(this->kk->kz[zindex]*this->u->cval(cindex,0,1) - this->kk->kx[xindex]*this->u->cval(cindex,2,1)); + tmp[2][0] = -(this->kk->kx[xindex]*this->u->cval(cindex,1,1) - this->kk->ky[yindex]*this->u->cval(cindex,0,1)); + tmp[0][1] = (this->kk->ky[yindex]*this->u->cval(cindex,2,0) - this->kk->kz[zindex]*this->u->cval(cindex,1,0)); + tmp[1][1] = (this->kk->kz[zindex]*this->u->cval(cindex,0,0) - this->kk->kx[xindex]*this->u->cval(cindex,2,0)); + tmp[2][1] = (this->kk->kx[xindex]*this->u->cval(cindex,1,0) - this->kk->ky[yindex]*this->u->cval(cindex,0,0)); + } + //ptrdiff_t tindex = 3*cindex; + //{ + // tmp[0][0] = -(this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][1]); + // tmp[1][0] = -(this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][1]); + // tmp[2][0] = -(this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][1]); + // tmp[0][1] = (this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][0]); + // tmp[1][1] = (this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][0]); + // tmp[2][1] = (this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][0]); + //} + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->u->cval(cindex, cc, i) = tmp[cc][i]; + //this->u->get_cdata()[3*cindex+cc][i] = tmp[cc][i]; + } + ); + this->add_forcing(this->u, this->v[src], 1.0); + this->kk->template force_divfree<rnumber>(this->u->get_cdata()); +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::step(double dt) +{ + DEBUG_MSG("vorticity_equation::step\n"); + TIMEZONE("vorticity_equation::step"); + *this->v[1] = 0.0; + this->omega_nonlin(0); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + double factor0; + factor0 = exp(-this->nu * k2 * dt); + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + 
this->v[1]->cval(cindex,cc,i) = ( + this->v[0]->cval(cindex,cc,i) + + dt*this->u->cval(cindex,cc,i))*factor0; + //this->v[1]->get_cdata()[3*cindex+cc][i] = ( + // this->v[0]->get_cdata()[3*cindex+cc][i] + + // dt*this->u->get_cdata()[3*cindex+cc][i])*factor0; + } + } + ); + + this->omega_nonlin(1); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + double factor0, factor1; + factor0 = exp(-this->nu * k2 * dt/2); + factor1 = exp( this->nu * k2 * dt/2); + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->v[2]->cval(cindex, cc, i) = ( + 3*this->v[0]->cval(cindex,cc,i)*factor0 + + ( this->v[1]->cval(cindex,cc,i) + + dt*this->u->cval(cindex,cc,i))*factor1)*0.25; + //this->v[2]->get_cdata()[3*cindex+cc][i] = ( + // 3*this->v[0]->get_cdata()[3*cindex+cc][i]*factor0 + + // (this->v[1]->get_cdata()[3*cindex+cc][i] + + // dt*this->u->get_cdata()[3*cindex+cc][i])*factor1)*0.25; + } + } + ); + + this->omega_nonlin(2); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + double factor0; + factor0 = exp(-this->nu * k2 * dt * 0.5); + for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) + this->v[3]->cval(cindex,cc,i) = ( + this->v[0]->cval(cindex,cc,i)*factor0 + + 2*(this->v[2]->cval(cindex,cc,i) + + dt*this->u->cval(cindex,cc,i)))*factor0/3; + //this->v[3]->get_cdata()[3*cindex+cc][i] = ( + // this->v[0]->get_cdata()[3*cindex+cc][i]*factor0 + + // 2*(this->v[2]->get_cdata()[3*cindex+cc][i] + + // dt*this->u->get_cdata()[3*cindex+cc][i]))*factor0/3; + } + } + ); + + this->kk->template force_divfree<rnumber>(this->cvorticity->get_cdata()); + this->cvorticity->symmetrize(); + this->iteration++; +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::compute_pressure(field<rnumber, be, ONE> *pressure) +{ + TIMEZONE("vorticity_equation::compute_pressure"); + /* assume velocity is already in real space representation */ + + this->v[1]->real_space_representation = true; + /* diagonal terms 11 22 33 */ + this->v[1]->RLOOP ( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + //ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->v[1]->rval(rindex,cc) = this->u->rval(rindex,cc)*this->u->rval(rindex,cc); + //this->v[1]->get_rdata()[tindex+cc] = this->u->get_rdata()[tindex+cc]*this->u->get_rdata()[tindex+cc]; + } + ); + //this->clean_up_real_space(this->rv[1], 3); + this->v[1]->dft(); + this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata()); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2 && k2 > 0) + { + ptrdiff_t tindex = 3*cindex; + for (int i=0; i<2; i++) + { + pressure->get_cdata()[cindex][i] = \ + -(this->kk->kx[xindex]*this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][i] + + this->kk->ky[yindex]*this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][i] + + this->kk->kz[zindex]*this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][i]); + } + } + else + std::fill_n((rnumber*)(pressure->get_cdata()+cindex), 2, 0.0); + } + ); + /* off-diagonal terms 12 23 31 */ + this->v[1]->real_space_representation = true; + this->v[1]->RLOOP ( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + //ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->v[1]->rval(rindex,cc) = 
this->u->rval(rindex,cc)*this->u->rval(rindex,(cc+1)%3); + //this->v[1]->get_rdata()[tindex+cc] = this->u->get_rdata()[tindex+cc]*this->u->get_rdata()[tindex+(cc+1)%3]; + } + ); + //this->clean_up_real_space(this->rv[1], 3); + this->v[1]->dft(); + this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata()); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2 && k2 > 0) + { + ptrdiff_t tindex = 3*cindex; + for (int i=0; i<2; i++) + { + pressure->get_cdata()[cindex][i] -= \ + 2*(this->kk->kx[xindex]*this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][i] + + this->kk->ky[yindex]*this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][i] + + this->kk->kz[zindex]*this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][i]); + pressure->get_cdata()[cindex][i] /= pressure->npoints*k2; + } + } + } + ); +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::compute_Lagrangian_acceleration( + field<rnumber, be, THREE> *acceleration) +{ + field<rnumber, be, ONE> *pressure = new field<rnumber, be, ONE>( + this->cvelocity->rlayout->sizes[2], + this->cvelocity->rlayout->sizes[1], + this->cvelocity->rlayout->sizes[0], + this->cvelocity->rlayout->comm, + this->cvelocity->fftw_plan_rigor); + this->compute_velocity(this->cvorticity); + this->cvelocity->ift(); + this->compute_pressure(pressure); + this->compute_velocity(this->cvorticity); + acceleration->real_space_representation = false; + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + ptrdiff_t tindex = 3*cindex; + for (int cc=0; cc<3; cc++) + for (int i=0; i<2; i++) + acceleration->get_cdata()[tindex+cc][i] = \ + - this->nu*k2*this->cvelocity->get_cdata()[tindex+cc][i]; + if (strcmp(this->forcing_type, "linear") == 0) + { + double knorm = sqrt(k2); + if ((this->fk0 <= knorm) && + (this->fk1 >= knorm)) + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + acceleration->get_cdata()[tindex+c][i] += \ + this->famplitude*this->cvelocity->get_cdata()[tindex+c][i]; + } + acceleration->get_cdata()[tindex+0][0] += this->kk->kx[xindex]*pressure->get_cdata()[cindex][1]; + acceleration->get_cdata()[tindex+1][0] += this->kk->ky[yindex]*pressure->get_cdata()[cindex][1]; + acceleration->get_cdata()[tindex+2][0] += this->kk->kz[zindex]*pressure->get_cdata()[cindex][1]; + acceleration->get_cdata()[tindex+0][1] -= this->kk->kx[xindex]*pressure->get_cdata()[cindex][0]; + acceleration->get_cdata()[tindex+1][1] -= this->kk->ky[yindex]*pressure->get_cdata()[cindex][0]; + acceleration->get_cdata()[tindex+2][1] -= this->kk->kz[zindex]*pressure->get_cdata()[cindex][0]; + } + }); + delete pressure; +} + +template <class rnumber, + field_backend be> +void vorticity_equation<rnumber, be>::compute_Eulerian_acceleration( + field<rnumber, be, THREE> *acceleration) +{ + this->compute_velocity(this->cvorticity); + acceleration->real_space_representation = false; + /* put in linear terms */ + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + ptrdiff_t tindex = 3*cindex; + for (int cc=0; cc<3; cc++) + for (int i=0; i<2; i++) + acceleration->get_cdata()[tindex+cc][i] = \ + - this->nu*k2*this->cvelocity->get_cdata()[tindex+cc][i]; + if (strcmp(this->forcing_type, "linear") == 0) + { + double knorm = sqrt(k2); + if ((this->fk0 <= knorm) && + (this->fk1 >= 
knorm)) + { + for (int c=0; c<3; c++) + for (int i=0; i<2; i++) + acceleration->get_cdata()[tindex+c][i] += \ + this->famplitude*this->cvelocity->get_cdata()[tindex+c][i]; + } + } + } + } + ); + this->cvelocity->ift(); + /* compute uu */ + /* 11 22 33 */ + this->v[1]->real_space_representation = true; + this->cvelocity->RLOOP ( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + //ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->v[1]->rval(rindex,cc) = \ + this->cvelocity->rval(rindex,cc)*this->cvelocity->rval(rindex,cc) / this->cvelocity->npoints; + //this->v[1]->get_rdata()[tindex+cc] = this->cvelocity->get_rdata()[tindex+cc]*this->cvelocity->get_rdata()[tindex+cc] / this->cvelocity->npoints; + } + ); + this->v[1]->dft(); + this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata()); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + ptrdiff_t tindex = 3*cindex; + acceleration->get_cdata()[tindex+0][0] += + this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][1]; + acceleration->get_cdata()[tindex+0][1] += + -this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][0]; + acceleration->get_cdata()[tindex+1][0] += + this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][1]; + acceleration->get_cdata()[tindex+1][1] += + -this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][0]; + acceleration->get_cdata()[tindex+2][0] += + this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][1]; + acceleration->get_cdata()[tindex+2][1] += + -this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][0]; + } + } + ); + /* 12 23 31 */ + this->v[1]->real_space_representation = true; + this->cvelocity->RLOOP ( + [&](ptrdiff_t rindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex){ + //ptrdiff_t tindex = 3*rindex; + for (int cc=0; cc<3; cc++) + this->v[1]->rval(rindex,cc) = \ + this->cvelocity->rval(rindex,cc)*this->cvelocity->rval(rindex,(cc+1)%3) / this->cvelocity->npoints; + //this->v[1]->get_rdata()[tindex+cc] = this->cvelocity->get_rdata()[tindex+cc]*this->cvelocity->get_rdata()[tindex+(cc+1)%3] / this->cvelocity->npoints; + } + ); + this->v[1]->dft(); + this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata()); + this->kk->CLOOP_K2( + [&](ptrdiff_t cindex, + ptrdiff_t xindex, + ptrdiff_t yindex, + ptrdiff_t zindex, + double k2){ + if (k2 <= this->kk->kM2) + { + ptrdiff_t tindex = 3*cindex; + acceleration->get_cdata()[tindex+0][0] += + (this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][1] + + this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][1]); + acceleration->get_cdata()[tindex+0][1] += + - (this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][0] + + this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][0]); + acceleration->get_cdata()[tindex+1][0] += + (this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][1] + + this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][1]); + acceleration->get_cdata()[tindex+1][1] += + - (this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][0] + + this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][0]); + acceleration->get_cdata()[tindex+2][0] += + (this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][1] + + this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][1]); + acceleration->get_cdata()[tindex+2][1] += + - (this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][0] + + this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][0]); + } + } + ); + if 
(this->kk->layout->myrank == this->kk->layout->rank[0][0]) + std::fill_n((rnumber*)(acceleration->get_cdata()), 6, 0.0); + this->kk->template force_divfree<rnumber>(acceleration->get_cdata()); +} + + +/*****************************************************************************/ + + + + +/*****************************************************************************/ +/* finally, force generation of code for single and double precision */ +template class vorticity_equation<float, FFTW>; +template class vorticity_equation<double, FFTW>; +/*****************************************************************************/ + diff --git a/bfps/cpp/vorticity_equation.hpp b/bfps/cpp/vorticity_equation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..60d566ed9f149c5a5e4848a2b4640c7378b05e98 --- /dev/null +++ b/bfps/cpp/vorticity_equation.hpp @@ -0,0 +1,137 @@ +/********************************************************************** +* * +* Copyright 2015 Max Planck Institute * +* for Dynamics and Self-Organization * +* * +* This file is part of bfps. * +* * +* bfps is free software: you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published * +* by the Free Software Foundation, either version 3 of the License, * +* or (at your option) any later version. * +* * +* bfps is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with bfps. If not, see <http://www.gnu.org/licenses/> * +* * +* Contact: Cristian.Lalescu@ds.mpg.de * +* * +**********************************************************************/ + +#include <sys/stat.h> +#include <stdio.h> +#include <stdlib.h> +#include <iostream> + +#include "field.hpp" +#include "field_descriptor.hpp" + +#ifndef VORTICITY_EQUATION + +#define VORTICITY_EQUATION + +extern int myrank, nprocs; + + +/* container for field descriptor, fields themselves, parameters, etc + * This particular class is only meant as a stepping stone to a proper solver + * that only uses the field class (and related layout and kspace classes), and + * HDF5 for I/O.
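+ * + * The time stepping implemented in the step() method (see + * vorticity_equation.cpp) treats the viscous term exactly through + * integrating factors E(s) = exp(-nu k^2 s), and advances the remaining + * terms with a three-stage, third-order Runge-Kutta scheme; writing + * N(w) for the nonlinear term and w0..w3 for the v[0..3] fields, the + * update reads: + * + * w1 = E(dt) (w0 + dt N(w0)) + * w2 = 3/4 E(dt/2) w0 + 1/4 E(-dt/2) (w1 + dt N(w1)) + * w3 = 1/3 E(dt) w0 + 2/3 E(dt/2) (w2 + dt N(w2))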
+ * */ + +template <typename rnumber, + field_backend be> +class vorticity_equation +{ + public: + /* name */ + char name[256]; + + /* iteration */ + int iteration; + int checkpoint; + int checkpoints_per_file; + + /* fields */ + field<rnumber, be, THREE> *cvorticity, *cvelocity; + field<rnumber, be, THREE> *rvorticity, *rvelocity; + kspace<be, SMOOTH> *kk; + + + /* short names for velocity, and 4 vorticity fields */ + field<rnumber, be, THREE> *u, *v[4]; + + /* physical parameters */ + double nu; + int fmode; // for Kolmogorov flow + double famplitude; // both for Kflow and band forcing + double fk0, fk1; // for band forcing + char forcing_type[128]; + + /* constructor, destructor */ + vorticity_equation( + const char *NAME, + int nx, + int ny, + int nz, + double DKX = 1.0, + double DKY = 1.0, + double DKZ = 1.0, + unsigned FFTW_PLAN_RIGOR = FFTW_MEASURE); + ~vorticity_equation(void); + + /* solver essential methods */ + void omega_nonlin(int src); + void step(double dt); + void impose_zero_modes(void); + void add_forcing(field<rnumber, be, THREE> *dst, + field<rnumber, be, THREE> *src_vorticity, + rnumber factor); + void compute_vorticity(void); + void compute_velocity(field<rnumber, be, THREE> *vorticity); + + /* I/O stuff */ + inline std::string get_current_fname() + { + return ( + std::string(this->name) + + std::string("_checkpoint_") + + std::to_string(this->checkpoint) + + std::string(".h5")); + } + void update_checkpoint(void); + inline void io_checkpoint(bool read = true) + { + assert(!this->cvorticity->real_space_representation); + if (!read) + this->update_checkpoint(); + std::string fname = this->get_current_fname(); + this->cvorticity->io( + fname, + "vorticity", + this->iteration, + read); + if (read) + { + #if (__GNUC__ <= 4 && __GNUC_MINOR__ <= 7) + this->kk->low_pass<rnumber, THREE>(this->cvorticity->get_cdata(), this->kk->kM); + this->kk->force_divfree<rnumber>(this->cvorticity->get_cdata()); + #else + this->kk->template low_pass<rnumber, THREE>(this->cvorticity->get_cdata(), this->kk->kM); + this->kk->template force_divfree<rnumber>(this->cvorticity->get_cdata()); + #endif + } + } + + /* statistics and general postprocessing */ + void compute_pressure(field<rnumber, be, ONE> *pressure); + void compute_Eulerian_acceleration(field<rnumber, be, THREE> *acceleration); + void compute_Lagrangian_acceleration(field<rnumber, be, THREE> *acceleration); +}; + +#endif//VORTICITY_EQUATION + diff --git a/bfps/tools.py b/bfps/tools.py index ff5d365aa979fd0c98b9ab64fe8a2a5404f05474..69756ec648409ab52d57930d26b1ab1ca8b942c1 100644 --- a/bfps/tools.py +++ b/bfps/tools.py @@ -28,6 +28,36 @@ import sys import math import numpy as np +import h5py + +def create_alloc_early_dataset( + data_file, + dset_name, + dset_shape, + dset_maxshape, + dset_chunks, + # maybe something more general can be used here + dset_dtype = h5py.h5t.IEEE_F64LE): + # create the dataspace. + space_id = h5py.h5s.create_simple( + dset_shape, + dset_maxshape) + # create the dataset creation property list. + dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE) + # set the allocation time to "early". 
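+ # ("early" means HDF5 reserves file space for the whole dataset at + # creation time, rather than lazily when data is first written, so the + # file layout is fixed before any data goes in)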
+ dcpl.set_alloc_time(h5py.h5d.ALLOC_TIME_EARLY) + dcpl.set_chunk(dset_chunks) + # and now create the dataset + if sys.version_info[0] == 3: + dset_name = dset_name.encode() + return h5py.h5d.create( + data_file.id, + dset_name, + dset_dtype, + space_id, + dcpl, + h5py.h5p.DEFAULT) + def generate_data_3D_uniform( + n0, n1, n2, + dtype = np.complex128, diff --git a/documentation/_static/overview.rst b/documentation/_static/overview.rst index 607cfcc4774cbfd583240d2e7f9bad3cc766af80..afe7a753666e6ea5911ce1266d0803aa25ea5c45 100644 --- a/documentation/_static/overview.rst +++ b/documentation/_static/overview.rst @@ -2,6 +2,65 @@ Overview and Tutorial ===================== +---------------- +General comments +---------------- + +The purpose of this code is to run pseudo-spectral DNS of turbulence, +and integrate particle trajectories in the resulting fields. +In brief, the main aim of the code is to simplify the launching of +compute jobs and postprocessing, up to and including the generation of +publication-ready figures. + +For research, people routinely write code from scratch because research +goals change to a point where modifying the previous code is too +expensive. +With bfps, the desire is to identify core functionality that should be +implemented in a library. +The core library can then be used by many problem-specific codes. + +In this sense, the structuring of the code-base is non-standard. +The core functionality is implemented in C++ (classes for working +with fields or sets of particles), while a python +wrapper is used for generating "main" programmes to be linked against +the core library. +The core library uses MPI for parallelization, and the python wrapper +compiles this core library when being installed. +The compilation environment can be configured for different +machines as required. + +Python3 "wrapper" +----------------- + +In principle, users of the code should only need to use python3 for +launching jobs and postprocessing data. +While python2 compatibility should not be too hard to maintain, +string handling makes it a bit cumbersome --- +the code makes extensive use of strings for `HDF5` I/O. + +Classes defined in the python package can be used to generate executable +codes, to compile and launch them, and then to access and postprocess the +data. +Obviously, postprocessing methods can be optimized with C extensions or +otherwise, as needed. + +Code generation is quite straightforward, with C++ code snippets handled +as strings in the python code, such that they can be combined in +different ways. + +Once a "main" file has been written, it is compiled and linked against +the core library. +Depending on machine-specific settings, the code can then be launched +directly, or job scripts appropriate for queueing systems are generated +and submitted. + +C++ core library +---------------- + +A small set of base classes is implemented.
+ +[ some details to be added here ] + --------- Equations --------- diff --git a/documentation/figs/interpolation.py b/documentation/figs/interpolation.py new file mode 100644 index 0000000000000000000000000000000000000000..302efcc157971b8b0407bb76bd3e7be6437f1206 --- /dev/null +++ b/documentation/figs/interpolation.py @@ -0,0 +1,52 @@ +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import math + +def main(): + slab = 2 + nproc = 5 + f = plt.figure(figsize = (6, 4.5)) + a = f.add_subplot(111) + for p in range(nproc): + color = plt.get_cmap('plasma')(p*1./nproc) + a.add_patch( + mpatches.Rectangle( + [0, p*slab], + slab*(nproc+2)-1, 1, + color = color, + alpha = .2)) + a.text(-.5, p*slab+.5, '$p_{0}$'.format(p), + verticalalignment = 'center') + for y in range((nproc+2)*slab): + a.plot([y, y], + range(p*slab, (p+1)*slab), + marker = '.', + linestyle = 'none', + color = color) + for X, Y in [(9.9, 6.3), + (3.3, 3.7)]: + a.plot([X], [Y], + color = 'black', + marker = 'x') + for n in [1, 2]: + a.add_patch( + mpatches.Rectangle( + [math.floor(X-n), math.floor(Y-n)], + 2*n+1, 2*n+1, + color = 'green', + alpha = .2)) + a.text(math.floor(X)+.5, math.floor(Y - n)-.3, + '$n = {0}$'.format(n), + horizontalalignment = 'center') + a.set_ylim(bottom = -1, top = 10) + a.set_xlim(left = -1) + a.set_ylabel('$z$') + a.set_xlabel('$x,y$') + a.set_aspect('equal') + f.tight_layout() + f.savefig('interp_problem.pdf') + return None + +if __name__ == '__main__': + main() + diff --git a/done.txt b/done.txt deleted file mode 100644 index 2064592cc9dd7a6e278c9980770882e636b8a2be..0000000000000000000000000000000000000000 --- a/done.txt +++ /dev/null @@ -1,21 +0,0 @@ -x 2015-12-04 make code py3 compatible @python3 -x 2015-12-23 decide on versioning system +merge0 -x 2015-12-24 move get_grid coords to interpolator @optimization +v1.0 -x 2015-12-25 get rid of temporal interpolation @optimization +v1.0 -x 2015-12-26 call interpolation only when needed @optimization +v1.0 -x 2015-12-26 clean up tox files, make sure all tests run @tests +v1.0 -x 2016-01-03 check divfree function -x 2016-01-03 compute kMeta(t) as well -x 2016-01-03 split library into core and extra @optimization +v1.0 -x 2016-01-07 FFTW interpolator doesn't need its own field @optimization +v1.0 +particle_api -x 2016-01-08 simplify tracer/field addition mechanism @design +v1.0 +particle_api -x 2016-01-08 add stat choice parameter to add_particles @design +v1.0 +particle_api -x 2016-01-15 particle output is broken when niter_part != 1 @bugfix -x 2016-01-19 clean up machine_settings mess @design @documentation +v2.0 -x 2016-01-24 clear delimitation of public API @documentation +v1.0 -x 2016-01-24 document coordinate conventions @documentation +v1.0 -x 2016-01-24 move parameters from _fluid_particle_base to NavierStokes etc @design -x 2016-01-29 install_info should be renamed to bfps_info in data file -x 2016-02-01 tweak HDF5 settings @optimization @HDF5 +I/O -x 2016-03-02 code overview @documentation -x 2016-04-29 use HDF5 io for fields @design @HDF5 +I/O diff --git a/examples/NS0SliceParticles.py b/examples/NS0SliceParticles.py new file mode 100644 index 0000000000000000000000000000000000000000..7c089405988a1c6eef6a1c7649e11c7a4a6edcaa --- /dev/null +++ b/examples/NS0SliceParticles.py @@ -0,0 +1,126 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. 
# + # + # bfps is free software: you can redistribute it and/or modify # + # it under the terms of the GNU General Public License as published # + # by the Free Software Foundation, either version 3 of the License, # + # or (at your option) any later version. # + # # + # bfps is distributed in the hope that it will be useful, # + # but WITHOUT ANY WARRANTY; without even the implied warranty of # + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # + # GNU General Public License for more details. # + # # + # You should have received a copy of the GNU General Public License # + # along with bfps. If not, see <http://www.gnu.org/licenses/> # + # # + # Contact: Cristian.Lalescu@ds.mpg.de # + # # +####################################################################### + + + +import os +import sys +import bfps +import numpy as np + +class NS0SliceParticles(bfps.NavierStokes): + """ + Example of how bfps is envisioned to be used. + Standard NavierStokes class is inherited, and then new functionality + added on top. + In particular, this class will run a DNS with particles starting on a + square grid in the z=0 slice of the field. + """ + standard_names = ['NS0SP', + 'NS0SP-single', + 'NS0SP-double'] + def __init__( + self, + name = 'NS0SliceParticles-v' + bfps.__version__, + **kwargs): + bfps.NavierStokes.__init__( + self, + name = name, + **kwargs) + return None + def specific_parser_arguments( + self, + parser): + bfps.NavierStokes.specific_parser_arguments(self, parser) + parser.add_argument( + '--pcloudX', + type = float, + dest = 'pcloudX', + default = 0.0) + parser.add_argument( + '--pcloudY', + type = float, + dest = 'pcloudY', + default = 0.0) + return None + def launch_jobs( + self, + opt = None): + if not os.path.exists(os.path.join(self.work_dir, self.simname + '.h5')): + particle_initial_condition = None + if self.parameters['nparticles'] > 0: + # the extra dimension of 1 is because I want + # a single chunk of particles.
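+ # (the resulting array has shape (1, nparticles, nparticles, 3): the + # x and y coordinates are filled in below by broadcasting xvals and + # yvals, so the tracers start on an nparticles x nparticles grid in + # the z = 0 plane)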
+                particle_initial_condition = np.zeros(
+                        (1,
+                         self.parameters['nparticles'],
+                         self.parameters['nparticles'],
+                         3),
+                        dtype = np.float64)
+                xvals = (opt.pcloudX +
+                         np.linspace(-opt.particle_cloud_size/2,
+                                     opt.particle_cloud_size/2,
+                                     self.parameters['nparticles']))
+                yvals = (opt.pcloudY +
+                         np.linspace(-opt.particle_cloud_size/2,
+                                     opt.particle_cloud_size/2,
+                                     self.parameters['nparticles']))
+                particle_initial_condition[..., 0] = xvals[None, None, :]
+                particle_initial_condition[..., 1] = yvals[None, :, None]
+            self.write_par(
+                    particle_ic = particle_initial_condition)
+            if self.parameters['nparticles'] > 0:
+                data = self.generate_tracer_state(
+                        species = 0,
+                        rseed = opt.particle_rand_seed,
+                        data = particle_initial_condition)
+            init_condition_file = os.path.join(
+                    self.work_dir,
+                    self.simname + '_cvorticity_i{0:0>5x}'.format(0))
+            if not os.path.exists(init_condition_file):
+                if len(opt.src_simname) > 0:
+                    src_file = os.path.join(
+                            os.path.realpath(opt.src_work_dir),
+                            opt.src_simname + '_cvorticity_i{0:0>5x}'.format(opt.src_iteration))
+                    os.symlink(src_file, init_condition_file)
+                else:
+                    self.generate_vector_field(
+                            write_to_file = True,
+                            spectra_slope = 2.0,
+                            amplitude = 0.05)
+        self.run(
+                ncpu = opt.ncpu,
+                njobs = opt.njobs,
+                hours = opt.minutes // 60,
+                minutes = opt.minutes % 60)
+        return None
+
+def main():
+    c = NS0SliceParticles()
+    c.launch(args = sys.argv[1:])
+    return None
+
+if __name__ == '__main__':
+    main()
+
diff --git a/examples/NSBufferedParticles.py b/examples/NSBufferedParticles.py
new file mode 100644
index 0000000000000000000000000000000000000000..34906576d62e2b2cac68f2d6c261129b23d667b7
--- /dev/null
+++ b/examples/NSBufferedParticles.py
@@ -0,0 +1,51 @@
+import bfps
+import argparse
+import sys
+
+class NSBufferedParticles(bfps.NavierStokes):
+    """
+    Another example.
+    This class behaves identically to NavierStokes, except that it uses a
+    buffered interpolator, and the corresponding distributed_particles class.
+    """
+    standard_names = ['NSBP',
+                      'NSBP-single',
+                      'NSBP-double']
+    def launch(
+            self,
+            args = [],
+            noparticles = False,
+            **kwargs):
+        self.name = 'NSBufferedParticles-v' + bfps.__version__
+        opt = self.prepare_launch(args = args)
+        self.fill_up_fluid_code()
+        if noparticles:
+            opt.nparticles = 0
+        elif type(opt.nparticles) == int:
+            if opt.nparticles > 0:
+                self.name += '-particles'
+                self.add_3D_rFFTW_field(
+                        name = 'rFFTW_acc')
+                self.add_interpolator(
+                        name = 'cubic_spline',
+                        neighbours = opt.neighbours,
+                        smoothness = opt.smoothness,
+                        class_name = 'interpolator')
+                self.add_particles(
+                        integration_steps = [4],
+                        interpolator = 'cubic_spline',
+                        acc_name = 'rFFTW_acc',
+                        class_name = 'distributed_particles')
+        self.finalize_code()
+        self.launch_jobs(opt = opt)
+        return None
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog = 'NSBufferedParticles')
+    parser.add_argument(
+            '-v', '--version',
+            action = 'version',
+            version = '%(prog)s ' + bfps.__version__)
+    c = NSBufferedParticles(fluid_precision = 'single')
+    c.launch(args = sys.argv[1:])
+
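For reference, the broadcasting used in ``NS0SliceParticles.launch_jobs`` above can be checked in isolation. A minimal sketch (the value ``nparticles = 3`` is arbitrary, chosen only for a readable printout):

.. code:: python

    import numpy as np

    nparticles = 3
    ic = np.zeros((1, nparticles, nparticles, 3), dtype = np.float64)
    xvals = np.linspace(-1., 1., nparticles)
    yvals = np.linspace(-1., 1., nparticles)
    ic[..., 0] = xvals[None, None, :]  # x varies along the third axis
    ic[..., 1] = yvals[None, :, None]  # y varies along the second axis
    # all nine tracers sit on a 3 x 3 grid in the z = 0 plane:
    print(ic.reshape(-1, 3))

Each tracer keeps a zero third component, which is what places the whole cloud in the ``z = 0`` slice.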
diff --git a/examples/NavierStokesDB.py b/examples/NavierStokesDB.py
new file mode 100644
index 0000000000000000000000000000000000000000..d099ad308e8fa47aea08275bc80694da796465b2
--- /dev/null
+++ b/examples/NavierStokesDB.py
@@ -0,0 +1,112 @@
+#######################################################################
+#                                                                     #
+#                 Copyright 2015 Max Planck Institute                 #
+#                  for Dynamics and Self-Organization                 #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the       #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps. If not, see <http://www.gnu.org/licenses/>        #
+#                                                                     #
+#  Contact: Cristian.Lalescu@ds.mpg.de                                #
+#                                                                     #
+#######################################################################
+
+
+
+import os
+import h5py
+import bfps
+
+class NavierStokesDB(bfps.NavierStokes):
+    """
+    Example of how bfps is envisioned to be used.
+    Standard NavierStokes class is inherited, and then new functionality
+    added on top.
+    In particular, this class will generate an HDF5 file containing a 5D
+    array representing the time history of the velocity field.
+    Snapshots are saved every "niter_stat" iterations.
+
+    No effort was spent on optimizing the HDF5 file access, since the code
+    was only used for a teeny DNS of 72^3 so far.
+    """
+    standard_names = ['NSDB',
+                      'NSDB-single',
+                      'NSDB-double']
+    def __init__(
+            self,
+            name = 'NavierStokesDataBase-v' + bfps.__version__,
+            **kwargs):
+        bfps.NavierStokes.__init__(
+                self,
+                name = name,
+                **kwargs)
+        self.file_datasets_grow += """
+                    {
+                        if (myrank == 0)
+                        {
+                            hid_t database_file;
+                            char dbfname[256];
+                            sprintf(dbfname, "%s_field_database.h5", simname);
+                            database_file = H5Fopen(dbfname, H5F_ACC_RDWR, H5P_DEFAULT);
+                            hid_t dset = H5Dopen(database_file, "rvelocity", H5P_DEFAULT);
+                            grow_single_dataset(dset, niter_todo/niter_stat);
+                            H5Dclose(dset);
+                            H5Fclose(database_file);
+                        }
+                    }
+                    """
+        self.stat_src += """
+        {
+            fs->compute_velocity(fs->cvorticity);
+            *tmp_vec_field = fs->cvelocity;
+            tmp_vec_field->ift();
+            char dbfname[256];
+            sprintf(dbfname, "%s_field_database.h5", simname);
+            tmp_vec_field->io(dbfname, "rvelocity", fs->iteration / niter_stat, false);
+        }
+        """
+        return None
+    def get_database_file_name(self):
+        return os.path.join(self.work_dir, self.simname + '_field_database.h5')
+    def get_database_file(self):
+        return h5py.File(self.get_database_file_name(), 'r')
+    def write_par(
+            self,
+            iter0 = 0,
+            **kwargs):
+        bfps.NavierStokes.write_par(
+                self,
+                iter0 = iter0,
+                **kwargs)
+        with h5py.File(self.get_database_file_name(), 'a') as ofile:
+            ofile.create_dataset(
+                    'rvelocity',
+                    (1,
+                     self.parameters['nz'],
+                     self.parameters['ny'],
+                     self.parameters['nx'],
+                     3),
+                    chunks = (1,
+                              self.parameters['nz'],
+                              self.parameters['ny'],
+                              self.parameters['nx'],
+                              3),
+                    maxshape = (None,
+                                self.parameters['nz'],
+                                self.parameters['ny'],
+                                self.parameters['nx'],
+                                3),
+                    dtype = self.rtype)
+        return None
+
diff --git a/machine_settings_py.py b/machine_settings_py.py
index 22123e391aa14151e2f1d4b4c8c0b5c8d6a1c435..787f1d5a10b9b0b260b42a1da18d35e67c56dacc 100644
--- a/machine_settings_py.py
+++ b/machine_settings_py.py
@@ -37,6 +37,7 @@ import os
 
 hostname = os.getenv('HOSTNAME')
+compiler = 'g++'
 extra_compile_args = ['-Wall', '-O2', '-g', '-mtune=native', '-ffast-math', '-std=c++11']
 extra_libraries = ['hdf5']
 include_dirs = []
diff --git a/setup.py b/setup.py
index c9bbc9c1d956d4d74d6344e19d1d220b1ff12b0b..e1d85b38a95a4a47186e74c44a1e4aeb52098da2 100644
--- a/setup.py
+++ 
b/setup.py @@ -52,7 +52,7 @@ if not os.path.exists(os.path.join(bfpsfolder, 'host_information.py')): shutil.copyfile('./machine_settings_py.py', os.path.join(bfpsfolder, 'machine_settings.py')) sys.path.insert(0, bfpsfolder) # import stuff required for compilation of static library -from machine_settings import include_dirs, library_dirs, extra_compile_args, extra_libraries +from machine_settings import compiler, include_dirs, library_dirs, extra_compile_args, extra_libraries ### package versioning @@ -88,7 +88,10 @@ print('This is bfps version ' + VERSION) ### lists of files and MANIFEST.in -src_file_list = ['field', +src_file_list = ['vorticity_equation', + 'field', + 'kspace', + 'field_layout', 'field_descriptor', 'rFFTW_distributed_particles', 'distributed_particles', @@ -107,11 +110,34 @@ src_file_list = ['field', 'spline_n4', 'spline_n5', 'spline_n6', - 'Lagrange_polys'] + 'Lagrange_polys', + 'scope_timer'] + +particle_headers = [ + 'cpp/particles/abstract_particles_distr.hpp', + 'cpp/particles/abstract_particles_input.hpp', + 'cpp/particles/abstract_particles_output.hpp', + 'cpp/particles/abstract_particles_system.hpp', + 'cpp/particles/alltoall_exchanger.hpp', + 'cpp/particles/field_accessor.hpp', + 'cpp/particles/particles_adams_bashforth.hpp', + 'cpp/particles/particles_field_computer.hpp', + 'cpp/particles/particles_input_hdf5.hpp', + 'cpp/particles/particles_interp_spline.hpp', + 'cpp/particles/particles_output_hdf5.hpp', + 'cpp/particles/particles_output_mpiio.hpp', + 'cpp/particles/particles_system_builder.hpp', + 'cpp/particles/particles_system.hpp', + 'cpp/particles/particles_utils.hpp'] header_list = (['cpp/base.hpp'] + + ['cpp/fftw_interface.hpp'] + + ['cpp/bfps_timer.hpp'] + + ['cpp/omputils.hpp'] + + ['cpp/shared_array.hpp'] + ['cpp/' + fname + '.hpp' - for fname in src_file_list]) + for fname in src_file_list] + + particle_headers) with open('MANIFEST.in', 'w') as manifest_in_file: for fname in (['bfps/cpp/' + ff + '.cpp' for ff in src_file_list] + @@ -121,77 +147,86 @@ with open('MANIFEST.in', 'w') as manifest_in_file: ### libraries -libraries = ['fftw3_mpi', - 'fftw3', - 'fftw3f_mpi', - 'fftw3f'] -libraries += extra_libraries - - - -### save compiling information -pickle.dump( - {'include_dirs' : include_dirs, - 'library_dirs' : library_dirs, - 'extra_compile_args' : extra_compile_args, - 'libraries' : libraries, - 'install_date' : now, - 'VERSION' : VERSION, - 'git_revision' : git_revision}, - open('bfps/install_info.pickle', 'wb'), - protocol = 2) - - - -def compile_bfps_library(): - if not os.path.isdir('obj'): - os.makedirs('obj') - need_to_compile = True - else: - ofile = 'bfps/libbfps.a' - libtime = datetime.datetime.fromtimestamp(os.path.getctime(ofile)) - latest = libtime - for fname in header_list: - latest = max(latest, - datetime.datetime.fromtimestamp(os.path.getctime('bfps/' + fname))) - need_to_compile = (latest > libtime) - for fname in src_file_list: - ifile = 'bfps/cpp/' + fname + '.cpp' - ofile = 'obj/' + fname + '.o' - if not os.path.exists(ofile): - need_to_compile_file = True - else: - need_to_compile_file = (need_to_compile or - (datetime.datetime.fromtimestamp(os.path.getctime(ofile)) < - datetime.datetime.fromtimestamp(os.path.getctime(ifile)))) - if need_to_compile_file: - command_strings = ['g++', '-c'] - command_strings += ['bfps/cpp/' + fname + '.cpp'] - command_strings += ['-o', 'obj/' + fname + '.o'] - command_strings += extra_compile_args - command_strings += ['-I' + idir for idir in include_dirs] - command_strings.append('-Ibfps/cpp/') 
- print(' '.join(command_strings)) - assert(subprocess.call(command_strings) == 0) - command_strings = ['ar', 'rvs', 'bfps/libbfps.a'] - command_strings += ['obj/' + fname + '.o' for fname in src_file_list] - print(' '.join(command_strings)) - assert(subprocess.call(command_strings) == 0) - return None - -from distutils.command.build import build as DistutilsBuild -from distutils.command.install import install as DistutilsInstall - -class CustomBuild(DistutilsBuild): +libraries = extra_libraries + + +import distutils.cmd + +class CompileLibCommand(distutils.cmd.Command): + description = 'Compile bfps library.' + user_options = [ + ('timing-output=', None, 'Toggle timing output.'), + ('fftw-estimate=', None, 'Use FFTW ESTIMATE.'), + ('disable-fftw-omp=', None, 'Turn Off FFTW OpenMP.'), + ] + def initialize_options(self): + self.timing_output = 0 + self.fftw_estimate = 0 + self.disable_fftw_omp = 0 + return None + def finalize_options(self): + self.timing_output = (int(self.timing_output) == 1) + self.fftw_estimate = (int(self.fftw_estimate) == 1) + self.disable_fftw_omp = (int(self.disable_fftw_omp) == 1) + return None def run(self): - compile_bfps_library() - DistutilsBuild.run(self) - -# this custom install leads to a broken installation. no idea why... -class CustomInstall(DistutilsInstall): - def run(self): - compile_bfps_library() - DistutilsInstall.run(self) + if not os.path.isdir('obj'): + os.makedirs('obj') + need_to_compile = True + if not os.path.isfile('bfps/libbfps.a'): + need_to_compile = True + else: + ofile = 'bfps/libbfps.a' + libtime = datetime.datetime.fromtimestamp(os.path.getctime(ofile)) + latest = libtime + for fname in header_list: + latest = max(latest, + datetime.datetime.fromtimestamp(os.path.getctime('bfps/' + fname))) + need_to_compile = (latest > libtime) + eca = extra_compile_args + eca += ['-fPIC'] + if self.timing_output: + eca += ['-DUSE_TIMINGOUTPUT'] + if self.fftw_estimate: + eca += ['-DUSE_FFTWESTIMATE'] + if self.disable_fftw_omp: + eca += ['-DNO_FFTWOMP'] + for fname in src_file_list: + ifile = 'bfps/cpp/' + fname + '.cpp' + ofile = 'obj/' + fname + '.o' + if not os.path.exists(ofile): + need_to_compile_file = True + else: + need_to_compile_file = (need_to_compile or + (datetime.datetime.fromtimestamp(os.path.getctime(ofile)) < + datetime.datetime.fromtimestamp(os.path.getctime(ifile)))) + if need_to_compile_file: + command_strings = [compiler, '-c'] + command_strings += ['bfps/cpp/' + fname + '.cpp'] + command_strings += ['-o', 'obj/' + fname + '.o'] + command_strings += eca + command_strings += ['-I' + idir for idir in include_dirs] + command_strings.append('-Ibfps/cpp/') + print(' '.join(command_strings)) + subprocess.check_call(command_strings) + command_strings = ['ar', 'rvs', 'bfps/libbfps.a'] + command_strings += ['obj/' + fname + '.o' for fname in src_file_list] + print(' '.join(command_strings)) + subprocess.check_call(command_strings) + + ### save compiling information + pickle.dump( + {'include_dirs' : include_dirs, + 'library_dirs' : library_dirs, + 'compiler' : compiler, + 'extra_compile_args' : eca, + 'libraries' : libraries, + 'install_date' : now, + 'VERSION' : VERSION, + 'git_revision' : git_revision}, + open('bfps/install_info.pickle', 'wb'), + protocol = 2) + return None from setuptools import setup @@ -199,7 +234,7 @@ setup( name = 'bfps', packages = ['bfps'], install_requires = ['numpy>=1.8', 'h5py>=2.2.1'], - cmdclass={'build' : CustomBuild}, + cmdclass={'compile_library' : CompileLibCommand}, package_data = {'bfps': header_list + 
['libbfps.a', 'install_info.pickle']}, entry_points = { diff --git a/tests/test_field_class.py b/tests/test_field_class.py index fc52f419a5ab2dd7a5231676c41b9d586d497080..110d9be685ef42d4ed231a3a3c723ac34e3d916d 100644 --- a/tests/test_field_class.py +++ b/tests/test_field_class.py @@ -32,32 +32,37 @@ class TestField(_fluid_particle_base): self.fluid_includes += '#include "fftw_tools.hpp"\n' self.fluid_includes += '#include "field.hpp"\n' self.fluid_variables += ('field<' + self.C_dtype + ', FFTW, ONE> *f;\n' + + 'field<' + self.C_dtype + ', FFTW, THREE> *v;\n' + 'kspace<FFTW, SMOOTH> *kk;\n') self.fluid_start += """ //begincpp f = new field<{0}, FFTW, ONE>( nx, ny, nz, MPI_COMM_WORLD); + v = new field<{0}, FFTW, THREE>( + nx, ny, nz, MPI_COMM_WORLD); kk = new kspace<FFTW, SMOOTH>( f->clayout, 1., 1., 1.); // read rdata - f->io("field.h5", "rdata", 0, true); + f->real_space_representation = true; + f->io("field.h5", "scal", 0, true); // go to fourier space, write into cdata_tmp f->dft(); - f->io("field.h5", "cdata_tmp", 0, false); + f->io("field.h5", "scal_tmp", 0, false); f->ift(); - f->io("field.h5", "rdata", 0, false); - f->io("field.h5", "cdata", 0, true); + f->io("field.h5", "scal", 0, false); + f->real_space_representation = false; + f->io("field.h5", "scal", 0, true); hid_t gg; if (f->myrank == 0) gg = H5Fopen("field.h5", H5F_ACC_RDWR, H5P_DEFAULT); kk->cospectrum<float, ONE>( - f->get_rdata(), - f->get_rdata(), + f->get_cdata(), + f->get_cdata(), gg, "scal", 0); f->ift(); - f->io("field.h5", "rdata_tmp", 0, false); + f->io("field.h5", "scal_tmp", 0, false); std::vector<double> me; me.resize(1); me[0] = 30; @@ -66,11 +71,15 @@ class TestField(_fluid_particle_base): 0, me); if (f->myrank == 0) H5Fclose(gg); + v->real_space_representation = false; + v->io("field.h5", "vec", 0, true); + v->io("field.h5", "vec_tmp", 0, false); //endcpp """.format(self.C_dtype) self.fluid_end += """ //begincpp delete f; + delete v; //endcpp """ return None @@ -92,7 +101,7 @@ class TestField(_fluid_particle_base): return None def main(): - n = 128 + n = 32 kdata = pyfftw.n_byte_align_empty( (n, n, n//2 + 1), pyfftw.simd_alignment, @@ -116,10 +125,10 @@ def main(): tf.parameters['ny'] = n tf.parameters['nz'] = n f = h5py.File('field.h5', 'w') - f['cdata'] = cdata.reshape((1,) + cdata.shape) - f['cdata_tmp'] = np.zeros(shape=(1,) + cdata.shape).astype(cdata.dtype) - f['rdata'] = rdata.reshape((1,) + rdata.shape) - f['rdata_tmp'] = np.zeros(shape=(1,) + rdata.shape).astype(rdata.dtype) + f['scal/complex/0'] = cdata + f['scal/real/0'] = rdata + f['vec/complex/0'] = np.array([cdata, cdata, cdata]).reshape(cdata.shape + (3,)) + f['vec/real/0'] = np.array([rdata, rdata, rdata]).reshape(rdata.shape + (3,)) f['moments/scal'] = np.zeros(shape = (1, 10)).astype(np.float) f['histograms/scal'] = np.zeros(shape = (1, 64)).astype(np.float) kspace = tf.get_kspace() @@ -133,35 +142,60 @@ def main(): '--ncpu', '2']) f = h5py.File('field.h5', 'r') - err0 = np.max(np.abs(f['rdata_tmp'][0] - rdata)) / np.mean(np.abs(rdata)) - err1 = np.max(np.abs(f['rdata'][0]/(n**3) - rdata)) / np.mean(np.abs(rdata)) - err2 = np.max(np.abs(f['cdata_tmp'][0]/(n**3) - cdata)) / np.mean(np.abs(cdata)) - print(err0, err1, err2) - assert(err0 < 1e-5) - assert(err1 < 1e-5) - assert(err2 < 1e-4) - ### compare - #fig = plt.figure(figsize=(12, 6)) - #a = fig.add_subplot(121) - #a.set_axis_off() - #a.imshow(rdata[0, :, :], interpolation = 'none') - #a = fig.add_subplot(122) - #a.set_axis_off() - #a.imshow(f['rdata_tmp'][0, 0, :, :], interpolation = 
'none') + #err0 = np.max(np.abs(f['scal_tmp/real/0'].value - rdata)) / np.mean(np.abs(rdata)) + #err1 = np.max(np.abs(f['scal/real/0'].value/(n**3) - rdata)) / np.mean(np.abs(rdata)) + #err2 = np.max(np.abs(f['scal_tmp/complex/0'].value/(n**3) - cdata)) / np.mean(np.abs(cdata)) + #print(err0, err1, err2) + #assert(err0 < 1e-5) + #assert(err1 < 1e-5) + #assert(err2 < 1e-4) + ## compare + fig = plt.figure(figsize=(18, 6)) + a = fig.add_subplot(131) + a.set_axis_off() + v0 = f['vec/complex/0'][:, :, 0, 0] + v1 = f['vec_tmp/complex/0'][:, :, 0, 0] + a.imshow(np.log(np.abs(v0 - v1)), + interpolation = 'none') + a = fig.add_subplot(132) + a.set_axis_off() + a.imshow(np.log(np.abs(v0)), + interpolation = 'none') + a = fig.add_subplot(133) + a.set_axis_off() + a.imshow(np.log(np.abs(v1)), + interpolation = 'none') + fig.tight_layout() + fig.savefig('tst_fields.pdf') + fig = plt.figure(figsize=(18, 6)) + a = fig.add_subplot(131) + a.set_axis_off() + v0 = f['scal/complex/0'][:, :, 0] + v1 = f['scal_tmp/complex/0'][:, :, 0] + a.imshow(np.log(np.abs(v0 - v1)), + interpolation = 'none') + a = fig.add_subplot(132) + a.set_axis_off() + a.imshow(np.log(np.abs(v0)), + interpolation = 'none') + a = fig.add_subplot(133) + a.set_axis_off() + a.imshow(np.log(np.abs(v1)), + interpolation = 'none') + fig.tight_layout() + fig.savefig('tst_sfields.pdf') + # look at moments and histogram + #print('moments are ', f['moments/scal'][0]) + #fig = plt.figure(figsize=(6,6)) + #a = fig.add_subplot(211) + #a.plot(f['histograms/scal'][0]) + #a.set_yscale('log') + #a = fig.add_subplot(212) + #a.plot(f['spectra/scal'][0]) + #a.set_xscale('log') + #a.set_yscale('log') #fig.tight_layout() #fig.savefig('tst.pdf') - # look at moments and histogram - print('moments are ', f['moments/scal'][0]) - fig = plt.figure(figsize=(6,6)) - a = fig.add_subplot(211) - a.plot(f['histograms/scal'][0]) - a.set_yscale('log') - a = fig.add_subplot(212) - a.plot(f['spectra/scal'][0]) - a.set_xscale('log') - a.set_yscale('log') - fig.tight_layout() - fig.savefig('tst.pdf') return None if __name__ == '__main__': diff --git a/tests/test_io.py b/tests/test_io.py index ce825c808785266c5199149aac5a4ab481ffedc2..624d357b0950eb8c3ae18c1f4a9ae7f47f45b0f8 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -54,6 +54,6 @@ if __name__ == '__main__': c = test_io(work_dir = opt.work_dir + '/io') c.write_src() c.write_par() - c.set_host_info({'type' : 'pc'}) - c.run(ncpu = opt.ncpu) + c.set_host_info(bfps.host_info) + c.run(opt.ncpu, 1) diff --git a/tests/test_io_00.py b/tests/test_io_00.py new file mode 100644 index 0000000000000000000000000000000000000000..f558cb8c6fc87be0518a7f63b4fadb0f06acd293 --- /dev/null +++ b/tests/test_io_00.py @@ -0,0 +1,37 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. # +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. 
If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +from test_io import * + +if __name__ == '__main__': + opt = parser.parse_args( + ['-n', '32', + '--ncpu', '2'] + + sys.argv[1:]) + print('about to create test_io object') + c = test_io(work_dir = opt.work_dir + '/io') + print('congratulations, test_io object was created') + diff --git a/tests/test_io_01_write.py b/tests/test_io_01_write.py new file mode 100644 index 0000000000000000000000000000000000000000..d3876da168d55cc3c44b86f08fde653b61aa4301 --- /dev/null +++ b/tests/test_io_01_write.py @@ -0,0 +1,37 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. # +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +from test_io import * + +if __name__ == '__main__': + opt = parser.parse_args( + ['-n', '32', + '--ncpu', '2'] + + sys.argv[1:]) + c = test_io(work_dir = opt.work_dir + '/io') + c.write_src() + c.write_par() + diff --git a/tests/test_io_02_compile.py b/tests/test_io_02_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..5db5cba3520a5c9b28015d5099e4afb7ecd9ebf3 --- /dev/null +++ b/tests/test_io_02_compile.py @@ -0,0 +1,39 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. # +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. 
If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +from test_io import * + +if __name__ == '__main__': + opt = parser.parse_args( + ['-n', '32', + '--ncpu', '2'] + + sys.argv[1:]) + c = test_io(work_dir = opt.work_dir + '/io') + c.write_src() + c.write_par() + c.set_host_info(bfps.host_info) + c.compile_code() + diff --git a/tests/test_io_03_run.py b/tests/test_io_03_run.py new file mode 100644 index 0000000000000000000000000000000000000000..a789ac66fd99d8e5525ce69b1e861f609d969212 --- /dev/null +++ b/tests/test_io_03_run.py @@ -0,0 +1,39 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. # +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +from test_io import * + +if __name__ == '__main__': + opt = parser.parse_args( + ['-n', '32', + '--ncpu', '2'] + + sys.argv[1:]) + c = test_io(work_dir = opt.work_dir + '/io') + c.write_src() + c.write_par() + c.set_host_info(bfps.host_info) + c.run() + diff --git a/tests/test_vorticity_equation.py b/tests/test_vorticity_equation.py new file mode 100644 index 0000000000000000000000000000000000000000..ec50531df29e82c1ff767ab3d292bef0aac66c4c --- /dev/null +++ b/tests/test_vorticity_equation.py @@ -0,0 +1,101 @@ +####################################################################### +# # +# Copyright 2015 Max Planck Institute # +# for Dynamics and Self-Organization # +# # +# This file is part of bfps. # +# # +# bfps is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published # +# by the Free Software Foundation, either version 3 of the License, # +# or (at your option) any later version. # +# # +# bfps is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with bfps. 
If not, see <http://www.gnu.org/licenses/> # +# # +# Contact: Cristian.Lalescu@ds.mpg.de # +# # +####################################################################### + + + +import sys +import os +import numpy as np +import h5py +import argparse + +import bfps +import bfps.tools + +from bfps_addons import NSReader +import matplotlib.pyplot as plt + +def main(): + c = bfps.NavierStokes() + c.launch( + ['-n', '72', + '--simname', 'fluid_solver', + '--ncpu', '4', + '--niter_todo', '256', + '--niter_out', '256', + '--niter_stat', '1', + '--wd', './'] + + sys.argv[1:]) + data = c.read_cfield(iteration = 0) + f = h5py.File('vorticity_equation_cvorticity_i00000.h5', 'w') + f['vorticity/complex/0'] = data + f.close() + c = bfps.NSVorticityEquation() + c.launch( + ['-n', '72', + '--simname', 'vorticity_equation', + '--ncpu', '4', + '--niter_todo', '256', + '--niter_out', '256', + '--niter_stat', '1', + '--wd', './'] + + sys.argv[1:]) + c0 = NSReader(simname = 'fluid_solver') + c1 = NSReader(simname = 'vorticity_equation') + df0 = c0.get_data_file() + df1 = c1.get_data_file() + f = plt.figure(figsize=(6,10)) + a = f.add_subplot(211) + a.plot(df0['statistics/moments/vorticity'][:, 2, 3], + color = 'blue', + marker = '.') + a.plot(df1['statistics/moments/vorticity'][:, 2, 3], + color = 'red', + marker = '.') + a = f.add_subplot(212) + a.plot(df0['statistics/moments/velocity'][:, 2, 3], + color = 'blue', + marker = '.') + a.plot(df1['statistics/moments/velocity'][:, 2, 3], + color = 'red', + marker = '.') + f.tight_layout() + f.savefig('figs/moments.pdf') + f = plt.figure(figsize = (6, 10)) + a = f.add_subplot(111) + a.plot(c0.statistics['enstrophy(t, k)'][0]) + a.plot(c1.statistics['enstrophy(t, k)'][0]) + a.set_yscale('log') + f.tight_layout() + f.savefig('figs/spectra.pdf') + f = h5py.File('vorticity_equation_cvorticity_i00000.h5', 'r') + #print(c0.statistics['enstrophy(t, k)'][0]) + #print(c1.statistics['enstrophy(t, k)'][0]) + c0.do_plots() + c1.do_plots() + return None + +if __name__ == '__main__': + main() + diff --git a/todo.txt b/todo.txt deleted file mode 100644 index 0b5cafdefaf49739269dd49c19b14ffcd680b86f..0000000000000000000000000000000000000000 --- a/todo.txt +++ /dev/null @@ -1,17 +0,0 @@ -(B) compute z polynomials only when needed @optimization -(B) use argparse subcommands instead of required argument @design -(B) read https://www.xsede.org/documents/271087/369161/ExtScale-Koziol.pdf @optimization @HDF5 +I/O -(B) set up mechanism for adding in new PDEs @design +v2.0 +alternate_algorithms -(B) use less memory @optimization -(B) move stat I/O to cpp lib @design @HDF5 -(C) test involving hydrodynamic similarity @tests -(C) tests should use launch instead of get_parser @design @tests -(D) executable should be compiled in a tmp folder -(D) generalize interpolation comparison test @tests -(D) generate separate lib(s) with extra classes @tests +alternate_algorithms -(D) test anisotropic grids @tests -(D) test non-cubic domains @tests -(D) tests should not overwrite other tests (tox_full) @tests -(E) add u-equation algorithm for testing purposes @tests +alternate_algorithms -(E) pure python DNS addon: pros and cons @tests +alternate_algorithms -(F) add switch to turn off simulation
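A closing usage sketch: reading back the velocity history that ``NavierStokesDB`` (defined above) accumulates. This assumes a completed run with ``simname = 'test'`` in the current folder; the file name and dataset layout follow from the class definition, but the simname is illustrative:

.. code:: python

    import h5py

    # the database file written by NavierStokesDB for simname 'test'
    with h5py.File('test_field_database.h5', 'r') as f:
        rvel = f['rvelocity']   # shape (nsnapshots, nz, ny, nx, 3)
        print(rvel.shape)
        snapshot0 = rvel[0]     # real-space velocity at the first saved iteration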