diff --git a/bfps/cpp/particles/.tocompile b/bfps/cpp/particles/.tocompile new file mode 100644 index 0000000000000000000000000000000000000000..02874ed792f4eedb859e1b779facd3d2c775ec08 --- /dev/null +++ b/bfps/cpp/particles/.tocompile @@ -0,0 +1,2 @@ +mpicxx -g main_tocompile.cpp -o /tmp/main_test_part.exe -I/home/bbramas/Projects/bfps/bfps/cpp/ -I/home/bbramas/Downloads/hdf5install/include -L/home/bbramas/Downloads/hdf5install/lib -lhdf5 -lsz -lz +mpicxx -fPIC -rdynamic -g NSVE-v2.0.1-single.cpp -o /tmp/NSVE-v2.0.1-single.exe -I/home/bbramas/Projects/bfps/bfps/cpp/ -I/home/bbramas/Downloads/hdf5install/include -I/home/bbramas/Downloads/fftw-3.3.4/install/include/ -L/home/bbramas/Downloads/hdf5install/lib -lhdf5 -lsz -lz -L/home/bbramas/.local/lib/python2.7/site-packages/bfps-2.0.1.post31+g12693ea-py2.7.egg/bfps/ -lbfps -fopenmp -lgomp -L/home/bbramas/Downloads/fftw-3.3.4/install/lib/ -lfftw3_mpi -lfftw3f_mpi -lfftw3_omp -lfftw3f_omp -lfftw3 -lfftw3f diff --git a/bfps/cpp/particles/p2p_computer.hpp b/bfps/cpp/particles/p2p_computer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..efe0e5e321f190cf26a66d68c35e828de6ffddb1 --- /dev/null +++ b/bfps/cpp/particles/p2p_computer.hpp @@ -0,0 +1,32 @@ +#ifndef P2P_COMPUTER_HPP +#define P2P_COMPUTER_HPP + +#include <cstring> + +template <class real_number, class partsize_t> +class p2p_computer{ +public: + template <int size_particle_rhs> + void init_result_array(real_number rhs[], const partsize_t nbParticles) const{ + memset(rhs, 0, sizeof(real_number)*nbParticles); + } + + template <int size_particle_rhs> + void reduce_particles_rhs(real_number rhs_dst[], const real_number rhs_src[], const partsize_t nbParticles) const{ + for(int idx_part = 0 ; idx_part < nbParticles ; ++idx_part){ + for(int idx_rhs = 0 ; idx_rhs < size_particle_rhs ; ++idx_rhs){ + rhs_dst[idx_part*size_particle_rhs+idx_rhs] += rhs_src[idx_part*size_particle_rhs+idx_rhs]; + } + } + } + + template <int size_particle_positions, int size_particle_rhs> + void compute_interaction(const real_number pos_part1[], real_number rhs_part1[], + const real_number pos_part2[], real_number rhs_part2[], + const real_number dist_pow2) const{ + rhs_part1[0] += 1; + rhs_part2[0] += 1; + } +}; + +#endif diff --git a/bfps/cpp/particles/p2p_distr_mpi.hpp b/bfps/cpp/particles/p2p_distr_mpi.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7f610be18c749077ef73730ebec365b21e212569 --- /dev/null +++ b/bfps/cpp/particles/p2p_distr_mpi.hpp @@ -0,0 +1,688 @@ +#ifndef P2P_DISTR_MPI_HPP +#define P2P_DISTR_MPI_HPP + +#include <mpi.h> + +#include <vector> +#include <memory> +#include <cassert> + +#include <type_traits> +#include <omp.h> +#include <algorithm> + +#include "scope_timer.hpp" +#include "particles_utils.hpp" +#include "p2p_tree.hpp" + +/* +- method to reorder each particle section following the cutoff grid (will permite index too) +- exchange particles (with upper only) and receive particle (from lower only) +- 1 message at a time! so need the offset of each cell of the cutoff grid +- iterate on what has been received with my own particles, fill both rhs +- send back the rhs +- merge rhs +- update particles property + */ + +template <class partsize_t, class real_number> +class p2p_distr_mpi { +protected: + static const int MaxNbRhs = 100; + + enum MpiTag{ + TAG_NB_PARTICLES, + TAG_POSITION_PARTICLES, + TAG_RESULT_PARTICLES, + }; + + struct NeighborDescriptor{ + partsize_t nbParticlesToExchange; + int destProc; + int nbLevelsToExchange; + bool isRecv; + + std::unique_ptr<real_number[]> toRecvAndMerge; + std::unique_ptr<real_number[]> toCompute; + std::unique_ptr<real_number[]> results; + }; + + enum Action{ + NOTHING_TODO, + RECV_PARTICLES, + COMPUTE_PARTICLES, + RELEASE_BUFFER_PARTICLES, + MERGE_PARTICLES, + + RECV_MOVE_NB_LOW, + RECV_MOVE_NB_UP, + RECV_MOVE_LOW, + RECV_MOVE_UP + }; + + MPI_Comm current_com; + + int my_rank; + int nb_processes; + int nb_processes_involved; + + const std::pair<int,int> current_partition_interval; + const int current_partition_size; + const std::array<size_t,3> field_grid_dim; + + std::unique_ptr<int[]> partition_interval_size_per_proc; + std::unique_ptr<int[]> partition_interval_offset_per_proc; + + std::unique_ptr<partsize_t[]> current_offset_particles_for_partition; + + std::vector<std::pair<Action,int>> whatNext; + std::vector<MPI_Request> mpiRequests; + std::vector<NeighborDescriptor> neigDescriptors; + + std::array<real_number,3> spatial_box_width; + std::array<real_number,3> spatial_box_offset; + + const real_number cutoff_radius; + std::array<long int,3> nb_cell_levels; + +public: + //////////////////////////////////////////////////////////////////////////// + + p2p_distr_mpi(MPI_Comm in_current_com, + const std::pair<int,int>& in_current_partitions, + const std::array<size_t,3>& in_field_grid_dim, + const std::array<real_number,3>& in_spatial_box_width, + const std::array<real_number,3>& in_spatial_box_offset, + const real_number in_cutoff_radius) + : current_com(in_current_com), + my_rank(-1), nb_processes(-1),nb_processes_involved(-1), + current_partition_interval(in_current_partitions), + current_partition_size(current_partition_interval.second-current_partition_interval.first), + field_grid_dim(in_field_grid_dim), + spatial_box_width(in_spatial_box_width), spatial_box_offset(in_spatial_box_offset), + cutoff_radius(in_cutoff_radius){ + + AssertMpi(MPI_Comm_rank(current_com, &my_rank)); + AssertMpi(MPI_Comm_size(current_com, &nb_processes)); + + partition_interval_size_per_proc.reset(new int[nb_processes]); + AssertMpi( MPI_Allgather( const_cast<int*>(¤t_partition_size), 1, MPI_INT, + partition_interval_size_per_proc.get(), 1, MPI_INT, + current_com) ); + assert(partition_interval_size_per_proc[my_rank] == current_partition_size); + + partition_interval_offset_per_proc.reset(new int[nb_processes+1]); + partition_interval_offset_per_proc[0] = 0; + for(int idxProc = 0 ; idxProc < nb_processes ; ++idxProc){ + partition_interval_offset_per_proc[idxProc+1] = partition_interval_offset_per_proc[idxProc] + partition_interval_size_per_proc[idxProc]; + } + + current_offset_particles_for_partition.reset(new partsize_t[current_partition_size+1]); + + nb_processes_involved = nb_processes; + while(nb_processes_involved != 0 && partition_interval_size_per_proc[nb_processes_involved-1] == 0){ + nb_processes_involved -= 1; + } + assert(nb_processes_involved != 0); + for(int idx_proc_involved = 0 ; idx_proc_involved < nb_processes_involved ; ++idx_proc_involved){ + assert(partition_interval_size_per_proc[idx_proc_involved] != 0); + } + + assert(int(field_grid_dim[IDX_Z]) == partition_interval_offset_per_proc[nb_processes_involved]); + + nb_cell_levels[IDX_X] = spatial_box_width[IDX_X]/cutoff_radius; + nb_cell_levels[IDX_Y] = spatial_box_width[IDX_Y]/cutoff_radius; + nb_cell_levels[IDX_Z] = spatial_box_width[IDX_Z]/cutoff_radius; + } + + virtual ~p2p_distr_mpi(){} + + //////////////////////////////////////////////////////////////////////////// + + long int get_cell_coord_x_from_index(const long int index) const{ + return index % nb_cell_levels[IDX_X]; + } + + long int get_cell_coord_y_from_index(const long int index) const{ + return (index - get_cell_coord_z_from_index(index)*(nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y])) + / nb_cell_levels[IDX_X]; + } + + long int get_cell_coord_z_from_index(const long int index) const{ + return index / (nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]); + } + + long int first_cell_level_proc(const int dest_proc) const{ + const real_number field_section_width_z = spatial_box_width[IDX_Z]/real_number(field_grid_dim[IDX_Z]); + return static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc]))/cutoff_radius); + } + + long int last_cell_level_proc(const int dest_proc) const{ + const real_number field_section_width_z = spatial_box_width[IDX_Z]/real_number(field_grid_dim[IDX_Z]); + return static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc+1]) + - std::numeric_limits<real_number>::epsilon())/cutoff_radius); + } + + std::array<long int,3> get_cell_coordinate(const real_number pos_x, const real_number pos_y, + const real_number pos_z) const { + const real_number diff_x = pos_x - spatial_box_offset[IDX_X]; + const real_number diff_y = pos_y - spatial_box_offset[IDX_Y]; + const real_number diff_z = pos_z - spatial_box_offset[IDX_Z]; + std::array<long int,3> coord; + coord[IDX_X] = static_cast<long int>(diff_x/cutoff_radius); + coord[IDX_Y] = static_cast<long int>(diff_y/cutoff_radius); + coord[IDX_Z] = static_cast<long int>(diff_z/cutoff_radius); + return coord; + } + + long int get_cell_idx(const real_number pos_x, const real_number pos_y, + const real_number pos_z) const { + std::array<long int,3> coord = get_cell_coordinate(pos_x, pos_y, pos_z); + return ((coord[IDX_Z]*nb_cell_levels[IDX_Y])+coord[IDX_Y])*nb_cell_levels[IDX_X]+coord[IDX_X]; + } + + real_number compute_distance_r2(const real_number x1, const real_number y1, const real_number z1, + const real_number x2, const real_number y2, const real_number z2) const { + return ((x1-x2)*(x1-x2)) + ((y1-y2)*(y1-y2)) + ((z1-z2)*(z1-z2)); + } + + + template <int size_particle_positions, int size_particle_rhs> + struct ParticleView{ + partsize_t p_index; + real_number* ptr_particles_positions; + real_number* ptr_particles_current_rhs; + partsize_t* ptr_global_idx; + long int* ptr_cell_idx; + + void swap(ParticleView& p1, ParticleView& p2){ + for(int idx_pos = 0 ; idx_pos < size_particle_positions ; ++idx_pos){ + std::swap(p1.ptr_particles_positions[p1.p_index*size_particle_positions+idx_pos], + p2.ptr_particles_positions[p2.p_index*size_particle_positions+idx_pos]); + } + for(int idx_rhs = 0 ; idx_rhs < size_particle_rhs ; ++idx_rhs){ + std::swap(p1.ptr_particles_current_rhs[p1.p_index*size_particle_rhs+idx_rhs], + p2.ptr_particles_current_rhs[p2.p_index*size_particle_rhs+idx_rhs]); + } + std::swap(p1.ptr_cell_idx[p1.p_index],p2.ptr_cell_idx[p2.p_index]); + std::swap(p1.ptr_global_idx[p1.p_index],p2.ptr_global_idx[p2.p_index]); + std::swap(p1.p_index,p2.p_index); + } + }; + + template <class computer_class, int size_particle_positions, int size_particle_rhs> + void compute_distr(computer_class& in_computer, + const partsize_t current_my_nb_particles_per_partition[], + real_number particles_positions[], + real_number particles_current_rhs[], + partsize_t inout_index_particles[]){ + TIMEZONE("compute_distr"); + + // Some processes might not be involved + if(nb_processes_involved <= my_rank){ + return; + } + + const long int my_top_z_cell_level = last_cell_level_proc(my_rank); + const long int my_down_z_cell_level = first_cell_level_proc(my_rank); + const long int my_nb_cell_levels = 1+my_top_z_cell_level-my_down_z_cell_level; + + current_offset_particles_for_partition[0] = 0; + partsize_t myTotalNbParticles = 0; + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition]; + current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition]; + } + + // Compute box idx for each particle + std::unique_ptr<long int[]> particles_coord(new long int[current_offset_particles_for_partition[current_partition_size]]); + + { + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + #pragma omp parallel for schedule(static) + for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){ + particles_coord[idxPart] = get_cell_idx(particles_positions[(idxPart)*size_particle_positions + IDX_X], + particles_positions[(idxPart)*size_particle_positions + IDX_Y], + particles_positions[(idxPart)*size_particle_positions + IDX_Z]); + assert(my_down_z_cell_level <= get_cell_coord_z_from_index(particles_coord[idxPart])); + assert(get_cell_coord_z_from_index(particles_coord[idxPart]) <= my_top_z_cell_level); + } + } + + std::vector<ParticleView<size_particle_positions,size_particle_rhs>> part_to_sort; + + // Sort each partition in cells + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + part_to_sort.clear(); + + for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){ + part_to_sort.emplace_back(); + part_to_sort.back().p_index = idxPart; + part_to_sort.back().ptr_particles_positions = particles_positions; + part_to_sort.back().ptr_particles_current_rhs = particles_current_rhs; + part_to_sort.back().ptr_global_idx = inout_index_particles; + part_to_sort.back().ptr_cell_idx = particles_coord.get(); + } + + assert(part_to_sort.size() == (current_offset_particles_for_partition[idxPartition+1]-current_offset_particles_for_partition[idxPartition])); + + std::sort(part_to_sort.begin(), part_to_sort.end(), + [](const ParticleView<size_particle_positions,size_particle_rhs>& p1, + const ParticleView<size_particle_positions,size_particle_rhs>& p2){ + return p1.ptr_cell_idx[p1.p_index] < p2.ptr_cell_idx[p2.p_index]; + }); + } + } + + // Build the tree + p2p_tree<std::vector<std::pair<partsize_t,partsize_t>>> my_tree(nb_cell_levels); + + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + long int current_cell_idx = -1; + partsize_t current_nb_particles_in_cell = 0; + partsize_t current_cell_offset = 0; + + for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ; + idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){ + if(particles_coord[idx_part] != current_cell_idx){ + if(current_nb_particles_in_cell){ + my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell); + } + current_cell_idx = particles_coord[idx_part]; + current_nb_particles_in_cell = 1; + current_cell_offset = idx_part; + } + } + if(current_nb_particles_in_cell){ + my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell); + + } + } + + printf("[%d] go from cutoff level %ld to %ld\n", + my_rank, my_down_z_cell_level, my_top_z_cell_level); // TODO remove + fflush(stdout); // TODO + + // Offset per cell layers + std::unique_ptr<partsize_t[]> particles_offset_layers(new partsize_t[my_nb_cell_levels+1]()); + for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){ + for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ; + idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){ + assert(my_down_z_cell_level <= get_cell_coord_z_from_index(particles_coord[idx_part])); + assert(get_cell_coord_z_from_index(particles_coord[idx_part]) <= my_top_z_cell_level); + particles_offset_layers[get_cell_coord_z_from_index(particles_coord[idx_part])+1-my_down_z_cell_level] += 1; + } + } + for(size_t idx_layer = 0 ; idx_layer < my_nb_cell_levels ; ++idx_layer){ + printf("[%d] nb particles in cutoff level %llu are %ld\n", + my_rank, idx_layer, particles_offset_layers[idx_layer+1]); // TODO remove + fflush(stdout); // TODO + particles_offset_layers[idx_layer+1] += particles_offset_layers[idx_layer]; + } + + // Reset vectors + assert(whatNext.size() == 0); + assert(mpiRequests.size() == 0); + neigDescriptors.clear(); + + // Find process with at least one neighbor + { + std::cout << my_rank << " my_top_z_cell_level " << my_top_z_cell_level << std::endl; + std::cout << my_rank << " my_down_z_cell_level " << my_down_z_cell_level << std::endl; + std::cout.flush();// TODO + + int dest_proc = (my_rank+1)%nb_processes_involved; + while(dest_proc != my_rank + && (my_top_z_cell_level == first_cell_level_proc(dest_proc) + || (my_top_z_cell_level+1)%nb_cell_levels[IDX_Z] == first_cell_level_proc(dest_proc))){ + // Find if we have to send 1 or 2 cell levels + int nb_levels_to_send = 1; + if(my_nb_cell_levels > 1 // I have more than one level + && (my_top_z_cell_level-1+2)%nb_cell_levels[IDX_Z] <= last_cell_level_proc(dest_proc)){ + nb_levels_to_send += 1; + } + + std::cout << my_rank << " dest_proc " << dest_proc << std::endl; + std::cout << my_rank << " first_cell_level_proc(dest_proc) " << first_cell_level_proc(dest_proc) << std::endl; + std::cout << my_rank << " last_cell_level_proc(dest_proc) " << last_cell_level_proc(dest_proc) << std::endl; + std::cout.flush();// TODO + + NeighborDescriptor descriptor; + descriptor.destProc = dest_proc; + descriptor.nbLevelsToExchange = nb_levels_to_send; + descriptor.nbParticlesToExchange = particles_offset_layers[my_nb_cell_levels] - particles_offset_layers[my_nb_cell_levels-nb_levels_to_send]; + descriptor.isRecv = false; + + std::cout << my_rank << "SEND" << std::endl; + std::cout << "descriptor.destProc " << descriptor.destProc << std::endl; + std::cout << "descriptor.nbLevelsToExchange " << descriptor.nbLevelsToExchange << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.isRecv " << descriptor.isRecv << std::endl; + std::cout << "neigDescriptors.size() " << neigDescriptors.size() << std::endl; + std::cout.flush();// TODO + + neigDescriptors.emplace_back(std::move(descriptor)); + + dest_proc = (dest_proc+1)%nb_processes_involved; + } + std::cout << my_rank << " NO dest_proc " << dest_proc << std::endl; + std::cout << my_rank << " NO first_cell_level_proc(dest_proc) " << first_cell_level_proc(dest_proc) << std::endl; + std::cout.flush();// TODO + + int src_proc = (my_rank-1+nb_processes_involved)%nb_processes_involved; + while(src_proc != my_rank + && (last_cell_level_proc(src_proc) == my_down_z_cell_level + || (last_cell_level_proc(src_proc)+1)%nb_cell_levels[IDX_Z] == my_down_z_cell_level)){ + // Find if we have to send 1 or 2 cell levels + int nb_levels_to_recv = 1; + if(my_nb_cell_levels > 1 // I have more than one level + && first_cell_level_proc(src_proc) <= (my_down_z_cell_level-1+2)%nb_cell_levels[IDX_Z]){ + nb_levels_to_recv += 1; + } + + std::cout << my_rank << " src_proc " << src_proc << std::endl; + std::cout << my_rank << " first_cell_level_proc(src_proc) " << first_cell_level_proc(src_proc) << std::endl; + std::cout.flush();// TODO + + NeighborDescriptor descriptor; + descriptor.destProc = src_proc; + descriptor.nbLevelsToExchange = nb_levels_to_recv; + descriptor.nbParticlesToExchange = -1; + descriptor.isRecv = true; + + neigDescriptors.emplace_back(std::move(descriptor)); + + std::cout << my_rank << "] RECV" << std::endl; + std::cout << "descriptor.destProc " << descriptor.destProc << std::endl; + std::cout << "descriptor.nbLevelsToExchange " << descriptor.nbLevelsToExchange << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.isRecv " << descriptor.isRecv << std::endl; + std::cout << "neigDescriptors.size() " << neigDescriptors.size() << std::endl; + std::cout.flush();// TODO + + src_proc = (src_proc-1+nb_processes_involved)%nb_processes_involved; + } + std::cout << my_rank << " NO src_proc " << src_proc << std::endl; + std::cout << my_rank << " NO first_cell_level_proc(src_proc) " << first_cell_level_proc(src_proc) << std::endl; + std::cout.flush();// TODO + } + + ////////////////////////////////////////////////////////////////////// + /// Exchange the number of particles in each partition + /// Could involve only here but I do not think it will be a problem + ////////////////////////////////////////////////////////////////////// + + assert(whatNext.size() == 0); + assert(mpiRequests.size() == 0); + + + for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){ + NeighborDescriptor& descriptor = neigDescriptors[idxDescr]; + + if(descriptor.isRecv == false){ + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Isend(const_cast<partsize_t*>(&descriptor.nbParticlesToExchange), + 1, particles_utils::GetMpiType(partsize_t()), + descriptor.destProc, TAG_NB_PARTICLES, + current_com, &mpiRequests.back())); + + if(descriptor.nbParticlesToExchange){ + std::cout << my_rank << "] SEND_PARTICLES" << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.destProc " << descriptor.destProc << std::endl; + std::cout << "idxDescr " << idxDescr << std::endl; + + whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1}); + mpiRequests.emplace_back(); + assert(descriptor.nbParticlesToExchange*size_particle_positions < std::numeric_limits<int>::max()); + AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[particles_offset_layers[my_nb_cell_levels-descriptor.nbLevelsToExchange]]), + int(descriptor.nbParticlesToExchange*size_particle_positions), particles_utils::GetMpiType(real_number()), + descriptor.destProc, TAG_POSITION_PARTICLES, + current_com, &mpiRequests.back())); + + assert(descriptor.toRecvAndMerge == nullptr); + descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToExchange*size_particle_rhs]); + whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, int(neigDescriptors.size())}); + mpiRequests.emplace_back(); + assert(descriptor.nbParticlesToExchange*size_particle_rhs < std::numeric_limits<int>::max()); + AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), int(descriptor.nbParticlesToExchange*size_particle_rhs), + particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_RESULT_PARTICLES, + current_com, &mpiRequests.back())); + } + } + else{ + std::cout << "RECV_PARTICLES " << RECV_PARTICLES << std::endl; + std::cout << "idxDescr " << idxDescr << std::endl; + whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr}); + mpiRequests.emplace_back(); + AssertMpi(MPI_Irecv(&descriptor.nbParticlesToExchange, + 1, particles_utils::GetMpiType(partsize_t()), descriptor.destProc, TAG_NB_PARTICLES, + current_com, &mpiRequests.back())); + } + } + + const bool more_than_one_thread = (omp_get_max_threads() > 1); + + TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads()) + #pragma omp parallel default(shared) + { + #pragma omp master + { + while(mpiRequests.size()){ + assert(mpiRequests.size() == whatNext.size()); + + int idxDone = int(mpiRequests.size()); + { + TIMEZONE("wait"); + AssertMpi(MPI_Waitany(int(mpiRequests.size()), mpiRequests.data(), &idxDone, MPI_STATUSES_IGNORE)); + } + const std::pair<Action, int> releasedAction = whatNext[idxDone]; + std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]); + std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]); + mpiRequests.pop_back(); + whatNext.pop_back(); + + ////////////////////////////////////////////////////////////////////// + /// Data to exchange particles + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == RECV_PARTICLES){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + + assert(descriptor.isRecv == true); + const int destProc = descriptor.destProc; + const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange; + assert(NbParticlesToReceive != -1); + assert(descriptor.toCompute == nullptr); + + std::cout << my_rank << "] RECV_PARTICLES" << std::endl; + std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl; + std::cout << "descriptor.destProc " << descriptor.destProc << std::endl; + std::cout << "releasedAction.second " << releasedAction.second << std::endl; + + if(NbParticlesToReceive){ + std::cout << "MPI_Irecv " << std::endl; + descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]); + whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second}); + mpiRequests.emplace_back(); + assert(NbParticlesToReceive*size_particle_positions < std::numeric_limits<int>::max()); + AssertMpi(MPI_Irecv(descriptor.toCompute.get(), int(NbParticlesToReceive*size_particle_positions), + particles_utils::GetMpiType(real_number()), destProc, TAG_POSITION_PARTICLES, + current_com, &mpiRequests.back())); + } + } + + ////////////////////////////////////////////////////////////////////// + /// Computation + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == COMPUTE_PARTICLES){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange; + + assert(descriptor.toCompute != nullptr); + descriptor.results.reset(new real_number[NbParticlesToReceive*size_particle_rhs]); + // TODO in_computer.template init_result_array<size_particle_rhs>(descriptor.results.get(), NbParticlesToReceive); + + // Compute + partsize_t idxPart = 0; + while(idxPart != NbParticlesToReceive){ + const long int current_cell_idx = get_cell_idx(descriptor.toCompute[idxPart*size_particle_positions + IDX_X], + descriptor.toCompute[idxPart*size_particle_positions + IDX_Y], + descriptor.toCompute[idxPart*size_particle_positions + IDX_Z]); + partsize_t nb_parts_in_cell = 0; + while(idxPart+nb_parts_in_cell != NbParticlesToReceive + && current_cell_idx == get_cell_idx(descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_X], + descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_Y], + descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_Z])){ + nb_parts_in_cell += 1; + } + + const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27]; + const int nbNeighbors = my_tree.getNeighbors(current_cell_idx, neighbors); + + // with other interval + for(size_t idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){ + for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){ + for(partsize_t idx_p1 = 0 ; idx_p1 < nb_parts_in_cell ; ++idx_p1){ + for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){ + const real_number dist_r2 = compute_distance_r2(descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_X], + descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_Y], + descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_Z], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_X], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Y], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]); + if(dist_r2 < cutoff_radius*cutoff_radius){ + // TODO in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>( +// &descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions], +// &descriptor.results[(idxPart+idx_p1)*size_particle_rhs], +// &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions], +// &particles_current_rhs[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs], +// dist_r2); + } + } + } + } + } + + idxPart += nb_parts_in_cell; + } + + // Send back + const int destProc = descriptor.destProc; + whatNext.emplace_back(std::pair<Action,int>{RELEASE_BUFFER_PARTICLES, releasedAction.second}); + mpiRequests.emplace_back(); + assert(NbParticlesToReceive*size_particle_rhs < std::numeric_limits<int>::max()); + AssertMpi(MPI_Isend(descriptor.results.get(), int(NbParticlesToReceive*size_particle_rhs), + particles_utils::GetMpiType(real_number()), destProc, TAG_RESULT_PARTICLES, + current_com, &mpiRequests.back())); + } + ////////////////////////////////////////////////////////////////////// + /// Computation + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == RELEASE_BUFFER_PARTICLES){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + assert(descriptor.toCompute != nullptr); + descriptor.toCompute.release(); + } + ////////////////////////////////////////////////////////////////////// + /// Merge + ////////////////////////////////////////////////////////////////////// + if(releasedAction.first == MERGE_PARTICLES && more_than_one_thread == false){ + NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second]; + assert(descriptor.isRecv); + assert(descriptor.toRecvAndMerge != nullptr); + // TODO in_computer.template reduce_particles_rhs<size_particle_rhs>(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToExchange); + descriptor.toRecvAndMerge.release(); + } + } + } + } + + assert(whatNext.size() == 0); + assert(mpiRequests.size() == 0); + + // Compute self data + for(const auto& iter_cell : my_tree){ + const std::vector<std::pair<partsize_t,partsize_t>>& intervals = iter_cell.second; + + for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){ + // self interval + for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){ + for(partsize_t idx_p2 = idx_p1+1 ; idx_p2 < intervals[idx_1].second ; ++idx_p2){ + const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z], + particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_X], + particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_Y], + particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_Z]); + if(dist_r2 < cutoff_radius*cutoff_radius){ + in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>( + &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions], + &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs], + &particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions], + &particles_current_rhs[(intervals[idx_1].first+idx_p2)*size_particle_rhs], + dist_r2); + } + } + } + + // with other interval + for(size_t idx_2 = idx_1+1 ; idx_2 < intervals.size() ; ++idx_2){ + for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){ + for(partsize_t idx_p2 = 0 ; idx_p2 < intervals[idx_2].second ; ++idx_p2){ + const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z], + particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_X], + particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_Y], + particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]); + if(dist_r2 < cutoff_radius*cutoff_radius){ + in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>( + &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions], + &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs], + &particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions], + &particles_current_rhs[(intervals[idx_2].first+idx_p2)*size_particle_rhs], + dist_r2); + } + } + } + } + } + + + const long int currenct_cell_idx = iter_cell.first; + const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27]; + const int nbNeighbors = my_tree.getNeighbors(currenct_cell_idx, neighbors); + + for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){ + // with other interval + for(size_t idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){ + for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){ + for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){ + for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){ + const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y], + particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_X], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Y], + particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]); + if(dist_r2 < cutoff_radius*cutoff_radius){ + in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>( + &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions], + &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs], + &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions], + &particles_current_rhs[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs], + dist_r2); + } + } + } + } + } + } + } + } +}; + +#endif diff --git a/bfps/cpp/particles/p2p_tree.hpp b/bfps/cpp/particles/p2p_tree.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b11cd826f74e59b8014d4563098069873cf7ded8 --- /dev/null +++ b/bfps/cpp/particles/p2p_tree.hpp @@ -0,0 +1,109 @@ +#ifndef P2P_TREE_HPP +#define P2P_TREE_HPP + +#include <unordered_map> +#include <vector> + +template <class CellClass> +class p2p_tree{ + std::unordered_map<long int, CellClass> data; + CellClass emptyCell; + std::array<long int,3> nb_cell_levels; + + long int get_cell_coord_x_from_index(const long int index) const{ + return index % nb_cell_levels[IDX_X]; + } + + long int get_cell_coord_y_from_index(const long int index) const{ + return (index - get_cell_coord_z_from_index(index)*(nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y])) + / nb_cell_levels[IDX_X]; + } + + long int get_cell_coord_z_from_index(const long int index) const{ + return index / (nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]); + } + + long int get_cell_idx(const long int idx_x, const long int idx_y, + const long int idx_z) const { + return (((idx_z*nb_cell_levels[IDX_Y])+idx_y)*nb_cell_levels[IDX_X])+idx_x; + } + +public: + explicit p2p_tree(std::array<long int,3> in_nb_cell_levels) + : nb_cell_levels(in_nb_cell_levels){ + } + + CellClass& getCell(const long int idx){ + return data[idx]; + } + + + const CellClass& getCell(const long int idx) const { + const auto& iter = data.find(idx); + if(iter != data.end()){ + return iter->second; + } + return emptyCell; + } + + int getNeighbors(const long int idx, const CellClass* output[27]) const{ + int nbNeighbors = 0; + + const long int idx_x = get_cell_coord_x_from_index(idx); + const long int idx_y = get_cell_coord_y_from_index(idx); + const long int idx_z = get_cell_coord_z_from_index(idx); + + for(long int neigh_x = - 1 ; neigh_x <= 1 ; ++neigh_x){ + long int neigh_x_pbc = neigh_x+idx_x; + if(neigh_x_pbc < 0){ + neigh_x_pbc += nb_cell_levels[IDX_X]; + } + else if(nb_cell_levels[IDX_X] <= neigh_x_pbc){ + neigh_x_pbc -= nb_cell_levels[IDX_X]; + } + + for(long int neigh_y = - 1 ; neigh_y <= 1 ; ++neigh_y){ + long int neigh_y_pbc = neigh_y+idx_y; + if(neigh_y_pbc < 0){ + neigh_y_pbc += nb_cell_levels[IDX_Y]; + } + else if(nb_cell_levels[IDX_Y] <= neigh_y_pbc){ + neigh_y_pbc -= nb_cell_levels[IDX_Y]; + } + + for(long int neigh_z = - 1 ; neigh_z <= 1 ; ++neigh_z){ + long int neigh_z_pbc = neigh_z+idx_z; + if(neigh_z_pbc < 0){ + neigh_z_pbc += nb_cell_levels[IDX_Z]; + } + else if(nb_cell_levels[IDX_Z] <= neigh_z_pbc){ + neigh_z_pbc -= nb_cell_levels[IDX_Z]; + } + + // Not the current cell + if(neigh_x_pbc != idx_x || neigh_y_pbc != idx_y || neigh_z_pbc != idx_z){ + const long int idx_neigh = get_cell_idx(neigh_x_pbc, + neigh_y_pbc, + neigh_z_pbc); + const auto& iter = data.find(idx_neigh); + if(iter != data.end()){ + output[nbNeighbors] = &(iter->second); + } + } + } + } + } + + return nbNeighbors; + } + + typename std::unordered_map<long int, CellClass>::iterator begin(){ + return data.begin(); + } + + typename std::unordered_map<long int, CellClass>::iterator end(){ + return data.end(); + } +}; + +#endif