Add first version of particle particle interactions

eb1b7d36 · Berenger Bramas · 0ce7d47f · eb1b7d36 · eb1b7d36 · eb1b7d36
Commit eb1b7d36 authored 7 years ago by Berenger Bramas
--- a/bfps/cpp/particles/.tocompile
+++ b/bfps/cpp/particles/.tocompile
+mpicxx -g main_tocompile.cpp -o /tmp/main_test_part.exe -I/home/bbramas/Projects/bfps/bfps/cpp/ -I/home/bbramas/Downloads/hdf5install/include -L/home/bbramas/Downloads/hdf5install/lib -lhdf5 -lsz -lz
+mpicxx -fPIC -rdynamic -g NSVE-v2.0.1-single.cpp -o /tmp/NSVE-v2.0.1-single.exe -I/home/bbramas/Projects/bfps/bfps/cpp/ -I/home/bbramas/Downloads/hdf5install/include -I/home/bbramas/Downloads/fftw-3.3.4/install/include/ -L/home/bbramas/Downloads/hdf5install/lib -lhdf5 -lsz -lz -L/home/bbramas/.local/lib/python2.7/site-packages/bfps-2.0.1.post31+g12693ea-py2.7.egg/bfps/ -lbfps -fopenmp -lgomp -L/home/bbramas/Downloads/fftw-3.3.4/install/lib/ -lfftw3_mpi -lfftw3f_mpi -lfftw3_omp -lfftw3f_omp -lfftw3 -lfftw3f
--- a/bfps/cpp/particles/p2p_computer.hpp
+++ b/bfps/cpp/particles/p2p_computer.hpp
+#ifndef P2P_COMPUTER_HPP
+#define P2P_COMPUTER_HPP
+
+#include <cstring>
+
+template <class real_number, class partsize_t>
+class p2p_computer{
+public:
+    template <int size_particle_rhs>
+    void init_result_array(real_number rhs[], const partsize_t nbParticles) const{
+        memset(rhs, 0, sizeof(real_number)*nbParticles);
+    }
+
+    template <int size_particle_rhs>
+    void reduce_particles_rhs(real_number rhs_dst[], const real_number rhs_src[], const partsize_t nbParticles) const{
+        for(int idx_part = 0 ; idx_part < nbParticles ; ++idx_part){
+            for(int idx_rhs = 0 ; idx_rhs < size_particle_rhs ; ++idx_rhs){
+                rhs_dst[idx_part*size_particle_rhs+idx_rhs] += rhs_src[idx_part*size_particle_rhs+idx_rhs];
+            }
+        }
+    }
+
+    template <int size_particle_positions, int size_particle_rhs>
+    void compute_interaction(const real_number pos_part1[], real_number rhs_part1[],
+                             const real_number pos_part2[], real_number rhs_part2[],
+                             const real_number dist_pow2) const{
+        rhs_part1[0] += 1;
+        rhs_part2[0] += 1;
+    }
+};
+
+#endif
--- a/bfps/cpp/particles/p2p_distr_mpi.hpp
+++ b/bfps/cpp/particles/p2p_distr_mpi.hpp
+#ifndef P2P_DISTR_MPI_HPP
+#define P2P_DISTR_MPI_HPP
+
+#include <mpi.h>
+
+#include <vector>
+#include <memory>
+#include <cassert>
+
+#include <type_traits>
+#include <omp.h>
+#include <algorithm>
+
+#include "scope_timer.hpp"
+#include "particles_utils.hpp"
+#include "p2p_tree.hpp"
+
+/*
+- method to reorder each particle section following the cutoff grid (will permite index too)
+- exchange particles (with upper only) and receive particle (from lower only)
+- 1 message at a time! so need the offset of each cell of the cutoff grid
+- iterate on what has been received with my own particles, fill both rhs
+- send back the rhs
+- merge rhs
+- update particles property
+  */
+
+template <class partsize_t, class real_number>
+class p2p_distr_mpi {
+protected:
+    static const int MaxNbRhs = 100;
+
+    enum MpiTag{
+        TAG_NB_PARTICLES,
+        TAG_POSITION_PARTICLES,
+        TAG_RESULT_PARTICLES,
+    };
+
+    struct NeighborDescriptor{
+        partsize_t nbParticlesToExchange;
+        int destProc;
+        int nbLevelsToExchange;
+        bool isRecv;
+
+        std::unique_ptr<real_number[]> toRecvAndMerge;
+        std::unique_ptr<real_number[]> toCompute;
+        std::unique_ptr<real_number[]> results;
+    };
+
+    enum Action{
+        NOTHING_TODO,
+        RECV_PARTICLES,
+        COMPUTE_PARTICLES,
+        RELEASE_BUFFER_PARTICLES,
+        MERGE_PARTICLES,
+
+        RECV_MOVE_NB_LOW,
+        RECV_MOVE_NB_UP,
+        RECV_MOVE_LOW,
+        RECV_MOVE_UP
+    };
+
+    MPI_Comm current_com;
+
+    int my_rank;
+    int nb_processes;
+    int nb_processes_involved;
+
+    const std::pair<int,int> current_partition_interval;
+    const int current_partition_size;
+    const std::array<size_t,3> field_grid_dim;
+
+    std::unique_ptr<int[]> partition_interval_size_per_proc;
+    std::unique_ptr<int[]> partition_interval_offset_per_proc;
+
+    std::unique_ptr<partsize_t[]> current_offset_particles_for_partition;
+
+    std::vector<std::pair<Action,int>> whatNext;
+    std::vector<MPI_Request> mpiRequests;
+    std::vector<NeighborDescriptor> neigDescriptors;
+
+    std::array<real_number,3> spatial_box_width;
+    std::array<real_number,3> spatial_box_offset;
+
+    const real_number cutoff_radius;
+    std::array<long int,3> nb_cell_levels;
+
+public:
+    ////////////////////////////////////////////////////////////////////////////
+
+    p2p_distr_mpi(MPI_Comm in_current_com,
+                     const std::pair<int,int>& in_current_partitions,
+                     const std::array<size_t,3>& in_field_grid_dim,
+                     const std::array<real_number,3>& in_spatial_box_width,
+                     const std::array<real_number,3>& in_spatial_box_offset,
+                     const real_number in_cutoff_radius)
+        : current_com(in_current_com),
+            my_rank(-1), nb_processes(-1),nb_processes_involved(-1),
+            current_partition_interval(in_current_partitions),
+            current_partition_size(current_partition_interval.second-current_partition_interval.first),
+            field_grid_dim(in_field_grid_dim),
+            spatial_box_width(in_spatial_box_width), spatial_box_offset(in_spatial_box_offset),
+            cutoff_radius(in_cutoff_radius){
+
+        AssertMpi(MPI_Comm_rank(current_com, &my_rank));
+        AssertMpi(MPI_Comm_size(current_com, &nb_processes));
+
+        partition_interval_size_per_proc.reset(new int[nb_processes]);
+        AssertMpi( MPI_Allgather( const_cast<int*>(&current_partition_size), 1, MPI_INT,
+                                  partition_interval_size_per_proc.get(), 1, MPI_INT,
+                                  current_com) );
+        assert(partition_interval_size_per_proc[my_rank] == current_partition_size);
+
+        partition_interval_offset_per_proc.reset(new int[nb_processes+1]);
+        partition_interval_offset_per_proc[0] = 0;
+        for(int idxProc = 0 ; idxProc < nb_processes ; ++idxProc){
+            partition_interval_offset_per_proc[idxProc+1] = partition_interval_offset_per_proc[idxProc] + partition_interval_size_per_proc[idxProc];
+        }
+
+        current_offset_particles_for_partition.reset(new partsize_t[current_partition_size+1]);
+
+        nb_processes_involved = nb_processes;
+        while(nb_processes_involved != 0 && partition_interval_size_per_proc[nb_processes_involved-1] == 0){
+            nb_processes_involved -= 1;
+        }
+        assert(nb_processes_involved != 0);
+        for(int idx_proc_involved = 0 ; idx_proc_involved < nb_processes_involved ; ++idx_proc_involved){
+            assert(partition_interval_size_per_proc[idx_proc_involved] != 0);
+        }
+
+        assert(int(field_grid_dim[IDX_Z]) == partition_interval_offset_per_proc[nb_processes_involved]);
+
+        nb_cell_levels[IDX_X] = spatial_box_width[IDX_X]/cutoff_radius;
+        nb_cell_levels[IDX_Y] = spatial_box_width[IDX_Y]/cutoff_radius;
+        nb_cell_levels[IDX_Z] = spatial_box_width[IDX_Z]/cutoff_radius;
+    }
+
+    virtual ~p2p_distr_mpi(){}
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    long int get_cell_coord_x_from_index(const long int index) const{
+        return index % nb_cell_levels[IDX_X];
+    }
+
+    long int get_cell_coord_y_from_index(const long int index) const{
+        return (index - get_cell_coord_z_from_index(index)*(nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]))
+                / nb_cell_levels[IDX_X];
+    }
+
+    long int get_cell_coord_z_from_index(const long int index) const{
+        return index / (nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]);
+    }
+
+    long int first_cell_level_proc(const int dest_proc) const{
+        const real_number field_section_width_z = spatial_box_width[IDX_Z]/real_number(field_grid_dim[IDX_Z]);
+        return static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc]))/cutoff_radius);
+    }
+
+    long int last_cell_level_proc(const int dest_proc) const{
+        const real_number field_section_width_z = spatial_box_width[IDX_Z]/real_number(field_grid_dim[IDX_Z]);
+        return static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc+1])
+                                     - std::numeric_limits<real_number>::epsilon())/cutoff_radius);
+    }
+
+    std::array<long int,3> get_cell_coordinate(const real_number pos_x, const real_number pos_y,
+                                               const real_number pos_z) const {
+        const real_number diff_x = pos_x - spatial_box_offset[IDX_X];
+        const real_number diff_y = pos_y - spatial_box_offset[IDX_Y];
+        const real_number diff_z = pos_z - spatial_box_offset[IDX_Z];
+        std::array<long int,3> coord;
+        coord[IDX_X] = static_cast<long int>(diff_x/cutoff_radius);
+        coord[IDX_Y] = static_cast<long int>(diff_y/cutoff_radius);
+        coord[IDX_Z] = static_cast<long int>(diff_z/cutoff_radius);
+        return coord;
+    }
+
+    long int get_cell_idx(const real_number pos_x, const real_number pos_y,
+                          const real_number pos_z) const {
+        std::array<long int,3> coord = get_cell_coordinate(pos_x, pos_y, pos_z);
+        return ((coord[IDX_Z]*nb_cell_levels[IDX_Y])+coord[IDX_Y])*nb_cell_levels[IDX_X]+coord[IDX_X];
+    }
+
+    real_number compute_distance_r2(const real_number x1, const real_number y1, const real_number z1,
+                                    const real_number x2, const real_number y2, const real_number z2) const {
+        return ((x1-x2)*(x1-x2)) + ((y1-y2)*(y1-y2)) + ((z1-z2)*(z1-z2));
+    }
+
+
+    template <int size_particle_positions, int size_particle_rhs>
+    struct ParticleView{
+        partsize_t p_index;
+        real_number* ptr_particles_positions;
+        real_number* ptr_particles_current_rhs;
+        partsize_t* ptr_global_idx;
+        long int* ptr_cell_idx;
+
+        void swap(ParticleView& p1, ParticleView& p2){
+            for(int idx_pos = 0 ; idx_pos < size_particle_positions ; ++idx_pos){
+                std::swap(p1.ptr_particles_positions[p1.p_index*size_particle_positions+idx_pos],
+                          p2.ptr_particles_positions[p2.p_index*size_particle_positions+idx_pos]);
+            }
+            for(int idx_rhs = 0 ; idx_rhs < size_particle_rhs ; ++idx_rhs){
+                std::swap(p1.ptr_particles_current_rhs[p1.p_index*size_particle_rhs+idx_rhs],
+                          p2.ptr_particles_current_rhs[p2.p_index*size_particle_rhs+idx_rhs]);
+            }
+            std::swap(p1.ptr_cell_idx[p1.p_index],p2.ptr_cell_idx[p2.p_index]);
+            std::swap(p1.ptr_global_idx[p1.p_index],p2.ptr_global_idx[p2.p_index]);
+            std::swap(p1.p_index,p2.p_index);
+        }
+    };
+
+    template <class computer_class, int size_particle_positions, int size_particle_rhs>
+    void compute_distr(computer_class& in_computer,
+                       const partsize_t current_my_nb_particles_per_partition[],
+                       real_number particles_positions[],
+                       real_number particles_current_rhs[],
+                       partsize_t inout_index_particles[]){
+        TIMEZONE("compute_distr");
+
+        // Some processes might not be involved
+        if(nb_processes_involved <= my_rank){
+            return;
+        }
+
+        const long int my_top_z_cell_level = last_cell_level_proc(my_rank);
+        const long int my_down_z_cell_level = first_cell_level_proc(my_rank);
+        const long int my_nb_cell_levels = 1+my_top_z_cell_level-my_down_z_cell_level;
+
+        current_offset_particles_for_partition[0] = 0;
+        partsize_t myTotalNbParticles = 0;
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition];
+            current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition];
+        }
+
+        // Compute box idx for each particle
+        std::unique_ptr<long int[]> particles_coord(new long int[current_offset_particles_for_partition[current_partition_size]]);
+
+        {
+            for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                #pragma omp parallel for schedule(static)
+                for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){
+                    particles_coord[idxPart] = get_cell_idx(particles_positions[(idxPart)*size_particle_positions + IDX_X],
+                                                                              particles_positions[(idxPart)*size_particle_positions + IDX_Y],
+                                                                              particles_positions[(idxPart)*size_particle_positions + IDX_Z]);
+                    assert(my_down_z_cell_level <= get_cell_coord_z_from_index(particles_coord[idxPart]));
+                    assert(get_cell_coord_z_from_index(particles_coord[idxPart]) <= my_top_z_cell_level);
+                }
+            }
+
+            std::vector<ParticleView<size_particle_positions,size_particle_rhs>> part_to_sort;
+
+            // Sort each partition in cells
+            for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                part_to_sort.clear();
+
+                for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){
+                    part_to_sort.emplace_back();
+                    part_to_sort.back().p_index = idxPart;
+                    part_to_sort.back().ptr_particles_positions = particles_positions;
+                    part_to_sort.back().ptr_particles_current_rhs = particles_current_rhs;
+                    part_to_sort.back().ptr_global_idx = inout_index_particles;
+                    part_to_sort.back().ptr_cell_idx = particles_coord.get();
+                }
+
+                assert(part_to_sort.size() == (current_offset_particles_for_partition[idxPartition+1]-current_offset_particles_for_partition[idxPartition]));
+
+                std::sort(part_to_sort.begin(), part_to_sort.end(),
+                          [](const ParticleView<size_particle_positions,size_particle_rhs>& p1,
+                             const ParticleView<size_particle_positions,size_particle_rhs>& p2){
+                    return p1.ptr_cell_idx[p1.p_index] < p2.ptr_cell_idx[p2.p_index];
+                });
+            }
+        }
+
+        // Build the tree
+        p2p_tree<std::vector<std::pair<partsize_t,partsize_t>>> my_tree(nb_cell_levels);
+
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            long int current_cell_idx = -1;
+            partsize_t current_nb_particles_in_cell = 0;
+            partsize_t current_cell_offset = 0;
+
+            for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ;
+                            idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){
+                if(particles_coord[idx_part] != current_cell_idx){
+                    if(current_nb_particles_in_cell){
+                        my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell);
+                    }
+                    current_cell_idx = particles_coord[idx_part];
+                    current_nb_particles_in_cell = 1;
+                    current_cell_offset = idx_part;
+                }
+            }
+            if(current_nb_particles_in_cell){
+                my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell);
+
+            }
+        }
+
+        printf("[%d] go from cutoff level %ld to %ld\n",
+               my_rank, my_down_z_cell_level, my_top_z_cell_level); // TODO remove
+        fflush(stdout); // TODO
+
+        // Offset per cell layers
+        std::unique_ptr<partsize_t[]> particles_offset_layers(new partsize_t[my_nb_cell_levels+1]());
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ;
+                            idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){
+                assert(my_down_z_cell_level <= get_cell_coord_z_from_index(particles_coord[idx_part]));
+                assert(get_cell_coord_z_from_index(particles_coord[idx_part]) <= my_top_z_cell_level);
+                particles_offset_layers[get_cell_coord_z_from_index(particles_coord[idx_part])+1-my_down_z_cell_level] += 1;
+            }
+        }
+        for(size_t idx_layer = 0 ; idx_layer < my_nb_cell_levels ; ++idx_layer){
+            printf("[%d] nb particles in cutoff level %llu are %ld\n",
+                   my_rank, idx_layer, particles_offset_layers[idx_layer+1]); // TODO remove
+            fflush(stdout); // TODO
+            particles_offset_layers[idx_layer+1] += particles_offset_layers[idx_layer];
+        }
+
+        // Reset vectors
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+        neigDescriptors.clear();
+
+        // Find process with at least one neighbor
+        {
+            std::cout << my_rank << " my_top_z_cell_level " << my_top_z_cell_level << std::endl;
+            std::cout << my_rank << " my_down_z_cell_level " << my_down_z_cell_level << std::endl;
+            std::cout.flush();// TODO
+
+            int dest_proc = (my_rank+1)%nb_processes_involved;
+            while(dest_proc != my_rank
+                  && (my_top_z_cell_level == first_cell_level_proc(dest_proc)
+                      || (my_top_z_cell_level+1)%nb_cell_levels[IDX_Z] == first_cell_level_proc(dest_proc))){
+                // Find if we have to send 1 or 2 cell levels
+                int nb_levels_to_send = 1;
+                if(my_nb_cell_levels > 1 // I have more than one level
+                        && (my_top_z_cell_level-1+2)%nb_cell_levels[IDX_Z] <= last_cell_level_proc(dest_proc)){
+                    nb_levels_to_send += 1;
+                }
+
+                std::cout << my_rank << " dest_proc " << dest_proc << std::endl;
+                std::cout << my_rank << " first_cell_level_proc(dest_proc) " << first_cell_level_proc(dest_proc) << std::endl;
+                std::cout << my_rank << " last_cell_level_proc(dest_proc) " << last_cell_level_proc(dest_proc) << std::endl;
+                std::cout.flush();// TODO
+
+                NeighborDescriptor descriptor;
+                descriptor.destProc = dest_proc;
+                descriptor.nbLevelsToExchange = nb_levels_to_send;
+                descriptor.nbParticlesToExchange = particles_offset_layers[my_nb_cell_levels] - particles_offset_layers[my_nb_cell_levels-nb_levels_to_send];
+                descriptor.isRecv = false;
+
+                std::cout << my_rank << "SEND" << std::endl;
+                std::cout << "descriptor.destProc " << descriptor.destProc << std::endl;
+                std::cout << "descriptor.nbLevelsToExchange " << descriptor.nbLevelsToExchange << std::endl;
+                std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                std::cout << "descriptor.isRecv " << descriptor.isRecv << std::endl;
+                std::cout << "neigDescriptors.size() " << neigDescriptors.size() << std::endl;
+                std::cout.flush();// TODO
+
+                neigDescriptors.emplace_back(std::move(descriptor));
+
+                dest_proc = (dest_proc+1)%nb_processes_involved;
+            }
+            std::cout << my_rank << " NO dest_proc " << dest_proc << std::endl;
+            std::cout << my_rank << " NO first_cell_level_proc(dest_proc) " << first_cell_level_proc(dest_proc) << std::endl;
+            std::cout.flush();// TODO
+
+            int src_proc = (my_rank-1+nb_processes_involved)%nb_processes_involved;
+            while(src_proc != my_rank
+                  && (last_cell_level_proc(src_proc) == my_down_z_cell_level
+                      || (last_cell_level_proc(src_proc)+1)%nb_cell_levels[IDX_Z] == my_down_z_cell_level)){
+                // Find if we have to send 1 or 2 cell levels
+                int nb_levels_to_recv = 1;
+                if(my_nb_cell_levels > 1 // I have more than one level
+                        && first_cell_level_proc(src_proc) <= (my_down_z_cell_level-1+2)%nb_cell_levels[IDX_Z]){
+                    nb_levels_to_recv += 1;
+                }
+
+                std::cout << my_rank << " src_proc " << src_proc << std::endl;
+                std::cout << my_rank << " first_cell_level_proc(src_proc) " << first_cell_level_proc(src_proc) << std::endl;
+                std::cout.flush();// TODO
+
+                NeighborDescriptor descriptor;
+                descriptor.destProc = src_proc;
+                descriptor.nbLevelsToExchange = nb_levels_to_recv;
+                descriptor.nbParticlesToExchange = -1;
+                descriptor.isRecv = true;
+
+                neigDescriptors.emplace_back(std::move(descriptor));
+
+                std::cout << my_rank << "] RECV" << std::endl;
+                std::cout << "descriptor.destProc " << descriptor.destProc << std::endl;
+                std::cout << "descriptor.nbLevelsToExchange " << descriptor.nbLevelsToExchange << std::endl;
+                std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                std::cout << "descriptor.isRecv " << descriptor.isRecv << std::endl;
+                std::cout << "neigDescriptors.size() " << neigDescriptors.size() << std::endl;
+                std::cout.flush();// TODO
+
+                src_proc = (src_proc-1+nb_processes_involved)%nb_processes_involved;
+            }
+            std::cout << my_rank << " NO src_proc " << src_proc << std::endl;
+            std::cout << my_rank << " NO first_cell_level_proc(src_proc) " << first_cell_level_proc(src_proc) << std::endl;
+            std::cout.flush();// TODO
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        /// Exchange the number of particles in each partition
+        /// Could involve only here but I do not think it will be a problem
+        //////////////////////////////////////////////////////////////////////
+
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+
+
+        for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){
+            NeighborDescriptor& descriptor = neigDescriptors[idxDescr];
+
+            if(descriptor.isRecv == false){
+                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Isend(const_cast<partsize_t*>(&descriptor.nbParticlesToExchange),
+                                    1, particles_utils::GetMpiType(partsize_t()),
+                                    descriptor.destProc, TAG_NB_PARTICLES,
+                                    current_com, &mpiRequests.back()));
+
+                if(descriptor.nbParticlesToExchange){
+                    std::cout << my_rank << "] SEND_PARTICLES" << std::endl;
+                    std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                    std::cout << "descriptor.destProc " << descriptor.destProc << std::endl;
+                    std::cout << "idxDescr " << idxDescr << std::endl;
+
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    assert(descriptor.nbParticlesToExchange*size_particle_positions < std::numeric_limits<int>::max());
+                    AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[particles_offset_layers[my_nb_cell_levels-descriptor.nbLevelsToExchange]]),
+                              int(descriptor.nbParticlesToExchange*size_particle_positions), particles_utils::GetMpiType(real_number()),
+                              descriptor.destProc, TAG_POSITION_PARTICLES,
+                              current_com, &mpiRequests.back()));
+
+                    assert(descriptor.toRecvAndMerge == nullptr);
+                    descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToExchange*size_particle_rhs]);
+                    whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, int(neigDescriptors.size())});
+                    mpiRequests.emplace_back();
+                    assert(descriptor.nbParticlesToExchange*size_particle_rhs < std::numeric_limits<int>::max());
+                    AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), int(descriptor.nbParticlesToExchange*size_particle_rhs),
+                                        particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_RESULT_PARTICLES,
+                                        current_com, &mpiRequests.back()));
+                }
+            }
+            else{
+                std::cout << "RECV_PARTICLES " << RECV_PARTICLES << std::endl;
+                std::cout << "idxDescr " << idxDescr << std::endl;
+                whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Irecv(&descriptor.nbParticlesToExchange,
+                      1, particles_utils::GetMpiType(partsize_t()), descriptor.destProc, TAG_NB_PARTICLES,
+                      current_com, &mpiRequests.back()));
+            }
+        }
+
+        const bool more_than_one_thread = (omp_get_max_threads() > 1);
+
+        TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads())
+        #pragma omp parallel default(shared)
+        {
+            #pragma omp master
+            {
+                while(mpiRequests.size()){
+                    assert(mpiRequests.size() == whatNext.size());
+
+                    int idxDone = int(mpiRequests.size());
+                    {
+                        TIMEZONE("wait");
+                        AssertMpi(MPI_Waitany(int(mpiRequests.size()), mpiRequests.data(), &idxDone, MPI_STATUSES_IGNORE));
+                    }
+                    const std::pair<Action, int> releasedAction = whatNext[idxDone];
+                    std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]);
+                    std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]);
+                    mpiRequests.pop_back();
+                    whatNext.pop_back();
+
+                    //////////////////////////////////////////////////////////////////////
+                    /// Data to exchange particles
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == RECV_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+
+                        assert(descriptor.isRecv == true);
+                        const int destProc = descriptor.destProc;
+                        const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange;
+                        assert(NbParticlesToReceive != -1);
+                        assert(descriptor.toCompute == nullptr);
+
+                        std::cout << my_rank << "] RECV_PARTICLES" << std::endl;
+                        std::cout << "descriptor.nbParticlesToExchange " << descriptor.nbParticlesToExchange << std::endl;
+                        std::cout << "descriptor.destProc " << descriptor.destProc << std::endl;
+                        std::cout << "releasedAction.second " << releasedAction.second << std::endl;
+
+                        if(NbParticlesToReceive){
+                            std::cout << "MPI_Irecv " << std::endl;
+                            descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]);
+                            whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second});
+                            mpiRequests.emplace_back();
+                            assert(NbParticlesToReceive*size_particle_positions < std::numeric_limits<int>::max());
+                            AssertMpi(MPI_Irecv(descriptor.toCompute.get(), int(NbParticlesToReceive*size_particle_positions),
+                                                particles_utils::GetMpiType(real_number()), destProc, TAG_POSITION_PARTICLES,
+                                                current_com, &mpiRequests.back()));
+                        }
+                    }
+
+                    //////////////////////////////////////////////////////////////////////
+                    /// Computation
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == COMPUTE_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                        const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange;
+
+                        assert(descriptor.toCompute != nullptr);
+                        descriptor.results.reset(new real_number[NbParticlesToReceive*size_particle_rhs]);
+                        // TODO in_computer.template init_result_array<size_particle_rhs>(descriptor.results.get(), NbParticlesToReceive);
+
+                        // Compute
+                        partsize_t idxPart = 0;
+                        while(idxPart != NbParticlesToReceive){
+                            const long int current_cell_idx = get_cell_idx(descriptor.toCompute[idxPart*size_particle_positions + IDX_X],
+                                                                           descriptor.toCompute[idxPart*size_particle_positions + IDX_Y],
+                                                                           descriptor.toCompute[idxPart*size_particle_positions + IDX_Z]);
+                            partsize_t nb_parts_in_cell = 0;
+                            while(idxPart+nb_parts_in_cell != NbParticlesToReceive
+                                  && current_cell_idx == get_cell_idx(descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_X],
+                                                                     descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_Y],
+                                                                     descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDX_Z])){
+                                nb_parts_in_cell += 1;
+                            }
+
+                            const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27];
+                            const int nbNeighbors = my_tree.getNeighbors(current_cell_idx, neighbors);
+
+                            // with other interval
+                            for(size_t idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){
+                                for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){
+                                    for(partsize_t idx_p1 = 0 ; idx_p1 < nb_parts_in_cell ; ++idx_p1){
+                                        for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){
+                                            const real_number dist_r2 = compute_distance_r2(descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_X],
+                                                                                            descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_Y],
+                                                                                            descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDX_Z],
+                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_X],
+                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Y],
+                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]);
+                                            if(dist_r2 < cutoff_radius*cutoff_radius){
+                                                // TODO in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>(
+//                                                                    &descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions],
+//                                                                    &descriptor.results[(idxPart+idx_p1)*size_particle_rhs],
+//                                                                    &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions],
+//                                                                    &particles_current_rhs[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs],
+//                                                                    dist_r2);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            idxPart += nb_parts_in_cell;
+                        }
+
+                        // Send back
+                        const int destProc = descriptor.destProc;
+                        whatNext.emplace_back(std::pair<Action,int>{RELEASE_BUFFER_PARTICLES, releasedAction.second});
+                        mpiRequests.emplace_back();
+                        assert(NbParticlesToReceive*size_particle_rhs < std::numeric_limits<int>::max());
+                        AssertMpi(MPI_Isend(descriptor.results.get(), int(NbParticlesToReceive*size_particle_rhs),
+                                            particles_utils::GetMpiType(real_number()), destProc, TAG_RESULT_PARTICLES,
+                                            current_com, &mpiRequests.back()));
+                    }
+                    //////////////////////////////////////////////////////////////////////
+                    /// Computation
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == RELEASE_BUFFER_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                        assert(descriptor.toCompute != nullptr);
+                        descriptor.toCompute.release();
+                    }
+                    //////////////////////////////////////////////////////////////////////
+                    /// Merge
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == MERGE_PARTICLES && more_than_one_thread == false){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                        assert(descriptor.isRecv);
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        // TODO in_computer.template reduce_particles_rhs<size_particle_rhs>(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToExchange);
+                        descriptor.toRecvAndMerge.release();
+                    }
+                }
+            }
+        }
+
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+
+        // Compute self data
+        for(const auto& iter_cell : my_tree){
+            const std::vector<std::pair<partsize_t,partsize_t>>& intervals = iter_cell.second;
+
+            for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){
+                // self interval
+                for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
+                    for(partsize_t idx_p2 = idx_p1+1 ; idx_p2 < intervals[idx_1].second ; ++idx_p2){
+                        const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X],
+                                                                        particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y],
+                                                                        particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z],
+                                                                        particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_X],
+                                                                        particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_Y],
+                                                                        particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDX_Z]);
+                        if(dist_r2 < cutoff_radius*cutoff_radius){
+                            in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>(
+                                                &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
+                                                &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs],
+                                                &particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions],
+                                                &particles_current_rhs[(intervals[idx_1].first+idx_p2)*size_particle_rhs],
+                                                dist_r2);
+                        }
+                    }
+                }
+
+                // with other interval
+                for(size_t idx_2 = idx_1+1 ; idx_2 < intervals.size() ; ++idx_2){
+                    for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
+                        for(partsize_t idx_p2 = 0 ; idx_p2 < intervals[idx_2].second ; ++idx_p2){
+                            const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X],
+                                                                            particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y],
+                                                                            particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z],
+                                                                            particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_X],
+                                                                            particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_Y],
+                                                                            particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]);
+                            if(dist_r2 < cutoff_radius*cutoff_radius){
+                                in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>(
+                                                    &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
+                                                    &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs],
+                                                    &particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions],
+                                                    &particles_current_rhs[(intervals[idx_2].first+idx_p2)*size_particle_rhs],
+                                                    dist_r2);
+                            }
+                        }
+                    }
+                }
+            }
+
+
+            const long int currenct_cell_idx = iter_cell.first;
+            const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27];
+            const int nbNeighbors = my_tree.getNeighbors(currenct_cell_idx, neighbors);
+
+            for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){
+                // with other interval
+                for(size_t idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){
+                    for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){
+                        for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
+                            for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){
+                                const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_X],
+                                                                                particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Y],
+                                                                                particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDX_Z],
+                                                                                particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_X],
+                                                                                particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Y],
+                                                                                particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDX_Z]);
+                                if(dist_r2 < cutoff_radius*cutoff_radius){
+                                    in_computer.template compute_interaction<size_particle_positions,size_particle_rhs>(
+                                                        &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
+                                                        &particles_current_rhs[(intervals[idx_1].first+idx_p1)*size_particle_rhs],
+                                                        &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions],
+                                                        &particles_current_rhs[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs],
+                                                        dist_r2);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+#endif
--- a/bfps/cpp/particles/p2p_tree.hpp
+++ b/bfps/cpp/particles/p2p_tree.hpp
+#ifndef P2P_TREE_HPP
+#define P2P_TREE_HPP
+
+#include <unordered_map>
+#include <vector>
+
+template <class CellClass>
+class p2p_tree{
+    std::unordered_map<long int, CellClass> data;
+    CellClass emptyCell;
+    std::array<long int,3> nb_cell_levels;
+
+    long int get_cell_coord_x_from_index(const long int index) const{
+        return index % nb_cell_levels[IDX_X];
+    }
+
+    long int get_cell_coord_y_from_index(const long int index) const{
+        return (index - get_cell_coord_z_from_index(index)*(nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]))
+                / nb_cell_levels[IDX_X];
+    }
+
+    long int get_cell_coord_z_from_index(const long int index) const{
+        return index / (nb_cell_levels[IDX_X]*nb_cell_levels[IDX_Y]);
+    }
+
+    long int get_cell_idx(const long int idx_x, const long int idx_y,
+                          const long int idx_z) const {
+        return (((idx_z*nb_cell_levels[IDX_Y])+idx_y)*nb_cell_levels[IDX_X])+idx_x;
+    }
+
+public:
+    explicit p2p_tree(std::array<long int,3> in_nb_cell_levels)
+        : nb_cell_levels(in_nb_cell_levels){
+    }
+
+    CellClass& getCell(const long int idx){
+        return data[idx];
+    }
+
+
+    const CellClass& getCell(const long int idx) const {
+        const auto& iter = data.find(idx);
+        if(iter != data.end()){
+            return iter->second;
+        }
+        return emptyCell;
+    }
+
+    int getNeighbors(const long int idx, const CellClass* output[27]) const{
+        int nbNeighbors = 0;
+
+        const long int idx_x = get_cell_coord_x_from_index(idx);
+        const long int idx_y = get_cell_coord_y_from_index(idx);
+        const long int idx_z = get_cell_coord_z_from_index(idx);
+
+        for(long int neigh_x = - 1 ; neigh_x <= 1 ; ++neigh_x){
+            long int neigh_x_pbc = neigh_x+idx_x;
+            if(neigh_x_pbc < 0){
+                neigh_x_pbc += nb_cell_levels[IDX_X];
+            }
+            else if(nb_cell_levels[IDX_X] <= neigh_x_pbc){
+                neigh_x_pbc -= nb_cell_levels[IDX_X];
+            }
+
+            for(long int neigh_y = - 1 ; neigh_y <= 1 ; ++neigh_y){
+                long int neigh_y_pbc = neigh_y+idx_y;
+                if(neigh_y_pbc < 0){
+                    neigh_y_pbc += nb_cell_levels[IDX_Y];
+                }
+                else if(nb_cell_levels[IDX_Y] <= neigh_y_pbc){
+                    neigh_y_pbc -= nb_cell_levels[IDX_Y];
+                }
+
+                for(long int neigh_z = - 1 ; neigh_z <= 1 ; ++neigh_z){
+                    long int neigh_z_pbc = neigh_z+idx_z;
+                    if(neigh_z_pbc < 0){
+                        neigh_z_pbc += nb_cell_levels[IDX_Z];
+                    }
+                    else if(nb_cell_levels[IDX_Z] <= neigh_z_pbc){
+                        neigh_z_pbc -= nb_cell_levels[IDX_Z];
+                    }
+
+                    // Not the current cell
+                    if(neigh_x_pbc != idx_x || neigh_y_pbc != idx_y || neigh_z_pbc != idx_z){
+                        const long int idx_neigh = get_cell_idx(neigh_x_pbc,
+                                                                  neigh_y_pbc,
+                                                                  neigh_z_pbc);
+                        const auto& iter = data.find(idx_neigh);
+                        if(iter != data.end()){
+                            output[nbNeighbors] = &(iter->second);
+                        }
+                    }
+                }
+            }
+        }
+
+        return nbNeighbors;
+    }
+
+    typename std::unordered_map<long int, CellClass>::iterator begin(){
+        return data.begin();
+    }
+
+    typename std::unordered_map<long int, CellClass>::iterator end(){
+        return data.end();
+    }
+};
+
+#endif