p2p_distr_mpi.hpp 53.5 KB
Newer Older
1
2
3
4
/******************************************************************************
*                                                                             *
*  Copyright 2019 Max Planck Institute for Dynamics and Self-Organization     *
*                                                                             *
Cristian Lalescu's avatar
Cristian Lalescu committed
5
*  This file is part of TurTLE.                                               *
6
*                                                                             *
Cristian Lalescu's avatar
Cristian Lalescu committed
7
*  TurTLE is free software: you can redistribute it and/or modify             *
8
9
10
11
*  it under the terms of the GNU General Public License as published          *
*  by the Free Software Foundation, either version 3 of the License,          *
*  or (at your option) any later version.                                     *
*                                                                             *
Cristian Lalescu's avatar
Cristian Lalescu committed
12
*  TurTLE is distributed in the hope that it will be useful,                  *
13
14
15
16
17
*  but WITHOUT ANY WARRANTY; without even the implied warranty of             *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
*  GNU General Public License for more details.                               *
*                                                                             *
*  You should have received a copy of the GNU General Public License          *
Cristian Lalescu's avatar
Cristian Lalescu committed
18
*  along with TurTLE.  If not, see <http://www.gnu.org/licenses/>             *
19
20
21
22
23
24
25
*                                                                             *
* Contact: Cristian.Lalescu@ds.mpg.de                                         *
*                                                                             *
******************************************************************************/



26
27
28
29
30
31
32
33
34
35
36
37
38
39
#ifndef P2P_DISTR_MPI_HPP
#define P2P_DISTR_MPI_HPP

#include <mpi.h>

#include <vector>
#include <memory>
#include <cassert>

#include <type_traits>
#include <omp.h>
#include <algorithm>

#include "scope_timer.hpp"
40
41
42
#include "particles/particles_utils.hpp"
#include "particles/p2p/p2p_tree.hpp"
#include "particles/lock_free_bool_array.hpp"
43
44
45
46

template <class partsize_t, class real_number>
class p2p_distr_mpi {
protected:
Cristian Lalescu's avatar
Cristian Lalescu committed
47
    static const int MaxNbRhs = 10;
48
49
50
51

    enum MpiTag{
        TAG_NB_PARTICLES,
        TAG_POSITION_PARTICLES,
52
        TAG_INDEX_PARTICLES,
53
54
55
56
57
58
59
60
        TAG_RESULT_PARTICLES,
    };

    struct NeighborDescriptor{
        partsize_t nbParticlesToExchange;
        int destProc;
        int nbLevelsToExchange;
        bool isRecv;
61
        int nbReceived;
62
63
64
65

        std::unique_ptr<real_number[]> toRecvAndMerge;
        std::unique_ptr<real_number[]> toCompute;
        std::unique_ptr<real_number[]> results;
66
        std::unique_ptr<partsize_t[]> indexes;
67
68
69
    };

    enum Action{
70
        NOTHING_TODO = 512,
71
72
73
        RECV_PARTICLES,
        COMPUTE_PARTICLES,
        RELEASE_BUFFER_PARTICLES,
74
        MERGE_PARTICLES
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
    };

    MPI_Comm current_com;

    int my_rank;
    int nb_processes;
    int nb_processes_involved;

    const std::pair<int,int> current_partition_interval;
    const int current_partition_size;
    const std::array<size_t,3> field_grid_dim;

    std::unique_ptr<int[]> partition_interval_size_per_proc;
    std::unique_ptr<int[]> partition_interval_offset_per_proc;

    std::unique_ptr<partsize_t[]> current_offset_particles_for_partition;

    std::vector<std::pair<Action,int>> whatNext;
    std::vector<MPI_Request> mpiRequests;
    std::vector<NeighborDescriptor> neigDescriptors;

    std::array<real_number,3> spatial_box_width;
    std::array<real_number,3> spatial_box_offset;

99
    const real_number cutoff_radius_compute;
100
    const int nb_cells_factor;
101
102
103
    const real_number cutoff_radius;
    std::array<long int,3> nb_cell_levels;

104
    template <class DataType, int sizeElement>
Cristian Lalescu's avatar
Cristian Lalescu committed
105
106
107
108
109
    static void permute_copy(const partsize_t offsetIdx,
                             const partsize_t nbElements,
                             const std::pair<long int, partsize_t> permutation[],
                             DataType data[],
                             std::vector<unsigned char>* buffer){
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
        buffer->resize(nbElements*sizeof(DataType)*sizeElement);
        DataType* dataBuffer = reinterpret_cast<DataType*>(buffer->data());

        // Permute
        for(partsize_t idxPart = 0 ; idxPart < nbElements ; ++idxPart){
            const partsize_t srcData = permutation[idxPart].second;
            const partsize_t destData = idxPart;
            for(int idxVal = 0 ; idxVal < sizeElement ; ++idxVal){
                dataBuffer[destData*sizeElement + idxVal]
                        = data[srcData*sizeElement + idxVal];
            }
        }

        // Copy back
        for(partsize_t idxPart = 0 ; idxPart < nbElements ; ++idxPart){
            const partsize_t srcData = idxPart;
            const partsize_t destData = idxPart+offsetIdx;
            for(int idxVal = 0 ; idxVal < sizeElement ; ++idxVal){
                data[destData*sizeElement + idxVal]
                        = dataBuffer[srcData*sizeElement + idxVal];
            }
        }
Cristian Lalescu's avatar
Cristian Lalescu committed
132
133

        buffer->resize(0);
134
135
    }

136
    static int foundGridFactor(const real_number in_cutoff_radius, const std::array<real_number,3>& in_spatial_box_width){
137
        int idx_factor = 1;
138
        while(in_cutoff_radius <= in_spatial_box_width[IDXC_Z]/real_number(idx_factor+1)){
139
140
            idx_factor += 1;
        }
141
        return idx_factor;
142
143
    }

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
public:
    ////////////////////////////////////////////////////////////////////////////

    p2p_distr_mpi(MPI_Comm in_current_com,
                     const std::pair<int,int>& in_current_partitions,
                     const std::array<size_t,3>& in_field_grid_dim,
                     const std::array<real_number,3>& in_spatial_box_width,
                     const std::array<real_number,3>& in_spatial_box_offset,
                     const real_number in_cutoff_radius)
        : current_com(in_current_com),
            my_rank(-1), nb_processes(-1),nb_processes_involved(-1),
            current_partition_interval(in_current_partitions),
            current_partition_size(current_partition_interval.second-current_partition_interval.first),
            field_grid_dim(in_field_grid_dim),
            spatial_box_width(in_spatial_box_width), spatial_box_offset(in_spatial_box_offset),
159
            cutoff_radius_compute(in_cutoff_radius),
160
            nb_cells_factor(foundGridFactor(in_cutoff_radius, in_spatial_box_width)),
161
            cutoff_radius(in_spatial_box_width[IDXC_Z]/real_number(nb_cells_factor)){
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

        AssertMpi(MPI_Comm_rank(current_com, &my_rank));
        AssertMpi(MPI_Comm_size(current_com, &nb_processes));

        partition_interval_size_per_proc.reset(new int[nb_processes]);
        AssertMpi( MPI_Allgather( const_cast<int*>(&current_partition_size), 1, MPI_INT,
                                  partition_interval_size_per_proc.get(), 1, MPI_INT,
                                  current_com) );
        assert(partition_interval_size_per_proc[my_rank] == current_partition_size);

        partition_interval_offset_per_proc.reset(new int[nb_processes+1]);
        partition_interval_offset_per_proc[0] = 0;
        for(int idxProc = 0 ; idxProc < nb_processes ; ++idxProc){
            partition_interval_offset_per_proc[idxProc+1] = partition_interval_offset_per_proc[idxProc] + partition_interval_size_per_proc[idxProc];
        }

        current_offset_particles_for_partition.reset(new partsize_t[current_partition_size+1]);

        nb_processes_involved = nb_processes;
        while(nb_processes_involved != 0 && partition_interval_size_per_proc[nb_processes_involved-1] == 0){
            nb_processes_involved -= 1;
        }
        assert(nb_processes_involved != 0);
        for(int idx_proc_involved = 0 ; idx_proc_involved < nb_processes_involved ; ++idx_proc_involved){
            assert(partition_interval_size_per_proc[idx_proc_involved] != 0);
        }

189
        assert(int(field_grid_dim[IDXC_Z]) == partition_interval_offset_per_proc[nb_processes_involved]);
190

191
192
193
        nb_cell_levels[IDXC_X] = nb_cells_factor;
        nb_cell_levels[IDXC_Y] = nb_cells_factor;
        nb_cell_levels[IDXC_Z] = nb_cells_factor;
194
195
196
197
198
199
    }

    virtual ~p2p_distr_mpi(){}

    ////////////////////////////////////////////////////////////////////////////

200
201
202
203
204
205
206
207
    int getGridFactor() const{
        return nb_cells_factor;
    }

    real_number getGridCutoff() const{
        return cutoff_radius;
    }

208
    long int get_cell_coord_x_from_index(const long int index) const{
209
        return index % nb_cell_levels[IDXC_X];
210
211
212
    }

    long int get_cell_coord_y_from_index(const long int index) const{
213
214
        return (index % (nb_cell_levels[IDXC_X]*nb_cell_levels[IDXC_Y]))
                / nb_cell_levels[IDXC_X];
215
216
217
    }

    long int get_cell_coord_z_from_index(const long int index) const{
218
        return index / (nb_cell_levels[IDXC_X]*nb_cell_levels[IDXC_Y]);
219
220
221
    }

    long int first_cell_level_proc(const int dest_proc) const{
222
        const real_number field_section_width_z = spatial_box_width[IDXC_Z]/real_number(field_grid_dim[IDXC_Z]);
223
224
225
226
        return static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc]))/cutoff_radius);
    }

    long int last_cell_level_proc(const int dest_proc) const{
227
        const real_number field_section_width_z = spatial_box_width[IDXC_Z]/real_number(field_grid_dim[IDXC_Z]);
228
        const long int limite = static_cast<long int>((field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc+1])
229
                                     - std::numeric_limits<real_number>::epsilon())/cutoff_radius);
230
231
232
233
234
        if(static_cast<real_number>(limite)*cutoff_radius
                == field_section_width_z*real_number(partition_interval_offset_per_proc[dest_proc+1])){
            return limite-1;
        }
        return limite;
235
236
    }

Cristian Lalescu's avatar
Cristian Lalescu committed
237
    real_number apply_pbc(real_number pos, IDX_COMPONENT_3D dim) const{
238
239
240
241
242
243
244
245
246
        while( pos < spatial_box_offset[dim] ){
            pos += spatial_box_width[dim];
        }
        while( spatial_box_width[dim]+spatial_box_offset[dim] <= pos){
            pos -= spatial_box_width[dim];
        }
        return pos;
    }

247
248
    std::array<long int,3> get_cell_coordinate(const real_number pos_x, const real_number pos_y,
                                               const real_number pos_z) const {
249
250
251
        const real_number diff_x = apply_pbc(pos_x,IDXC_X) - spatial_box_offset[IDXC_X];
        const real_number diff_y = apply_pbc(pos_y,IDXC_Y) - spatial_box_offset[IDXC_Y];
        const real_number diff_z = apply_pbc(pos_z,IDXC_Z) - spatial_box_offset[IDXC_Z];
252
        std::array<long int,3> coord;
253
254
255
        coord[IDXC_X] = static_cast<long int>(diff_x/cutoff_radius);
        coord[IDXC_Y] = static_cast<long int>(diff_y/cutoff_radius);
        coord[IDXC_Z] = static_cast<long int>(diff_z/cutoff_radius);
256
257
258
259
260
261
        return coord;
    }

    long int get_cell_idx(const real_number pos_x, const real_number pos_y,
                          const real_number pos_z) const {
        std::array<long int,3> coord = get_cell_coordinate(pos_x, pos_y, pos_z);
262
        return ((coord[IDXC_Z]*nb_cell_levels[IDXC_Y])+coord[IDXC_Y])*nb_cell_levels[IDXC_X]+coord[IDXC_X];
263
264
265
    }

    real_number compute_distance_r2(const real_number x1, const real_number y1, const real_number z1,
Berenger Bramas's avatar
Berenger Bramas committed
266
                                    const real_number x2, const real_number y2, const real_number z2,
267
268
269
270
                                    const real_number xshift_coef, const real_number yshift_coef, const real_number zshift_coef,
                                    real_number &diff_x, real_number &diff_y, real_number &diff_z) const {
        diff_x = apply_pbc(x1,IDXC_X)-apply_pbc(x2,IDXC_X)+xshift_coef*spatial_box_width[IDXC_X];
        assert(std::abs(diff_x) <= 2*cutoff_radius);
271

272
273
        diff_y = apply_pbc(y1,IDXC_X)-apply_pbc(y2,IDXC_X)+yshift_coef*spatial_box_width[IDXC_Y];
        assert(std::abs(diff_y) <= 2*cutoff_radius);
274

275
276
        diff_z = apply_pbc(z1,IDXC_X)-apply_pbc(z2,IDXC_X)+zshift_coef*spatial_box_width[IDXC_Z];
        assert(std::abs(diff_z) <= 2*cutoff_radius);
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
277
278
279

        return (diff_x*diff_x) + (diff_y*diff_y) + (diff_z*diff_z);
    }
280

281
    template <class computer_class, int size_particle_positions, int size_particle_rhs>
282
283
284
    void compute_distr(computer_class& in_computer,
                       const partsize_t current_my_nb_particles_per_partition[],
                       real_number particles_positions[],
285
                       std::unique_ptr<real_number[]> particles_current_rhs[], const int nb_rhs,
286
                       partsize_t inout_index_particles[]){
287
        TIMEZONE("p2p_distr_mpi::compute_distr");
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311

        // Some processes might not be involved
        if(nb_processes_involved <= my_rank){
            return;
        }

        const long int my_top_z_cell_level = last_cell_level_proc(my_rank);
        const long int my_down_z_cell_level = first_cell_level_proc(my_rank);
        const long int my_nb_cell_levels = 1+my_top_z_cell_level-my_down_z_cell_level;

        current_offset_particles_for_partition[0] = 0;
        partsize_t myTotalNbParticles = 0;
        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
            myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition];
            current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition];
        }

        // Compute box idx for each particle
        std::unique_ptr<long int[]> particles_coord(new long int[current_offset_particles_for_partition[current_partition_size]]);

        {
            for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
                #pragma omp parallel for schedule(static)
                for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){
312
313
314
                    particles_coord[idxPart] = get_cell_idx(particles_positions[(idxPart)*size_particle_positions + IDXC_X],
                                                                              particles_positions[(idxPart)*size_particle_positions + IDXC_Y],
                                                                              particles_positions[(idxPart)*size_particle_positions + IDXC_Z]);
315
316
317
318
319
                    assert(my_down_z_cell_level <= get_cell_coord_z_from_index(particles_coord[idxPart]));
                    assert(get_cell_coord_z_from_index(particles_coord[idxPart]) <= my_top_z_cell_level);
                }
            }

320
            std::vector<std::pair<long int,partsize_t>> part_to_sort;
321
322
323
324
325
326
327

            // Sort each partition in cells
            for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
                part_to_sort.clear();

                for(partsize_t idxPart = current_offset_particles_for_partition[idxPartition] ; idxPart < current_offset_particles_for_partition[idxPartition+1] ; ++idxPart ){
                    part_to_sort.emplace_back();
328
329
                    part_to_sort.back().first = particles_coord[idxPart];
                    part_to_sort.back().second = idxPart;
330
331
                }

332
                assert(partsize_t(part_to_sort.size()) == (current_my_nb_particles_per_partition[idxPartition]));
333
334

                std::sort(part_to_sort.begin(), part_to_sort.end(),
335
336
337
                          [](const std::pair<long int,partsize_t>& p1,
                             const std::pair<long int,partsize_t>& p2){
                    return p1.first < p2.first;
338
                });
339
340

                // Permute array using buffer
341
                // permute 4th function parameter using buffer, based on information in first 3 parameters
342
                std::vector<unsigned char> buffer;
343
                permute_copy<real_number, size_particle_positions>(
344
                                current_offset_particles_for_partition[idxPartition],
345
346
                                current_my_nb_particles_per_partition[idxPartition],
                                part_to_sort.data(),
Cristian Lalescu's avatar
Cristian Lalescu committed
347
348
349
350
                                particles_positions,
                                &buffer);
                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs)
                {
351
352
353
354
355
356
357
                    permute_copy<real_number, size_particle_rhs>(
                                    current_offset_particles_for_partition[idxPartition],
                                    current_my_nb_particles_per_partition[idxPartition],
                                    part_to_sort.data(),
                                    particles_current_rhs[idx_rhs].get(),
                                    &buffer);
                }
358
                permute_copy<partsize_t, 1>(
359
                                current_offset_particles_for_partition[idxPartition],
360
361
                                current_my_nb_particles_per_partition[idxPartition],
                                part_to_sort.data(),
Cristian Lalescu's avatar
Cristian Lalescu committed
362
363
                                inout_index_particles,
                                &buffer);
364
                permute_copy<long int, 1>(
365
                                current_offset_particles_for_partition[idxPartition],
366
367
                                current_my_nb_particles_per_partition[idxPartition],
                                part_to_sort.data(),
Cristian Lalescu's avatar
Cristian Lalescu committed
368
369
                                particles_coord.get(),
                                &buffer);
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
            }
        }

        // Build the tree
        p2p_tree<std::vector<std::pair<partsize_t,partsize_t>>> my_tree(nb_cell_levels);

        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
            long int current_cell_idx = -1;
            partsize_t current_nb_particles_in_cell = 0;
            partsize_t current_cell_offset = 0;

            for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ;
                            idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){
                if(particles_coord[idx_part] != current_cell_idx){
                    if(current_nb_particles_in_cell){
                        my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell);
                    }
                    current_cell_idx = particles_coord[idx_part];
                    current_nb_particles_in_cell = 1;
                    current_cell_offset = idx_part;
390
391
392
                }
                else{
                    current_nb_particles_in_cell += 1;
393
394
395
396
397
398
399
400
401
                }
            }
            if(current_nb_particles_in_cell){
                my_tree.getCell(current_cell_idx).emplace_back(current_cell_offset,current_nb_particles_in_cell);

            }
        }

        // Offset per cell layers
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
402
        long int previous_index = 0;
403
        variable_used_only_in_assert(previous_index);
404
405
406
407
        std::unique_ptr<partsize_t[]> particles_offset_layers(new partsize_t[my_nb_cell_levels+1]());
        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
            for(partsize_t idx_part = current_offset_particles_for_partition[idxPartition] ;
                            idx_part != current_offset_particles_for_partition[idxPartition+1]; ++idx_part){
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
408
409
410
411
412
413
                const long int part_box_z_index = get_cell_coord_z_from_index(particles_coord[idx_part]);
                assert(my_down_z_cell_level <= part_box_z_index);
                assert(part_box_z_index <= my_top_z_cell_level);
                particles_offset_layers[part_box_z_index+1-my_down_z_cell_level] += 1;
                assert(previous_index <= part_box_z_index);
                previous_index = part_box_z_index;
414
415
            }
        }
416
        for(long int idx_layer = 0 ; idx_layer < my_nb_cell_levels ; ++idx_layer){
417
418
419
420
421
422
423
424
425
426
427
428
429
            particles_offset_layers[idx_layer+1] += particles_offset_layers[idx_layer];
        }

        // Reset vectors
        assert(whatNext.size() == 0);
        assert(mpiRequests.size() == 0);
        neigDescriptors.clear();

        // Find process with at least one neighbor
        {
            int dest_proc = (my_rank+1)%nb_processes_involved;
            while(dest_proc != my_rank
                  && (my_top_z_cell_level == first_cell_level_proc(dest_proc)
430
                      || (my_top_z_cell_level+1)%nb_cell_levels[IDXC_Z] == first_cell_level_proc(dest_proc))){
431
432
433
                // Find if we have to send 1 or 2 cell levels
                int nb_levels_to_send = 1;
                if(my_nb_cell_levels > 1 // I have more than one level
434
                        && (my_top_z_cell_level-1+2)%nb_cell_levels[IDXC_Z] <= last_cell_level_proc(dest_proc)){
435
436
437
438
439
440
441
442
                    nb_levels_to_send += 1;
                }

                NeighborDescriptor descriptor;
                descriptor.destProc = dest_proc;
                descriptor.nbLevelsToExchange = nb_levels_to_send;
                descriptor.nbParticlesToExchange = particles_offset_layers[my_nb_cell_levels] - particles_offset_layers[my_nb_cell_levels-nb_levels_to_send];
                descriptor.isRecv = false;
443
                descriptor.nbReceived = 0;
444
445
446
447
448
449
450
451
452

                neigDescriptors.emplace_back(std::move(descriptor));

                dest_proc = (dest_proc+1)%nb_processes_involved;
            }

            int src_proc = (my_rank-1+nb_processes_involved)%nb_processes_involved;
            while(src_proc != my_rank
                  && (last_cell_level_proc(src_proc) == my_down_z_cell_level
453
                      || (last_cell_level_proc(src_proc)+1)%nb_cell_levels[IDXC_Z] == my_down_z_cell_level)){
454
455
456
                // Find if we have to send 1 or 2 cell levels
                int nb_levels_to_recv = 1;
                if(my_nb_cell_levels > 1 // I have more than one level
457
                        && first_cell_level_proc(src_proc) <= (my_down_z_cell_level-1+2)%nb_cell_levels[IDXC_Z]){
458
459
460
461
462
463
464
465
                    nb_levels_to_recv += 1;
                }

                NeighborDescriptor descriptor;
                descriptor.destProc = src_proc;
                descriptor.nbLevelsToExchange = nb_levels_to_recv;
                descriptor.nbParticlesToExchange = -1;
                descriptor.isRecv = true;
466
                descriptor.nbReceived = 0;
467
468
469
470
471
472
473
474
475
476
477
478
479
480

                neigDescriptors.emplace_back(std::move(descriptor));

                src_proc = (src_proc-1+nb_processes_involved)%nb_processes_involved;
            }
        }

        //////////////////////////////////////////////////////////////////////
        /// Exchange the number of particles in each partition
        /// Could involve only here but I do not think it will be a problem
        //////////////////////////////////////////////////////////////////////

        assert(whatNext.size() == 0);
        assert(mpiRequests.size() == 0);
481
482
483
484
#ifndef NDEBUG // Just for assertion
        std::vector<int> willsend(nb_processes_involved, 0);
        std::vector<int> willrecv(nb_processes_involved, 0);
#endif
485
486
487
488
489
490
491
492
493
494
495

        for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){
            NeighborDescriptor& descriptor = neigDescriptors[idxDescr];

            if(descriptor.isRecv == false){
                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
                mpiRequests.emplace_back();
                AssertMpi(MPI_Isend(const_cast<partsize_t*>(&descriptor.nbParticlesToExchange),
                                    1, particles_utils::GetMpiType(partsize_t()),
                                    descriptor.destProc, TAG_NB_PARTICLES,
                                    current_com, &mpiRequests.back()));
496
497
498
#ifndef NDEBUG // Just for assertion
                willsend[descriptor.destProc] += 1;
#endif
499
500
501
502
                if(descriptor.nbParticlesToExchange){
                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
                    mpiRequests.emplace_back();
                    assert(descriptor.nbParticlesToExchange*size_particle_positions < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
503
504
505
506
507
508
509
510
                    AssertMpi(MPI_Isend(
                                const_cast<real_number*>(&particles_positions[particles_offset_layers[my_nb_cell_levels-descriptor.nbLevelsToExchange]*size_particle_positions]),
                                int(descriptor.nbParticlesToExchange*size_particle_positions),
                                particles_utils::GetMpiType(real_number()),
                                descriptor.destProc,
                                TAG_POSITION_PARTICLES,
                                current_com,
                                &mpiRequests.back()));
511

512
513
514
                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
                    mpiRequests.emplace_back();
                    assert(descriptor.nbParticlesToExchange*size_particle_positions < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
515
516
517
518
519
520
521
522
                    AssertMpi(MPI_Isend(
                                const_cast<partsize_t*>(&inout_index_particles[particles_offset_layers[my_nb_cell_levels-descriptor.nbLevelsToExchange]]),
                                int(descriptor.nbParticlesToExchange),
                                particles_utils::GetMpiType(partsize_t()),
                                descriptor.destProc,
                                TAG_INDEX_PARTICLES,
                                current_com,
                                &mpiRequests.back()));
523

524
525
                    assert(descriptor.toRecvAndMerge == nullptr);
                    descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToExchange*size_particle_rhs]);
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
526
                    whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, idxDescr});
527
528
                    mpiRequests.emplace_back();
                    assert(descriptor.nbParticlesToExchange*size_particle_rhs < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
529
530
531
532
533
534
535
536
                    AssertMpi(MPI_Irecv(
                                descriptor.toRecvAndMerge.get(),
                                int(descriptor.nbParticlesToExchange*size_particle_rhs),
                                particles_utils::GetMpiType(real_number()),
                                descriptor.destProc,
                                TAG_RESULT_PARTICLES,
                                current_com,
                                &mpiRequests.back()));
537
538
539
                }
            }
            else{
540
541
542
#ifndef NDEBUG // Just for assertion
                willrecv[descriptor.destProc] += 1;
#endif
543
544
                whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr});
                mpiRequests.emplace_back();
Cristian Lalescu's avatar
Cristian Lalescu committed
545
546
547
548
549
550
551
552
                AssertMpi(MPI_Irecv(
                            &descriptor.nbParticlesToExchange,
                            1,
                            particles_utils::GetMpiType(partsize_t()),
                            descriptor.destProc,
                            TAG_NB_PARTICLES,
                            current_com,
                            &mpiRequests.back()));
553
554
555
            }
        }

556
557
558
559
560
561
#ifndef NDEBUG // Just for assertion
        {
            if(myrank == 0){
                std::vector<int> willsendall(nb_processes_involved*nb_processes_involved, 0);// TODO debug
                std::vector<int> willrecvall(nb_processes_involved*nb_processes_involved, 0);// TODO debug

Cristian Lalescu's avatar
Cristian Lalescu committed
562
563
564
565
566
567
568
569
                MPI_Gather(willrecv.data(),
                           nb_processes_involved,
                           MPI_INT,
                           willrecvall.data(),
                           nb_processes_involved,
                           MPI_INT,
                           0,
                           MPI_COMM_WORLD);
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
                MPI_Gather(willsend.data(), nb_processes_involved, MPI_INT, willsendall.data(),
                            nb_processes_involved, MPI_INT, 0, MPI_COMM_WORLD);

                for(int idxproc = 0 ; idxproc < nb_processes_involved ; ++idxproc){
                    for(int idxtest = 0 ; idxtest < nb_processes_involved ; ++idxtest){
                        assert(willsendall[idxproc*nb_processes_involved + idxtest]
                                == willrecvall[idxtest*nb_processes_involved + idxproc]);
                    }
                }
            }
            else{
                MPI_Gather(willrecv.data(), nb_processes_involved, MPI_INT, nullptr,
                            0, MPI_INT, 0, MPI_COMM_WORLD);
                MPI_Gather(willsend.data(), nb_processes_involved, MPI_INT, nullptr,
                            0, MPI_INT, 0, MPI_COMM_WORLD);
            }
        }
#endif

589
590
        lock_free_bool_array cells_locker(512);

591
592
        std::vector<std::unique_ptr<computer_class>> computer_for_all_threads(omp_get_max_threads()-1);
        for(int idxThread = 1 ; idxThread < omp_get_max_threads() ; ++idxThread){
593
594
595
            computer_for_all_threads[idxThread-1].reset(new computer_class(in_computer));
        }

596
597
598
        TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads())
        #pragma omp parallel default(shared)
        {
599
            computer_class& computer_thread = (omp_get_thread_num() == 0 ? in_computer : *computer_for_all_threads[omp_get_thread_num()-1]);
600
601
602
            #pragma omp master
            {
                while(mpiRequests.size()){
Berenger Bramas's avatar
Berenger Bramas committed
603
                    TIMEZONE("wait-loop");
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
                    assert(mpiRequests.size() == whatNext.size());

                    int idxDone = int(mpiRequests.size());
                    {
                        TIMEZONE("wait");
                        AssertMpi(MPI_Waitany(int(mpiRequests.size()), mpiRequests.data(), &idxDone, MPI_STATUSES_IGNORE));
                    }
                    const std::pair<Action, int> releasedAction = whatNext[idxDone];
                    std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]);
                    std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]);
                    mpiRequests.pop_back();
                    whatNext.pop_back();

                    //////////////////////////////////////////////////////////////////////
                    /// Data to exchange particles
                    //////////////////////////////////////////////////////////////////////
                    if(releasedAction.first == RECV_PARTICLES){
Berenger Bramas's avatar
Berenger Bramas committed
621
                        TIMEZONE("post-recv-particles");
622
                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
623
                        assert(descriptor.isRecv);
624
625
626
627
                        const int destProc = descriptor.destProc;
                        const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange;
                        assert(NbParticlesToReceive != -1);
                        assert(descriptor.toCompute == nullptr);
628
                        assert(descriptor.indexes == nullptr);
629
630
631

                        if(NbParticlesToReceive){
                            descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]);
632
                            whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second});
633
634
                            mpiRequests.emplace_back();
                            assert(NbParticlesToReceive*size_particle_positions < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
635
636
637
638
639
640
641
642
                            AssertMpi(MPI_Irecv(
                                        descriptor.toCompute.get(),
                                        int(NbParticlesToReceive*size_particle_positions),
                                        particles_utils::GetMpiType(real_number()),
                                        destProc,
                                        TAG_POSITION_PARTICLES,
                                        current_com,
                                        &mpiRequests.back()));
643

644
                            descriptor.indexes.reset(new partsize_t[NbParticlesToReceive]);
645
646
647
                            whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second});
                            mpiRequests.emplace_back();
                            assert(NbParticlesToReceive*size_particle_positions < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
648
649
650
651
652
653
654
655
                            AssertMpi(MPI_Irecv(
                                        descriptor.indexes.get(),
                                        int(NbParticlesToReceive),
                                        particles_utils::GetMpiType(partsize_t()),
                                        destProc,
                                        TAG_INDEX_PARTICLES,
                                        current_com,
                                        &mpiRequests.back()));
656
657
658
659
660
661
662
663
                        }
                    }

                    //////////////////////////////////////////////////////////////////////
                    /// Computation
                    //////////////////////////////////////////////////////////////////////
                    if(releasedAction.first == COMPUTE_PARTICLES){
                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
664
665
666
667
668
669
670
671
672
673
                        descriptor.nbReceived += 1;
                        assert(descriptor.nbReceived <= 2);

                        if(descriptor.nbReceived == 2){
                            TIMEZONE("compute-particles");
                            NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
                            assert(descriptor.isRecv);
                            const partsize_t NbParticlesToReceive = descriptor.nbParticlesToExchange;

                            assert(descriptor.toCompute != nullptr);
674
                            assert(descriptor.indexes != nullptr);
Cristian Lalescu's avatar
Cristian Lalescu committed
675
                            // allocate rhs buffer
676
                            descriptor.results.reset(new real_number[NbParticlesToReceive*size_particle_rhs]);
Cristian Lalescu's avatar
Cristian Lalescu committed
677
                            // clean up rhs buffer
678
679
680
                            set_particle_data_to_zero<partsize_t, real_number, size_particle_rhs>(
                                    descriptor.results.get(),
                                    NbParticlesToReceive);
681
682
683
684
685
686
687
688
689
690
691
692
693
694

                            // Compute
                            partsize_t idxPart = 0;
                            while(idxPart != NbParticlesToReceive){
                                const long int current_cell_idx = get_cell_idx(descriptor.toCompute[idxPart*size_particle_positions + IDXC_X],
                                                                               descriptor.toCompute[idxPart*size_particle_positions + IDXC_Y],
                                                                               descriptor.toCompute[idxPart*size_particle_positions + IDXC_Z]);
                                partsize_t nb_parts_in_cell = 1;
                                while(idxPart+nb_parts_in_cell != NbParticlesToReceive
                                      && current_cell_idx == get_cell_idx(descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDXC_X],
                                                                         descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDXC_Y],
                                                                         descriptor.toCompute[(idxPart+nb_parts_in_cell)*size_particle_positions + IDXC_Z])){
                                    nb_parts_in_cell += 1;
                                }
695

696
697
                                #pragma omp task default(shared) firstprivate(idxPart, nb_parts_in_cell, current_cell_idx)
                                {
698
                                    computer_class& computer_thread_task = (omp_get_thread_num() == 0 ? in_computer : *computer_for_all_threads[omp_get_thread_num()-1]);
699
700
701
                                    const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27];
                                    long int neighbors_indexes[27];
                                    std::array<real_number,3> shift[27];
702
                                    real_number diff_x, diff_y, diff_z;
703
                                    const int nbNeighbors = my_tree.getNeighbors(
704
705
706
707
708
                                                    current_cell_idx,
                                                    neighbors,
                                                    neighbors_indexes,
                                                    shift,
                                                    true);
709
710
711
712
713
714
715
716

                                    // with other interval
                                    for(int idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){
                                        cells_locker.lock(neighbors_indexes[idx_neighbor]);

                                        for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){
                                            for(partsize_t idx_p1 = 0 ; idx_p1 < nb_parts_in_cell ; ++idx_p1){
                                                for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){
717
                                                    const real_number dist_r2 = compute_distance_r2(
718
                                                                    descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDXC_X],
719
720
721
722
723
                                                                    descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDXC_Y],
                                                                    descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions + IDXC_Z],
                                                                    particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_X],
                                                                    particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_Y],
                                                                    particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_Z],
724
725
726
727
728
729
                                                                    shift[idx_neighbor][IDXC_X],
                                                                    shift[idx_neighbor][IDXC_Y],
                                                                    shift[idx_neighbor][IDXC_Z],
                                                                    diff_x,
                                                                    diff_y,
                                                                    diff_z);
730
                                                    if(dist_r2 < cutoff_radius_compute*cutoff_radius_compute){
731
                                                        computer_thread_task.template compute_interaction<size_particle_positions, size_particle_rhs>(
732
733
734
735
736
                                                                            descriptor.indexes[(idxPart+idx_p1)],
                                                                            &descriptor.toCompute[(idxPart+idx_p1)*size_particle_positions],
                                                                            &descriptor.results[(idxPart+idx_p1)*size_particle_rhs],
                                                                            inout_index_particles[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)],
                                                                            &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions],
737
                                                                            &particles_current_rhs[0][((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs],
738
                                                                            dist_r2,
739
740
741
742
                                                                            cutoff_radius_compute,
                                                                            diff_x,
                                                                            diff_y,
                                                                            diff_z);
743
                                                    }
744
                                                }
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
745
                                            }
746
                                        }
747

748
749
                                        cells_locker.unlock(neighbors_indexes[idx_neighbor]);
                                    }
750
751
                                }

752
753
                                idxPart += nb_parts_in_cell;
                            }
754

755
                            #pragma omp taskwait
756

757
758
759
760
761
                            // Send back
                            const int destProc = descriptor.destProc;
                            whatNext.emplace_back(std::pair<Action,int>{RELEASE_BUFFER_PARTICLES, releasedAction.second});
                            mpiRequests.emplace_back();
                            assert(NbParticlesToReceive*size_particle_rhs < std::numeric_limits<int>::max());
Cristian Lalescu's avatar
Cristian Lalescu committed
762
763
764
765
766
767
768
769
                            AssertMpi(MPI_Isend(
                                        descriptor.results.get(),
                                        int(NbParticlesToReceive*size_particle_rhs),
                                        particles_utils::GetMpiType(real_number()),
                                        destProc,
                                        TAG_RESULT_PARTICLES,
                                        current_com,
                                        &mpiRequests.back()));
770
                            delete[] descriptor.toCompute.release();
771
                            delete[] descriptor.indexes.release();
772
                        }
773
774
                    }
                    //////////////////////////////////////////////////////////////////////
775
                    /// Release memory that was sent back
776
                    //////////////////////////////////////////////////////////////////////
777
778
                    if(releasedAction.first == RELEASE_BUFFER_PARTICLES){
                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
779
                        assert(descriptor.results != nullptr);
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
780
                        assert(descriptor.isRecv);
Cristian Lalescu's avatar
Cristian Lalescu committed
781
                        delete[] descriptor.results.release();
782
783
784
785
                    }
                    //////////////////////////////////////////////////////////////////////
                    /// Merge
                    //////////////////////////////////////////////////////////////////////
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
786
                    if(releasedAction.first == MERGE_PARTICLES){
Berenger Bramas's avatar
Berenger Bramas committed
787
                        TIMEZONE("merge");
788
                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
Berenger Bramas's avatar
Debug    
Berenger Bramas committed
789
                        assert(descriptor.isRecv == false);
790
                        assert(descriptor.toRecvAndMerge != nullptr);
Cristian Lalescu's avatar
Cristian Lalescu committed
791
792
793
794
                        computer_thread.template reduce_particles_rhs<size_particle_rhs>(
                                &particles_current_rhs[0][particles_offset_layers[my_nb_cell_levels-descriptor.nbLevelsToExchange]*size_particle_rhs],
                                descriptor.toRecvAndMerge.get(),
                                descriptor.nbParticlesToExchange);
Cristian Lalescu's avatar
Cristian Lalescu committed
795
                        delete[] descriptor.toRecvAndMerge.release();
796
797
798
799
800
801
802
803
                    }
                }
            }
        }

        assert(whatNext.size() == 0);
        assert(mpiRequests.size() == 0);

804
805
806
807
808
809
810
        {
            computer_class& computer_thread = (omp_get_thread_num() == 0 ? in_computer : *computer_for_all_threads[omp_get_thread_num()-1]);
            // Compute self data
            for(const auto& iter_cell : my_tree){
                TIMEZONE("proceed-leaf");
                const long int currenct_cell_idx = iter_cell.first;
                const std::vector<std::pair<partsize_t,partsize_t>>* intervals_ptr = &iter_cell.second;
811

812
813
814
815
816
    #pragma omp task default(shared) firstprivate(currenct_cell_idx, intervals_ptr)
                {
                    const std::vector<std::pair<partsize_t,partsize_t>>& intervals = (*intervals_ptr);

                    cells_locker.lock(currenct_cell_idx);
817

818
819
                    for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){
                        // self interval
820
                        for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
821
                            for(partsize_t idx_p2 = idx_p1+1 ; idx_p2 < intervals[idx_1].second ; ++idx_p2){
822
                                real_number diff_x, diff_y, diff_z;
823
824
825
                                const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_X],
                                                                                particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Y],
                                                                                particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Z],
826
827
828
                                                                                particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDXC_X],
                                                                                particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDXC_Y],
                                                                                particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions + IDXC_Z],
829
830
                                                                                0, 0, 0,
                                                                                diff_x, diff_y, diff_z);
831
                                if(dist_r2 < cutoff_radius_compute*cutoff_radius_compute){
832
                                    computer_thread.template compute_interaction<size_particle_positions,size_particle_rhs>(
833
                                                        inout_index_particles[(intervals[idx_1].first+idx_p1)],
834
                                                        &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
835
                                                        &particles_current_rhs[0][(intervals[idx_1].first+idx_p1)*size_particle_rhs],
836
                                                        inout_index_particles[(intervals[idx_1].first+idx_p2)],
837
                                                        &particles_positions[(intervals[idx_1].first+idx_p2)*size_particle_positions],
838
                                                        &particles_current_rhs[0][(intervals[idx_1].first+idx_p2)*size_particle_rhs],
839
                                                        dist_r2, cutoff_radius_compute, diff_x, diff_y, diff_z);
840
841
                                }
                            }
842
                        }
843
844
845
846
847

                        // with other interval
                        for(size_t idx_2 = idx_1+1 ; idx_2 < intervals.size() ; ++idx_2){
                            for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
                                for(partsize_t idx_p2 = 0 ; idx_p2 < intervals[idx_2].second ; ++idx_p2){
848
                                    real_number diff_x, diff_y, diff_z;
849
850
851
852
853
854
                                    const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_X],
                                                                                    particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Y],
                                                                                    particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Z],
                                                                                    particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDXC_X],
                                                                                    particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDXC_Y],
                                                                                    particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions + IDXC_Z],
855
856
                                                                                    0, 0, 0,
                                                                                    diff_x, diff_y, diff_z);
857
858
                                    if(dist_r2 < cutoff_radius_compute*cutoff_radius_compute){
                                        computer_thread.template compute_interaction<size_particle_positions,size_particle_rhs>(
859
                                                            inout_index_particles[(intervals[idx_1].first+idx_p1)],
860
                                                            &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
861
                                                            &particles_current_rhs[0][(intervals[idx_1].first+idx_p1)*size_particle_rhs],
862
                                                            inout_index_particles[(intervals[idx_2].first+idx_p2)],
863
                                                            &particles_positions[(intervals[idx_2].first+idx_p2)*size_particle_positions],
864
                                                            &particles_current_rhs[0][(intervals[idx_2].first+idx_p2)*size_particle_rhs],
865
                                                            dist_r2, cutoff_radius_compute, diff_x, diff_y, diff_z);
866
867
868
869
                                    }
                                }
                            }
                        }
870
871
                    }

872
873
874
875
876
877
878
879
880
881
882
883
884
885
                    const std::vector<std::pair<partsize_t,partsize_t>>* neighbors[27];
                    long int neighbors_indexes[27];
                    std::array<real_number,3> shift[27];
                    const int nbNeighbors = my_tree.getNeighbors(currenct_cell_idx, neighbors, neighbors_indexes, shift, false);

                    for(size_t idx_1 = 0 ; idx_1 < intervals.size() ; ++idx_1){
                        // with other interval
                        for(int idx_neighbor = 0 ; idx_neighbor < nbNeighbors ; ++idx_neighbor){
                            if(currenct_cell_idx < neighbors_indexes[idx_neighbor]){
                                cells_locker.lock(neighbors_indexes[idx_neighbor]);

                                for(size_t idx_2 = 0 ; idx_2 < (*neighbors[idx_neighbor]).size() ; ++idx_2){
                                    for(partsize_t idx_p1 = 0 ; idx_p1 < intervals[idx_1].second ; ++idx_p1){
                                        for(partsize_t idx_p2 = 0 ; idx_p2 < (*neighbors[idx_neighbor])[idx_2].second ; ++idx_p2){
886
                                            real_number diff_x, diff_y, diff_z;
887
888
889
890
891
892
                                            const real_number dist_r2 = compute_distance_r2(particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_X],
                                                                                            particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Y],
                                                                                            particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions + IDXC_Z],
                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_X],
                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_Y],
                                                                                            particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions + IDXC_Z],
893
894
                                                                                            shift[idx_neighbor][IDXC_X], shift[idx_neighbor][IDXC_Y], shift[idx_neighbor][IDXC_Z],
                                                                                            diff_x, diff_y, diff_z);
895
896
                                            if(dist_r2 < cutoff_radius_compute*cutoff_radius_compute){
                                                computer_thread.template compute_interaction<size_particle_positions,size_particle_rhs>(
897
                                                                    inout_index_particles[(intervals[idx_1].first+idx_p1)],
898
                                                                    &particles_positions[(intervals[idx_1].first+idx_p1)*size_particle_positions],
899
                                                                    &particles_current_rhs[0][(intervals[idx_1].first+idx_p1)*size_particle_rhs],
900
                                                                    inout_index_particles[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)],
901
                                                                    &particles_positions[((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_positions],
902
                                                                    &particles_current_rhs[0][((*neighbors[idx_neighbor])[idx_2].first+idx_p2)*size_particle_rhs],
903
904
                                                                    dist_r2, cutoff_radius_compute,
                                                                    diff_x, diff_y, diff_z);
905
                                            }
906
                                        }
907
                                    }
908
                                }
909
                                cells_locker.unlock(neighbors_indexes[idx_neighbor]);
910
911
912
                            }
                        }
                    }
913

914
915
                    cells_locker.unlock(currenct_cell_idx);
                }
916
917
            }
        }
918
919
920
921

        for(int idxThread = 1 ; idxThread < omp_get_num_threads() ; ++idxThread){
            in_computer.merge(*computer_for_all_threads[idxThread-1]);
        }
922
923
924
925
    }
};

#endif