elpa2_trans_ev_tridi_to_band_template.F90 86.2 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

53
#include "../general/sanity.F90"
Andreas Marek's avatar
Andreas Marek committed
54

Andreas Marek's avatar
Andreas Marek committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
subroutine trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, q, ldq, matrixCols,         &
 hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, max_threads, success, &
 kernel)

!-------------------------------------------------------------------------------
!  trans_ev_tridi_to_band_real/complex:
!  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
!  Parameters
!
!  na          Order of matrix a, number of rows of matrix q
!
!  nev         Number eigenvectors to compute (= columns of matrix q)
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  nb          semi bandwith
!
!  q           On input: Eigenvectors of tridiagonal matrix
!              On output: Transformed eigenvectors
!              Distribution is like in Scalapack.
!
!  ldq         Leading dimension of q
!  matrixCols  local columns of matrix q
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns/both
!
!-------------------------------------------------------------------------------
  use elpa_abstract_impl
  use elpa2_workload
  use pack_unpack_cpu
  use pack_unpack_gpu
  use compute_hh_trafo
  use cuda_functions
  use precision
96
  use, intrinsic :: iso_c_binding
97
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
98
  ! use omp_lib
Andreas Marek's avatar
Andreas Marek committed
99
#endif
Andreas Marek's avatar
Andreas Marek committed
100
  implicit none
101
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
102
103
  class(elpa_abstract_impl_t), intent(inout) :: obj
  logical, intent(in)                        :: useGPU
104

Andreas Marek's avatar
Andreas Marek committed
105
106
  integer(kind=ik), intent(in)               :: kernel
  integer(kind=ik), intent(in)               :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
107
108

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
109
  MATH_DATATYPE(kind=rck)                    :: q(ldq,*)
110
#else
Andreas Marek's avatar
Andreas Marek committed
111
  MATH_DATATYPE(kind=rck)                    :: q(ldq,matrixCols)
112
113
#endif

Andreas Marek's avatar
Andreas Marek committed
114
  MATH_DATATYPE(kind=rck), intent(in)        :: hh_trans(:,:)
Andreas Marek's avatar
Andreas Marek committed
115

Andreas Marek's avatar
Andreas Marek committed
116
117
118
119
120
121
122
  integer(kind=ik)                           :: np_rows, my_prow, np_cols, my_pcol
  integer(kind=MPI_KIND)                     :: np_rowsMPI, my_prowMPI, np_colsMPI, my_pcolMPI
  integer(kind=ik)                           :: i, j, ip, sweep, nbuf, l_nev, a_dim2
  integer(kind=ik)                           :: current_n, current_local_n, current_n_start, current_n_end
  integer(kind=ik)                           :: next_n, next_local_n, next_n_start, next_n_end
  integer(kind=ik)                           :: bottom_msg_length, top_msg_length, next_top_msg_length
  integer(kind=ik)                           :: stripe_width, last_stripe_width, stripe_count
123
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
124
  integer(kind=ik)                           :: thread_width, csw, b_off, b_len
125
#endif
Andreas Marek's avatar
Andreas Marek committed
126
127
128
129
  integer(kind=ik)                           :: num_result_blocks, num_result_buffers, num_bufs_recvd
  integer(kind=ik)                           :: a_off, current_tv_off, max_blk_size
  integer(kind=ik)                           :: src, src_offset, dst, offset, nfact, num_blk
  integer(kind=MPI_KIND)                     :: mpierr
130

Andreas Marek's avatar
Andreas Marek committed
131
  logical                                    :: flag
132
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
133
  MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:,:)
134
#else
Andreas Marek's avatar
Andreas Marek committed
135
  MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:)
136
#endif
Andreas Marek's avatar
Andreas Marek committed
137
  MATH_DATATYPE(kind=rck)                    :: a_var
138

Andreas Marek's avatar
Andreas Marek committed
139
  type(c_ptr)                                :: aIntern_ptr
140

Andreas Marek's avatar
Andreas Marek committed
141
142
  MATH_DATATYPE(kind=rck), allocatable       :: row(:)
  MATH_DATATYPE(kind=rck), pointer           :: row_group(:,:)
143

144
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
145
146
147
148
  MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: top_border_recv_buffer(:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_recv_buffer(:,:)
149
#else
Andreas Marek's avatar
Andreas Marek committed
150
151
152
153
  MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: top_border_recv_buffer(:,:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:,:)
  MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_recv_buffer(:,:,:)
154
155
#endif

Andreas Marek's avatar
Andreas Marek committed
156
157
158
159
160
161
162
  integer(kind=c_intptr_t)                   :: aIntern_dev
  integer(kind=c_intptr_t)                   :: bcast_buffer_dev
  integer(kind=c_intptr_t)                   :: num
  integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1
  integer(kind=c_intptr_t)                   :: row_group_dev
  integer(kind=c_intptr_t)                   :: hh_tau_dev
  integer(kind=ik)                           :: row_group_size, unpack_idx
163

Andreas Marek's avatar
Andreas Marek committed
164
  type(c_ptr)                                :: row_group_host, bcast_buffer_host
Wenzhe Yu's avatar
Wenzhe Yu committed
165

Andreas Marek's avatar
Andreas Marek committed
166
167
  integer(kind=ik)                           :: n_times
  integer(kind=ik)                           :: chunk, this_chunk
168

Andreas Marek's avatar
Andreas Marek committed
169
170
  MATH_DATATYPE(kind=rck), allocatable       :: result_buffer(:,:,:)
  MATH_DATATYPE(kind=rck), pointer           :: bcast_buffer(:,:)
171

Andreas Marek's avatar
Andreas Marek committed
172
  integer(kind=ik)                           :: n_off
173

Andreas Marek's avatar
Andreas Marek committed
174
175
176
177
  integer(kind=MPI_KIND), allocatable        :: result_send_request(:), result_recv_request(:)
  integer(kind=ik), allocatable              :: limits(:)
  integer(kind=MPI_KIND), allocatable        :: top_send_request(:), bottom_send_request(:)
  integer(kind=MPI_KIND), allocatable        :: top_recv_request(:), bottom_recv_request(:)
178

Andreas Marek's avatar
Andreas Marek committed
179
  ! MPI send/recv tags, arbitrary
180

Andreas Marek's avatar
Andreas Marek committed
181
182
183
  integer(kind=ik), parameter                :: bottom_recv_tag = 111
  integer(kind=ik), parameter                :: top_recv_tag    = 222
  integer(kind=ik), parameter                :: result_recv_tag = 333
Andreas Marek's avatar
Andreas Marek committed
184

Andreas Marek's avatar
Andreas Marek committed
185
  integer(kind=ik), intent(in)               :: max_threads
186

187
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
188
  integer(kind=ik)                           :: my_thread
189
190
191
#endif


Andreas Marek's avatar
Andreas Marek committed
192
193
194
195
  ! Just for measuring the kernel performance
  real(kind=c_double)                        :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
  ! long integer
  integer(kind=lik)                          :: kernel_flops, kernel_flops_recv
196

Andreas Marek's avatar
Andreas Marek committed
197
198
199
200
201
202
  logical, intent(in)                        :: wantDebug
  logical                                    :: success
  integer(kind=ik)                           :: istat, print_flops
  character(200)                             :: errorMessage
  character(20)                              :: gpuString
  logical                                    :: successCUDA
203
#ifndef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
204
  integer(kind=ik)                           :: j1
205
#endif
Andreas Marek's avatar
Andreas Marek committed
206
207
208
209
210
211
212
213
214
215
216
  integer(kind=ik)                           :: error
  integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                 &PRECISION&
                                                                 &_&
                                                                 &MATH_DATATYPE

  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif
217

Andreas Marek's avatar
Andreas Marek committed
218
219
220
221
222
  call obj%timer%start("trans_ev_tridi_to_band_&
  &MATH_DATATYPE&
  &" // &
  &PRECISION_SUFFIX //&
  gpuString)
223

Andreas Marek's avatar
Andreas Marek committed
224
225
226
227
228
  n_times = 0
  if (useGPU) then
    unpack_idx = 0
    row_group_size = 0
  endif
229

Andreas Marek's avatar
Andreas Marek committed
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
  success = .true.
  kernel_time = 0.0
  kernel_flops = 0

  if (wantDebug) call obj%timer%start("mpi_communication")
  call MPI_Comm_rank(int(mpi_comm_rows,kind=MPI_KIND) , my_prowMPI , mpierr)
  call MPI_Comm_size(int(mpi_comm_rows,kind=MPI_KIND) , np_rowsMPI , mpierr)
  call MPI_Comm_rank(int(mpi_comm_cols,kind=MPI_KIND) , my_pcolMPI , mpierr)
  call MPI_Comm_size(int(mpi_comm_cols,kind=MPI_KIND) , np_colsMPI , mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  my_pcol = int(my_pcolMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  np_cols = int(np_colsMPI,kind=c_int)

  if (wantDebug) call obj%timer%stop("mpi_communication")

  if (mod(nbw,nblk)/=0) then
    if (my_prow==0 .and. my_pcol==0) then
      if (wantDebug) then
        write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
                            &MATH_DATATYPE&
                            &: ERROR: nbw=',nbw,', nblk=',nblk
        write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
                            &MATH_DATATYPE&
                            &: band backtransform works only for nbw==n*nblk'
256
      endif
Andreas Marek's avatar
Andreas Marek committed
257
258
259
260
      success = .false.
      return
    endif
  endif
261

Andreas Marek's avatar
Andreas Marek committed
262
  nfact = nbw / nblk
263
264


Andreas Marek's avatar
Andreas Marek committed
265
266
  ! local number of eigenvectors
  l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
267

Andreas Marek's avatar
Andreas Marek committed
268
  if (l_nev==0) then
269
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
270
    thread_width = 0
271
#endif
Andreas Marek's avatar
Andreas Marek committed
272
273
274
    stripe_width = 0
    stripe_count = 0
    last_stripe_width = 0
275

Andreas Marek's avatar
Andreas Marek committed
276
  else ! l_nev
277

278
#if WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
279
280
281
    ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
    ! every primary cache
    ! Suggested stripe width is 48 - should this be reduced for the complex case ???
282

Andreas Marek's avatar
Andreas Marek committed
283
284
285
286
287
288
    if (useGPU) then
      stripe_width = 256 ! Must be a multiple of 4
      stripe_count = (l_nev - 1) / stripe_width + 1
    else ! useGPU
      ! openmp only in non-GPU case
      thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
289

290
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
291
      call obj%get("stripewidth_real",stripe_width, error)
292

293
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
294
      !stripe_width = 48 ! Must be a multiple of 4
295
#else
Andreas Marek's avatar
Andreas Marek committed
296
297
      stripe_width = stripe_width * 2
      !stripe_width = 96 ! Must be a multiple of 8
298
299
300
301
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
302
      call obj%get("stripewidth_complex",stripe_width, error)
303

304
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
305
      !stripe_width = 48 ! Must be a multiple of 2
306
#else
Andreas Marek's avatar
Andreas Marek committed
307
308
      stripe_width = stripe_width * 2
      !stripe_width = 48 ! Must be a multiple of 4
309
310
311
#endif
#endif /* COMPLEXCASE */

Andreas Marek's avatar
Andreas Marek committed
312
      stripe_count = (thread_width-1)/stripe_width + 1
313

Andreas Marek's avatar
Andreas Marek committed
314
      ! Adapt stripe width so that last one doesn't get too small
315

Andreas Marek's avatar
Andreas Marek committed
316
      stripe_width = (thread_width-1)/stripe_count + 1
317
318
319

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
320
321
      if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
322
323
324
325
326
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
          ) then
327

Andreas Marek's avatar
Andreas Marek committed
328
329
        stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                              ! (8 * sizeof(double) == 64)
330

Andreas Marek's avatar
Andreas Marek committed
331
332
333
334
      else
        stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
                                              ! (4 * sizeof(double) == 32)
      endif
335
#else
Andreas Marek's avatar
Andreas Marek committed
336
337
      if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
338
339
340
341
342
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6 &
          ) then
343
344


Andreas Marek's avatar
Andreas Marek committed
345
346
        stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
                                           ! (16 * sizeof(float) == 64)
347

Andreas Marek's avatar
Andreas Marek committed
348
349
350
351
      else
        stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
                                         ! (8 * sizeof(float) == 32)
      endif
352
353
354
355
356
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
357
      if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
358
359
360
361
          kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2 &
          ) then
362

Andreas Marek's avatar
Andreas Marek committed
363
364
        stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
                                        ! (4 * sizeof(double complex) == 64)
365

Andreas Marek's avatar
Andreas Marek committed
366
      else
367

Andreas Marek's avatar
Andreas Marek committed
368
369
370
        stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
                                        ! (2 * sizeof(double complex) == 32)
      endif
371
#else
372

Andreas Marek's avatar
Andreas Marek committed
373
      if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
374
375
376
377
          kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2  &
          ) then
378

Andreas Marek's avatar
Andreas Marek committed
379
380
        stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                        ! (8 * sizeof(float complex) == 64)
381

Andreas Marek's avatar
Andreas Marek committed
382
383
384
385
      else
        stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
                                        ! (4 * sizeof(float complex) == 32)
      endif
386
387
388
#endif
#endif /* COMPLEXCASE */

389
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
390
      last_stripe_width = l_nev - (stripe_count-1)*stripe_width
391
392
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
393
394
      ! only needed in no OMP case check thsis
      ! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
395
#endif
396

Andreas Marek's avatar
Andreas Marek committed
397
    endif ! useGPU
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
398

399
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
400

Andreas Marek's avatar
Andreas Marek committed
401
402
403
    ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
    ! every primary cache
    ! Suggested stripe width is 48 - should this be reduced for the complex case ???
404

Andreas Marek's avatar
Andreas Marek committed
405
406
407
    if (useGPU) then
      stripe_width = 1024 ! Must be a multiple of 4
      stripe_count = (l_nev - 1) / stripe_width + 1
408

Andreas Marek's avatar
Andreas Marek committed
409
    else ! useGPU
410
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
411
      call obj%get("stripewidth_real",stripe_width, error)
412

413
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
414
      !stripe_width = 48 ! Must be a multiple of 4
415
#else
Andreas Marek's avatar
Andreas Marek committed
416
417
      !stripe_width = 96 ! Must be a multiple of 8
      stripe_width = 2 * stripe_width
418
419
420
421
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
422
      call obj%get("stripewidth_complex",stripe_width, error)
423

424
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
425
      !stripe_width = 48 ! Must be a multiple of 2
426
#else
Andreas Marek's avatar
Andreas Marek committed
427
      !stripe_width = 48 ! Must be a multiple of 4
428
429
430
#endif
#endif /* COMPLEXCASE */

Andreas Marek's avatar
Andreas Marek committed
431
      stripe_count = (l_nev-1)/stripe_width + 1
432

Andreas Marek's avatar
Andreas Marek committed
433
      ! Adapt stripe width so that last one doesn't get too small
434

Andreas Marek's avatar
Andreas Marek committed
435
      stripe_width = (l_nev-1)/stripe_count + 1
436
437
438

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
439
440
      if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
441
442
443
444
445
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6  &
          ) then
446

Andreas Marek's avatar
Andreas Marek committed
447
448
        stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                              ! (8 * sizeof(double) == 64)
449

Andreas Marek's avatar
Andreas Marek committed
450
451
      else
        stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab    
Andreas Marek committed
452
                                            ! (4 * sizeof(double) == 32)
Andreas Marek's avatar
Andreas Marek committed
453
      endif
454
#else
Andreas Marek's avatar
Andreas Marek committed
455
456
      if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
457
458
459
460
461
          kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK4 .or. &
          kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK6  &
          ) then
462
463


Andreas Marek's avatar
Andreas Marek committed
464
       stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab    
Andreas Marek committed
465
                                               ! (16 * sizeof(float) == 64)
466

Andreas Marek's avatar
Andreas Marek committed
467
468
     else
       stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab    
Andreas Marek committed
469
                                            ! (8 * sizeof(float) == 32)
Andreas Marek's avatar
Andreas Marek committed
470
     endif
471
472
473
474
475
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
476

Andreas Marek's avatar
Andreas Marek committed
477
     if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
478
479
480
481
         kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
         kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
         kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2  &
     ) then
482

Andreas Marek's avatar
Andreas Marek committed
483
484
       stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
                                       ! (4 * sizeof(double complex) == 64)
485

Andreas Marek's avatar
Andreas Marek committed
486
     else
487

Andreas Marek's avatar
Andreas Marek committed
488
489
490
       stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
                                       ! (2 * sizeof(double complex) == 32)
     endif
491
#else
492

Andreas Marek's avatar
Andreas Marek committed
493
     if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
494
495
496
497
         kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 .or. &
         kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1 .or. &
         kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK2  &
         ) then
498

Andreas Marek's avatar
Andreas Marek committed
499
500
       stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                       ! (8 * sizeof(float complex) == 64)
501

Andreas Marek's avatar
Andreas Marek committed
502
503
504
505
     else
       stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
                                       ! (4 * sizeof(float complex) == 32)
     endif
506
507
#endif
#endif /* COMPLEXCASE */
Andreas Marek's avatar
Andreas Marek committed
508
   endif ! useGPU
509

Andreas Marek's avatar
Andreas Marek committed
510
   last_stripe_width = l_nev - (stripe_count-1)*stripe_width
511

512
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
513
  endif ! l_nev
514

Andreas Marek's avatar
Andreas Marek committed
515
  ! Determine the matrix distribution at the beginning
516

Andreas Marek's avatar
Andreas Marek committed
517
518
519
  allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
  check_allocate("trans_ev_tridi_to_band: limits", istat, errorMessage)
  call determine_workload(obj,na, nbw, np_rows, limits)
520

Andreas Marek's avatar
Andreas Marek committed
521
  max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
522

Andreas Marek's avatar
Andreas Marek committed
523
  a_dim2 = max_blk_size + nbw
524

Andreas Marek's avatar
Andreas Marek committed
525
526
527
528
  if (useGPU) then
    num =  (stripe_width*a_dim2*stripe_count)* size_of_datatype
    successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
    check_alloc_cuda("trans_ev_tridi_to_band: aIntern_dev", successCUDA)
529

Andreas Marek's avatar
Andreas Marek committed
530
531
    successCUDA = cuda_memset(aIntern_dev , 0, num)
    check_memset_cuda("trans_ev_tridi_to_band: aIntern_dev", successCUDA)
532

Andreas Marek's avatar
Andreas Marek committed
533
534
535
536
    ! "row_group" and "row_group_dev" are needed for GPU optimizations
    successCUDA = cuda_malloc_host(row_group_host,l_nev*nblk*size_of_datatype)
    check_host_alloc_cuda("trans_ev_tridi_to_band: row_group_host", successCUDA)
    call c_f_pointer(row_group_host, row_group, (/l_nev,nblk/))
Wenzhe Yu's avatar
Wenzhe Yu committed
537

Andreas Marek's avatar
Andreas Marek committed
538
539
540
541
    row_group(:, :) = 0.0_rck
    num =  (l_nev*nblk)* size_of_datatype
    successCUDA = cuda_malloc(row_group_dev, num)
    check_alloc_cuda("trans_ev_tridi_to_band: row_group_dev", successCUDA)
542

Andreas Marek's avatar
Andreas Marek committed
543
544
    successCUDA = cuda_memset(row_group_dev , 0, num)
    check_memset_cuda("trans_ev_tridi_to_band: row_group_dev", successCUDA)
545

Andreas Marek's avatar
Andreas Marek committed
546
  else ! GPUs are not used
547
548
549
550
551
552

#if 0
! realcase or complexcase
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif

553
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
554
555
556
557
558
559
560
    if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads*     &
           C_SIZEOF(a_var)) /= 0) then
      print *,"trans_ev_tridi_to_band_&
      &MATH_DATATYPE&
      &: error when allocating aIntern"//errorMessage
      stop 1
    endif
561

Andreas Marek's avatar
Andreas Marek committed
562
563
    call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
    ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
564

Andreas Marek's avatar
Andreas Marek committed
565
    ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
566

567
#else /* WITH_OPENMP_TRADITIONAL */
568

Andreas Marek's avatar
Andreas Marek committed
569
570
571
572
573
    if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*  &
        C_SIZEOF(a_var)) /= 0) then
      print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
      stop 1
    endif
574

Andreas Marek's avatar
Andreas Marek committed
575
576
    call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
    !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
577

Andreas Marek's avatar
Andreas Marek committed
578
    aIntern(:,:,:) = 0.0_rck
579
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
580
  endif !useGPU
581

Andreas Marek's avatar
Andreas Marek committed
582
583
  allocate(row(l_nev), stat=istat, errmsg=errorMessage)
  check_allocate("trans_ev_tridi_to_band: row", istat, errorMessage)
584

Andreas Marek's avatar
Andreas Marek committed
585
  row(:) = 0.0_rck
586

Andreas Marek's avatar
Andreas Marek committed
587
588
  ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
  ! and transpose the matrix using stripes of given stripe_width for cache blocking.
589

Andreas Marek's avatar
Andreas Marek committed
590
591
  ! The peculiar way it is done below is due to the fact that the last row should be
  ! ready first since it is the first one to start below
592

593
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
594
595
596
597
598
599
600
601
602
603
604
605
  ! Please note about the OMP usage below:
  ! This is not for speed, but because we want the matrix a in the memory and
  ! in the cache of the correct thread (if possible)

  call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
  !$omp parallel do private(my_thread), schedule(static, 1)
  do my_thread = 1, max_threads
    aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
  enddo
  !$omp end parallel do

  call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
606
#endif /* WITH_OPENMP_TRADITIONAL */
607

Andreas Marek's avatar
Andreas Marek committed
608
609
610
611
612
613
  do ip = np_rows-1, 0, -1
    if (my_prow == ip) then
      ! Receive my rows which have not yet been received
      src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
      do i=limits(ip)+1,limits(ip+1)
        src = mod((i-1)/nblk, np_rows)
614

Andreas Marek's avatar
Andreas Marek committed
615
        if (src < my_prow) then
616
#ifdef WITH_OPENMP_TRADITIONAL
617
618

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
619
620
621
622
          if (wantDebug) call obj%timer%start("mpi_communication")
          call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                        int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
623
624
625

#else /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
626
!         row(1:l_nev) = row(1:l_nev)
627
628
629

#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
630
          call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
631

Andreas Marek's avatar
Andreas Marek committed
632
633
634
635
636
637
638
639
          !$omp parallel do private(my_thread), schedule(static, 1)
          do my_thread = 1, max_threads
            call unpack_row_&
                 &MATH_DATATYPE&
                 &_cpu_openmp_&
                 &PRECISION &
                              (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
                               thread_width, stripe_width, l_nev)
640

Andreas Marek's avatar
Andreas Marek committed
641
642
          enddo
          !$omp end parallel do
643

Andreas Marek's avatar
Andreas Marek committed
644
          call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
645

646
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
647
648
649
650
651
652
653
654
655
656
657
          if (useGPU) then
            ! An unpacking of the current row group may occur before queuing the next row
            call unpack_and_prepare_row_group_&
            &MATH_DATATYPE&
            &_gpu_&
            &PRECISION &
                          ( &
                          row_group, row_group_dev, aIntern_dev, stripe_count, &
                                      stripe_width, last_stripe_width, a_dim2, l_nev,&
                                      row_group_size, nblk, unpack_idx, &
                                       i - limits(ip), .false.)
658
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
659
660
661
662
            if (wantDebug) call obj%timer%start("mpi_communication")
            call MPI_Recv(row_group(:, row_group_size), int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                          int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
663
664

#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
665
            row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
666
667
#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
668
          else ! useGPU
669
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
670
671
672
673
            if (wantDebug) call obj%timer%start("mpi_communication")
            call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                         int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
            if (wantDebug) call obj%timer%stop("mpi_communication")
674
675
676

#else /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
677
!           row(1:l_nev) = row(1:l_nev)
678
679
680

#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
681
682
683
684
685
686
            call unpack_row_&
                &MATH_DATATYPE&
                &_cpu_&
                &PRECISION &
                (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
          endif ! useGPU
687
#endif /* WITH_OPENMP_TRADITIONAL */
688

Andreas Marek's avatar
Andreas Marek committed
689
        elseif (src == my_prow) then
690

Andreas Marek's avatar
Andreas Marek committed
691
          src_offset = src_offset+1
692

Andreas Marek's avatar
Andreas Marek committed
693
          if (useGPU) then
694
#ifndef WITH_OPENMP_TRADITIONAL
695

Andreas Marek's avatar
Andreas Marek committed
696
697
698
699
700
701
702
703
704
705
706
707
            ! An unpacking of the current row group may occur before queuing the next row
            call unpack_and_prepare_row_group_&
                 &MATH_DATATYPE&
                 &_gpu_&
                 &PRECISION &
             ( &
                          row_group, row_group_dev, aIntern_dev, stripe_count, &
                          stripe_width, last_stripe_width, a_dim2, l_nev,&
                          row_group_size, nblk, unpack_idx, &
                          i - limits(ip), .false.)

            row_group(:, row_group_size) = q(src_offset, 1:l_nev)
708
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
709

710
711
!#if COMPLEXCASE == 1
!! why is an cuda call in the openmp region?
Andreas Marek's avatar
Andreas Marek committed
712
!           call unpack_and_prepare_row_group_complex_gpu_&
713
714
715
716
717
718
!                     &PRECISION&
!                     &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
!                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
!                      unpack_idx, i - limits(ip),.false.)
!                      row_group(:, row_group_size) = q(src_offset, 1:l_nev)
!#endif
719
720

#endif /* not OpenMP */
Andreas Marek's avatar
Andreas Marek committed
721
722
723
          else
            row(:) = q(src_offset, 1:l_nev)
          endif
724

725
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
726
          call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
727

Andreas Marek's avatar
Andreas Marek committed
728
729
730
731
732
733
734
          !$omp parallel do private(my_thread), schedule(static, 1)
          do my_thread = 1, max_threads
            call unpack_row_&
                 &MATH_DATATYPE&
                 &_cpu_openmp_&
                 &PRECISION &
                               (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
735

Andreas Marek's avatar
Andreas Marek committed
736
737
          enddo
          !$omp end parallel do
738

Andreas Marek's avatar
Andreas Marek committed
739
          call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
740

741
#else /* WITH_OPENMP_TRADITIONAL */
742

Andreas Marek's avatar
Andreas Marek committed
743
          if (useGPU) then
744

Andreas Marek's avatar
Andreas Marek committed
745
746
747
748
749
750
751
          else
            call unpack_row_&
                 &MATH_DATATYPE&
                 &_cpu_&
                 &PRECISION &
                            (obj,aIntern, row,i-limits(ip),  stripe_count, stripe_width, last_stripe_width)
          endif
752

753
#endif /* WITH_OPENMP_TRADITIONAL */
754

Andreas Marek's avatar
Andreas Marek committed
755
756
        endif
      enddo
757

Andreas Marek's avatar
Andreas Marek committed
758
759
760
761
762
763
764
      ! Send all rows which have not yet been send
      src_offset = 0
      do dst = 0, ip-1
        do i=limits(dst)+1,limits(dst+1)
          if (mod((i-1)/nblk, np_rows) == my_prow) then
            src_offset = src_offset+1
            row(:) = q(src_offset, 1:l_nev)
765
766

#ifdef WITH_MPI
767
                if (wantDebug) call obj%timer%start("mpi_communication")
768
769
                call MPI_Send(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                              int(dst,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
770
                if (wantDebug) call obj%timer%stop("mpi_communication")
771
772
773
774
775
776
#endif /* WITH_MPI */
              endif
            enddo
          enddo

        else if (my_prow < ip) then
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
777

778
779
780
781
782
783
784
785
          ! Send all rows going to PE ip
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == my_prow) then
              src_offset = src_offset+1
              row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
786
              if (wantDebug) call obj%timer%start("mpi_communication")
787
788
              call MPI_Send(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                            int(ip,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
789
              if (wantDebug) call obj%timer%stop("mpi_communication")
790
791
792
793
794
795
796
797
#endif /* WITH_MPI */
            endif
          enddo

          ! Receive all rows from PE ip
          do i=limits(my_prow)+1,limits(my_prow+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == ip) then
798
#ifdef WITH_OPENMP_TRADITIONAL
799
800

#ifdef WITH_MPI
801
              if (wantDebug) call obj%timer%start("mpi_communication")
802
803
              call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                            int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
804
              if (wantDebug) call obj%timer%stop("mpi_communication")
805
806
807
808
809
810
#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

811
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
812
813
!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
814
815
816
817
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
818
                                 (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
819
820
              enddo
!$omp end parallel do
821
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
822

823
#else /* WITH_OPENMP_TRADITIONAL */
824
825
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
Andreas Marek's avatar
Andreas Marek committed
826
                call unpack_and_prepare_row_group_&
827
828
829
830
                     &MATH_DATATYPE&
                     &_gpu_&
                     &PRECISION&
                     &( &
Andreas Marek's avatar
Andreas Marek committed
831
832
833
834
                  row_group, row_group_dev, aIntern_dev, stripe_count,  &
                  stripe_width, last_stripe_width, a_dim2, l_nev,       &
                  row_group_size, nblk, unpack_idx,                     &
                  i - limits(my_prow), .false.)
835
836

#ifdef WITH_MPI
837
               if (wantDebug) call obj%timer%start("mpi_communication")
838
839
               call MPI_Recv(row_group(:, row_group_size), int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                             int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
840
               if (wantDebug) call obj%timer%stop("mpi_communication")
841
842
843
844
845
846
847
#else /* WITH_MPI */

               row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
848
                if (wantDebug) call obj%timer%start("mpi_communication")
849
850
                call MPI_Recv(row, int(l_nev,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                              int(src,kind=MPI_KIND), 0_MPI_KIND, int(mpi_comm_rows,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
851
                if (wantDebug) call obj%timer%stop("mpi_communication")
852
853
854
855
856
#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif
857
                call unpack_row_&
858
859
860
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
861
                                (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
862
863
              endif ! useGPU

864
#endif /* WITH_OPENMP_TRADITIONAL */
865
866
867
868
869
870
871
872

            endif
          enddo
        endif
      enddo

      if (useGPU) then
        ! Force an unpacking of all remaining rows that haven't been unpacked yet
Andreas Marek's avatar
Andreas Marek committed
873
        call unpack_and_prepare_row_group_&
874
875
876
877
             &MATH_DATATYPE&
             &_gpu_&
             &PRECISION&
             &( &
Andreas Marek's avatar
Andreas Marek committed
878
879
880
881
882
          row_group, row_group_dev, aIntern_dev, stripe_count, &
          stripe_width, last_stripe_width, &
          a_dim2, l_nev, row_group_size, nblk, unpack_idx,     &
          -1, .true.)

883
884
885
886
887
888
889
890
      endif

      ! Set up result buffer queue

      num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows

      num_result_buffers = 4*nfact
      allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
891
      check_allocate("trans_ev_tridi_to_band: result_buffer", istat, errorMessage)
892
893

      allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
894
      check_allocate("trans_ev_tridi_to_band: result_send_request", istat, errorMessage)
895
896

      allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
897
      check_allocate("trans_ev_tridi_to_band: result_recv_request", istat, errorMessage)
898
899
900
901
902
903
904
905

#ifdef WITH_MPI
      result_send_request(:) = MPI_REQUEST_NULL
      result_recv_request(:) = MPI_REQUEST_NULL
#endif

      ! Queue up buffers
#ifdef WITH_MPI
906
      if (wantDebug) call obj%timer%start("mpi_communication")
907
908
909

      if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
        do j = 1, min(num_result_buffers, num_result_blocks)
910
911
912
          call MPI_Irecv(result_buffer(1,1,j), int(l_nev*nblk,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL,     &
                         0_MPI_KIND, int(result_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND),          &
                         result_recv_request(j), mpierr)
913
914
        enddo
      endif
915
      if (wantDebug) call obj%timer%stop("mpi_communication")
916
917
918
919
920
921
922
923
924
925
926
927
#else /* WITH_MPI */

      ! carefull the "recv" has to be done at the corresponding wait or send
      ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)

#endif /* WITH_MPI */

      num_bufs_recvd = 0 ! No buffers received yet

      ! Initialize top/bottom requests

      allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
928
      check_allocate("trans_ev_tridi_to_band: top_send_request", istat, errorMessage)
929
930

      allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
931
      check_allocate("trans_ev_tridi_to_band: top_recv_request", istat, errorMessage)
932
933

      allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
934
      check_allocate("trans_ev_tridi_to_band: bottom_send_request", istat, errorMessage)
935
936

      allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
937
      check_allocate("trans_ev_tridi_to_band: bottom_recv_request", istat, errorMessage)
938
939
940
941
942
943
944
945

#ifdef WITH_MPI
      top_send_request(:) = MPI_REQUEST_NULL
      top_recv_request(:) = MPI_REQUEST_NULL
      bottom_send_request(:) = MPI_REQUEST_NULL
      bottom_recv_request(:) = MPI_REQUEST_NULL
#endif

946
#ifdef WITH_OPENMP_TRADITIONAL
947
      allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
948
      check_allocate("trans_ev_tridi_to_band: top_border_send_buffer", istat, errorMessage)
949
950

      allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
951
      check_allocate("trans_ev_tridi_to_band: top_border_recv_buffer", istat, errorMessage)
952
953

      allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
954
      check_allocate("trans_ev_tridi_to_band: bottom_border_send_buffer", istat, errorMessage)
955
956

      allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
957
      check_allocate("trans_ev_tridi_to_band: bottom_border_recv_buffer", istat, errorMessage)
958

959
960
961
962
      top_border_send_buffer(:,:) = 0.0_rck
      top_border_recv_buffer(:,:) = 0.0_rck
      bottom_border_send_buffer(:,:) = 0.0_rck
      bottom_border_recv_buffer(:,:) = 0.0_rck
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985

      if (useGPU) then
        successCUDA = cuda_host_register(int(loc(top_border_send_buffer),kind=c_intptr_t), &
                      stripe_width*nbw*max_threads * stripe_count * size_of_datatype,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("trans_ev_tridi_to_band: top_border_send_buffer", successCUDA)

        successCUDA = cuda_host_register(int(loc(top_border_recv_buffer),kind=c_intptr_t), &
                      stripe_width*nbw*max_threads * stripe_count * size_of_datatype,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("trans_ev_tridi_to_band: top_border_recv_buffer", successCUDA)

        successCUDA = cuda_host_register(int(loc(bottom_border_send_buffer),kind=c_intptr_t), &
                      stripe_width*nbw*max_threads * stripe_count * size_of_datatype,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("trans_ev_tridi_to_band: bottom_border_send_buffer", successCUDA)

        successCUDA = cuda_host_register(int(loc(bottom_border_recv_buffer),kind=c_intptr_t), &
                      stripe_width*nbw*max_threads * stripe_count * size_of_datatype,&
                      cudaHostRegisterDefault)
        check_host_register_cuda("trans_ev_tridi_to_band: bottom_border_recv_buffer", successCUDA)
      endif

986
987
      ! Initialize broadcast buffer

988
#else /* WITH_OPENMP_TRADITIONAL */
989

990
991
      allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)