elpa2_trans_ev_tridi_to_band_template.F90 89.2 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

53
#include "../general/sanity.F90"
Andreas Marek's avatar
Andreas Marek committed
54
55

  subroutine trans_ev_tridi_to_band_&
56
57
58
    &MATH_DATATYPE&
    &_&
    &PRECISION &
59
    (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols,         &
Andreas Marek's avatar
Andreas Marek committed
60
     hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, max_threads, success, &
61
     kernel)
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

    !-------------------------------------------------------------------------------
    !  trans_ev_tridi_to_band_real/complex:
    !  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
    !
    !  Parameters
    !
    !  na          Order of matrix a, number of rows of matrix q
    !
    !  nev         Number eigenvectors to compute (= columns of matrix q)
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nb          semi bandwith
    !
    !  q           On input: Eigenvectors of tridiagonal matrix
    !              On output: Transformed eigenvectors
    !              Distribution is like in Scalapack.
    !
    !  q_dev       GPU device pointer to q
    !
    !  ldq         Leading dimension of q
    !  matrixCols  local columns of matrix q
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns/both
    !
    !-------------------------------------------------------------------------------
91
      use elpa_abstract_impl
92
      use elpa2_workload
Andreas Marek's avatar
Andreas Marek committed
93
      use pack_unpack_cpu
Andreas Marek's avatar
Andreas Marek committed
94
      use pack_unpack_gpu
95
      use compute_hh_trafo
96
97
98
      use cuda_functions
      use precision
      use iso_c_binding
Andreas Marek's avatar
Andreas Marek committed
99
#ifdef WITH_OPENMP
100
      ! use omp_lib
Andreas Marek's avatar
Andreas Marek committed
101
#endif
102
      implicit none
103
#include "../general/precision_kinds.F90"
104
      class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
105
      logical, intent(in)                        :: useGPU
106

Andreas Marek's avatar
Andreas Marek committed
107
108
      integer(kind=ik), intent(in)               :: kernel
      integer(kind=ik), intent(in)               :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
109
110

#ifdef USE_ASSUMED_SIZE
111
      MATH_DATATYPE(kind=rck)                    :: q(ldq,*)
112
#else
113
      MATH_DATATYPE(kind=rck)                    :: q(ldq,matrixCols)
114
115
#endif

116
      MATH_DATATYPE(kind=rck), intent(in)        :: hh_trans(:,:)
Andreas Marek's avatar
Andreas Marek committed
117
      integer(kind=c_intptr_t)                   :: q_dev
Andreas Marek's avatar
Andreas Marek committed
118

Andreas Marek's avatar
Andreas Marek committed
119
120
121
122
123
124
      integer(kind=ik)                           :: np_rows, my_prow, np_cols, my_pcol
      integer(kind=ik)                           :: i, j, ip, sweep, nbuf, l_nev, a_dim2
      integer(kind=ik)                           :: current_n, current_local_n, current_n_start, current_n_end
      integer(kind=ik)                           :: next_n, next_local_n, next_n_start, next_n_end
      integer(kind=ik)                           :: bottom_msg_length, top_msg_length, next_top_msg_length
      integer(kind=ik)                           :: stripe_width, last_stripe_width, stripe_count
125
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
126
      integer(kind=ik)                           :: thread_width, csw, b_off, b_len
127
#endif
Andreas Marek's avatar
Andreas Marek committed
128
129
130
      integer(kind=ik)                           :: num_result_blocks, num_result_buffers, num_bufs_recvd
      integer(kind=ik)                           :: a_off, current_tv_off, max_blk_size
      integer(kind=ik)                           :: mpierr, src, src_offset, dst, offset, nfact, num_blk
131

Andreas Marek's avatar
Andreas Marek committed
132
      logical                                    :: flag
133
#ifdef WITH_OPENMP
134
      MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:,:)
135
#else
136
      MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:)
137
#endif
138
      MATH_DATATYPE(kind=rck)                    :: a_var
139

Andreas Marek's avatar
Andreas Marek committed
140
      type(c_ptr)                                :: aIntern_ptr
141

142
143
      MATH_DATATYPE(kind=rck)   , allocatable    :: row(:)
      MATH_DATATYPE(kind=rck)   , allocatable    :: row_group(:,:)
144
145

#ifdef WITH_OPENMP
146
147
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
148
#else
149
150
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
151
152
#endif

153
154
      integer(kind=c_intptr_t)                   :: aIntern_dev
      integer(kind=c_intptr_t)                   :: bcast_buffer_dev
155
      integer(kind=c_intptr_t)                   :: num
Andreas Marek's avatar
Andreas Marek committed
156
      integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1
157
158
159
160
161
      integer(kind=c_intptr_t)                   :: row_dev
      integer(kind=c_intptr_t)                   :: row_group_dev
      integer(kind=c_intptr_t)                   :: hh_tau_dev
      integer(kind=c_intptr_t)                   :: hh_dot_dev
      integer(kind=ik)                           :: row_group_size, unpack_idx
162

163
164
      integer(kind=ik)                           :: n_times
      integer(kind=ik)                           :: top, chunk, this_chunk
165

166
167
      MATH_DATATYPE(kind=rck), allocatable       :: result_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bcast_buffer(:,:)
168

169
      integer(kind=ik)                           :: n_off
170

171
172
173
      integer(kind=ik), allocatable              :: result_send_request(:), result_recv_request(:), limits(:)
      integer(kind=ik), allocatable              :: top_send_request(:), bottom_send_request(:)
      integer(kind=ik), allocatable              :: top_recv_request(:), bottom_recv_request(:)
174
175
176

      ! MPI send/recv tags, arbitrary

177
178
179
      integer(kind=ik), parameter                :: bottom_recv_tag = 111
      integer(kind=ik), parameter                :: top_recv_tag    = 222
      integer(kind=ik), parameter                :: result_recv_tag = 333
Andreas Marek's avatar
Andreas Marek committed
180
181
182

      integer(kind=ik), intent(in)               :: max_threads
 
183
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
184
      integer(kind=ik)                           :: my_thread
185
186
187
188
#endif


      ! Just for measuring the kernel performance
189
      real(kind=c_double)                        :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
190
      ! long integer
191
      integer(kind=lik)                          :: kernel_flops, kernel_flops_recv
192

193
194
195
196
      logical, intent(in)                        :: wantDebug
      logical                                    :: success
      integer(kind=ik)                           :: istat, print_flops
      character(200)                             :: errorMessage
197
      character(20)                              :: gpuString
198
      logical                                    :: successCUDA
199
#ifndef WITH_MPI
200
      integer(kind=ik)                           :: j1
201
#endif
Andreas Marek's avatar
Andreas Marek committed
202
      integer(kind=ik)                           :: error
203
204
205
206
      integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                     &PRECISION&
                                                                     &_&
                                                                     &MATH_DATATYPE
207

208
209
210
211
212
213
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

214
      call obj%timer%start("trans_ev_tridi_to_band_&
215
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
216
      &" // &
217
218
      &PRECISION_SUFFIX //&
      gpuString)
219

220
      n_times = 0
221
222
223
224
225
226
227
228
229
      if (useGPU) then
        unpack_idx = 0
        row_group_size = 0
      endif

      success = .true.
      kernel_time = 0.0
      kernel_flops = 0

230
      if (wantDebug) call obj%timer%start("mpi_communication")
231
232
233
234
      call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
      call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
      call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
      call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
235
      if (wantDebug) call obj%timer%stop("mpi_communication")
236
237
238
239
240

      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
241
242
                                &MATH_DATATYPE&
                                &: ERROR: nbw=',nbw,', nblk=',nblk
243
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
244
245
                                &MATH_DATATYPE&
                                &: band backtransform works only for nbw==n*nblk'
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
          endif
          success = .false.
          return
        endif
      endif

      nfact = nbw / nblk


      ! local number of eigenvectors
      l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)

      if (l_nev==0) then
#ifdef WITH_OPENMP
        thread_width = 0
#endif
        stripe_width = 0
        stripe_count = 0
        last_stripe_width = 0

      else ! l_nev

#if WITH_OPENMP
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
271
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
272
273
274
275
276
277
278

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1
        else ! useGPU
          ! openmp only in non-GPU case
          thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
279

280
#if REALCASE == 1
281
          call obj%get("stripewidth_real",stripe_width, error)
282

283
#ifdef DOUBLE_PRECISION_REAL
284
          !stripe_width = 48 ! Must be a multiple of 4
285
#else
286
287
          stripe_width = stripe_width * 2
          !stripe_width = 96 ! Must be a multiple of 8
288
289
290
291
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
292
          call obj%get("stripewidth_complex",stripe_width, error)
293

294
#ifdef DOUBLE_PRECISION_COMPLEX
295
          !stripe_width = 48 ! Must be a multiple of 2
296
#else
297
298
          stripe_width = stripe_width * 2
          !stripe_width = 48 ! Must be a multiple of 4
299
300
301
302
303
304
305
306
307
308
309
#endif
#endif /* COMPLEXCASE */

          stripe_count = (thread_width-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (thread_width-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
310
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
311
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
312
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
313
314
315
316

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
317
    else
318
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
319
                                            ! (4 * sizeof(double) == 32)
320
          endif
321
#else
322
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
323
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
324
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
325
326
327


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
328
                                               ! (16 * sizeof(float) == 64)
329

Andreas Marek's avatar
Retab  
Andreas Marek committed
330
    else
331
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
332
                                            ! (8 * sizeof(float) == 32)
333
          endif
334
335
336
337
338
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
339
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
340
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
341

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
342
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
343
                                            ! (4 * sizeof(double complex) == 64)
344

Andreas Marek's avatar
Retab  
Andreas Marek committed
345
    else
346
347

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
348
349
                                            ! (2 * sizeof(double complex) == 32)
    endif
350
#else
351

352
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
353
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
354
355

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
356
                                            ! (8 * sizeof(float complex) == 64)
357
358
359

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
360
361
                                            ! (4 * sizeof(float complex) == 32)
    endif
362
363
364
#endif
#endif /* COMPLEXCASE */

365
#if REALCASE == 1
366
        last_stripe_width = l_nev - (stripe_count-1)*stripe_width
367
368
369
370
371
#endif
#if COMPLEXCASE == 1
! only needed in no OMP case check thsis
! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif
372
373

        endif ! useGPU
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
374

375
#else /* WITH_OPENMP */
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
376

377
378
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
379
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
380
381
382
383
384
385
386

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1

        else ! useGPU
#if REALCASE == 1
387
388
          call obj%get("stripewidth_real",stripe_width, error)

389
#ifdef DOUBLE_PRECISION_REAL
390
          !stripe_width = 48 ! Must be a multiple of 4
391
#else
392
393
          !stripe_width = 96 ! Must be a multiple of 8
          stripe_width = 2 * stripe_width
394
395
396
397
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
398
399
          call obj%get("stripewidth_complex",stripe_width, error)

400
#ifdef DOUBLE_PRECISION_COMPLEX
401
          !stripe_width = 48 ! Must be a multiple of 2
402
#else
403
          !stripe_width = 48 ! Must be a multiple of 4
404
405
406
407
408
409
410
411
412
413
414
#endif
#endif /* COMPLEXCASE */

          stripe_count = (l_nev-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (l_nev-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
415
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
416
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
417
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
418
419
420
421

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
422
    else
423
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
424
                                            ! (4 * sizeof(double) == 32)
425
          endif
426
#else
427
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
428
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
429
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
430
431
432


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
433
                                               ! (16 * sizeof(float) == 64)
434

Andreas Marek's avatar
Retab  
Andreas Marek committed
435
    else
436
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
437
                                            ! (8 * sizeof(float) == 32)
438
          endif
439
440
441
442
443
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
444

445
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
446
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
447

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
448
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
449
                                            ! (4 * sizeof(double complex) == 64)
450

Andreas Marek's avatar
Retab  
Andreas Marek committed
451
    else
452
453

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
454
455
                                            ! (2 * sizeof(double complex) == 32)
    endif
456
#else
457

458
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
459
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
460

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
461
            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
462
                                            ! (8 * sizeof(float complex) == 64)
463
464
465

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
466
467
                                            ! (4 * sizeof(float complex) == 32)
    endif
468
469
470
471
472
473
474
475
476
477
478
479
480
481
#endif
#endif /* COMPLEXCASE */
        endif ! useGPU

        last_stripe_width = l_nev - (stripe_count-1)*stripe_width

#endif /* WITH_OPENMP */
      endif ! l_nev

      ! Determine the matrix distribution at the beginning

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
482
483
                &MATH_DATATYPE&
                &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
484
        stop 1
485
      endif
486
      call determine_workload(obj,na, nbw, np_rows, limits)
487
488
489
490
491
492

      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      a_dim2 = max_blk_size + nbw

      if (useGPU) then
493
494
        num =  (stripe_width*a_dim2*stripe_count)* size_of_datatype
        successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
495
496
497
498
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
499
          stop 1
500
501
502
503
504
505
506
        endif

        successCUDA = cuda_memset(aIntern_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
507
          stop 1
508
509
        endif

510
        num =  (l_nev)* size_of_datatype
511
512
513
514
515
        successCUDA = cuda_malloc( row_dev,num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc "
Andreas Marek's avatar
Andreas Marek committed
516
          stop 1
517
518
519
520
521
522
523
        endif

        successCUDA = cuda_memset(row_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset "
Andreas Marek's avatar
Andreas Marek committed
524
          stop 1
525
526
527
528
529
530
531
532
        endif

        ! "row_group" and "row_group_dev" are needed for GPU optimizations
        allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error when allocating row_group"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
533
          stop 1
534
535
        endif

536
        row_group(:, :) = 0.0_rck
537
        num =  (l_nev*nblk)* size_of_datatype
538
539
540
541
542
        successCUDA = cuda_malloc(row_group_dev, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
543
          stop 1
544
545
546
547
548
549
550
        endif

        successCUDA = cuda_memset(row_group_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
551
          stop 1
552
553
554
555
556
557
558
559
560
561
        endif

      else ! GPUs are not used

#if 0
! realcase or complexcase
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif

#ifdef WITH_OPENMP
562
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads*     &
563
               C_SIZEOF(a_var)) /= 0) then
564
565
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
566
          &: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
567
          stop 1
568
569
570
571
572
573
        endif

        call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
        ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)

        ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
574
575
576

#else /* WITH_OPENMP */

577
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*  &
578
            C_SIZEOF(a_var)) /= 0) then
579
          print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
580
          stop 1
581
582
583
584
585
        endif

        call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
        !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)

586
        aIntern(:,:,:) = 0.0_rck
587
588
589
590
591
592
593
594
#endif /* WITH_OPENMP */
      endif !useGPU

      allocate(row(l_nev), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating row"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
595
        stop 1
596
597
      endif

598
      row(:) = 0.0_rck
599
600
601
602
603
604
605
606
607
608
609
610

      ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
      ! and transpose the matrix using stripes of given stripe_width for cache blocking.

      ! The peculiar way it is done below is due to the fact that the last row should be
      ! ready first since it is the first one to start below

#ifdef WITH_OPENMP
      ! Please note about the OMP usage below:
      ! This is not for speed, but because we want the matrix a in the memory and
      ! in the cache of the correct thread (if possible)

611
      call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
612
613
      !$omp parallel do private(my_thread), schedule(static, 1)
      do my_thread = 1, max_threads
614
        aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
615
616
617
      enddo
      !$omp end parallel do

618
      call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
619
620
621
622
623
624
625
626
627
628
629
630
631
#endif /* WITH_OPENMP */

      do ip = np_rows-1, 0, -1
        if (my_prow == ip) then
          ! Receive my rows which have not yet been received
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)

            if (src < my_prow) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
632
              if (wantDebug) call obj%timer%start("mpi_communication")
633
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
634
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
635
              if (wantDebug) call obj%timer%stop("mpi_communication")
636
637
638
639
640
641
642

#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

643
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
644
645
646

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
647
648
649
650
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
651
                                  (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
652
653
                                   thread_width, stripe_width, l_nev)

654
655
656
              enddo
!$omp end parallel do

657
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
658
659
660
661

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
662
663
664
665
                call unpack_and_prepare_row_group_&
                &MATH_DATATYPE&
                &_gpu_&
                &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
666
667
                              ( &
                              row_group, row_group_dev, aIntern_dev, stripe_count, &
Andreas Marek's avatar
Andreas Marek committed
668
669
670
                                          stripe_width, last_stripe_width, a_dim2, l_nev,&
                                          row_group_size, nblk, unpack_idx, &
                                           i - limits(ip), .false.)
671
#ifdef WITH_MPI
672
                if (wantDebug) call obj%timer%start("mpi_communication")
673
                call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
674
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
675
                if (wantDebug) call obj%timer%stop("mpi_communication")
676
677
678
679
680
681
682

#else /* WITH_MPI */
                row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
683
                if (wantDebug) call obj%timer%start("mpi_communication")
684
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
685
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
686
                if (wantDebug) call obj%timer%stop("mpi_communication")
687
688
689
690
691
692
693

#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

694
                call unpack_row_&
695
696
697
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
698
                                (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
699
700
701
702
703
704
705
706
707
708
              endif ! useGPU
#endif /* WITH_OPENMP */

            elseif (src == my_prow) then

              src_offset = src_offset+1

              if (useGPU) then
#ifndef WITH_OPENMP

Andreas Marek's avatar
Andreas Marek committed
709
710
                 ! An unpacking of the current row group may occur before queuing the next row
                 call unpack_and_prepare_row_group_&
711
712
713
                      &MATH_DATATYPE&
                      &_gpu_&
                      &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
714
                  ( &
Andreas Marek's avatar
Andreas Marek committed
715
716
717
718
719
                               row_group, row_group_dev, aIntern_dev, stripe_count, &
                               stripe_width, last_stripe_width, a_dim2, l_nev,&
                               row_group_size, nblk, unpack_idx, &
                               i - limits(ip), .false.)

720
                row_group(:, row_group_size) = q(src_offset, 1:l_nev)
721
#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
722

723
724
725
726
727
728
729
730
731
!#if COMPLEXCASE == 1
!! why is an cuda call in the openmp region?
!                call unpack_and_prepare_row_group_complex_gpu_&
!                     &PRECISION&
!                     &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
!                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
!                      unpack_idx, i - limits(ip),.false.)
!                      row_group(:, row_group_size) = q(src_offset, 1:l_nev)
!#endif
732
733
734
735
736
737
738

#endif /* not OpenMP */
              else
                row(:) = q(src_offset, 1:l_nev)
              endif

#ifdef WITH_OPENMP
739
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
740
741
742

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
743
744
745
746
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
747
                                   (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
748

749
750
751
              enddo
!$omp end parallel do

752
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
753
754
755
756
757
758

#else /* WITH_OPENMP */

              if (useGPU) then

              else
759
760
761
762
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
763
                                (obj,aIntern, row,i-limits(ip),  stripe_count, stripe_width, last_stripe_width)
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
              endif

#endif /* WITH_OPENMP */

            endif
          enddo

          ! Send all rows which have not yet been send
          src_offset = 0
          do dst = 0, ip-1
            do i=limits(dst)+1,limits(dst+1)
              if (mod((i-1)/nblk, np_rows) == my_prow) then
                src_offset = src_offset+1
                row(:) = q(src_offset, 1:l_nev)

#ifdef WITH_MPI
780
                if (wantDebug) call obj%timer%start("mpi_communication")
781
                call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
782
                              dst, 0, mpi_comm_rows, mpierr)
783
                if (wantDebug) call obj%timer%stop("mpi_communication")
784
785
786
787
788
789
#endif /* WITH_MPI */
              endif
            enddo
          enddo

        else if (my_prow < ip) then
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
790

791
792
793
794
795
796
797
798
          ! Send all rows going to PE ip
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == my_prow) then
              src_offset = src_offset+1
              row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
799
              if (wantDebug) call obj%timer%start("mpi_communication")
800
              call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
801
                            ip, 0, mpi_comm_rows, mpierr)
802
              if (wantDebug) call obj%timer%stop("mpi_communication")
803
804
805
806
807
808
809
810
811
812
813
#endif /* WITH_MPI */
            endif
          enddo

          ! Receive all rows from PE ip
          do i=limits(my_prow)+1,limits(my_prow+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == ip) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
814
              if (wantDebug) call obj%timer%start("mpi_communication")
815
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
816
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
817
              if (wantDebug) call obj%timer%stop("mpi_communication")
818
819
820
821
822
823
#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

824
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
825
826
!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
827
828
829
830
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
831
                                 (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
832
833
              enddo
!$omp end parallel do
834
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
835
836
837
838

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
Andreas Marek's avatar
Andreas Marek committed
839
                call unpack_and_prepare_row_group_&
840
841
842
843
                     &MATH_DATATYPE&
                     &_gpu_&
                     &PRECISION&
                     &( &
Andreas Marek's avatar
Andreas Marek committed
844
845
846
847
                  row_group, row_group_dev, aIntern_dev, stripe_count,  &
                  stripe_width, last_stripe_width, a_dim2, l_nev,       &
                  row_group_size, nblk, unpack_idx,                     &
                  i - limits(my_prow), .false.)
848
849

#ifdef WITH_MPI
850
               if (wantDebug) call obj%timer%start("mpi_communication")
851
               call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
852
                             src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
853
               if (wantDebug) call obj%timer%stop("mpi_communication")
854
855
856
857
858
859
860
#else /* WITH_MPI */

               row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
861
                if (wantDebug) call obj%timer%start("mpi_communication")
862
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
863
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
864
                if (wantDebug) call obj%timer%stop("mpi_communication")
865
866
867
868
869
#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif
870
                call unpack_row_&
871
872
873
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
874
                                (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
875
876
877
878
879
880
881
882
883
884
885
              endif ! useGPU

#endif /* WITH_OPENMP */

            endif
          enddo
        endif
      enddo

      if (useGPU) then
        ! Force an unpacking of all remaining rows that haven't been unpacked yet
Andreas Marek's avatar
Andreas Marek committed
886
        call unpack_and_prepare_row_group_&
887
888
889
890
             &MATH_DATATYPE&
             &_gpu_&
             &PRECISION&
             &( &
Andreas Marek's avatar
Andreas Marek committed
891
892
893
894
895
          row_group, row_group_dev, aIntern_dev, stripe_count, &
          stripe_width, last_stripe_width, &
          a_dim2, l_nev, row_group_size, nblk, unpack_idx,     &
          -1, .true.)

896
897
898
899
900
901
        successCUDA = cuda_devicesynchronize()

         if (.not.(successCUDA)) then
           print *,"trans_ev_tridi_to_band_&
           &MATH_DATATYPE&
           &: error in cudaDeviceSynchronize"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
902
           stop 1
903
904
905
906
907
908
909
910
911
912
913
914
915
         endif
      endif

      ! Set up result buffer queue

      num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows

      num_result_buffers = 4*nfact
      allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
916
        stop 1
917
918
919
920
921
922
923
      endif

      allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
924
        stop 1
925
926
927
928
929
930
931
      endif

      allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
932
        stop 1
933
934
935
936
937
938
939
940
941
      endif

#ifdef WITH_MPI
      result_send_request(:) = MPI_REQUEST_NULL
      result_recv_request(:) = MPI_REQUEST_NULL
#endif

      ! Queue up buffers
#ifdef WITH_MPI
942
      if (wantDebug) call obj%timer%start("mpi_communication")
943
944
945

      if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
        do j = 1, min(num_result_buffers, num_result_blocks)
946
          call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL,     &
947
948
949
                         0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr)
        enddo
      endif
950
      if (wantDebug) call obj%timer%stop("mpi_communication")
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
#else /* WITH_MPI */

      ! carefull the "recv" has to be done at the corresponding wait or send
      ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)

#endif /* WITH_MPI */

      num_bufs_recvd = 0 ! No buffers received yet

      ! Initialize top/bottom requests

      allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MPI_DATATYPE&
         &: error when allocating top_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
967
         stop 1
968
969
970
971
972
973
974
       endif

      allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
975
         stop 1
976
977
978
979
980
981
982
       endif

      allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
983
         stop 1
984
985
986
987
988
989
990
       endif

      allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0