elpa2_trans_ev_tridi_to_band_template.F90 89.7 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

53
#include "../general/sanity.F90"
Andreas Marek's avatar
Andreas Marek committed
54 55

  subroutine trans_ev_tridi_to_band_&
56 57 58
    &MATH_DATATYPE&
    &_&
    &PRECISION &
59
    (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols,         &
Andreas Marek's avatar
Andreas Marek committed
60
     hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, max_threads, success, &
61
     kernel)
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90

    !-------------------------------------------------------------------------------
    !  trans_ev_tridi_to_band_real/complex:
    !  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
    !
    !  Parameters
    !
    !  na          Order of matrix a, number of rows of matrix q
    !
    !  nev         Number eigenvectors to compute (= columns of matrix q)
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nb          semi bandwith
    !
    !  q           On input: Eigenvectors of tridiagonal matrix
    !              On output: Transformed eigenvectors
    !              Distribution is like in Scalapack.
    !
    !  q_dev       GPU device pointer to q
    !
    !  ldq         Leading dimension of q
    !  matrixCols  local columns of matrix q
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns/both
    !
    !-------------------------------------------------------------------------------
91
      use elpa_abstract_impl
92
      use elpa2_workload
Andreas Marek's avatar
Andreas Marek committed
93
      use pack_unpack_cpu
Andreas Marek's avatar
Andreas Marek committed
94
      use pack_unpack_gpu
95
      use compute_hh_trafo
96 97 98
      use cuda_functions
      use precision
      use iso_c_binding
Andreas Marek's avatar
Andreas Marek committed
99
#ifdef WITH_OPENMP
100
      ! use omp_lib
Andreas Marek's avatar
Andreas Marek committed
101
#endif
102
      implicit none
103
#include "../general/precision_kinds.F90"
104
      class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
105
      logical, intent(in)                        :: useGPU
106

Andreas Marek's avatar
Andreas Marek committed
107 108
      integer(kind=ik), intent(in)               :: kernel
      integer(kind=ik), intent(in)               :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
109 110

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
111
      MATH_DATATYPE(kind=rck)                    :: q(ldq,*)
112
#else
Andreas Marek's avatar
Andreas Marek committed
113
      MATH_DATATYPE(kind=rck)                    :: q(ldq,matrixCols)
114 115
#endif

116
      MATH_DATATYPE(kind=rck), intent(in)        :: hh_trans(:,:)
Andreas Marek's avatar
Andreas Marek committed
117
      integer(kind=c_intptr_t)                   :: q_dev
Andreas Marek's avatar
Andreas Marek committed
118

Andreas Marek's avatar
Andreas Marek committed
119 120 121 122 123 124
      integer(kind=ik)                           :: np_rows, my_prow, np_cols, my_pcol
      integer(kind=ik)                           :: i, j, ip, sweep, nbuf, l_nev, a_dim2
      integer(kind=ik)                           :: current_n, current_local_n, current_n_start, current_n_end
      integer(kind=ik)                           :: next_n, next_local_n, next_n_start, next_n_end
      integer(kind=ik)                           :: bottom_msg_length, top_msg_length, next_top_msg_length
      integer(kind=ik)                           :: stripe_width, last_stripe_width, stripe_count
125
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
126
      integer(kind=ik)                           :: thread_width, csw, b_off, b_len
127
#endif
Andreas Marek's avatar
Andreas Marek committed
128 129 130
      integer(kind=ik)                           :: num_result_blocks, num_result_buffers, num_bufs_recvd
      integer(kind=ik)                           :: a_off, current_tv_off, max_blk_size
      integer(kind=ik)                           :: mpierr, src, src_offset, dst, offset, nfact, num_blk
131

Andreas Marek's avatar
Andreas Marek committed
132
      logical                                    :: flag
133
#ifdef WITH_OPENMP
134
      MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:,:)
135
#else
136
      MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:)
137
#endif
138
      MATH_DATATYPE(kind=rck)                    :: a_var
139

Andreas Marek's avatar
Andreas Marek committed
140
      type(c_ptr)                                :: aIntern_ptr
141

142 143
      MATH_DATATYPE(kind=rck)   , allocatable    :: row(:)
      MATH_DATATYPE(kind=rck)   , allocatable    :: row_group(:,:)
144 145

#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
146 147 148 149
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_recv_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_recv_buffer(:,:)
150
#else
Andreas Marek's avatar
Andreas Marek committed
151 152 153 154
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: top_border_recv_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_recv_buffer(:,:,:)
155 156
#endif

157 158
      integer(kind=c_intptr_t)                   :: aIntern_dev
      integer(kind=c_intptr_t)                   :: bcast_buffer_dev
159
      integer(kind=c_intptr_t)                   :: num
Andreas Marek's avatar
Andreas Marek committed
160
      integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1
161 162 163 164 165
      integer(kind=c_intptr_t)                   :: row_dev
      integer(kind=c_intptr_t)                   :: row_group_dev
      integer(kind=c_intptr_t)                   :: hh_tau_dev
      integer(kind=c_intptr_t)                   :: hh_dot_dev
      integer(kind=ik)                           :: row_group_size, unpack_idx
166

167 168
      integer(kind=ik)                           :: n_times
      integer(kind=ik)                           :: top, chunk, this_chunk
169

170
      MATH_DATATYPE(kind=rck), allocatable       :: result_buffer(:,:,:)
Andreas Marek's avatar
Andreas Marek committed
171
      MATH_DATATYPE(kind=rck), allocatable       :: bcast_buffer(:,:)
172

173
      integer(kind=ik)                           :: n_off
174

175 176 177
      integer(kind=ik), allocatable              :: result_send_request(:), result_recv_request(:), limits(:)
      integer(kind=ik), allocatable              :: top_send_request(:), bottom_send_request(:)
      integer(kind=ik), allocatable              :: top_recv_request(:), bottom_recv_request(:)
178 179 180

      ! MPI send/recv tags, arbitrary

181 182 183
      integer(kind=ik), parameter                :: bottom_recv_tag = 111
      integer(kind=ik), parameter                :: top_recv_tag    = 222
      integer(kind=ik), parameter                :: result_recv_tag = 333
Andreas Marek's avatar
Andreas Marek committed
184 185 186

      integer(kind=ik), intent(in)               :: max_threads
 
187
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
188
      integer(kind=ik)                           :: my_thread
189 190 191 192
#endif


      ! Just for measuring the kernel performance
193
      real(kind=c_double)                        :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
194
      ! long integer
195
      integer(kind=lik)                          :: kernel_flops, kernel_flops_recv
196

197 198 199 200
      logical, intent(in)                        :: wantDebug
      logical                                    :: success
      integer(kind=ik)                           :: istat, print_flops
      character(200)                             :: errorMessage
201
      character(20)                              :: gpuString
202
      logical                                    :: successCUDA
203
#ifndef WITH_MPI
204
      integer(kind=ik)                           :: j1
205
#endif
Andreas Marek's avatar
Andreas Marek committed
206
      integer(kind=ik)                           :: error
207 208 209 210
      integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                     &PRECISION&
                                                                     &_&
                                                                     &MATH_DATATYPE
211

212 213 214 215 216 217
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif

218
      call obj%timer%start("trans_ev_tridi_to_band_&
219
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
220
      &" // &
221 222
      &PRECISION_SUFFIX //&
      gpuString)
223

224
      n_times = 0
225 226 227 228 229 230 231 232 233
      if (useGPU) then
        unpack_idx = 0
        row_group_size = 0
      endif

      success = .true.
      kernel_time = 0.0
      kernel_flops = 0

234
      if (wantDebug) call obj%timer%start("mpi_communication")
235 236 237 238
      call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
      call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
      call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
      call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
239
      if (wantDebug) call obj%timer%stop("mpi_communication")
240 241 242 243 244

      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
245 246
                                &MATH_DATATYPE&
                                &: ERROR: nbw=',nbw,', nblk=',nblk
247
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
248 249
                                &MATH_DATATYPE&
                                &: band backtransform works only for nbw==n*nblk'
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
          endif
          success = .false.
          return
        endif
      endif

      nfact = nbw / nblk


      ! local number of eigenvectors
      l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)

      if (l_nev==0) then
#ifdef WITH_OPENMP
        thread_width = 0
#endif
        stripe_width = 0
        stripe_count = 0
        last_stripe_width = 0

      else ! l_nev

#if WITH_OPENMP
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
275
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
276 277 278 279 280 281 282

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1
        else ! useGPU
          ! openmp only in non-GPU case
          thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
283

284
#if REALCASE == 1
285
          call obj%get("stripewidth_real",stripe_width, error)
286

287
#ifdef DOUBLE_PRECISION_REAL
288
          !stripe_width = 48 ! Must be a multiple of 4
289
#else
290 291
          stripe_width = stripe_width * 2
          !stripe_width = 96 ! Must be a multiple of 8
292 293 294 295
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
296
          call obj%get("stripewidth_complex",stripe_width, error)
297

298
#ifdef DOUBLE_PRECISION_COMPLEX
299
          !stripe_width = 48 ! Must be a multiple of 2
300
#else
301 302
          stripe_width = stripe_width * 2
          !stripe_width = 48 ! Must be a multiple of 4
303 304 305 306 307 308 309 310 311 312 313
#endif
#endif /* COMPLEXCASE */

          stripe_count = (thread_width-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (thread_width-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
314
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
315
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
316
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
317 318 319 320

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
321
    else
322
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
323
                                            ! (4 * sizeof(double) == 32)
324
          endif
325
#else
326
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
327
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
328
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
329 330 331


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
332
                                               ! (16 * sizeof(float) == 64)
333

Andreas Marek's avatar
Retab  
Andreas Marek committed
334
    else
335
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
336
                                            ! (8 * sizeof(float) == 32)
337
          endif
338 339 340 341 342
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
343
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
344
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
345

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
346
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
347
                                            ! (4 * sizeof(double complex) == 64)
348

Andreas Marek's avatar
Retab  
Andreas Marek committed
349
    else
350 351

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
352 353
                                            ! (2 * sizeof(double complex) == 32)
    endif
354
#else
355

356
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
357
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
358 359

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
360
                                            ! (8 * sizeof(float complex) == 64)
361 362 363

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
364 365
                                            ! (4 * sizeof(float complex) == 32)
    endif
366 367 368
#endif
#endif /* COMPLEXCASE */

369
#if REALCASE == 1
370
        last_stripe_width = l_nev - (stripe_count-1)*stripe_width
371 372 373 374 375
#endif
#if COMPLEXCASE == 1
! only needed in no OMP case check thsis
! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif
376 377

        endif ! useGPU
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
378

379
#else /* WITH_OPENMP */
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
380

381 382
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
383
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
384 385 386 387 388 389 390

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1

        else ! useGPU
#if REALCASE == 1
391 392
          call obj%get("stripewidth_real",stripe_width, error)

393
#ifdef DOUBLE_PRECISION_REAL
394
          !stripe_width = 48 ! Must be a multiple of 4
395
#else
396 397
          !stripe_width = 96 ! Must be a multiple of 8
          stripe_width = 2 * stripe_width
398 399 400 401
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
402 403
          call obj%get("stripewidth_complex",stripe_width, error)

404
#ifdef DOUBLE_PRECISION_COMPLEX
405
          !stripe_width = 48 ! Must be a multiple of 2
406
#else
407
          !stripe_width = 48 ! Must be a multiple of 4
408 409 410 411 412 413 414 415 416 417 418
#endif
#endif /* COMPLEXCASE */

          stripe_count = (l_nev-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (l_nev-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
419
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
420
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
421
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
422 423 424 425

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
426
    else
427
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
428
                                            ! (4 * sizeof(double) == 32)
429
          endif
430
#else
431
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
432
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
433
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
434 435 436


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
437
                                               ! (16 * sizeof(float) == 64)
438

Andreas Marek's avatar
Retab  
Andreas Marek committed
439
    else
440
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
441
                                            ! (8 * sizeof(float) == 32)
442
          endif
443 444 445 446 447
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
448

449
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
450
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
451

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
452
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
453
                                            ! (4 * sizeof(double complex) == 64)
454

Andreas Marek's avatar
Retab  
Andreas Marek committed
455
    else
456 457

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
458 459
                                            ! (2 * sizeof(double complex) == 32)
    endif
460
#else
461

462
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
463
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
464

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
465
            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
466
                                            ! (8 * sizeof(float complex) == 64)
467 468 469

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
470 471
                                            ! (4 * sizeof(float complex) == 32)
    endif
472 473 474 475 476 477 478 479 480 481 482 483 484 485
#endif
#endif /* COMPLEXCASE */
        endif ! useGPU

        last_stripe_width = l_nev - (stripe_count-1)*stripe_width

#endif /* WITH_OPENMP */
      endif ! l_nev

      ! Determine the matrix distribution at the beginning

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
486 487
                &MATH_DATATYPE&
                &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
488
        stop 1
489
      endif
490
      call determine_workload(obj,na, nbw, np_rows, limits)
491 492 493 494 495 496

      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      a_dim2 = max_blk_size + nbw

      if (useGPU) then
497 498
        num =  (stripe_width*a_dim2*stripe_count)* size_of_datatype
        successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
499 500 501 502
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
503
          stop 1
504 505 506 507 508 509 510
        endif

        successCUDA = cuda_memset(aIntern_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
511
          stop 1
512 513
        endif

514
        num =  (l_nev)* size_of_datatype
515 516 517 518 519
        successCUDA = cuda_malloc( row_dev,num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc "
Andreas Marek's avatar
Andreas Marek committed
520
          stop 1
521 522 523 524 525 526 527
        endif

        successCUDA = cuda_memset(row_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset "
Andreas Marek's avatar
Andreas Marek committed
528
          stop 1
529 530 531 532 533 534 535 536
        endif

        ! "row_group" and "row_group_dev" are needed for GPU optimizations
        allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error when allocating row_group"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
537
          stop 1
538 539
        endif

540
        row_group(:, :) = 0.0_rck
541
        num =  (l_nev*nblk)* size_of_datatype
542 543 544 545 546
        successCUDA = cuda_malloc(row_group_dev, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
547
          stop 1
548 549 550 551 552 553 554
        endif

        successCUDA = cuda_memset(row_group_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
555
          stop 1
556 557 558 559 560 561 562 563 564 565
        endif

      else ! GPUs are not used

#if 0
! realcase or complexcase
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif

#ifdef WITH_OPENMP
566
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads*     &
567
               C_SIZEOF(a_var)) /= 0) then
568 569
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
570
          &: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
571
          stop 1
572 573 574 575 576 577
        endif

        call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
        ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)

        ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
578 579 580

#else /* WITH_OPENMP */

581
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*  &
582
            C_SIZEOF(a_var)) /= 0) then
583
          print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
584
          stop 1
585 586 587 588 589
        endif

        call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
        !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)

590
        aIntern(:,:,:) = 0.0_rck
591 592 593 594 595 596 597 598
#endif /* WITH_OPENMP */
      endif !useGPU

      allocate(row(l_nev), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating row"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
599
        stop 1
600 601
      endif

602
      row(:) = 0.0_rck
603 604 605 606 607 608 609 610 611 612 613 614

      ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
      ! and transpose the matrix using stripes of given stripe_width for cache blocking.

      ! The peculiar way it is done below is due to the fact that the last row should be
      ! ready first since it is the first one to start below

#ifdef WITH_OPENMP
      ! Please note about the OMP usage below:
      ! This is not for speed, but because we want the matrix a in the memory and
      ! in the cache of the correct thread (if possible)

615
      call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
616 617
      !$omp parallel do private(my_thread), schedule(static, 1)
      do my_thread = 1, max_threads
618
        aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
619 620 621
      enddo
      !$omp end parallel do

622
      call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
623 624 625 626 627 628 629 630 631 632 633 634 635
#endif /* WITH_OPENMP */

      do ip = np_rows-1, 0, -1
        if (my_prow == ip) then
          ! Receive my rows which have not yet been received
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)

            if (src < my_prow) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
636
              if (wantDebug) call obj%timer%start("mpi_communication")
637
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
638
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
639
              if (wantDebug) call obj%timer%stop("mpi_communication")
640 641 642 643 644 645 646

#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

647
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
648 649 650

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
651 652 653 654
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
655
                                  (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
656 657
                                   thread_width, stripe_width, l_nev)

658 659 660
              enddo
!$omp end parallel do

661
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
662 663 664 665

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
666 667 668 669
                call unpack_and_prepare_row_group_&
                &MATH_DATATYPE&
                &_gpu_&
                &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
670 671
                              ( &
                              row_group, row_group_dev, aIntern_dev, stripe_count, &
Andreas Marek's avatar
Andreas Marek committed
672 673 674
                                          stripe_width, last_stripe_width, a_dim2, l_nev,&
                                          row_group_size, nblk, unpack_idx, &
                                           i - limits(ip), .false.)
675
#ifdef WITH_MPI
676
                if (wantDebug) call obj%timer%start("mpi_communication")
677
                call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
678
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
679
                if (wantDebug) call obj%timer%stop("mpi_communication")
680 681 682 683 684 685 686

#else /* WITH_MPI */
                row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
687
                if (wantDebug) call obj%timer%start("mpi_communication")
688
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
689
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
690
                if (wantDebug) call obj%timer%stop("mpi_communication")
691 692 693 694 695 696 697

#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

698
                call unpack_row_&
699 700 701
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
702
                                (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
703 704 705 706 707 708 709 710 711 712
              endif ! useGPU
#endif /* WITH_OPENMP */

            elseif (src == my_prow) then

              src_offset = src_offset+1

              if (useGPU) then
#ifndef WITH_OPENMP

Andreas Marek's avatar
Andreas Marek committed
713 714
                 ! An unpacking of the current row group may occur before queuing the next row
                 call unpack_and_prepare_row_group_&
715 716 717
                      &MATH_DATATYPE&
                      &_gpu_&
                      &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
718
                  ( &
Andreas Marek's avatar
Andreas Marek committed
719 720 721 722 723
                               row_group, row_group_dev, aIntern_dev, stripe_count, &
                               stripe_width, last_stripe_width, a_dim2, l_nev,&
                               row_group_size, nblk, unpack_idx, &
                               i - limits(ip), .false.)

724
                row_group(:, row_group_size) = q(src_offset, 1:l_nev)
725
#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
726

727 728 729 730 731 732 733 734 735
!#if COMPLEXCASE == 1
!! why is an cuda call in the openmp region?
!                call unpack_and_prepare_row_group_complex_gpu_&
!                     &PRECISION&
!                     &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
!                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
!                      unpack_idx, i - limits(ip),.false.)
!                      row_group(:, row_group_size) = q(src_offset, 1:l_nev)
!#endif
736 737 738 739 740 741 742

#endif /* not OpenMP */
              else
                row(:) = q(src_offset, 1:l_nev)
              endif

#ifdef WITH_OPENMP
743
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
744 745 746

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
747 748 749 750
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
751
                                   (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
752

753 754 755
              enddo
!$omp end parallel do

756
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
757 758 759 760 761 762

#else /* WITH_OPENMP */

              if (useGPU) then

              else
763 764 765 766
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
767
                                (obj,aIntern, row,i-limits(ip),  stripe_count, stripe_width, last_stripe_width)
768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783
              endif

#endif /* WITH_OPENMP */

            endif
          enddo

          ! Send all rows which have not yet been send
          src_offset = 0
          do dst = 0, ip-1
            do i=limits(dst)+1,limits(dst+1)
              if (mod((i-1)/nblk, np_rows) == my_prow) then
                src_offset = src_offset+1
                row(:) = q(src_offset, 1:l_nev)

#ifdef WITH_MPI
784
                if (wantDebug) call obj%timer%start("mpi_communication")
785
                call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
786
                              dst, 0, mpi_comm_rows, mpierr)
787
                if (wantDebug) call obj%timer%stop("mpi_communication")
788 789 790 791 792 793
#endif /* WITH_MPI */
              endif
            enddo
          enddo

        else if (my_prow < ip) then
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
794

795 796 797 798 799 800 801 802
          ! Send all rows going to PE ip
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == my_prow) then
              src_offset = src_offset+1
              row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
803
              if (wantDebug) call obj%timer%start("mpi_communication")
804
              call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
805
                            ip, 0, mpi_comm_rows, mpierr)
806
              if (wantDebug) call obj%timer%stop("mpi_communication")
807 808 809 810 811 812 813 814 815 816 817
#endif /* WITH_MPI */
            endif
          enddo

          ! Receive all rows from PE ip
          do i=limits(my_prow)+1,limits(my_prow+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == ip) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
818
              if (wantDebug) call obj%timer%start("mpi_communication")
819
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
820
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
821
              if (wantDebug) call obj%timer%stop("mpi_communication")
822 823 824 825 826 827
#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

828
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
829 830
!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
831 832 833 834
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
835
                                 (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
836 837
              enddo
!$omp end parallel do
838
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
839 840 841 842

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
Andreas Marek's avatar
Andreas Marek committed
843
                call unpack_and_prepare_row_group_&
844 845 846 847
                     &MATH_DATATYPE&
                     &_gpu_&
                     &PRECISION&
                     &( &
Andreas Marek's avatar
Andreas Marek committed
848 849 850 851
                  row_group, row_group_dev, aIntern_dev, stripe_count,  &
                  stripe_width, last_stripe_width, a_dim2, l_nev,       &
                  row_group_size, nblk, unpack_idx,                     &
                  i - limits(my_prow), .false.)
852 853

#ifdef WITH_MPI
854
               if (wantDebug) call obj%timer%start("mpi_communication")
855
               call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
856
                             src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
857
               if (wantDebug) call obj%timer%stop("mpi_communication")
858 859 860 861 862 863 864
#else /* WITH_MPI */

               row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
865
                if (wantDebug) call obj%timer%start("mpi_communication")
866
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
867
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
868
                if (wantDebug) call obj%timer%stop("mpi_communication")
869 870 871 872 873
#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif
874
                call unpack_row_&
875 876 877
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
878
                                (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
879 880 881 882 883 884 885 886 887 888 889
              endif ! useGPU

#endif /* WITH_OPENMP */

            endif
          enddo
        endif
      enddo

      if (useGPU) then
        ! Force an unpacking of all remaining rows that haven't been unpacked yet
Andreas Marek's avatar
Andreas Marek committed
890
        call unpack_and_prepare_row_group_&
891 892 893 894
             &MATH_DATATYPE&
             &_gpu_&
             &PRECISION&
             &( &
Andreas Marek's avatar
Andreas Marek committed
895 896 897 898 899
          row_group, row_group_dev, aIntern_dev, stripe_count, &
          stripe_width, last_stripe_width, &
          a_dim2, l_nev, row_group_size, nblk, unpack_idx,     &
          -1, .true.)

900 901 902 903 904 905
        successCUDA = cuda_devicesynchronize()

         if (.not.(successCUDA)) then
           print *,"trans_ev_tridi_to_band_&
           &MATH_DATATYPE&
           &: error in cudaDeviceSynchronize"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
906
           stop 1
907 908 909 910 911 912 913 914 915 916 917 918 919
         endif
      endif

      ! Set up result buffer queue

      num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows

      num_result_buffers = 4*nfact
      allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
920
        stop 1
921 922 923 924 925 926 927
      endif

      allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
928
        stop 1
929 930 931 932 933 934 935
      endif

      allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
936
        stop 1
937 938 939 940 941 942 943 944 945
      endif

#ifdef WITH_MPI
      result_send_request(:) = MPI_REQUEST_NULL
      result_recv_request(:) = MPI_REQUEST_NULL
#endif

      ! Queue up buffers
#ifdef WITH_MPI
946
      if (wantDebug) call obj%timer%start("mpi_communication")
947 948 949

      if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
        do j = 1, min(num_result_buffers, num_result_blocks)
950
          call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL,     &
951 952 953
                         0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr)
        enddo
      endif
954
      if (wantDebug) call obj%timer%stop("mpi_communication")
955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
#else /* WITH_MPI */

      ! carefull the "recv" has to be done at the corresponding wait or send
      ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)

#endif /* WITH_MPI */

      num_bufs_recvd = 0 ! No buffers received yet

      ! Initialize top/bottom requests

      allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MPI_DATATYPE&
         &: error when allocating top_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
971
         stop 1
972 973 974 975 976 977 978
       endif

      allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
979
         stop 1
980 981 982 983 984 985 986
       endif

      allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
987
         stop 1
988 989 990 991 992 993 994
       endif

      allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
995
         stop 1
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
       endif

#ifdef WITH_MPI
      top_send_request(:) = MPI_REQUEST_NULL
      top_recv_request(:) = MPI_REQUEST_NULL
      bottom_send_request(:) = MPI_REQUEST_NULL
      bottom_recv_request(:) = MPI_REQUEST_NULL
#endif

#ifdef WITH_OPENMP
      allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1011
         stop 1
1012 1013 1014 1015 1016 1017 1018
       endif

      allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1019
         stop 1
1020 1021 1022 1023 1024 1025 1026
       endif

      allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1027
         stop 1
1028 1029 1030 1031 1032 1033 1034
       endif

      allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1035
         stop 1
1036 1037
       endif

1038 1039 1040 1041
      top_border_send_buffer(:,:) = 0.0_rck
      top_border_recv_buffer(:,:) = 0.0_rck
      bottom_border_send_buffer(:,:) = 0.0_rck
      bottom_border_recv_buffer(:,:) = 0.0_rck
1042 1043 1044 1045 1046 1047 1048 1049 1050
      ! Initialize broadcast buffer

#else /* WITH_OPENMP */

       allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_send_bufer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1051
         stop 1
1052 1053 1054 1055 1056 1057 1058
       endif

      allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1059
         stop 1
1060 1061 1062 1063 1064 1065 1066
       endif

      allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1067
         stop 1
1068 1069 1070 1071 1072 1073 1074
       endif

      allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1075
         stop 1
1076 1077
       endif

1078 1079 1080 1081
      top_border_send_buffer(:,:,:) = 0.0_rck
      top_border_recv_buffer(:,:,:) = 0.0_rck
      bottom_border_send_buffer(:,:,:) = 0.0_rck
      bottom_border_recv_buffer(:,:,:) = 0.0_rck
1082 1083 1084 1085 1086 1087 1088 1089 1090
#endif /* WITH_OPENMP */

      ! Initialize broadcast buffer

      allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bcast_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1091
         stop 1
1092 1093
       endif

1094
      bcast_buffer = 0.0_rck
1095
      if (useGPU) then
1096
        num =  ( nbw * max_blk_size) * size_of_datatype
1097 1098 1099 1100 1101
        successCUDA = cuda_malloc(bcast_buffer_dev, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
1102
          stop 1
1103 1104 1105 1106 1107 1108 1109
        endif

        successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"
Andreas Marek's avatar
Andreas Marek committed
1110
          stop 1
1111 1112
        endif

1113
        num =  ((max_blk_size-1))* size_of_datatype
1114 1115 1116 1117 1118
        successCUDA = cuda_malloc( hh_dot_dev, num)
        if