elpa2_trans_ev_tridi_to_band_template.F90 92.6 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

53
#include "../general/sanity.F90"
Andreas Marek's avatar
Andreas Marek committed
54 55

  subroutine trans_ev_tridi_to_band_&
56 57 58
    &MATH_DATATYPE&
    &_&
    &PRECISION &
59
    (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols,         &
60
     hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
61
     kernel)
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90

    !-------------------------------------------------------------------------------
    !  trans_ev_tridi_to_band_real/complex:
    !  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
    !
    !  Parameters
    !
    !  na          Order of matrix a, number of rows of matrix q
    !
    !  nev         Number eigenvectors to compute (= columns of matrix q)
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nb          semi bandwith
    !
    !  q           On input: Eigenvectors of tridiagonal matrix
    !              On output: Transformed eigenvectors
    !              Distribution is like in Scalapack.
    !
    !  q_dev       GPU device pointer to q
    !
    !  ldq         Leading dimension of q
    !  matrixCols  local columns of matrix q
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns/both
    !
    !-------------------------------------------------------------------------------
91
      use elpa_abstract_impl
92
      use elpa2_workload
Andreas Marek's avatar
Andreas Marek committed
93
      use pack_unpack_cpu
Andreas Marek's avatar
Andreas Marek committed
94
      use pack_unpack_gpu
95
      use compute_hh_trafo
96 97 98 99
      use cuda_functions
      use precision
      use iso_c_binding
      implicit none
100
#include "../general/precision_kinds.F90"
101
      class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
102
      logical, intent(in)                        :: useGPU
103

Andreas Marek's avatar
Andreas Marek committed
104 105
      integer(kind=ik), intent(in)               :: kernel
      integer(kind=ik), intent(in)               :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
106 107

#ifdef USE_ASSUMED_SIZE
108
      MATH_DATATYPE(kind=rck)                   :: q(ldq,*)
109
#else
110
      MATH_DATATYPE(kind=rck)                   :: q(ldq,matrixCols)
111 112
#endif

113
      MATH_DATATYPE(kind=rck), intent(in)       :: hh_trans(:,:)
Andreas Marek's avatar
Andreas Marek committed
114
      integer(kind=c_intptr_t)                   :: q_dev
Andreas Marek's avatar
Andreas Marek committed
115

Andreas Marek's avatar
Andreas Marek committed
116 117
      integer(kind=ik)                           :: np_rows, my_prow, np_cols, my_pcol
      integer(kind=ik)                           :: tmp
118

Andreas Marek's avatar
Andreas Marek committed
119 120 121 122 123
      integer(kind=ik)                           :: i, j, ip, sweep, nbuf, l_nev, a_dim2
      integer(kind=ik)                           :: current_n, current_local_n, current_n_start, current_n_end
      integer(kind=ik)                           :: next_n, next_local_n, next_n_start, next_n_end
      integer(kind=ik)                           :: bottom_msg_length, top_msg_length, next_top_msg_length
      integer(kind=ik)                           :: stripe_width, last_stripe_width, stripe_count
124
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
125
      integer(kind=ik)                           :: thread_width, csw, b_off, b_len
126
#endif
Andreas Marek's avatar
Andreas Marek committed
127 128 129
      integer(kind=ik)                           :: num_result_blocks, num_result_buffers, num_bufs_recvd
      integer(kind=ik)                           :: a_off, current_tv_off, max_blk_size
      integer(kind=ik)                           :: mpierr, src, src_offset, dst, offset, nfact, num_blk
130

Andreas Marek's avatar
Andreas Marek committed
131
      logical                                    :: flag
132
#ifdef WITH_OPENMP
133
      MATH_DATATYPE(kind=rck), pointer          :: aIntern(:,:,:,:)
134
#else
135
      MATH_DATATYPE(kind=rck), pointer          :: aIntern(:,:,:)
136
#endif
137
      MATH_DATATYPE(kind=rck)                   :: a_var
138

Andreas Marek's avatar
Andreas Marek committed
139
      type(c_ptr)                                :: aIntern_ptr
140

141 142
      MATH_DATATYPE(kind=rck)   , allocatable   :: row(:)
      MATH_DATATYPE(kind=rck)   , allocatable   :: row_group(:,:)
143 144

#ifdef WITH_OPENMP
145 146
      MATH_DATATYPE(kind=rck), allocatable    :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable    :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
147
#else
148 149
      MATH_DATATYPE(kind=rck), allocatable    :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable    :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
150 151 152 153
#endif

      integer(kind=c_intptr_t)                 :: aIntern_dev
      integer(kind=c_intptr_t)                 :: bcast_buffer_dev
154 155
      integer(kind=c_intptr_t)                   :: num
      integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
156 157 158 159 160 161 162 163 164
      integer(kind=c_intptr_t)                 :: row_dev
      integer(kind=c_intptr_t)                 :: row_group_dev
      integer(kind=c_intptr_t)                 :: hh_tau_dev
      integer(kind=c_intptr_t)                 :: hh_dot_dev
      integer(kind=ik)                         :: row_group_size, unpack_idx

      integer(kind=ik)                         :: n_times
      integer(kind=ik)                         :: top, chunk, this_chunk

165 166
      MATH_DATATYPE(kind=rck), allocatable    :: result_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable    :: bcast_buffer(:,:)
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202

      integer(kind=ik)                         :: n_off

      integer(kind=ik), allocatable            :: result_send_request(:), result_recv_request(:), limits(:)
      integer(kind=ik), allocatable            :: top_send_request(:), bottom_send_request(:)
      integer(kind=ik), allocatable            :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
!      integer(kind=ik), allocatable           :: mpi_statuses(:,:)
#endif

#ifdef WITH_OPENMP
#ifdef WITH_MPI
!      integer(kind=ik)                        :: my_MPI_STATUS_(MPI_STATUS_SIZE)
#endif
#endif

      ! MPI send/recv tags, arbitrary

      integer(kind=ik), parameter              :: bottom_recv_tag = 111
      integer(kind=ik), parameter              :: top_recv_tag    = 222
      integer(kind=ik), parameter              :: result_recv_tag = 333
#ifdef WITH_OPENMP
      integer(kind=ik)                         :: max_threads, my_thread
      integer(kind=ik)                         :: omp_get_max_threads
#endif


      ! Just for measuring the kernel performance
      real(kind=c_double)                      :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
      ! long integer
      integer(kind=lik)                        :: kernel_flops, kernel_flops_recv



      logical, intent(in)                      :: wantDebug
      logical                                  :: success
203
      integer(kind=ik)                         :: istat, print_flops
204 205 206 207 208
      character(200)                           :: errorMessage
      logical                                  :: successCUDA
#ifndef WITH_MPI
      integer(kind=ik)                         :: j1
#endif
209
      integer(kind=c_intptr_t), parameter      :: size_of_datatype = size_of_&
210 211 212 213
                                                                   &PRECISION&
                                                                   &_&
                                                                   &MATH_DATATYPE

214
      call obj%timer%start("trans_ev_tridi_to_band_&
215
      &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
216
      &" // &
217 218 219
      &PRECISION_SUFFIX &
      )

220
      n_times = 0
221 222 223 224 225 226 227 228 229 230 231 232 233
      if (useGPU) then
        unpack_idx = 0
        row_group_size = 0
      endif

      success = .true.
      kernel_time = 0.0
      kernel_flops = 0

#ifdef WITH_OPENMP
      max_threads = 1
      max_threads = omp_get_max_threads()
#endif
234
      if (wantDebug) call obj%timer%start("mpi_communication")
235 236 237 238
      call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
      call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
      call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
      call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
239
      if (wantDebug) call obj%timer%stop("mpi_communication")
240 241 242 243 244

      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
245 246
                                &MATH_DATATYPE&
                                &: ERROR: nbw=',nbw,', nblk=',nblk
247
            write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
248 249
                                &MATH_DATATYPE&
                                &: band backtransform works only for nbw==n*nblk'
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
          endif
          success = .false.
          return
        endif
      endif

      nfact = nbw / nblk


      ! local number of eigenvectors
      l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)

      if (l_nev==0) then
#ifdef WITH_OPENMP
        thread_width = 0
#endif
        stripe_width = 0
        stripe_count = 0
        last_stripe_width = 0

      else ! l_nev

#if WITH_OPENMP
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
275
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1
        else ! useGPU
          ! openmp only in non-GPU case
          thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
          stripe_width = 48 ! Must be a multiple of 4
#else
          stripe_width = 96 ! Must be a multiple of 8
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          stripe_width = 48 ! Must be a multiple of 2
#else
          stripe_width = 48 ! Must be a multiple of 4
#endif
#endif /* COMPLEXCASE */

          stripe_count = (thread_width-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (thread_width-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
307
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
308
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
309
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
310 311 312 313

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
314
    else
315
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
316
                                            ! (4 * sizeof(double) == 32)
317
          endif
318
#else
319
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
320
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
321
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
322 323 324


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
325
                                               ! (16 * sizeof(float) == 64)
326

Andreas Marek's avatar
Retab  
Andreas Marek committed
327
    else
328
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
329
                                            ! (8 * sizeof(float) == 32)
330
          endif
331 332 333 334 335
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
336
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
337
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
338

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
339
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
340
                                            ! (4 * sizeof(double complex) == 64)
341

Andreas Marek's avatar
Retab  
Andreas Marek committed
342
    else
343 344

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
345 346
                                            ! (2 * sizeof(double complex) == 32)
    endif
347
#else
348

349
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
350
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
351 352

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
353
                                            ! (8 * sizeof(float complex) == 64)
354 355 356

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
357 358
                                            ! (4 * sizeof(float complex) == 32)
    endif
359 360 361
#endif
#endif /* COMPLEXCASE */

362
#if REALCASE == 1
363
        last_stripe_width = l_nev - (stripe_count-1)*stripe_width
364 365 366 367 368
#endif
#if COMPLEXCASE == 1
! only needed in no OMP case check thsis
! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif
369 370

        endif ! useGPU
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
371

372
#else /* WITH_OPENMP */
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
373

374 375
        ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
        ! every primary cache
376
        ! Suggested stripe width is 48 - should this be reduced for the complex case ???
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406

        if (useGPU) then
          stripe_width = 256 ! Must be a multiple of 4
          stripe_count = (l_nev - 1) / stripe_width + 1

        else ! useGPU
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
          stripe_width = 48 ! Must be a multiple of 4
#else
          stripe_width = 96 ! Must be a multiple of 8
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
          stripe_width = 48 ! Must be a multiple of 2
#else
          stripe_width = 48 ! Must be a multiple of 4
#endif
#endif /* COMPLEXCASE */

          stripe_count = (l_nev-1)/stripe_width + 1

          ! Adapt stripe width so that last one doesn't get too small

          stripe_width = (l_nev-1)/stripe_count + 1

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
407
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
408
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
409
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
410 411 412 413

            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
                                                  ! (8 * sizeof(double) == 64)

Andreas Marek's avatar
Retab  
Andreas Marek committed
414
    else
415
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
416
                                            ! (4 * sizeof(double) == 32)
417
          endif
418
#else
419
          if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
420
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
421
              kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
422 423 424


            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
425
                                               ! (16 * sizeof(float) == 64)
426

Andreas Marek's avatar
Retab  
Andreas Marek committed
427
    else
428
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
429
                                            ! (8 * sizeof(float) == 32)
430
          endif
431 432 433 434 435
#endif
#endif /* REALCASE */

#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
436

437
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
438
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
439

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
440
            stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
441
                                            ! (4 * sizeof(double complex) == 64)
442

Andreas Marek's avatar
Retab  
Andreas Marek committed
443
    else
444 445

            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
446 447
                                            ! (2 * sizeof(double complex) == 32)
    endif
448
#else
449

450
          if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
451
              kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
452

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
453
            stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
454
                                            ! (8 * sizeof(float complex) == 64)
455 456 457

          else
            stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
Andreas Marek's avatar
Retab  
Andreas Marek committed
458 459
                                            ! (4 * sizeof(float complex) == 32)
    endif
460 461 462 463 464 465 466 467 468 469 470 471 472 473
#endif
#endif /* COMPLEXCASE */
        endif ! useGPU

        last_stripe_width = l_nev - (stripe_count-1)*stripe_width

#endif /* WITH_OPENMP */
      endif ! l_nev

      ! Determine the matrix distribution at the beginning

      allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
474 475
                &MATH_DATATYPE&
                &: error when allocating limits"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
476
        stop 1
477
      endif
478
      call determine_workload(obj,na, nbw, np_rows, limits)
479 480 481 482 483 484

      max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))

      a_dim2 = max_blk_size + nbw

      if (useGPU) then
485 486
        num =  (stripe_width*a_dim2*stripe_count)* size_of_datatype
        successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
487 488 489 490
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
491
          stop 1
492 493 494 495 496 497 498
        endif

        successCUDA = cuda_memset(aIntern_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
499
          stop 1
500 501
        endif

502
        num =  (l_nev)* size_of_datatype
503 504 505 506 507
        successCUDA = cuda_malloc( row_dev,num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc "
Andreas Marek's avatar
Andreas Marek committed
508
          stop 1
509 510 511 512 513 514 515
        endif

        successCUDA = cuda_memset(row_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset "
Andreas Marek's avatar
Andreas Marek committed
516
          stop 1
517 518 519 520 521 522 523 524
        endif

        ! "row_group" and "row_group_dev" are needed for GPU optimizations
        allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error when allocating row_group"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
          stop 1
526 527
        endif

528
        row_group(:, :) = 0.0_rck
529
        num =  (l_nev*nblk)* size_of_datatype
530 531 532 533 534
        successCUDA = cuda_malloc(row_group_dev, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMalloc"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
535
          stop 1
536 537 538 539 540 541 542
        endif

        successCUDA = cuda_memset(row_group_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
          &: error in cudaMemset"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
543
          stop 1
544 545 546 547 548 549 550 551 552 553
        endif

      else ! GPUs are not used

#if 0
! realcase or complexcase
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif

#ifdef WITH_OPENMP
554
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads*     &
555
               C_SIZEOF(a_var)) /= 0) then
556 557
          print *,"trans_ev_tridi_to_band_&
          &MATH_DATATYPE&
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
558
          &: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
559
          stop 1
560 561 562 563 564 565
        endif

        call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
        ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)

        ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
566 567 568

#else /* WITH_OPENMP */

569
        if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*  &
570
            C_SIZEOF(a_var)) /= 0) then
571
          print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
572
          stop 1
573 574 575 576 577
        endif

        call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
        !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)

578
        aIntern(:,:,:) = 0.0_rck
579 580 581 582 583 584 585 586
#endif /* WITH_OPENMP */
      endif !useGPU

      allocate(row(l_nev), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating row"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
587
        stop 1
588 589
      endif

590
      row(:) = 0.0_rck
591 592 593 594 595 596 597 598 599 600 601 602

      ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
      ! and transpose the matrix using stripes of given stripe_width for cache blocking.

      ! The peculiar way it is done below is due to the fact that the last row should be
      ! ready first since it is the first one to start below

#ifdef WITH_OPENMP
      ! Please note about the OMP usage below:
      ! This is not for speed, but because we want the matrix a in the memory and
      ! in the cache of the correct thread (if possible)

603
      call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
604 605
      !$omp parallel do private(my_thread), schedule(static, 1)
      do my_thread = 1, max_threads
606
        aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
607 608 609
      enddo
      !$omp end parallel do

610
      call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
611 612 613 614 615 616 617 618 619 620 621 622 623
#endif /* WITH_OPENMP */

      do ip = np_rows-1, 0, -1
        if (my_prow == ip) then
          ! Receive my rows which have not yet been received
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)

            if (src < my_prow) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
624
              if (wantDebug) call obj%timer%start("mpi_communication")
625
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
626
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
627
              if (wantDebug) call obj%timer%stop("mpi_communication")
628 629 630 631 632 633 634

#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

635
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
636 637 638

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
639 640 641 642
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
643
                                  (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
644 645
                                   thread_width, stripe_width, l_nev)

646 647 648
              enddo
!$omp end parallel do

649
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
650 651 652 653

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
654 655 656 657
                call unpack_and_prepare_row_group_&
                &MATH_DATATYPE&
                &_gpu_&
                &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
658 659
                              ( &
                              row_group, row_group_dev, aIntern_dev, stripe_count, &
Andreas Marek's avatar
Andreas Marek committed
660 661 662
                                          stripe_width, last_stripe_width, a_dim2, l_nev,&
                                          row_group_size, nblk, unpack_idx, &
                                           i - limits(ip), .false.)
663
#ifdef WITH_MPI
664
                if (wantDebug) call obj%timer%start("mpi_communication")
665
                call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
666
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
667
                if (wantDebug) call obj%timer%stop("mpi_communication")
668 669 670 671 672 673 674

#else /* WITH_MPI */
                row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
675
                if (wantDebug) call obj%timer%start("mpi_communication")
676
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
677
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
678
                if (wantDebug) call obj%timer%stop("mpi_communication")
679 680 681 682 683 684 685

#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

686
                call unpack_row_&
687 688 689
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
690
                                (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
691 692 693 694 695 696 697 698 699 700
              endif ! useGPU
#endif /* WITH_OPENMP */

            elseif (src == my_prow) then

              src_offset = src_offset+1

              if (useGPU) then
#ifndef WITH_OPENMP

Andreas Marek's avatar
Andreas Marek committed
701 702
                 ! An unpacking of the current row group may occur before queuing the next row
                 call unpack_and_prepare_row_group_&
703 704 705
                      &MATH_DATATYPE&
                      &_gpu_&
                      &PRECISION &
Andreas Marek's avatar
Retab  
Andreas Marek committed
706
                  ( &
Andreas Marek's avatar
Andreas Marek committed
707 708 709 710 711
                               row_group, row_group_dev, aIntern_dev, stripe_count, &
                               stripe_width, last_stripe_width, a_dim2, l_nev,&
                               row_group_size, nblk, unpack_idx, &
                               i - limits(ip), .false.)

712
                row_group(:, row_group_size) = q(src_offset, 1:l_nev)
713
#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
714

715
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
716
! why is an cuda call in the openmp region?
Andreas Marek's avatar
Andreas Marek committed
717
                call unpack_and_prepare_row_group_complex_gpu_&
718 719 720 721 722
                     &PRECISION&
                     &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
                      unpack_idx, i - limits(ip),.false.)
                      row_group(:, row_group_size) = q(src_offset, 1:l_nev)
723 724 725 726 727 728 729 730
#endif

#endif /* not OpenMP */
              else
                row(:) = q(src_offset, 1:l_nev)
              endif

#ifdef WITH_OPENMP
731
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
732 733 734

!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
735 736 737 738
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
739
                                   (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
740

741 742 743
              enddo
!$omp end parallel do

744
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
745 746 747 748 749 750

#else /* WITH_OPENMP */

              if (useGPU) then

              else
751 752 753 754
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
755
                                (obj,aIntern, row,i-limits(ip),  stripe_count, stripe_width, last_stripe_width)
756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771
              endif

#endif /* WITH_OPENMP */

            endif
          enddo

          ! Send all rows which have not yet been send
          src_offset = 0
          do dst = 0, ip-1
            do i=limits(dst)+1,limits(dst+1)
              if (mod((i-1)/nblk, np_rows) == my_prow) then
                src_offset = src_offset+1
                row(:) = q(src_offset, 1:l_nev)

#ifdef WITH_MPI
772
                if (wantDebug) call obj%timer%start("mpi_communication")
773
                call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
774
                              dst, 0, mpi_comm_rows, mpierr)
775
                if (wantDebug) call obj%timer%stop("mpi_communication")
776 777 778 779 780 781
#endif /* WITH_MPI */
              endif
            enddo
          enddo

        else if (my_prow < ip) then
Andreas Marek's avatar
Cleanup  
Andreas Marek committed
782

783 784 785 786 787 788 789 790
          ! Send all rows going to PE ip
          src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
          do i=limits(ip)+1,limits(ip+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == my_prow) then
              src_offset = src_offset+1
              row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
791
              if (wantDebug) call obj%timer%start("mpi_communication")
792
              call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
793
                            ip, 0, mpi_comm_rows, mpierr)
794
              if (wantDebug) call obj%timer%stop("mpi_communication")
795 796 797 798 799 800 801 802 803 804 805
#endif /* WITH_MPI */
            endif
          enddo

          ! Receive all rows from PE ip
          do i=limits(my_prow)+1,limits(my_prow+1)
            src = mod((i-1)/nblk, np_rows)
            if (src == ip) then
#ifdef WITH_OPENMP

#ifdef WITH_MPI
806
              if (wantDebug) call obj%timer%start("mpi_communication")
807
              call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
808
                            src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
809
              if (wantDebug) call obj%timer%stop("mpi_communication")
810 811 812 813 814 815
#else /* WITH_MPI */

!              row(1:l_nev) = row(1:l_nev)

#endif /* WITH_MPI */

816
              call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
817 818
!$omp parallel do private(my_thread), schedule(static, 1)
              do my_thread = 1, max_threads
819 820 821 822
                call unpack_row_&
                     &MATH_DATATYPE&
                     &_cpu_openmp_&
                     &PRECISION &
823
                                 (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
824 825
              enddo
!$omp end parallel do
826
              call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
827 828 829 830

#else /* WITH_OPENMP */
              if (useGPU) then
                ! An unpacking of the current row group may occur before queuing the next row
Andreas Marek's avatar
Andreas Marek committed
831
                call unpack_and_prepare_row_group_&
832 833 834 835
                     &MATH_DATATYPE&
                     &_gpu_&
                     &PRECISION&
                     &( &
Andreas Marek's avatar
Andreas Marek committed
836 837 838 839
                  row_group, row_group_dev, aIntern_dev, stripe_count,  &
                  stripe_width, last_stripe_width, a_dim2, l_nev,       &
                  row_group_size, nblk, unpack_idx,                     &
                  i - limits(my_prow), .false.)
840 841

#ifdef WITH_MPI
842
               if (wantDebug) call obj%timer%start("mpi_communication")
843
               call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
844
                             src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
845
               if (wantDebug) call obj%timer%stop("mpi_communication")
846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
#else /* WITH_MPI */

#if REALCASE == 1
               row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif
#if COMPLEXCASE == 1
! todo: what of this is correct? Was different for double/single precision
#ifdef DOUBLE_PRECISION_COMPLEX
                row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#else
                row_group(1:l_nev,row_group_size) = row_group(1:l_nev,row_group_size) ! is this correct
#endif
#endif
#endif /* WITH_MPI */

              else ! useGPU
#ifdef WITH_MPI
863
                if (wantDebug) call obj%timer%start("mpi_communication")
864
                call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
865
                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
866
                if (wantDebug) call obj%timer%stop("mpi_communication")
867 868 869 870 871
#else /* WITH_MPI */

!                row(1:l_nev) = row(1:l_nev)

#endif
872
                call unpack_row_&
873 874 875
                     &MATH_DATATYPE&
                     &_cpu_&
                     &PRECISION &
876
                                (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
877 878 879 880 881 882 883 884 885 886 887
              endif ! useGPU

#endif /* WITH_OPENMP */

            endif
          enddo
        endif
      enddo

      if (useGPU) then
        ! Force an unpacking of all remaining rows that haven't been unpacked yet
Andreas Marek's avatar
Andreas Marek committed
888
        call unpack_and_prepare_row_group_&
889 890 891 892
             &MATH_DATATYPE&
             &_gpu_&
             &PRECISION&
             &( &
Andreas Marek's avatar
Andreas Marek committed
893 894 895 896 897
          row_group, row_group_dev, aIntern_dev, stripe_count, &
          stripe_width, last_stripe_width, &
          a_dim2, l_nev, row_group_size, nblk, unpack_idx,     &
          -1, .true.)

898 899 900 901 902 903
        successCUDA = cuda_devicesynchronize()

         if (.not.(successCUDA)) then
           print *,"trans_ev_tridi_to_band_&
           &MATH_DATATYPE&
           &: error in cudaDeviceSynchronize"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
904
           stop 1
905 906 907 908 909 910 911 912 913 914 915 916 917
         endif
      endif

      ! Set up result buffer queue

      num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows

      num_result_buffers = 4*nfact
      allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
918
        stop 1
919 920 921 922 923 924 925
      endif

      allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
926
        stop 1
927 928 929 930 931 932 933
      endif

      allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_tridi_to_band_&
        &MATH_DATATYPE&
        &: error when allocating result_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
934
        stop 1
935 936 937 938 939 940 941 942 943
      endif

#ifdef WITH_MPI
      result_send_request(:) = MPI_REQUEST_NULL
      result_recv_request(:) = MPI_REQUEST_NULL
#endif

      ! Queue up buffers
#ifdef WITH_MPI
944
      if (wantDebug) call obj%timer%start("mpi_communication")
945 946 947

      if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
        do j = 1, min(num_result_buffers, num_result_blocks)
948
          call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL,     &
949 950 951
                         0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr)
        enddo
      endif
952
      if (wantDebug) call obj%timer%stop("mpi_communication")
953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968
#else /* WITH_MPI */

      ! carefull the "recv" has to be done at the corresponding wait or send
      ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)

#endif /* WITH_MPI */

      num_bufs_recvd = 0 ! No buffers received yet

      ! Initialize top/bottom requests

      allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MPI_DATATYPE&
         &: error when allocating top_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
969
         stop 1
970 971 972 973 974 975 976
       endif

      allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
977
         stop 1
978 979 980 981 982 983 984
       endif

      allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_send_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
985
         stop 1
986 987 988 989 990 991 992
       endif

      allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_recv_request"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
993
         stop 1
994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
       endif

#ifdef WITH_MPI
      top_send_request(:) = MPI_REQUEST_NULL
      top_recv_request(:) = MPI_REQUEST_NULL
      bottom_send_request(:) = MPI_REQUEST_NULL
      bottom_recv_request(:) = MPI_REQUEST_NULL
#endif

#ifdef WITH_OPENMP
      allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1009
         stop 1
1010 1011 1012 1013 1014 1015 1016
       endif

      allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1017
         stop 1
1018 1019 1020 1021 1022 1023 1024
       endif

      allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1025
         stop 1
1026 1027 1028 1029 1030 1031 1032
       endif

      allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1033
         stop 1
1034 1035
       endif

1036 1037 1038 1039
      top_border_send_buffer(:,:) = 0.0_rck
      top_border_recv_buffer(:,:) = 0.0_rck
      bottom_border_send_buffer(:,:) = 0.0_rck
      bottom_border_recv_buffer(:,:) = 0.0_rck
1040 1041 1042 1043 1044 1045 1046 1047 1048
      ! Initialize broadcast buffer

#else /* WITH_OPENMP */

       allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_send_bufer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1049
         stop 1
1050 1051 1052 1053 1054 1055 1056
       endif

      allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating top_border_recv_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1057
         stop 1
1058 1059 1060 1061 1062 1063 1064
       endif

      allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_send_buffer"//errorMessage
Andreas Marek's avatar
Andreas Marek committed
1065
         stop 1
1066 1067 1068 1069 1070 1071 1072
       endif

      allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"trans_ev_tridi_to_band_&
         &MATH_DATATYPE&
         &: error when allocating bottom_border_recv_buffer"//errorMessage