elpa2_bandred_template.F90 63.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63
64
65
66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
67
    (obj, na, a_mat, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
68
     tmat_dev, wantDebug, useGPU, success, &
69
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
70
     useQR, &
71
#endif
Andreas Marek's avatar
Andreas Marek committed
72
     max_threads)
73

74
  !-------------------------------------------------------------------------------
75
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
76
77
78
79
80
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
Pavel Kus's avatar
Pavel Kus committed
81
  !  a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
82
  !              Distribution is like in Scalapack.
Pavel Kus's avatar
Pavel Kus committed
83
84
  !              Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half)
  !              a_mat(:,:) is overwritten on exit with the band and the Householder vectors
85
86
  !              in the upper half.
  !
Pavel Kus's avatar
Pavel Kus committed
87
88
  !  lda         Leading dimension of a_mat
  !  matrixCols  local columns of matrix a_mat
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
110
      use elpa_abstract_impl
111
      implicit none
112
#include "../general/precision_kinds.F90"
113
      class(elpa_abstract_impl_t), intent(inout) :: obj
114
115
116
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
Pavel Kus's avatar
Pavel Kus committed
117
      MATH_DATATYPE(kind=rck)                    :: a_mat(lda,*), tmat(nbw,nbw,*)
118
#else
Pavel Kus's avatar
Pavel Kus committed
119
      MATH_DATATYPE(kind=rck)                    :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
120
#endif
Andreas Marek's avatar
Andreas Marek committed
121
122

#if REALCASE == 1
123
      real(kind=rk)                               :: eps
124
125
#endif
      logical, intent(in)                         :: useGPU
126
      character(20)                               :: gpuString
127

128
129
130
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
131
      integer(kind=ik)                            :: vmrCols
132
#endif
Andreas Marek's avatar
Andreas Marek committed
133
134
135
136
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
137
138
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
139

140
141
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
142

143
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
144
145
146
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
147
148
149
150

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
151
152
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
153
#endif
154
      ! a_dev is passed from bandred_real to trans_ev_band
155
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
156
#ifdef WITH_MPI
157
158
159
160
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
Andreas Marek's avatar
Andreas Marek committed
161
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
162
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
163
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
164
165
166
167
168
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
169
170
#endif

171
172
173
174
175
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
176
      integer(kind=ik)                            :: min_tile_size, error
177

178
179
180
181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
182
                                                    ii, pp
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184
185
186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

Andreas Marek's avatar
Andreas Marek committed
188
      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal
Andreas Marek's avatar
Andreas Marek committed
189
      integer(kind=ik), intent(in)                :: max_threads
Andreas Marek's avatar
Andreas Marek committed
190

191
192
193
194
195
      if(useGPU) then
        gpuString = "_gpu"
      else
        gpuString = ""
      endif
Andreas Marek's avatar
Andreas Marek committed
196

197
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
198
199
      &MATH_DATATYPE&
      &" // &
200
201
202
      PRECISION_SUFFIX // &
      gpuString )

Andreas Marek's avatar
Andreas Marek committed
203
      useGPU_reduction_lower_block_to_tridiagonal = .false.
Andreas Marek's avatar
Andreas Marek committed
204
205
206
207
208
209
210
211
212
213
214
215

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

216
      if (wantDebug) call obj%timer%start("mpi_communication")
217
218
219
220
221

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
222

223
      if (wantDebug) call obj%timer%stop("mpi_communication")
224
225
226
227
228
229
230
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
231
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
232
233
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
234
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
235
236
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
237
238
239
240
241
242
          endif
          success = .false.
          return
        endif
      endif

Andreas Marek's avatar
Andreas Marek committed
243
      ! na_rows in used nowhere; only na_cols
244
245
      if (useGPU) then
#ifdef WITH_MPI
246
#if COMPLEXCASE == 1
247
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
248
#endif
249
250
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
251
252
253
#if COMPLEXCASE == 1
         na_rows = na
#endif
254
        na_cols = na
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
255
#endif /* WITH_MPI */
256
257

        ! Here we convert the regular host array into a pinned host array
258
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
259
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
260
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
261
262
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
263
          stop 1
264
265
        endif

266
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
267
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
268
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
269
270
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
271
          stop 1
272
273
        endif

274
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
275
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
276
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
277
278
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
279
          stop 1
280
        endif
281
282
283
284
285
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

      ! make tile_size a smallest possible multiple of previously defined tile size, such that it is
      ! larger or equal to min_tile_size
      ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
      ! it can, however, be set by the user
      call obj%get("min_tile_size", min_tile_size ,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem setting option. Aborting..."
        stop
      endif
      if(min_tile_size == 0) then
        ! not set by the user, use the default value
        min_tile_size = 128*max(np_rows, np_cols)
      endif
      tile_size = ((min_tile_size-1)/tile_size+1)*tile_size
301
302
303
304

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

305
#if REALCASE == 1
306
307
308
      if (useQR) then

        if (which_qr_decomposition == 1) then
309
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
310
311
312
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
313
            stop 1
314
315
316
317
318
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
319
            stop 1
320
321
322
323
324
325
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
326
            stop 1
327
328
329
330
331
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
332
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
333
               &PRECISION&
Pavel Kus's avatar
Pavel Kus committed
334
               &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
335
336
337
338
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
339
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
340
               &PRECISION&
Pavel Kus's avatar
Pavel Kus committed
341
               &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
342
343
344
345
346
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

347
          work_size = int(dwork_size(1))
348
349
350
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
351
            stop 1
352
          endif
Pavel Kus's avatar
Pavel Kus committed
353
          work_blocked = 0.0_rk
354
355
356
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
357
            stop 1
358
359
360
361
362
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
363
#endif /* REALCASE */
364
365
366
367
368

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
369

Pavel Kus's avatar
Pavel Kus committed
370
        successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
371
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
372
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
373
374
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
375
          stop 1
376
        endif
377
378
379
380
381
382
383
384
385
386
387
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

388
389
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

390
391
392
393
394
395
396
397
398
399
400
401
402
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
403
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
404
405
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
406
                stop 1
407
408
409
410
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
411
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
412
413
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
414
              stop 1
415
416
417
418
419
420
421
422
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
423
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
424
425
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
426
                stop 1
427
428
429
430
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
431
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
432
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
433
                stop 1
434
435
436
437
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
438

439
            if (istat .ne. 0) then
440
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
441
442
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
443
              stop 1
444
            endif
445
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
446
            if (.not.(successCUDA)) then
447
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
448
449
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
450
              stop 1
451
452
453
            endif

          endif
454
455
456
457
458

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
459
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
460
461
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
462
                stop 1
463
464
465
466
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
467
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
468
469
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
470
                 stop 1
471
472
473
              endif

            endif
474

475
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
476

477
            if (istat .ne. 0) then
478
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
479
480
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
481
              stop 1
482
483
            endif

484
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
485
            if (.not.(successCUDA)) then
486
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
487
488
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
489
              stop 1
490
            endif
491

492
          endif
493
494
495

        else ! GPU not used

496
497
498
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
499
500
501

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
502
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
503
504
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
505
            stop 1
506
507
          endif

508
509
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
510
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
511
512
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
513
            stop 1
514
          endif
Andreas Marek's avatar
Andreas Marek committed
515

516
517
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
518
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
519
520
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
521
            stop 1
522
523
524
525
526
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
527
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
528
        else
Pavel Kus's avatar
Pavel Kus committed
529
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
530
531
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
532
533
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
534
        if (useGPU) then
535
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
536
          umcCUDA(1 : umc_size) = 0.0_rck
537
#endif
538
539
540
541
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

542
          if (lc_start .le. 0) lc_start = 1
543
544
545
546

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

547
          if (my_pcol == cur_pcol) then
Pavel Kus's avatar
Pavel Kus committed
548
            successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
Andreas Marek's avatar
Retab    
Andreas Marek committed
549
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
550
551
552
553
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
554

555

556
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
557
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
558
559
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
560
              stop 1
561
562
563
564
565
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
566
#if REALCASE == 1
567
        if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
568
569
570
571
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

572
573
574
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
575
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
576
                 &PRECISION&
Pavel Kus's avatar
Pavel Kus committed
577
                 &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
578
579
580
581
582
583
584
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
585
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
586
                 &PRECISION&
Pavel Kus's avatar
Pavel Kus committed
587
                 &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
588
589
590
591
592
593
594
595
596
597
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
598
#endif /* REALCASE == 1 */
599
600
         do lc = n_cols, 1, -1

601
           ncol = istep*nbw + lc ! absolute column number of householder Vector
602
603
604
605
606
607
608
609
610
611
612
613
614
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

615
             ! Get Vector to be transformed; distribute last element and norm of
616
617
             ! remaining elements to all procs in current column

Pavel Kus's avatar
Pavel Kus committed
618
             vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
619
620
621
622
623
624

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
625
               aux1(2) = 0.0_rck
626
627
628
             endif

#ifdef WITH_MPI
629
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
630
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
631
                                MPI_SUM, mpi_comm_rows, mpierr)
632
             if (wantDebug) call obj%timer%stop("mpi_communication")
633
634
635

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
636
#endif
637

638
#if REALCASE == 1
639
             vnorm2 = aux2(1)
640
641
642
643
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
644
645
646
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
647
648
649
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
650
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
651
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
652
             ! Scale vr and store Householder Vector for back transformation
653
654
655

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
Pavel Kus's avatar
Pavel Kus committed
656
657
               a_mat(1:lr-1,lch) = vr(1:lr-1)
               a_mat(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
658
               vr(lr) = 1.0_rck
659
             else
Pavel Kus's avatar
Pavel Kus committed
660
               a_mat(1:lr,lch) = vr(1:lr)
661
662
663
664
             endif

           endif

665
           ! Broadcast Householder Vector and tau along columns
666
667
668

           vr(lr+1) = tau
#ifdef WITH_MPI
669
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
670
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
671
                          cur_pcol, mpi_comm_cols, mpierr)
672
           if (wantDebug) call obj%timer%stop("mpi_communication")
673
674

#endif /* WITH_MPI */
675

Andreas Marek's avatar
Andreas Marek committed
676
           if (useGPU_reduction_lower_block_to_tridiagonal) then
677
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
678
679
680
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
681
682
           tau = vr(lr+1)

683
684
685
686
687
688
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
689
           ! Transform remaining columns in current block with Householder Vector
690
691
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
692
           aux1 = 0.0_rck
693
694

#ifdef WITH_OPENMP
695
696
697
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
698
699
700
701
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
Pavel Kus's avatar
Pavel Kus committed
702
               aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
703
704
705
706
707
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
708
           if (wantDebug) call obj%timer%start("mpi_communication")
709
710
711
712
713
714
715
716
717
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
Pavel Kus's avatar
Pavel Kus committed
718
               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
719

720
721
722
             endif
           enddo

723

724
           if (wantDebug) call obj%timer%stop("mpi_communication")
725
726
727
728
729
730
731
732
733
734
735

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
Pavel Kus's avatar
Pavel Kus committed
736
               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
737
738
739
740
741
742
743
744
745
746
747
748
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
Pavel Kus's avatar
Pavel Kus committed
749
!               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
750

751
752
!             endif
!           enddo
753
#endif /* if 0 */
754

755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
Pavel Kus's avatar
Pavel Kus committed
772
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
773
774
775
776
777
778
779
780
781
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
782
           if (wantDebug) call obj%timer%start("mpi_communication")
783
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
784
                                           MPI_SUM, mpi_comm_rows, mpierr)
785
           if (wantDebug) call obj%timer%stop("mpi_communication")
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
805
                          a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
806
807
#endif
#if COMPLEXCASE == 1
Pavel Kus's avatar
Pavel Kus committed
808
                          a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
809
810
811
812
813
814
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
815
816
817
818
819
820
821
822

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
Pavel Kus's avatar
Pavel Kus committed
823
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
824
825
826
827
828
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
829
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
830
831
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
832
           if (wantDebug) call obj%timer%stop("mpi_communication")
833
834
835
836
837
838
839
840
841
842
843
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
844
               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
845
846
#endif
#if COMPLEXCASE == 1
Pavel Kus's avatar
Pavel Kus committed
847
               a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
848
849
850
851
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
852
853
         enddo ! lc

Andreas Marek's avatar
Andreas Marek committed
854
         if (useGPU_reduction_lower_block_to_tridiagonal) then
855
856
857
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
858
             successCUDA = cuda_memcpy2d((a_dev+        &
859
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
Pavel Kus's avatar
Pavel Kus committed
860
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
861
862
863
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
864
                                         int(cudaMemcpyHostToDevice,kind=c_int))
865

866
             if (.not.(successCUDA)) then
867
               print *, "bandred_&
868
869
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
870
               stop 1
871
             endif
872
873
874
875
876
877
878
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
879
         call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
880
         if (useGPU_reduction_lower_block_to_tridiagonal) then
881
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
882
883
884
885
886
887
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
888
                           n_cols, l_rows, ONE, &
889
890
891
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

Andreas Marek's avatar
Andreas Marek committed
892
         else ! useGPU_reduction_to_tridiagonal
893
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
894
895
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
896
897
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
898
             call PRECISION_HERK('U', 'C',           &
899
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
900
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
901
         endif
902
         call obj%timer%stop("blas")
903
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
904
         call symm_matrix_allreduce_&
905
906
#endif
#if COMPLEXCASE == 1
907
         call herm_matrix_allreduce_&
908
#endif
Andreas Marek's avatar
Andreas Marek committed
909
         &PRECISION &
910
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
911
         ! Calculate triangular matrix T for block Householder Transformation
912
         call obj%timer%start("blas")
913
914
915
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
Pavel Kus's avatar
Pavel Kus committed
916
             call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
Andreas Marek's avatar
Andreas Marek committed
917
                                 n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
918
919

#if REALCASE == 1
920
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
921
922
923
924
#endif
#if COMPLEXCASE == 1
             tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
#endif
925
926
           endif
         enddo
927
         call obj%timer%stop("blas")
928
929
930
#if REALCASE == 1
       endif !useQR
#endif
Andreas Marek's avatar
Andreas Marek committed
931
932
933
934
935
936
937
938
939
940

#if REALCASE == 1
       if (useGPU .and. useQR) then
         ! copy the data for furhter usage
         ! qr worked on *CPU arrarys
         !vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
         cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
         if (my_pcol == cur_pcol) then
           successCUDA = cuda_memcpy2d((a_dev+        &
                                       int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
Pavel Kus's avatar
Pavel Kus committed
941
                                       int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
Andreas Marek's avatar
Andreas Marek committed
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
                                       int(lda*size_of_datatype,kind=c_intptr_t),           &
                                       int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                       int((lc_end - lc_start+1),kind=c_intptr_t), &
                                       int(cudaMemcpyHostToDevice,kind=c_int))

           if (.not.(successCUDA)) then
             print *, "bandred_&
                      &MATH_DATATYPE&
                      &: cuda memcpy a_dev  failed ", istat
             stop 1
           endif
         endif

       endif
#endif

958
959
       ! Transpose vmr -> vmc (stored in umc, second half)
       if (useGPU) then
960
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
961
962
963
              &MATH_DATATYPE&
              &_&
              &PRECISION &
964
965
                           (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
                            umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
Andreas Marek's avatar
Andreas Marek committed
966
                            mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads)
967
       else ! useGPU
968
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
969
970
971
              &MATH_DATATYPE&
              &_&
              &PRECISION &
972
                                           (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
973
                                            umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
Andreas Marek's avatar
Andreas Marek committed
974
                                            1, istep*nbw, n_cols, nblk, max_threads)
975
976
977
978
979
980
       endif

       ! Calculate umc = A**T * vmr
       ! Note that the distributed A has to be transposed
       ! Opposed to direct tridiagonalization there is no need to use the cache locality
       ! of the tiles, so we can use strips of the matrix
Andreas Marek's avatar
Andreas Marek committed
981
982


983
984
#if 0
       ! original complex implemetation check for performance
Pavel Kus's avatar
Pavel Kus committed
985
986
       umcCPU(1:l_cols,1:n_cols) = </