elpa2_bandred_template.X90 75.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

64
#if REALCASE == 1
65
    subroutine bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
66
                            tmat, tmat_dev, wantDebug, useGPU, success, useQR)
67
68
69
70
71
#endif
#if COMPLEXCASE == 1
subroutine bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, numBlocks,  &
                           mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, useGPU, success)
#endif
72
  !-------------------------------------------------------------------------------
73
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef HAVE_DETAILED_TIMINGS
      use timings
Andreas Marek's avatar
Andreas Marek committed
106
107
#else
      use timings_dummy
108
109
110
111
112
113
114
#endif
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
      implicit none

115
116
117
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#if REALCASE == 1
118
#ifdef USE_ASSUMED_SIZE
119
      real(kind=REAL_DATATYPE)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      real(kind=REAL_DATATYPE)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
123
124
125
126
127
128
129
130
131
132
133
134
135
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,*), tmat(nbw,nbw,*)
#else
      complex(kind=COMPLEX_DATATYPE)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=COMPLEX_DATATYPE), parameter   :: CZERO = (0.0_rk8, 0.0_rk8), CONE = (1.0_rk8, 0.0_rk8)
#else
      complex(kind=COMPLEX_DATATYPE), parameter   :: CZERO = (0.0_rk4, 0.0_rk4), CONE = (1.0_rk4, 0.0_rk4)
#endif
#endif /* COMPLEXCASE == 1 */
136

137
138
139
140
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: eps
#endif
      logical, intent(in)                         :: useGPU
141

142
143
144
145
146
147
148
149
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
      integer(kind=ik)                            :: vmrCols, mynlc
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
150

151
152
153
154
155
156
157
158
159
      real(kind=REAL_DATATYPE)                    :: vnorm2
#if REALCASE == 1
      real(kind=REAL_DATATYPE)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE)              :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

      complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:,:), vr(:), vmrCPU(:,:), umcCPU(:,:)
#endif
160

161
162
163
164
165
166
167
168
169
170
171
172
#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable       :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=REAL_DATATYPE), allocatable       :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=REAL_DATATYPE), allocatable       :: vr(:)
#endif

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
      real(kind=REAL_DATATYPE)                    :: dwork_size(1)
      real(kind=REAL_DATATYPE), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
#endif
173
      ! a_dev is passed from bandred_real to trans_ev_band
174
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
175
#ifdef WITH_MPI
176
177
178
179
180
181
182
183
184
185
186
187
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
      integer(kind=c_size_t)                      :: lc_start, lc_end
#if COMPLEXCASE == 1
      integer(kind=c_size_t)                      :: lce_1, lcs_1, lre_1
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
188
189
#endif

190
191
192
193
194
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
195

196
197
198
199
200
201
202
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
#if REALCASE == 1
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                    ii, pp, transformChunkSize
#endif
Andreas Marek's avatar
Andreas Marek committed
203

204
#if REALCASE == 1
205
      call timer%start("bandred_real" // PRECISION_SUFFIX)
206
207
208
209
#endif
#if COMPLEXCASE == 1
      call timer%start("bandred_complex" // PRECISION_SUFFIX)
#endif
210
211
212
213
214
215
      call timer%start("mpi_communication")

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
216

217
218
219
220
221
222
223
224
      call timer%stop("mpi_communication")
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
225
#if REALCASE == 1
226
227
            write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
            write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
228
229
230
231
232
#endif
#if COMPLEXCASE == 1
            write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
            write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
#endif
233
234
235
236
237
238
239
240
241
242
          endif
          success = .false.
          return
        endif
      endif

! na_rows in used nowhere; only na_cols
      if (useGPU) then
#ifdef WITH_MPI
!        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
243
244
245
#if COMPLEXCASE == 1
         na_rows = numroc(na, nblk, my_prow, 0, np_rows)
#endif
246
247
248
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
!        na_rows = na
249
250
251
#if COMPLEXCASE == 1
         na_rows = na
#endif
252
        na_cols = na
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
#endif

#if REALCASE == 1
        ! Here we convert the regular host array into a pinned host array
        successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_PRECISION_real)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
        endif
#endif
#if COMPLEXCASE == 1
        successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_PRECISION_complex)
        if (.not.(successCUDA)) then
          print *, "bandred_complex:  cuda malloc failed a_dev ", istat
          stop
        endif
#endif

#if REALCASE == 1
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_PRECISION_real)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
        endif
#endif
#if COMPLEXCASE == 1
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_PRECISION_complex)
        if (.not.(successCUDA)) then
          print *, " bandred_complex: cuda malloc failed tmat_dev ", istat
          stop
        endif

#endif

#if REALCASE == 1
        successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_PRECISION_real)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
        endif
#endif
#if COMPLEXCASE == 1
        successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_PRECISION_complex)
        if (.not.(successCUDA)) then
          print *, "bandred_complex:  cuda malloc failed vav_dev ", istat
          stop
        endif

301
302
303
304
305
306
307
308
309
310
311
#endif
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

312
#if REALCASE == 1
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
      if (useQR) then

        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
          stop
        endif

        if (which_qr_decomposition == 1) then
          call qr_pqrparam_init(pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
            stop
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
            stop
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
            stop
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
344
          call qr_pdgeqrf_2dcomm_PRECISION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
345
346
347
348
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
349
          call qr_pdgeqrf_2dcomm_PRECISION(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
350
351
352
353
354
355
356
357
358
359
360
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

          work_size = dwork_size(1)
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
            stop
          endif
361
          work_blocked = CONST_0_0
362
363
364
365
366
367
368
369
370
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
            stop
          endif

        endif ! which_qr_decomposition

      endif ! useQr
371
#endif /* useQr */
372
373

      if (useGPU) then
374
375
376
377
378
!#if !defined(USE_ASSUMED_SIZE)
!        if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
!          print *,"bandred_complex: sizes of a wrong ? ",lda,size(a,dim=1),na_cols,size(a,dim=2)
!        endif
!#endif
379
380
381

        cur_l_rows = 0
        cur_l_cols = 0
382
#if REALCASE == 1
383
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
384
385
386
387
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMemcpy"
          stop
        endif
388
389
390
391
392
393
394
395
#endif
#if COMPLEXCASE == 1
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *, "bandred_complex:  cuda memcpy faild a_dev ", istat
          stop
        endif
#endif
396
397
398
399
400
401
402
403
404
405
406
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

407
408
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

409
410
411
412
413
414
415
416
417
418
419
420
421
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
422
#if REALCASE == 1
423
                print *,"bandred_real: error when deallocating vr "//errorMessage
424
425
426
427
#endif
#if COMPLEXCASE == 1
                print *,"bandred_complex: error when deallocating vr "//errorMessage
#endif
428
429
430
431
432
                stop
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
433
#if REALCASE == 1
434
              print *,"bandred_real: error when allocating vr "//errorMessage
435
436
437
438
#endif
#if COMPLEXCASE == 1
              print *,"bandred_complex: error when allocating vr "//errorMessage
#endif
439
440
441
442
443
              stop
            endif

          endif

444
#if REALCASE == 1
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
                stop
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
                print *,"bandred_real: error in cuda_free"
                stop
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
              stop
            endif
465
            successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_PRECISION_real)
466
467
468
469
470
471
            if (.not.(successCUDA)) then
              print *,"bandred_real: error in cudaMalloc"
              stop
            endif

          endif
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
#endif

#if COMPLEXCASE == 1
          if ((.not. allocated(vmrCPU)) .or. (vmr_size .gt. ubound(vmrCPU, dim=1))) then
            if (allocated(vmrCPU)) then
              deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_complex: error when deallocating vmrCPU "//errorMessage
                stop
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA))then
                print *,"bandred_complex: error in cudaFree"
                stop
              endif
            endif

            allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating vmrCPU "//errorMessage
              stop
            endif

            if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then
              print *,"bandred_complex: vmc_size ",max(l_rows,1) * 2*n_cols,vmr_size
            endif

            successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_PRECISION_complex)
            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuda malloc failed vmr_dev ", istat
              stop
            endif

          endif
#endif
508

509
#if REALCASE == 1
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
                stop
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
                 print *,"bandred_real: error in cudaFree"
                 stop
              endif

            endif
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
              stop
            endif
530

531
            successCUDA = cuda_malloc(umc_dev, umc_size*size_of_PRECISION_real)
532
533
534
535
536
537
            if (.not.(successCUDA)) then
              print *,"bandred_real: error in cudaMalloc"
              stop
            endif

          endif
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
          if ((.not. allocated(umcCPU)) .or. (umc_size .gt. ubound(umcCPU, dim=1))) then
            if (allocated(umcCPU)) then
              deallocate(umcCPU, stat=istat, errmsg=errorMessage)


              if (istat .ne. 0) then
                print *,"bandred_complex: error when allocating umcCPU "//errorMessage
                stop
              endif
              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA))then
                print *,"bandred_complex: error in cudaFree"
                stop
              endif
            endif

            allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_complex: error when allocating umcCPU "//errorMessage
              stop
            endif

            if (max(l_cols,1) * 2*n_cols .gt. umc_size) then
              print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size
            endif
            successCUDA = cuda_malloc(umc_dev, umc_size*size_of_PRECISION_complex)
            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuda malloc failed umc_dev ", istat
              stop
            endif
          endif
#endif
573
574
575

        else ! GPU not used

576
577
578
579
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
#if REALCASE == 1
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
580
581
582
583
584
585

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
            stop
          endif
586
587
588
589
590
591
592
593
594
#endif
#if COMPLEXCASE == 1
          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)

          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating vmrCPU "//errorMessage
            stop
          endif
#endif
595

596
#if REALCASE == 1
597
598
599
600
601
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating umcCPU "//errorMessage
            stop
          endif
602
603
604
605
606
607
608
609
#endif
#if COMPLEXCASE == 1
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating umcCPU "//errorMessage
            stop
          endif
#endif
610
611
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
612
#if REALCASE == 1
613
            print *,"bandred_real: error when allocating vr "//errorMessage
614
615
616
617
#endif
#if COMPLEXCASE == 1
            print *,"bandred_complex: error when allocating vr "//errorMessage
#endif
618
619
620
621
622
623
            stop
          endif

        endif ! use GPU

        if (useGPU) then
624
#if REALCASE == 1
625
          vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
626
#endif
627
        else
628
#if REALCASE == 1
629
          vmrCPU(1:l_rows,1:n_cols) = CONST_0_0
630
631
632
633
634
635
636
#endif
#if COMPLEXCASE == 1
          vmrCPU(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif
        endif ! useGPU

#if REALCASE == 1
637
638
        vr(:) = CONST_0_0
        tmat(:,:,istep) = CONST_0_0
639
640
641
642
643
#endif
#if COMPLEXCASE == 1
        vr(:) = CONST_COMPLEX_0_0
        tmat(:,:,istep) = CONST_COMPLEX_0_0
#endif
644
        if (useGPU) then
645
#if REALCASE == 1
646
          umcCUDA(1 : umc_size) = CONST_0_0
647
#endif
648
649
650
651
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

652
          if (lc_start .le. 0) lc_start = 1
653
654
655
656
657

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

          if(my_pcol == cur_pcol) then
658
#if REALCASE == 1
659
660
661
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*size_of_PRECISION_real,         &
                                       (a_dev + ((lc_start-1) * lda*size_of_PRECISION_real)),    &
                                       lda*size_of_PRECISION_real, lr_end*size_of_PRECISION_real, &
662
                                       (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
663
664
665
666
667
668
669
670
#endif
#if COMPLEXCASE == 1
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_PRECISION_complex,kind=c_size_t),            &
                                        (a_dev + int( ( (lc_start-1) * lda*size_of_PRECISION_complex),kind=c_size_t )),      &
                                        int(lda*size_of_PRECISION_complex,kind=c_size_t),              &
                                    int(lr_end*size_of_PRECISION_complex,kind=c_size_t),               &
                                      int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif
671
            if (.not.(successCUDA)) then
672
#if REALCASE == 1
673
              print *,"bandred_real: error in cudaMemcpy2d"
674
675
676
677
#endif
#if COMPLEXCASE == 1
              print *, "bandred_complex: error in cudaMemcpy2"
#endif
678
679
680
681
682
683
684
              stop
            endif

          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
685
#if REALCASE == 1
686
687
688
689
        if (useQR) then
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
690
            call qr_pdgeqrf_2dcomm_PRECISION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
691
692
693
694
695
696
697
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
698
            call qr_pdgeqrf_2dcomm_PRECISION(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
699
700
701
702
703
704
705
706
707
708
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
709
#endif /* REALCASE == 1 */
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
         do lc = n_cols, 1, -1

           ncol = istep*nbw + lc ! absolute column number of householder vector
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

             ! Get vector to be transformed; distribute last element and norm of
             ! remaining elements to all procs in current column

             vr(1:lr) = a(1:lr,lch) ! vector to be transformed

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
736
#if REALCASE == 1
737
               aux1(2) = CONST_0_0
738
739
740
741
#endif
#if COMPLEXCASE == 1
               aux1(2) = CONST_COMPLEX_0_0
#endif
742
743
744
745
             endif

#ifdef WITH_MPI
             call timer%start("mpi_communication")
746
#if REALCASE == 1
747
             call mpi_allreduce(aux1, aux2, 2, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
748
749
750
751
#endif
#if COMPLEXCASE == 1
             call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#endif
752
753
754
755
             call timer%stop("mpi_communication")

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
756
#endif
757
758
759
760
761

             vnorm2 = aux2(1)
             vrl    = aux2(2)

             ! Householder transformation
762
#if REALCASE == 1
763
             call hh_transform_real_PRECISION(vrl, vnorm2, xf, tau)
764
765
766
767
#endif
#if COMPLEXCASE == 1
             call hh_transform_complex_PRECISION(vrl, vnorm2, xf, tau)
#endif
768
769
770
771
772
773
             ! Scale vr and store Householder vector for back transformation

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
774
#if REALCASE == 1
775
               vr(lr) = CONST_1_0
776
777
778
779
#endif
#if COMPLEXCASE == 1
               vr(lr) = CONST_COMPLEX_1_0
#endif
780
781
782
783
784
785
786
787
788
789
790
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

           ! Broadcast Householder vector and tau along columns

           vr(lr+1) = tau
#ifdef WITH_MPI
           call timer%start("mpi_communication")
791
#if REALCASE == 1
792
           call MPI_Bcast(vr, lr+1, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
793
794
795
796
#endif
#if COMPLEXCASE == 1
           call MPI_Bcast(vr, lr+1, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
#endif
797
798
799
           call timer%stop("mpi_communication")

#endif /* WITH_MPI */
800
801

#if REALCASE == 1
802
803
804
805
806
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
807
808
809
810
#endif
#if COMPLEXCASE == 1
           vmrCPU(1:lr,lc) = vr(1:lr)
#endif
811
812
           tau = vr(lr+1)

813
814
815
816
817
818
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
819
820
821
           ! Transform remaining columns in current block with Householder vector
           ! Local dot product

822
#if REALCASE == 1
823
           aux1 = 0
824
825
826
827
#endif
#if COMPLEXCASE == 1
          aux1 = CONST_COMPLEX_0_0
#endif
828

829
#if REALCASE == 1
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
#ifdef WITH_OPENMP
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
           call timer%start("mpi_communication")
859
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
           call timer%stop("mpi_communication")
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
           call timer%start("mpi_communication")
899
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
           call timer%stop("mpi_communication")
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
             endif
           enddo
#endif /* WITH_OPENMP */
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
           call timer%start("mpi_communication")
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
             endif
           enddo

           call timer%stop("mpi_communication")

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!             endif
!           enddo
#endif
972
973
974
975
976
977
         enddo ! lc

         if (useGPU) then
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
978
#if REALCASE == 1
979
980
981
             successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*size_of_PRECISION_real)),          &
                                          lda*size_of_PRECISION_real, loc(a(1, lc_start)),           &
                                          lda*size_of_PRECISION_real,  lr_end*size_of_PRECISION_real, &
982
983
984
985
986
                                          (lc_end - lc_start+1),cudaMemcpyHostToDevice)
             if (.not.(successCUDA)) then
               print *,"bandred_real: error in cudaMemcpy2d"
               stop
             endif
987
988
989
990
991
992
993
994
995
996
997
998
999
#endif
#if COMPLEXCASE == 1
             successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_PRECISION_complex),kind=c_size_t)),    &
                                        int(lda*size_of_PRECISION_complex,kind=c_size_t), loc(a(1,lc_start)),       &
                                        int(lda*size_of_PRECISION_complex,kind=c_size_t),                           &
                                        int(lr_end*size_of_PRECISION_complex,kind=c_size_t),                        &
                                        int((lc_end - lc_start+1),kind=c_size_t) &
                                        ,int(cudaMemcpyHostToDevice,kind=c_int))
             if (.not.(successCUDA)) then
               print *, "bandred_complex: cuda memcpy a_dev  failed ", istat
               stop
             endif
#endif
1000
1001
1002
1003
1004
1005
1006
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
1007
	 call timer%start("blas")
1008
#if REALCASE == 1
1009
1010
         if (useGPU) then
           if (l_rows>0) &
1011
             call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCUDA, cur_l_rows, CONST_0_0, vav, ubound(vav,dim=1))
1012
1013
         else
           if (l_rows>0) &
1014
             call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCPU, ubound(vmrCPU,dim=1), CONST_0_0, vav, ubound(vav,dim=1))
1015
         endif
1016
1017
1018
1019
1020
1021
1022
1023
#endif
#if COMPLEXCASE == 1
        if (l_rows>0) then
          call timer%start("blas")
          call PRECISION_HERK('U', 'C', n_cols, l_rows, CONE, vmrCPU, ubound(vmrCPU,dim=1), CZERO, vav, ubound(vav,dim=1))
          call timer%stop("blas")
        endif
#endif
1024
	 call timer%stop("blas")
1025
#if REALCASE == 1
1026
         call symm_matrix_allreduce_PRECISION(n_cols,vav, nbw, nbw,mpi_comm_rows)
1027
1028
1029
1030
#endif
#if COMPLEXCASE == 1
         call herm_matrix_allreduce_PRECISION(n_cols,vav, nbw,nbw,mpi_comm_rows)
#endif
1031
         ! Calculate triangular matrix T for block Householder Transformation
1032
	 call timer%start("blas")
1033
1034
1035
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
1036
#if REALCASE == 1
1037
             call PRECISION_TRMV('U', 'T', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
1038
1039
1040
1041
1042
1043
#endif
#if COMPLEXCASE == 1
             call PRECISION_TRMV('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif

#if REALCASE == 1
1044
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
1045
1046
1047
1048
#endif
#if COMPLEXCASE == 1
             tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
#endif
1049
1050
           endif
         enddo
1051
 	 call timer%stop("blas")
1052
1053
1054
#if REALCASE == 1
       endif !useQR
#endif
1055
       ! Transpose vmr -> vmc (stored in umc, second half)
1056
#if REALCASE == 1
1057
       if (useGPU) then
1058
         call elpa_transpose_vectors_real_PRECISION  (vmrCUDA, cur_l_rows, mpi_comm_rows, &
1059
1060
1061
                                            umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
       else
1062
         call elpa_transpose_vectors_real_PRECISION  (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
1063
1064
1065
                                            umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
       endif
1066
1067
1068
1069
1070
1071
#endif
#if COMPLEXCASE == 1
       call elpa_transpose_vectors_complex_PRECISION  (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                      umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                      1, istep*nbw, n_cols, nblk)
#endif
1072
1073
1074
1075
1076

       ! Calculate umc = A**T * vmr
       ! Note that the distributed A has to be transposed
       ! Opposed to direct tridiagonalization there is no need to use the cache locality
       ! of the tiles, so we can use strips of the matrix
1077
#if REALCASE == 1
1078
1079
1080
       ! here the GPU version and CPU version diverged substantially, due to the newest
       ! optimizations due to Intel. The GPU version has to be re-written
       if (useGPU) then
1081
1082
         umcCUDA(1 : l_cols * n_cols) = CONST_0_0
         vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = CONST_0_0
1083
1084

         if (l_cols>0 .and. l_rows>0) then
1085
           successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_PRECISION_real,cudaMemcpyHostToDevice)
1086
1087
1088
1089
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
1090
           successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real,cudaMemcpyHostToDevice)
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif

           do i=0,(istep*nbw-1)/tile_size

             lcs = i*l_cols_tile+1
             lce = min(l_cols,(i+1)*l_cols_tile)
             if (lce<lcs) cycle
1101
             call timer%start("cublas")
1102
             lre = min(l_rows,(i+1)*l_rows_tile)
1103
             call cublas_PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, &
1104
1105
                               CONST_1_0, (a_dev + ((lcs-1)*lda*size_of_PRECISION_real)), lda, vmr_dev,cur_l_rows, &
                               CONST_1_0, (umc_dev+ (lcs-1)*size_of_PRECISION_real), cur_l_cols)
1106
1107
             if(i==0) cycle
             lre = min(l_rows,i*l_rows_tile)
1108
             call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1,&
1109
1110
1111
                               CONST_1_0, (a_dev+ ((lcs-1)*lda*size_of_PRECISION_real)), lda,                  &
                               (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_real), cur_l_cols, &
                               CONST_1_0, (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_real), cur_l_rows)
1112
             call timer%stop("cublas")
1113
           enddo
1114
           successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev,vmr_size*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
1115
1116
1117
1118
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
1119
           successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif

         endif ! l_cols>0 .and. l_rows>0

       else ! do not useGPU version

         !Code for Algorithm 4

         n_way = 1
#ifdef WITH_OPENMP
         n_way = omp_get_max_threads()
#endif
1135
1136
         !umcCPU(1:l_cols,1:n_cols) = 0.d0
         !vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0
1137
1138
1139
1140
1141
1142
#ifdef WITH_OPENMP
         !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
         if (n_way > 1) then
           !$omp do
           do i=1,min(l_cols_tile, l_cols)
1143
             umcCPU(i,1:n_cols) = CONST_0_0
1144
1145
1146
1147
           enddo

           !$omp do
           do i=1,l_rows
1148
             vmrCPU(i,n_cols+1:2*n_cols) = CONST_0_0
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
           enddo
           if (l_cols>0 .and. l_rows>0) then

             !SYMM variant 4
             !Partitioned Matrix Expression:
             ! Ct = Atl Bt + Atr Bb
             ! Cb = Atr' Bt + Abl Bb
             !
             !Loop invariant:
             ! Ct = Atl Bt + Atr Bb
             !
             !Update:
             ! C1 = A10'B0 + A11B1 + A21 B2
             !
             !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls
             !is easily parallelized, and regardless of choise of algorithm,
             !the startup cost for parallelizing the dgemms inside the loop is too great

             !$omp do schedule(static,1)
             do i=0,(istep*nbw-1)/tile_size
               lcs = i*l_cols_tile+1                   ! local column start
               lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

               lrs = i*l_rows_tile+1                   ! local row start
               lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

               !C1 += [A11 A12] [B1
               !                 B2]
               if ( lre > lrs .and. l_cols > lcs ) then
1178
	         call timer%start("blas")
1179
1180
                 call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1,          &
                            CONST_1_0, a(lrs,lcs), ubound(a,dim=1),                 &
1181
                                  umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1),  &
1182
                            CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
1183
	         call timer%stop("blas")
1184
1185
1186
1187
               endif

               ! C1 += A10' B0
               if ( lce > lcs .and. i > 0 ) then
1188
	       	 call timer%start("blas")
1189
1190
                 call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1,           &
                            CONST_1_0, a(1,lcs),   ubound(a,dim=1),           &
1191
                                  vmrCPU(1,1),   ubound(vmrCPU,dim=1),   &
1192
                            CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
1193
	       	 call timer%stop("blas")
1194
1195
1196
1197
               endif
             enddo
           endif ! l_cols>0 .and. l_rows>0
         else ! n_way > 1
1198
1199
           umcCPU(1:l_cols,1:n_cols) = CONST_0_0
           vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_0_0
1200
1201
1202
1203
1204
1205
           if (l_cols>0 .and. l_rows>0) then
             do i=0,(istep*nbw-1)/tile_size

               lcs = i*l_cols_tile+1
               lce = min(l_cols,(i+1)*l_cols_tile)
               if (lce<lcs) cycle
1206
	       call timer%start("blas")
1207
               lre = min(l_rows,(i+1)*l_rows_tile)
1208
1209
               call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, CONST_1_0, a(1,lcs), ubound(a,dim=1), &
                            vmrCPU, ubound(vmrCPU,dim=1), CONST_1_0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
1210
	       call timer%stop("blas")
1211
1212
               if (i==0) cycle
                 lre = min(l_rows,i*l_rows_tile)
1213
	       	 call timer%start("blas")
1214
1215
                 call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, CONST_1_0, a(1,lcs), lda, &
                            umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), CONST_1_0, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
1216
	       	 call timer%stop("blas")
1217
1218
1219
1220
1221
1222
1223
             enddo
           endif
         endif ! n_way > 1
#ifdef WITH_OPENMP
        !$omp end parallel
#endif
       endif ! do not useGPU version
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
        umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
        vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
        if (l_cols>0 .and. l_rows>0) then
          if (useGPU) then
            if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
              print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
              stop
            endif
            successCUDA = cuda_memcpy(vmr_dev, loc(vmrCPU(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)

            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuda memcpy vmr_dev failed ", istat
              stop
            endif
            if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
              print *,"bandred_complex: umc size 2 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
              stop
            endif
            successCUDA = cuda_memcpy(umc_dev, loc(umcCPU(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuda memcpy umc_dev failed  ", istat
              stop
            endif
          endif
          do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)

            if (useGPU) then
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, CONE, (a_dev + ((lcs-1)*lda* &
                        size_of_PRECISION_complex)), lda, &
                        vmr_dev, cur_l_rows, CONE, (umc_dev +(lcs-1)*size_of_PRECISION_complex), cur_l_cols)
              call timer%stop("cublas")
            else
              call timer%start("blas")
              call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, CONE, a(1,lcs), ubound(a,dim=1), &
                         vmrCPU, ubound(vmrCPU,dim=1), CONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
              call timer%stop("blas")
            endif

            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            if (useGPU) then
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, (a_dev+((lcs-1)*lda* &
                        size_of_PRECISION_complex)),lda,  &
                        (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_complex), cur_l_cols,CONE,  &
                        (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_complex), cur_l_rows)
              call timer%stop("cublas")
            else
              call timer%start("blas")
              call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, a(1,lcs), lda, &
                         umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), CONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
              call timer%stop("blas")
            endif
          enddo

          if (useGPU) then
            if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
              print *,"bandred_complex: vmr size 3 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
              stop
            endif
            successCUDA = cuda_memcpy(loc(vmrCPU(1,1)),vmr_dev,vmr_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuad memcpy failed vmrCPU ", istat
              stop
            endif
            if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
              print *,"bandred_complex: umc size 3 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
              stop
            endif
            successCUDA = cuda_memcpy(loc(umcCPU(1,1)), umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
            if (.not.(successCUDA)) then
              print *, "bandred_complex:  cuad memcpy failed umcCPU ", istat
              stop
            endif
          endif ! useGPU
        endif ! (l_cols>0 .and. l_rows>0) 
#endif /* COMPLEXCASE == 1 */
1311
1312
1313
1314
1315
1316

       ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
       ! on the processors containing the diagonal
       ! This is only necessary if ur has been calculated, i.e. if the
       ! global tile size is smaller than the global remaining matrix

1317
#if REALCASE == 1
1318
1319
1320
1321
       if (useGPU) then
         ! here the GPU version and CPU version divereged due to the same reasons as above

         if (tile_size < istep*nbw) then
1322
           call elpa_reduce_add_vectors_real_PRECISION  (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
                                               umcCUDA, cur_l_cols, mpi_comm_cols, &
                                               istep*nbw, n_cols, nblk)
         endif

         if (l_cols>0) then
           allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_real: error when allocating tmpCUDA "//errorMessage
             stop
           endif

#ifdef WITH_MPI
           call timer%start("mpi_communication")

1337
           call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, ierr)
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
           umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
           call timer%stop("mpi_communication")
#else /* WITH_MPI */

!           tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)

#endif /* WITH_MPI */

           if (allocated(tmpCUDA)) then
             deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
             if (istat .ne. 0) then
               print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage
               stop
             endif
           endif
         endif ! l_cols

         ! U = U * Tmat**T
1356
         successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
1357
1358
1359
1360
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
1361
         successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
1362
1363
1364
1365
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
1366
	 call timer%start("cublas")
1367
         call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
1368
                           CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols)
1369
1370
	 call timer%start("cublas")

1371
         ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
1372
         successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
1373
1374
1375
1376
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
1377
1378
	 call timer%start("cublas")

1379
         call cublas_PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, &
1380
1381
                           CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, &
                           CONST_0_0, vav_dev, nbw)
1382

1383
         call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
1384
                           CONST_1_0, tmat_dev, nbw, vav_dev, nbw)
1385
	 call timer%stop("cublas")
1386

1387
         successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
1388
1389
1390
1391
1392
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif

1393
         call symm_matrix_allreduce_PRECISION(n_cols,vav, nbw,nbw,mpi_comm_cols)
1394

1395
         successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
1396
1397
1398
1399
1400
1401
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif

         ! U = U - 0.5 * V * VAV
1402
1403
 	 call timer%start("cublas")

1404
         call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
1405
1406
                           -CONST_0_5, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, vav_dev,nbw,&
                           CONST_1_0, umc_dev, cur_l_cols)
1407
	 call timer%stop("cublas")
1408

1409
         successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
1410
1411
1412
1413
1414
1415
1416

         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif

         ! Transpose umc -> umr (stored in vmr, second half)
1417
         call elpa_transpose_vectors_real_PRECISION  (umcCUDA, cur_l_cols, mpi_comm_cols, &
1418
1419
1420
                                            vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
                                            1, istep*nbw, n_cols, nblk)

1421
         successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
1422
1423
1424
1425
1426
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif

1427
         successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real, cudaMemcpyHostToDevice)
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif

         ! A = A - V*U**T - U*V**T
         do i=0,(istep*nbw-1)/tile_size
           lcs = i*l_cols_tile+1
           lce = min(l_cols,(i+1)*l_cols_tile)
           lre = min(l_rows,(i+1)*l_rows_tile)
           if (lce<lcs .or. lre<1) cycle
1439
1440
	   call timer%start("cublas")

1441
           call cublas_PRECISION_GEMM('N', 'T', lre, lce-lcs+1, 2*n_cols, -CONST_1_0, &
1442
1443
                             vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*size_of_PRECISION_real), cur_l_cols, &
                             CONST_1_0, (a_dev+(lcs-1)*lda*size_of_PRECISION_real), lda)
1444
1445
	   call timer%stop("cublas")

1446
1447
1448
1449
1450
1451
         enddo

       else ! do not useGPU

         ! Or if we used the Algorithm 4
         if (tile_size < istep*nbw .or. n_way > 1) then
1452
           call elpa_reduce_add_vectors_real_PRECISION  (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
                                             umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
                                             istep*nbw, n_cols, nblk)
         endif

         if (l_cols>0) then
           allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_real: error when allocating tmpCPU "//errorMessage
             stop
           endif

#ifdef WITH_MPI
           call timer%start("mpi_communication")
1466
           call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
           umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
           call timer%stop("mpi_communication")
#else /* WITH_MPI */
!           tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
#endif /* WITH_MPI */

           deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_real: error when deallocating tmpCPU "//errorMessage
             stop
           endif
         endif

         ! U = U * Tmat**T
1481
1482
	 call timer%start("blas")

1483
         call PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, CONST_1_0, tmat(1,1,istep), ubound(tmat,dim=1), &
1484
1485
1486
1487
                    umcCPU, ubound(umcCPU,dim=1))

         ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

1488
1489
         call PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, CONST_1_0, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
                    ubound(umcCPU,dim=1), CONST_0_0, vav, ubound(vav,dim=1))
1490

1491
         call PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, CONST_1_0, tmat(1,1,istep),    &
1492
                    ubound(tmat,dim=1), vav, ubound(vav,dim=1))
1493
	 call timer%stop("blas")
1494
         call symm_matrix_allreduce_PRECISION(n_cols,vav, nbw, nbw ,mpi_comm_cols)
1495
1496

         ! U = U - 0.5 * V * VAV
1497
	 call timer%start("blas")
1498
1499
         call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, -CONST_0_5, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
                     ubound(vav,dim=1), CONST_1_0, umcCPU, ubound(umcCPU,dim=1))
1500
	 call timer%stop("blas")
1501
         ! Transpose umc -> umr (stored in vmr, second half)
1502
         call elpa_transpose_vectors_real_PRECISION(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
                                         vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                         1, istep*nbw, n_cols, nblk)

         ! A = A - V*U**T - U*V**T

#ifdef WITH_OPENMP
         !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend  )
         n_threads = omp_get_num_threads()
         if (mod(n_threads, 2) == 0) then
           n_way = 2
         else
           n_way = 1
         endif

         m_way = n_threads / n_way

         m_id = mod(omp_get_thread_num(),  m_way)
         n_id = omp_get_thread_num() / m_way

         do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way
           i = ii / tile_size
           lcs = i*l_cols_tile+1
           lce = min(l_cols,(i+1)*l_cols_tile)
           lre = min(l_rows,(i+1)*l_rows_tile)
           if (lce<lcs .or. lre<1) cycle

           !Figure out this thread's range
           work_per_thread = lre / m_way
           if (work_per_thread * m_way < lre) work_per_thread = work_per_thread + 1
           mystart = m_id * work_per_thread + 1
           myend   = mystart + work_per_thread - 1
           if ( myend > lre ) myend = lre
           if ( myend-mystart+1 < 1) cycle
1536
	   call timer%start("blas")
1537
           call PRECISION_GEMM('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -CONST_1_0, &
1538
                      vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
1539
                       CONST_1_0, a(mystart,lcs), ubound(a,1))
1540
       	   call timer%stop("blas")
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
         enddo
         !$omp end parallel

#else /* WITH_OPENMP */

         do i=0,(istep*nbw-1)/tile_size
           lcs = i*l_cols_tile+1
           lce = min(l_cols,(i+1)*l_cols_tile)
           lre = min(l_rows,(i+1)*l_rows_tile)
           if (lce<lcs .or. lre<1) cycle
1551
	   call timer%start("blas")
1552
           call PRECISION_GEMM('N', 'T', lre,lce-lcs+1, 2*n_cols, -CONST_1_0, &
1553
                       vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
1554
                       CONST_1_0, a(1,lcs), lda)
1555
	   call timer%stop("blas")
1556
1557
1558
1559
         enddo
#endif /* WITH_OPENMP */

       endif ! useGPU
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
        if (tile_size < istep*nbw) then
          call elpa_reduce_add_vectors_complex_PRECISION  (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
                                          umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
                                          istep*nbw, n_cols, nblk)
        endif
#ifdef WITH_MPI
        if (l_cols>0) then
          allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when allocating tmp "//errorMessage
            stop
          endif
          call timer%start("mpi_communication")
          call mpi_allreduce(umcCPU, tmp, l_cols*n_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
1578

1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
          umcCPU(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
          deallocate(tmp, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_complex: error when deallocating tmp "//errorMessage
            stop
          endif
        endif

#else /* WITH_MPI */

!        if (l_cols>0) then
!          allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
!          if (istat .ne. 0) then
!            print *,"bandred_complex: error when allocating tmp "//errorMessage
!            stop
!          endif
!          tmp(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
!
!          umcCPU(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
!          deallocate(tmp, stat=istat, errmsg=errorMessage)
!          if (istat .ne. 0) then
!            print *,"bandred_complex: error when deallocating tmp "//errorMessage
!            stop
!          endif
!        endif

#endif /* WITH_MPI */




        ! U = U * Tmat**T
        if (useGPU) then
          if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umcCPU size 4 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
            stop
          endif
          successCUDA = cuda_memcpy(umc_dev, loc(umcCPU(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuad memcpy failed umc_dev ", istat
            stop
          endif
          successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuad memcpy failed tmat_dev ", istat
            stop
          endif
          call timer%start("cublas")
          call  cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
          call timer%stop("cublas")
        else ! not useGPU
          call timer%start("blas")
          call PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
                     umcCPU, ubound(umcCPU,dim=1))
          call timer%stop("blas")
        endif

        ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
        if (useGPU) then
          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuad memcpy failed vav_dev ", istat
            stop
          endif
          call timer%start("cublas")
          call cublas_PRECISION_GEMM('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
                            *size_of_PRECISION_complex ), cur_l_cols, CZERO, vav_dev, nbw)

          call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
          call timer%stop("cublas")
          successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuad memcpy failed vav ", istat
            stop
          endif

          call herm_matrix_allreduce_PRECISION(n_cols,vav, nbw, nbw,mpi_comm_cols)

          successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuad memcpy failed vav_dev ", istat
            stop
          endif
        else ! useGPU
          call timer%start("blas")
          call PRECISION_GEMM('C', 'N', n_cols, n_cols, l_cols, CONE, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
                     ubound(umcCPU,dim=1), CZERO, vav, ubound(vav,dim=1))
          call PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
                     ubound(tmat,dim=1), vav, ubound(vav,dim=1))
          call timer%stop("blas")
          call herm_matrix_allreduce_PRECISION(n_cols,vav,nbw,nbw,mpi_comm_cols)
        endif

        ! U = U - 0.5 * V * VAV

        if (useGPU) then
          call timer%start("cublas")
          call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, CONST_COMPLEX_PAIR_NEGATIVE_0_5, (umc_dev +  &
                            (cur_l_cols * n_cols )*size_of_PRECISION_complex), &
                            cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
          call timer%stop("cublas")
          ! Transpose umc -> umr (stored in vmr, second half)

          if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umcCPU size 5 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
            stop
          endif
          successCUDA = cuda_memcpy(loc(umcCPU(1,1)),umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuda memcpy failed umcCPU ", istat
            stop
          endif
          call elpa_transpose_vectors_complex_PRECISION  (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
                                                vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                                1, istep*nbw, n_cols, nblk)
          if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
            print *,"bandred_complex: vmr size 4 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
            stop
          endif
          successCUDA = cuda_memcpy(vmr_dev,loc(vmrCPU(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuda memcpy failed vav_dev", istat
            stop
          endif

          if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
            print *,"bandred_complex: umcCPU size 6 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
            stop
          endif
          successCUDA = cuda_memcpy(umc_dev,loc(umcCPU(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
          if (.not.(successCUDA)) then
            print *, "bandred_complex:  cuda memcpy failed umc_dev ", istat
            stop
          endif
        else ! not useGPU
          call timer%start("blas")
          call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, CONST_COMPLEX_PAIR_NEGATIVE_0_5, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), &
                     vav, ubound(vav,dim=1), CONE, umcCPU, ubound(umcCPU,dim=1))
          call timer%stop("blas")
          ! Transpose umc -> umr (stored in vmr, second half)

          call elpa_transpose_vectors_complex_PRECISION  (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
                                                vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                                1, istep*nbw, n_cols, nblk)
        endif
        ! A = A - V*U**T - U*V**T

        do i=0,(istep*nbw-1)/tile_size
          lcs = i*l_cols_tile+1
          lce = min(l_cols,(i+1)*l_cols_tile)
          lre = min(l_rows,(i+1)*l_rows_tile)
          if (lce<lcs .or. lre<1) cycle
            if (useGPU) then
              call timer%start("cublas")
              call cublas_PRECISION_GEMM('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
                                vmr_dev ,cur_l_rows, (umc_dev +(lcs-1)*size_of_PRECISION_complex),cur_l_cols, &
                                CONE, (a_dev + (lcs-1)*lda*size_of_PRECISION_complex),lda)
              call timer%stop("cublas")
            else
              call timer%start("blas")
              call PRECISION_GEMM('N', 'C', lre,lce-lcs+1, 2*n_cols, -CONE, &
                         vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
                         CONE, a(1,lcs), lda)
              call timer%stop("blas")
            endif
          enddo
#endif
 
1747
1748
1749
1750
       if (.not.(useGPU)) then
         if (allocated(vr)) then
           deallocate(vr, stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
1751
#if REALCASE == 1
1752
             print *,"bandred_real: error when deallocating vr "//errorMessage
1753
1754
1755
1756
#endif
#if COMPLEXCASE == 1
             print *,"bandred_complex: error when deallocating vr "//errorMessage
#endif
1757
1758
1759
1760
1761
1762
1763
             stop
           endif
         endif

         if (allocated(umcCPU)) then
           deallocate(umcCPU, stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
1764
1765
1766
1767
1768
1769
#if REALCASE == 1
             print *,"bandred_real: error when deallocating umcCPU "//errorMessage
#endif
#if COMPLEXCASE == 1
             print *,"bandred_complex: error when deallocating umcCPU "//errorMessage
#endif
1770
1771
1772
1773
             stop
           endif
         endif

1774
#if REALCASE == 1
1775
1776
1777
1778
1779
1780
1781
         if (allocated(vmrCPU)) then
           deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
             stop
           endif
         endif
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
#endif
#if COMPLEXCASE == 1
         if (allocated(vmrCPU)) then
           deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_complex: error when deallocating vmrCPU "//errorMessage
             stop
           endif
         endif
#endif
1792
1793
1794
1795
1796
       endif !useGPU

     enddo ! istep

     if (useGPU) then
1797
1798
1799
1800
1801
!#if !(defined(USE_ASSUMED_SIZE))
!       if (size(a,dim=1)*size(a,dim=2) .ne. lda*na_cols) then
!         print *,"bandred_complex: size a ",size(a,dim=1)*size(a,dim=2) , lda*na_cols
!       endif
!#endif
1802
       ! this is not needed since a_dev is passed along from one subroutine to the other
1803
#if REALCASE == 1
1804
       successCUDA = cuda_memcpy ( loc (a), a_dev, lda*na_cols*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
1805
1806
1807
1808
#endif
#if COMPLEXCASE == 1
       successCUDA = cuda_memcpy ( loc(a(1,1)), a_dev, lda*na_cols*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
#endif
1809
       if (.not.(successCUDA)) then
1810
#if REALCASE == 1
1811
         print *,"bandred_real: error in cudaMemcpy"
1812
1813
1814
1815
#endif
#if COMPLEXCASE == 1
         print *, "bandred_complex:  cuad memcpy failed a ", istat
#endif
1816
1817
1818
1819
1820
         stop
       endif

       successCUDA = cuda_free(a_dev)
       if (.not.(successCUDA)) then
1821
#if REALCASE == 1
1822
         print *,"bandred_real: error in cudaFree"
1823
1824
1825
1826
#endif
#if COMPLEXCASE == 1
         print *, "bandred_complex:  cuad memcpy failed a ", istat
#endif
1827
1828
1829
1830
1831
1832
1833
1834
         stop
       endif

!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it arround
! this is not necessary tmat_dev is passed (unchanged) from one routine to the other
       successCUDA = cuda_free(tmat_dev)
       if (.not.(successCUDA)) then
1835
#if REALCASE == 1
1836
         print *,"bandred_real: error in cudaFree"