elpa2_compute.F90 384 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2_compute

! Version 1.1.2, 2011-02-21

69
  use ELPA_utilities
Andreas Marek's avatar
Andreas Marek committed
70
71
72
73
  USE ELPA1_compute
  use elpa1, only : elpa_print_times, time_evp_back, time_evp_fwd, time_evp_solve
  use elpa2_utilities
  use elpa_pdgeqrf
74
  use precision
75
  use elpa_mpi
Andreas Marek's avatar
Andreas Marek committed
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

  implicit none

  PRIVATE ! By default, all routines contained are private

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex

  public :: band_band_real
  public :: divide_band

94
  integer(kind=ik), public :: which_qr_decomposition = 1     ! defines, which QR-decomposition algorithm will be used
Andreas Marek's avatar
Andreas Marek committed
95
96
97
98
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
  contains

99
    subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
100
                            tmat, wantDebug, useGPU, success, useQR)
Andreas Marek's avatar
Andreas Marek committed
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

  !-------------------------------------------------------------------------------
  !  bandred_real: Reduces a distributed symmetric matrix to band form
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

131
132
      use cuda_functions
      use iso_c_binding
Andreas Marek's avatar
Andreas Marek committed
133
134

#ifdef HAVE_DETAILED_TIMINGS
135
      use timings
Andreas Marek's avatar
Andreas Marek committed
136
#endif
137
138
139
#ifdef WITH_OPENMP
      use omp_lib
#endif
Andreas Marek's avatar
Andreas Marek committed
140
      use precision
141
      implicit none
Andreas Marek's avatar
Andreas Marek committed
142

Andreas Marek's avatar
Andreas Marek committed
143
      integer(kind=ik)           :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
Andreas Marek's avatar
Andreas Marek committed
144
145
146
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
      real(kind=rk)              :: a(lda,*), tmat(nbw,nbw,*)
#else
Andreas Marek's avatar
Andreas Marek committed
147
      real(kind=rk)              :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
Andreas Marek's avatar
Andreas Marek committed
148
#endif
149
150
      real(kind=rk)              :: eps
      logical, intent(in)        :: useGPU
151

Andreas Marek's avatar
Andreas Marek committed
152
      integer(kind=ik)           :: my_prow, my_pcol, np_rows, np_cols, mpierr
153
      integer(kind=ik)           :: l_cols, l_rows, vmrCols
Andreas Marek's avatar
Andreas Marek committed
154
155
156
      integer(kind=ik)           :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
      integer(kind=ik)           :: istep, ncol, lch, lcx, nlc, mynlc
      integer(kind=ik)           :: tile_size, l_rows_tile, l_cols_tile
157

Andreas Marek's avatar
Andreas Marek committed
158
      real(kind=rk)              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
159

160
161
162
      real(kind=rk), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      real(kind=rk), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      real(kind=rk), allocatable :: vr(:)
163
      ! needed for blocked QR decomposition
Andreas Marek's avatar
Andreas Marek committed
164
165
166
      integer(kind=ik)           :: PQRPARAM(11), work_size
      real(kind=rk)              :: dwork_size(1)
      real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
Andreas Marek's avatar
Andreas Marek committed
167

168
      integer(kind=C_intptr_T)   :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
169
#ifdef WITH_MPI
170
      integer(kind=ik), external :: numroc
171
#endif
172
173
174
175
176
      integer(kind=ik)           :: ierr
      integer(kind=ik)           :: cur_l_rows, cur_l_cols, vmr_size, umc_size
      integer(kind=c_size_t)     :: lc_start, lc_end
      integer(kind=ik)           :: lr_end
      integer(kind=ik)           :: na_rows, na_cols
Andreas Marek's avatar
Andreas Marek committed
177

Andreas Marek's avatar
Andreas Marek committed
178
179
      logical, intent(in)        :: wantDebug
      logical, intent(out)       :: success
180
181
182
      logical                    :: successCUDA
      integer(kind=ik)           :: istat
      character(200)             :: errorMessage
Andreas Marek's avatar
Andreas Marek committed
183

Andreas Marek's avatar
Andreas Marek committed
184
      logical, intent(in)        :: useQR
Andreas Marek's avatar
Andreas Marek committed
185

Andreas Marek's avatar
Andreas Marek committed
186
      integer(kind=ik)           :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize
Andreas Marek's avatar
Andreas Marek committed
187
188

#ifdef HAVE_DETAILED_TIMINGS
189
      call timer%start("bandred_real")
Andreas Marek's avatar
Andreas Marek committed
190
#endif
191
192
193
194
195
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      success = .true.
Andreas Marek's avatar
Andreas Marek committed
196
197


198
199
200
201
202
203
204
205
206
207
208
      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
            write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
            write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
          endif
          success = .false.
          return
        endif
      endif
Andreas Marek's avatar
Andreas Marek committed
209

210
      if (useGPU) then
211
#ifdef WITH_MPI
212
213
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
214
215
216
217
#else
        na_rows = na
        na_cols = na
#endif
218
      endif
Andreas Marek's avatar
Andreas Marek committed
219

220
      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
Andreas Marek's avatar
Andreas Marek committed
221

222
223
      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
Andreas Marek's avatar
Andreas Marek committed
224

225
226
      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile
Andreas Marek's avatar
Andreas Marek committed
227

228
      if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
229

230
231
232
233
        if (useGPU) then
          print *,"qr decomposition at the moment not supported with GPU"
          stop
        endif
Andreas Marek's avatar
Andreas Marek committed
234

235
        if (which_qr_decomposition == 1) then
236
          call qr_pqrparam_init(pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
237
238
239
240
241
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
242

243
244
245
246
247
          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
248

249
          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
250
251
252
253
254
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
255

256
257
          vmrCols = na
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
258
          call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
259
260
261
262
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
263
          call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
264
265
266
267
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif
268
          work_size = dwork_size(1)
269
270
271
272
273
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
274

275
          work_blocked = 0.0_rk
276
277
278
279
280
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
281

282
        endif ! which_qr_decomposition
Andreas Marek's avatar
Andreas Marek committed
283

284
      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
285

286
287
288
289
290
291
      if (useGPU) then
        ! Here we convert the regular host array into a pinned host array
        successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_real_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
292
        endif
Andreas Marek's avatar
Andreas Marek committed
293

294
295
296
297
298
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_real_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
        endif
Andreas Marek's avatar
Andreas Marek committed
299

300
301
302
303
304
        successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_real_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMalloc"
          stop
        endif
Andreas Marek's avatar
Andreas Marek committed
305

306
307
        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
308

309
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_real_datatype, cudaMemcpyHostToDevice)
310
311
312
313
314
        if (.not.(successCUDA)) then
          print *,"bandred_real: error in cudaMemcpy"
          stop
        endif
      endif ! useGPU
Andreas Marek's avatar
Andreas Marek committed
315
316


317
      do istep = (na-1)/nbw, 1, -1
Andreas Marek's avatar
Andreas Marek committed
318

319
        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
Andreas Marek's avatar
Andreas Marek committed
320

321
322
323
        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
Andreas Marek's avatar
Andreas Marek committed
324

325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_real: error when deallocating vr "//errorMessage
                stop
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_real: error when allocating vr "//errorMessage
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
347

348
          endif
Andreas Marek's avatar
Andreas Marek committed
349

350
351
352
353
354
355
356
          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
                stop
              endif
Andreas Marek's avatar
Andreas Marek committed
357

358
359
360
361
362
363
              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
                print *,"bandred_real: error in cuda_free"
                stop
              endif
            endif
Andreas Marek's avatar
Andreas Marek committed
364

365
366
367
368
369
            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
370

371
372
373
374
375
            successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_real_datatype)
            if (.not.(successCUDA)) then
              print *,"bandred_real: error in cudaMalloc"
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
376

377
          endif
Andreas Marek's avatar
Andreas Marek committed
378

379
380
381
382
383
384
385
          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
                print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
                stop
              endif
Andreas Marek's avatar
Andreas Marek committed
386

387
388
389
390
391
              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
                 print *,"bandred_real: error in cudaFree"
                 stop
              endif
Andreas Marek's avatar
Andreas Marek committed
392

393
            endif
Andreas Marek's avatar
Andreas Marek committed
394

395
396
397
398
399
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
              print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
400

401
402
403
404
405
            successCUDA = cuda_malloc(umc_dev, umc_size*size_of_real_datatype)
            if (.not.(successCUDA)) then
              print *,"bandred_real: error in cudaMalloc"
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
406

407
408
409
          endif
        else ! GPU not used
          ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
Andreas Marek's avatar
Andreas Marek committed
410

411
412
413
414
415
          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
416

417
418
419
420
421
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating umcCPU "//errorMessage
            stop
          endif
Andreas Marek's avatar
Andreas Marek committed
422

423
424
425
426
427
428
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vr "//errorMessage
            stop
          endif
        endif ! use GPU
Andreas Marek's avatar
Andreas Marek committed
429

430
        if (useGPU) then
431
          vmrCUDA(1 : cur_l_rows * n_cols) = 0._rk
432
        else
433
          vmrCPU(1:l_rows,1:n_cols) = 0._rk
434
        endif
Andreas Marek's avatar
Andreas Marek committed
435

436
437
        vr(:) = 0
        tmat(:,:,istep) = 0
Andreas Marek's avatar
Andreas Marek committed
438

439
        if (useGPU) then
440
          umcCUDA(1 : umc_size) = 0._rk
Andreas Marek's avatar
Andreas Marek committed
441

442
443
444
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
Andreas Marek's avatar
Andreas Marek committed
445

446
          if(lc_start .le. 0) lc_start = 1
Andreas Marek's avatar
Andreas Marek committed
447

448
449
          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
Andreas Marek's avatar
Andreas Marek committed
450

451
          if(my_pcol == cur_pcol) then
Andreas Marek's avatar
Andreas Marek committed
452

453
454
455
456
457
458
459
460
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*size_of_real_datatype,         &
                                       (a_dev + ((lc_start-1) * lda*size_of_real_datatype)),    &
                                       lda*size_of_real_datatype, lr_end*size_of_real_datatype, &
                                       (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
            if (.not.(successCUDA)) then
              print *,"bandred_real: error in cudaMemcpy2d"
              stop
            endif
Andreas Marek's avatar
Andreas Marek committed
461

462
463
          endif
        endif ! useGPU
Andreas Marek's avatar
Andreas Marek committed
464

465
        ! Reduce current block to lower triangular form
Andreas Marek's avatar
Andreas Marek committed
466

467
468
        if (useQR) then
          if (which_qr_decomposition == 1) then
469
470
            vmrCols = 2*n_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR
471
            call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
472
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
473
474
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
475
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
476
                                     blockheuristic)
477
478

#else
479
            call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
480
481
482
483
484
485
486
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
487
          endif
488
       else !useQR
Andreas Marek's avatar
Andreas Marek committed
489

490
         do lc = n_cols, 1, -1
Andreas Marek's avatar
Andreas Marek committed
491

492
493
           ncol = istep*nbw + lc ! absolute column number of householder vector
           nrow = ncol - nbw ! Absolute number of pivot row
Andreas Marek's avatar
Andreas Marek committed
494

495
496
           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
Andreas Marek's avatar
Andreas Marek committed
497

498
           tau = 0
Andreas Marek's avatar
Andreas Marek committed
499

500
           if (nrow == 1) exit ! Nothing to do
Andreas Marek's avatar
Andreas Marek committed
501

502
           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
Andreas Marek's avatar
Andreas Marek committed
503

504
           if (my_pcol==cur_pcol) then
Andreas Marek's avatar
Andreas Marek committed
505

506
507
             ! Get vector to be transformed; distribute last element and norm of
             ! remaining elements to all procs in current column
Andreas Marek's avatar
Andreas Marek committed
508

509
             vr(1:lr) = a(1:lr,lch) ! vector to be transformed
Andreas Marek's avatar
Andreas Marek committed
510

511
512
513
514
515
             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
516
               aux1(2) = 0._rk
517
             endif
Andreas Marek's avatar
Andreas Marek committed
518

519
#ifdef WITH_MPI
520

521
522
523
524
525
526
#ifdef DOUBLE_PRECISION_REAL
             call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
             call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif

527
528
529
#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
530

531
532
             vnorm2 = aux2(1)
             vrl    = aux2(2)
Andreas Marek's avatar
Andreas Marek committed
533

534
             ! Householder transformation
Andreas Marek's avatar
Andreas Marek committed
535

536
             call hh_transform_real(vrl, vnorm2, xf, tau)
Andreas Marek's avatar
Andreas Marek committed
537

538
             ! Scale vr and store Householder vector for back transformation
Andreas Marek's avatar
Andreas Marek committed
539

540
541
542
543
             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
544
               vr(lr) = 1._rk
545
546
547
             else
               a(1:lr,lch) = vr(1:lr)
             endif
548

549
           endif
550

551
           ! Broadcast Householder vector and tau along columns
552

553
           vr(lr+1) = tau
554
#ifdef WITH_MPI
555

556
557
558
559
560
#ifdef DOUBLE_PRECISION_REAL
           call MPI_Bcast(vr, lr+1, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr)
#else
           call MPI_Bcast(vr, lr+1, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr)
#endif
561

562
#endif /* WITH_MPI */
563
564
           if (useGPU) then
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
565
           else
566
             vmrCPU(1:lr,lc) = vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
567
568
           endif

569
570
           tau = vr(lr+1)
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
571

572
573
           ! Transform remaining columns in current block with Householder vector
           ! Local dot product
574

575
           aux1 = 0
576

577
578
579
580
#ifdef WITH_OPENMP
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation
581

582
583
           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns
584

585
586
587
588
589
590
591
592
593
594
595
596
597
598
           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo
599

600
           ! Get global dot products
601

602
603
           !$omp barrier
           !$omp single
604
605
#ifdef WITH_MPI

606
607
608
#ifdef DOUBLE_PRECISION_REAL
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
609
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
610
#endif
611
612
613
614
615

#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1

#endif /* WITH_MPI */
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
#else /* WITH_OPENMP */
           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo
646

647
           ! Get global dot products
648
#ifdef WITH_MPI
649

650
651
652
653
654
#ifdef DOUBLE_PRECISION_REAL
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
655

656
657
658
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
659
           ! Transform
Andreas Marek's avatar
Andreas Marek committed
660

661
662
663
664
665
666
667
668
669
670
           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
             endif
           enddo
#endif /* WITH_OPENMP */
         enddo ! lc
Andreas Marek's avatar
Andreas Marek committed
671
672

         if (useGPU) then
673
674
675
676
677
678
679
680
681
682
683
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
             successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*size_of_real_datatype)),          &
                                          lda*size_of_real_datatype, loc(a(1, lc_start)),           &
                                          lda*size_of_real_datatype,  lr_end*size_of_real_datatype, &
                                          (lc_end - lc_start+1),cudaMemcpyHostToDevice)
             if (.not.(successCUDA)) then
               print *,"bandred_real: error in cudaMemcpy2d"
               stop
             endif
684

685
           endif
Andreas Marek's avatar
Andreas Marek committed
686
687
         endif

688
689
         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk
Andreas Marek's avatar
Andreas Marek committed
690

691
         vav = 0
Andreas Marek's avatar
Andreas Marek committed
692

693
#ifdef DOUBLE_PRECISION_REAL
694
695
         if (useGPU) then
           if (l_rows>0) &
696
             call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1))
697
698
         else
           if (l_rows>0) &
699
             call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
700
         endif
701
702
703
704
705
706
707
708
709
710
#else
         if (useGPU) then
           if (l_rows>0) &
             call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1))
         else
           if (l_rows>0) &
             call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
         endif
#endif

711
         call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)
Andreas Marek's avatar
Andreas Marek committed
712

713
         ! Calculate triangular matrix T for block Householder Transformation
Andreas Marek's avatar
Andreas Marek committed
714

715
716
717
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
718
719
720
721
722
#ifdef DOUBLE_PRECISION_REAL
             call dtrmv('U', 'T', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#else
             call strmv('U', 'T', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif
723
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
Andreas Marek's avatar
Andreas Marek committed
724
725
           endif
         enddo
726
       endif
Andreas Marek's avatar
Andreas Marek committed
727

728
       ! Transpose vmr -> vmc (stored in umc, second half)
Andreas Marek's avatar
Andreas Marek committed
729
730

       if (useGPU) then
731
732
733
734
735
736
737
         call elpa_transpose_vectors_real  (vmrCUDA, cur_l_rows, mpi_comm_rows, &
                                            umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
       else
         call elpa_transpose_vectors_real  (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                            umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
Andreas Marek's avatar
Andreas Marek committed
738
739
       endif

740
741
742
743
744
745
746
       ! Calculate umc = A**T * vmr
       ! Note that the distributed A has to be transposed
       ! Opposed to direct tridiagonalization there is no need to use the cache locality
       ! of the tiles, so we can use strips of the matrix

       ! here the GPU version and CPU version diverged substantially, due to the newest
       ! optimizations due to Intel. The GPU version has to be re-written
Andreas Marek's avatar
Andreas Marek committed
747
       if (useGPU) then
748
         umcCUDA(1 : l_cols * n_cols) = 0.0_rk
749
         vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0
Andreas Marek's avatar
Andreas Marek committed
750

751
752
753
754
755
756
         if (l_cols>0 .and. l_rows>0) then
           successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_real_datatype,cudaMemcpyHostToDevice)
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
Andreas Marek's avatar
Andreas Marek committed
757

758
759
760
761
762
           successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype,cudaMemcpyHostToDevice)
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
Andreas Marek's avatar
Andreas Marek committed
763

764
           do i=0,(istep*nbw-1)/tile_size
Andreas Marek's avatar
Andreas Marek committed
765

766
767
768
             lcs = i*l_cols_tile+1
             lce = min(l_cols,(i+1)*l_cols_tile)
             if (lce<lcs) cycle
Andreas Marek's avatar
Andreas Marek committed
769

770
             lre = min(l_rows,(i+1)*l_rows_tile)
771
772
773
774
775
776
777
778
779
#ifdef DOUBLE_PRECISION_REAL
             call cublas_dgemm('T', 'N', lce-lcs+1, n_cols, lre, &
                               1.0_rk, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
                               1.0_rk, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
#else
             call cublas_sgemm('T', 'N', lce-lcs+1, n_cols, lre, &
                               1.0_rk, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
                               1.0_rk, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
#endif
780
781
             if(i==0) cycle
             lre = min(l_rows,i*l_rows_tile)
782
783
784
785
786
787
788
789
#ifdef DOUBLE_PRECISION_REAL
             call cublas_dgemm('N', 'N', lre,n_cols, lce-lcs+1,&
                               1.0_rk, (a_dev+ ((lcs-1)*lda*size_of_real_datatype)), lda,                  &
                               (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_real_datatype), cur_l_cols, &
                               1.0_rk, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
#else
             call cublas_sgemm('N', 'N', lre,n_cols, lce-lcs+1,&
                               1.0_rk, (a_dev+ ((lcs-1)*lda*size_of_real_datatype)), lda,                  &
790
                               (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_real_datatype), cur_l_cols, &
791
792
                               1.0_rk, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
#endif
793
           enddo
Andreas Marek's avatar
Andreas Marek committed
794

795
796
797
798
799
           successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev,vmr_size*size_of_real_datatype,cudaMemcpyDeviceToHost)
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
Andreas Marek's avatar
Andreas Marek committed
800

801
802
803
804
805
           successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_real_datatype,cudaMemcpyDeviceToHost)
           if (.not.(successCUDA)) then
             print *,"bandred_real: error in cudaMemcpy"
             stop
           endif
Andreas Marek's avatar
Andreas Marek committed
806

807
         endif ! l_cols>0 .and. l_rows>0
Andreas Marek's avatar
Andreas Marek committed
808

809
810
       else ! do not useGPU version
         !Code for Algorithm 4
Andreas Marek's avatar
Andreas Marek committed
811

812
813
814
815
816
817
818
819
820
821
822
823
         n_way = 1
#ifdef WITH_OPENMP
         n_way = omp_get_max_threads()
#endif
         !umc(1:l_cols,1:n_cols) = 0.d0
         !vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP
         !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
         if (n_way > 1) then
           !$omp do
           do i=1,min(l_cols_tile, l_cols)
824
             umcCPU(i,1:n_cols) = 0.0_rk
825
           enddo
826

827
828
           !$omp do
           do i=1,l_rows
829
             vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rk
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
           enddo
           if (l_cols>0 .and. l_rows>0) then

             !SYMM variant 4
             !Partitioned Matrix Expression:
             ! Ct = Atl Bt + Atr Bb
             ! Cb = Atr' Bt + Abl Bb
             !
             !Loop invariant:
             ! Ct = Atl Bt + Atr Bb
             !
             !Update:
             ! C1 = A10'B0 + A11B1 + A21 B2
             !
             !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls
             !is easily parallelized, and regardless of choise of algorithm,
             !the startup cost for parallelizing the dgemms inside the loop is too great

             !$omp do schedule(static,1)
             do i=0,(istep*nbw-1)/tile_size
               lcs = i*l_cols_tile+1                   ! local column start
               lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

               lrs = i*l_rows_tile+1                   ! local row start
               lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

               !C1 += [A11 A12] [B1
               !                 B2]
               if ( lre > lrs .and. l_cols > lcs ) then
859
860
861
862
863
864
865
866
#ifdef DOUBLE_PRECISION_REAL
                 call DGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1,          &
                            1.0_rk, a(lrs,lcs), ubound(a,dim=1),                 &
                                  umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1),  &
                            0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
#else
                 call SGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1,          &
                            1.0_rk, a(lrs,lcs), ubound(a,dim=1),                 &
867
                                  umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1),  &
868
869
                            0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
#endif
870
               endif
Andreas Marek's avatar
Andreas Marek committed
871

872
873
               ! C1 += A10' B0
               if ( lce > lcs .and. i > 0 ) then
874
875
876
#ifdef DOUBLE_PRECISION_REAL
                 call DGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1,           &
                            1.0_rk, a(1,lcs),   ubound(a,dim=1),           &
877
                                  vmrCPU(1,1),   ubound(vmrCPU,dim=1),   &
878
879
880
881
882
883
884
                            0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
#else
                 call SGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1,           &
                            1.0_rk, a(1,lcs),   ubound(a,dim=1),           &
                                  vmrCPU(1,1),   ubound(vmrCPU,dim=1),   &
                            0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
#endif
885
886
887
888
               endif
             enddo
           endif ! l_cols>0 .and. l_rows>0
         else ! n_way > 1
889
           umcCPU(1:l_cols,1:n_cols) = 0.0_rk
890
891
892
893
894
895
896
897
898
           vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0
           if (l_cols>0 .and. l_rows>0) then
             do i=0,(istep*nbw-1)/tile_size

               lcs = i*l_cols_tile+1
               lce = min(l_cols,(i+1)*l_cols_tile)
               if (lce<lcs) cycle

               lre = min(l_rows,(i+1)*l_rows_tile)
899
900
901
#ifdef DOUBLE_PRECISION_REAL
               call DGEMM('T', 'N', lce-lcs+1, n_cols, lre, 1.0_rk, a(1,lcs), ubound(a,dim=1), &
                            vmrCPU, ubound(vmrCPU,dim=1), 1.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
902
903
904

               if (i==0) cycle
                 lre = min(l_rows,i*l_rows_tile)
905
906
907
908
909
910
911
912
913
914
915
916
917
                 call DGEMM('N', 'N', lre, n_cols, lce-lcs+1, 1.0_rk, a(1,lcs), lda, &
                            umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), 1.0_rk, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
#else
               call SGEMM('T', 'N', lce-lcs+1, n_cols, lre, 1.0_rk, a(1,lcs), ubound(a,dim=1), &
                            vmrCPU, ubound(vmrCPU,dim=1), 1.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))

               if (i==0) cycle
                 lre = min(l_rows,i*l_rows_tile)
                 call SGEMM('N', 'N', lre, n_cols, lce-lcs+1, 1.0_rk, a(1,lcs), lda, &
                            umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), 1.0_rk, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
#endif


918
919
920
921
922
             enddo
           endif
         endif ! n_way > 1
#ifdef WITH_OPENMP
        !$omp end parallel
923
#endif
924
       endif ! do not useGPU version
Andreas Marek's avatar
Andreas Marek committed
925

926
927
928
929
       ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
       ! on the processors containing the diagonal
       ! This is only necessary if ur has been calculated, i.e. if the
       ! global tile size is smaller than the global remaining matrix
Andreas Marek's avatar
Andreas Marek committed
930

931
932
       if (useGPU) then
         ! here the GPU version and CPU version divereged due to the same reasons as above
Andreas Marek's avatar
Andreas Marek committed
933

934
935
936
937
938
         if (tile_size < istep*nbw) then
           call elpa_reduce_add_vectors_real  (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
                                               umcCUDA, cur_l_cols, mpi_comm_cols, &
                                               istep*nbw, n_cols, nblk)
         endif
Andreas Marek's avatar
Andreas Marek committed
939

940
941
942
943
944
945
         if (l_cols>0) then
           allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
           if (istat .ne. 0) then
             print *,"bandred_real: error when allocating tmpCUDA "//errorMessage
             stop
           endif
Andreas Marek's avatar
Andreas Marek committed
946

947
948
#ifdef WITH_MPI

949
950
951
952
953
#ifdef DOUBLE_PRECISION_REAL
           call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, ierr)
#else
           call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, ierr)
#endif
954
955
956
957

#else /* WITH_MPI */
           tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
#endif /* WITH_MPI */
958
           umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
Andreas Marek's avatar
Andreas Marek committed
959

960
961
962
963
964
965
966
967
           if (allocated(tmpCUDA)) then
             deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
             if (istat .ne. 0) then
               print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage
               stop
             endif
           endif
         endif ! l_cols
Andreas Marek's avatar
Andreas Marek committed
968

969
970
971
972
973
974
         ! U = U * Tmat**T
         successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype, cudaMemcpyHostToDevice)
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
Andreas Marek's avatar
Andreas Marek committed
975

976
977
978
979
980
         successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
981
982
983
984
985
986
987
#ifdef DOUBLE_PRECISION_REAL
         call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
                           1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols)
#else
         call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
                           1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols)
#endif
988
         ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
Andreas Marek's avatar
Andreas Marek committed
989

990
991
992
993
994
         successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
995
996
997
998
#ifdef DOUBLE_PRECISION_REAL
         call cublas_dgemm('T', 'N', n_cols, n_cols, l_cols, &
                           1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
                           0.0_rk, vav_dev, nbw)
Andreas Marek's avatar
Andreas Marek committed
999

1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
         call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
                           1.0_rk, tmat_dev, nbw, vav_dev, nbw)
#else
         call cublas_sgemm('T', 'N', n_cols, n_cols, l_cols, &
                           1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
                           0.0_rk, vav_dev, nbw)

         call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
                           1.0_rk, tmat_dev, nbw, vav_dev, nbw)
#endif
Andreas Marek's avatar
Andreas Marek committed
1010
1011
1012



1013
1014
1015
1016
1017
         successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_real_datatype, cudaMemcpyDeviceToHost)
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
Andreas Marek's avatar
Andreas Marek committed
1018

1019
         call symm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols)
Andreas Marek's avatar
Andreas Marek committed
1020

1021
1022
1023
1024
1025
         successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
         if (.not.(successCUDA)) then
           print *,"bandred_real: error in cudaMemcpy"
           stop
         endif
Andreas Marek's avatar
Andreas Marek committed
1026

1027
         ! U = U - 0.5 * V * VAV
1028
1029
1030
1031
1032
1033
1034
1035
1036
#ifdef DOUBLE_PRECISION_REAL
         call cublas_dgemm('N', 'N', l_cols, n_cols, n_cols,&
                           -0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&
                           1.0_rk, umc_dev, cur_l_cols)
#else
         call cublas_sgemm('N', 'N', l_cols, n_cols, n_cols,&
                           -0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&