elpa2_bandred_template.F90 61.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
Andreas Marek's avatar
Andreas Marek committed
63
  subroutine bandred_&
64
65
66
    &MATH_DATATYPE&
    &_&
    &PRECISION &
Wenzhe Yu's avatar
Wenzhe Yu committed
67
68
    (obj, na, a_mat, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
     wantDebug, useGPU, success, &
69
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
70
     useQR, &
71
#endif
Andreas Marek's avatar
Andreas Marek committed
72
     max_threads)
73

74
  !-------------------------------------------------------------------------------
75
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
76
77
78
79
80
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
Pavel Kus's avatar
Pavel Kus committed
81
  !  a_mat(lda,matrixCols)    Distributed matrix which should be reduced.
82
  !              Distribution is like in Scalapack.
Pavel Kus's avatar
Pavel Kus committed
83
84
  !              Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half)
  !              a_mat(:,:) is overwritten on exit with the band and the Householder vectors
85
86
  !              in the upper half.
  !
Pavel Kus's avatar
Pavel Kus committed
87
88
  !  lda         Leading dimension of a_mat
  !  matrixCols  local columns of matrix a_mat
89
90
91
92
93
94
95
96
97
98
99
100
101
102
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

Andreas Marek's avatar
Andreas Marek committed
103
104
105
  use cuda_functions
  use iso_c_binding
  use elpa1_compute
106
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
107
  use omp_lib
108
#endif
Andreas Marek's avatar
Andreas Marek committed
109
110
  use precision
  use elpa_blas_interfaces
Andreas Marek's avatar
Andreas Marek committed
111
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
112
  use elpa_scalapack_interfaces
Andreas Marek's avatar
Andreas Marek committed
113
#endif
Andreas Marek's avatar
Andreas Marek committed
114
  use elpa_abstract_impl
115

Andreas Marek's avatar
Andreas Marek committed
116
  implicit none
117
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
118
119
  class(elpa_abstract_impl_t), intent(inout) :: obj
  integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
120
121

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
122
123
  MATH_DATATYPE(kind=rck)                     :: a_mat(lda,*)
  MATH_DATATYPE(kind=rck)                     :: tmat(nbw,nbw,*)
124
#else
Andreas Marek's avatar
Andreas Marek committed
125
126
  MATH_DATATYPE(kind=rck)                     :: a_mat(lda,matrixCols)
  MATH_DATATYPE(kind=rck)                     :: tmat(nbw,nbw,numBlocks)
127
#endif
Andreas Marek's avatar
Andreas Marek committed
128
129

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
130
  real(kind=rk)                               :: eps
131
#endif
Andreas Marek's avatar
Andreas Marek committed
132
133
134
135
136
137
138
139
  logical, intent(in)                         :: useGPU
  integer(kind=c_int)                         :: skewsymmetric
  logical                                     :: isSkewsymmetric
  character(20)                               :: gpuString

  integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols
  integer(kind=MPI_KIND)                      :: mpierr,  my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=ik)                            :: l_cols, l_rows
140
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
141
  integer(kind=ik)                            :: vmrCols
142
#endif
143
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
144
  integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
Andreas Marek's avatar
Andreas Marek committed
145
#endif
Andreas Marek's avatar
Andreas Marek committed
146
147
148
  integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
  integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
  integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
149

Andreas Marek's avatar
Andreas Marek committed
150
151
152
  real(kind=rk)                              :: vnorm2
  MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau
  MATH_DATATYPE(kind=rck)                    :: vav(nbw,nbw)
153

Andreas Marek's avatar
Andreas Marek committed
154
155
156
157
  MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
  MATH_DATATYPE(kind=rck), pointer     :: vmrCUDA(:), umcCUDA(:)
  MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
  MATH_DATATYPE(kind=rck), allocatable :: vr(:)
158
159

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
160
161
162
163
  ! needed for blocked QR decomposition
  integer(kind=ik)                            :: PQRPARAM(11), work_size
  real(kind=rk)                    :: dwork_size(1)
  real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
164
#endif
Andreas Marek's avatar
Andreas Marek committed
165
166
  integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
  type(c_ptr)                                 :: vmr_host, umc_host
167
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
168
  !integer(kind=ik), external                  :: numroc -> use elpa_scalapack
169
#endif
Andreas Marek's avatar
Andreas Marek committed
170
171
172
173
  integer(kind=ik)                            :: ierr
  integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
  integer(kind=ik)                            :: l_rows2, vmr_size2, umc_size2
  integer(kind=c_intptr_t)                    :: lc_start, lc_end
174
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
175
  integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
176
#endif
Andreas Marek's avatar
Andreas Marek committed
177
178
179
  integer(kind=ik)                            :: lr_end
  integer(kind=ik)                            :: na_cols
  integer(kind=BLAS_KIND)                     :: na_colsBLAS
180
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
181
182
  integer(kind=ik)                            :: na_rows
  integer(kind=BLAS_KIND)                     :: na_rowsBLAS
183
184
#endif

Andreas Marek's avatar
Andreas Marek committed
185
186
187
188
189
190
  logical, intent(in)                         :: wantDebug
  logical, intent(out)                        :: success
  logical                                     :: successCUDA
  integer(kind=ik)                            :: istat
  character(200)                              :: errorMessage
  integer(kind=ik)                            :: min_tile_size, error
191

192
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
193
  logical, intent(in)                         :: useQR
194
#endif
Andreas Marek's avatar
Andreas Marek committed
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
  integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
                                                ii, pp
  integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
                                                                    &PRECISION&
                                                                    &_&
                                                                    &MATH_DATATYPE

  logical                                     :: useGPU_reduction_lower_block_to_tridiagonal
  integer(kind=ik), intent(in)                :: max_threads
  logical                                     :: do_memcpy
  integer(kind=ik)                            :: i_blk,blk_off, blk_end

  call obj%get("is_skewsymmetric",skewsymmetric,error)
  if (error .ne. ELPA_OK) then
       print *,"Problem getting option for skewsymmetric settings. Aborting..."
       stop
  endif
  isSkewsymmetric = (skewsymmetric == 1)

  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("bandred_&
  &MATH_DATATYPE&
  &" // &
  PRECISION_SUFFIX // &
  gpuString )

  useGPU_reduction_lower_block_to_tridiagonal = .false.

  if (useGPU) then
    useGPU_reduction_lower_block_to_tridiagonal = .true.
Andreas Marek's avatar
Andreas Marek committed
230
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
231
232
233
234
235
    if (useQR) then
      !in this case switch off GPU usage for step "reduce current block to lower triangular form"
      ! since this is done by QR decomposition
      useGPU_reduction_lower_block_to_tridiagonal = .false.
    endif
Andreas Marek's avatar
Andreas Marek committed
236
#endif
Andreas Marek's avatar
Andreas Marek committed
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
  endif

  if (wantDebug) call obj%timer%start("mpi_communication")

  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  my_pcol = int(my_pcolMPI,kind=c_int)
  np_cols = int(np_colsMPI,kind=c_int)

  if (wantDebug) call obj%timer%stop("mpi_communication")
  success = .true.


  ! Semibandwith nbw must be a multiple of blocksize nblk
  if (mod(nbw,nblk)/=0) then
    if (my_prow==0 .and. my_pcol==0) then
      if (wantDebug) then
        write(error_unit,*) 'ELPA2_bandred_&
                             &MATH_DATATYPE&
                             &: ERROR: nbw=',nbw,', nblk=',nblk
        write(error_unit,*) 'ELPA2_bandred_&
                             &MATH_DATATYPE&
                             &: ELPA2 works only for nbw==n*nblk'
Andreas Marek's avatar
Andreas Marek committed
265
      endif
Andreas Marek's avatar
Andreas Marek committed
266
267
268
269
      success = .false.
      return
    endif
  endif
Andreas Marek's avatar
Andreas Marek committed
270

Andreas Marek's avatar
Andreas Marek committed
271
272
  ! na_rows in used nowhere; only na_cols
  if (useGPU) then
273
#ifdef WITH_MPI
274
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
275
    na_rowsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
276
                         int(my_prow,kind=BLAS_KIND), 0_BLAS_KIND, int(np_rows,kind=BLAS_KIND))
Andreas Marek's avatar
Andreas Marek committed
277
    na_rows = int(na_rowsBLAS,kind=c_int)
278
#endif
Andreas Marek's avatar
Andreas Marek committed
279
    na_colsBLAS = numroc(int(na,kind=BLAS_KIND), int(nblk,kind=BLAS_KIND), &
280
                         int(my_pcol,kind=BLAS_KIND), 0_BLAS_KIND, int(np_cols,kind=BLAS_KIND))
Andreas Marek's avatar
Andreas Marek committed
281
    na_cols = int(na_colsBLAS,kind=c_int)
282
#else
283
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
284
    na_rows = na
285
#endif
Andreas Marek's avatar
Andreas Marek committed
286
    na_cols = na
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
287
#endif /* WITH_MPI */
288

Andreas Marek's avatar
Andreas Marek committed
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
    ! Here we convert the regular host array into a pinned host array
    successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
    check_alloc_cuda("bandred: a_dev", successCUDA)

    successCUDA = cuda_host_register(int(loc(vav),kind=c_intptr_t), &
                  nbw * nbw * size_of_datatype,&
                  cudaHostRegisterDefault)
    check_host_register_cuda("bandred: vav", successCUDA)

    successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
    check_alloc_cuda("bandred: vav_dev", successCUDA)
  endif ! useGPU

  ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

  tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size

  ! make tile_size a smallest possible multiple of previously defined tile size, such that it is
  ! larger or equal to min_tile_size
  ! min_tile_size has been originally hardcoded as 128 * max(np_rows, np_cols), so it is now the implicit value
  ! it can, however, be set by the user
  call obj%get("min_tile_size", min_tile_size ,error)
  if (error .ne. ELPA_OK) then
    print *,"Problem setting option for min_tile_size. Aborting..."
    stop
  endif
  if(min_tile_size == 0) then
    ! not set by the user, use the default value
    min_tile_size = 128*max(np_rows, np_cols)
  endif
  tile_size = ((min_tile_size-1)/tile_size+1)*tile_size

  l_rows_tile = tile_size/np_rows ! local rows of a tile
  l_cols_tile = tile_size/np_cols ! local cols of a tile
323

324
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
325
  if (useQR) then
326

Andreas Marek's avatar
Andreas Marek committed
327
328
329
330
    if (which_qr_decomposition == 1) then
      call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
      allocate(tauvector(na), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: tauvector", istat, errorMessage)
331

Andreas Marek's avatar
Andreas Marek committed
332
333
      allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: blockheuristic", istat, errorMessage)
334

Andreas Marek's avatar
Andreas Marek committed
335
336
337
      l_rows = local_index(na, my_prow, np_rows, nblk, -1)
      allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: vmrCPU", istat, errorMessage)
338

Andreas Marek's avatar
Andreas Marek committed
339
      vmrCols = na
340
341

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
342
343
344
345
346
      call qr_pdgeqrf_2dcomm_&
           &PRECISION&
           &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
                             nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                             mpi_comm_rows, mpi_comm_cols, blockheuristic)
347
348

#else
Andreas Marek's avatar
Andreas Marek committed
349
350
351
352
353
354
      call qr_pdgeqrf_2dcomm_&
           &PRECISION&
           &(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
                             vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                             nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                             mpi_comm_rows, mpi_comm_cols, blockheuristic)
355
356
#endif

Andreas Marek's avatar
Andreas Marek committed
357
358
359
360
361
362
      work_size = int(dwork_size(1))
      allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: work_blocked", istat, errorMessage)
      work_blocked = 0.0_rk
      deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
      check_deallocate("bandred: vmrCPU", istat, errorMessage)
363

Andreas Marek's avatar
Andreas Marek committed
364
    endif ! which_qr_decomposition
365

Andreas Marek's avatar
Andreas Marek committed
366
  endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
367
#endif /* REALCASE */
368

Andreas Marek's avatar
Andreas Marek committed
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
  blk_end = (na-1)/nbw
  if (useGPU) then

    successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t), &
                  lda*na_cols*size_of_datatype, cudaHostRegisterDefault)
    check_host_register_cuda("bandred: a_mat", successCUDA)

    cur_l_rows = 0
    cur_l_cols = 0

    successCUDA = cuda_memcpy(a_dev, int(loc(a_mat),kind=c_intptr_t), &
                  lda*na_cols*size_of_datatype, cudaMemcpyHostToDevice)
    check_memcpy_cuda("bandred: a_dev", successCUDA)

    successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_datatype)
    check_alloc_cuda("bandred: tmat_dev", successCUDA)

    istep = (na-1)/nbw
    blk_end = (na-1)/nbw
    n_cols = min(na,(istep+1)*nbw)-istep*nbw
    l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
    l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
    cur_l_rows = max(l_rows,1)
    cur_l_cols = max(l_cols,1)
    vmr_size = cur_l_rows*2*n_cols
    umc_size = cur_l_cols*2*n_cols

    istep = (na-1)/nbw - 1
    n_cols = min(na,(istep+1)*nbw)-istep*nbw
    l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
    l_rows2 = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
    cur_l_rows = max(l_rows2,1)
    cur_l_cols = max(l_cols,1)
    vmr_size2 = cur_l_rows*2*n_cols
    umc_size2 = cur_l_cols*2*n_cols

    l_rows = max(l_rows,l_rows2)
    vmr_size = max(vmr_size,vmr_size2)
    umc_size = max(umc_size,umc_size2)

    allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"bandred_&
              &MATH_DATATYPE&
              &: error when allocating vr "//errorMessage
      stop 1
    endif
416

Andreas Marek's avatar
Andreas Marek committed
417
418
419
    successCUDA = cuda_malloc_host(vmr_host,vmr_size*size_of_datatype)
    check_host_alloc_cuda("bandred: vmr_host", successCUDA)
    call c_f_pointer(vmr_host, vmrCUDA, (/vmr_size/))
420

Andreas Marek's avatar
Andreas Marek committed
421
422
    successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_datatype)
    check_alloc_cuda("bandred: vmr_dev", successCUDA)
423

Andreas Marek's avatar
Andreas Marek committed
424
425
426
    successCUDA = cuda_malloc_host(umc_host,umc_size*size_of_datatype)
    check_host_alloc_cuda("bandred: umc_host", successCUDA)
    call c_f_pointer(umc_host, umcCUDA, (/umc_size/))
427

Andreas Marek's avatar
Andreas Marek committed
428
429
    successCUDA = cuda_malloc(umc_dev, umc_size*size_of_datatype)
    check_alloc_cuda("bandred: umc_dev", successCUDA)
430

Andreas Marek's avatar
Andreas Marek committed
431
  endif ! useGPU
432

Andreas Marek's avatar
Andreas Marek committed
433
  do istep = blk_end, 1, -1
434

Andreas Marek's avatar
Andreas Marek committed
435
    n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
436

Andreas Marek's avatar
Andreas Marek committed
437
438
439
    ! Number of local columns/rows of remaining matrix
    l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
    l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
440

Andreas Marek's avatar
Andreas Marek committed
441
    ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
442

Andreas Marek's avatar
Andreas Marek committed
443
444
445
446
447
    if (useGPU) then
      cur_l_rows = max(l_rows, 1)
      cur_l_cols = max(l_cols, 1)
      vmr_size = cur_l_rows * 2 * n_cols
      umc_size = cur_l_cols * 2 * n_cols
448

Andreas Marek's avatar
Andreas Marek committed
449
    else ! GPU not used
450

Andreas Marek's avatar
Andreas Marek committed
451
452
453
      ! unify the the name vmr and vmrCPU, as well as vmrGPU
      ! the same for umcCPU and umcGPU
      ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
454

Andreas Marek's avatar
Andreas Marek committed
455
456
      allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: vmrCPU", istat, errorMessage)
457

Andreas Marek's avatar
Andreas Marek committed
458
459
      allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: umcCPU", istat, errorMessage)
Andreas Marek's avatar
Andreas Marek committed
460

Andreas Marek's avatar
Andreas Marek committed
461
462
      allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
      check_allocate("bandred: vr", istat, errorMessage)
463

Andreas Marek's avatar
Andreas Marek committed
464
    endif ! use GPU
465

Andreas Marek's avatar
Andreas Marek committed
466
467
468
469
470
471
    if (useGPU) then
      vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
      umcCUDA(1 : umc_size) = 0.0_rck
    else
      vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
    endif ! useGPU
472

Andreas Marek's avatar
Andreas Marek committed
473
474
475
476
477
478
    vr(:) = 0.0_rck
    tmat(:,:,istep) = 0.0_rck
    if (useGPU) then
      lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
      lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
      lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
479

Andreas Marek's avatar
Andreas Marek committed
480
      if (lc_start .le. 0) lc_start = 1
481

Andreas Marek's avatar
Andreas Marek committed
482
      do_memcpy = .false.
483

Andreas Marek's avatar
Andreas Marek committed
484
485
486
487
      ! Note: mod(nbw,nblk) == 0
      do i_blk = 1, nbw/nblk
        blk_off = (i_blk-1) * nblk
        cur_pcol = pcol(istep*nbw+1+blk_off, nblk, np_cols)
488

Andreas Marek's avatar
Andreas Marek committed
489
490
491
492
        if (my_pcol == cur_pcol) then
          do_memcpy = .true.
        endif
      enddo
493

Andreas Marek's avatar
Andreas Marek committed
494
495
496
497
498
499
500
      if (do_memcpy) then
        successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
                      int((lda*size_of_datatype),kind=c_intptr_t), &
                      (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
                      int(lda*size_of_datatype,kind=c_intptr_t), &
                      int(lr_end*size_of_datatype,kind=c_intptr_t), &
                      int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
501

Andreas Marek's avatar
Andreas Marek committed
502
503
504
        check_memcpy_cuda("bandred: a_dev -> a_mat", successCUDA)
      endif
    endif ! useGPU
505

Andreas Marek's avatar
Andreas Marek committed
506
    ! Reduce current block to lower triangular form
507
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
508
509
510
    if (useQR) then
      if (which_qr_decomposition == 1) then
        vmrCols = 2*n_cols
511
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
512
513
514
515
516
517
518
519
        call qr_pdgeqrf_2dcomm_&
             &PRECISION&
             &(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
                               na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                 work_size, na, n_cols, nblk, nblk,        &
                                 istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                 blockheuristic)
520
521

#else
Andreas Marek's avatar
Andreas Marek committed
522
523
524
525
526
527
528
529
530
        call qr_pdgeqrf_2dcomm_&
             &PRECISION&
             &(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
                                max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                 tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                 work_size, na, n_cols, nblk, nblk,        &
                                 istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                 blockheuristic)
531
#endif
Andreas Marek's avatar
Andreas Marek committed
532
      endif
533

Andreas Marek's avatar
Andreas Marek committed
534
    else !useQR
535
#endif /* REALCASE == 1 */
Andreas Marek's avatar
Andreas Marek committed
536
      do lc = n_cols, 1, -1
537

Andreas Marek's avatar
Andreas Marek committed
538
539
        ncol = istep*nbw + lc ! absolute column number of householder Vector
        nrow = ncol - nbw ! Absolute number of pivot row
540

Andreas Marek's avatar
Andreas Marek committed
541
542
        lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
        lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
543

Andreas Marek's avatar
Andreas Marek committed
544
        tau = 0
545

Andreas Marek's avatar
Andreas Marek committed
546
        if (nrow == 1) exit ! Nothing to do
547

Andreas Marek's avatar
Andreas Marek committed
548
        cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
549

Andreas Marek's avatar
Andreas Marek committed
550
        if (my_pcol==cur_pcol) then
551

Andreas Marek's avatar
Andreas Marek committed
552
553
          ! Get Vector to be transformed; distribute last element and norm of
          ! remaining elements to all procs in current column
554

Andreas Marek's avatar
Andreas Marek committed
555
          vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
556

Andreas Marek's avatar
Andreas Marek committed
557
558
559
560
561
562
563
          if (my_prow==prow(nrow, nblk, np_rows)) then
            aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
            aux1(2) = vr(lr)
          else
            aux1(1) = dot_product(vr(1:lr),vr(1:lr))
            aux1(2) = 0.0_rck
          endif
564
565

#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
566
567
568
569
          if (wantDebug) call obj%timer%start("mpi_communication")
          call mpi_allreduce(aux1, aux2, 2_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
                             MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
570
571

#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
572
          aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
573
#endif
574

575
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
576
          vnorm2 = aux2(1)
577
578
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
579
          vnorm2 = real(aux2(1),kind=rk)
580
#endif
Andreas Marek's avatar
Andreas Marek committed
581
          vrl    = aux2(2)
582

Andreas Marek's avatar
Andreas Marek committed
583
584
          ! Householder transformation
          call hh_transform_&
Pavel Kus's avatar
Pavel Kus committed
585
586
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
587
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
588
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
Andreas Marek's avatar
Andreas Marek committed
589
590
591
592
593
594
595
596
597
598
          ! Scale vr and store Householder Vector for back transformation

          vr(1:lr) = vr(1:lr) * xf
          if (my_prow==prow(nrow, nblk, np_rows)) then
            a_mat(1:lr-1,lch) = vr(1:lr-1)
            a_mat(lr,lch) = vrl
            vr(lr) = 1.0_rck
          else
            a_mat(1:lr,lch) = vr(1:lr)
          endif
599

Andreas Marek's avatar
Andreas Marek committed
600
        endif
601

Andreas Marek's avatar
Andreas Marek committed
602
        ! Broadcast Householder Vector and tau along columns
603

Andreas Marek's avatar
Andreas Marek committed
604
        vr(lr+1) = tau
605
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
606
607
608
609
        if (wantDebug) call obj%timer%start("mpi_communication")
        call MPI_Bcast(vr, int(lr+1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                      int(cur_pcol,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
610
611

#endif /* WITH_MPI */
612

Andreas Marek's avatar
Andreas Marek committed
613
614
615
616
617
618
        if (useGPU_reduction_lower_block_to_tridiagonal) then
          vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
        else
          vmrCPU(1:lr,lc) = vr(1:lr)
        endif
        tau = vr(lr+1)
619

620
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
621
        tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
622
623
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
624
        tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
625
#endif
Andreas Marek's avatar
Andreas Marek committed
626
627
        ! Transform remaining columns in current block with Householder Vector
        ! Local dot product
628

Andreas Marek's avatar
Andreas Marek committed
629
        aux1 = 0.0_rck
630

631
#ifdef WITH_OPENMP_TRADITIONAL
632
633
#if 0
 ! original complex implementation without openmp. check performance
Andreas Marek's avatar
Andreas Marek committed
634
635
636
637
638
639
640
641
642
643
        nlc = 0 ! number of local columns
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            nlc = nlc+1
            aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
          endif
        enddo

        ! Get global dot products
644
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
645
646
        if (wantDebug) call obj%timer%start("mpi_communication")
        if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
647
                                         int(mpi_comm_rows,kind=MPI_KIND), mpierr)
648

Andreas Marek's avatar
Andreas Marek committed
649
        ! Transform
650

Andreas Marek's avatar
Andreas Marek committed
651
652
653
654
655
656
        nlc = 0
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            nlc = nlc+1
            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
657

Andreas Marek's avatar
Andreas Marek committed
658
659
          endif
        enddo
660

661

Andreas Marek's avatar
Andreas Marek committed
662
        if (wantDebug) call obj%timer%stop("mpi_communication")
663
664
665

#else /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
666
        ! Transform
667

Andreas Marek's avatar
Andreas Marek committed
668
669
670
671
672
673
674
675
        nlc = 0
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            nlc = nlc+1
            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
          endif
        enddo
676
677
678

#endif /* WITH_MPI */
!
Andreas Marek's avatar
Andreas Marek committed
679
!       ! Transform
680
!
Andreas Marek's avatar
Andreas Marek committed
681
682
683
684
685
686
687
688
689
!       nlc = 0
!       do j=1,lc-1
!         lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!         if (lcx>0) then
!           nlc = nlc+1
!           a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)

!         endif
!       enddo
690
#endif /* if 0 */
691

Andreas Marek's avatar
Andreas Marek committed
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
        !Open up one omp region to avoid paying openmp overhead.
        !This does not help performance due to the addition of two openmp barriers around the MPI call,
        !But in the future this may be beneficial if these barriers are replaced with a faster implementation

        !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
        mynlc = 0 ! number of local columns

        !This loop does not have independent iterations,
        !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
        !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
        !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
        !the iteration number modulo the number of threads
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0 ) then
            mynlc = mynlc+1
            if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
            endif
          endif
        enddo

        ! Get global dot products

        !$omp barrier
        !$omp single
718
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
719
720
721
722
        if (wantDebug) call obj%timer%start("mpi_communication")
        if (mynlc>0) call mpi_allreduce(aux1, aux2, int(mynlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                                        MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
723
#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
724
        if (mynlc>0) aux2 = aux1
725
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
        !$omp end single
        !$omp barrier

        ! Transform
        transformChunkSize=32
        mynlc = 0
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            mynlc = mynlc+1
            !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
            !However, for some reason this is slower than doing it manually, so it is parallelized as below.
            do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
              do pp = 1,transformChunkSize
                if (pp + ii > lr) exit
741
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
742
                a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
743
744
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
745
                a_mat(ii+pp,lcx) = a_mat(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
746
#endif
Andreas Marek's avatar
Andreas Marek committed
747
748
749
750
751
              enddo
            enddo
          endif
        enddo
        !$omp end parallel
752

753
#else /* WITH_OPENMP_TRADITIONAL */
754

Andreas Marek's avatar
Andreas Marek committed
755
756
757
758
759
760
761
762
        nlc = 0 ! number of local columns
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            nlc = nlc+1
            if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
          endif
        enddo
763

Andreas Marek's avatar
Andreas Marek committed
764
        ! Get global dot products
765
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
766
767
768
769
        if (wantDebug) call obj%timer%start("mpi_communication")
        if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
                                      MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
770
#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
771
        if (nlc>0) aux2=aux1
772
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
773
        ! Transform
774

Andreas Marek's avatar
Andreas Marek committed
775
776
777
778
779
        nlc = 0
        do j=1,lc-1
          lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
          if (lcx>0) then
            nlc = nlc+1
780
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
781
            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
782
783
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
784
            a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
785
#endif
Andreas Marek's avatar
Andreas Marek committed
786
787
          endif
        enddo
788
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
      enddo ! lc

      if (useGPU_reduction_lower_block_to_tridiagonal) then
        ! store column tiles back to GPU
        if (do_memcpy) then
          successCUDA = cuda_memcpy2d((a_dev+ &
                        int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
                        int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
                        int(lda*size_of_datatype,kind=c_intptr_t), &
                        int(lr_end*size_of_datatype,kind=c_intptr_t), &
                        int((lc_end - lc_start+1),kind=c_intptr_t), &
                        int(cudaMemcpyHostToDevice,kind=c_int))
          check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA)
        endif
      endif
804

Andreas Marek's avatar
Andreas Marek committed
805
806
      ! Calculate scalar products of stored Householder vectors.
      ! This can be done in different ways, we use dsyrk
807

Andreas Marek's avatar
Andreas Marek committed
808
809
810
811
      vav = 0
      call obj%timer%start("blas")
      if (useGPU_reduction_lower_block_to_tridiagonal) then
        if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
812
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
813
        call PRECISION_SYRK('U', 'T',            &
Andreas Marek's avatar
Andreas Marek committed
814
815
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
816
        call PRECISION_HERK('U', 'C',            &
Andreas Marek's avatar
Andreas Marek committed
817
#endif
Andreas Marek's avatar
Andreas Marek committed
818
819
820
                           int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, &
                           vmrCUDA, int(cur_l_rows,kind=BLAS_KIND), &
                           ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND))
821

Andreas Marek's avatar
Andreas Marek committed
822
823
      else ! useGPU_reduction_to_tridiagonal
        if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
824
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
825
        call PRECISION_SYRK('U', 'T',           &
826
827
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
828
        call PRECISION_HERK('U', 'C',           &
829
#endif
Andreas Marek's avatar
Andreas Marek committed
830
831
832
833
                            int(n_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, vmrCPU, &
                            int(ubound(vmrCPU,dim=1),kind=BLAS_KIND), ZERO, vav, int(ubound(vav,dim=1),kind=BLAS_KIND))
      endif
      call obj%timer%stop("blas")
834
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
835
      call symm_matrix_allreduce_&
836
837
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
838
      call herm_matrix_allreduce_&
839
#endif
Andreas Marek's avatar
Andreas Marek committed
840
         &PRECISION &
841
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
842
         ! Calculate triangular matrix T for block Householder Transformation
Andreas Marek's avatar
Andreas Marek committed
843
844
845
846
847
848
849
      call obj%timer%start("blas")
      do lc=n_cols,1,-1
        tau = tmat(lc,lc,istep)
        if (lc<n_cols) then
          call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
                              int(n_cols-lc,kind=BLAS_KIND), tmat(lc+1,lc+1,istep), &
                              int(ubound(tmat,dim=1),kind=BLAS_KIND), vav(lc+1,lc), 1_BLAS_KIND)
850
851

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
852
          tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
853
854
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
855
          tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
856
#endif
Andreas Marek's avatar
Andreas Marek committed
857
858
859
        endif
      enddo
      call obj%timer%stop("blas")
860
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
861
    endif !useQR
862
#endif
Andreas Marek's avatar
Andreas Marek committed
863
864

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
    if (useGPU .and. useQR) then
      ! copy the data for furhter usage
      ! qr worked on *CPU arrarys
      !vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
      if (do_memcpy) then
        successCUDA = cuda_memcpy2d((a_dev+ &
                      int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
                      int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
                      int(lda*size_of_datatype,kind=c_intptr_t), &
                      int(lr_end*size_of_datatype,kind=c_intptr_t), &
                      int((lc_end - lc_start+1),kind=c_intptr_t), &
                      int(cudaMemcpyHostToDevice,kind=c_int))
        check_memcpy_cuda("bandred: a_mat -> a_dev", successCUDA)
      endif

    endif
Andreas Marek's avatar
Andreas Marek committed
881
882
#endif

Andreas Marek's avatar
Andreas Marek committed
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
    ! Transpose vmr -> vmc (stored in umc, second half)
    if (useGPU) then
      call elpa_transpose_vectors_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
                        (obj, vmrCUDA(:), cur_l_rows, mpi_comm_rows, &
                         umcCUDA(cur_l_cols * n_cols + 1:), cur_l_cols, &
                         mpi_comm_cols, 1, istep*nbw, n_cols, nblk, max_threads)
    else ! useGPU
      call elpa_transpose_vectors_&
           &MATH_DATATYPE&
           &_&
           &PRECISION &
                                        (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
                                         umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                         1, istep*nbw, n_cols, nblk, max_threads)
    endif
901

Andreas Marek's avatar
Andreas Marek committed
902
903
904
905
    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
Andreas Marek's avatar
Andreas Marek committed
906
907


908
#if 0
Andreas Marek's avatar
Andreas Marek committed
909
910
911
    ! original complex implemetation check for performance
    umcCPU(1:l_cols,1:n_cols) = 0.0_rck
    vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
912

Andreas Marek's avatar
Andreas Marek committed
913
914
    if (l_cols>0 .and. l_rows>0) then
      do i=0,(istep*nbw-1)/tile_size
915

Andreas Marek's avatar
Andreas Marek committed
916
917
918
        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle
919

Andreas Marek's avatar
Andreas Marek committed
920
        lre = min(l_rows,(i+1)*l_rows_tile)
921

Andreas Marek's avatar
Andreas Marek committed
922
923
924
925
          call obj%timer%start("blas")
          call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
                     vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
          call obj%timer%stop("blas")
926

Andreas Marek's avatar
Andreas Marek committed
927
928
929
930
931
932
933
        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
          call obj%timer%start("blas")
          call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
                     umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
          call obj%timer%stop("blas")
      enddo
934

Andreas Marek's avatar
Andreas Marek committed
935
    endif ! (l_cols>0 .and. l_rows>0)
936
937
#endif /* if 0 */

Andreas Marek's avatar
Andreas Marek committed
938
    !Code for Algorithm 4
939

Andreas Marek's avatar
Andreas Marek committed
940
941
    ! n_way is actually a branch for the number of OpenMP threads
    n_way = 1
942
#ifdef WITH_OPENMP_TRADITIONAL
943

Andreas Marek's avatar
Andreas Marek committed
944
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
945
    n_way = max_threads
946

Andreas Marek's avatar
Andreas Marek committed
947
    !$omp parallel private( i,lcs,lce,lrs,lre)
Andreas Marek's avatar
Andreas Marek committed
948
#endif
Andreas Marek's avatar
Andreas Marek committed
949
    if (n_way > 1) then
Andreas Marek's avatar
Andreas Marek committed
950
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
951
      !$omp do
952
#endif
Andreas Marek's avatar
Andreas Marek committed
953
954
955
      do i=1,min(l_cols_tile, l_cols)
        umcCPU(i,1:n_cols) = 0.0_rck
      enddo
956

Andreas Marek's avatar
Andreas Marek committed
957
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
958
      !$omp do
959
#endif
Andreas Marek's avatar
Andreas Marek committed
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
      do i=1,l_rows
        vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
      enddo

      if (l_cols>0 .and. l_rows>0) then

        !SYMM variant 4
        !Partitioned Matrix Expression:
        ! Ct = Atl Bt + Atr Bb
        ! Cb = Atr' Bt + Abl Bb
        !
        !Loop invariant:
        ! Ct = Atl Bt + Atr Bb
        !
        !Update:
        ! C1 = A10'B0 + A11B1 + A21 B2
        !
        !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls
        !is easily parallelized, and regardless of choise of algorithm,
        !the startup cost for parallelizing the dgemms inside the loop is too great
980
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
981
        !$omp do schedule(static,1)
982
#endif
Andreas Marek's avatar
Andreas Marek committed
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
        do i=0,(istep*nbw-1)/tile_size
          lcs = i*l_cols_tile+1                   ! local column start
          lce = min(l_cols, (i+1)*l_cols_tile)    ! local column end

          lrs = i*l_rows_tile+1                   ! local row start
          lre = min(l_rows, (i+1)*l_rows_tile)    ! local row end

          !C1 += [A11 A12] [B1
          !                 B2]
          if ( lre > lrs .and. l_cols > lcs ) then
            call obj%timer%start("blas")
            if (isSkewsymmetric) then
              call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
                                  int(l_cols-lcs+1,kind=BLAS_KIND),                                    &
                                  -ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND),       &
                                  umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND),      &
                                  ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) )
            else
              call PRECISION_GEMM('N', 'N', int(lre-lrs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
                                  int(l_cols-lcs+1,kind=BLAS_KIND),                                    &
                                  ONE, a_mat(lrs,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND),        &
                                  umcCPU(lcs,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND),      &
                                  ZERO, vmrCPU(lrs,n_cols+1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND) )

            endif
            call obj%timer%stop("blas")
          endif

          ! C1 += A10' B0
          if ( lce > lcs .and. i > 0 ) then
            call obj%timer%start("blas")
            call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',     &
                                int(lce-lcs+1,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(lrs-1,kind=BLAS_KIND), &
                                 ONE, a_mat(1,lcs), int(ubound(a_mat,dim=1),kind=BLAS_KIND),      &
                                 vmrCPU(1,1), int(ubound(vmrCPU,dim=1),kind=BLAS_KIND),   &
                                 ZERO, umcCPU(lcs,1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND) )
            call obj%timer%stop("blas")
          endif
        enddo
      endif ! l_cols>0 .and. l_rows>0

    else ! n_way > 1
1025
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
1026

Andreas Marek's avatar
Andreas Marek committed
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039