elpa2_bandred_template.F90 61.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), fomerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
63
64
65
66
    subroutine bandred_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
67
    (obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
Andreas Marek's avatar
Andreas Marek committed
68
     tmat_dev, wantDebug, useGPU, success &
69
#if REALCASE == 1
70
     , useQR)
71
72
#endif
#if COMPLEXCASE == 1
73
     )
74
#endif
75

76
  !-------------------------------------------------------------------------------
77
  !  bandred_real/complex: Reduces a distributed symmetric matrix to band form
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,matrixCols)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !  matrixCols  local columns of matrix a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwith of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,numBlocks)    where numBlocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------

      use cuda_functions
      use iso_c_binding
      use elpa1_compute
#ifdef WITH_OPENMP
      use omp_lib
#endif
      use precision
112
      use elpa_abstract_impl
113
      implicit none
114
#include "../general/precision_kinds.F90"
115
      class(elpa_abstract_impl_t), intent(inout) :: obj
116
117
118
      integer(kind=ik)                            :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols

#ifdef USE_ASSUMED_SIZE
119
      MATH_DATATYPE(kind=rck)                    :: a(lda,*), tmat(nbw,nbw,*)
120
#else
121
      MATH_DATATYPE(kind=rck)                    :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
122
#endif
Andreas Marek's avatar
Andreas Marek committed
123
124

#if REALCASE == 1
125
      real(kind=rk)                               :: eps
126
127
#endif
      logical, intent(in)                         :: useGPU
128

129
130
131
      integer(kind=ik)                            :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                            :: l_cols, l_rows
#if REALCASE == 1
132
      integer(kind=ik)                            :: vmrCols
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134
135
136
137
#ifdef WITH_OPENMP
      integer(kind=ik)                            :: mynlc, lrs, transformChunkSize
#endif
      integer(kind=ik)                            :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
138
139
      integer(kind=ik)                            :: istep, ncol, lch, lcx, nlc
      integer(kind=ik)                            :: tile_size, l_rows_tile, l_cols_tile
140

141
142
      real(kind=rk)                    :: vnorm2
      MATH_DATATYPE(kind=rck)                    :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
143

144
!      complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
145
146
147
      MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:),  vmrCUDA(:),  umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
148
149
150
151

#if REALCASE == 1
      ! needed for blocked QR decomposition
      integer(kind=ik)                            :: PQRPARAM(11), work_size
152
153
      real(kind=rk)                    :: dwork_size(1)
      real(kind=rk), allocatable       :: work_blocked(:), tauvector(:), blockheuristic(:)
154
#endif
155
      ! a_dev is passed from bandred_real to trans_ev_band
156
      integer(kind=C_intptr_T)                    :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
157
#ifdef WITH_MPI
158
159
160
161
      integer(kind=ik), external                  :: numroc
#endif
      integer(kind=ik)                            :: ierr
      integer(kind=ik)                            :: cur_l_rows, cur_l_cols, vmr_size, umc_size
Andreas Marek's avatar
Andreas Marek committed
162
      integer(kind=c_intptr_t)                    :: lc_start, lc_end
163
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
164
      integer(kind=c_intptr_t)                    :: lce_1, lcs_1, lre_1
165
166
167
168
169
#endif
      integer(kind=ik)                            :: lr_end
      integer(kind=ik)                            :: na_cols
#if COMPLEXCASE == 1
      integer(kind=ik)                            :: na_rows
170
171
#endif

172
173
174
175
176
      logical, intent(in)                         :: wantDebug
      logical, intent(out)                        :: success
      logical                                     :: successCUDA
      integer(kind=ik)                            :: istat
      character(200)                              :: errorMessage
177

178
179
180
181
#if REALCASE == 1
      logical, intent(in)                         :: useQR
#endif
      integer(kind=ik)                            :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
Andreas Marek's avatar
Andreas Marek committed
182
                                                    ii, pp, transformChunkSize
183
      integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_&
184
185
186
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
187

Andreas Marek's avatar
Andreas Marek committed
188
189
190
      logical                                     :: useGPU_reduction_lower_block_to_tridiagonal


191
      call obj%timer%start("bandred_&
Andreas Marek's avatar
Andreas Marek committed
192
193
194
195
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )
Andreas Marek's avatar
Andreas Marek committed
196
197
198
199
200
201
202
203
204
205
206
207

      if (useGPU) then
        useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
        if (useQR) then
          !in this case switch off GPU usage for step "reduce current block to lower triangular form"
          ! since this is done by QR decomposition
          useGPU_reduction_lower_block_to_tridiagonal = .false.
        endif
#endif
      endif

208
      if (wantDebug) call obj%timer%start("mpi_communication")
209
210
211
212
213

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
Andreas Marek's avatar
Andreas Marek committed
214

215
      if (wantDebug) call obj%timer%stop("mpi_communication")
216
217
218
219
220
221
222
      success = .true.


      ! Semibandwith nbw must be a multiple of blocksize nblk
      if (mod(nbw,nblk)/=0) then
        if (my_prow==0 .and. my_pcol==0) then
          if (wantDebug) then
Andreas Marek's avatar
Andreas Marek committed
223
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
224
225
                                 &MATH_DATATYPE&
                                 &: ERROR: nbw=',nbw,', nblk=',nblk
Andreas Marek's avatar
Andreas Marek committed
226
            write(error_unit,*) 'ELPA2_bandred_&
Andreas Marek's avatar
Andreas Marek committed
227
228
                                 &MATH_DATATYPE&
                                 &: ELPA2 works only for nbw==n*nblk'
229
230
231
232
233
234
          endif
          success = .false.
          return
        endif
      endif

Andreas Marek's avatar
Andreas Marek committed
235
      ! na_rows in used nowhere; only na_cols
236
237
      if (useGPU) then
#ifdef WITH_MPI
238
#if COMPLEXCASE == 1
239
        na_rows = numroc(na, nblk, my_prow, 0, np_rows)
240
#endif
241
242
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
243
244
245
#if COMPLEXCASE == 1
         na_rows = na
#endif
246
        na_cols = na
Andreas Marek's avatar
Cleanup    
Andreas Marek committed
247
#endif /* WITH_MPI */
248
249

        ! Here we convert the regular host array into a pinned host array
250
        successCUDA = cuda_malloc(a_dev, lda*na_cols* size_of_datatype)
251
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
252
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
253
254
                  &MATH_DATATYPE&
                  &: error in cudaMalloc a_dev 1"
Andreas Marek's avatar
Andreas Marek committed
255
          stop 1
256
257
        endif

258
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
259
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
260
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
261
262
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
Andreas Marek's avatar
Andreas Marek committed
263
          stop 1
264
265
        endif

266
        successCUDA = cuda_malloc(vav_dev, nbw*nbw* size_of_datatype)
267
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
268
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
269
270
                  &MATH_DATATYPE&
                  &: error in cudaMalloc vav_dev 1"
Andreas Marek's avatar
Andreas Marek committed
271
          stop 1
272
        endif
273
274
275
276
277
278
279
280
281
282
      endif ! useGPU

      ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

      tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
      tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

      l_rows_tile = tile_size/np_rows ! local rows of a tile
      l_cols_tile = tile_size/np_cols ! local cols of a tile

283
#if REALCASE == 1
284
285
286
      if (useQR) then

        if (which_qr_decomposition == 1) then
287
          call qr_pqrparam_init(obj,pqrparam(1:11),    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
288
289
290
          allocate(tauvector(na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating tauvector "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
291
            stop 1
292
293
294
295
296
          endif

          allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating blockheuristic "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
297
            stop 1
298
299
300
301
302
303
          endif

          l_rows = local_index(na, my_prow, np_rows, nblk, -1)
          allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
304
            stop 1
305
306
307
308
309
          endif

          vmrCols = na

#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
310
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
311
312
               &PRECISION&
               &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
313
314
315
316
                                 nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
317
          call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
318
319
               &PRECISION&
               &(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
320
321
322
323
324
                                 vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
                                 nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
                                 mpi_comm_rows, mpi_comm_cols, blockheuristic)
#endif

325
          work_size = int(dwork_size(1))
326
327
328
          allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when allocating work_blocked "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
329
            stop 1
330
          endif
Pavel Kus's avatar
Pavel Kus committed
331
          work_blocked = 0.0_rk
332
333
334
          deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
            print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
335
            stop 1
336
337
338
339
340
          endif

        endif ! which_qr_decomposition

      endif ! useQr
Andreas Marek's avatar
Andreas Marek committed
341
#endif /* REALCASE */
342
343
344
345
346

      if (useGPU) then

        cur_l_rows = 0
        cur_l_cols = 0
Andreas Marek's avatar
Andreas Marek committed
347

348
        successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
349
        if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
350
          print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
351
352
                  &MATH_DATATYPE&
                  &: error in cudaMemcpy a_dev 2"
Andreas Marek's avatar
Andreas Marek committed
353
          stop 1
354
        endif
355
356
357
358
359
360
361
362
363
364
365
      endif ! useGPU


      do istep = (na-1)/nbw, 1, -1

        n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

        ! Number of local columns/rows of remaining matrix
        l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
        l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

366
367
        ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

368
369
370
371
372
373
374
375
376
377
378
379
380
        if (useGPU) then
          cur_l_rows = max(l_rows, 1)
          cur_l_cols = max(l_cols, 1)

          vmr_size = cur_l_rows * 2 * n_cols
          umc_size = cur_l_cols * 2 * n_cols

          ! Allocate vmr and umc only if the inew size exceeds their current capacity
          ! Added for FORTRAN CALLS
          if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
            if (allocated(vr)) then
              deallocate(vr, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
381
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
382
383
                        &MATH_DATATYPE&
                        &: error when deallocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
384
                stop 1
385
386
387
388
              endif
            endif
            allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
            if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
389
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
390
391
                      &MATH_DATATYPE&
                      &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
392
              stop 1
393
394
395
396
397
398
399
400
            endif

          endif

          if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
            if (allocated(vmrCUDA)) then
              deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
401
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
402
403
                        &MATH_DATATYPE&
                        &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
404
                stop 1
405
406
407
408
              endif

              successCUDA = cuda_free(vmr_dev)
              if (.not.(successCUDA)) then
409
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
410
                        &MATH_DATATYPE&: error in cuda_free vmr_dev 1"
Andreas Marek's avatar
Andreas Marek committed
411
                stop 1
412
413
414
415
              endif
            endif

            allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
416

417
            if (istat .ne. 0) then
418
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
419
420
                      &MATH_DATATYPE&
                      &: error when allocating vmrCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
421
              stop 1
422
            endif
423
            successCUDA = cuda_malloc(vmr_dev, vmr_size* size_of_datatype)
424
            if (.not.(successCUDA)) then
425
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
426
427
                      &MATH_DATATYPE&
                      &: error in cudaMalloc: vmr_dev2"
Andreas Marek's avatar
Andreas Marek committed
428
              stop 1
429
430
431
            endif

          endif
432
433
434
435
436

          if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
            if (allocated(umcCUDA)) then
              deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
              if (istat .ne. 0) then
437
                print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
438
439
                        &MATH_DATATYPE&
                        &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
440
                stop 1
441
442
443
444
              endif

              successCUDA = cuda_free(umc_dev)
              if (.not.(successCUDA)) then
445
                 print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
446
447
                         &MATH_DATATYPE&
                         &: error in cudaFree umc_dev 1"
Andreas Marek's avatar
Andreas Marek committed
448
                 stop 1
449
450
451
              endif

            endif
452

453
            allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
454

455
            if (istat .ne. 0) then
456
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
457
458
                      &MATH_DATATYPE&
                      &: error when deallocating umcCUDA "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
459
              stop 1
460
461
            endif

462
            successCUDA = cuda_malloc(umc_dev, umc_size* size_of_datatype)
463
            if (.not.(successCUDA)) then
464
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
465
466
                      &MATH_DATATYPE&
                      &: error in cudaMalloc umc_dev 2"
Andreas Marek's avatar
Andreas Marek committed
467
              stop 1
468
            endif
469

470
          endif
471
472
473

        else ! GPU not used

474
475
476
          ! unify the the name vmr and vmrCPU, as well as vmrGPU
          ! the same for umcCPU and umcGPU
          ! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
477
478
479

          allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
480
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
481
482
                     &MATH_DATATYPE&
                     &: error when allocating vmrCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
483
            stop 1
484
485
          endif

486
487
          allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
488
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
489
490
                    &MATH_DATATYPE&
                    &: error when allocating umcCPU "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
491
            stop 1
492
          endif
Andreas Marek's avatar
Andreas Marek committed
493

494
495
          allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
          if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
496
            print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
497
498
                    &MATH_DATATYPE&
                    &: error when allocating vr "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
499
            stop 1
500
501
502
503
504
          endif

        endif ! use GPU

        if (useGPU) then
Pavel Kus's avatar
Pavel Kus committed
505
          vmrCUDA(1 : cur_l_rows * n_cols) = 0.0_rck
506
        else
Pavel Kus's avatar
Pavel Kus committed
507
          vmrCPU(1:l_rows,1:n_cols) = 0.0_rck
508
509
        endif ! useGPU

Pavel Kus's avatar
Pavel Kus committed
510
511
        vr(:) = 0.0_rck
        tmat(:,:,istep) = 0.0_rck
512
        if (useGPU) then
513
#if REALCASE == 1
Pavel Kus's avatar
Pavel Kus committed
514
          umcCUDA(1 : umc_size) = 0.0_rck
515
#endif
516
517
518
519
          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
          lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
          lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

520
          if (lc_start .le. 0) lc_start = 1
521
522
523
524

          ! Here we assume that the processor grid and the block grid are aligned
          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

525
526
          if (my_pcol == cur_pcol) then
            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
Andreas Marek's avatar
Retab    
Andreas Marek committed
527
                                      int((lda*size_of_datatype),kind=c_intptr_t), &
528
529
530
531
                                            (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )),      &
                                            int(lda*size_of_datatype,kind=c_intptr_t),              &
                                            int(lr_end*size_of_datatype,kind=c_intptr_t),           &
                                             int((lc_end - lc_start+1),kind=c_intptr_t),int(cudaMemcpyDeviceToHost,kind=c_int))
532

533

534
            if (.not.(successCUDA)) then
Andreas Marek's avatar
Andreas Marek committed
535
              print *,"bandred_&
Andreas Marek's avatar
Andreas Marek committed
536
537
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy2d"
Andreas Marek's avatar
Andreas Marek committed
538
              stop 1
539
540
541
542
543
            endif
          endif
        endif ! useGPU

        ! Reduce current block to lower triangular form
544
#if REALCASE == 1
545
        if (useQR) then
Andreas Marek's avatar
Andreas Marek committed
546
547
548
549
          if (useGPU) then
 !            vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
          endif

550
551
552
          if (which_qr_decomposition == 1) then
            vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
Andreas Marek's avatar
Andreas Marek committed
553
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
554
555
                 &PRECISION&
                 &(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
556
557
558
559
560
561
562
                                   na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size,        &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)

#else
Andreas Marek's avatar
Andreas Marek committed
563
            call qr_pdgeqrf_2dcomm_&
Andreas Marek's avatar
Andreas Marek committed
564
565
                 &PRECISION&
                 &(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) ,   &
566
567
568
569
570
571
572
573
574
575
                                    max(l_rows,1), vmrCols, tauvector(1:na), na, &
                                     tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
                                     work_size, na, n_cols, nblk, nblk,        &
                                     istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                     0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,&
                                     blockheuristic)
#endif
          endif

       else !useQR
576
#endif /* REALCASE == 1 */
577
578
         do lc = n_cols, 1, -1

579
           ncol = istep*nbw + lc ! absolute column number of householder Vector
580
581
582
583
584
585
586
587
588
589
590
591
592
           nrow = ncol - nbw ! Absolute number of pivot row

           lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
           lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

           tau = 0

           if (nrow == 1) exit ! Nothing to do

           cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

           if (my_pcol==cur_pcol) then

593
             ! Get Vector to be transformed; distribute last element and norm of
594
595
             ! remaining elements to all procs in current column

596
             vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
597
598
599
600
601
602

             if (my_prow==prow(nrow, nblk, np_rows)) then
               aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
               aux1(2) = vr(lr)
             else
               aux1(1) = dot_product(vr(1:lr),vr(1:lr))
Pavel Kus's avatar
Pavel Kus committed
603
               aux1(2) = 0.0_rck
604
605
606
             endif

#ifdef WITH_MPI
607
             if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
608
             call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
609
                                MPI_SUM, mpi_comm_rows, mpierr)
610
             if (wantDebug) call obj%timer%stop("mpi_communication")
611
612
613

#else /* WITH_MPI */
              aux2 = aux1 ! this should be optimized
Andreas Marek's avatar
Andreas Marek committed
614
#endif
615

616
#if REALCASE == 1
617
             vnorm2 = aux2(1)
618
619
620
621
#endif
#if COMPLEXCASE == 1
             vnorm2 = real(aux2(1),kind=rk)
#endif
622
623
624
             vrl    = aux2(2)

             ! Householder transformation
Pavel Kus's avatar
Pavel Kus committed
625
626
627
       call hh_transform_&
             &MATH_DATATYPE&
             &_&
Andreas Marek's avatar
Andreas Marek committed
628
             &PRECISION &
Pavel Kus's avatar
Pavel Kus committed
629
                         (obj, vrl, vnorm2, xf, tau, wantDebug)
630
             ! Scale vr and store Householder Vector for back transformation
631
632
633
634
635

             vr(1:lr) = vr(1:lr) * xf
             if (my_prow==prow(nrow, nblk, np_rows)) then
               a(1:lr-1,lch) = vr(1:lr-1)
               a(lr,lch) = vrl
Pavel Kus's avatar
Pavel Kus committed
636
               vr(lr) = 1.0_rck
637
638
639
640
641
642
             else
               a(1:lr,lch) = vr(1:lr)
             endif

           endif

643
           ! Broadcast Householder Vector and tau along columns
644
645
646

           vr(lr+1) = tau
#ifdef WITH_MPI
647
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
648
           call MPI_Bcast(vr, lr+1, MPI_MATH_DATATYPE_PRECISION, &
Andreas Marek's avatar
Andreas Marek committed
649
                          cur_pcol, mpi_comm_cols, mpierr)
650
           if (wantDebug) call obj%timer%stop("mpi_communication")
651
652

#endif /* WITH_MPI */
653

Andreas Marek's avatar
Andreas Marek committed
654
           if (useGPU_reduction_lower_block_to_tridiagonal) then
655
             vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
656
657
658
           else
             vmrCPU(1:lr,lc) = vr(1:lr)
           endif
659
660
           tau = vr(lr+1)

661
662
663
664
665
666
#if REALCASE == 1
           tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
#endif
#if COMPLEXCASE == 1
           tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
#endif
667
           ! Transform remaining columns in current block with Householder Vector
668
669
           ! Local dot product

Pavel Kus's avatar
Pavel Kus committed
670
           aux1 = 0.0_rck
671
672

#ifdef WITH_OPENMP
673
674
675
#if 0
 ! original complex implementation without openmp. check performance
            nlc = 0 ! number of local columns
676
677
678
679
680
681
682
683
684
685
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
686
           if (wantDebug) call obj%timer%start("mpi_communication")
687
688
689
690
691
692
693
694
695
696
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
Andreas Marek's avatar
Andreas Marek committed
697

698
699
700
             endif
           enddo

701

702
           if (wantDebug) call obj%timer%stop("mpi_communication")
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727

#else /* WITH_MPI */
!          if (nlc>0) aux2=aux1

           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
             endif
           enddo

#endif /* WITH_MPI */
!
!           ! Transform
!
!           nlc = 0
!           do j=1,lc-1
!             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!             if (lcx>0) then
!               nlc = nlc+1
!               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
728

729
730
!             endif
!           enddo
731
#endif /* if 0 */
732

733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
           !Open up one omp region to avoid paying openmp overhead.
           !This does not help performance due to the addition of two openmp barriers around the MPI call,
           !But in the future this may be beneficial if these barriers are replaced with a faster implementation

           !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1)
           mynlc = 0 ! number of local columns

           !This loop does not have independent iterations,
           !'mynlc' is incremented each iteration, and it is difficult to remove this dependency
           !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration
           !That is, a thread only executes the work associated with an iteration if its thread id is congruent to
           !the iteration number modulo the number of threads
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0 ) then
               mynlc = mynlc+1
               if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
                   if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
               endif
             endif
           enddo

           ! Get global dot products

           !$omp barrier
           !$omp single
#ifdef WITH_MPI
760
           if (wantDebug) call obj%timer%start("mpi_communication")
761
           if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_MATH_DATATYPE_PRECISION, &
Pavel Kus's avatar
Pavel Kus committed
762
                                           MPI_SUM, mpi_comm_rows, mpierr)
763
           if (wantDebug) call obj%timer%stop("mpi_communication")
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
#else /* WITH_MPI */
           if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
           !$omp end single
           !$omp barrier

           ! Transform
           transformChunkSize=32
           mynlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               mynlc = mynlc+1
               !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32
               !However, for some reason this is slower than doing it manually, so it is parallelized as below.
               do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize
                  do pp = 1,transformChunkSize
                      if (pp + ii > lr) exit
#if REALCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp)
#endif
#if COMPLEXCASE == 1
                          a(ii+pp,lcx) = a(ii+pp,lcx) - conjg(tau)*aux2(mynlc)*vr(ii+pp)
#endif
                  enddo
               enddo
             endif
           enddo
           !$omp end parallel
793
794
795
796
797
798
799
800
801
802
803
804
805
806

#else /* WITH_OPENMP */

           nlc = 0 ! number of local columns
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
               if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
             endif
           enddo

           ! Get global dot products
#ifdef WITH_MPI
807
           if (wantDebug) call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
808
809
           if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_MATH_DATATYPE_PRECISION, &
                                         MPI_SUM, mpi_comm_rows, mpierr)
810
           if (wantDebug) call obj%timer%stop("mpi_communication")
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
#else /* WITH_MPI */
           if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
           ! Transform

           nlc = 0
           do j=1,lc-1
             lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
             if (lcx>0) then
               nlc = nlc+1
#if REALCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
               a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
             endif
           enddo
#endif /* WITH_OPENMP */
830
831
         enddo ! lc

Andreas Marek's avatar
Andreas Marek committed
832
         if (useGPU_reduction_lower_block_to_tridiagonal) then
833
834
835
           ! store column tiles back to GPU
           cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
           if (my_pcol == cur_pcol) then
836
             successCUDA = cuda_memcpy2d((a_dev+        &
837
838
839
840
841
                                         int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                         int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                         int(lda*size_of_datatype,kind=c_intptr_t),           &
                                         int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                         int((lc_end - lc_start+1),kind=c_intptr_t), &
842
                                         int(cudaMemcpyHostToDevice,kind=c_int))
843

844
             if (.not.(successCUDA)) then
845
               print *, "bandred_&
846
847
                        &MATH_DATATYPE&
                        &: cuda memcpy a_dev  failed ", istat
Andreas Marek's avatar
Andreas Marek committed
848
               stop 1
849
             endif
850
851
852
853
854
855
856
           endif
         endif

         ! Calculate scalar products of stored Householder vectors.
         ! This can be done in different ways, we use dsyrk

         vav = 0
857
         call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
858
         if (useGPU_reduction_lower_block_to_tridiagonal) then
859
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
860
861
862
863
864
865
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',            &
#endif
#if COMPLEXCASE == 1
             call PRECISION_HERK('U', 'C',            &
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
866
                           n_cols, l_rows, ONE, &
867
868
869
                           vmrCUDA, cur_l_rows, &
                           ZERO, vav, ubound(vav,dim=1))

Andreas Marek's avatar
Andreas Marek committed
870
         else ! useGPU_reduction_to_tridiagonal
871
           if (l_rows>0) &
Andreas Marek's avatar
Andreas Marek committed
872
873
#if REALCASE == 1
             call PRECISION_SYRK('U', 'T',           &
874
875
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
876
             call PRECISION_HERK('U', 'C',           &
877
#endif
Andreas Marek's avatar
Retab    
Andreas Marek committed
878
                           n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
Andreas Marek's avatar
Andreas Marek committed
879
         endif
880
         call obj%timer%stop("blas")
881
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
882
         call symm_matrix_allreduce_&
883
884
#endif
#if COMPLEXCASE == 1
885
         call herm_matrix_allreduce_&
886
#endif
Andreas Marek's avatar
Andreas Marek committed
887
         &PRECISION &
888
                         (obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
889
         ! Calculate triangular matrix T for block Householder Transformation
890
         call obj%timer%start("blas")
891
892
893
         do lc=n_cols,1,-1
           tau = tmat(lc,lc,istep)
           if (lc<n_cols) then
Pavel Kus's avatar
Pavel Kus committed
894
             call PRECISION_TRMV('U', BLAS_TRANS_OR_CONJ, 'N',&
Andreas Marek's avatar
Andreas Marek committed
895
                                 n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
896
897

#if REALCASE == 1
898
             tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
899
900
901
902
#endif
#if COMPLEXCASE == 1
             tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
#endif
903
904
           endif
         enddo
905
         call obj%timer%stop("blas")
906
907
908
#if REALCASE == 1
       endif !useQR
#endif
Andreas Marek's avatar
Andreas Marek committed
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935

#if REALCASE == 1
       if (useGPU .and. useQR) then
         ! copy the data for furhter usage
         ! qr worked on *CPU arrarys
         !vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
         cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
         if (my_pcol == cur_pcol) then
           successCUDA = cuda_memcpy2d((a_dev+        &
                                       int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)),    &
                                       int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
                                       int(lda*size_of_datatype,kind=c_intptr_t),           &
                                       int(lr_end*size_of_datatype,kind=c_intptr_t),        &
                                       int((lc_end - lc_start+1),kind=c_intptr_t), &
                                       int(cudaMemcpyHostToDevice,kind=c_int))

           if (.not.(successCUDA)) then
             print *, "bandred_&
                      &MATH_DATATYPE&
                      &: cuda memcpy a_dev  failed ", istat
             stop 1
           endif
         endif

       endif
#endif

936
937
       ! Transpose vmr -> vmc (stored in umc, second half)
       if (useGPU) then
938
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
939
940
941
              &MATH_DATATYPE&
              &_&
              &PRECISION &
942
943
944
                           (obj, vmrCUDA, cur_l_rows, mpi_comm_rows, &
                            umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
                            mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
945
       else ! useGPU
946
         call elpa_transpose_vectors_&
Andreas Marek's avatar
Andreas Marek committed
947
948
949
              &MATH_DATATYPE&
              &_&
              &PRECISION &
950
                                           (obj, vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
951
952
953
954
955
956
957
958
                                            umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
                                            1, istep*nbw, n_cols, nblk)
       endif

       ! Calculate umc = A**T * vmr
       ! Note that the distributed A has to be transposed
       ! Opposed to direct tridiagonalization there is no need to use the cache locality
       ! of the tiles, so we can use strips of the matrix
Andreas Marek's avatar
Andreas Marek committed
959
960


961
962
#if 0
       ! original complex implemetation check for performance
Pavel Kus's avatar
Pavel Kus committed
963
964
       umcCPU(1:l_cols,1:n_cols) = 0.0_rck
       vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995

       if (l_cols>0 .and. l_rows>0) then
         do i=0,(istep*nbw-1)/tile_size

           lcs = i*l_cols_tile+1
           lce = min(l_cols,(i+1)*l_cols_tile)
           if (lce<lcs) cycle

           lre = min(l_rows,(i+1)*l_rows_tile)

             call obj%timer%start("blas")
             call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
                        vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
             call obj%timer%stop("blas")

           if (i==0) cycle
           lre = min(l_rows,i*l_rows_tile)
             call obj%timer%start("blas")
             call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
                        umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
             call obj%timer%stop("blas")
         enddo

       endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */

       !Code for Algorithm 4

       ! n_way is actually a branch for the number of OpenMP threads
       n_way = 1
#ifdef WITH_OPENMP
996

Andreas Marek's avatar
Andreas Marek committed
997
#if REALCASE == 1
998
999
1000
       n_way = omp_get_max_threads()

       !$omp parallel private( i,lcs,lce,lrs,lre)
Andreas Marek's avatar
Andreas Marek committed
1001
#endif
1002
       if (n_way > 1) then
Andreas Marek's avatar
Andreas Marek committed
1003
#if REALCASE == 1
1004
         !$omp do
1005
#endif
1006
         do i=1,min(l_cols_tile, l_cols)
Pavel Kus's avatar
Pavel Kus committed
1007
           umcCPU(i,1:n_cols) = 0.0_rck
1008
         enddo
1009

Andreas Marek's avatar
Andreas Marek committed
1010
#if REALCASE == 1
1011
1012
1013
         !$omp do
#endif
         do i=1,l_rows
Pavel Kus's avatar
Pavel Kus committed
1014
           vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rck
1015
         enddo
1016

1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032