elpa2_trans_ev_band_to_full_template.F90 31.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

52
#include "../general/sanity.F90"
53
54
55
56
57

    subroutine trans_ev_band_to_full_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
58
    (obj, na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, &
Andreas Marek's avatar
Andreas Marek committed
59
     q_dev, ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU &
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#if REALCASE == 1
     ,useQr)
#endif
#if COMPLEXCASE == 1
     )
#endif

    !-------------------------------------------------------------------------------
    !  trans_ev_band_to_full_real/complex:
    !  Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
    !
    !  Parameters
    !
    !  na          Order of matrix a, number of rows of matrix q
    !
    !  nqc         Number of columns of matrix q
    !
    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
    !
    !  nbw         semi bandwith
    !
    !  a(lda,matrixCols)    Matrix containing the Householder vectors (i.e. matrix a after bandred_real/complex)
    !              Distribution is like in Scalapack.
    !
    !  lda         Leading dimension of a
    !  matrixCols  local columns of matrix a and q
    !
    !  tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
    !
    !  q           On input: Eigenvectors of band matrix
    !              On output: Transformed eigenvectors
    !              Distribution is like in Scalapack.
    !
    !  ldq         Leading dimension of q
    !
    !  mpi_comm_rows
    !  mpi_comm_cols
    !              MPI-Communicators for rows/columns
    !
    !-------------------------------------------------------------------------------
      use precision
      use cuda_functions
      use iso_c_binding
103
      use elpa_abstract_impl
104
      implicit none
105
#include "../general/precision_kinds.F90"
106
      class(elpa_abstract_impl_t), intent(inout) :: obj
107
108
109
110
111
112
      logical, intent(in)                    :: useGPU
#if REALCASE == 1
     logical, intent(in)                     :: useQR
#endif
      integer(kind=ik)                       :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
113
      MATH_DATATYPE(kind=rck)               :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
114
#else
115
      MATH_DATATYPE(kind=rck)               :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
116
117
118
119
120
121
122
123
124
#endif
      integer(kind=C_intptr_T)               :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real

      integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                       :: max_blocks_row, max_blocks_col, max_local_rows, &
                                                max_local_cols
      integer(kind=ik)                       :: l_cols, l_rows, l_colh, n_cols
      integer(kind=ik)                       :: istep, lc, ncol, nrow, nb, ns

125
      MATH_DATATYPE(kind=rck), allocatable   :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
126
127
128
129
130
131
132
133
      ! hvm_dev is fist used and set in this routine
      ! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
      ! tmp_dev is first used in this routine
      ! tmat_dev is passed along from bandred_real
      integer(kind=C_intptr_T)               :: hvm_dev, q_dev, tmp_dev, tmat_dev

      integer(kind=ik)                       :: i

Andreas Marek's avatar
Andreas Marek committed
134
#ifdef BAND_TO_FULL_BLOCKING
135
      MATH_DATATYPE(kind=rck), allocatable   :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
136
137
      integer(kind=ik)                       :: cwy_blocking, t_blocking, t_cols, t_rows
#endif
Andreas Marek's avatar
Andreas Marek committed
138

139
140
141
      integer(kind=ik)                       :: istat
      character(200)                         :: errorMessage
      logical                                :: successCUDA
142
      integer(kind=c_intptr_t), parameter    :: size_of_datatype = size_of_&
143
144
145
                                                                   &PRECISION&
                                                                   &_&
                                                                   &MATH_DATATYPE
146
      integer                                :: blocking_factor
147
      call obj%timer%start("trans_ev_band_to_full_&
148
      &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
149
      &" // &
150
151
      &PRECISION_SUFFIX &
      )
152
153
154
155
#ifdef BAND_TO_FULL_BLOCKING
      call obj%get("blocking_in_band_to_full",blocking_factor)
      print *,"Blocking factor: ", blocking_factor
#endif
156
      call obj%timer%start("mpi_communication")
157
158
159
160
161
162

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

163
      call obj%timer%stop("mpi_communication")
164
165
166
167
168
169
170
171
172
173
174
175
176
177

      max_blocks_row = ((na -1)/nblk)/np_rows + 1  ! Rows of A
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q!

      max_local_rows = max_blocks_row*nblk
      max_local_cols = max_blocks_col*nblk

      if (useGPU) then

#if REALCASE == 1
        ! here the GPU and CPU version diverged: the CPU version now always uses the useQR path which
        ! is not implemented in the GPU version
#endif

Andreas Marek's avatar
Andreas Marek committed
178
179
180
        ! the GPU version does not (yet) support blocking
        ! but the handling is the same for real/complex case

181
182
183
        allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
184
185
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
186
          stop 1
187
188
189
190
191
        endif

        allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
192
193
                   &MATH_DATATYPE&
                   &: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
194
          stop 1
195
196
197
198
199
        endif

        allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
200
201
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
202
          stop 1
203
204
205
206
207
        endif

        allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
208
209
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
210
          stop 1
211
212
        endif

213
        successCUDA = cuda_malloc(hvm_dev, (max_local_rows)*nbw* size_of_datatype)
214
215
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
216
217
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
218
          stop 1
219
220
        endif

221
        successCUDA = cuda_malloc(tmp_dev, (max_local_cols)*nbw* size_of_datatype)
222
223
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
224
225
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
226
          stop 1
227
228
229
        endif

!#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
230
231
232
233
!! it should be possible to keep tmat dev on the device and not copy it around
!! already existent on GPU
!        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* &
!#if REALCASE == 1
Andreas Marek's avatar
Retab  
Andreas Marek committed
234
!  size_of_PRECISION_real)
Andreas Marek's avatar
Andreas Marek committed
235
236
237
238
239
240
241
!#endif
!#if COMPLEXCASE == 1
!        size_of_PRECISION_complex)
!#endif
!
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
242
243
!    &MATH_DATATYPE&
!    &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
244
!          stop 1
Andreas Marek's avatar
Andreas Marek committed
245
!        endif
246
247
248
249
!#endif

#if REALCASE == 1
! q_dev already living on device
250
!        successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
251
252
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_real: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
253
!          stop 1
254
255
256
257
258
259
260
261
!        endif
  !      q_temp(:,:) = 0.0
  !      q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)

!        ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
!        successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
!        if (.not.(successCUDA)) then
!          print *,"trans_ev_band_to_full_real: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
262
!          stop 1
263
264
265
!        endif
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
266
267
268
!         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex)
!         if (.not.(successCUDA)) then
!           print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
269
!           stop 1
Andreas Marek's avatar
Andreas Marek committed
270
271
272
273
274
!         endif
!
!         successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
!          if (.not.(successCUDA)) then
!            print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
275
!            stop 1
Andreas Marek's avatar
Andreas Marek committed
276
!          endif
277
278
279
#endif

        ! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
280
        successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)* size_of_datatype)
281
282
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
283
284
                  &MATH_DATATYPE&
                  &: error in cudaMalloc"
Andreas Marek's avatar
Andreas Marek committed
285
          stop 1
286
287
        endif

288
289
        hvm = 0.0_rck   ! Must be set to 0 !!!
        hvb = 0.0_rck   ! Safety only
290
291
292
293
294
295
296
297
298
299
300
301
        l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

        do istep=1,(na-1)/nbw

          n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

          ! Broadcast all Householder vectors for current step compressed in hvb

          nb = 0
          ns = 0

          do lc = 1, n_cols
302
            ncol = istep*nbw + lc ! absolute column number of householder Vector
303
304
305
306
307
308
309
310
311
312
313
            nrow = ncol - nbw ! absolute number of pivot row

            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
            l_colh = local_index(ncol  , my_pcol, np_cols, nblk, -1) ! HV local column number

            if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)

            nb = nb+l_rows

            if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
314
              call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
315
              call MPI_Bcast(hvb(ns+1), nb-ns, MPI_MATH_DATATYPE_PRECISION,&
316
                             pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
317

318
              call obj%timer%stop("mpi_communication")
319
320
321
322
323
324
325
326
327
328
329
330
331
332

#endif /* WITH_MPI */
              ns = nb
            endif
          enddo

          ! Expand compressed Householder vectors into matrix hvm

          nb = 0
          do lc = 1, n_cols
            nrow = (istep-1)*nbw+lc ! absolute number of pivot row
            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast

            hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
333
            if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
334
335
336
            nb = nb+l_rows
          enddo

337
          successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
338
339
340

          if (.not.(successCUDA)) then
            print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
341
            stop 1
342
343
344
345
346
347
348
349

          endif

          l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)

          ! Q = Q - V * T**T * V**T * Q

          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
350
            call obj%timer%start("cublas")
Pavel Kus's avatar
Pavel Kus committed
351
            call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Retab  
Andreas Marek committed
352
                                 n_cols, l_cols, l_rows, ONE, hvm_dev, max_local_rows, &
353
                                       q_dev, ldq , ZERO, tmp_dev, n_cols)
Andreas Marek's avatar
Andreas Marek committed
354
            call obj%timer%stop("cublas")
355
356
357
358
359

#ifdef WITH_MPI

            ! copy data from device to host for a later MPI_ALLREDUCE
            ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
360
            successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
361
362
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
363
              stop 1
364
365
            endif

366

367
#else /* WITH_MPI */
368
            ! in real case no copy needed. Don't do it in complex case neither
Andreas Marek's avatar
Andreas Marek committed
369
370
#endif /* WITH_MPI */

371
          else ! l_rows>0
372
            tmp1(1:l_cols*n_cols) = 0.0_rck
373
374
375
          endif ! l_rows>0

#ifdef WITH_MPI
376
          call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
377
          call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_MATH_DATATYPE_PRECISION, &
378
                             MPI_SUM, mpi_comm_rows, mpierr)
379
          call obj%timer%stop("mpi_communication")
380
381
382
383
384
385
386
387
388

#else /* WITH_MPI */
!          tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */

          if (l_rows>0) then
#ifdef WITH_MPI
            ! after the mpi_allreduce we have to copy back to the device
            ! copy back to device
389
            successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
Andreas Marek's avatar
Retab  
Andreas Marek committed
390
              cudaMemcpyHostToDevice)
391
392
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
393
394
                      &MATH_DATATYPE&
                      &: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
395
              stop 1
396
397
            endif
#else /* WITH_MPI */
398
            ! in real case no memcopy needed. Don't do it in complex case neither
399
400
#endif /* WITH_MPI */

401
!#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
402
403
           ! IMPORTANT: even though tmat_dev is transfered from the previous rutine, we have to copy from tmat again
           ! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
404
405
406
407
408
           ! need to upload another slice
           successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)

           if (.not.(successCUDA)) then
             print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
409
410
                     &MATH_DATATYPE&
                     &: error in cudaMemcpy"
411
412
             stop 1
           endif
413
!#endif /* WITH_MPI */
414

Andreas Marek's avatar
Andreas Marek committed
415
            call obj%timer%start("cublas")
Pavel Kus's avatar
Pavel Kus committed
416
            call cublas_PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',       &
417
418
419
420
                                        n_cols, l_cols, ONE, tmat_dev, nbw, tmp_dev, n_cols)

            call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm_dev, max_local_rows, &
                                       tmp_dev, n_cols, one, q_dev, ldq)
Andreas Marek's avatar
Andreas Marek committed
421
            call obj%timer%stop("cublas")
422
423
424

            ! copy to host maybe this can be avoided
            ! this is not necessary hvm is not used anymore
425
            successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
426
427
            if (.not.(successCUDA)) then
              print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
428
              stop 1
429
430
431
432
433
434
435
436
            endif
          endif ! l_rows > 0

        enddo ! istep



      else ! do not useGPU
Andreas Marek's avatar
Andreas Marek committed
437
438

#ifdef BAND_TO_FULL_BLOCKING
439
        ! t_blocking was formerly 2; 3 is a better choice
440
        t_blocking = blocking_factor ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once
441
442
443
444
445
446
447

        ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked.
!       if ( na >= ((t_blocking+1)*nbw) ) then
        cwy_blocking = t_blocking * nbw

        allocate(tmp1(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
448
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
449
450
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
451
          stop 1
452
453
454
455
        endif

        allocate(tmp2(max_local_cols*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
456
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
457
458
                  &MATH_DATATYPE&
                  &: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
459
          stop 1
460
461
462
463
        endif

        allocate(hvb(max_local_rows*cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
464
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
465
466
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
467
          stop 1
468
469
470
471
        endif

        allocate(hvm(max_local_rows,cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
472
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
473
474
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
475
          stop 1
476
477
        endif

Andreas Marek's avatar
Andreas Marek committed
478
479
#else /* BAND_TO_FULL_BLOCKING */

480
481
        allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
482
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
483
484
                  &MATH_DATATYPE&
                  &: error when allocating tmp1 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
485
          stop 1
486
487
488
489
        endif

        allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
490
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
491
                  &MATH_DATATYPE&: error when allocating tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
492
          stop 1
493
494
495
496
        endif

        allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
497
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
498
499
                  &MATH_DATATYPE&
                  &: error when allocating hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
500
          stop 1
501
502
503
504
        endif

        allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
505
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
506
507
                  &MATH_DATATYPE&
                  &: error when allocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
508
          stop 1
509
        endif
Andreas Marek's avatar
Andreas Marek committed
510
#endif /* BAND_TO_FULL_BLOCKING */
511

Andreas Marek's avatar
Andreas Marek committed
512
#ifdef BAND_TO_FULL_BLOCKING
513
514
        allocate(tmat_complete(cwy_blocking,cwy_blocking), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
515
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
516
517
                   &MATH_DATATYPE&
                   &: error when allocating tmat_complete "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
518
          stop 1
519
520
521
        endif
        allocate(t_tmp(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
522
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
523
524
                  &MATH_DATATYPE&
                  &: error when allocating t_tmp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
525
          stop 1
526
527
528
        endif
        allocate(t_tmp2(cwy_blocking,nbw), stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
Andreas Marek's avatar
Andreas Marek committed
529
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
530
531
                  &MATH_DATATYPE&
                  &: error when allocating t_tmp2 "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
532
          stop 1
533
534
535
536
537
538
539
540
541
        endif
#endif
!        else
!          allocate(tmp1(max_local_cols*nbw))
!          allocate(tmp2(max_local_cols*nbw))
!          allocate(hvb(max_local_rows*nbw))
!          allocate(hvm(max_local_rows,nbw))
!        endif

542
543
        hvm = 0.0_rck   ! Must be set to 0 !!!
        hvb = 0.0_rck   ! Safety only
544
545
546
547
        l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q

!       if ( na >= ((t_blocking+1)*nbw) ) then

Andreas Marek's avatar
Andreas Marek committed
548
#ifdef BAND_TO_FULL_BLOCKING
549
        do istep=1,((na-1)/nbw-1)/t_blocking + 1
Andreas Marek's avatar
Andreas Marek committed
550
#else
551
552
553
        do istep=1,(na-1)/nbw
#endif

Andreas Marek's avatar
Andreas Marek committed
554
#ifdef BAND_TO_FULL_BLOCKING
555
          ! This the call when using  na >= ((t_blocking+1)*nbw)
556
557
          !      n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw
          ! Number of columns in current step
558
559
560
561
562
563
564
565
566
          ! As an alternative we add some special case handling if na < cwy_blocking
          IF (na < cwy_blocking) THEN
            n_cols = MAX(0, na-nbw)
            IF ( n_cols .eq. 0 ) THEN
              EXIT
            END IF
          ELSE
            n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step
          END IF
Andreas Marek's avatar
Andreas Marek committed
567
#else /* BAND_TO_FULL_BLOCKING */
568
          n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
Andreas Marek's avatar
Andreas Marek committed
569
#endif /* BAND_TO_FULL_BLOCKING */
570
571
572
573
574
575
          ! Broadcast all Householder vectors for current step compressed in hvb

          nb = 0
          ns = 0

          do lc = 1, n_cols
Andreas Marek's avatar
Andreas Marek committed
576
#ifdef BAND_TO_FULL_BLOCKING
577
            ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder Vector
Andreas Marek's avatar
Andreas Marek committed
578
#else
579
            ncol = istep*nbw + lc ! absolute column number of householder Vector
580
581
582
583
584
585
586
587
588
589
590
591
#endif
            nrow = ncol - nbw ! absolute number of pivot row

            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
            l_colh = local_index(ncol  , my_pcol, np_cols, nblk, -1) ! HV local column number

            if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)

            nb = nb+l_rows

            if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
592
              call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
593
              call MPI_Bcast(hvb(ns+1), nb-ns, MPI_MATH_DATATYPE_PRECISION,    &
594
                             pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
595

596
              call obj%timer%stop("mpi_communication")
597
598
599
600
601
602
603
604
605
606

#endif /* WITH_MPI */
              ns = nb
            endif
          enddo ! lc

          ! Expand compressed Householder vectors into matrix hvm

          nb = 0
          do lc = 1, n_cols
Andreas Marek's avatar
Andreas Marek committed
607
#ifdef BAND_TO_FULL_BLOCKING
608
            nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row
Andreas Marek's avatar
Andreas Marek committed
609
#else
610
611
612
613
614
            nrow = (istep-1)*nbw+lc ! absolute number of pivot row
#endif
            l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast

            hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
615
            if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.0_rck
616
617
618
            nb = nb+l_rows
          enddo

Andreas Marek's avatar
Andreas Marek committed
619
#ifdef BAND_TO_FULL_BLOCKING
620
621
622
623
624
625
626
627
628
          l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1)

          ! compute tmat2 out of tmat(:,:,)
          tmat_complete = 0
          do i = 1, t_blocking
            t_cols = MIN(nbw, n_cols - (i-1)*nbw)
            if (t_cols <= 0) exit
            t_rows = (i - 1) * nbw
            tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
Andreas Marek's avatar
Andreas Marek committed
629

630
            if (i > 1) then
Andreas Marek's avatar
Andreas Marek committed
631
              call obj%timer%start("blas")
Pavel Kus's avatar
Pavel Kus committed
632
              call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',      &
Andreas Marek's avatar
Retab  
Andreas Marek committed
633
                           t_rows, t_cols, l_rows, ONE, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
Andreas Marek's avatar
Andreas Marek committed
634
635
                                 max_local_rows, ZERO, t_tmp, cwy_blocking)

Andreas Marek's avatar
Andreas Marek committed
636
              call obj%timer%stop("blas")
637
#ifdef WITH_MPI
638
              call obj%timer%start("mpi_communication")
639

Pavel Kus's avatar
Pavel Kus committed
640
              call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_MATH_DATATYPE_PRECISION,    &
Andreas Marek's avatar
Andreas Marek committed
641
                                 MPI_SUM, mpi_comm_rows, mpierr)
642
              call obj%timer%stop("mpi_communication")
Andreas Marek's avatar
Andreas Marek committed
643
              call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
644
645
              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, ONE, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -ONE, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
646
                         t_tmp2, cwy_blocking)
Andreas Marek's avatar
Andreas Marek committed
647
              call obj%timer%stop("blas")
648
649
650

              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)

Andreas Marek's avatar
Andreas Marek committed
651
#else /* WITH_MPI */
652
!              t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
Andreas Marek's avatar
Andreas Marek committed
653
              call obj%timer%start("blas")
Andreas Marek's avatar
Andreas Marek committed
654
655
              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, ONE, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -ONE, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
656
                         t_tmp, cwy_blocking)
Andreas Marek's avatar
Andreas Marek committed
657
              call obj%timer%stop("blas")
658
659
660

              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)

Andreas Marek's avatar
Andreas Marek committed
661
#endif /* WITH_MPI */
662
663
664
665
666
667
668
669

!              call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
!              call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
!                         t_tmp2, cwy_blocking)

!              tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
             endif
          enddo
Andreas Marek's avatar
Andreas Marek committed
670
#else /* BAND_TO_FULL_BLOCKING */
671
672
673
674
675
676
          l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
#endif

          ! Q = Q - V * T**T * V**T * Q

          if (l_rows>0) then
677
            call obj%timer%start("blas")
678

Pavel Kus's avatar
Pavel Kus committed
679
            call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N',         &
Andreas Marek's avatar
Retab  
Andreas Marek committed
680
                          n_cols, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
681
                                q, ldq, ZERO, tmp1, n_cols)
Andreas Marek's avatar
Andreas Marek committed
682
            call obj%timer%stop("blas")
683
684
685

          else ! l_rows>0

686
            tmp1(1:l_cols*n_cols) = 0.0_rck
687
688
689
          endif ! l_rows>0

#ifdef WITH_MPI
690
          call obj%timer%start("mpi_communication")
Pavel Kus's avatar
Pavel Kus committed
691
          call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows ,mpierr)
692
          call obj%timer%stop("mpi_communication")
693

Andreas Marek's avatar
Andreas Marek committed
694
          call obj%timer%start("blas")
695
696

          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
697
698
#ifdef BAND_TO_FULL_BLOCKING

Pavel Kus's avatar
Pavel Kus committed
699
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Retab  
Andreas Marek committed
700
                          n_cols, l_cols, ONE, tmat_complete, cwy_blocking, tmp2, n_cols)
701
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), tmp2, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
702
703
704

#else /* BAND_TO_FULL_BLOCKING */

Pavel Kus's avatar
Pavel Kus committed
705
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
706
                                n_cols, l_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
707
708
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), &
                                tmp2, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
709
710

#endif /* BAND_TO_FULL_BLOCKING */
711
712

          endif
Andreas Marek's avatar
Andreas Marek committed
713
          call obj%timer%stop("blas")
714
715
#else /* WITH_MPI */
!          tmp2 = tmp1
Andreas Marek's avatar
Andreas Marek committed
716
          call obj%timer%start("blas")
717
          if (l_rows>0) then
Andreas Marek's avatar
Andreas Marek committed
718
#ifdef BAND_TO_FULL_BLOCKING
Pavel Kus's avatar
Pavel Kus committed
719
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
720
                                n_cols, l_cols, ONE, tmat_complete, cwy_blocking, tmp1, n_cols)
721
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), tmp1, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
722
723
#else /* BAND_TO_FULL_BLOCKING */

Pavel Kus's avatar
Pavel Kus committed
724
            call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N',        &
Andreas Marek's avatar
Andreas Marek committed
725
                                n_cols, l_cols, ONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
726
727
            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm, ubound(hvm,dim=1), &
                                tmp1, n_cols, ONE, q, ldq)
Andreas Marek's avatar
Andreas Marek committed
728
729

#endif  /* BAND_TO_FULL_BLOCKING */
730
          endif
Andreas Marek's avatar
Andreas Marek committed
731
          call obj%timer%stop("blas")
732
733
734
735
736
737
738
739
740
741
742
743
744
745
#endif /* WITH_MPI */

!          if (l_rows>0) then
!            call PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
!            call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, CONST_1_0, q, ldq)
!          endif

        enddo ! istep

      endif ! useGPU

      deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
746
747
                 &MATH_DATATYPE&
                 &: error when deallocating tmp1 tmp2 hvb "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
748
        stop 1
749
750
751
752
753
754
      endif

      if (useGPU) then
        successCUDA = cuda_free(hvm_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
755
756
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
757
          stop 1
758
759
760
761
762
        endif

        successCUDA = cuda_free(tmp_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
763
764
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
765
          stop 1
766
767
768
769
770
        endif

        successCUDA = cuda_free(tmat_dev)
        if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
771
772
                  &MATH_DATATYPE&
                  &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
773
          stop 1
774
775
776
        endif

         ! final transfer of q_dev
777
         successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
778
779
780

         if (.not.(successCUDA)) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
781
782
                  &MATH_DATATYPE&
                  &: error in cudamemcpu q_dev"
Andreas Marek's avatar
Andreas Marek committed
783
          stop 1
784
785
786
787
788
789
790
         endif

         !   q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)

         successCUDA = cuda_free(q_dev)
         if (.not.(successCUDA)) then
           print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
791
792
                   &MATH_DATATYPE&
                   &: error in cudaFree"
Andreas Marek's avatar
Andreas Marek committed
793
           stop 1
794
795
796
797
798
         endif

         !   deallocate(q_temp, stat=istat, errmsg=errorMessage)
         !   if (istat .ne. 0) then
         !     print *,"error when deallocating q_temp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
799
         !     stop 1
800
801
802
803
         !   endif
         !   deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
         !   if (istat .ne. 0) then
         !     print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
804
         !     stop 1
805
806
807
808
809
810
811
         !   endif

      endif ! useGPU

      deallocate(hvm, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
812
813
                &MATH_DATATYPE&
                &: error when deallocating hvm "//errorMessage
Andreas Marek's avatar
Andreas Marek committed
814
        stop 1
815
816
      endif

Andreas Marek's avatar
Andreas Marek committed
817
#if BAND_TO_FULL_BLOCKING
818
819
820
821
      if (.not.(useGPU)) then
        deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_band_to_full_&
Andreas Marek's avatar
Andreas Marek committed
822
823
                  &MATH_DATATYPE&
                  &: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage
824
825
          stop 1
        endif
826
827
828
      endif
#endif

829
      call obj%timer%stop("trans_ev_band_to_full_&
830
831
832
833
834
835
836
837
838
839
840
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX&
      )

    end subroutine trans_ev_band_to_full_&
    &MATH_DATATYPE&
    &_&
    &PRECISION