#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif


!> \brief Transforms the eigenvectors of a tridiagonal matrix back
!>                     to the eigenvectors of the original matrix
!>                     (like Scalapack Routine PZUNMTR)
!>
!  Parameters
!
!> \param na          Order of matrix a_mat, number of rows of matrix q_mat
!>
!> \param nqc         Number of columns of matrix q_mat
!>
!> \param a_mat(lda,matrixCols)  Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex)
!>                           Distribution is like in Scalapack.
!>
!> \param lda         Leading dimension of a_mat
!>
!> \param tau(na)     Factors of the Householder vectors
!>
!> \param q_mat           On input: Eigenvectors of tridiagonal matrix
!>                    On output: Transformed eigenvectors
!>                    Distribution is like in Scalapack.
!>    
!> \param ldq         Leading dimension of q_mat
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix a_mat and q_mat
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!>
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
!>
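!>
!> \par Typical use (illustrative sketch only)
!> In the ELPA1 solver chain this routine performs the back-transformation step:
!> the full matrix is first reduced to tridiagonal form (leaving the Householder
!> vectors in a_mat and their factors in tau), the tridiagonal eigenproblem is
!> solved into q_mat, and finally
!> \code
!>   call trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, &
!>                                   mpi_comm_rows, mpi_comm_cols, useGPU)
!> \endcode
!> overwrites q_mat with the eigenvectors of the original matrix.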
    subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, &
                                          mpi_comm_rows, mpi_comm_cols, useGPU)
      use cuda_functions
      use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)                  ::  na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      complex(kind=COMPLEX_DATATYPE), intent(in)    :: tau(na)
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) ::  a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
      logical, intent(in)                           :: useGPU

      integer(kind=ik)              :: max_stored_rows
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=ck8), parameter   :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
#else
      complex(kind=ck4), parameter   :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
#endif
      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
      integer(kind=ik)              :: l_cols, l_rows, l_colh, nstor
      integer(kind=ik)              :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
      integer(kind=ik)              :: hvn_ubnd, hvm_ubnd

      complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
      complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
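      ! Work space layout:
      !   hvb        - Householder vectors of the current block column, gathered and packed for the broadcast
      !   hvm        - Householder vectors accumulated since the last blocked update (one vector per column)
      !   hvm1       - linear copy of hvm used only for the transfer to the GPU
      !   tmat       - triangular factor T of the accumulated block reflector
      !   h1, h2     - packed scalar products of the stored vectors (local part / MPI-reduced result)
      !   tmp1, tmp2 - hvm**H * q_mat (local part / MPI-reduced result)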
      integer(kind=ik)              :: istat
      character(200)                :: errorMessage
      
      integer(kind=C_intptr_T)    :: q_dev, tmp_dev, hvm_dev, tmat_dev
      logical                     :: successCUDA

      call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
      call timer%start("mpi_communication")

      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")

      totalblocks = (na-1)/nblk + 1
      max_blocks_row = (totalblocks-1)/np_rows + 1
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q_mat!

      max_local_rows = max_blocks_row*nblk
      max_local_cols = max_blocks_col*nblk

      max_stored_rows = (63/nblk+1)*nblk
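      ! max_stored_rows is the smallest multiple of nblk larger than 63: at most this many
      ! Householder vectors are accumulated in hvm before they are applied as one blocked update.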

      allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)

      allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "h1", istat, errorMessage)

      allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "h2", istat, errorMessage)

      allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)

      allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)

      allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)

      allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)

      hvm = 0   ! Must be set to 0 !!!
      hvb = 0   ! Safety only

      l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat

      nstor = 0
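      ! nstor counts the Householder vectors currently stored in the columns of hvm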
      if (useGPU) then
        hvn_ubnd = 0
      endif

      ! In the complex case tau(2) /= 0
      if (my_prow == prow(1, nblk, np_rows)) then
        q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(CONE-tau(2))
      endif

      if (useGPU) then
        ! todo: this is used only for copying hvm to the device; it should be possible to do without it
        allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
        call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)

        successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
        check_alloc_cuda("trans_ev", successCUDA)

!         q_dev = q_mat
        successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
                                  cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)        
      endif
      
      do istep=1,na,nblk
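        ! Process the Householder vectors in blocks of nblk columns. Vectors 1 and 2 need no
        ! back-transformation here (ics starts at 3); the contribution of tau(2) was already
        ! applied to the first row of q_mat above.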

        ics = MAX(istep,3)
        ice = MIN(istep+nblk-1,na)
        if (ice<ics) cycle

        cur_pcol = pcol(istep, nblk, np_cols)
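        ! cur_pcol is the process grid column that owns the Householder vectors of this block;
        ! they are gathered into hvb below and then broadcast along mpi_comm_cols.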

        nb = 0
        do ic=ics,ice

          l_colh = local_index(ic  , my_pcol, np_cols, nblk, -1) ! Column of Householder vector
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector


          if (my_pcol==cur_pcol) then
            hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
            if (my_prow==prow(ic-1, nblk, np_rows)) then
              hvb(nb+l_rows) = 1.
            endif
          endif

          nb = nb+l_rows
        enddo

#ifdef WITH_MPI
        call timer%start("mpi_communication")

        if (nb>0) &
           call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
        call timer%stop("mpi_communication")

#endif /* WITH_MPI */
        nb = 0
        do ic=ics,ice
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
          hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
          if (useGPU) then
            hvm_ubnd = l_rows
          endif
          nstor = nstor+1
          nb = nb+l_rows
        enddo

        ! Please note: for smaller matrix sizes (na/np_rows<=256), a value of 32 for nstor is enough!
        if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then
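          ! Apply the nstor stored reflectors as one blocked (compact WY) update
          !   Q <- (I - V * T * V**H) * Q
          ! with V = hvm(:,1:nstor) and T the lower triangular factor assembled in tmat below.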

          ! Calculate scalar products of stored vectors.
          ! This can be done in different ways, we use zherk

          tmat = 0
          if (l_rows>0) &
             call PRECISION_HERK('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows)
          nc = 0
          do n=1,nstor-1
            h1(nc+1:nc+n) = tmat(1:n,n+1)
            nc = nc+n
          enddo
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")

#else /* WITH_MPI */
          if (nc>0) h2=h1
#endif /* WITH_MPI */
          ! Calculate triangular matrix T
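          ! Compact WY recursion for the lower triangular factor T: h2 holds the (already
          ! reduced) scalar products v_i**H * v_(n+1); after the TRMV below, row n+1 of T becomes
          !   T(n+1,1:n) = -tau_(n+1) * (v_(n+1)**H * V(:,1:n) * T(1:n,1:n)),  T(n+1,n+1) = tau_(n+1)
          ! where tau_(n+1) = tau(ice-nstor+n+1) is the factor of the newly added vector.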

          nc = 0
          tmat(1,1) = tau(ice-nstor+1)
          do n=1,nstor-1
            call PRECISION_TRMV('L', 'C', 'N', n,   &
                                tmat, max_stored_rows,   &
                                h2(nc+1),1)
            tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1)
            tmat(n+1,n+1) = tau(ice-nstor+n+1)
            nc = nc+n
          enddo
          
          if (useGPU) then
            ! todo: is this reshape really necessary?
            hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))

            !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
            successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)),   &
                          hvm_ubnd * nstor * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
                        
            !tmat_dev = tmat
            successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)),   &
                          max_stored_rows * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif

          ! Q = Q - V * T * V**H * Q

          if (l_rows>0) then
            if(useGPU) then
              call cublas_PRECISION_gemm('C', 'N', nstor, l_cols, l_rows,  &
                         CONE, hvm_dev, hvm_ubnd,  &
                         q_dev, ldq,  &
                         CZERO, tmp_dev, nstor)

            else !useGPU            
              call PRECISION_GEMM('C', 'N', nstor, l_cols, l_rows,  &
                                  CONE, hvm, ubound(hvm,dim=1), &
                                  q_mat, ldq,  &
                                  CZERO, tmp1 ,nstor)
            endif !useGPU
          else !l_rows>0
            if (useGPU) then
              successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_complex)
              check_memcpy_cuda("trans_ev", successCUDA)
            else  
              tmp1(1:l_cols*nstor) = 0
            endif 
          endif  !l_rows>0

#ifdef WITH_MPI
          ! In the legacy GPU version this allreduce was omitted, but it probably has to be done for GPU + MPI.
          ! todo: does the whole buffer need to be copied? Wouldn't a part be sufficient?
          if (useGPU) then          
            successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev,  &
                          max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif 
          
          call timer%start("mpi_communication")
          call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")

          ! copy back tmp2 - after reduction...
          if (useGPU) then          
            successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)),  &
                          max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif 
#else
!          tmp2 = tmp1
#endif
          
          if (l_rows>0) then
            if(useGPU) then 
              call cublas_PRECISION_trmm('L', 'L', 'N', 'N', nstor, l_cols,  &
                                            CONE, tmat_dev, max_stored_rows,  &
                                            tmp_dev, nstor)
              call cublas_PRECISION_gemm('N', 'N' ,l_rows ,l_cols ,nstor,  &
                                            -CONE, hvm_dev, hvm_ubnd,  &
                                            tmp_dev, nstor,   &
                                            CONE, q_dev, ldq)
            else  !useGPU           
#ifdef WITH_MPI
              ! tmp2 = tmat * tmp2
              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,  &
                                  CONE, tmat, max_stored_rows,  &
                                  tmp2, nstor)
              !q_mat = q_mat - hvm*tmp2 
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,  &
                                  -CONE, hvm, ubound(hvm,dim=1),  &
                                  tmp2, nstor,  &
                                  CONE, q_mat, ldq)
#else
              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,  &
                                  CONE, tmat, max_stored_rows,  &
                                  tmp1, nstor)
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,  &
                                  -CONE, hvm, ubound(hvm,dim=1),  &
                                  tmp1, nstor,  &
                                  CONE, q_mat, ldq)
#endif
            endif ! useGPU
          endif  ! l_rows>0

!          if (l_rows>0) then
!            call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
!            call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
!                        tmp2, nstor, CONE, q_mat, ldq)
!          endif

          nstor = 0
        endif        
        
      enddo ! istep=1,na,nblk

      deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
       print *,"trans_ev_complex: error when deallocating hvb "//errorMessage
       stop
      endif

      if (useGPU) then
        !q_mat = q_dev
        successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, &
                                  cudaMemcpyDeviceToHost)
        check_memcpy_cuda("trans_ev", successCUDA)     
        
        deallocate(hvm1, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_complex: error when deallocating hvm1 "//errorMessage
          stop
        endif        
        
        !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
        successCUDA = cuda_free(q_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmp_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(hvm_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmat_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

      endif

      call timer%stop("trans_ev_complex" // PRECISION_SUFFIX)

    end subroutine trans_ev_complex_PRECISION