!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".



! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".


#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  USE ELPA1
  use elpa2_utilities


#ifdef HAVE_ISO_FORTRAN_ENV
  use iso_fortran_env, only : error_unit
#endif

  use elpa_pdgeqrf

#ifdef WITH_GPU_VERSION
  use cuda_routines
  use cuda_c_kernel
  use iso_c_binding
#endif
  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

  public :: bandred_real
  public :: tridiag_band_real
  public :: trans_ev_tridi_to_band_real
  public :: trans_ev_band_to_full_real

  public :: bandred_complex
  public :: tridiag_band_complex
  public :: trans_ev_tridi_to_band_complex
  public :: trans_ev_band_to_full_complex
#ifndef HAVE_ISO_FORTRAN_ENV
  integer, parameter :: error_unit = 6
#endif

  public :: band_band_real
  public :: divide_band

  integer, public :: which_qr_decomposition = 1     ! defines which QR-decomposition algorithm will be used
                                                    ! 0 for unblocked
                                                    ! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------

  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

  real*8, allocatable :: hh_trans_real(:,:)
  complex*16, allocatable :: hh_trans_complex(:,:)
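
  ! A minimal sketch of the intended lifecycle, assuming a driver that calls the
  ! band routines directly (solve_evp_real_2stage below uses exactly this order):
  !
  !   call tridiag_band_real(...)            ! allocates and fills hh_trans_real
  !   call trans_ev_tridi_to_band_real(...)  ! reads hh_trans_real
  !   deallocate(hh_trans_real)              ! the caller's responsibility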

!-------------------------------------------------------------------------------

  include 'mpif.h'


!******
contains

function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 mpi_comm_rows, mpi_comm_cols,           &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
                                 useQR) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
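!
!  A minimal usage sketch; the communicator setup and the block-cyclic
!  distribution of a and q are assumed to be done by the calling code:
!
!    success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                    mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!    if (.not. success) write(error_unit,*) 'solve_evp_real_2stage failed'
!
!  The optional arguments THIS_REAL_ELPA_KERNEL_API and useQR select a specific
!  compute kernel and the QR-based band reduction, respectively.
!-------------------------------------------------------------------------------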
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   logical, intent(in), optional :: useQR
   logical                       :: useQRActual, useQREnvironment
   integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer                       :: THIS_REAL_ELPA_KERNEL

   integer, intent(in)           :: na, nev, lda, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, mpi_comm_all
   integer, intent(in)           :: nblk
   real*8, intent(inout)         :: a(lda,*), ev(na), q(ldq,*)

   integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer                       :: nbw, num_blocks
   real*8, allocatable           :: tmat(:,:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i
   logical                       :: success
   logical, save                 :: firstCall = .true.
   logical                       :: wantDebug

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_real_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)


   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif

   success = .true.

   useQRActual = .false.

   ! set usage of qr decomposition via API call
   if (present(useQR)) then
     if (useQR) useQRActual = .true.
     if (.not.(useQR)) useQRActual = .false.
   endif

   ! overwrite this with environment variable settings
   if (qr_decomposition_via_environment_variable(useQREnvironment)) then
     useQRActual = useQREnvironment
   endif

   if (useQRActual) then
     if (mod(na,nblk) .ne. 0) then
       if (wantDebug) then
         write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
       endif
     print *, "Do not use QR-decomposition for this matrix and blocksize."
     success = .false.
     return
     endif
   endif


   if (present(THIS_REAL_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
   else

     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

   endif

   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
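   ! (non-GPU case: integer division, e.g. nblk=16 gives nbw=32, nblk=64 gives nbw=64)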

#ifdef WITH_GPU_VERSION
   nbw = nblk
#else
   nbw = (31/nblk+1)*nblk
#endif
   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                     tmat, wantDebug, success, useQRActual)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, &
                          mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q, ldq, nblk, mpi_comm_rows,  &
                    mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
     write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   deallocate(e)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows, &
                                    mpi_comm_cols, wantDebug, success, THIS_REAL_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_real)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, &
                                   mpi_comm_cols, useQRActual)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_real_2stage")
#endif
1  format(a,f10.3)

end function solve_evp_real_2stage

!-------------------------------------------------------------------------------

!-------------------------------------------------------------------------------

function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
                                    mpi_comm_rows, mpi_comm_cols,      &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

!-------------------------------------------------------------------------------
!  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
!
!  Parameters
!
!  na          Order of matrix a
!
!  nev         Number of eigenvalues needed
!
!  a(lda,*)    Distributed matrix for which eigenvalues are to be computed.
!              Distribution is like in Scalapack.
!              The full matrix must be set (not only one half like in scalapack).
!              Destroyed on exit (upper and lower half).
!
!  lda         Leading dimension of a
!
!  ev(na)      On output: eigenvalues of a, every processor gets the complete set
!
!  q(ldq,*)    On output: Eigenvectors of a
!              Distribution is like in Scalapack.
!              Must be always dimensioned to the full size (corresponding to (na,na))
!              even if only a part of the eigenvalues is needed.
!
!  ldq         Leading dimension of q
!
!  nblk        blocksize of cyclic distribution, must be the same in both directions!
!
!  mpi_comm_rows
!  mpi_comm_cols
!              MPI-Communicators for rows/columns
!  mpi_comm_all
!              MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
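!
!  Usage mirrors solve_evp_real_2stage (see the sketch there), with complex*16
!  a and q and real*8 eigenvalues ev; a hedged one-line sketch:
!
!    success = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                       mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
!-------------------------------------------------------------------------------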
#ifdef HAVE_DETAILED_TIMINGS
 use timings
#endif
   implicit none
   integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
   integer                       :: THIS_COMPLEX_ELPA_KERNEL
   integer, intent(in)           :: na, nev, lda, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
   complex*16, intent(inout)     :: a(lda,*), q(ldq,*)
   real*8, intent(inout)         :: ev(na)

   integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
   integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
   complex*16, allocatable       :: tmat(:,:,:)
   real*8, allocatable           :: q_real(:,:), e(:)
   real*8                        :: ttt0, ttt1, ttts
   integer                       :: i

   logical                       :: success, wantDebug
   logical, save                 :: firstCall = .true.

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("solve_evp_complex_2stage")
#endif
   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

   wantDebug = .false.
   if (firstCall) then
     ! are debug messages desired?
     wantDebug = debug_messages_via_environment_variable()
     firstCall = .false.
   endif


   success = .true.

   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
     ! user defined kernel via the optional argument in the API call
     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
   else
     ! if kernel is not chosen via API
     ! check whether set by environment variable
     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
   endif

   ! check whether the chosen kernel is allowed
   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

     if (my_pe == 0) then
       write(error_unit,*) " "
       write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
       write(error_unit,*) "is not in the list of the allowed kernels!"
       write(error_unit,*) " "
       write(error_unit,*) "Allowed kernels are:"
       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
         endif
       enddo

       write(error_unit,*) " "
       write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
     endif
     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
   endif
   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

   nbw = (31/nblk+1)*nblk

   num_blocks = (na-1)/nbw + 1

   allocate(tmat(nbw,nbw,num_blocks))

   ! Reduction full -> band

   ttt0 = MPI_Wtime()
   ttts = ttt0
   call bandred_complex(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success)
   if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
     call timer%stop()
#endif
     return
   endif
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time bandred_complex               :',ttt1-ttt0

   ! Reduction band -> tridiagonal

   allocate(e(na))

   ttt0 = MPI_Wtime()
   call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time tridiag_band_complex          :',ttt1-ttt0

   call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
   call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

   ttt1 = MPI_Wtime()
   time_evp_fwd = ttt1-ttts

   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
   l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

   allocate(q_real(l_rows,l_cols))

   ! Solve tridiagonal system

   ttt0 = MPI_Wtime()
   call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,1), nblk, &
                    mpi_comm_rows, mpi_comm_cols, wantDebug, success)
   if (.not.(success)) return

   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times)  &
      write(error_unit,*) 'Time solve_tridi                   :',ttt1-ttt0
   time_evp_solve = ttt1-ttt0
   ttts = ttt1

   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

   deallocate(e, q_real)

   ! Backtransform stage 1

   ttt0 = MPI_Wtime()
   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,  &
                                       mpi_comm_rows, mpi_comm_cols,&
                                       wantDebug, success,THIS_COMPLEX_ELPA_KERNEL)
   if (.not.(success)) return
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

   ! We can now deallocate the stored householder vectors
   deallocate(hh_trans_complex)

   ! Backtransform stage 2

   ttt0 = MPI_Wtime()
   call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, mpi_comm_rows, mpi_comm_cols)
   ttt1 = MPI_Wtime()
   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
   time_evp_back = ttt1-ttts

   deallocate(tmat)
#ifdef HAVE_DETAILED_TIMINGS
   call timer%stop("solve_evp_complex_2stage")
#endif

1  format(a,f10.3)

end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

subroutine bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
                        tmat, wantDebug, success, useQR)

  !-------------------------------------------------------------------------------
  !  bandred_real: Reduces a distributed symmetric matrix to band form
  !
  !  Parameters
  !
  !  na          Order of matrix
  !
  !  a(lda,*)    Distributed matrix which should be reduced.
  !              Distribution is like in Scalapack.
  !              Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
  !              a(:,:) is overwritten on exit with the band and the Householder vectors
  !              in the upper half.
  !
  !  lda         Leading dimension of a
  !
  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
  !
  !  nbw         semi bandwidth of output matrix
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI-Communicators for rows/columns
  !
  !  tmat(nbw,nbw,num_blocks)    where num_blocks = (na-1)/nbw + 1
  !              Factors for the Householder vectors (returned), needed for back transformation
  !
  !-------------------------------------------------------------------------------
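  !
  !  A hedged sketch of the expected call context (this mirrors what
  !  solve_evp_real_2stage above does; nbw and tmat are prepared by the caller):
  !
  !    num_blocks = (na-1)/nbw + 1
  !    allocate(tmat(nbw,nbw,num_blocks))
  !    call bandred_real(na, a, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols, &
  !                      tmat, wantDebug, success, useQR)
  !-------------------------------------------------------------------------------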
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#endif

#ifdef WITH_GPU_VERSION
  use cuda_routines
  use iso_c_binding
#endif

   implicit none

   integer             :: na, lda, nblk, nbw, mpi_comm_rows, mpi_comm_cols
   real*8              :: a(lda,*), tmat(nbw,nbw,*)

   integer             :: my_prow, my_pcol, np_rows, np_cols, mpierr
   integer             :: l_cols, l_rows
   integer             :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
   integer             :: istep, ncol, lch, lcx, nlc
   integer             :: tile_size, l_rows_tile, l_cols_tile

#ifdef WITH_GPU_VERSION
   real*8              :: eps
#endif

   real*8              :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)

#ifdef WITH_GPU_VERSION
   real*8, allocatable :: tmp(:), vr(:), vmr(:), umc(:)
#else

   real*8, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
#endif

   ! needed for blocked QR decomposition
   integer             :: PQRPARAM(11), work_size
   real*8              :: dwork_size(1)
   real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)

#ifdef WITH_GPU_VERSION
   integer(C_SIZE_T)   :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
   integer, external   :: numroc
   integer             :: istat, ierr
   integer             :: cur_l_rows, cur_l_cols, vmr_size, umc_size
   integer(C_SIZE_T)   :: lc_start, lc_end
   integer             :: lr_end
   integer                 :: na_rows, na_cols
#endif
   logical, intent(in) :: wantDebug
   logical, intent(out):: success

   logical, intent(in) :: useQR

#ifdef HAVE_DETAILED_TIMINGS
   call timer%start("bandred_real")
#endif
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
   success = .true.


   ! Semibandwidth nbw must be a multiple of blocksize nblk
   if (mod(nbw,nblk)/=0) then
     if (my_prow==0 .and. my_pcol==0) then
       if (wantDebug) then
         write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
         write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
       endif
       success = .false.
       return
     endif
   endif

   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above

   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide

   l_rows_tile = tile_size/np_rows ! local rows of a tile
   l_cols_tile = tile_size/np_cols ! local cols of a tile

   if (useQR) then
#ifdef WITH_GPU_VERSION
     print *,"qr decomposition at the moment not supported with GPU"
     stop
#else
     if (which_qr_decomposition == 1) then
       call qr_pqrparam_init(pqrparam,    nblk,'M',0,   nblk,'M',0,   nblk,'M',1,'s')
       allocate(tauvector(na))
       allocate(blockheuristic(nblk))
       l_rows = local_index(na, my_prow, np_rows, nblk, -1)
       allocate(vmr(max(l_rows,1),na))

       call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), tmat(1,1,1), nbw, dwork_size(1), -1, na, &
                             nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
       work_size = dwork_size(1)
       allocate(work_blocked(work_size))

       work_blocked = 0.0d0
       deallocate(vmr)
     endif
#endif
   endif

#ifdef WITH_GPU_VERSION
   na_rows = numroc(na, nblk, my_prow, 0, np_rows)
   na_cols = numroc(na, nblk, my_pcol, 0, np_cols)

   ! Allocate device memory for a, tmat and vav; the host array a is copied to the device below
   istat = cuda_malloc(a_dev, lda*na_cols*8_8)
   istat = cuda_malloc(tmat_dev, nbw*nbw*8_8)
   istat = cuda_malloc(vav_dev, nbw*nbw*8_8)

   cur_l_rows = 0
   cur_l_cols = 0

   istat = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*8_8,cudaMemcpyHostToDevice)
#endif

   do istep = (na-1)/nbw, 1, -1

     n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step

     ! Number of local columns/rows of remaining matrix
     l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
     l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)

#ifdef WITH_GPU_VERSION
     cur_l_rows = max(l_rows, 1)
     cur_l_cols = max(l_cols, 1)

     vmr_size = cur_l_rows * 2 * n_cols
     umc_size = cur_l_cols * 2 * n_cols

     ! Allocate vmr and umc only if the new size exceeds their current capacity
     ! Added for FORTRAN CALLS
     if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, 1))) then
       if (allocated(vr)) deallocate(vr)
       allocate(vr(l_rows + 1))
     endif

     if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, 1))) then
         if (allocated(vmr)) then
             deallocate(vmr)
             istat = cuda_free(vmr_dev)
         endif
         allocate(vmr(vmr_size))
         istat = cuda_malloc(vmr_dev, vmr_size*8_8)
     endif



     if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, 1))) then
         if (allocated(umc)) then
             deallocate(umc)
             istat = cuda_free(umc_dev)
         endif
         allocate(umc(umc_size))
         istat = cuda_malloc(umc_dev, umc_size*8_8)
     endif
#else
     ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces

     allocate(vmr(max(l_rows,1),2*n_cols))
     allocate(umc(max(l_cols,1),2*n_cols))

     allocate(vr(l_rows+1))
#endif

#ifdef WITH_GPU_VERSION
     vmr(1 : cur_l_rows * n_cols) = 0.
#else
     vmr(1:l_rows,1:n_cols) = 0.
#endif
     vr(:) = 0
     tmat(:,:,istep) = 0

#ifdef WITH_GPU_VERSION
     umc(1 : umc_size) = 0.

     lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
     lc_end   = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
     lr_end   = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)

     if(lc_start .le. 0) lc_start = 1

     ! Here we assume that the processor grid and the block grid are aligned
     cur_pcol = pcol(istep*nbw+1, nblk, np_cols)

     if(my_pcol == cur_pcol) then

       istat = cuda_memcpy2d(loc(a(1, lc_start)), lda*8_8, (a_dev + ((lc_start-1) * lda*8_8)), lda*8_8, &
                            lr_end*8_8, (lc_end - lc_start+1), cudaMemcpyDeviceToHost)
     endif
#endif

     ! Reduce current block to lower triangular form

     if (useQR) then
       if (which_qr_decomposition == 1) then
         call qr_pdgeqrf_2dcomm(a, lda, vmr, max(l_rows,1), tauvector(1), &
                                  tmat(1,1,istep), nbw, work_blocked,       &
                                  work_size, na, n_cols, nblk, nblk,        &
                                  istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
                                  0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
                                  blockheuristic)
       endif
     else

       do lc = n_cols, 1, -1

         ncol = istep*nbw + lc ! absolute column number of householder vector
         nrow = ncol - nbw ! Absolute number of pivot row

         lr  = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
         lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number

         tau = 0

         if (nrow == 1) exit ! Nothing to do

         cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block

         if (my_pcol==cur_pcol) then

           ! Get vector to be transformed; distribute last element and norm of
           ! remaining elements to all procs in current column

           vr(1:lr) = a(1:lr,lch) ! vector to be transformed

           if (my_prow==prow(nrow, nblk, np_rows)) then
             aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
             aux1(2) = vr(lr)
           else
             aux1(1) = dot_product(vr(1:lr),vr(1:lr))
             aux1(2) = 0.
           endif

           call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

           vnorm2 = aux2(1)
           vrl    = aux2(2)

           ! Householder transformation

           call hh_transform_real(vrl, vnorm2, xf, tau)

           ! Scale vr and store Householder vector for back transformation

           vr(1:lr) = vr(1:lr) * xf
           if (my_prow==prow(nrow, nblk, np_rows)) then
             a(1:lr-1,lch) = vr(1:lr-1)
             a(lr,lch) = vrl
             vr(lr) = 1.
           else
             a(1:lr,lch) = vr(1:lr)
           endif

         endif

         ! Broadcast Householder vector and tau along columns

         vr(lr+1) = tau
         call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#ifdef WITH_GPU_VERSION
         vmr(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
#else
         vmr(1:lr,lc) = vr(1:lr)
#endif
         tau = vr(lr+1)
         tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat

         ! Transform remaining columns in current block with Householder vector
         ! Local dot product

         aux1 = 0

         nlc = 0 ! number of local columns
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
           endif
         enddo

         ! Get global dot products
         if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)

         ! Transform

         nlc = 0
         do j=1,lc-1
           lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
           if (lcx>0) then
             nlc = nlc+1
             a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
           endif
         enddo

       enddo

#ifdef WITH_GPU_VERSION
      ! store column tiles back to GPU
      cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
      if (my_pcol == cur_pcol) then
        istat = cuda_memcpy2d((a_dev+((lc_start-1)*lda*8_8)), lda*8_8, loc(a(1, lc_start)), lda*8_8,  lr_end*8_8, &
                                  (lc_end - lc_start+1),cudaMemcpyHostToDevice)

      endif
#endif
       ! Calculate scalar products of stored Householder vectors.
       ! This can be done in different ways, we use dsyrk

       vav = 0
#ifdef WITH_GPU_VERSION
       if (l_rows>0) &
         call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,cur_l_rows,0.d0,vav,ubound(vav,1))
#else
       if (l_rows>0) &
           call dsyrk('U','T',n_cols,l_rows,1.d0,vmr,ubound(vmr,1),0.d0,vav,ubound(vav,1))
#endif
       call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_rows)

       ! Calculate triangular matrix T for block Householder Transformation

       do lc=n_cols,1,-1
         tau = tmat(lc,lc,istep)
         if (lc<n_cols) then
           call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,1),vav(lc+1,lc),1)
           tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
         endif
       enddo
     endif

    ! Transpose vmr -> vmc (stored in umc, second half)
#ifdef WITH_GPU_VERSION
    call elpa_transpose_vectors  (vmr, cur_l_rows, mpi_comm_rows, &
                                    umc(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)
#else
    call elpa_transpose_vectors  (vmr, ubound(vmr,1), mpi_comm_rows, &
                                    umc(1,n_cols+1), ubound(umc,1), mpi_comm_cols, &
                                    1, istep*nbw, n_cols, nblk)
#endif

    ! Calculate umc = A**T * vmr
    ! Note that the distributed A has to be transposed
    ! Opposed to direct tridiagonalization there is no need to use the cache locality
    ! of the tiles, so we can use strips of the matrix
#ifdef WITH_GPU_VERSION
    umc(1 : l_cols * n_cols) = 0.d0
    vmr(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0
#else
    umc(1:l_cols,1:n_cols) = 0.d0
    vmr(1:l_rows,n_cols+1:2*n_cols) = 0
#endif
    if (l_cols>0 .and. l_rows>0) then

#ifdef WITH_GPU_VERSION
      ierr = cuda_memcpy(vmr_dev, loc(vmr(1)), vmr_size*8_8,cudaMemcpyHostToDevice)
      ierr = cuda_memcpy(umc_dev, loc(umc(1)), umc_size*8_8,cudaMemcpyHostToDevice)
#endif
      do i=0,(istep*nbw-1)/tile_size

        lcs = i*l_cols_tile+1
        lce = min(l_cols,(i+1)*l_cols_tile)
        if (lce<lcs) cycle

        lre = min(l_rows,(i+1)*l_rows_tile)
#ifdef WITH_GPU_VERSION
        call cublas_dgemm('T','N',lce-lcs+1,n_cols,lre, &
                        1.d0, (a_dev + ((lcs-1)*lda*8_8)), lda, vmr_dev,cur_l_rows, &
                        1.d0, (umc_dev+ (lcs-1)*8_8), cur_l_cols)

        if(i==0) cycle
        lre = min(l_rows,i*l_rows_tile)

        call cublas_dgemm('N','N',lre,n_cols,lce-lcs+1,&
                   1.d0, (a_dev+ ((lcs-1)*lda*8_8)),lda,(umc_dev+(cur_l_cols * n_cols+lcs-1)*8_8), cur_l_cols, &
                   1.d0, (vmr_dev+(cur_l_rows * n_cols)*8_8), cur_l_rows)
#else
        call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,1), &
                     vmr,ubound(vmr,1),1.d0,umc(lcs,1),ubound(umc,1))

        if (i==0) cycle
        lre = min(l_rows,i*l_rows_tile)
        call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
                     umc(lcs,n_cols+1),ubound(umc,1),1.d0,vmr(1,n_cols+1),ubound(vmr,1))
#endif
      enddo
#ifdef WITH_GPU_VERSION
      ierr = cuda_memcpy(loc(vmr(1)), vmr_dev,vmr_size*8_8,cudaMemcpyDeviceToHost)
      ierr = cuda_memcpy(loc(umc(1)), umc_dev, umc_size*8_8,cudaMemcpyDeviceToHost)
#endif
    endif

    ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
    ! on the processors containing the diagonal
    ! This is only necessary if ur has been calculated, i.e. if the
    ! global tile size is smaller than the global remaining matrix

    if (tile_size < istep*nbw) then
#ifdef WITH_GPU_VERSION
       call elpa_reduce_add_vectors  (vmr(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
                                        umc, cur_l_cols, mpi_comm_cols, &
                                        istep*nbw, n_cols, nblk)
#else
       call elpa_reduce_add_vectors  (vmr(1,n_cols+1),ubound(vmr,1),mpi_comm_rows, &
                                      umc, ubound(umc,1), mpi_comm_cols, &
                                      istep*nbw, n_cols, nblk)
#endif
    endif

    if (l_cols>0) then
#ifdef WITH_GPU_VERSION
      allocate(tmp(l_cols * n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,ierr)
      umc(1 : l_cols * n_cols) = tmp(1 : l_cols * n_cols)
#else
      allocate(tmp(l_cols,n_cols))
      call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
      umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
#endif
      deallocate(tmp)
    endif

    ! U = U * Tmat**T
#ifdef WITH_GPU_VERSION
    ierr = cuda_memcpy(umc_dev, loc(umc(1)), umc_size*8_8, cudaMemcpyHostToDevice)
    istat = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*8_8,cudaMemcpyHostToDevice)
    call cublas_dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols, &
                  1.d0, tmat_dev,nbw,umc_dev,cur_l_cols)

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T


    call cublas_dgemm('T','N',n_cols,n_cols,l_cols, &
                  1.d0, umc_dev,cur_l_cols,(umc_dev+(cur_l_cols * n_cols )*8_8),cur_l_cols, &
                  0.d0, vav_dev,nbw)

    call cublas_dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols, &
                   1.d0, tmat_dev,nbw, vav_dev, nbw)


    istat = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*8_8, cudaMemcpyDeviceToHost)

    call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_cols)

    istat = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*8_8,cudaMemcpyHostToDevice)
#else
    call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),umc,ubound(umc,1))

    ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T

    call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umc,ubound(umc,1),umc(1,n_cols+1),ubound(umc,1),0.d0,vav,ubound(vav,1))
    call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,1),vav,ubound(vav,1))

    call symm_matrix_allreduce(n_cols,vav,ubound(vav,1),mpi_comm_cols)
#endif

    ! U = U - 0.5 * V * VAV
#ifdef WITH_GPU_VERSION
    call cublas_dgemm('N','N',l_cols,n_cols,n_cols,&
                    -0.5d0, (umc_dev+(cur_l_cols * n_cols )*8_8),cur_l_cols, vav_dev,nbw,&
                     1.0d0, umc_dev,cur_l_cols)

    istat = cuda_memcpy(loc(umc(1)), umc_dev, umc_size*8_8, cudaMemcpyDeviceToHost)

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors  (umc, cur_l_cols, mpi_comm_cols, &
                                   vmr(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)
    ierr = cuda_memcpy(vmr_dev, loc(vmr(1)), vmr_size*8_8, cudaMemcpyHostToDevice)
    ierr = cuda_memcpy(umc_dev, loc(umc(1)), umc_size*8_8, cudaMemcpyHostToDevice)
#else
    call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umc(1,n_cols+1),ubound(umc,1),vav,ubound(vav,1),1.d0,umc,ubound(umc,1))

    ! Transpose umc -> umr (stored in vmr, second half)

    call elpa_transpose_vectors  (umc, ubound(umc,1), mpi_comm_cols, &
                                   vmr(1,n_cols+1), ubound(vmr,1), mpi_comm_rows, &
                                   1, istep*nbw, n_cols, nblk)
#endif

    ! A = A - V*U**T - U*V**T

    do i=0,(istep*nbw-1)/tile_size
      lcs = i*l_cols_tile+1
      lce = min(l_cols,(i+1)*l_cols_tile)
      lre = min(l_rows,(i+1)*l_rows_tile)
      if (lce<lcs .or. lre<1) cycle
#ifdef WITH_GPU_VERSION
      call cublas_dgemm('N', 'T', lre, lce-lcs+1, 2*n_cols, -1.d0, &
                 vmr_dev,cur_l_rows,(umc_dev +(lcs-1)*8_8),cur_l_cols, &
                 1.d0,(a_dev+(lcs-1)*lda*8_8),lda)
#else