elpa2_template.F90 23.6 KB
Newer Older
1
!    This file is part of ELPA.
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
 function elpa_solve_evp_&
  &MATH_DATATYPE&
  &_&
  &2stage_&
  &PRECISION&
57
  &_impl (obj, a, ev, q) result(success)
58

59
   use elpa_abstract_impl
60
   use elpa_utilities
61 62 63 64 65 66 67
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use iso_c_binding
   implicit none
Andreas Marek's avatar
Andreas Marek committed
68 69
   class(elpa_abstract_impl_t), intent(inout)                         :: obj
   logical                                                            :: useGPU
70
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
71 72
   logical                                                            :: useQR
   logical                                                            :: useQRActual
Andreas Marek's avatar
Andreas Marek committed
73
#endif
Andreas Marek's avatar
Andreas Marek committed
74
   integer(kind=c_int)                                                :: kernel
75 76

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
77
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,*)
Andreas Marek's avatar
Andreas Marek committed
78
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
79
#else
Andreas Marek's avatar
Andreas Marek committed
80
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,obj%local_ncols)
Andreas Marek's avatar
Andreas Marek committed
81
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
82
#endif
Andreas Marek's avatar
Andreas Marek committed
83 84
   real(kind=C_DATATYPE_KIND), intent(inout)                          :: ev(obj%na)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: hh_trans(:,:)
85

Andreas Marek's avatar
Andreas Marek committed
86
   integer(kind=c_int)                                                :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
Andreas Marek's avatar
Andreas Marek committed
87 88 89 90
   integer(kind=c_int)                                                :: nbw, num_blocks
#if COMPLEXCASE == 1
   integer(kind=c_int)                                                :: l_cols_nev, l_rows, l_cols
#endif
Andreas Marek's avatar
Andreas Marek committed
91 92
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: tmat(:,:,:)
   real(kind=C_DATATYPE_KIND), allocatable                            :: e(:)
93
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
94
   real(kind=C_DATATYPE_KIND), allocatable                            :: q_real(:,:)
95
#endif
Andreas Marek's avatar
Andreas Marek committed
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target           :: q_dummy(:,:)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer                       :: q_actual(:,:)


   integer(kind=c_intptr_t)                                           :: tmat_dev, q_dev, a_dev

   integer(kind=c_int)                                                :: i
   logical                                                            :: success, successCUDA
   logical                                                            :: wantDebug
   integer(kind=c_int)                                                :: istat, gpu, debug, qr
   character(200)                                                     :: errorMessage
   logical                                                            :: do_useGPU, do_useGPU_trans_ev_tridi
   integer(kind=c_int)                                                :: numberOfGPUDevices
   integer(kind=c_intptr_t), parameter                                :: size_of_datatype = size_of_&
                                                                                            &PRECISION&
                                                                                            &_&
                                                                                            &MATH_DATATYPE
    integer(kind=ik)                                                  :: na, nev, lda, ldq, nblk, matrixCols, &
                                                                         mpi_comm_rows, mpi_comm_cols,        &
Andreas Marek's avatar
Andreas Marek committed
115
                                                                         mpi_comm_all, check_pd, error
Andreas Marek's avatar
Andreas Marek committed
116 117 118

    logical                                                           :: do_bandred, do_tridiag, do_solve_tridi,  &
                                                                         do_trans_to_band, do_trans_to_full
119

120
    call obj%timer%start("elpa_solve_evp_&
121
    &MATH_DATATYPE&
122 123 124
    &_2stage_&
    &PRECISION&
    &")
125

Andreas Marek's avatar
Andreas Marek committed
126 127
    success = .true.

Andreas Marek's avatar
Andreas Marek committed
128
    if (present(q)) then
129
      obj%eigenvalues_only = .false.
Andreas Marek's avatar
Andreas Marek committed
130
    else
131
      obj%eigenvalues_only = .true.
Andreas Marek's avatar
Andreas Marek committed
132 133
    endif

134 135 136 137 138 139 140
    na         = obj%na
    nev        = obj%nev
    lda        = obj%local_nrows
    ldq        = obj%local_nrows
    nblk       = obj%nblk
    matrixCols = obj%local_ncols

Andreas Marek's avatar
Andreas Marek committed
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
   ! special case na = 1
   if (na .eq. 1) then
#if REALCASE == 1
     ev(1) = a(1,1)
     if (.not.(obj%eigenvalues_only)) then
       q(1,1) = CONST_REAL_1_0
     endif
#endif
#if COMPLEXCASE == 1
     ev(1) = real(a(1,1))
     if (.not.(obj%eigenvalues_only)) then
       q(1,1) = CONST_COMPLEX_PAIR_1_0
     endif
#endif
     call obj%timer%stop("elpa_solve_evp_&
     &MATH_DATATYPE&
     &_2stage_&
     &PRECISION&
     &")
     return
   endif

163 164 165 166 167
   if (nev == 0) then
     nev = 1
     obj%eigenvalues_only = .true.
   endif

168
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
169 170 171 172 173
    call obj%get("real_kernel",kernel,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
174
    ! check consistency between request for GPUs and defined kernel
Andreas Marek's avatar
Andreas Marek committed
175 176 177 178 179
    call obj%get("gpu", gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
180
    if (gpu == 1) then
181 182
      if (kernel .ne. ELPA_2STAGE_REAL_GPU) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
183
        write(error_unit,*) "The compute kernel will be executed on CPUs!"
184
      else if (nblk .ne. 128) then
185 186 187
        kernel = ELPA_2STAGE_REAL_GENERIC
      endif
    endif
188 189 190 191 192
    if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
      if (gpu .ne. 1) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
      endif
    endif
193 194 195 196 197 198 199 200 201 202

#ifdef SINGLE_PRECISION_REAL
    ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
203 204 205 206 207 208 209 210
    ! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
211 212
#endif

213 214 215
#endif

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
216 217 218 219 220
    call obj%get("complex_kernel",kernel,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
221
    ! check consistency between request for GPUs and defined kernel
Andreas Marek's avatar
Andreas Marek committed
222 223 224 225 226
    call obj%get("gpu", gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
227
    if (gpu == 1) then
228 229
      if (kernel .ne. ELPA_2STAGE_COMPLEX_GPU) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
230
        write(error_unit,*) "The compute kernel will be executed on CPUs!"
231
      else if (nblk .ne. 128) then
232 233 234
        kernel = ELPA_2STAGE_COMPLEX_GENERIC
      endif
    endif
235 236 237 238 239 240
    if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
      if (gpu .ne. 1) then
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
      endif
    endif

241
#endif
Andreas Marek's avatar
Andreas Marek committed
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
    call obj%get("mpi_comm_rows",mpi_comm_rows,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_cols",mpi_comm_cols,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_parent",mpi_comm_all,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
257

258
    if (gpu .eq. 1) then
259 260 261 262 263 264
      useGPU = .true.
    else
      useGPU = .false.
    endif

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
265 266 267 268 269
    call obj%get("qr",qr,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
270
    if (qr .eq. 1) then
271 272 273 274 275 276
      useQR = .true.
    else
      useQR = .false.
    endif

#endif
277
    call obj%timer%start("mpi_communication")
278 279 280 281 282 283 284
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
285
    call obj%timer%stop("mpi_communication")
286

Andreas Marek's avatar
Andreas Marek committed
287 288 289 290 291
    call obj%get("debug",debug,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
292
    wantDebug = debug == 1
293 294 295 296 297 298 299 300

    do_useGPU      = .false.
    do_useGPU_trans_ev_tridi =.false.


#if REALCASE == 1
    useQRActual = .false.
    ! set usage of qr decomposition via API call
301 302
    if (useQR) useQRActual = .true.
    if (.not.(useQR)) useQRACtual = .false.
303 304 305 306 307 308 309 310 311 312 313 314 315

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
#endif /* REALCASE */

316 317
    if (useGPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
318

319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
         do_useGPU = .true.

         ! set the neccessary parameters
         cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
         cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
         cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
         cudaHostRegisterPortable = cuda_hostRegisterPortable()
         cudaHostRegisterMapped   = cuda_hostRegisterMapped()
      else
        print *,"GPUs are requested but not detected! Aborting..."
        success = .false.
        return
      endif
    else
      ! check whether set by environment variable
Andreas Marek's avatar
Andreas Marek committed
334 335 336 337 338
      call obj%get("gpu",gpu,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
339
      do_useGPU = gpu == 1
340 341
      if (do_useGPU) then
        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
342 343 344 345 346 347 348 349 350 351 352 353 354

           ! set the neccessary parameters
           cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
           cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
           cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
           cudaHostRegisterPortable = cuda_hostRegisterPortable()
           cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        else
          print *,"GPUs are requested but not detected! Aborting..."
          success = .false.
          return
        endif
      endif
355
    endif
356 357 358

    ! check consistency between request for GPUs and defined kernel
    if (do_useGPU) then
359 360 361 362 363
      if (nblk .ne. 128) then
        ! cannot run on GPU with this blocksize
        ! disable GPU usage for trans_ev_tridi
        do_useGPU_trans_ev_tridi = .false.
      else
364 365 366 367 368 369 370
#if REALCASE == 1
        if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#endif
#if COMPLEXCASE == 1
        if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
          do_useGPU_trans_ev_tridi = .true.
Andreas Marek's avatar
Andreas Marek committed
371
        else
372
          do_useGPU_trans_ev_tridi = .false.
Andreas Marek's avatar
Andreas Marek committed
373
        endif
374 375
      endif
    endif
376

377

Andreas Marek's avatar
Andreas Marek committed
378

379
    if (.not. obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
380 381 382 383 384 385
      q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
    else
     allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
     q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
    endif

386 387 388 389 390 391 392 393 394 395 396 397 398

    ! set the default values for each of the 5 compute steps
    do_bandred        = .true.
    do_tridiag        = .true.
    do_solve_tridi    = .true.
    do_trans_to_band  = .true.
    do_trans_to_full  = .true.

    if (obj%eigenvalues_only) then
      do_trans_to_band  = .false.
      do_trans_to_full  = .false.
    endif

Andreas Marek's avatar
Andreas Marek committed
399
    if (obj%is_set("bandwidth") == 1) then
Andreas Marek's avatar
Andreas Marek committed
400
      call obj%get("bandwidth",nbw,error)
401
      if (nbw == 0) then
402
        if (wantDebug) then
403
          write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
Andreas Marek's avatar
Andreas Marek committed
404 405
                              "for a diagonal matrix! This is too simple"
          endif
406
        print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
Andreas Marek's avatar
Andreas Marek committed
407
                 "for a diagonal matrix! This is too simple"
408 409 410
        success = .false.
        return
      endif
411 412
      if (mod(nbw, nblk) .ne. 0) then
        ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth
Andreas Marek's avatar
Andreas Marek committed
413
        ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA
414 415 416
        nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double))

        ! just check that effective bandwidth is NOT larger than matrix size
Andreas Marek's avatar
Andreas Marek committed
417
        if (nbw .gt. na) then
418 419
          if (wantDebug) then
            write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
Andreas Marek's avatar
Andreas Marek committed
420 421 422
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
          endif
423
          print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
Andreas Marek's avatar
Andreas Marek committed
424 425
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
426 427
          success = .false.
          return
Andreas Marek's avatar
Andreas Marek committed
428
        endif
429
      endif
430 431 432 433 434
      do_bandred       = .false. ! we already have a banded matrix
      do_solve_tridi   = .true.  ! we also have to solve something :-)
      do_trans_to_band = .true.  ! and still we have to backsub to banded
      do_trans_to_full = .false. ! but not to full since we have a banded matrix
    else ! bandwidth is not set
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455

      ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
      ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
      ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
      ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
      ! on this and maybe allow a run-time optimization here
      if (do_useGPU) then
        nbw = nblk
      else
#if REALCASE == 1
        nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
        nbw = (31/nblk+1)*nblk
#endif
      endif

      num_blocks = (na-1)/nbw + 1

      allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_evp_&
456 457 458 459
        &MATH_DATATYPE&
        &_2stage_&
        &PRECISION&
        &" // ": error when allocating tmat "//errorMessage
460 461 462
        stop 1
      endif

463 464 465 466 467 468 469 470 471 472
      do_bandred       = .true.
      do_solve_tridi   = .true.
      do_trans_to_band = .true.
      do_trans_to_full = .true.
    end if  ! matrix not already banded on input

    ! start the computations in 5 steps

    if (do_bandred) then
      call obj%timer%start("bandred")
473 474 475 476 477
      ! Reduction full -> band
      call bandred_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
478
      (obj, na, a, &
479 480
      a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
      tmat_dev,  wantDebug, do_useGPU, success &
481
#if REALCASE == 1
482
      , useQRActual &
483
#endif
484 485
      )
      call obj%timer%stop("bandred")
486
      if (.not.(success)) return
487 488
    endif

489 490

     ! Reduction band -> tridiagonal
491 492 493 494 495 496 497 498 499
     if (do_tridiag) then
       allocate(e(na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when allocating e "//errorMessage
         stop 1
       endif
500

501 502
       call obj%timer%start("tridiag")
       call tridiag_band_&
503
       &MATH_DATATYPE&
504 505
       &_&
       &PRECISION&
506 507
       (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
        do_useGPU, wantDebug)
508 509

#ifdef WITH_MPI
510 511 512 513
       call obj%timer%start("mpi_communication")
       call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call obj%timer%stop("mpi_communication")
514
#endif /* WITH_MPI */
515 516
       call obj%timer%stop("tridiag")
     endif ! do_tridiag
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532

#if COMPLEXCASE == 1
     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

     allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when allocating q_real"//errorMessage
       stop 1
     endif
#endif

     ! Solve tridiagonal system
533 534 535 536 537
     if (do_solve_tridi) then
       call obj%timer%start("solve")
       call solve_tridi_&
       &PRECISION &
       (obj, na, nev, ev, e, &
538
#if REALCASE == 1
539
       q_actual, ldq,   &
540 541
#endif
#if COMPLEXCASE == 1
542
       q_real, ubound(q_real,dim=1), &
543
#endif
544
       nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU, wantDebug, success)
545 546 547
       call obj%timer%stop("solve")
       if (.not.(success)) return
     endif ! do_solve_tridi
548 549 550 551 552 553 554 555 556

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating e "//errorMessage
       stop 1
     endif

557
     if (obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
558 559 560 561
       do_trans_to_band = .false.
       do_trans_to_full = .false.
     else

Andreas Marek's avatar
Andreas Marek committed
562 563 564 565 566
       call obj%get("check_pd",check_pd,error)
       if (error .ne. ELPA_OK) then
         print *,"Problem getting option. Aborting..."
         stop
       endif
Andreas Marek's avatar
Andreas Marek committed
567 568 569 570 571 572 573 574 575 576 577
       if (check_pd .eq. 1) then
         check_pd = 0
         do i = 1, na
           if (ev(i) .gt. THRESHOLD) then
             check_pd = check_pd + 1
           endif
         enddo
         if (check_pd .lt. na) then
           ! not positiv definite => eigenvectors needed
           do_trans_to_band = .true.
           do_trans_to_full = .true.
Andreas Marek's avatar
Andreas Marek committed
578
         else
Andreas Marek's avatar
Andreas Marek committed
579 580 581 582 583
           do_trans_to_band = .false.
           do_trans_to_full = .false.
         endif
       endif
     endif ! eigenvalues only
584

585
     if (do_trans_to_band) then
586
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
587
       ! q must be given thats why from here on we can use q and not q_actual
588

Andreas Marek's avatar
Andreas Marek committed
589
       q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
590

Andreas Marek's avatar
Andreas Marek committed
591 592 593 594 595 596 597 598
       deallocate(q_real, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage: error when deallocating q_real"//errorMessage
         stop 1
       endif
#endif
Andreas Marek's avatar
Andreas Marek committed
599

Andreas Marek's avatar
Andreas Marek committed
600 601
       ! Backtransform stage 1
       call obj%timer%start("trans_ev_to_band")
602

Andreas Marek's avatar
Andreas Marek committed
603
       call trans_ev_tridi_to_band_&
604
       &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
605 606 607 608 609 610 611
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, q, &
       q_dev, &
       ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi, &
       success=success, kernel=kernel)
       call obj%timer%stop("trans_ev_to_band")
612

Andreas Marek's avatar
Andreas Marek committed
613
       if (.not.(success)) return
614

Andreas Marek's avatar
Andreas Marek committed
615 616 617 618 619 620 621 622
       ! We can now deallocate the stored householder vectors
       deallocate(hh_trans, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *, "solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating hh_trans "//errorMessage
         stop 1
623
       endif
624
     endif ! do_trans_to_band
625

626
     if (do_trans_to_full) then
Andreas Marek's avatar
Andreas Marek committed
627
       call obj%timer%start("trans_ev_to_full")
628 629 630
       if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi) ) then
         ! copy to device if we want to continue on GPU
         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
631

632 633
         successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
       endif
Andreas Marek's avatar
Andreas Marek committed
634

635
       ! Backtransform stage 2
Andreas Marek's avatar
Andreas Marek committed
636

637 638 639 640 641 642 643 644
       call trans_ev_band_to_full_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, a, &
       a_dev, lda, tmat, tmat_dev,  q,  &
       q_dev, &
       ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
645
#if REALCASE == 1
646
       , useQRActual  &
647
#endif
648
       )
649

650

651 652 653 654 655 656 657 658
       deallocate(tmat, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating tmat"//errorMessage
         stop 1
       endif
Andreas Marek's avatar
Andreas Marek committed
659
       call obj%timer%stop("trans_ev_to_full")
660
     endif ! do_trans_to_full
Andreas Marek's avatar
Andreas Marek committed
661

662
     if (obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
663
       deallocate(q_dummy, stat=istat, errmsg=errorMessage)
664 665
       if (istat .ne. 0) then
         print *,"solve_evp_&
Andreas Marek's avatar
Andreas Marek committed
666 667 668 669
         &MATH_DATATYPE&
         &_1stage_&
         &PRECISION&
         &" // ": error when deallocating q_dummy "//errorMessage
670 671 672 673
         stop 1
       endif
     endif

674
     call obj%timer%stop("elpa_solve_evp_&
675
     &MATH_DATATYPE&
676 677 678
     &_2stage_&
    &PRECISION&
    &")
679 680 681 682 683 684
1    format(a,f10.3)

   end function elpa_solve_evp_&
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION&
685
   &_impl
686 687

! vim: syntax=fortran