!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
 !> Two-stage (ELPA2) eigensolver driver template.
 !>
 !> Solves the standard eigenvalue problem A x = lambda x for a distributed
 !> (block-cyclic) matrix in five steps:
 !>   1. bandred:            full    -> banded      (skipped if input is already banded)
 !>   2. tridiag_band:       banded  -> tridiagonal
 !>   3. solve_tridi:        tridiagonal eigenproblem (divide & conquer)
 !>   4. trans_ev_tridi_to_band: back-transform eigenvectors tridi -> banded
 !>   5. trans_ev_band_to_full:  back-transform eigenvectors banded -> full
 !>
 !> This file is a template: MATH_DATATYPE, PRECISION and the REALCASE/COMPLEXCASE
 !> preprocessor symbols are substituted at build time to generate the real/complex,
 !> single/double instantiations.
 !>
 !> \param obj  ELPA object carrying matrix layout, communicators and runtime options
 !> \param a    on entry the matrix to diagonalize; destroyed on exit
 !> \param ev   on exit the eigenvalues (always real, ascending)
 !> \param q    optional; on exit the eigenvectors. If absent, only eigenvalues
 !>             are computed (back-transformation steps are skipped)
 !> \result success  .true. on success, .false. otherwise
 function elpa_solve_evp_&
  &MATH_DATATYPE&
  &_&
  &2stage_&
  &PRECISION&
  &_impl (obj, a, ev, q) result(success)

   use elpa_abstract_impl
   use elpa_utilities
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
   use elpa_omp

   use iso_c_binding
   implicit none
#include "../general/precision_kinds.F90"
   class(elpa_abstract_impl_t), intent(inout)                         :: obj
   logical                                                            :: useGPU
#if REALCASE == 1
   logical                                                            :: useQR
   logical                                                            :: useQRActual
#endif
   integer(kind=c_int)                                                :: kernel

#ifdef USE_ASSUMED_SIZE
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,*)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
#else
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,obj%local_ncols)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
#endif
   real(kind=C_DATATYPE_KIND), intent(inout)                          :: ev(obj%na)
   ! Householder vectors produced by tridiag_band, consumed by trans_ev_tridi_to_band
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: hh_trans(:,:)

   integer(kind=c_int)                                                :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int)                                                :: nbw, num_blocks
#if COMPLEXCASE == 1
   integer(kind=c_int)                                                :: l_cols_nev, l_rows, l_cols
#endif
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: tmat(:,:,:)
   real(kind=C_DATATYPE_KIND), allocatable                            :: e(:)
#if COMPLEXCASE == 1
   ! the tridiagonal solver works in real arithmetic also for the complex case
   real(kind=C_DATATYPE_KIND), allocatable                            :: q_real(:,:)
#endif
   ! scratch eigenvector storage for the eigenvalues-only path (q absent)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target           :: q_dummy(:,:)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer                       :: q_actual(:,:)


   ! device addresses (opaque CUDA pointers)
   integer(kind=c_intptr_t)                                           :: tmat_dev, q_dev, a_dev

   integer(kind=c_int)                                                :: i
   logical                                                            :: success, successCUDA
   logical                                                            :: wantDebug
   integer(kind=c_int)                                                :: istat, gpu, debug, qr
   character(200)                                                     :: errorMessage
   ! per-step GPU switches; each may be disabled individually via runtime options
   logical                                                            :: do_useGPU, do_useGPU_bandred, &
                                                                         do_useGPU_tridiag_band, do_useGPU_solve_tridi, &
                                                                         do_useGPU_trans_ev_tridi_to_band, &
                                                                         do_useGPU_trans_ev_band_to_full
   integer(kind=c_int)                                                :: numberOfGPUDevices
   integer(kind=c_intptr_t), parameter                                :: size_of_datatype = size_of_&
                                                                                            &PRECISION&
                                                                                            &_&
                                                                                            &MATH_DATATYPE
    integer(kind=ik)                                                  :: na, nev, lda, ldq, nblk, matrixCols, &
                                                                         mpi_comm_rows, mpi_comm_cols,        &
                                                                         mpi_comm_all, check_pd, error

    ! which of the five compute steps are to be executed
    logical                                                           :: do_bandred, do_tridiag, do_solve_tridi,  &
                                                                         do_trans_to_band, do_trans_to_full

    integer(kind=ik)                                                  :: nrThreads

! select the kernel-related constants for the instantiated (real/complex) case
#if REALCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "real_kernel"
#endif
#if COMPLEXCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
! TODO blas complex kernel
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "complex_kernel"
#endif

    call obj%timer%start("elpa_solve_evp_&
    &MATH_DATATYPE&
    &_2stage_&
    &PRECISION&
    &")

#ifdef WITH_OPENMP
    ! store the number of OpenMP threads used in the calling function
    ! restore this at the end of ELPA 2
    omp_threads_caller = omp_get_max_threads()

    ! check the number of threads that ELPA should use internally
    call obj%get("omp_threads",nrThreads,error)
    call omp_set_num_threads(nrThreads)
#else
    nrThreads = 1
#endif

    success = .true.

    if (present(q)) then
      obj%eigenvalues_only = .false.
    else
      obj%eigenvalues_only = .true.
    endif

    na         = obj%na
    nev        = obj%nev
    lda        = obj%local_nrows
    ldq        = obj%local_nrows
    nblk       = obj%nblk
    matrixCols = obj%local_ncols

    call obj%get("mpi_comm_rows",mpi_comm_rows,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_cols",mpi_comm_cols,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_parent",mpi_comm_all,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif

    call obj%timer%start("mpi_communication")
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
    call obj%timer%stop("mpi_communication")

   ! special case na = 1: the 1x1 problem is solved trivially, skip everything
   if (na .eq. 1) then
#if REALCASE == 1
     ev(1) = a(1,1)
#endif
#if COMPLEXCASE == 1
     ev(1) = real(a(1,1))
#endif
     if (.not.(obj%eigenvalues_only)) then
       q(1,1) = ONE
     endif

     ! restore original OpenMP settings
#ifdef WITH_OPENMP
     ! store the number of OpenMP threads used in the calling function
     ! restore this at the end of ELPA 2
     call omp_set_num_threads(omp_threads_caller)
#endif

     call obj%timer%stop("elpa_solve_evp_&
     &MATH_DATATYPE&
     &_2stage_&
     &PRECISION&
     &")
     return
   endif

   ! nev == 0 is treated as "eigenvalues only"; internally still compute one
   if (nev == 0) then
     nev = 1
     obj%eigenvalues_only = .true.
   endif

    call obj%get(KERNEL_STRING,kernel,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif

    ! GPU settings
    call obj%get("gpu", gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif

    useGPU = (gpu == 1)

    do_useGPU = .false.
    if (useGPU) then
      call obj%timer%start("check_for_gpu")
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then

         do_useGPU = .true.

         ! set the neccessary parameters
         cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
         cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
         cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
         cudaHostRegisterPortable = cuda_hostRegisterPortable()
         cudaHostRegisterMapped   = cuda_hostRegisterMapped()
      else
        print *,"GPUs are requested but not detected! Aborting..."
        success = .false.
        return
      endif
      call obj%timer%stop("check_for_gpu")
    endif

    do_useGPU_bandred = do_useGPU
    do_useGPU_tridiag_band = do_useGPU
    do_useGPU_solve_tridi = do_useGPU
    do_useGPU_trans_ev_tridi_to_band = do_useGPU
    do_useGPU_trans_ev_band_to_full = do_useGPU

    ! only if we want (and can) use GPU in general, look what are the
    ! requirements for individual routines. Implicitly they are all set to 1, so
    ! unles specified otherwise by the user, GPU versions of all individual
    ! routines should be used
    if(do_useGPU) then
      call obj%get("gpu_bandred", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_bandred = (gpu == 1)

      call obj%get("gpu_tridiag_band", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_tridiag_band = (gpu == 1)

      call obj%get("gpu_solve_tridi", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_solve_tridi = (gpu == 1)

      call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_trans_ev_tridi_to_band = (gpu == 1)

      call obj%get("gpu_trans_ev_band_to_full", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_trans_ev_band_to_full = (gpu == 1)
    endif

    ! check consistency between request for GPUs and defined kernel

    if (do_useGPU_trans_ev_tridi_to_band) then
      if (kernel .ne. BLAS_KERNEL) then
        if (kernel .ne. GPU_KERNEL) then
          write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
          write(error_unit,*) "The compute kernel will be executed on CPUs!"
          do_useGPU_trans_ev_tridi_to_band = .false.
        else if (nblk .ne. 128) then
          write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
          write(error_unit,*) "The compute kernel will be executed on CPUs!"
          do_useGPU_trans_ev_tridi_to_band = .false.
          kernel = GENERIC_KERNEL
        endif
      endif   ! blas kernel TODO improve this logic
    endif

    ! check again, now kernel and do_useGPU_trans_ev_tridi_to_band sould be
    ! finally consistent
    if (do_useGPU_trans_ev_tridi_to_band) then
      if (kernel .ne. BLAS_KERNEL) then ! for BLAS kernel both GPU and CPU possible TODO maybe should have BLAS_GPU_KERNEL?
        if (kernel .ne. GPU_KERNEL) then
          ! this should never happen, checking as an assert
          write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel!  Aborting..."
          stop
        endif
        if (nblk .ne. 128) then
          ! this should never happen, checking as an assert
          write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize!  Aborting..."
          stop
        endif
      endif
    else
      if (kernel .eq. GPU_KERNEL) then
        ! combination not allowed
        write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
                            &is defined as the GPU kernel!  Aborting..."
        stop
        !TODO do error handling properly
      endif
    endif


#if REALCASE == 1
#ifdef SINGLE_PRECISION_REAL
    ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
    ! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
#endif

#endif


#if REALCASE == 1
    call obj%get("qr",qr,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    if (qr .eq. 1) then
      useQR = .true.
    else
      useQR = .false.
    endif

#endif

    call obj%get("debug",debug,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    wantDebug = debug == 1



#if REALCASE == 1
    ! set usage of qr decomposition via API call
    useQRActual = useQR

    if (useQRActual) then
      ! the blocked QR decomposition requires an even matrix size
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
#endif /* REALCASE */


    ! eigenvectors go to q if the caller supplied it, otherwise to scratch q_dummy
    if (.not. obj%eigenvalues_only) then
      q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
    else
     allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
     q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
    endif

    ! set the default values for each of the 5 compute steps
    do_bandred        = .true.
    do_tridiag        = .true.
    do_solve_tridi    = .true.
    do_trans_to_band  = .true.
    do_trans_to_full  = .true.

    if (obj%eigenvalues_only) then
      do_trans_to_band  = .false.
      do_trans_to_full  = .false.
    endif

    if (obj%is_set("bandwidth") == 1) then
      ! bandwidth is set. That means, that the inputed matrix is actually banded and thus the 
      ! first step of ELPA2 should be skipped
      call obj%get("bandwidth",nbw,error)
      if (nbw == 0) then
        if (wantDebug) then
          write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
                              "for a diagonal matrix! This is too simple"
        endif
        print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
                 "for a diagonal matrix! This is too simple"
        success = .false.
        return
      endif
      if (mod(nbw, nblk) .ne. 0) then
        ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth
        ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA
        nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double))

        ! just check that effective bandwidth is NOT larger than matrix size
        if (nbw .gt. na) then
          if (wantDebug) then
            write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
          endif
          print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
          success = .false.
          return
        endif
      endif
      do_bandred       = .false. ! we already have a banded matrix
      do_solve_tridi   = .true.  ! we also have to solve something :-)
      do_trans_to_band = .true.  ! and still we have to backsub to banded
      do_trans_to_full = .false. ! but not to full since we have a banded matrix
    else ! matrix is not banded, determine the intermediate bandwidth for full->banded->tridi
      !first check if the intermediate bandwidth was set by the user
      call obj%get("intermediate_bandwidth", nbw, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif

      if(nbw == 0) then
        ! intermediate bandwidth was not specified, select one of the defaults

        ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
        ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
        ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
        ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
        ! on this and maybe allow a run-time optimization here
        if (do_useGPU) then
          nbw = nblk
        else
#if REALCASE == 1
          nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
          nbw = (31/nblk+1)*nblk
#endif
        endif

      else
        ! intermediate bandwidth has been specified by the user, check, whether correctly
        if (mod(nbw, nblk) .ne. 0) then
          print *, "Specified bandwidth ",nbw," has to be mutiple of the blocksize ", nblk, ". Aborting..."
          success = .false.
          return
        endif
      endif !nbw == 0

      num_blocks = (na-1)/nbw + 1

      ! tmat is needed only in full->band and band->full steps, so alocate here
      ! (not allocated for banded matrix on input)
      allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_evp_&
        &MATH_DATATYPE&
        &_2stage_&
        &PRECISION&
        &" // ": error when allocating tmat "//errorMessage
        stop 1
      endif

      ! if either of full->band or band->full steps are to be done on GPU,
      ! allocate also corresponding array on GPU.
      if (do_useGPU_bandred .or.  do_useGPU_trans_ev_band_to_full) then
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
          stop 1
        endif
      endif

      do_bandred       = .true.
      do_solve_tridi   = .true.
      do_trans_to_band = .true.
      do_trans_to_full = .true.
    endif  ! matrix not already banded on input

    ! start the computations in 5 steps

    if (do_bandred) then
      call obj%timer%start("bandred")
#ifdef HAVE_LIKWID
      call likwid_markerStartRegion("bandred")
#endif
      ! Reduction full -> band
      call bandred_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      (obj, na, a, &
      a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
      tmat_dev,  wantDebug, do_useGPU_bandred, success, &
#if REALCASE == 1
      useQRActual, &
#endif
       nrThreads)
#ifdef HAVE_LIKWID
      call likwid_markerStopRegion("bandred")
#endif
      call obj%timer%stop("bandred")
      if (.not.(success)) return
    endif


     ! Reduction band -> tridiagonal
     if (do_tridiag) then
       allocate(e(na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when allocating e "//errorMessage
         stop 1
       endif

       call obj%timer%start("tridiag")
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("tridiag")
#endif
       call tridiag_band_&
       &MATH_DATATYPE&
       &_&
       &PRECISION&
       (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
       do_useGPU_tridiag_band, wantDebug, nrThreads)

#ifdef WITH_MPI
       ! rank 0 of mpi_comm_all owns the tridiagonal matrix; distribute it
       call obj%timer%start("mpi_communication")
       call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("tridiag")
#endif
       call obj%timer%stop("tridiag")
     endif ! do_tridiag

#if COMPLEXCASE == 1
     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

     allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when allocating q_real"//errorMessage
       stop 1
     endif
#endif

     ! Solve tridiagonal system
     if (do_solve_tridi) then
       call obj%timer%start("solve")
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("solve")
#endif
       call solve_tridi_&
       &PRECISION &
       (obj, na, nev, ev, e, &
#if REALCASE == 1
       q_actual, ldq,   &
#endif
#if COMPLEXCASE == 1
       q_real, ubound(q_real,dim=1), &
#endif
       nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("solve")
#endif
       call obj%timer%stop("solve")
       if (.not.(success)) return
     endif ! do_solve_tridi

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating e "//errorMessage
       stop 1
     endif

     if (obj%eigenvalues_only) then
       do_trans_to_band = .false.
       do_trans_to_full = .false.
     else

       call obj%get("check_pd",check_pd,error)
       if (error .ne. ELPA_OK) then
         print *,"Problem getting option. Aborting..."
         stop
       endif
       if (check_pd .eq. 1) then
         ! count eigenvalues above THRESHOLD; back-transform eigenvectors only
         ! if the matrix is not positive definite
         check_pd = 0
         do i = 1, na
           if (ev(i) .gt. THRESHOLD) then
             check_pd = check_pd + 1
           endif
         enddo
         if (check_pd .lt. na) then
           ! not positiv definite => eigenvectors needed
           do_trans_to_band = .true.
           do_trans_to_full = .true.
         else
           do_trans_to_band = .false.
           do_trans_to_full = .false.
         endif
       endif
     endif ! eigenvalues only

     if (do_trans_to_band) then
#if COMPLEXCASE == 1
       ! q must be given thats why from here on we can use q and not q_actual

       q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

       deallocate(q_real, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage: error when deallocating q_real"//errorMessage
         stop 1
       endif
#endif

       ! Backtransform stage 1
       call obj%timer%start("trans_ev_to_band")
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("trans_ev_to_band")
#endif
       call trans_ev_tridi_to_band_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, q, &
       q_dev, &
       ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
       nrThreads, success=success, kernel=kernel)
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("trans_ev_to_band")
#endif
       call obj%timer%stop("trans_ev_to_band")

       if (.not.(success)) return

       ! We can now deallocate the stored householder vectors
       deallocate(hh_trans, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *, "solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating hh_trans "//errorMessage
         stop 1
       endif
     endif ! do_trans_to_band

     ! the array q might reside on device or host, depending on whether GPU is
     ! used or not. We thus have to transfer he data manually, if one of the
     ! routines is run on GPU and the other not.

     ! first deal with the situation that first backward step was on GPU
     if(do_useGPU_trans_ev_tridi_to_band) then
       ! if the second backward step is to be performed, but not on GPU, we have
       ! to transfer q to the host
       if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
         successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
         if (.not.(successCUDA)) then
           print *,"elpa2_template, error in copy to host"
           stop 1
         endif
       endif

       ! if the last step is not required at all, or will be performed on CPU,
       ! release the memmory allocated on the device
       if((.not. do_trans_to_full) .or. (.not. do_useGPU_trans_ev_band_to_full)) then
         successCUDA = cuda_free(q_dev)
       endif
     endif

     !TODO check that the memory is properly dealocated on the host in case that
     !the last step is not required

     if (do_trans_to_full) then
       call obj%timer%start("trans_ev_to_full")
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("trans_ev_to_full")
#endif
       if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
         ! copy to device if we want to continue on GPU
         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)

         successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
         if (.not.(successCUDA)) then
           print *,"elpa2_template, error in copy to device"
           stop 1
         endif
       endif

       ! Backtransform stage 2

       call trans_ev_band_to_full_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, a, &
       a_dev, lda, tmat, tmat_dev,  q,  &
       q_dev, &
       ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
       , useQRActual  &
#endif
       )


       deallocate(tmat, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating tmat"//errorMessage
         stop 1
       endif
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("trans_ev_to_full")
#endif
       call obj%timer%stop("trans_ev_to_full")
     endif ! do_trans_to_full

     if(do_bandred .or. do_trans_to_full) then
       if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
         successCUDA = cuda_free(tmat_dev)
         if (.not.(successCUDA)) then
           print *,"elpa2_template: error in cudaFree, tmat_dev"
           stop 1
         endif
       endif
     endif

     if (obj%eigenvalues_only) then
       deallocate(q_dummy, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_1stage_&
         &PRECISION&
         &" // ": error when deallocating q_dummy "//errorMessage
         stop 1
       endif
     endif

     ! restore original OpenMP settings
#ifdef WITH_OPENMP
    ! store the number of OpenMP threads used in the calling function
    ! restore this at the end of ELPA 2
    call omp_set_num_threads(omp_threads_caller)
#endif

     call obj%timer%stop("elpa_solve_evp_&
     &MATH_DATATYPE&
     &_2stage_&
     &PRECISION&
     &")
1    format(a,f10.3)

   end function elpa_solve_evp_&
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION&
   &_impl
850 851

! vim: syntax=fortran