elpa2_template.F90 32.7 KB
Newer Older
1
!    This file is part of ELPA.
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
Andreas Marek's avatar
Andreas Marek committed
52 53 54

#include "elpa/elpa_simd_constants.h"

55 56 57 58 59
 function elpa_solve_evp_&
  &MATH_DATATYPE&
  &_&
  &2stage_&
  &PRECISION&
60
  &_impl (obj, a, ev, q) result(success)
61

62
   use elpa_abstract_impl
63
   use elpa_utilities
64 65 66 67 68
   use elpa1_compute
   use elpa2_compute
   use elpa_mpi
   use cuda_functions
   use mod_check_for_gpu
69
   use elpa_omp
Andreas Marek's avatar
Andreas Marek committed
70 71 72
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
   use simd_kernel
#endif
73 74
   use iso_c_binding
   implicit none
75
#include "../general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
76 77
   class(elpa_abstract_impl_t), intent(inout)                         :: obj
   logical                                                            :: useGPU
78
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
79 80
   logical                                                            :: useQR
   logical                                                            :: useQRActual
Andreas Marek's avatar
Andreas Marek committed
81
#endif
Andreas Marek's avatar
Andreas Marek committed
82
   integer(kind=c_int)                                                :: kernel, kernelByUser
83 84

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
85
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,*)
Andreas Marek's avatar
Andreas Marek committed
86
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
87
#else
Andreas Marek's avatar
Andreas Marek committed
88
   MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout)                 :: a(obj%local_nrows,obj%local_ncols)
Andreas Marek's avatar
Andreas Marek committed
89
   MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
90
#endif
Andreas Marek's avatar
Andreas Marek committed
91 92
   real(kind=C_DATATYPE_KIND), intent(inout)                          :: ev(obj%na)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: hh_trans(:,:)
93

Andreas Marek's avatar
Andreas Marek committed
94
   integer(kind=c_int)                                                :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
Andreas Marek's avatar
Andreas Marek committed
95 96 97 98
   integer(kind=c_int)                                                :: nbw, num_blocks
#if COMPLEXCASE == 1
   integer(kind=c_int)                                                :: l_cols_nev, l_rows, l_cols
#endif
Andreas Marek's avatar
Andreas Marek committed
99 100
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable                   :: tmat(:,:,:)
   real(kind=C_DATATYPE_KIND), allocatable                            :: e(:)
101
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
102
   real(kind=C_DATATYPE_KIND), allocatable                            :: q_real(:,:)
103
#endif
Andreas Marek's avatar
Andreas Marek committed
104 105 106 107 108 109 110 111 112 113 114
   MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target           :: q_dummy(:,:)
   MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer                       :: q_actual(:,:)


   integer(kind=c_intptr_t)                                           :: tmat_dev, q_dev, a_dev

   integer(kind=c_int)                                                :: i
   logical                                                            :: success, successCUDA
   logical                                                            :: wantDebug
   integer(kind=c_int)                                                :: istat, gpu, debug, qr
   character(200)                                                     :: errorMessage
115
   logical                                                            :: do_useGPU, do_useGPU_bandred, &
116
                                                                         do_useGPU_tridiag_band, do_useGPU_solve_tridi, &
117 118
                                                                         do_useGPU_trans_ev_tridi_to_band, &
                                                                         do_useGPU_trans_ev_band_to_full
Andreas Marek's avatar
Andreas Marek committed
119 120 121 122 123 124 125
   integer(kind=c_int)                                                :: numberOfGPUDevices
   integer(kind=c_intptr_t), parameter                                :: size_of_datatype = size_of_&
                                                                                            &PRECISION&
                                                                                            &_&
                                                                                            &MATH_DATATYPE
    integer(kind=ik)                                                  :: na, nev, lda, ldq, nblk, matrixCols, &
                                                                         mpi_comm_rows, mpi_comm_cols,        &
Andreas Marek's avatar
Andreas Marek committed
126
                                                                         mpi_comm_all, check_pd, error
Andreas Marek's avatar
Andreas Marek committed
127 128 129

    logical                                                           :: do_bandred, do_tridiag, do_solve_tridi,  &
                                                                         do_trans_to_band, do_trans_to_full
130

Andreas Marek's avatar
Andreas Marek committed
131
    integer(kind=ik)                                                  :: nrThreads
Andreas Marek's avatar
Andreas Marek committed
132 133 134 135 136 137
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
    integer(kind=c_int)                                               :: simdSetAvailable(NUMBER_OF_INSTR)
#endif



138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
#if REALCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define KERNEL_STRING "real_kernel"
#endif
#if COMPLEXCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define KERNEL_STRING "complex_kernel"
#endif

155
    call obj%timer%start("elpa_solve_evp_&
156
    &MATH_DATATYPE&
157 158 159
    &_2stage_&
    &PRECISION&
    &")
160

Andreas Marek's avatar
Andreas Marek committed
161 162

#ifdef WITH_OPENMP
163 164 165 166 167
    ! store the number of OpenMP threads used in the calling function
    ! restore this at the end of ELPA 2
    omp_threads_caller = omp_get_max_threads()

    ! check the number of threads that ELPA should use internally
Andreas Marek's avatar
Andreas Marek committed
168 169
    call obj%get("omp_threads",nrThreads,error)
    call omp_set_num_threads(nrThreads)
Andreas Marek's avatar
Andreas Marek committed
170 171 172 173
#else
    nrThreads = 1
#endif

Andreas Marek's avatar
Andreas Marek committed
174 175
    success = .true.

Andreas Marek's avatar
Andreas Marek committed
176
    if (present(q)) then
177
      obj%eigenvalues_only = .false.
Andreas Marek's avatar
Andreas Marek committed
178
    else
179
      obj%eigenvalues_only = .true.
Andreas Marek's avatar
Andreas Marek committed
180 181
    endif

182 183 184 185 186 187 188
    na         = obj%na
    nev        = obj%nev
    lda        = obj%local_nrows
    ldq        = obj%local_nrows
    nblk       = obj%nblk
    matrixCols = obj%local_ncols

189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
    call obj%get("mpi_comm_rows",mpi_comm_rows,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_cols",mpi_comm_cols,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
    call obj%get("mpi_comm_parent",mpi_comm_all,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif

    call obj%timer%start("mpi_communication")
    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
    call obj%timer%stop("mpi_communication")

Andreas Marek's avatar
Andreas Marek committed
215 216 217 218 219 220 221
   ! special case na = 1
   if (na .eq. 1) then
#if REALCASE == 1
     ev(1) = a(1,1)
#endif
#if COMPLEXCASE == 1
     ev(1) = real(a(1,1))
222
#endif
Andreas Marek's avatar
Andreas Marek committed
223
     if (.not.(obj%eigenvalues_only)) then
224
       q(1,1) = ONE
Andreas Marek's avatar
Andreas Marek committed
225
     endif
226 227 228 229 230 231 232 233

     ! restore original OpenMP settings
#ifdef WITH_OPENMP
     ! store the number of OpenMP threads used in the calling function
     ! restore this at the end of ELPA 2
     call omp_set_num_threads(omp_threads_caller)
#endif

Andreas Marek's avatar
Andreas Marek committed
234 235 236 237 238 239 240 241
     call obj%timer%stop("elpa_solve_evp_&
     &MATH_DATATYPE&
     &_2stage_&
     &PRECISION&
     &")
     return
   endif

242 243 244 245 246
   if (nev == 0) then
     nev = 1
     obj%eigenvalues_only = .true.
   endif

247
    call obj%get(KERNEL_STRING,kernel,error)
Andreas Marek's avatar
Andreas Marek committed
248 249 250 251
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
252 253

    ! GPU settings
Andreas Marek's avatar
Andreas Marek committed
254 255 256 257 258
    call obj%get("gpu", gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
259

260
    useGPU = (gpu == 1)
261

262
    do_useGPU = .false.
263
    if (useGPU) then
264
      call obj%timer%start("check_for_gpu")
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then

         do_useGPU = .true.

         ! set the neccessary parameters
         cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
         cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
         cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
         cudaHostRegisterPortable = cuda_hostRegisterPortable()
         cudaHostRegisterMapped   = cuda_hostRegisterMapped()
      else
        print *,"GPUs are requested but not detected! Aborting..."
        success = .false.
        return
      endif
280
      call obj%timer%stop("check_for_gpu")
281 282
    endif

283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
    do_useGPU_bandred = do_useGPU
    do_useGPU_tridiag_band = do_useGPU
    do_useGPU_solve_tridi = do_useGPU
    do_useGPU_trans_ev_tridi_to_band = do_useGPU
    do_useGPU_trans_ev_band_to_full = do_useGPU

    ! only if we want (and can) use GPU in general, look what are the
    ! requirements for individual routines. Implicitly they are all set to 1, so
    ! unles specified otherwise by the user, GPU versions of all individual
    ! routines should be used
    if(do_useGPU) then
      call obj%get("gpu_bandred", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_bandred = (gpu == 1)

      call obj%get("gpu_tridiag_band", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_tridiag_band = (gpu == 1)

      call obj%get("gpu_solve_tridi", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_solve_tridi = (gpu == 1)

      call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_trans_ev_tridi_to_band = (gpu == 1)

      call obj%get("gpu_trans_ev_band_to_full", gpu, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif
      do_useGPU_trans_ev_band_to_full = (gpu == 1)
    endif

330
    ! check consistency between request for GPUs and defined kernel
331 332 333

    if (do_useGPU_trans_ev_tridi_to_band) then
      if (kernel .ne. GPU_KERNEL) then
334
        write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
335
        write(error_unit,*) "The compute kernel will be executed on CPUs!"
336
        do_useGPU_trans_ev_tridi_to_band = .false.
337
      else if (nblk .ne. 128) then
338 339 340 341
        write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
        write(error_unit,*) "The compute kernel will be executed on CPUs!"
        do_useGPU_trans_ev_tridi_to_band = .false.
        kernel = GENERIC_KERNEL
342 343
      endif
    endif
344

345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
    ! check again, now kernel and do_useGPU_trans_ev_tridi_to_band sould be
    ! finally consistent
    if (do_useGPU_trans_ev_tridi_to_band) then
      if (kernel .ne. GPU_KERNEL) then
        ! this should never happen, checking as an assert
        write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel!  Aborting..."
        stop
      endif
      if (nblk .ne. 128) then
        ! this should never happen, checking as an assert
        write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize!  Aborting..."
        stop
      endif
    else
      if (kernel .eq. GPU_KERNEL) then
        ! combination not allowed
        write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
                            &is defined as the GPU kernel!  Aborting..."
        stop
        !TODO do error handling properly
365 366
      endif
    endif
367

368

369
#if REALCASE == 1
370 371 372 373 374 375 376 377 378
#ifdef SINGLE_PRECISION_REAL
    ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for POWER8"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
379 380 381 382 383 384 385 386
    ! special case at the moment NO single precision kernels on SPARC64 -> set GENERIC for now
    if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK4 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK6        ) then
        write(error_unit,*) "ELPA: At the moment there exist no specific SINGLE precision kernels for SPARC64"
        write(error_unit,*) "The GENERIC kernel will be used at the moment"
        kernel = ELPA_2STAGE_REAL_GENERIC
    endif
387 388
#endif

389 390
#endif

Andreas Marek's avatar
Andreas Marek committed
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472
     ! consistency check: is user set kernel still identical with "kernel" or did
     ! we change it above? This is a mess and should be cleaned up
     call obj%get(KERNEL_STRING,kernelByUser,error)
     if (error .ne. ELPA_OK) then
       print *,"Problem getting option. Aborting..."
       stop
     endif

     if (kernelByUser .ne. kernel) then
       call obj%set(KERNEL_STRING, kernel, error)
       if (error .ne. ELPA_OK) then
         print *,"Problem setting option. Aborting..."
         stop
       endif
     endif

#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
     ! find a kernel which is supported on all used CPUs
     ! at the moment this works only on Intel CPUs
     simdSetAvailable(:) = 0
     call get_cpuid_set(simdSetAvailable, NUMBER_OF_INSTR)
#ifdef WITH_MPI
     call MPI_ALLREDUCE(mpi_in_place, simdSetAvailable, NUMBER_OF_INSTR, MPI_INTEGER, MPI_BAND, mpi_comm_all, mpierr)
#endif

     ! compare user chosen kernel with possible kernels
     call obj%get(KERNEL_STRING,kernelByUser,error)
     if (error .ne. ELPA_OK) then
       print *,"Problem getting option. Aborting..."
       stop
     endif

     ! map kernel to SIMD Set, and check whether this is set is available on all cores

#if REALCASE == 1
    if (simdSetAvailable(map_real_kernel_to_simd_instruction(kernelByUser)) /= 1) then
#endif
#if COMPLEXCASE == 1
    if (simdSetAvailable(map_complex_kernel_to_simd_instruction(kernelByUser)) /=1) then
#endif

      ! if we are not purely running on Intel CPUs, this feature does not work at the moment
      ! this restriction should be lifted step by step
      if (simdSetAvailable(CPU_MANUFACTURER) /= 1) then
         if (my_pe == 0 ) then
         write(error_unit,*) "You enabled the experimental feature of an heterogenous cluster support."
         write(error_unit,*) "However, this works at the moment only if ELPA is run on (different) Intel CPUs!"
         write(error_unit,*) "ELPA detected also non Intel-CPUs, and will this abort now"
         stop
        endif
      else
        if (my_pe == 0 ) then
          write(error_unit,*) "The ELPA 2stage kernel of your choice, cannot be run on all CPUs"
          write(error_unit,*) "ELPA will use another kernel..."
        endif

        ! find best kernel available for supported instruction sets
        do i = NUMBER_OF_INSTR, 2, -1
          if (simdSetAvailable(i) == 1) then
            ! map to "best" kernel with this instruction set
            ! this can be only done for kernels that ELPA has been configured to use
#if REALCASE == 1
            kernel = map_simd_instruction_to_real_kernel(i)
#endif
#if COMPLEXCASE == 1
            kernel = map_simd_instruction_to_complex_kernel(i)
#endif
            if (obj%can_set(KERNEL_STRING, kernel) == ELPA_OK) then
              call obj%set(KERNEL_STRING, kernel, error)
              if (error .ne. ELPA_OK) then
                print *,"Problem setting option. Aborting..."
                stop
              endif
              if (my_pe == 0 ) write(error_unit,*) "ELPA decided to use ",elpa_int_value_to_string(KERNEL_STRING, kernel)
              exit 
            endif
          endif
        enddo
      endif

    endif
#endif /* HAVE_HETEROGENOUS_CLUSTER_SUPPORT */
473 474

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
475 476 477 478 479
    call obj%get("qr",qr,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
480
    if (qr .eq. 1) then
481 482 483 484 485 486
      useQR = .true.
    else
      useQR = .false.
    endif

#endif
487

Andreas Marek's avatar
Andreas Marek committed
488 489 490 491 492
    call obj%get("debug",debug,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option. Aborting..."
      stop
    endif
493
    wantDebug = debug == 1
494 495 496 497 498 499



#if REALCASE == 1
    useQRActual = .false.
    ! set usage of qr decomposition via API call
500 501
    if (useQR) useQRActual = .true.
    if (.not.(useQR)) useQRACtual = .false.
502 503 504 505 506 507 508 509 510 511 512 513 514 515

    if (useQRActual) then
      if (mod(na,2) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
#endif /* REALCASE */


Andreas Marek's avatar
Andreas Marek committed
516

517
    if (.not. obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
518 519 520 521 522 523
      q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
    else
     allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
     q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
    endif

524 525 526 527 528 529 530 531 532 533 534 535 536

    ! set the default values for each of the 5 compute steps
    do_bandred        = .true.
    do_tridiag        = .true.
    do_solve_tridi    = .true.
    do_trans_to_band  = .true.
    do_trans_to_full  = .true.

    if (obj%eigenvalues_only) then
      do_trans_to_band  = .false.
      do_trans_to_full  = .false.
    endif

Andreas Marek's avatar
Andreas Marek committed
537
    if (obj%is_set("bandwidth") == 1) then
538 539
      ! bandwidth is set. That means, that the inputed matrix is actually banded and thus the 
      ! first step of ELPA2 should be skipped
Andreas Marek's avatar
Andreas Marek committed
540
      call obj%get("bandwidth",nbw,error)
541
      if (nbw == 0) then
542
        if (wantDebug) then
543
          write(error_unit,*) "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
Andreas Marek's avatar
Andreas Marek committed
544 545
                              "for a diagonal matrix! This is too simple"
          endif
546
        print *, "Specified bandwidth = 0; ELPA refuses to solve the eigenvalue problem ", &
Andreas Marek's avatar
Andreas Marek committed
547
                 "for a diagonal matrix! This is too simple"
548 549 550
        success = .false.
        return
      endif
551 552
      if (mod(nbw, nblk) .ne. 0) then
        ! treat matrix with an effective bandwidth slightly bigger than specified bandwidth
Andreas Marek's avatar
Andreas Marek committed
553
        ! such that effective bandwidth is a multiply of nblk. which is a prerequiste for ELPA
554 555 556
        nbw = nblk * ceiling(real(nbw,kind=c_double)/real(nblk,kind=c_double))

        ! just check that effective bandwidth is NOT larger than matrix size
Andreas Marek's avatar
Andreas Marek committed
557
        if (nbw .gt. na) then
558 559
          if (wantDebug) then
            write(error_unit,*) "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
Andreas Marek's avatar
Andreas Marek committed
560 561 562
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
          endif
563
          print *, "Specified bandwidth ",nbw," leads internaly to a computed bandwidth ", &
Andreas Marek's avatar
Andreas Marek committed
564 565
                                "which is larger than the matrix size ",na," ! ELPA will abort! Try to", &
                                "solve your problem by not specifing a bandwidth"
566 567
          success = .false.
          return
Andreas Marek's avatar
Andreas Marek committed
568
        endif
569
      endif
570 571 572 573
      do_bandred       = .false. ! we already have a banded matrix
      do_solve_tridi   = .true.  ! we also have to solve something :-)
      do_trans_to_band = .true.  ! and still we have to backsub to banded
      do_trans_to_full = .false. ! but not to full since we have a banded matrix
574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
    else ! matrix is not banded, determine the intermediate bandwidth for full->banded->tridi
      !first check if the intermediate bandwidth was set by the user
      call obj%get("intermediate_bandwidth", nbw, error)
      if (error .ne. ELPA_OK) then
        print *,"Problem getting option. Aborting..."
        stop
      endif

      if(nbw == 0) then
        ! intermediate bandwidth was not specified, select one of the defaults

        ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
        ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
        ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
        ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
        ! on this and maybe allow a run-time optimization here
        if (do_useGPU) then
          nbw = nblk
        else
593
#if REALCASE == 1
594
          nbw = (63/nblk+1)*nblk
595
#elif COMPLEXCASE == 1
596
          nbw = (31/nblk+1)*nblk
597
#endif
598 599 600 601 602 603 604 605 606 607
        endif

      else
        ! intermediate bandwidth has been specified by the user, check, whether correctly
        if (mod(nbw, nblk) .ne. 0) then
          print *, "Specified bandwidth ",nbw," has to be mutiple of the blocksize ", nblk, ". Aborting..."
          success = .false.
          return
        endif
      endif !nbw == 0
608 609 610

      num_blocks = (na-1)/nbw + 1

Pavel Kus's avatar
Pavel Kus committed
611 612
      ! tmat is needed only in full->band and band->full steps, so alocate here
      ! (not allocated for banded matrix on input)
613 614 615
      allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"solve_evp_&
616 617 618 619
        &MATH_DATATYPE&
        &_2stage_&
        &PRECISION&
        &" // ": error when allocating tmat "//errorMessage
620 621 622
        stop 1
      endif

Pavel Kus's avatar
Pavel Kus committed
623 624 625 626 627 628 629 630 631 632 633 634
      ! if either of full->band or band->full steps are to be done on GPU,
      ! allocate also corresponding array on GPU.
      if (do_useGPU_bandred .or.  do_useGPU_trans_ev_band_to_full) then
        successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
        if (.not.(successCUDA)) then
          print *,"bandred_&
                  &MATH_DATATYPE&
                  &: error in cudaMalloc tmat_dev 1"
          stop 1
        endif
      endif

635 636 637 638
      do_bandred       = .true.
      do_solve_tridi   = .true.
      do_trans_to_band = .true.
      do_trans_to_full = .true.
639
    endif  ! matrix not already banded on input
640 641 642 643 644

    ! start the computations in 5 steps

    if (do_bandred) then
      call obj%timer%start("bandred")
Pavel Kus's avatar
Pavel Kus committed
645 646 647
#ifdef HAVE_LIKWID
      call likwid_markerStartRegion("bandred")
#endif
648 649 650 651 652
      ! Reduction full -> band
      call bandred_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
653
      (obj, na, a, &
654
      a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
655
      tmat_dev,  wantDebug, do_useGPU_bandred, success, &
656
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
657
      useQRActual, &
658
#endif
Andreas Marek's avatar
Andreas Marek committed
659
       nrThreads)
Pavel Kus's avatar
Pavel Kus committed
660 661 662
#ifdef HAVE_LIKWID
      call likwid_markerStopRegion("bandred")
#endif
663
      call obj%timer%stop("bandred")
664
      if (.not.(success)) return
665 666
    endif

667 668

     ! Reduction band -> tridiagonal
669 670 671 672 673 674 675 676 677
     if (do_tridiag) then
       allocate(e(na), stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when allocating e "//errorMessage
         stop 1
       endif
678

679
       call obj%timer%start("tridiag")
Pavel Kus's avatar
Pavel Kus committed
680 681 682
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("tridiag")
#endif
683
       call tridiag_band_&
684
       &MATH_DATATYPE&
685 686
       &_&
       &PRECISION&
687
       (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
688
       do_useGPU_tridiag_band, wantDebug, nrThreads)
689 690

#ifdef WITH_MPI
691 692 693 694
       call obj%timer%start("mpi_communication")
       call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
       call obj%timer%stop("mpi_communication")
695
#endif /* WITH_MPI */
Pavel Kus's avatar
Pavel Kus committed
696 697 698
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("tridiag")
#endif
699 700
       call obj%timer%stop("tridiag")
     endif ! do_tridiag
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716

#if COMPLEXCASE == 1
     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
     l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

     allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when allocating q_real"//errorMessage
       stop 1
     endif
#endif

     ! Solve tridiagonal system
717 718
     if (do_solve_tridi) then
       call obj%timer%start("solve")
Pavel Kus's avatar
Pavel Kus committed
719 720 721
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("solve")
#endif
722 723 724
       call solve_tridi_&
       &PRECISION &
       (obj, na, nev, ev, e, &
725
#if REALCASE == 1
726
       q_actual, ldq,   &
727 728
#endif
#if COMPLEXCASE == 1
729
       q_real, ubound(q_real,dim=1), &
730
#endif
731
       nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
Pavel Kus's avatar
Pavel Kus committed
732 733 734
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("solve")
#endif
735 736 737
       call obj%timer%stop("solve")
       if (.not.(success)) return
     endif ! do_solve_tridi
738 739 740 741 742 743 744 745 746

     deallocate(e, stat=istat, errmsg=errorMessage)
     if (istat .ne. 0) then
       print *,"solve_evp_&
       &MATH_DATATYPE&
       &_2stage: error when deallocating e "//errorMessage
       stop 1
     endif

747
     if (obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
748 749 750 751
       do_trans_to_band = .false.
       do_trans_to_full = .false.
     else

Andreas Marek's avatar
Andreas Marek committed
752 753 754 755 756
       call obj%get("check_pd",check_pd,error)
       if (error .ne. ELPA_OK) then
         print *,"Problem getting option. Aborting..."
         stop
       endif
Andreas Marek's avatar
Andreas Marek committed
757 758 759 760 761 762 763 764 765 766 767
       if (check_pd .eq. 1) then
         check_pd = 0
         do i = 1, na
           if (ev(i) .gt. THRESHOLD) then
             check_pd = check_pd + 1
           endif
         enddo
         if (check_pd .lt. na) then
           ! not positiv definite => eigenvectors needed
           do_trans_to_band = .true.
           do_trans_to_full = .true.
Andreas Marek's avatar
Andreas Marek committed
768
         else
Andreas Marek's avatar
Andreas Marek committed
769 770 771 772 773
           do_trans_to_band = .false.
           do_trans_to_full = .false.
         endif
       endif
     endif ! eigenvalues only
774

775
     if (do_trans_to_band) then
776
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
777
       ! q must be given thats why from here on we can use q and not q_actual
778

Andreas Marek's avatar
Andreas Marek committed
779
       q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
780

Andreas Marek's avatar
Andreas Marek committed
781 782 783 784 785 786 787 788
       deallocate(q_real, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage: error when deallocating q_real"//errorMessage
         stop 1
       endif
#endif
Andreas Marek's avatar
Andreas Marek committed
789

Andreas Marek's avatar
Andreas Marek committed
790 791
       ! Backtransform stage 1
       call obj%timer%start("trans_ev_to_band")
Pavel Kus's avatar
Pavel Kus committed
792 793 794
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("trans_ev_to_band")
#endif
Andreas Marek's avatar
Andreas Marek committed
795
       call trans_ev_tridi_to_band_&
796
       &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
797 798 799 800
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, q, &
       q_dev, &
801
       ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
Andreas Marek's avatar
Andreas Marek committed
802
       nrThreads, success=success, kernel=kernel)
Pavel Kus's avatar
Pavel Kus committed
803 804 805
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("trans_ev_to_band")
#endif
Andreas Marek's avatar
Andreas Marek committed
806
       call obj%timer%stop("trans_ev_to_band")
807

Andreas Marek's avatar
Andreas Marek committed
808
       if (.not.(success)) return
809

Andreas Marek's avatar
Andreas Marek committed
810 811 812 813 814 815 816 817
       ! We can now deallocate the stored householder vectors
       deallocate(hh_trans, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *, "solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating hh_trans "//errorMessage
         stop 1
818
       endif
819
     endif ! do_trans_to_band
820

821 822 823 824 825 826 827 828 829
     ! the array q might reside on device or host, depending on whether GPU is
     ! used or not. We thus have to transfer he data manually, if one of the
     ! routines is run on GPU and the other not.

     ! first deal with the situation that first backward step was on GPU
     if(do_useGPU_trans_ev_tridi_to_band) then
       ! if the second backward step is to be performed, but not on GPU, we have
       ! to transfer q to the host
       if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
Andreas Marek's avatar
Andreas Marek committed
830
         successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
Pavel Kus's avatar
Pavel Kus committed
831 832 833 834
         if (.not.(successCUDA)) then
           print *,"elpa2_template, error in copy to host"
           stop 1
         endif
835 836 837 838 839 840 841 842 843 844 845 846
       endif

       ! if the last step is not required at all, or will be performed on CPU,
       ! release the memmory allocated on the device
       if((.not. do_trans_to_full) .or. (.not. do_useGPU_trans_ev_band_to_full)) then
         successCUDA = cuda_free(q_dev)
       endif
     endif

     !TODO check that the memory is properly dealocated on the host in case that
     !the last step is not required

847
     if (do_trans_to_full) then
Andreas Marek's avatar
Andreas Marek committed
848
       call obj%timer%start("trans_ev_to_full")
Pavel Kus's avatar
Pavel Kus committed
849 850 851
#ifdef HAVE_LIKWID
       call likwid_markerStartRegion("trans_ev_to_full")
#endif
852
       if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
853 854
         ! copy to device if we want to continue on GPU
         successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
Andreas Marek's avatar
Andreas Marek committed
855

Andreas Marek's avatar
Andreas Marek committed
856
         successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
Pavel Kus's avatar
Pavel Kus committed
857 858 859 860
         if (.not.(successCUDA)) then
           print *,"elpa2_template, error in copy to device"
           stop 1
         endif
861
       endif
Andreas Marek's avatar
Andreas Marek committed
862

863
       ! Backtransform stage 2
Andreas Marek's avatar
Andreas Marek committed
864

865 866 867 868 869 870 871
       call trans_ev_band_to_full_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (obj, na, nev, nblk, nbw, a, &
       a_dev, lda, tmat, tmat_dev,  q,  &
       q_dev, &
872
       ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
873
#if REALCASE == 1
874
       , useQRActual  &
875
#endif
876
       )
877

878

879 880 881 882 883 884 885 886
       deallocate(tmat, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"solve_evp_&
         &MATH_DATATYPE&
         &_2stage_&
         &PRECISION " // ": error when deallocating tmat"//errorMessage
         stop 1
       endif
Pavel Kus's avatar
Pavel Kus committed
887 888 889
#ifdef HAVE_LIKWID
       call likwid_markerStopRegion("trans_ev_to_full")
#endif
Andreas Marek's avatar
Andreas Marek committed
890
       call obj%timer%stop("trans_ev_to_full")
891
     endif ! do_trans_to_full
Andreas Marek's avatar
Andreas Marek committed
892

Pavel Kus's avatar
Pavel Kus committed
893 894 895 896 897 898 899 900 901 902
     if(do_bandred .or. do_trans_to_full) then
       if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
         successCUDA = cuda_free(tmat_dev)
         if (.not.(successCUDA)) then
           print *,"elpa2_template: error in cudaFree, tmat_dev"
           stop 1
         endif
       endif
     endif

903
     if (obj%eigenvalues_only) then
Andreas Marek's avatar
Andreas Marek committed
904
       deallocate(q_dummy, stat=istat, errmsg=errorMessage)
905 906
       if (istat .ne. 0) then
         print *,"solve_evp_&
Andreas Marek's avatar
Andreas Marek committed
907 908 909 910
         &MATH_DATATYPE&
         &_1stage_&
         &PRECISION&
         &" // ": error when deallocating q_dummy "//errorMessage
911 912 913 914
         stop 1
       endif
     endif

915 916 917 918 919 920 921
     ! restore original OpenMP settings
#ifdef WITH_OPENMP
    ! store the number of OpenMP threads used in the calling function
    ! restore this at the end of ELPA 2
    call omp_set_num_threads(omp_threads_caller)
#endif

922
     call obj%timer%stop("elpa_solve_evp_&
923
     &MATH_DATATYPE&
924 925 926
     &_2stage_&
    &PRECISION&
    &")
927 928 929 930 931 932
1    format(a,f10.3)

   end function elpa_solve_evp_&
   &MATH_DATATYPE&
   &_2stage_&
   &PRECISION&
933
   &_impl
934 935

! vim: syntax=fortran