!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".

#include "config-f90.h"

module ELPA2

! Version 1.1.2, 2011-02-21

  use elpa_utilities
  use elpa1
  use elpa1_compute
  use elpa2_compute
  use elpa2_utilities
  use elpa_pdgeqrf

  use iso_c_binding

#ifdef WITH_GPU_VERSION
!  use cuda_routines
!  use cuda_c_kernel
!  use iso_c_binding
#endif

  implicit none

  PRIVATE ! By default, all routines contained are private

  ! The following routines are public:

  public :: solve_evp_real_2stage
  public :: solve_evp_complex_2stage

!-------------------------------------------------------------------------------
  ! The following array contains the Householder vectors of the
  ! transformation band -> tridiagonal.
  ! It is allocated and set in tridiag_band_real and used in
  ! trans_ev_tridi_to_band_real.
  ! It must be deallocated by the user after trans_ev_tridi_to_band_real!

!  real*8, allocatable     :: hh_trans_real(:,:)
!  complex*16, allocatable :: hh_trans_complex(:,:)
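
  ! Overview of the drivers in this module: both solvers proceed in the same
  ! five steps (the computational routines come from the elpa1_compute and
  ! elpa2_compute modules used above):
  !
  !   1. bandred_real / bandred_complex            full matrix -> band matrix
  !   2. tridiag_band_real / tridiag_band_complex  band -> tridiagonal matrix
  !   3. solve_tridi                               eigenproblem of the tridiagonal matrix
  !   4. trans_ev_tridi_to_band_*                  backtransform eigenvectors to band form
  !   5. trans_ev_band_to_full_*                   backtransform eigenvectors to the full matrix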
!-------------------------------------------------------------------------------

  include 'mpif.h'

!******
contains

  function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                 matrixCols,                               &
                                 mpi_comm_rows, mpi_comm_cols,             &
                                 mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,  &
                                 useQR) result(success)

  !-------------------------------------------------------------------------------
  !  solve_evp_real_2stage: Solves the real eigenvalue problem with a 2 stage approach
  !
  !  Parameters
  !
  !  na          Order of matrix a
  !
  !  nev         Number of eigenvalues needed
  !
  !  a(lda,matrixCols)  Distributed matrix for which eigenvalues are to be computed.
  !              Distribution is like in ScaLAPACK.
  !              The full matrix must be set (not only one half like in ScaLAPACK).
  !              Destroyed on exit (upper and lower half).
  !
  !  lda         Leading dimension of a
  !  matrixCols  Local columns of matrix a and q
  !
  !  ev(na)      On output: eigenvalues of a, every processor gets the complete set
  !
  !  q(ldq,matrixCols)  On output: Eigenvectors of a
  !              Distribution is like in ScaLAPACK.
  !              Must always be dimensioned to the full size (corresponding to (na,na)),
  !              even if only a part of the eigenvalues is needed.
  !
  !  ldq         Leading dimension of q
  !
  !  nblk        Blocksize of the cyclic distribution, must be the same in both directions!
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI communicators for rows/columns
  !  mpi_comm_all
  !              MPI communicator for the total processor set
  !
  !  A commented usage sketch can be found at the end of this file.
  !
  !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use cuda_functions
    use mod_check_for_gpu
    implicit none

    logical, intent(in), optional :: useQR
    logical                       :: useQRActual, useQREnvironment
    integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
    integer                       :: THIS_REAL_ELPA_KERNEL

    integer, intent(in)           :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
                                     mpi_comm_cols, mpi_comm_all
    integer, intent(in)           :: nblk
    real*8, intent(inout)         :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)

    integer                       :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
    integer                       :: nbw, num_blocks
    real*8, allocatable           :: tmat(:,:,:), e(:)
    real*8                        :: ttt0, ttt1, ttts
    integer                       :: i
    logical                       :: success
    logical, save                 :: firstCall = .true.
    logical                       :: wantDebug
    integer                       :: istat
    character(200)                :: errorMessage
    logical                       :: useGPU
    integer                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_real_2stage")
#endif

    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    useQRActual = .false.
    useGPU      = .false.

    ! set usage of QR decomposition via API call
    if (present(useQR)) then
      if (useQR) useQRActual = .true.
      if (.not.(useQR)) useQRActual = .false.
    endif

    ! overwrite this with environment variable settings
    if (qr_decomposition_via_environment_variable(useQREnvironment)) then
      useQRActual = useQREnvironment
    endif

    if (useQRActual) then
      if (mod(na,nblk) .ne. 0) then
        if (wantDebug) then
          write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
        endif
        print *, "Do not use QR-decomposition for this matrix and blocksize."
        success = .false.
        return
      endif
    endif
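
    ! Kernel selection: an explicit THIS_REAL_ELPA_KERNEL_API argument takes
    ! precedence; otherwise get_actual_real_kernel() decides (e.g. from an
    ! environment variable). A kernel that is not allowed in this build falls
    ! back to REAL_ELPA_KERNEL_GENERIC below.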
    if (present(THIS_REAL_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
    else
      ! if the kernel is not chosen via the API,
      ! check whether it is set by an environment variable
      THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
    endif

    ! check whether the chosen kernel is allowed
    if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
      endif
      THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC

    endif

    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices)) then
        useGPU = .true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
        stop
      endif

      ! set the necessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

    if (useGPU) then
      nbw = nblk
    else
      nbw = (31/nblk+1)*nblk
    endif

    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
    call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                      tmat, wantDebug, useGPU, success, useQRActual)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_real               :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
      stop
    endif

    ttt0 = MPI_Wtime()
    call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
                           mpi_comm_cols, mpi_comm_all)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_real          :',ttt1-ttt0

    call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
    call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

    ! Solve tridiagonal system

    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
                     mpi_comm_cols, wantDebug, success)
    if (.not.(success)) return

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0
    ttts = ttt1

    deallocate(e, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
      stop
    endif
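
    ! The eigenvectors in q are currently those of the tridiagonal matrix.
    ! They are transformed back in two stages: first to the band form, using
    ! the Householder vectors stored in hh_trans_real by tridiag_band_real,
    ! then to the original full matrix, using tmat from bandred_real.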
    ! Backtransform stage 1

    ttt0 = MPI_Wtime()
    call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
                                     mpi_comm_cols, wantDebug, useGPU, success, THIS_REAL_ELPA_KERNEL)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0

    ! We can now deallocate the stored Householder vectors
    deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
      stop
    endif

    ! Backtransform stage 2

    print *,"useGPU== ",useGPU
    ttt0 = MPI_Wtime()
    call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                    mpi_comm_cols, useGPU, useQRActual)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
    time_evp_back = ttt1-ttts

    deallocate(tmat, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
      stop
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_real_2stage")
#endif
1   format(a,f10.3)

  end function solve_evp_real_2stage

!-------------------------------------------------------------------------------
!-------------------------------------------------------------------------------

  function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
                                    matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                    mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)

  !-------------------------------------------------------------------------------
  !  solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2 stage approach
  !
  !  Parameters
  !
  !  na          Order of matrix a
  !
  !  nev         Number of eigenvalues needed
  !
  !  a(lda,matrixCols)  Distributed matrix for which eigenvalues are to be computed.
  !              Distribution is like in ScaLAPACK.
  !              The full matrix must be set (not only one half like in ScaLAPACK).
  !              Destroyed on exit (upper and lower half).
  !
  !  lda         Leading dimension of a
  !  matrixCols  Local columns of matrix a and q
  !
  !  ev(na)      On output: eigenvalues of a, every processor gets the complete set
  !
  !  q(ldq,matrixCols)  On output: Eigenvectors of a
  !              Distribution is like in ScaLAPACK.
  !              Must always be dimensioned to the full size (corresponding to (na,na)),
  !              even if only a part of the eigenvalues is needed.
  !
  !  ldq         Leading dimension of q
  !
  !  nblk        Blocksize of the cyclic distribution, must be the same in both directions!
  !
  !  mpi_comm_rows
  !  mpi_comm_cols
  !              MPI communicators for rows/columns
  !  mpi_comm_all
  !              MPI communicator for the total processor set
  !
  !-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use cuda_functions
    use mod_check_for_gpu
    implicit none

    integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
    integer                       :: THIS_COMPLEX_ELPA_KERNEL
    integer, intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
    complex*16, intent(inout)     :: a(lda,matrixCols), q(ldq,matrixCols)
    real*8, intent(inout)         :: ev(na)

    integer                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
    integer                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
    complex*16, allocatable       :: tmat(:,:,:)
    real*8, allocatable           :: q_real(:,:), e(:)
    real*8                        :: ttt0, ttt1, ttts
    integer                       :: i

    logical                       :: success, wantDebug
    logical, save                 :: firstCall = .true.
    integer                       :: istat
    character(200)                :: errorMessage
    logical                       :: useGPU
    integer                       :: numberOfGPUDevices

#ifdef HAVE_DETAILED_TIMINGS
    call timer%start("solve_evp_complex_2stage")
#endif

    call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
    call mpi_comm_size(mpi_comm_all,n_pes,mpierr)

    call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
    call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
    call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
    call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)

    useGPU    = .false.
    wantDebug = .false.
    if (firstCall) then
      ! are debug messages desired?
      wantDebug = debug_messages_via_environment_variable()
      firstCall = .false.
    endif

    success = .true.

    if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
      ! user defined kernel via the optional argument in the API call
      THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
    else
      ! if the kernel is not chosen via the API,
      ! check whether it is set by an environment variable
      THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
    endif

    ! check whether the chosen kernel is allowed
    if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then

      if (my_pe == 0) then
        write(error_unit,*) " "
        write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
        write(error_unit,*) "is not in the list of the allowed kernels!"
        write(error_unit,*) " "
        write(error_unit,*) "Allowed kernels are:"
        do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
          if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
            write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
          endif
        enddo

        write(error_unit,*) " "
        write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
      endif
      THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
    endif

    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
      if (check_for_gpu(my_pe, numberOfGPUDevices)) then
        useGPU = .true.
      endif
      if (nblk .ne. 128) then
        print *,"At the moment GPU version needs blocksize 128"
        stop
      endif

      ! set the necessary parameters
      cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
      cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
      cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
      cudaHostRegisterPortable = cuda_hostRegisterPortable()
      cudaHostRegisterMapped   = cuda_hostRegisterMapped()
    endif

    ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32

    nbw = (31/nblk+1)*nblk
    num_blocks = (na-1)/nbw + 1

    allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
      stop
    endif

    ! Reduction full -> band

    ttt0 = MPI_Wtime()
    ttts = ttt0
    call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                         tmat, wantDebug, useGPU, success)
    if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
      call timer%stop()
#endif
      return
    endif
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time bandred_complex            :',ttt1-ttt0

    ! Reduction band -> tridiagonal

    allocate(e(na), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
      stop
    endif

    ttt0 = MPI_Wtime()
    call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time tridiag_band_complex       :',ttt1-ttt0

    call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
    call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)

    ttt1 = MPI_Wtime()
    time_evp_fwd = ttt1-ttts

    l_rows = local_index(na, my_prow, np_rows, nblk, -1)      ! Local rows of a and q
    l_cols = local_index(na, my_pcol, np_cols, nblk, -1)      ! Local columns of q
    l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

    allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
      stop
    endif

    ! Solve tridiagonal system

    ttt0 = MPI_Wtime()
    call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
                     mpi_comm_rows, mpi_comm_cols, wantDebug, success)
    if (.not.(success)) return

    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time solve_tridi                :',ttt1-ttt0
    time_evp_solve = ttt1-ttt0
    ttts = ttt1

    q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)

    deallocate(e, q_real, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
      stop
    endif

    ! Backtransform stage 1

    ttt0 = MPI_Wtime()
    call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq,               &
                                        matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                        wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
    if (.not.(success)) return
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0

    ! We can now deallocate the stored householder vectors
    deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage
      stop
    endif

    ! Backtransform stage 2

    ttt0 = MPI_Wtime()
    call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                       mpi_comm_cols, useGPU)
    ttt1 = MPI_Wtime()
    if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
       write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
    time_evp_back = ttt1-ttts

    deallocate(tmat, stat=istat, errmsg=errorMessage)
    if (istat .ne. 0) then
      print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
      stop
    endif

#ifdef HAVE_DETAILED_TIMINGS
    call timer%stop("solve_evp_complex_2stage")
#endif

1   format(a,f10.3)

  end function solve_evp_complex_2stage

!-------------------------------------------------------------------------------

end module ELPA2
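
!-------------------------------------------------------------------------------
! Minimal usage sketch (commented out; not compiled as part of the library).
! It shows one way a caller might create the communicators and invoke
! solve_evp_real_2stage. The program name, the 2x2 process grid, the row-major
! mapping of MPI tasks to the grid, and the use of ScaLAPACK's numroc are
! illustrative assumptions, not requirements of this module; the only hard
! requirement is the block-cyclic distribution described in the parameter
! documentation above.
!
!   program example_elpa2_real                  ! hypothetical test driver
!     use ELPA2
!     implicit none
!     include 'mpif.h'
!     integer, external   :: numroc
!     integer             :: na, nev, nblk, np_rows, np_cols, my_prow, my_pcol
!     integer             :: my_id, n_procs, mpierr, mpi_comm_rows, mpi_comm_cols
!     integer             :: lda, ldq, matrixCols
!     real*8, allocatable :: a(:,:), q(:,:), ev(:)
!     logical             :: success
!
!     call mpi_init(mpierr)
!     call mpi_comm_rank(MPI_COMM_WORLD, my_id, mpierr)
!     call mpi_comm_size(MPI_COMM_WORLD, n_procs, mpierr)
!
!     na = 1000; nev = 100; nblk = 16           ! example problem sizes
!     np_rows = 2; np_cols = 2                  ! example grid; np_rows*np_cols == n_procs
!     my_prow = my_id / np_cols                 ! row-major task-to-grid mapping (assumption)
!     my_pcol = mod(my_id, np_cols)
!
!     ! Convention assumed here: all tasks in the same process column share
!     ! mpi_comm_rows, all tasks in the same process row share mpi_comm_cols.
!     call mpi_comm_split(MPI_COMM_WORLD, my_pcol, my_prow, mpi_comm_rows, mpierr)
!     call mpi_comm_split(MPI_COMM_WORLD, my_prow, my_pcol, mpi_comm_cols, mpierr)
!
!     ! local dimensions of the block-cyclic layout
!     lda        = numroc(na, nblk, my_prow, 0, np_rows)
!     matrixCols = numroc(na, nblk, my_pcol, 0, np_cols)
!     ldq        = lda
!     allocate(a(lda,matrixCols), q(ldq,matrixCols), ev(na))
!
!     ! ... fill a with the locally owned part of the full symmetric matrix ...
!
!     success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk,        &
!                                     matrixCols, mpi_comm_rows, mpi_comm_cols, &
!                                     MPI_COMM_WORLD)
!
!     call mpi_finalize(mpierr)
!   end program
!-------------------------------------------------------------------------------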