Commit 7d2f7fe2 authored by Andreas Marek

Template for ELPA1

parent dba18ce9
@@ -51,6 +51,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa1_template.X90 \
src/elpa2_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
@@ -970,6 +971,7 @@ EXTRA_DIST = \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa1_template.X90 \
src/elpa2_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
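These build-system hunks only distribute the new template file; the instantiation happens in the ELPA1 Fortran source further down in this commit, where each type/precision variant of the one-stage driver is generated by defining a few preprocessor switches and then including the template. The lines below are copied from the includes added in this diff; the contents of precision_macros.h and elpa1_template.X90 themselves are not part of this excerpt:

! real, double-precision instantiation of the ELPA1 one-stage driver
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa1_template.X90"
#undef REALCASE
#undef DOUBLE_PRECISION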
@@ -557,151 +557,14 @@ end function elpa_get_communicators
!>
!> \result success
#define DOUBLE_PRECISION_REAL 1
#define DOUBLE_PRECISION_COMPLEX 1
#define REAL_DATATYPE c_double
#define COMPLEX_DATATYPE c_double
function elpa_solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
useGPU) result(success)
use precision
use cuda_functions
use mod_check_for_gpu
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all
real(kind=REAL_DATATYPE), intent(out) :: ev(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,*)
real(kind=REAL_DATATYPE), intent(out) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,matrixCols)
real(kind=REAL_DATATYPE), intent(out) :: q(ldq,matrixCols)
#endif
logical, intent(in), optional :: useGPU
logical :: success
logical :: do_useGPU
integer(kind=ik) :: numberOfGPUDevices
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical, save :: firstCall = .true.
logical :: wantDebug
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("elpa_solve_evp_real_1stage_double")
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
do_useGPU = .false.
if (present(useGPU)) then
! user defined GPU usage via the optional argument in the API call
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#else
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#else
call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#else
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
time_evp_back = ttt1-ttt0
deallocate(e, tau)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("elpa_solve_evp_real_1stage_double")
#endif
end function elpa_solve_evp_real_1stage_double
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
#undef COMPLEX_DATATYPE
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa1_template.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
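For orientation, a caller-side sketch of how the double-precision real driver above is invoked. It assumes the matrix is already distributed block-cyclically and that the row, column and global communicators were set up beforehand (e.g. with elpa_get_communicators, visible in the hunk header above); apart from the dummy-argument names taken from the interface, the variable ok is illustrative:

   ! minimal call-site sketch (fragment, not a complete MPI program);
   ! a(lda,matrixCols), q(ldq,matrixCols) and ev(na) are assumed to be
   ! allocated with kind c_double, matching the interface above
   logical :: ok

   ok = elpa_solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                                          matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                          mpi_comm_all, useGPU=.false.)
   if (.not. ok) then
      print *, "elpa_solve_evp_real_1stage_double reported failure"
   endif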
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_COMPLEX
#define REAL_DATATYPE c_float
#define COMPLEX_DATATYPE c_float
!> \brief elpa_solve_evp_real_1stage_single: Fortran function to solve the real single-precision eigenvalue problem with 1-stage solver
!>
! Parameters
@@ -738,149 +601,12 @@ end function elpa_solve_evp_real_1stage_double
!>
!> \result success
function elpa_solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
useGPU) result(success)
use cuda_functions
use mod_check_for_gpu
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
real(kind=REAL_DATATYPE), intent(out) :: ev(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,*)
real(kind=REAL_DATATYPE), intent(out) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,matrixCols)
real(kind=REAL_DATATYPE), intent(out) :: q(ldq,matrixCols)
#endif
logical, intent(in), optional :: useGPU
logical :: success
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical, save :: firstCall = .true.
logical :: wantDebug
logical :: do_useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("elpa_solve_evp_real_1stage_single")
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
do_useGPU = .false.
if (present(useGPU)) then
! user defined GPU usage via the optional argument in the API call
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#else
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#else
call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#else
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
time_evp_back = ttt1-ttt0
deallocate(e, tau)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("elpa_solve_evp_real_1stage_single")
#endif
end function elpa_solve_evp_real_1stage_single
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
#undef COMPLEX_DATATYPE
#define REALCASE 1
#include "precision_macros.h"
#include "elpa1_template.X90"
#undef REALCASE
#endif /* WANT_SINGLE_PRECISION_REAL */
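The single-precision variant above is only built when ELPA was configured with WANT_SINGLE_PRECISION_REAL. The call itself looks the same as in the double-precision sketch; only the array kinds change, and when the optional useGPU argument is omitted the decision falls back to gpu_usage_via_environment_variable(), as the function body shows. A hedged fragment (names with the _sp suffix are illustrative):

#ifdef WANT_SINGLE_PRECISION_REAL
   real(kind=c_float), allocatable :: a_sp(:,:), q_sp(:,:), ev_sp(:)

   allocate(a_sp(lda,matrixCols), q_sp(ldq,matrixCols), ev_sp(na))
   ! ... fill the local part of the symmetric matrix a_sp ...
   ok = elpa_solve_evp_real_1stage_single(na, nev, a_sp, lda, ev_sp, q_sp, ldq, nblk, matrixCols, &
                                          mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
#endif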
#define DOUBLE_PRECISION_REAL 1
#define DOUBLE_PRECISION_COMPLEX 1
#define REAL_DATATYPE c_double
#define COMPLEX_DATATYPE c_double
!> \brief elpa_solve_evp_complex_1stage_double: Fortran function to solve the complex double-precision eigenvalue problem with 1-stage solver
!>
! Parameters
@@ -916,159 +642,15 @@ end function elpa_solve_evp_real_1stage_single
!> \param useGPU use GPU version (.true. or .false.)
!>
!> \result success
function elpa_solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
useGPU) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a(lda,*)
complex(kind=COMPLEX_DATATYPE), intent(out) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a(lda,matrixCols)
complex(kind=COMPLEX_DATATYPE), intent(out) :: q(ldq,matrixCols)
#endif
real(kind=REAL_DATATYPE), intent(out) :: ev(na)
logical :: success
logical, intent(in), optional :: useGPU
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=c_int) :: l_rows, l_cols, l_cols_nev
real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical, save :: firstCall = .true.
logical :: wantDebug
logical :: do_useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("elpa_solve_evp_complex_1stage_double")
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
do_useGPU = .false.
if (present(useGPU)) then
! user defined GPU usage via the optional argument in the API call
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
allocate(e(na), tau(na))
allocate(q_real(l_rows,l_cols))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#else
call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
call solve_tridi_double(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#else
call solve_tridi_single(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#ifdef DOUBLE_PRECISION_COMPLEX
call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#else
call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
time_evp_back = ttt1-ttt0
deallocate(q_real)
deallocate(e, tau)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("elpa_solve_evp_complex_1stage_double")
#endif
end function elpa_solve_evp_complex_1stage_double
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
#undef COMPLEX_DATATYPE
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa1_template.X90"
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
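For the complex one-stage driver the matrix a and the eigenvectors q are complex while the eigenvalues ev stay real, as the interface above shows; internally the tridiagonal eigenproblem is solved in the real work array q_real before the complex back-transformation. A caller-side sketch (fragment; distribution and communicators as in the real case, all names other than the dummy arguments are illustrative):

   complex(kind=c_double), allocatable :: a(:,:), q(:,:)
   real(kind=c_double),    allocatable :: ev(:)
   logical                             :: ok

   allocate(a(lda,matrixCols), q(ldq,matrixCols), ev(na))
   ! ... fill the local part of the Hermitian matrix a ...
   ok = elpa_solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
                                             mpi_comm_rows, mpi_comm_cols, mpi_comm_all,    &
                                             useGPU=.true.)
   if (.not. ok) then
      print *, "complex 1-stage solver failed (e.g. a GPU was requested but not detected)"
   endif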
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION_COMPLEX
#define COMPLEX_DATATYPE c_float
#define REAL_DATATYPE c_float
!> \brief elpa_solve_evp_complex_1stage_single: Fortran function to solve the complex single-precision eigenvalue problem with 1-stage solver
!>
@@ -1106,154 +688,10 @@ end function elpa_solve_evp_complex_1stage_double
!>
!> \result success
function elpa_solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
useGPU) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols
integer(kind=c_int), intent(in) :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a(lda,*)
complex(kind=COMPLEX_DATATYPE), intent(out) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a(lda,matrixCols)
complex(kind=COMPLEX_DATATYPE), intent(out) :: q(ldq,matrixCols)
#endif
real(kind=REAL_DATATYPE), intent(out) :: ev(na)
logical, intent(in), optional :: useGPU
logical :: success
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=c_int) :: l_rows, l_cols, l_cols_nev
real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical, save :: firstCall = .true.
logical :: wantDebug
logical :: do_useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("elpa_solve_evp_complex_1stage_single")
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
success = .true.
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
do_useGPU = .false.
if (present(useGPU)) then
! user defined GPU usage via the optional argument in the API call
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
allocate(e(na), tau(na))
allocate(q_real(l_rows,l_cols))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#else
call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_COMPLEX
call solve_tridi_double(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#else
call solve_tridi_single(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)