Commit e859a72e authored by Andreas Marek

Start to add matrix re-distribution

parent 919e9ba0
@@ -661,7 +661,8 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
   # add tests for scalapack for some specific test cases
   runScalapackTest = False
-  if (instr == "avx2" and cov == "coverage" and m == "mpi"):
+  #if (instr == "avx2" and cov == "coverage" and m == "mpi"):
+  if (instr == "avx2" and m == "mpi"):
     runScalapackTest = True
@@ -790,7 +791,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
   if (runScalapackTest):
     print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
     + libs + " " + ldflags + " " + " "+ scalapackldflags +" " + scalapackfcflags \
-    + " --enable-option-checking=fatal --enable-scalapack-tests" + " " + mpi_configure_flag + " " + openmp[o] \
+    + " --enable-option-checking=fatal --enable-scalapack-tests --enable-autotune-redistribute-matrix" + " " + mpi_configure_flag + " " + openmp[o] \
     + " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
     + " " +gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -q \"srun\" -S $SLURM -g " +gpuJob)
......
@@ -104,30 +104,6 @@ if test x"${with_mpi}" = x"yes"; then
   AC_DEFINE([WITH_MPI], [1], [use MPI])
 fi

-dnl Scalapack tests
-AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
-AC_ARG_ENABLE([scalapack-tests],
-  AS_HELP_STRING([--enable-scalapack-tests],
-                 [build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
-  [
-    if test x"$enableval" = x"yes"; then
-      enable_scalapack_tests=yes
-    else
-      enable_scalapack_tests=no
-    fi
-  ],
-  [enable_scalapack_tests="no"])
-AC_MSG_RESULT([$enable_scalapack_tests])
-
-if test x"${enable_scalapack_tests}" = x"yes"; then
-  if test x"$with_mpi" = x"no"; then
-    AC_MSG_ERROR([You cannot build the SCALAPCK test cases without MPI])
-  fi
-  AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
-fi
-AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
-
 dnl C
 AC_LANG_PUSH([C])
@@ -1411,6 +1387,53 @@ if test x"${enable_autotuning}" = x"yes"; then
   AC_DEFINE([ENABLE_AUTOTUNING], [1], [enable autotuning functionality])
 fi

+dnl Scalapack tests
+AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
+AC_ARG_ENABLE([scalapack-tests],
+  AS_HELP_STRING([--enable-scalapack-tests],
+                 [build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
+  [
+    if test x"$enableval" = x"yes"; then
+      enable_scalapack_tests=yes
+    else
+      enable_scalapack_tests=no
+    fi
+  ],
+  [enable_scalapack_tests="no"])
+AC_MSG_RESULT([$enable_scalapack_tests])
+
+if test x"${enable_scalapack_tests}" = x"yes"; then
+  if test x"$with_mpi" = x"no"; then
+    AC_MSG_ERROR([You cannot build the SCALAPCK test cases without MPI])
+  fi
+  AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
+fi
+AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
+
+AC_MSG_CHECKING(whether matrix redistribution should be considered in autotuning)
+AC_ARG_ENABLE([autotune-redistribute-matrix],
+  AS_HELP_STRING([--enable-autotune-redistribute-matrix],
+                 [Allows ELPA during autotuning to re-distribute the matrix to find the best (ELPA internal) block size for block-cyclic distribution (needs ScaLAPACK functionality)]),
+  [if test x"$enableval" = x"yes"; then
+     enable_autotune_redistribute_matrix=yes
+   else
+     enable_autotune_redistribute_matrix=no
+   fi],
+  [enable_autotune_redistribute_matrix=no])
+AC_MSG_RESULT([${enable_autotune_redistribute_matrix}])
+
+if test x"${enable_autotune_redistribute_matrix}" = x"yes" ; then
+  if test x"${enable_scalapack_tests}" = x"no"; then
+    AC_MSG_ERROR([Please also set --enable-scalapack-tests in this case])
+  fi
+  if test x"${with_mpi}" = x"no"; then
+    AC_MSG_ERROR([For this option ELPA must be built with MPI enabled])
+  fi
+  AC_DEFINE([REDISTRIBUTE_MATRIX],[1],[enable matrix re-distribution during autotuning])
+fi
+
 AC_MSG_CHECKING(whether C tests should be provided)
 AC_ARG_ENABLE([c-tests],
   AS_HELP_STRING([--enable-c-tests],
......
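The two new configure switches hang together: as the checks above enforce, --enable-autotune-redistribute-matrix requires an MPI build and additionally --enable-scalapack-tests, since the re-distribution relies on ScaLAPACK functionality (the p?gemr2d copy used later in the Fortran code). A plausible configure invocation that exercises the new code path, mirroring what the CI generator now emits (the flag names are taken from this commit; --with-mpi is assumed from the existing with_mpi handling), would be:

./configure --with-mpi=yes --enable-scalapack-tests --enable-autotune-redistribute-matrix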
@@ -9,6 +9,16 @@
         name = value,
 #define ELPA_ENUM_SUM(name, value, ...) +1

+/* MATRIX layout */
+#define ELPA_FOR_ALL_MATRIX_LAYOUTS(X) \
+        X(COLUMN_MAJOR_ORDER, 1) \
+        X(ROW_MAJOR_ORDER, 2)
+
+enum MATRIX_LAYOUTS {
+        ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY)
+};
+
+#define ELPA_NUMBER_OF_MATRIX_LAYOUTS (0 ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM))
+
 /* Solver constants */
 #define ELPA_FOR_ALL_SOLVERS(X) \
......
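The new layout constants reuse the header's X-macro pattern: expanding the list with ELPA_ENUM_ENTRY produces the enumerators, expanding it with ELPA_ENUM_SUM counts them. A minimal standalone sketch of that expansion (ELPA_ENUM_ENTRY is reconstructed from the continuation line shown above; the main() is illustration only, not part of the header):

#include <stdio.h>

/* helper macros, as defined earlier in the header */
#define ELPA_ENUM_ENTRY(name, value, ...) name = value,
#define ELPA_ENUM_SUM(name, value, ...) +1

/* the newly added matrix-layout list */
#define ELPA_FOR_ALL_MATRIX_LAYOUTS(X) \
        X(COLUMN_MAJOR_ORDER, 1) \
        X(ROW_MAJOR_ORDER, 2)

/* expands to: COLUMN_MAJOR_ORDER = 1, ROW_MAJOR_ORDER = 2, */
enum MATRIX_LAYOUTS {
        ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY)
};

/* expands to: (0 +1 +1) == 2 */
#define ELPA_NUMBER_OF_MATRIX_LAYOUTS (0 ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM))

int main(void) {
        printf("%d %d %d\n", COLUMN_MAJOR_ORDER, ROW_MAJOR_ORDER, ELPA_NUMBER_OF_MATRIX_LAYOUTS);
        /* prints: 1 2 2 */
        return 0;
}

With gcc or clang this prints "1 2 2"; a strictly pedantic C99 compiler may warn about the empty variadic argument, which the original header relies on in the same way.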
@@ -58,7 +58,7 @@ function elpa_solve_evp_&
 &MATH_DATATYPE&
 &_1stage_&
 &PRECISION&
-&_impl (obj, a, ev, q) result(success)
+&_impl (obj, aExtern, ev, qExtern) result(success)
   use precision
   use cuda_functions
   use mod_check_for_gpu
@@ -67,35 +67,41 @@ function elpa_solve_evp_&
   use elpa_mpi
   use elpa1_compute
   use elpa_omp
+#ifdef REDISTRIBUTE_MATRIX
+  use elpa_scalapack_interfaces
+#endif
   implicit none
 #include "../general/precision_kinds.F90"
   class(elpa_abstract_impl_t), intent(inout) :: obj
   real(kind=REAL_DATATYPE), intent(out) :: ev(obj%na)
 #ifdef USE_ASSUMED_SIZE
-  MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,*)
-  MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*)
+  MATH_DATATYPE(kind=rck), intent(inout), target :: aExtern(obj%local_nrows,*)
+  MATH_DATATYPE(kind=rck), optional,target,intent(out) :: qExtern(obj%local_nrows,*)
 #else
-  MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
+  MATH_DATATYPE(kind=rck), intent(inout), target :: aExtern(obj%local_nrows,obj%local_ncols)
 #ifdef HAVE_SKEWSYMMETRIC
-  MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols)
+  MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,2*obj%local_ncols)
 #else
-  MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
+  MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,obj%local_ncols)
 #endif
 #endif
+
+  MATH_DATATYPE(kind=rck), pointer :: a(:,:)
+  MATH_DATATYPE(kind=rck), pointer :: q(:,:)
+
 #if REALCASE == 1
   real(kind=C_DATATYPE_KIND), allocatable :: tau(:)
   real(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:)
   real(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
 #endif /* REALCASE */
 #if COMPLEXCASE == 1
   real(kind=REAL_DATATYPE), allocatable :: q_real(:,:)
   complex(kind=C_DATATYPE_KIND), allocatable :: tau(:)
   complex(kind=C_DATATYPE_KIND), allocatable,target :: q_dummy(:,:)
   complex(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
 #endif /* COMPLEXCASE */
@@ -117,13 +123,30 @@ function elpa_solve_evp_&
   logical :: wantDebug
   integer(kind=c_int) :: istat, debug, gpu
   character(200) :: errorMessage
-  integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, &
-                      mpi_comm_rows, mpi_comm_cols, &
-                      mpi_comm_all, check_pd, i, error
+  integer(kind=ik) :: na, nev, nblk, matrixCols, &
+                      mpi_comm_rows, mpi_comm_cols, &
+                      mpi_comm_all, check_pd, i, error, matrixRows
+#ifdef REDISTRIBUTE_MATRIX
+  integer(kind=ik) :: nblkInternal, matrixOrder
+  character(len=1) :: layoutInternal, layoutExternal
+  integer(kind=BLAS_KIND) :: external_blacs_ctxt, external_blacs_ctxt_
+  integer(kind=BLAS_KIND) :: np_rows_, np_cols_, my_prow_, my_pcol_
+  integer(kind=BLAS_KIND) :: np_rows__, np_cols__, my_prow__, my_pcol__
+  integer(kind=BLAS_KIND) :: sc_desc_(1:9), sc_desc(1:9)
+  integer(kind=BLAS_KIND) :: na_rows_, na_cols_, info_, blacs_ctxt_
+  integer(kind=ik) :: mpi_comm_rows_, mpi_comm_cols_
+  integer(kind=MPI_KIND) :: mpi_comm_rowsMPI_, mpi_comm_colsMPI_
+  character(len=1), parameter :: matrixLayouts(2) = [ 'C', 'R' ]
+  MATH_DATATYPE(kind=rck), allocatable, target :: aIntern(:,:)
+  MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target :: qIntern(:,:)
+#endif
   logical :: do_tridiag, do_solve, do_trans_ev
   integer(kind=ik) :: nrThreads
   integer(kind=ik) :: global_index
+  logical :: reDistributeMatrix, doRedistributeMatrix
@@ -131,6 +154,19 @@ function elpa_solve_evp_&
   &PRECISION&
   &")

+  reDistributeMatrix = .false.
+
+  matrixRows = obj%local_nrows
+  matrixCols = obj%local_ncols
+
+  call obj%get("mpi_comm_parent", mpi_comm_all, error)
+  if (error .ne. ELPA_OK) then
+    print *,"Problem getting option. Aborting..."
+    stop
+  endif
+  call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
+  my_pe = int(my_peMPI,kind=c_int)
+
 #ifdef WITH_OPENMP
   ! store the number of OpenMP threads used in the calling function
   ! restore this at the end of ELPA 2
@@ -149,7 +185,7 @@ function elpa_solve_evp_&
   success = .true.

-  if (present(q)) then
+  if (present(qExtern)) then
     obj%eigenvalues_only = .false.
   else
     obj%eigenvalues_only = .true.
@@ -157,11 +193,178 @@ function elpa_solve_evp_&
   na = obj%na
   nev = obj%nev
-  lda = obj%local_nrows
-  ldq = obj%local_nrows
+  matrixRows = obj%local_nrows
   nblk = obj%nblk
   matrixCols = obj%local_ncols

+  call obj%get("mpi_comm_rows",mpi_comm_rows,error)
+  if (error .ne. ELPA_OK) then
+    print *,"Problem getting option. Aborting..."
+    stop
+  endif
+  call obj%get("mpi_comm_cols",mpi_comm_cols,error)
+  if (error .ne. ELPA_OK) then
+    print *,"Problem getting option. Aborting..."
+    stop
+  endif
+
+#ifdef REDISTRIBUTE_MATRIX
+  doRedistributeMatrix = .false.
+
+  call obj%set("internal_nblk", 32, error)
+
+  if (obj%is_set("internal_nblk") == 1) then
+    if (obj%is_set("matrix_order") == 1) then
+      reDistributeMatrix = .true.
+    else
+      reDistributeMatrix = .false.
+      if (my_pe == 0) then
+        write(error_unit,*) 'Warning: Matrix re-distribution is not used, since you did not set the matrix_order'
+        write(error_unit,*) '         Only task 0 prints this warning'
+      endif
+    endif
+  endif
+
+  if (reDistributeMatrix) then
+    ! get the block-size to be used
+    call obj%get("internal_nblk", nblkInternal, error)
+    if (error .ne. ELPA_OK) then
+      print *,"Problem getting option. Aborting..."
+      stop
+    endif
+    call obj%get("matrix_order", matrixOrder, error)
+    if (error .ne. ELPA_OK) then
+      print *,"Problem getting option. Aborting..."
+      stop
+    endif
+    layoutExternal = matrixLayouts(matrixOrder)
+
+    if (nblkInternal == nblk) then
+      doRedistributeMatrix = .false.
+    else
+      doRedistributeMatrix = .true.
+    endif
+  endif
+
+  if (doRedistributeMatrix) then
+    ! create a new internal blacs descriptor;
+    ! the matrix will still be distributed over the same process grid
+    ! as the input matrices A and Q were
+    external_blacs_ctxt = int(mpi_comm_all,kind=BLAS_KIND)
+
+    ! construct the current descriptor
+    if (obj%is_set("blacs_context") == 0) then
+      call obj%set("blacs_context", int(external_blacs_ctxt,kind=c_int), error)
+      if (error .NE. ELPA_OK) then
+        stop "A"
+      endif
+    endif
+    sc_desc(1) = 1
+    sc_desc(2) = external_blacs_ctxt
+    sc_desc(3) = obj%na
+    sc_desc(4) = obj%na
+    sc_desc(5) = obj%nblk
+    sc_desc(6) = obj%nblk
+    sc_desc(7) = 0
+    sc_desc(8) = 0
+    sc_desc(9) = obj%local_nrows
+
+    layoutInternal = 'C'
+
+    ! and get np_rows and np_cols
+    call obj%get("mpi_comm_rows",mpi_comm_rows,error)
+    call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
+    call obj%get("mpi_comm_cols",mpi_comm_cols,error)
+    call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
+
+    np_rows = int(np_rowsMPI,kind=c_int)
+    np_cols = int(np_colsMPI,kind=c_int)
+
+    blacs_ctxt_ = int(mpi_comm_all,kind=BLAS_KIND)
+
+    ! we get a new blacs context and the local process grid coordinates
+    call BLACS_Gridinit(blacs_ctxt_, layoutInternal, int(np_rows,kind=BLAS_KIND), int(np_cols,kind=BLAS_KIND))
+    call BLACS_Gridinfo(blacs_ctxt_, np_rows_, np_cols_, my_prow_, my_pcol_)
+
+    if (np_rows /= np_rows_) then
+      print *, "BLACS_Gridinfo returned different values for np_rows than set by BLACS_Gridinit"
+      stop 1
+    endif
+    if (np_cols /= np_cols_) then
+      print *, "BLACS_Gridinfo returned different values for np_cols than set by BLACS_Gridinit"
+      stop 1
+    endif
+
+    call mpi_comm_split(int(mpi_comm_all,kind=MPI_KIND), int(my_pcol_,kind=MPI_KIND), &
+                        int(my_prow_,kind=MPI_KIND), mpi_comm_rowsMPI_, mpierr)
+    mpi_comm_rows_ = int(mpi_comm_rowsMPI_,kind=c_int)
+    call mpi_comm_split(int(mpi_comm_all,kind=MPI_KIND), int(my_prow_,kind=MPI_KIND), &
+                        int(my_pcol_,kind=MPI_KIND), mpi_comm_colsMPI_, mpierr)
+    mpi_comm_cols_ = int(mpi_comm_colsMPI_,kind=c_int)
+
+    if (int(np_rows_,kind=c_int) /= np_rows) then
+      print *,"OHO: ",np_rows,np_rows_
+      print *, "BLACS_Gridinfo returned different values for np_rows than set by BLACS_Gridinit"
+      stop
+    endif
+    if (int(np_cols_,kind=c_int) /= np_cols) then
+      print *,"OHO: ",np_cols,np_cols_
+      print *, "BLACS_Gridinfo returned different values for np_cols than set by BLACS_Gridinit"
+      stop
+    endif
+
+    ! now we can set up the blacs descriptor
+    sc_desc_(:) = 0
+    na_rows_ = numroc(int(na,kind=BLAS_KIND), int(nblkInternal,kind=BLAS_KIND), my_prow_, 0_BLAS_KIND, np_rows_)
+    na_cols_ = numroc(int(na,kind=BLAS_KIND), int(nblkInternal,kind=BLAS_KIND), my_pcol_, 0_BLAS_KIND, np_cols_)
+
+    info_ = 0
+    call descinit(sc_desc_, int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), int(nblkInternal,kind=BLAS_KIND), &
+                  int(nblkInternal,kind=BLAS_KIND), 0_BLAS_KIND, 0_BLAS_KIND, &
+                  blacs_ctxt_, na_rows_, info_)
+
+    if (info_ .ne. 0) then
+      write(error_unit,*) 'Error in BLACS descinit! info=',info_
+      write(error_unit,*) 'the internal re-distribution of the matrices failed!'
+      call MPI_ABORT(int(mpi_comm_all,kind=MPI_KIND), 1_MPI_KIND, mpierr)
+    endif
+
+    ! allocate the memory for the redistributed matrices
+    allocate(aIntern(na_rows_,na_cols_))
+#ifdef HAVE_SKEWSYMMETRIC
+    allocate(qIntern(na_rows_,2*na_cols_))
+#else
+    allocate(qIntern(na_rows_,na_cols_))
+#endif
+
+    call scal_PRECISION_GEMR2D &
+         (int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), aExtern, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, aIntern, &
+          1_BLAS_KIND, 1_BLAS_KIND, sc_desc_, blacs_ctxt_)
+    !call scal_PRECISION_GEMR2D &
+    !     (int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), qExtern, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, qIntern, &
+    !      1_BLAS_KIND, 1_BLAS_KIND, sc_desc_, blacs_ctxt_)
+
+    ! map all important new variables to be used from here on
+    nblk = nblkInternal
+    matrixCols = na_cols_
+    matrixRows = na_rows_
+    mpi_comm_cols = mpi_comm_cols_
+    mpi_comm_rows = mpi_comm_rows_
+
+    a => aIntern(1:matrixRows,1:matrixCols)
+    q => qIntern(1:matrixRows,1:matrixCols)
+  else
+    a => aExtern(1:matrixRows,1:matrixCols)
+    q => qExtern(1:matrixRows,1:matrixCols)
+  endif
+#endif /* REDISTRIBUTE_MATRIX */
+
   ! special case na = 1
   if (na .eq. 1) then
 #if REALCASE == 1
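The redistribution block above follows the standard ScaLAPACK recipe: describe the incoming block-cyclic layout with one descriptor, build a second descriptor for the internal block size on (a copy of) the same process grid, and let p?gemr2d move the data. A compact standalone sketch of that recipe, written against the C-callable BLACS/ScaLAPACK symbols (Cblacs_*, numroc_, descinit_, pdgemr2d_, whose availability and exact mangling are assumptions about a typical ScaLAPACK build; grid shape, matrix size and block sizes are made-up example values, not taken from the commit):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* C-callable BLACS/ScaLAPACK entry points (signatures assumed, as commonly exported) */
extern void Cblacs_get(int ctxt, int what, int *val);
extern void Cblacs_gridinit(int *ctxt, const char *layout, int nprow, int npcol);
extern void Cblacs_gridinfo(int ctxt, int *nprow, int *npcol, int *myrow, int *mycol);
extern void Cblacs_gridexit(int ctxt);
extern void Cblacs_exit(int notdone);
extern int  numroc_(const int *n, const int *nb, const int *iproc,
                    const int *isrcproc, const int *nprocs);
extern void descinit_(int *desc, const int *m, const int *n, const int *mb, const int *nb,
                      const int *irsrc, const int *icsrc, const int *ctxt,
                      const int *lld, int *info);
extern void pdgemr2d_(const int *m, const int *n,
                      const double *a, const int *ia, const int *ja, const int *desca,
                      double *b, const int *ib, const int *jb, const int *descb,
                      const int *gcontext);

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  /* example values: np x 1 process grid, 16x16 matrix, external nblk=4, internal nblk=8 */
  const int na = 16, nblk_ext = 4, nblk_int = 8, izero = 0, ione = 1;
  int nprow = 0, npcol = 1, myrow, mycol, ctxt, info;

  MPI_Comm_size(MPI_COMM_WORLD, &nprow);

  Cblacs_get(-1, 0, &ctxt);                       /* system default context    */
  Cblacs_gridinit(&ctxt, "C", nprow, npcol);      /* column-major process grid */
  Cblacs_gridinfo(ctxt, &nprow, &npcol, &myrow, &mycol);

  /* local sizes and descriptors for the external and the internal block size */
  int rows_ext = numroc_(&na, &nblk_ext, &myrow, &izero, &nprow);
  int cols_ext = numroc_(&na, &nblk_ext, &mycol, &izero, &npcol);
  int rows_int = numroc_(&na, &nblk_int, &myrow, &izero, &nprow);
  int cols_int = numroc_(&na, &nblk_int, &mycol, &izero, &npcol);
  int lld_ext = rows_ext > 1 ? rows_ext : 1;
  int lld_int = rows_int > 1 ? rows_int : 1;

  int desc_ext[9], desc_int[9];
  descinit_(desc_ext, &na, &na, &nblk_ext, &nblk_ext, &izero, &izero, &ctxt, &lld_ext, &info);
  descinit_(desc_int, &na, &na, &nblk_int, &nblk_int, &izero, &izero, &ctxt, &lld_int, &info);

  double *a_ext = calloc((size_t)lld_ext * (cols_ext > 1 ? cols_ext : 1), sizeof(double));
  double *a_int = calloc((size_t)lld_int * (cols_int > 1 ? cols_int : 1), sizeof(double));

  /* redistribute from the external to the internal block-cyclic layout */
  pdgemr2d_(&na, &na, a_ext, &ione, &ione, desc_ext,
            a_int, &ione, &ione, desc_int, &ctxt);

  if (myrow == 0 && mycol == 0)
    printf("redistributed %dx%d matrix from nblk=%d to nblk=%d\n", na, na, nblk_ext, nblk_int);

  free(a_ext); free(a_int);
  Cblacs_gridexit(ctxt);
  Cblacs_exit(1);
  MPI_Finalize();
  return 0;
}

The scal_PRECISION_GEMR2D call in the hunk above presumably resolves to the precision-specific p?gemr2d variant in the same way, with the two sc_desc arrays playing the role of desc_ext and desc_int.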
@@ -193,18 +396,6 @@ function elpa_solve_evp_&
     obj%eigenvalues_only = .true.
   endif

-  call obj%get("mpi_comm_rows",mpi_comm_rows,error)
-  if (error .ne. ELPA_OK) then
-    print *,"Problem getting option. Aborting..."
-    stop
-  endif
-  call obj%get("mpi_comm_cols",mpi_comm_cols,error)
-  if (error .ne. ELPA_OK) then
-    print *,"Problem getting option. Aborting..."
-    stop
-  endif
-
   call obj%get("gpu",gpu,error)
   if (error .ne. ELPA_OK) then
     print *,"Problem getting option. Aborting..."
@@ -251,13 +442,6 @@ function elpa_solve_evp_&
   if (useGPU) then
     call obj%timer%start("check_for_gpu")

-    call obj%get("mpi_comm_parent", mpi_comm_all,error)
-    if (error .ne. ELPA_OK) then
-      print *,"Problem getting option. Aborting..."
-      stop
-    endif
-    call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
-    my_pe = int(my_peMPI,kind=c_int)

     if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
       do_useGPU = .true.
@@ -311,12 +495,16 @@ function elpa_solve_evp_&
   ! allocate a dummy q_intern, if eigenvectors should not be commputed and thus q is NOT present
   if (.not.(obj%eigenvalues_only)) then
-    q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
+    q_actual => q(1:matrixRows,1:matrixCols)
   else
-    allocate(q_dummy(obj%local_nrows,obj%local_ncols))
+    allocate(q_dummy(1:matrixRows,1:matrixCols))
     q_actual => q_dummy
   endif

+  ! test only
+  l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
+  l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
+
 #if COMPLEXCASE == 1
   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
@@ -363,7 +551,7 @@ function elpa_solve_evp_&
   &MATH_DATATYPE&
   &_&
   &PRECISION&
-  & (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
+  & (obj, na, a, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
 #ifdef WITH_NVTX
   call nvtxRangePop()
@@ -382,12 +570,11 @@ function elpa_solve_evp_&
 #ifdef WITH_NVTX
   call nvtxRangePush("solve")
 #endif
-
   call solve_tridi_&
   &PRECISION&
   & (obj, na, nev, ev, e, &
 #if REALCASE == 1
-  q_actual, ldq, &
+  q_actual, matrixRows, &
 #endif