Commit d13bdb79 authored by Andreas Marek

Merge branch 'matrix_redistribute' into ELPA_GPU_pinned

parents 41b542c7 b380ce92
@@ -123,6 +123,28 @@ gnu-gnu-mpi-openmp-ilp64:
- ./ci_test_scripts/run_ci_tests.sh -c "CC=\"mpicc\" CFLAGS=\"-O3 -mavx\" FC=\"mpif90\" FCFLAGS=\"-O3 -mavx\" SCALAPACK_LDFLAGS=\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP_ILP64 \" SCALAPACK_FCFLAGS=\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP_ILP64 \" --enable-option-checking=fatal --with-mpi=yes --enable-openmp --disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM
 
 
# gnu-gnu-matrix-redistribute-mpi-noomp
gnu-gnu-mpi-noopenmp-matrix-redistribute:
  tags:
    - avx
  artifacts:
    when: on_success
    expire_in: 2 month
  script:
    - ./ci_test_scripts/run_ci_tests.sh -c "CC=\"mpicc\" CFLAGS=\"-O3 -mavx\" FC=\"mpif90\" FCFLAGS=\"-O3 -mavx\" SCALAPACK_LDFLAGS=\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \" SCALAPACK_FCFLAGS=\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP \" --enable-option-checking=fatal --with-mpi=yes --disable-openmp --disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM

# gnu-gnu-matrix-redistribute-mpi-openmp
gnu-gnu-mpi-openmp-matrix-redistribute:
  tags:
    - avx
  artifacts:
    when: on_success
    expire_in: 2 month
  script:
    - ./ci_test_scripts/run_ci_tests.sh -c "CC=\"mpicc\" CFLAGS=\"-O3 -mavx\" FC=\"mpif90\" FCFLAGS=\"-O3 -mavx\" SCALAPACK_LDFLAGS=\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP \" SCALAPACK_FCFLAGS=\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP \" --enable-option-checking=fatal --with-mpi=yes --enable-openmp --disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM
# python tests
python-intel-intel-mpi-openmp:
tags:
@@ -74,6 +74,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1/elpa1_compute_template.F90 \
src/elpa2/elpa2_compute_real_template.F90 \
src/elpa2/elpa2_compute_complex_template.F90 \
src/helpers/elpa_redistribute_template.F90 \
src/elpa1/elpa1_template.F90 \
src/elpa2/elpa2_template.F90 \
src/elpa2/qr/qr_utils_template.F90 \
@@ -672,6 +673,7 @@ EXTRA_DIST = \
src/GPU/cuUtils_template.cu \
src/elpa_api_math_template.F90 \
src/elpa_impl_math_template.F90 \
src/helpers/elpa_redistribute_template.F90 \
src/elpa_impl_generalized_transform_template.F90 \
src/elpa1/elpa1_compute_template.F90 \
src/elpa1/elpa1_merge_systems_real_template.F90 \
@@ -416,6 +416,48 @@ ilp64_no_omp_mpi_tests = [
print("\n".join(ilp64_no_omp_mpi_tests))
# two tests for matrix-redistribute
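# note: each stanza below builds its shell command from adjacent string
# literals, which Python concatenates into a single list element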
matrix_redistribute_mpi_tests = [
    "# gnu-gnu-matrix-redistribute-mpi-noomp",
    "gnu-gnu-mpi-noopenmp-matrix-redistribute:",
    "  tags:",
    "    - avx",
    "  artifacts:",
    "    when: on_success",
    "    expire_in: 2 month",
    "  script:",
    '    - ./ci_test_scripts/run_ci_tests.sh -c "'
    'CC=\\"mpicc\\" CFLAGS=\\"-O3 -mavx\\" '
    'FC=\\"mpif90\\" FCFLAGS=\\"-O3 -mavx\\" '
    'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP \\" '
    'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP \\" '
    '--enable-option-checking=fatal --with-mpi=yes --disable-openmp '
    '--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
    '" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
    '-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
    "\n",
    "# gnu-gnu-matrix-redistribute-mpi-openmp",
    "gnu-gnu-mpi-openmp-matrix-redistribute:",
    "  tags:",
    "    - avx",
    "  artifacts:",
    "    when: on_success",
    "    expire_in: 2 month",
    "  script:",
    '    - ./ci_test_scripts/run_ci_tests.sh -c "'
    'CC=\\"mpicc\\" CFLAGS=\\"-O3 -mavx\\" '
    'FC=\\"mpif90\\" FCFLAGS=\\"-O3 -mavx\\" '
    'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_OMP \\" '
    'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_OMP \\" '
    '--enable-option-checking=fatal --with-mpi=yes --enable-openmp '
    '--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-scalapack-tests --enable-autotune-redistribute-matrix || { cat config.log; exit 1; }'
    '" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
    '-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
    "\n",
]
print("\n".join(matrix_redistribute_mpi_tests))
# add python tests
python_ci_tests = [
"# python tests",
@@ -661,8 +703,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    # add tests for scalapack for some specific test cases
    runScalapackTest = False
    if (instr == "avx2" and cov == "coverage" and m == "mpi"):
        runScalapackTest = True
    #if (instr == "avx2" and cov == "coverage" and m == "mpi"):
    #if (instr == "avx2" and m == "mpi"):
    #    runScalapackTest = True

    # address-sanitize only with gnu compiler
@@ -790,7 +833,7 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    if (runScalapackTest):
        print("  - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
              + libs + " " + ldflags + " " + " "+ scalapackldflags +" " + scalapackfcflags \
              + " --enable-option-checking=fatal --enable-scalapack-tests" + " " + mpi_configure_flag + " " + openmp[o] \
              + " --enable-option-checking=fatal --enable-scalapack-tests --enable-autotune-redistribute-matrix" + " " + mpi_configure_flag + " " + openmp[o] \
              + " " + precision[p] + " " + assumed_size[a] + " " + band_to_full_blocking[b] \
              + " " +gpu[g] + INSTRUCTION_OPTIONS + "\" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -q \"srun\" -S $SLURM -g " +gpuJob)
@@ -104,30 +104,6 @@ if test x"${with_mpi}" = x"yes"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPACK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
dnl C
AC_LANG_PUSH([C])
@@ -1424,6 +1400,100 @@ if test x"${enable_autotuning}" = x"yes"; then
AC_DEFINE([ENABLE_AUTOTUNING], [1], [enable autotuning functionality])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPACK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
AC_MSG_CHECKING(whether matrix redistribution should be considered in autotuning)
AC_ARG_ENABLE([autotune-redistribute-matrix],
AS_HELP_STRING([--enable-autotune-redistribute-matrix],
[Allows ELPA during autotuning to re-distribute the matrix to find the best (ELPA internal) block size for block-cyclic distribution (Needs Scalapack functionality)]),
[if test x"$enableval" = x"yes"; then
enable_autotune_redistribute_matrix=yes
else
enable_autotune_redistribute_matrix=no
fi],
[enable_autotune_redistribute_matrix=no])
AC_MSG_RESULT([${enable_autotune_redistribute_matrix}])
if test x"${enable_autotune_redistribute_matrix}" = x"yes" ; then
if test x"${enable_scalapack_tests}" = x"no"; then
AC_MSG_ERROR([Please also set --enable-scalapack-tests in this case])
fi
if test x"${with_mpi}" = x"no"; then
AC_MSG_ERROR([For this option ELPA must be built with MPI enabled])
fi
AC_DEFINE([REDISTRIBUTE_MATRIX],[1],[enable matrix re-distribution during autotuning])
fi
AC_MSG_CHECKING(whether C tests should be provided)
AC_ARG_ENABLE([c-tests],
AS_HELP_STRING([--enable-c-tests],
@@ -9,6 +9,16 @@
name = value,
#define ELPA_ENUM_SUM(name, value, ...) +1
/* MATRIX layout */
#define ELPA_FOR_ALL_MATRIX_LAYOUTS(X) \
X(COLUMN_MAJOR_ORDER, 1) \
X(ROW_MAJOR_ORDER, 2)
enum MATRIX_LAYOUTS {
        ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY)
};
#define ELPA_NUMBER_OF_MATRIX_LAYOUTS (0 ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM))
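The two constants above encode the storage order of the caller's matrix: Fortran arrays are column-major, C arrays row-major, which is what the redistribution code needs to know. A minimal, standalone Fortran sketch (not part of this commit; toy dimensions assumed) of what the two orders mean for a flat buffer:

program layout_sketch
  implicit none
  integer, parameter :: lda = 2, n = 3      ! assumed toy dimensions
  integer :: buf(lda*n), i, j, k
  buf = [(k, k = 1, lda*n)]                 ! memory viewed as a flat buffer
  i = 1; j = 2
  ! COLUMN_MAJOR_ORDER: element (i,j) lives at linear index (j-1)*lda + i
  print *, 'column-major (1,2):', buf((j-1)*lda + i)   ! prints 3
  ! ROW_MAJOR_ORDER: element (i,j) lives at linear index (i-1)*n + j
  print *, 'row-major    (1,2):', buf((i-1)*n + j)     ! prints 2
end program layout_sketch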
/* Solver constants */
#define ELPA_FOR_ALL_SOLVERS(X) \
@@ -58,7 +58,18 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
&PRECISION&
&_impl (obj, a, ev, q) result(success)
&_impl (obj, &
#ifdef REDISTRIBUTE_MATRIX
aExtern, &
#else
a, &
#endif
ev, &
#ifdef REDISTRIBUTE_MATRIX
qExtern) result(success)
#else
q) result(success)
#endif
use precision
use cuda_functions
use mod_check_for_gpu
@@ -67,35 +78,61 @@ function elpa_solve_evp_&
use elpa_mpi
use elpa1_compute
use elpa_omp
#ifdef REDISTRIBUTE_MATRIX
use elpa_scalapack_interfaces
#endif
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
real(kind=REAL_DATATYPE), intent(out) :: ev(obj%na)
#ifdef REDISTRIBUTE_MATRIX
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*)
MATH_DATATYPE(kind=rck), intent(inout), target :: aExtern(obj%local_nrows,*)
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: qExtern(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=rck), intent(inout), target :: aExtern(obj%local_nrows,obj%local_ncols)
#ifdef HAVE_SKEWSYMMETRIC
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: qExtern(obj%local_nrows,obj%local_ncols)
#endif
#endif /* USE_ASSUMED_SIZE */
#else /* REDISTRIBUTE_MATRIX */
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout), target :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout), target :: a(obj%local_nrows,obj%local_ncols)
#ifdef HAVE_SKEWSYMMETRIC
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
#endif
#endif /* USE_ASSUMED_SIZE */
#endif /* REDISTRIBUTE_MATRIX */
#ifdef REDISTRIBUTE_MATRIX
MATH_DATATYPE(kind=rck), pointer :: a(:,:)
MATH_DATATYPE(kind=rck), pointer :: q(:,:)
#endif
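Under REDISTRIBUTE_MATRIX the dummy arguments arrive as aExtern/qExtern with the target attribute, and a/q become pointers: they are associated either directly with the caller's arrays (no copy) or with internally allocated, redistributed copies aIntern/qIntern. A standalone sketch of that aliasing idea (toy data; not the ELPA source):

program pointer_alias_sketch
  implicit none
  real, allocatable, target :: aExtern(:,:), aIntern(:,:)
  real, pointer             :: a(:,:)
  logical :: doRedistributeMatrix
  allocate(aExtern(4,4)); aExtern = 1.0
  doRedistributeMatrix = .false.   ! assume the external layout is already usable
  if (doRedistributeMatrix) then
    allocate(aIntern(4,4))         ! a redistributed copy would be filled in here
    a => aIntern
  else
    a => aExtern                   ! no copy: alias the caller's array
  end if
  print *, sum(a)                  ! 16.0 for this toy setup
end program pointer_alias_sketch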
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), allocatable :: tau(:)
real(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:)
real(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
#endif /* REALCASE */
#if COMPLEXCASE == 1
real(kind=REAL_DATATYPE), allocatable :: q_real(:,:)
complex(kind=C_DATATYPE_KIND), allocatable :: tau(:)
complex(kind=C_DATATYPE_KIND), allocatable, target :: q_dummy(:,:)
complex(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
#endif /* COMPLEXCASE */
@@ -117,13 +154,31 @@ function elpa_solve_evp_&
logical :: wantDebug
integer(kind=c_int) :: istat, debug, gpu
character(200) :: errorMessage
integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, &
integer(kind=ik) :: na, nev, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, check_pd, i, error
mpi_comm_all, check_pd, i, error, matrixRows
#ifdef REDISTRIBUTE_MATRIX
integer(kind=ik) :: nblkInternal, matrixOrder
character(len=1) :: layoutInternal, layoutExternal
integer(kind=c_int) :: external_blacs_ctxt
integer(kind=BLAS_KIND) :: external_blacs_ctxt_
integer(kind=BLAS_KIND) :: np_rows_, np_cols_, my_prow_, my_pcol_
integer(kind=BLAS_KIND) :: np_rows__, np_cols__, my_prow__, my_pcol__
integer(kind=BLAS_KIND) :: sc_desc_(1:9), sc_desc(1:9)
integer(kind=BLAS_KIND) :: na_rows_, na_cols_, info_, blacs_ctxt_
integer(kind=ik) :: mpi_comm_rows_, mpi_comm_cols_
integer(kind=MPI_KIND) :: mpi_comm_rowsMPI_, mpi_comm_colsMPI_
character(len=1), parameter :: matrixLayouts(2) = [ 'C', 'R' ]
MATH_DATATYPE(kind=rck), allocatable, target :: aIntern(:,:)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable, target :: qIntern(:,:)
#endif
logical :: do_tridiag, do_solve, do_trans_ev
integer(kind=ik) :: nrThreads
integer(kind=ik) :: global_index
logical :: reDistributeMatrix, doRedistributeMatrix
call obj%timer%start("elpa_solve_evp_&
&MATH_DATATYPE&
@@ -131,6 +186,20 @@ function elpa_solve_evp_&
&PRECISION&
&")
reDistributeMatrix = .false.
matrixRows = obj%local_nrows
matrixCols = obj%local_ncols
call obj%get("mpi_comm_parent", mpi_comm_all, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
my_pe = int(my_peMPI,kind=c_int)
#ifdef WITH_OPENMP
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
@@ -149,7 +218,11 @@ function elpa_solve_evp_&
success = .true.
#ifdef REDISTRIBUTE_MATRIX
if (present(qExtern)) then
#else
if (present(q)) then
#endif
obj%eigenvalues_only = .false.
else
obj%eigenvalues_only = .true.
@@ -157,11 +230,25 @@ function elpa_solve_evp_&
na = obj%na
nev = obj%nev
lda = obj%local_nrows
ldq = obj%local_nrows
matrixRows = obj%local_nrows
nblk = obj%nblk
matrixCols = obj%local_ncols
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
#ifdef REDISTRIBUTE_MATRIX
#include "../helpers/elpa_redistribute_template.F90"
#endif /* REDISTRIBUTE_MATRIX */
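The included template does the actual work; in ScaLAPACK terms, copying between two block-cyclic layouts is the job of p?gemr2d. A hedged sketch of the core call (assumes an initialized BLACS context ctxt and correctly sized local buffers; this is not the template itself):

subroutine redistribute_sketch(na, nblkExtern, nblkInternal, ctxt, aExtern, aIntern)
  implicit none
  integer, intent(in) :: na, nblkExtern, nblkInternal, ctxt
  double precision, intent(inout) :: aExtern(:,:), aIntern(:,:)
  integer :: descExtern(9), descInternal(9), info
  ! one ScaLAPACK descriptor per block-cyclic layout, both on the same process grid
  call descinit(descExtern,   na, na, nblkExtern,   nblkExtern,   0, 0, ctxt, size(aExtern, 1), info)
  call descinit(descInternal, na, na, nblkInternal, nblkInternal, 0, 0, ctxt, size(aIntern, 1), info)
  ! pdgemr2d copies a distributed submatrix between any two block-cyclic distributions
  call pdgemr2d(na, na, aExtern, 1, 1, descExtern, aIntern, 1, 1, descInternal, ctxt)
end subroutine redistribute_sketch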
! special case na = 1
if (na .eq. 1) then
#if REALCASE == 1
@@ -193,18 +280,6 @@ function elpa_solve_evp_&
obj%eigenvalues_only = .true.
endif
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for mpi_comm_rows. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for mpi_comm_cols. Aborting..."
stop
endif
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu. Aborting..."
@@ -251,13 +326,6 @@ function elpa_solve_evp_&
if (useGPU) then
call obj%timer%start("check_for_gpu")
call obj%get("mpi_comm_parent", mpi_comm_all,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for mpi_comm_parent. Aborting..."
stop
endif
call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
my_pe = int(my_peMPI,kind=c_int)
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
@@ -311,12 +379,16 @@ function elpa_solve_evp_&
! allocate a dummy matrix q_dummy, if eigenvectors should not be computed and thus q is NOT present
if (.not.(obj%eigenvalues_only)) then
q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
q_actual => q(1:matrixRows,1:matrixCols)
else
allocate(q_dummy(obj%local_nrows,obj%local_ncols))
allocate(q_dummy(1:matrixRows,1:matrixCols))
q_actual => q_dummy
endif
! test only
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
#if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
@@ -363,7 +435,7 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
& (obj, na, a, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
@@ -382,12 +454,11 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call nvtxRangePush("solve")
#endif
call solve_tridi_&
&PRECISION&
& (obj, na, nev, ev, e, &
#if REALCASE == 1
q_actual, ldq, &
q_actual, matrixRows, &
#endif
#if COMPLEXCASE == 1
q_real, l_rows, &
@@ -437,23 +508,23 @@ function elpa_solve_evp_&
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! For now the real part of the eigenvectors is generated in the first half of q, the imaginary part in the second half.
q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0
do i = 1, obj%local_nrows
q(1:matrixRows, matrixCols+1:2*matrixCols) = 0.0
do i = 1, matrixRows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1
if (mod(global_index-1,4) .eq. 0) then
! do nothing
end if
if (mod(global_index-1,4) .eq. 1) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
q(i,matrixCols+1:2*matrixCols) = q(i,1:matrixCols)
q(i,1:matrixCols) = 0
end if
if (mod(global_index-1,4) .eq. 2) then
q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:matrixCols) = -q(i,1:matrixCols)
end if
if (mod(global_index-1,4) .eq. 3) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
q(i,matrixCols+1:2*matrixCols) = -q(i,1:matrixCols)
q(i,1:matrixCols) = 0
end if
end do
endif
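The inline index formula above reproduces ScaLAPACK's indxl2g (mapping local row i to its global row, source process 0), and the four mod cases effectively multiply global row g of the eigenvectors by i**(g-1). A standalone check of the index arithmetic with assumed grid parameters (not part of the commit):

program indxl2g_sketch
  implicit none
  integer, parameter :: nblk = 2, my_prow = 1, np_rows = 3   ! assumed toy grid
  integer :: i, g
  do i = 1, 6
    ! same arithmetic as the commented indxl2g(i, nblk, my_prow, 0, np_rows) above
    g = np_rows*nblk*((i-1)/nblk) + mod(i-1, nblk) + mod(np_rows+my_prow, np_rows)*nblk + 1
    print '(a,i0,a,i0)', 'local row ', i, ' -> global row ', g
  end do
end program indxl2g_sketch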
@@ -465,13 +536,12 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call nvtxRangePush("trans_ev")
#endif
! In the skew-symmetric case this transforms the real part
call trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
& (obj, na, nev, a, matrixRows, tau, q, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
if (isSkewsymmetric) then
! Transform imaginary part
! Transformation of real and imaginary part could also be done in one call of trans_ev_tridi acting on the n x 2n matrix.
@@ -479,7 +549,7 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, nev, a, lda, tau, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), ldq, nblk, matrixCols, &
& (obj, na, nev, a, matrixRows, tau, q(1:matrixRows, matrixCols+1:2*matrixCols), matrixRows, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
endif
@@ -536,6 +606,46 @@ function elpa_solve_evp_&
call omp_set_num_threads(omp_threads_caller)
#endif
#ifdef REDISTRIBUTE_MATRIX
! redistribute back if necessary
if (doRedistributeMatrix) then
!if (layoutInternal /= layoutExternal) then
! ! maybe this can be skipped if I know the process grid
! ! and np_rows and np_cols
! call obj%get("mpi_comm_rows",mpi_comm_rows,error)
! call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
! call obj%get("mpi_comm_cols",mpi_comm_cols,error)
! call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
! np_rows = int(np_rowsMPI,kind=c_int)
! np_cols = int(np_colsMPI,kind=c_int)
! ! we get new blacs context and the local process grid coordinates
! call BLACS_Gridinit(external_blacs_ctxt, layoutInternal, int(np_rows,kind=BLAS_KIND), int(np_cols,kind=BLAS_KIND))
! call BLACS_Gridinfo(int(external_blacs_ctxt,KIND=BLAS_KIND), np_rows__, &
! np_cols__, my_prow__, my_pcol__)
!endif