Commit fa78e003 authored by Andreas Marek

Merge branch 'long_int_scalapack' into 'master_pre_stage'

Long int scalapack

See merge request !23
parents f31efe63 71ebefc2
......@@ -591,6 +591,8 @@ noinst_LTLIBRARIES += libelpatest@SUFFIX@.la
libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags)
libelpatest@SUFFIX@_la_SOURCES = \
test/shared/tests_variable_definitions.F90 \
test/shared/mod_tests_scalapack_interfaces.F90 \
test/shared/mod_tests_blas_interfaces.F90 \
test/shared/test_util.F90 \
test/shared/test_read_input_parameters.F90 \
test/shared/test_check_correctness.F90 \
......@@ -766,6 +768,8 @@ EXTRA_DIST = \
manual_cpp \
nvcc_wrap \
remove_xcompiler \
src/helpers/fortran_blas_interfaces.F90 \
src/helpers/fortran_scalapack_interfaces.F90 \
src/GPU/cuUtils_template.cu \
src/elpa_api_math_template.F90 \
src/elpa_impl_math_template.F90 \
......
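The two helper files added above presumably provide explicit Fortran interfaces for the BLAS/ScaLAPACK calls so that the integer kind of their arguments is checked at compile time. As a rough, hypothetical sketch of what such an interface with a configurable integer kind can look like (the module name and the kind alias below are illustrative, not taken from fortran_blas_interfaces.F90):

! Sketch only: an explicit BLAS interface whose integer arguments use a
! configurable kind.  Module name and kind alias are illustrative.
module example_blas_interfaces
  use, intrinsic :: iso_c_binding, only: c_double, blas_int => c_int64_t  ! c_int32_t for LP64 builds
  implicit none
  interface
    subroutine dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
      import :: c_double, blas_int
      character(len=1), intent(in)        :: transa, transb
      integer(kind=blas_int), intent(in)  :: m, n, k, lda, ldb, ldc
      real(kind=c_double), intent(in)     :: alpha, beta
      real(kind=c_double), intent(in)     :: a(lda,*), b(ldb,*)
      real(kind=c_double), intent(inout)  :: c(ldc,*)
    end subroutine dgemm
  end interface
end module example_blas_interfaces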
......@@ -59,6 +59,15 @@ export MKL_ANACONDA_INTEL_SCALAPACK_MPI_OMP_BASELINE="-L$ANACONDA_HOME/lib -lmkl
export MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP="-I$MKL_HOME/include/intel64/lp64"
export MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP="$MKL_ANACONDA_INTEL_SCALAPACK_MPI_OMP_BASELINE -Wl,-rpath,$ANACONDA_HOME/lib"
export MKL_GFORTRAN_SCALAPACK_NOMPI_NOOMP_ILP64_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_gf_ilp64 -lmkl_sequential -lmkl_core -lpthread"
export MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64="-I$MKL_HOME/include/intel64/ilp64"
export MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64="$MKL_GFORTRAN_SCALAPACK_NOMPI_NOOMP_ILP64_BASELINE -Wl,-rpath,$MKL_HOME/lib/intel64"
export MKL_GFORTRAN_SCALAPACK_NOMPI_OMP_ILP64_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_gf_ilp64 -lmkl_sequential -lmkl_core -lpthread"
export MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64="-I$MKL_HOME/include/intel64/ilp64"
export MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64="$MKL_GFORTRAN_SCALAPACK_NOMPI_OMP_ILP64_BASELINE -Wl,-rpath,$MKL_HOME/lib/intel64"
export ASAN_OPTIONS=suppressions=./ci_test_scripts/no_asan_for_mpi.supp,fast_unwind_on_malloc=0
export LSAN_OPTIONS=suppressions=./ci_test_scripts/no_lsan_for_mpi.supp
......
......@@ -148,7 +148,7 @@ def set_cflags_fcflags(instr, cc, fc, instruction_set):
FCFLAGS += "-O3 -xMIC-AVX512"
if (instr == "avx2"):
INSTRUCTION_OPTIONS = instruction_set[instr]
INSTRUCTION_OPTIONS = instruction_set[instr] + " --disable-avx512"
if (cc == "gnu"):
CFLAGS += "-O3 -mavx2 -mfma"
else:
......@@ -332,8 +332,47 @@ print(" - ./ci_test_scripts/run_distcheck_tests.sh -c \" FC=mpiifort FCFLAGS
print("\n\n")
#add two tests for ilp64 mkl interface
ilp64_tests = [
"# gnu-gnu-ilp64-nompi-noomp",
"gnu-gnu-nompi-noopenmp-ilp64:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"gcc\\" CFLAGS=\\"-O3 -mavx\\" '
'FC=\\"gfortran\\" FCFLAGS=\\"-O3 -mavx\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --disable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
"# gnu-gnu-ilp64-nompi-openmp",
"gnu-gnu-nompi-openmp-ilp64:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"gcc\\" CFLAGS=\\"-O3 -mavx\\" '
'FC=\\"gfortran\\" FCFLAGS=\\"-O3 -mavx\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --enable-openmp '
'--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
]
print("\n".join(ilp64_no_omp_tests))
# add python tests
python_ci_tests = [
......
......@@ -144,9 +144,12 @@ then
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "make check TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" " >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node_2GPU.sh
echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node_2GPU.sh
echo "exit \$exitCode" >> ./run_${CLUSTER}_1node_2GPU.sh
echo " "
echo "Job script for the run"
......@@ -179,9 +182,12 @@ then
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo "make check TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
echo " "
echo "Job script for the run"
......
......@@ -127,10 +127,12 @@ then
echo "export DISTCHECK_CONFIGURE_FLAGS=\" $distcheckConfigureArgs \" " >> ./run_${CLUSTER}_1node.sh
echo "make distcheck TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" || { chmod u+rwX -R . ; exit 1 ; } " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
echo " "
echo "Job script for the run"
cat ./run_${CLUSTER}_1node.sh
......
......@@ -175,12 +175,14 @@ then
echo "popd" >> ./run_${CLUSTER}_1node.sh
echo "pushd build" >> ./run_${CLUSTER}_1node.sh
echo "make distclean || { exit 1; }" >> ./run_${CLUSTER}_1node.sh
echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
echo "rm -rf installdest" >> ./run_${CLUSTER}_1node.sh
echo "popd" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh
echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
echo " "
echo "Job script for the run"
cat ./run_${CLUSTER}_1node.sh
......
......@@ -208,6 +208,68 @@ if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
dnl 64bit integer support for BLACS/LAPACK/SCALAPACK
dnl first long int
AC_CHECK_SIZEOF([long int])
size_of_long_int="${ac_cv_sizeof_long_int}"
dnl then 64bit blas
AC_MSG_CHECKING(whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK))
AC_ARG_ENABLE([64bit-integer-math-support],
AS_HELP_STRING([--enable-64bit-integer-math-support],
[allow linking against the 64bit integer versions of the math libraries BLAS, LAPACK, and SCALAPACK]),
[
if test x"$enableval" = x"yes"; then
enable_64bit_integer_math_support=yes
else
enable_64bit_integer_math_support=no
fi
],
[enable_64bit_integer_math_support="no"])
AC_MSG_RESULT([$enable_64bit_integer_math_support])
if test x"${enable_64bit_integer_math_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot enable both 64bit integer support and the legacy interface!])
fi
dnl at least Intel MPI does _NOT_ support 64bit integer mode for C, thus disable the C tests in this case
if test x"${enable_c_tests}" = x"yes"; then
AC_MSG_ERROR([You cannot enable both 64bit integer support and the C tests. Reconfigure!])
fi
dnl check whether long int is the correct data-type in C
if test x"${size_of_long_int}" = x"8"; then
echo "Found C data-type \"long int\" with 8 bytes"
else
AC_MSG_ERROR([The C data-type "long int" is only ${size_of_long_int} bytes; 8 bytes are needed])
fi
AC_DEFINE([HAVE_64BIT_INTEGER_MATH_SUPPORT], [1], [allow linking against the 64bit integer versions of math libraries])
fi
AM_CONDITIONAL([HAVE_64BIT_INTEGER_MATH_SUPPORT],[test x"$enable_64bit_integer_math_support" = x"yes"])
dnl then 64bit MPI
AC_MSG_CHECKING(whether 64bit integers should be used for the MPI library)
AC_ARG_ENABLE([64bit-integer-mpi-support],
AS_HELP_STRING([--enable-64bit-integer-mpi-support],
[allow linking against the 64bit integer version of the MPI library]),
[
if test x"$enableval" = x"yes"; then
enable_64bit_integer_mpi_support=yes
else
enable_64bit_integer_mpi_support=no
fi
],
[enable_64bit_integer_mpi_support="no"])
AC_MSG_RESULT([$enable_64bit_integer_mpi_support])
if test x"${enable_64bit_integer_mpi_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot enable both 64bit integer support and the legacy interface!])
fi
AC_DEFINE([HAVE_64BIT_INTEGER_MPI_SUPPORT], [1], [allow linking against the 64bit integer version of the MPI library])
fi
AM_CONDITIONAL([HAVE_64BIT_INTEGER_MPI_SUPPORT],[test x"$enable_64bit_integer_mpi_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
......
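For illustration, a minimal sketch of how the HAVE_64BIT_INTEGER_MATH_SUPPORT define produced by this configure option could be consumed in a preprocessed .F90 file to select the integer kind passed to BLAS/LAPACK/ScaLAPACK; the module and parameter names are hypothetical, and the actual kind selection in the ELPA sources may differ:

! Sketch only: choose the integer kind for BLAS/LAPACK/ScaLAPACK arguments
! from the configure result.  Module and parameter names are illustrative.
module example_math_kinds
  use, intrinsic :: iso_c_binding, only: c_int32_t, c_int64_t
  implicit none
#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
  integer, parameter :: EXAMPLE_BLAS_KIND = c_int64_t   ! ILP64 math libraries
#else
  integer, parameter :: EXAMPLE_BLAS_KIND = c_int32_t   ! standard LP64 math libraries
#endif
end module example_math_kinds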
......@@ -248,6 +248,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
name = "validate_autotune{langsuffix}_{d}_{p}".format(langsuffix=language_flag[lang], d=d, p=p)
print("if ENABLE_AUTOTUNING")
if lang == "C":
print("if ENABLE_C_TESTS")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
if lang == "Fortran":
......@@ -267,6 +269,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
domain_flag[d],
prec_flag[p]]))
print("endif\n" * endifs)
if lang == "C":
print("endif")
print("endif")
name = "validate_multiple_objs_real_double"
......@@ -282,6 +286,7 @@ print(" " + " \\\n ".join([
print("endif")
name = "validate_multiple_objs_real_double_c_version"
print("if ENABLE_C_TESTS")
print("if ENABLE_AUTOTUNING")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
......@@ -292,6 +297,8 @@ print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['double']]))
print("endif")
print("endif")
name = "validate_split_comm_real_double"
print("check_SCRIPTS += " + name + "_extended.sh")
......
......@@ -99,12 +99,16 @@
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr
np_cols
integer(kind=MPI_KIND) :: mpierr
integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=BLAS_KIND) :: idxBLAS(NA)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
......@@ -130,10 +134,16 @@
call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
! If my processor column isn't in the requested set, do nothing
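The conversion pattern introduced above (cast communicator handles and counts to the MPI integer kind for the call, then cast the results back to the default kind) can be summarized in a small standalone sketch; the subroutine name is hypothetical, and MPI_KIND is defined locally here only for illustration, whereas the ELPA sources define it centrally:

! Sketch only: cast to the MPI library's integer kind for the call and
! convert the results back.  Names are illustrative.
subroutine example_get_grid_position(comm, my_rank, n_ranks)
  use, intrinsic :: iso_c_binding, only: c_int, c_int32_t
  implicit none
  include 'mpif.h'
  integer, parameter               :: MPI_KIND = c_int32_t  ! c_int64_t for an ILP64 MPI build
  integer(kind=c_int), intent(in)  :: comm
  integer(kind=c_int), intent(out) :: my_rank, n_ranks
  integer(kind=MPI_KIND)           :: my_rankMPI, n_ranksMPI, mpierr

  ! cast the communicator to the MPI integer kind for the call ...
  call mpi_comm_rank(int(comm, kind=MPI_KIND), my_rankMPI, mpierr)
  call mpi_comm_size(int(comm, kind=MPI_KIND), n_ranksMPI, mpierr)
  ! ... and cast the results back to the default kind used elsewhere
  my_rank = int(my_rankMPI, kind=c_int)
  n_ranks = int(n_ranksMPI, kind=c_int)
end subroutine example_get_grid_position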
......@@ -230,7 +240,9 @@
rho = 2.0_rk*beta
! Calculate index for merging both systems by ascending eigenvalues
call obj%timer%start("blas")
call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Calculate the allowable deflation tolerance
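The analogous pattern for LAPACK index output, as used with PRECISION_LAMRG above: let the routine fill an array of the math library's integer kind and copy it back afterwards. A hedged standalone sketch against double-precision dlamrg (local names are illustrative):

! Sketch only: dlamrg writes the merge permutation into an array of the
! math library's integer kind; it is then copied back to the default kind.
subroutine example_merge_index(n1, n2, d, idx)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t, c_double
  implicit none
  integer, parameter               :: EX_BLAS_KIND = c_int64_t  ! c_int32_t for LP64 BLAS
  integer(kind=c_int), intent(in)  :: n1, n2
  real(kind=c_double), intent(in)  :: d(n1+n2)
  integer(kind=c_int), intent(out) :: idx(n1+n2)
  integer(kind=EX_BLAS_KIND)       :: idxBLAS(n1+n2)

  call dlamrg(int(n1, kind=EX_BLAS_KIND), int(n2, kind=EX_BLAS_KIND), d, &
              1_EX_BLAS_KIND, 1_EX_BLAS_KIND, idxBLAS)
  idx(:) = int(idxBLAS(:), kind=c_int)
end subroutine example_merge_index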
......@@ -391,8 +403,8 @@
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2
call obj%timer%start("blas")
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call PRECISION_LAED5(1_BLAS_KIND, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2_BLAS_KIND, d1, z1, qtrans(1,2), rho, d(2))
call obj%timer%stop("blas")
call transform_columns_&
&PRECISION&
......@@ -404,7 +416,9 @@
! Calculate arrangement of all eigenvalues in output
call obj%timer%start("blas")
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call PRECISION_LAMRG( int(na1,kind=BLAS_KIND), int(na-na1,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Rearrange eigenvalues
......@@ -437,16 +451,19 @@
ddiff(1:na1) = 0
info = 0
infoBLAS = int(info,kind=BLAS_KIND)
#ifdef WITH_OPENMP
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
my_thread = omp_get_thread_num()
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call obj%timer%start("blas")
call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
call PRECISION_LAED4(int(na1,kind=BLAS_KIND), int(i,kind=BLAS_KIND), d1, z1, delta, &
rho, s, infoBLAS) ! s is not used!
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
......@@ -548,7 +565,9 @@
call obj%timer%start("blas")
! Calculate arrangement of all eigenvalues in output
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call PRECISION_LAMRG(int(na1,kind=BLAS_KIND), int(na-na1,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Rearrange eigenvalues
tmp = d
......@@ -678,9 +697,9 @@
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
np_next, 1111, np_prev, 1111, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=MPI_KIND), MPI_REAL_PRECISION, &
int(np_next,kind=MPI_KIND), 1111_MPI_KIND, int(np_prev,kind=MPI_KIND), &
1111_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
......@@ -775,10 +794,11 @@
else
call obj%timer%start("blas")
call obj%timer%start("gemm")
call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
1.0_rk, qtmp1, ubound(qtmp1,dim=1), &
ev, ubound(ev,dim=1), &
1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
call PRECISION_GEMM('N', 'N', int(l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), &
int(nnzu,kind=BLAS_KIND), &
1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=BLAS_KIND), &
ev, int(ubound(ev,dim=1),kind=BLAS_KIND), &
1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND))
call obj%timer%stop("gemm")
call obj%timer%stop("blas")
endif ! useGPU
......@@ -833,10 +853,11 @@
else
call obj%timer%start("blas")
call obj%timer%start("gemm")
call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), &
ev, ubound(ev,dim=1), &
1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), &
int(nnzl,kind=BLAS_KIND), &
1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=BLAS_KIND), &
ev, int(ubound(ev,dim=1),kind=BLAS_KIND), &
1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND))
call obj%timer%stop("gemm")
call obj%timer%stop("blas")
endif ! useGPU
......@@ -921,7 +942,7 @@
&PRECISION&
&(obj, idx_ev, nLength)
use precision
use elpa_abstract_impl
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nLength
......@@ -962,14 +983,16 @@
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, mod(i,4096), mpi_comm_cols, mpierr)
call mpi_send(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL_PRECISION, pc1, mod(i,4096), mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(qtmp(1,nc), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
......@@ -1027,9 +1050,9 @@
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, 1, &
tmp, l_rows, MPI_REAL_PRECISION, pc2, 1, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_sendrecv(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
......@@ -1039,9 +1062,9 @@
else if (pc2==my_pcol) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL_PRECISION, pc1, 1, &
tmp, l_rows, MPI_REAL_PRECISION, pc1, 1, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_sendrecv(q(l_rqs,lc2), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
......@@ -1072,7 +1095,7 @@
! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1087,7 +1110,7 @@
if (npc_n==np_cols) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1102,8 +1125,9 @@
z(:) = z(:) + tmp(:)
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call MPI_Sendrecv_replace(z, n, MPI_REAL_PRECISION, np_next, 1111, np_prev, 1111, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call MPI_Sendrecv_replace(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next,kind=MPI_KIND), 1111_MPI_KIND, &
int(np_prev,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
enddo
......@@ -1130,7 +1154,7 @@
! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_rows, mpierr)
call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp = z
......@@ -1145,7 +1169,7 @@
if (npc_n==np_cols) then
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
z = tmp
......@@ -1161,7 +1185,8 @@
do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_recv(tmp, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
tmp(1:n) = z(1:n)
......@@ -1171,15 +1196,18 @@
do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(z, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, mpierr)
call mpi_send(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
enddo
else
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
call mpi_send(tmp, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, mpierr)
call mpi_recv(z ,n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_send(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call mpi_recv(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
z(1:n) = tmp(1:n)
......
......@@ -75,8 +75,8 @@ subroutine solve_tridi_&
logical, intent(out) :: success
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols
integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
integer(kind=ik) :: istat
......@@ -93,10 +93,16 @@ subroutine solve_tridi_&
call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
success = .true.
......@@ -293,7 +299,8 @@ subroutine solve_tridi_&
call obj%timer%start("mpi_communication")
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
enddo