fa78e003
Commit
fa78e003
authored
Oct 26, 2019
by
Andreas Marek
Browse files
Merge branch 'long_int_scalapack' into 'master_pre_stage'
Long int scalapack See merge request
!23
parents
f31efe63
71ebefc2
Changes
81
Pipelines
1
.gitlab-ci.yml

This source diff could not be displayed because it is too large. You can view the blob instead.
Makefile.am

...
@@ -591,6 +591,8 @@ noinst_LTLIBRARIES += libelpatest@SUFFIX@.la
libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags)
libelpatest@SUFFIX@_la_SOURCES = \
  test/shared/tests_variable_definitions.F90 \
  test/shared/mod_tests_scalapack_interfaces.F90 \
  test/shared/mod_tests_blas_interfaces.F90 \
  test/shared/test_util.F90 \
  test/shared/test_read_input_parameters.F90 \
  test/shared/test_check_correctness.F90 \
...
@@ -766,6 +768,8 @@ EXTRA_DIST = \
  manual_cpp \
  nvcc_wrap \
  remove_xcompiler \
  src/helpers/fortran_blas_interfaces.F90 \
  src/helpers/fortran_scalapack_interfaces.F90 \
  src/GPU/cuUtils_template.cu \
  src/elpa_api_math_template.F90 \
  src/elpa_impl_math_template.F90 \
...
ci_test_scripts/.ci-env-vars

...
@@ -59,6 +59,15 @@ export MKL_ANACONDA_INTEL_SCALAPACK_MPI_OMP_BASELINE="-L$ANACONDA_HOME/lib -lmkl
export MKL_ANACONDA_INTEL_SCALAPACK_FCFLAGS_MPI_OMP="-I$MKL_HOME/include/intel64/lp64"
export MKL_ANACONDA_INTEL_SCALAPACK_LDFLAGS_MPI_OMP="$MKL_ANACONDA_INTEL_SCALAPACK_MPI_OMP_BASELINE -Wl,-rpath,$ANACONDA_HOME/lib"
export MKL_GFORTRAN_SCALAPACK_NOMPI_NOOMP_ILP64_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_gf_ilp64 -lmkl_sequential -lmkl_core -lpthread"
export MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64="-I$MKL_HOME/include/intel64/ilp64"
export MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64="$MKL_GFORTRAN_SCALAPACK_NOMPI_NOOMP_ILP64_BASELINE -Wl,-rpath,$MKL_HOME/lib/intel64"
export MKL_GFORTRAN_SCALAPACK_NOMPI_OMP_ILP64_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_gf_ilp64 -lmkl_sequential -lmkl_core -lpthread"
export MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64="-I$MKL_HOME/include/intel64/ilp64"
export MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64="$MKL_GFORTRAN_SCALAPACK_NOMPI_OMP_ILP64_BASELINE -Wl,-rpath,$MKL_HOME/lib/intel64"
export ASAN_OPTIONS=suppressions=./ci_test_scripts/no_asan_for_mpi.supp,fast_unwind_on_malloc=0
export LSAN_OPTIONS=suppressions=./ci_test_scripts/no_lsan_for_mpi.supp
...

ci_test_scripts/generate_gitlab_ci_tests.py

...
@@ -148,7 +148,7 @@ def set_cflags_fcflags(instr, cc, fc, instruction_set):
        FCFLAGS += "-O3 -xMIC-AVX512"
    if (instr == "avx2"):
        INSTRUCTION_OPTIONS = instruction_set[instr]
        INSTRUCTION_OPTIONS = instruction_set[instr] + " --disable-avx512"
        if (cc == "gnu"):
            CFLAGS += "-O3 -mavx2 -mfma"
        else:
...
@@ -332,8 +332,47 @@ print(" - ./ci_test_scripts/run_distcheck_tests.sh -c \" FC=mpiifort FCFLAGS
print("\n\n")

#add two tests for ilp64 mkl interface
ilp64_no_omp_tests = [
  "# gnu-gnu-ilp64-nompi-noomp",
  "gnu-gnu-nompi-noopenmp-ilp64:",
  "  tags:",
  "    - avx",
  "  artifacts:",
  "    when: on_success",
  "    expire_in: 2 month",
  "  script:",
  '  - ./ci_test_scripts/run_ci_tests.sh -c "'
  'CC=\\"gcc\\" CFLAGS=\\"-O3 -mavx\\" '
  'FC=\\"gfortran\\" FCFLAGS=\\"-O3 -mavx\\" '
  'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64\\" '
  'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_NOOMP_ILP64\\" '
  '--enable-option-checking=fatal --with-mpi=no --disable-openmp '
  '--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
  '" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
  '-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
  "\n",
  "# gnu-gnu-ilp64-nompi-openmp",
  "gnu-gnu-nompi-openmp-ilp64:",
  "  tags:",
  "    - avx",
  "  artifacts:",
  "    when: on_success",
  "    expire_in: 2 month",
  "  script:",
  '  - ./ci_test_scripts/run_ci_tests.sh -c "'
  'CC=\\"gcc\\" CFLAGS=\\"-O3 -mavx\\" '
  'FC=\\"gfortran\\" FCFLAGS=\\"-O3 -mavx\\" '
  'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64\\" '
  'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_NOMPI_OMP_ILP64\\" '
  '--enable-option-checking=fatal --with-mpi=no --enable-openmp '
  '--disable-gpu --enable-avx --disable-avx2 --disable-avx512 --enable-64bit-integer-math-support || { cat config.log; exit 1; }'
  '" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
  '-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
  "\n",
]
print("\n".join(ilp64_no_omp_tests))

# add python tests
python_ci_tests = [
...
ci_test_scripts/run_ci_tests.sh

...
@@ -144,9 +144,12 @@ then
  echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "make check TEST_FLAGS=\"$matrixSize $nrEV $blockSize\" " >> ./run_${CLUSTER}_1node_2GPU.sh
  echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "exitCode=\$?" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo " " >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo "exit \$exitCode" >> ./run_${CLUSTER}_1node_2GPU.sh
  echo " "
  echo "Job script for the run"
...
@@ -179,9 +182,12 @@ then
  echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
  echo "make check TEST_FLAGS=\"$matrixSize $nrEV $blockSize\" " >> ./run_${CLUSTER}_1node.sh
  echo " " >> ./run_${CLUSTER}_1node.sh
  echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
  echo " " >> ./run_${CLUSTER}_1node.sh
  echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
  echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
  echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
  echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
  echo " "
  echo "Job script for the run"
...
ci_test_scripts/run_distcheck_tests.sh

...
@@ -127,10 +127,12 @@ then
  echo "export DISTCHECK_CONFIGURE_FLAGS=\"$distcheckConfigureArgs\" " >> ./run_${CLUSTER}_1node.sh
  echo "make distcheck TEST_FLAGS=\"$matrixSize $nrEV $blockSize\" || { chmod u+rwX -R . ; exit 1 ; } " >> ./run_${CLUSTER}_1node.sh
  echo " " >> ./run_${CLUSTER}_1node.sh
  echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
  echo " " >> ./run_${CLUSTER}_1node.sh
  echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
  echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
  echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
  echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
  echo " "
  echo "Job script for the run"
  cat ./run_${CLUSTER}_1node.sh
...
ci_test_scripts/run_project_tests.sh

...
@@ -175,12 +175,14 @@ then
  echo "popd" >> ./run_${CLUSTER}_1node.sh
  echo "pushd build" >> ./run_${CLUSTER}_1node.sh
  echo "make distclean || { exit 1; }" >> ./run_${CLUSTER}_1node.sh
  echo "exitCode=\$?" >> ./run_${CLUSTER}_1node.sh
  echo "rm -rf installdest" >> ./run_${CLUSTER}_1node.sh
  echo "popd" >> ./run_${CLUSTER}_1node.sh
  echo " " >> ./run_${CLUSTER}_1node.sh
  echo "#copy everything back from /tmp/elpa to runner directory" >> ./run_${CLUSTER}_1node.sh
  echo "cp -r * \$runner_path" >> ./run_${CLUSTER}_1node.sh
  echo "cd .. && rm -rf /tmp/elpa_\$SLURM_JOB_ID" >> ./run_${CLUSTER}_1node.sh
  echo "exit \$exitCode" >> ./run_${CLUSTER}_1node.sh
  echo " "
  echo "Job script for the run"
  cat ./run_${CLUSTER}_1node.sh
...
configure.ac

...
@@ -208,6 +208,68 @@ if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
dnl 64bit integer support for BLACS/LAPACK/SCALAPACK support
dnl first long int
AC_CHECK_SIZEOF([long int])
size_of_long_int="${ac_cv_sizeof_long_int}"
dnl then 64bit blas
AC_MSG_CHECKING(whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK))
AC_ARG_ENABLE([64bit-integer-math-support],
AS_HELP_STRING([--64bit-integer-math-support],
[allows to link against the 64bit integer versions of the math libraries BLAS, LAPACK, and SCALAPACK]),
[
if test x"$enableval" = x"yes"; then
enable_64bit_integer_math_support=yes
else
enable_64bit_integer_math_support=no
fi
],
[enable_64bit_integer_math_support="no"])
AC_MSG_RESULT([$enable_64bit_integer_math_support])
if test x"${enable_64bit_integer_math_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot both enable 64bit integer support and the legacy interface!])
fi
dnl at least INTEL MPI does _NOT_ support 64BIT integer mode for C thus disable C tests in this Case
if test x"${enable_c_tests}" = x"yes"; then
AC_MSG_ERROR([You cannot both define 64bit integer support and C tests. Reconfigure!])
fi
dnl check whether long int is the correct data-type in C
if test x"${size_of_long_int}" = x"8"; then
echo "Found C data-type \"long int\" with 8 bytes"
else
AC_MSG_ERROR([The C data-type "long int" is only ${size_of_long_int} bytes; Needed is 8 bytes])
fi
AC_DEFINE([HAVE_64BIT_INTEGER_MATH_SUPPORT], [1], [allow to link against the 64bit integer versions of math libraries])
fi
AM_CONDITIONAL([HAVE_64BIT_INTEGER_MATH_SUPPORT],[test x"$enable_64bit_integer_math_support" = x"yes"])
dnl then 64bit blas
AC_MSG_CHECKING(whether 64bit integers should be used for the MPI library)
AC_ARG_ENABLE([64bit-integer-mpi-support],
AS_HELP_STRING([--64bit-integer-mpi-support],
[allows to link against the 64bit integer versions of the MPI library]),
[
if test x"$enableval" = x"yes"; then
enable_64bit_integer_mpi_support=yes
else
enable_64bit_integer_mpi_support=no
fi
],
[enable_64bit_integer_mpi_support="no"])
AC_MSG_RESULT([$enable_64bit_integer_mpi_support])
if test x"${enable_64bit_integer_mpi_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot both enable 64bit integer support and the legacy interface!])
fi
AC_DEFINE([HAVE_64BIT_INTEGER_MPI_SUPPORT], [1], [allow to link against the 64bit integer versions of the MPI library])
fi
AM_CONDITIONAL([HAVE_64BIT_INTEGER_MPI_SUPPORT],[test x"$enable_64bit_integer_mpi_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
...
...
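The configure.ac hunk above only defines the preprocessor macros (HAVE_64BIT_INTEGER_MATH_SUPPORT, HAVE_64BIT_INTEGER_MPI_SUPPORT) and guards against incompatible options; the Fortran side then has to pick a matching integer kind for library calls. A minimal sketch of that selection, assuming cpp-preprocessed source and using a hypothetical module name (ELPA's real kind definitions live in its own precision modules), could look like this:

  ! Illustrative only: choose the integer kind used for BLAS/LAPACK/ScaLAPACK
  ! arguments from the configure result. Module and constant names are hypothetical.
  module blas_kind_sketch
    use, intrinsic :: iso_c_binding, only : c_int32_t, c_int64_t
    implicit none
  #ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
    ! ILP64 math libraries expect 8-byte integers in every integer argument
    integer, parameter :: BLAS_KIND = c_int64_t
  #else
    ! standard LP64 interface: 4-byte integers
    integer, parameter :: BLAS_KIND = c_int32_t
  #endif
  end module blas_kind_sketch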
generate_automake_test_programs.py

...
@@ -248,6 +248,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
    name = "validate_autotune{langsuffix}_{d}_{p}".format(langsuffix=language_flag[lang], d=d, p=p)
    print("if ENABLE_AUTOTUNING")
    if lang == "C":
        print("if ENABLE_C_TESTS")
    print("check_SCRIPTS += " + name + "_extended.sh")
    print("noinst_PROGRAMS += " + name)
    if lang == "Fortran":
...
@@ -267,6 +269,8 @@ for lang, p, d in product(sorted(language_flag.keys()), sorted(prec_flag.keys())
        domain_flag[d],
        prec_flag[p]]))
    print("endif\n" * endifs)
    if lang == "C":
        print("endif")
    print("endif")

name = "validate_multiple_objs_real_double"
...
@@ -282,6 +286,7 @@ print(" " + " \\\n ".join([
print("endif")

name = "validate_multiple_objs_real_double_c_version"
print("if ENABLE_C_TESTS")
print("if ENABLE_AUTOTUNING")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
...
@@ -292,6 +297,8 @@ print(" " + " \\\n ".join([
    domain_flag['real'],
    prec_flag['double']]))
print("endif")
print("endif")

name = "validate_split_comm_real_double"
print("check_SCRIPTS += " + name + "_extended.sh")
...
src/elpa1/elpa1_merge_systems_real_template.F90

...
@@ -99,12 +99,16 @@
    integer(kind=ik)        :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
                               l_rqm, ns, info
    integer(kind=BLAS_KIND) :: infoBLAS
    integer(kind=ik)        :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
                               l_cols_qreorg, np, l_idx, nqcols1, nqcols2
    integer(kind=ik)        :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
                               np_cols, mpierr
                               np_cols
    integer(kind=MPI_KIND)  :: mpierr
    integer(kind=MPI_KIND)  :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
    integer(kind=ik)        :: np_next, np_prev, np_rem
    integer(kind=ik)        :: idx(na), idx1(na), idx2(na)
    integer(kind=BLAS_KIND) :: idxBLAS(NA)
    integer(kind=ik)        :: coltyp(na), idxq1(na), idxq2(na)
    integer(kind=ik)        :: istat
...
@@ -130,10 +134,16 @@
    call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
    success = .true.
    call obj%timer%start("mpi_communication")
    call mpi_comm_rank(mpi_comm_rows, my_prow, mpierr)
    call mpi_comm_size(mpi_comm_rows, np_rows, mpierr)
    call mpi_comm_rank(mpi_comm_cols, my_pcol, mpierr)
    call mpi_comm_size(mpi_comm_cols, np_cols, mpierr)
    call mpi_comm_rank(int(mpi_comm_rows, kind=MPI_KIND), my_prowMPI, mpierr)
    call mpi_comm_size(int(mpi_comm_rows, kind=MPI_KIND), np_rowsMPI, mpierr)
    call mpi_comm_rank(int(mpi_comm_cols, kind=MPI_KIND), my_pcolMPI, mpierr)
    call mpi_comm_size(int(mpi_comm_cols, kind=MPI_KIND), np_colsMPI, mpierr)
    my_prow = int(my_prowMPI, kind=c_int)
    np_rows = int(np_rowsMPI, kind=c_int)
    my_pcol = int(my_pcolMPI, kind=c_int)
    np_cols = int(np_colsMPI, kind=c_int)
    call obj%timer%stop("mpi_communication")

    ! If my processor column isn't in the requested set, do nothing
...
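The pattern in the hunk above recurs throughout the commit: communicator handles and counts are converted to the MPI library's integer kind at the call site, and the results are converted back to ELPA's default kind. A minimal stand-alone sketch of the same idea (not ELPA code; here MPI_KIND and ik are defined locally, whereas in ELPA they come from its precision module) is:

  ! Sketch of the explicit kind conversion around MPI calls used in this commit.
  program mpi_kind_sketch
    use mpi
    implicit none
    integer, parameter :: ik = 4        ! assumption: ELPA's default integer kind
    integer, parameter :: MPI_KIND = 4  ! would be 8 with an ILP64 MPI library
    integer(kind=ik)       :: my_prow
    integer(kind=MPI_KIND) :: my_prowMPI, mpierr

    call mpi_init(mpierr)
    ! convert the communicator handle explicitly instead of relying on default kinds
    call mpi_comm_rank(int(MPI_COMM_WORLD, kind=MPI_KIND), my_prowMPI, mpierr)
    ! convert the result back to the kind the rest of the code works with
    my_prow = int(my_prowMPI, kind=ik)
    print *, 'rank =', my_prow
    call mpi_finalize(mpierr)
  end program mpi_kind_sketch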
@@ -230,7 +240,9 @@
    rho = 2.0_rk * beta
    ! Calculate index for merging both systems by ascending eigenvalues
    call obj%timer%start("blas")
    call PRECISION_LAMRG(nm, na-nm, d, 1, 1, idx)
    call PRECISION_LAMRG(int(nm, kind=BLAS_KIND), int(na-nm, kind=BLAS_KIND), d, &
                         1_BLAS_KIND, 1_BLAS_KIND, idxBLAS)
    idx(:) = int(idxBLAS(:), kind=ik)
    call obj%timer%stop("blas")

    ! Calculate the allowable deflation tolerance
...
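The LAMRG hunk above shows the corresponding treatment of LAPACK calls: integer arguments are passed with BLAS_KIND and the returned permutation is copied back into a default-kind array. A small, self-contained sketch of the same wrapping around DLAMRG (again with locally defined kinds instead of ELPA's modules; link against LAPACK to run) might be:

  ! Sketch of the BLAS_KIND wrapping applied to LAPACK calls such as PRECISION_LAMRG.
  program blas_kind_call_sketch
    implicit none
    integer, parameter :: ik = 4, BLAS_KIND = 4   ! BLAS_KIND would be 8 for ILP64 MKL
    integer, parameter :: n1 = 2, n2 = 2
    real(kind=8)            :: d(n1+n2) = [1.0d0, 3.0d0, 2.0d0, 4.0d0]
    integer(kind=ik)        :: idx(n1+n2)
    integer(kind=BLAS_KIND) :: idxBLAS(n1+n2)

    ! every integer actual argument is converted to the library's integer kind ...
    call dlamrg(int(n1, kind=BLAS_KIND), int(n2, kind=BLAS_KIND), d, &
                1_BLAS_KIND, 1_BLAS_KIND, idxBLAS)
    ! ... and the result is converted back for the rest of the default-kind index logic
    idx(:) = int(idxBLAS(:), kind=ik)
    print *, 'merge order:', idx
  end program blas_kind_call_sketch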
@@ -391,8 +403,8 @@
      d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
    else ! na1==2
      call obj%timer%start("blas")
      call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
      call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
      call PRECISION_LAED5(1_BLAS_KIND, d1, z1, qtrans(1,1), rho, d(1))
      call PRECISION_LAED5(2_BLAS_KIND, d1, z1, qtrans(1,2), rho, d(2))
      call obj%timer%stop("blas")
      call transform_columns_&
           &PRECISION&
...
@@ -404,7 +416,9 @@
    ! Calculate arrangement of all eigenvalues in output
    call obj%timer%start("blas")
    call PRECISION_LAMRG(na1, na-na1, d, 1, 1, idx)
    call PRECISION_LAMRG(int(na1, kind=BLAS_KIND), int(na-na1, kind=BLAS_KIND), d, &
                         1_BLAS_KIND, 1_BLAS_KIND, idxBLAS)
    idx(:) = int(idxBLAS(:), kind=ik)
    call obj%timer%stop("blas")

    ! Rearrange eigenvalues
...
@@ -437,16 +451,19 @@
    ddiff(1:na1) = 0
    info = 0
    infoBLAS = int(info, kind=BLAS_KIND)
#ifdef WITH_OPENMP
    call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)

!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
    my_thread = omp_get_thread_num()
!$OMP DO
#endif
    DO i = my_proc+1, na1, n_procs ! work distributed over all processors
      call obj%timer%start("blas")
      call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
      call PRECISION_LAED4(int(na1, kind=BLAS_KIND), int(i, kind=BLAS_KIND), d1, z1, delta, &
                           rho, s, infoBLAS) ! s is not used!
      info = int(infoBLAS, kind=ik)
      call obj%timer%stop("blas")
      if (info/=0) then
        ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
...
@@ -548,7 +565,9 @@
    call obj%timer%start("blas")
    ! Calculate arrangement of all eigenvalues in output
    call PRECISION_LAMRG(na1, na-na1, d, 1, 1, idx)
    call PRECISION_LAMRG(int(na1, kind=BLAS_KIND), int(na-na1, kind=BLAS_KIND), d, &
                         1_BLAS_KIND, 1_BLAS_KIND, idxBLAS)
    idx(:) = int(idxBLAS(:), kind=ik)
    call obj%timer%stop("blas")
    ! Rearrange eigenvalues
    tmp = d
...
@@ -678,9 +697,9 @@
        endif
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
                                  np_next, 1111, np_prev, 1111, &
                                  mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
        call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols, kind=MPI_KIND), MPI_REAL_PRECISION, &
                                  int(np_next, kind=MPI_KIND), 1111_MPI_KIND, int(np_prev, kind=MPI_KIND), &
                                  1111_MPI_KIND, int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
      endif
...
@@ -775,10 +794,11 @@
      else
        call obj%timer%start("blas")
        call obj%timer%start("gemm")
        call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
                            1.0_rk, qtmp1, ubound(qtmp1, dim=1), &
                            ev, ubound(ev, dim=1), &
                            1.0_rk, qtmp2(1,1), ubound(qtmp2, dim=1))
        call PRECISION_GEMM('N', 'N', int(l_rnm, kind=BLAS_KIND), int(ncnt, kind=BLAS_KIND), &
                            int(nnzu, kind=BLAS_KIND), &
                            1.0_rk, qtmp1, int(ubound(qtmp1, dim=1), kind=BLAS_KIND), &
                            ev, int(ubound(ev, dim=1), kind=BLAS_KIND), &
                            1.0_rk, qtmp2(1,1), int(ubound(qtmp2, dim=1), kind=BLAS_KIND))
        call obj%timer%stop("gemm")
        call obj%timer%stop("blas")
      endif ! useGPU
...
@@ -833,10 +853,11 @@
      else
        call obj%timer%start("blas")
        call obj%timer%start("gemm")
        call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
                            1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1, dim=1), &
                            ev, ubound(ev, dim=1), &
                            1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2, dim=1))
        call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm, kind=BLAS_KIND), int(ncnt, kind=BLAS_KIND), &
                            int(nnzl, kind=BLAS_KIND), &
                            1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1, dim=1), kind=BLAS_KIND), &
                            ev, int(ubound(ev, dim=1), kind=BLAS_KIND), &
                            1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2, dim=1), kind=BLAS_KIND))
        call obj%timer%stop("gemm")
        call obj%timer%stop("blas")
      endif ! useGPU
...
@@ -921,7 +942,7 @@
    &PRECISION&
    &(obj, idx_ev, nLength)
      use precision
      use elpa_abstract_impl
      use elpa_abstract_impl
      implicit none
      class(elpa_abstract_impl_t), intent(inout) :: obj
      integer(kind=ik), intent(in)               :: nLength
...
@@ -962,14 +983,16 @@
          else
#ifdef WITH_MPI
            call obj%timer%start("mpi_communication")
            call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, mod(i,4096), mpi_comm_cols, mpierr)
            call mpi_send(q(l_rqs,lc1), int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096), kind=MPI_KIND), &
                          int(mpi_comm_cols, kind=MPI_KIND), mpierr)
            call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
          endif
        else
          if (pc2 == my_pcol) then
#ifdef WITH_MPI
            call obj%timer%start("mpi_communication")
            call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL_PRECISION, pc1, mod(i,4096), mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
            call mpi_recv(qtmp(1,nc), int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096), kind=MPI_KIND), &
                          int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
            call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
            qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1)
...
@@ -1027,9 +1050,9 @@
        else
#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")
          call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL_PRECISION, pc2, 1, &
                            tmp, l_rows, MPI_REAL_PRECISION, pc2, 1, &
                            mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
          call mpi_sendrecv(q(l_rqs,lc1), int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
                            tmp, int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, &
                            int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
          call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
...
@@ -1039,9 +1062,9 @@
        else
          if (pc2 == my_pcol) then
#ifdef WITH_MPI
            call obj%timer%start("mpi_communication")
            call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL_PRECISION, pc1, 1, &
                              tmp, l_rows, MPI_REAL_PRECISION, pc1, 1, &
                              mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
            call mpi_sendrecv(q(l_rqs,lc2), int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
                              tmp, int(l_rows, kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, &
                              int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
            call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
            tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
...
@@ -1072,7 +1095,7 @@
    ! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
    call mpi_allreduce(z, tmp, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows, kind=MPI_KIND), mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
    tmp = z
...
@@ -1087,7 +1110,7 @@
    if (npc_n == np_cols) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
      call mpi_allreduce(tmp, z, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols, kind=MPI_KIND), mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      tmp = z
...
@@ -1102,8 +1125,9 @@
      z(:) = z(:) + tmp(:)
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call MPI_Sendrecv_replace(z, n, MPI_REAL_PRECISION, np_next, 1111, np_prev, 1111, &
                                mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
      call MPI_Sendrecv_replace(z, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next, kind=MPI_KIND), 1111_MPI_KIND, &
                                int(np_prev, kind=MPI_KIND), 1111_MPI_KIND, &
                                int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
      call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
    enddo
...
@@ -1130,7 +1154,7 @@
    ! Do an mpi_allreduce over processor rows
#ifdef WITH_MPI
    call obj%timer%start("mpi_communication")
    call mpi_allreduce(z, tmp, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_rows, mpierr)
    call mpi_allreduce(z, tmp, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows, kind=MPI_KIND), mpierr)
    call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
    tmp = z
...
@@ -1145,7 +1169,7 @@
    if (npc_n == np_cols) then
#ifdef WITH_MPI
      call obj%timer%start("mpi_communication")
      call mpi_allreduce(tmp, z, n, MPI_REAL_PRECISION, MPI_PROD, mpi_comm_cols, mpierr)
      call mpi_allreduce(tmp, z, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols, kind=MPI_KIND), mpierr)
      call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      z = tmp
...
@@ -1161,7 +1185,8 @@
      do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_recv(tmp, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
        call mpi_recv(tmp, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, int(np, kind=MPI_KIND), 1111_MPI_KIND, &
                      int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
        tmp(1:n) = z(1:n)
...
@@ -1171,15 +1196,18 @@
        do np = npc_0+1, npc_0+npc_n-1
#ifdef WITH_MPI
          call obj%timer%start("mpi_communication")
          call mpi_send(z, n, MPI_REAL_PRECISION, np, 1111, mpi_comm_cols, mpierr)
          call mpi_send(z, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, int(np, kind=MPI_KIND), 1111_MPI_KIND, &
                        int(mpi_comm_cols, kind=MPI_KIND), mpierr)
          call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
        enddo
      else
#ifdef WITH_MPI
        call obj%timer%start("mpi_communication")
        call mpi_send(tmp, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, mpierr)
        call mpi_recv(z, n, MPI_REAL_PRECISION, npc_0, 1111, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
        call mpi_send(tmp, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0, kind=MPI_KIND), 1111_MPI_KIND, &
                      int(mpi_comm_cols, kind=MPI_KIND), mpierr)
        call mpi_recv(z, int(n, kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0, kind=MPI_KIND), 1111_MPI_KIND, &
                      int(mpi_comm_cols, kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr)
        call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
        z(1:n) = tmp(1:n)
...
src/elpa1/elpa1_solve_tridi_real_template.F90

...
@@ -75,8 +75,8 @@ subroutine solve_tridi_&
      logical, intent(out)            :: success
      integer(kind=ik)                :: i, j, n, np, nc, nev1, l_cols, l_rows
      integer(kind=ik)                :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                :: my_prow, my_pcol, np_rows, np_cols
      integer(kind=MPI_KIND)          :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
      integer(kind=ik), allocatable   :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
      integer(kind=ik)                :: istat
...
@@ -93,10 +93,16 @@ subroutine solve_tridi_&
      call obj%timer%start("solve_tridi" // PRECISION_SUFFIX // gpuString)
      call obj%timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows, my_prow, mpierr)
      call mpi_comm_size(mpi_comm_rows, np_rows, mpierr)
      call mpi_comm_rank(mpi_comm_cols, my_pcol, mpierr)
      call mpi_comm_size(mpi_comm_cols, np_cols, mpierr)
      call mpi_comm_rank(int(mpi_comm_rows, kind=MPI_KIND), my_prowMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_rows, kind=MPI_KIND), np_rowsMPI, mpierr)
      call mpi_comm_rank(int(mpi_comm_cols, kind=MPI_KIND), my_pcolMPI, mpierr)
      call mpi_comm_size(int(mpi_comm_cols, kind=MPI_KIND), np_colsMPI, mpierr)
      my_prow = int(my_prowMPI, kind=c_int)
      np_rows = int(np_rowsMPI, kind=c_int)
      my_pcol = int(my_pcolMPI, kind=c_int)
      np_cols = int(np_colsMPI, kind=c_int)
      call obj%timer%stop("mpi_communication")

      success = .true.
...
@@ -293,7 +299,8 @@ subroutine solve_tridi_&
        call obj%timer%start("mpi_communication")
        if (my_pcol == np_off) then
          do n = np_off+np1, np_off+nprocs-1
            call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
            call mpi_send(d(noff+1), int(nmid, kind=MPI_KIND), MPI_REAL_PRECISION, int(n, kind=MPI_KIND), 1_MPI_KIND, &