Commit 9536b3ef authored by Sebastian Ohlmann's avatar Sebastian Ohlmann

Merge remote-tracking branch 'origin/master_pre_stage' into fix_typos

parents ab4c1841 e2d71af0
This diff is collapsed.
......@@ -448,6 +448,7 @@ dist_man_MANS = \
man/elpa_setup.3 \
man/elpa_eigenvalues.3 \
man/elpa_eigenvectors.3 \
man/elpa_generalized_eigenvectors.3 \
man/elpa_cholesky.3 \
man/elpa_invert_triangular.3 \
man/elpa_solve_tridiagonal.3 \
......
......@@ -51,8 +51,8 @@ export MKL_GFORTRAN_SCALAPACK_NO_MPI_OMP_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl
export MKL_GFORTRAN_SCALAPACK_FCFLAGS_NO_MPI_OMP="-I$MKL_HOME/include/intel64/lp64"
export MKL_GFORTRAN_SCALAPACK_LDFLAGS_NO_MPI_OMP="$MKL_GFORTRAN_SCALAPACK_NO_MPI_OMP_BASELINE -Wl,-rpath,$MKL_HOME/lib/intel64"
export ASAN_OPTIONS=suppressions=no_asan_for_mpi.supp,fast_unwind_on_malloc=0
export LSAN_OPTIONS=suppressions=no_lsan_for_mpi.supp
export ASAN_OPTIONS=suppressions=./ci_test_scripts/no_asan_for_mpi.supp,fast_unwind_on_malloc=0
export LSAN_OPTIONS=suppressions=./ci_test_scripts/no_lsan_for_mpi.supp
fi
......
#!/bin/bash
# CI build step: compiles the tree with a caller-chosen number of parallel make jobs.
#
# Usage: build_step.sh <makeTasks>
#   $1  value handed to "make -j"
#
# All paths are relative, so the script has to be started from the directory
# that contains ci_test_scripts/.
source /etc/profile.d/modules.sh
# -e aborts on the first failing command, -x traces every command into the log.
# Both are switched on only after modules.sh has been sourced.
set -ex
# NOTE(review): presumably exports the compiler/library settings used by the CI jobs - confirm.
source ./ci_test_scripts/.ci-env-vars
# Record the loaded environment modules in the job log.
module list
# Record the requested job count in the job log.
echo $1
make -j $1
#!/bin/bash
# CI configure step: runs ./configure with the argument string given by the caller.
#
# Usage: configure_step.sh "<configure arguments>"
#   $1  complete configure command line as ONE string
#
# All paths are relative, so the script has to be started from the directory
# that contains ./configure and ci_test_scripts/.
source /etc/profile.d/modules.sh
# -e aborts on the first failing command, -x traces every command into the log.
# Both are switched on only after modules.sh has been sourced.
set -ex
# Record the working directory in the job log.
pwd
# NOTE(review): presumably exports the compiler/library settings used by the CI jobs - confirm.
source ./ci_test_scripts/.ci-env-vars
# Record the configure arguments in the job log.
echo $1
# eval re-parses $1, so quotes and variable references embedded in the
# argument string take effect. $1 is executed as shell code and must
# therefore come from a trusted source.
eval ./configure $1
......@@ -11,6 +11,7 @@ blockSize=16
# Number of OpenMP threads used for the test run (-o option).
ompThreads=1
# Arguments forwarded to ./configure (-c option). The option handler and the
# configure call both use the name "configureArgs"; initialise exactly that
# name so that a value inherited from the environment cannot leak in.
configureArgs=""
# 1 skips the whole test run (-s option).
skipStep=0
# Command used to submit the job steps to a batch system (-q option);
# empty means the steps run directly on the local host.
batchCommand=""
function usage() {
cat >&2 <<-EOF
......@@ -18,7 +19,7 @@ function usage() {
Call all the necessary steps to perform an ELPA CI test
Usage:
run_ci_tests [-c configure arguments] [-j makeTasks] [-h] [-t MPI Tasks] [-m matrix size] [-n number of eigenvectors] [-b block size] [-o OpenMP threads] [-s skipStep]
run_ci_tests [-c configure arguments] [-j makeTasks] [-h] [-t MPI Tasks] [-m matrix size] [-n number of eigenvectors] [-b block size] [-o OpenMP threads] [-s skipStep] [-q submit command]
Options:
-c configure arguments
......@@ -44,13 +45,16 @@ function usage() {
-s skipStep
Skip the test run if 1 (default 0)
-q submit command
Job steps will be submitted via command to a batch system (default no submission)
-h
Print this help text
EOF
}
while getopts "c:t:j:m:n:b:o:s:h" opt; do
while getopts "c:t:j:m:n:b:o:s:q:h" opt; do
case $opt in
j)
makeTasks=$OPTARG;;
......@@ -68,6 +72,8 @@ while getopts "c:t:j:m:n:b:o:s:h" opt; do
configureArgs=$OPTARG;;
s)
skipStep=$OPTARG;;
q)
batchCommand=$OPTARG;;
:)
echo "Option -$OPTARG requires an argument" >&2;;
h)
......@@ -78,21 +84,43 @@ while getopts "c:t:j:m:n:b:o:s:h" opt; do
esac
done
if [ $skipStep -eq 0]
if [ $skipStep -eq 1 ]
then
echo "Skipping the test since option -s has been specified"
exit 0
else
eval ./configure $configureArgs
if [ $? -ne 0 ]; then cat confi.log && exit 1; fi
make -j $makeTasks
if [ $? -ne 0 ]; then exit 1; fi
OMP_NUM_THREADS=$ompThreads make check TASKS=$mpiTasks TEST_FLAGS="$matrixSize $nrEV $blockSize" || { cat test-suite-log; exit 1; }
if [ $? -ne 0 ]; then exit 1; fi
grep -i "Expected %stop" test-suite.log && exit 1 || true ;
if [ $? -ne 0 ]; then exit 1; fi
echo $batchCommand
# Run the configure, build and test steps, either through the batch system
# (when "-q srun" was given) or directly on the local host.
# Every step that fails terminates the script with exit status 1.
if [ "$batchCommand" == "srun" ]
then
  echo "Running with $batchCommand with $SRUN_COMMANDLINE_CONFIGURE"
  # The SRUN_COMMANDLINE_* variables are expanded unquoted so that each of
  # them can carry several srun options.
  # NOTE(review): presumably provided by the CI environment - confirm.
  # On a configure failure config.log is printed; the exit must not depend on
  # whether printing the log succeeded.
  $batchCommand --ntasks-per-core=1 --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE ./ci_test_scripts/configure_step.sh "$configureArgs" \
    || { cat config.log; exit 1; }
  sleep 1
  $batchCommand --ntasks-per-core=1 --ntasks=1 --cpus-per-task=8 $SRUN_COMMANDLINE_BUILD ./ci_test_scripts/build_step.sh $makeTasks \
    || exit 1
  sleep 1
  # test_step.sh passes its third argument through eval, hence the escaped quotes.
  $batchCommand --ntasks-per-core=1 --ntasks=1 --cpus-per-task=2 $SRUN_COMMANDLINE_RUN ./ci_test_scripts/test_step.sh $mpiTasks $ompThreads "TEST_FLAGS=\" $matrixSize $nrEV $blockSize \" " \
    || exit 1
  # Fail the job if the log contains "Expected %stop" although the test step succeeded.
  if grep -i "Expected %stop" test-suite.log; then exit 1; fi
else
  ./ci_test_scripts/configure_step.sh "$configureArgs" || { cat config.log; exit 1; }
  make -j $makeTasks || exit 1
  # The log file written by "make check" is test-suite.log (the same file that
  # is searched below); print it when the test suite fails.
  OMP_NUM_THREADS=$ompThreads make check TASKS=$mpiTasks TEST_FLAGS="$matrixSize $nrEV $blockSize" || { cat test-suite.log; exit 1; }
  # Fail the job if the log contains "Expected %stop" although make check succeeded.
  if grep -i "Expected %stop" test-suite.log; then exit 1; fi
fi
fi
#!/bin/bash
# CI test step: runs "make check" in an already built tree.
#
# Usage: test_step.sh <mpiTasks> <ompThreads> [<extra make arguments>]
#   $1  passed to make as TASKS=
#   $2  exported as OMP_NUM_THREADS
#   $3  optional extra make argument(s), e.g. TEST_FLAGS="..."; the command
#       line goes through eval, so quotes embedded in $3 take effect
#
# Exits with status 1, after printing test-suite.log, when "make check" fails.
# All paths are relative, so the script has to be started from the build directory.
source /etc/profile.d/modules.sh
# -e aborts on the first failing command, -x traces every command into the log.
set -ex

source ./ci_test_scripts/.ci-env-vars
# Record the loaded environment modules in the job log.
module list

# Lift the stack-size and virtual-memory limits for the test programs.
ulimit -s unlimited
ulimit -v unlimited

# On every host except miy01, miy02 and miy03: set I_MPI_STATS and remove the
# SLURM/PMI related MPI variables from the environment.
case "$(hostname)" in
  miy01|miy02|miy03)
    ;;
  *)
    export I_MPI_STATS=10
    unset SLURM_MPI_TYPE I_MPI_SLURM_EXT I_MPI_PMI_LIBRARY I_MPI_PMI2 I_MPI_HYDRA_BOOTSTRAP
    ;;
esac

export OMP_NUM_THREADS=$2
# The automake test harness writes its summary to test-suite.log; show it on failure.
eval make check TASKS=$1 ${3} || { cat test-suite.log; exit 1; }
......@@ -851,6 +851,63 @@ if test x"${need_avx512}" = x"yes"; then
AC_MSG_ERROR([Could not compile a test program with AVX512, adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_AVX512],[1],[AVX512 is supported on this CPU])
# Decide which AVX512 variant the build targets and define exactly one of
# HAVE_AVX512_XEON or HAVE_AVX512_XEON_PHI. The two probes differ only in the
# intrinsic they use: _mm512_xor_pd for Xeon and _mm512_xor_epi64 for Xeon Phi.
# Configure aborts when neither probe succeeds.
if test x"$can_compile_avx512" = x"yes"; then
# Probe 1: AC_RUN_IFELSE builds the program and also runs it.
# NOTE to reviewers: no cross-compiling fallback is passed to AC_RUN_IFELSE.
AC_MSG_CHECKING([whether we compile for Xeon])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1_real;
__m512d x1 = _mm512_xor_pd(h1_real, sign);
return 0;
}
])],
[can_compile_avx512_xeon=yes],
[can_compile_avx512_xeon=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon}])
# Probe 2: same scheme with the integer xor intrinsic and casts.
AC_MSG_CHECKING([whether we compile for Xeon PHI])
AC_RUN_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
__m512d sign;
__m512d h1;
__m512d h2_real;
__m512d x1 = (__m512d) _mm512_xor_epi64((__m512i) h2_real, (__m512i) sign);
return 0;
}
])],
[can_compile_avx512_xeon_phi=yes],
[can_compile_avx512_xeon_phi=no]
)
AC_MSG_RESULT([${can_compile_avx512_xeon_phi}])
# Both probes can succeed at the same time - according to the original note
# this happens with the Intel compiler. In that case the CPU of the configure
# host breaks the tie.
if test x"$can_compile_avx512_xeon" = x"yes" ; then
if test x"$can_compile_avx512_xeon_phi" = x"yes" ; then
# we want only one to be true; this is ugly but could not come up with a better way
# A match for Phi in /proc/cpuinfo selects the Xeon Phi variant.
grep Phi /proc/cpuinfo > /dev/null
if test x"$?" = x"0" ; then
echo "Xeon PHI found ... disabling AVX512 Xeon"
can_compile_avx512_xeon=no
fi
fi
fi
# Xeon takes precedence - the Xeon Phi define is only set when the Xeon
# variant was not selected above.
if test x"$can_compile_avx512_xeon" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON],[1],[AVX512 for Xeon is supported on this CPU])
else
if test x"$can_compile_avx512_xeon_phi" = x"yes"; then
AC_DEFINE([HAVE_AVX512_XEON_PHI],[1],[AVX512 for Xeon-PHI is supported on this CPU])
else
AC_MSG_ERROR([Oho! We can neither compile AVX512 intrinsics for Xeon nor Xeon Phi. This should not happen!])
fi
fi
fi
fi
AC_LANG_POP([C])
......@@ -1157,5 +1214,5 @@ if test x"$enable_kcomputer" = x"yes" ; then
echo "call: make -f ../generated_headers.am generated-headers top_srcdir=.."
echo "BEFORE triggering the build with make!"
else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir"
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
......@@ -286,9 +286,7 @@ popd
%files tools
%attr(0755,root,root) %{_bindir}/elpa2_print_kernels
%attr(0755,root,root) %{_bindir}/elpa_tests
%attr(0644,root,root) %{_mandir}/man1/elpa2_print_kernels.1.gz
%attr(0644,root,root) %{_mandir}/man1/elpa_tests.1.gz
%files devel
%defattr(-,root,root)
......@@ -313,7 +311,6 @@ popd
%files -n %{name}_openmp-tools
%defattr(-,root,root)
%attr(0755,root,root) %{_bindir}/elpa2_print_kernels_openmp
%attr(0755,root,root) %{_bindir}/elpa_tests_openmp
%files -n %{name}_openmp-devel
......
......@@ -61,6 +61,33 @@
)(handle, a, ev, q, error)
/*! \brief generic C method for elpa_generalized_eigenvectors
*
* \details Selects, via C11 _Generic on the type of \p a, one of
*          elpa_generalized_eigenvectors_d  (double*),
*          elpa_generalized_eigenvectors_f  (float*),
*          elpa_generalized_eigenvectors_dc (double complex*) or
*          elpa_generalized_eigenvectors_fc (float complex*)
*          and calls it with all arguments forwarded unchanged.
*          Only \p a takes part in the selection; the other arguments have to
*          match the parameter types of the selected function.
* \param handle handle of the ELPA object, which defines the problem
* \param a float/double float complex/double complex pointer to matrix a
* \param b float/double float complex/double complex pointer to matrix b
* \param ev on return: float/double pointer to eigenvalues
* \param q on return: float/double float complex/double complex pointer to eigenvectors
* \param is_already_decomposed set to 1, if b was already decomposed by a previous call to
*                              elpa_generalized_eigenvectors with the same b; set to 0 otherwise
* \param error on return the error code, which can be queried with elpa_strerr()
* \result void
*/
#define elpa_generalized_eigenvectors(handle, a, b, ev, q, is_already_decomposed, error) _Generic((a), \
double*: \
elpa_generalized_eigenvectors_d, \
\
float*: \
elpa_generalized_eigenvectors_f, \
\
double complex*: \
elpa_generalized_eigenvectors_dc, \
\
float complex*: \
elpa_generalized_eigenvectors_fc \
)(handle, a, b, ev, q, is_already_decomposed, error)
/*! \brief generic C method for elpa_eigenvalues
*
* \details
......
......@@ -44,6 +44,8 @@ test_type_flag = {
"solve_tridiagonal": "-DTEST_SOLVE_TRIDIAGONAL",
"cholesky": "-DTEST_CHOLESKY",
"hermitian_multiply": "-DTEST_HERMITIAN_MULTIPLY",
"generalized" : "-DTEST_GENERALIZED_EIGENPROBLEM",
"generalized_decomp": "-DTEST_GENERALIZED_DECOMP_EIGENPROBLEM",
}
layout_flag = {
......@@ -64,6 +66,10 @@ for lang, m, g, q, t, p, d, s, lay in product(sorted(language_flag.keys()),
if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
continue
# not implemented in the test.c file yet
if lang == "C" and (t == "cholesky" or t == "hermitian_multiply" or q == 1):
continue
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
......@@ -77,12 +83,25 @@ for lang, m, g, q, t, p, d, s, lay in product(sorted(language_flag.keys()),
if(s in ["scalapack_all", "scalapack_part"] and (g == 1 or t != "eigenvectors" or m != "analytic")):
continue
# do not test single-precision scalapack
if(s in ["scalapack_all", "scalapack_part"] and ( p == "single")):
continue
# solve tridiagonal only for real toeplitz matrix in 1stage
if (t == "solve_tridiagonal" and (s != "1stage" or d != "real" or m != "toeplitz")):
continue
# cholesky tests only 1stage and teoplitz matrix
if (t == "cholesky" and (m != "toeplitz" or s == "2stage")):
# solve generalized only for random matrix in 1stage
if (t == "generalized" and (m != "random" or s == "2stage")):
continue
# solve generalized already decomposed only for random matrix in 1stage
# maybe this test should be further restricted, maybe not so important...
if (t == "generalized_decomp" and (m != "random" or s == "2stage")):
continue
# cholesky tests only for 1stage and a toeplitz or random matrix
if (t == "cholesky" and ((not (m == "toeplitz" or m == "random")) or s == "2stage")):
continue
if (t == "eigenvalues" and (m == "random")):
......
.TH "elpa_generalized_eigenvectors" 3 "Thu Feb 1 2018" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
elpa_generalized_eigenvectors \- computes the generalized eigenvalues and (part of) the eigenvector spectrum for a real symmetric or complex hermitian matrix
.br
.SH SYNOPSIS
.br
.SS FORTRAN INTERFACE
use elpa
.br
class(elpa_t), pointer :: elpa
.br
.RI "call elpa%\fBgeneralized_eigenvectors\fP (a, b, ev, q, is_already_decomposed, error)"
.br
.RI " "
.br
.RI "With the definitions of the input and output variables:"
.br
.RI "class(elpa_t) :: \fBelpa\fP ! returns an instance of the ELPA object"
.br
.TP
.RI "datatype :: \fBa\fP"
The matrix a for which the eigenvalues should be computed. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)"
.TP
.RI "datatype :: \fBb\fP"
The matrix b defining the generalized eigenvalue problem. The dimensions and datatype of the matrix b have to be the same as for matrix a.
.TP
.RI "datatype :: \fBev\fP"
The vector ev where the eigenvalues will be stored in \fIascending\fP order. The datatype of the vector ev can be either "real(kind=c_double)", or "real(kind=c_float)", depending on the datatype of the matrix. Note that complex hermitian matrices also have real-valued eigenvalues.
.TP
.RI "datatype :: \fBq\fP"
The storage space for the computed eigenvectors. The dimensions of matrix a must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The datatype of the matrix can be one of "real(kind=c_double)", "real(kind=c_float)", "complex(kind=c_double)", or "complex(kind=c_float)"
.TP
.RI "logical :: \fBis_already_decomposed\fP"
Has to be set to .false. for the first call with a given b and .true. for
each subsequent call with the same b, since b then already contains
decomposition and thus the decomposing step is skipped.
.TP
.RI "integer, optional :: \fBerror\fP"
The return error code of the function. Should be "ELPA_OK". The error code can be queried with the function \fBelpa_strerr\fP(3)
.br
.SS C INTERFACE
#include <elpa/elpa.h>
.br
elpa_t handle;
.br
.RI "void \fBelpa_generalized_eigenvectors\fP(\fBelpa_t\fP handle, \fBdatatype\fP *a, \fBdatatype\fP *b, \fBdatatype\fP *ev, \fBdatatype\fP *q, \fBint\fP is_already_decomposed, \fBint\fP *error);"
.br
.RI " "
.br
.RI "With the definitions of the input and output variables:"
.br
.TP
.RI "elpa_t \fBhandle\fP;"
The handle to the ELPA object
.TP
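.RI "datatype *\fBa\fP;"
The matrix a for which the eigenvalues should be computed. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex".
.TP
.RI "datatype *\fBb\fP;"
The matrix b defining the generalized eigenvalue problem. The dimensions and datatype of the matrix b have to be the same as for matrix a.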
.TP
.RI "datatype *\fBev\fP;"
The storage for the computed eigenvalues. Eigenvalues will be stored in \fIascending\fP order. The \fBdatatype\fP can be either "double" or "float". Note that the eigenvalues of complex hermitian matrices are also real.
.TP
.RI "datatype *\fBq\fP;"
The storage space for the computed eigenvectors. The dimensions of the matrix must be set \fIBEFORE\fP with the methods \fBelpa_set\fP(3) and \fBelpa_setup\fP(3). The \fBdatatype\fP can be one of "double", "float", "double complex", or "float complex".
.TP
.RI "int \fBis_already_decomposed\fP;"
Has to be set to 0 for the first call with a given b and 1 for
each subsequent call with the same b, since b then already contains
decomposition and thus the decomposing step is skipped.
.TP
.RI "int *\fBerror\fP;"
The error code of the function. Should be "ELPA_OK". The error codes can be queried with \fBelpa_strerr\fP(3)
.SH DESCRIPTION
Compute the generalized eigenvalues and (parts of) the eigenvector spectrum of a real symmetric or complex hermitian matrix. The functions \fBelpa_init\fP(3), \fBelpa_allocate\fP(3), \fBelpa_set\fP(3), and \fBelpa_setup\fP(3) must be called \fIBEFORE\fP \fBelpa_generalized_eigenvectors\fP can be called. Especially the number of eigenvectors to be computed can be set with \fBelpa_set\fP(3). Unlike in the case of the ordinary eigenvalue problem, the generalized problem calls some external scalapack routines. The user is responsible for the initialization of the blacs context, which then has to be passed to elpa by \fBelpa_set\fP(3) \fIBEFORE\fP \fBelpa_generalized_eigenvectors\fP can be called.
.br
.SH "SEE ALSO"
.br
\fBelpa2_print_kernels\fP(1) \fBelpa_init\fP(3) \fBelpa_allocate\fP(3) \fBelpa_set\fP(3) \fBelpa_setup\fP(3) \fBelpa_strerr\fP(3) \fBelpa_eigenvalues\fP(3) \fBelpa_eigenvectors\fP(3) \fBelpa_cholesky\fP(3) \fBelpa_invert_triangular\fP(3) \fBelpa_solve_tridiagonal\fP(3) \fBelpa_hermitian_multiply\fP(3) \fBelpa_uninit\fP(3) \fBelpa_deallocate\fP(3)
......@@ -92,6 +92,10 @@ integer parameter. The parent MPI communicator which includes all MPI process wh
.TP
.RI \fB"bandwidth"\fP:
integer parameter. Some ELPA compute steps can be accelerated if the matrix is already in banded form. If set, ELPA assumes that the bandwidth of the matrix is the value set.
.TP
.RI \fB"blacs_context"\fP:
integer parameter. The generalized eigenvalue solver \fBelpa_generalized_eigenvectors\fP(3) uses internal calls to some of the scalapack routines. Thus, before calling it, the user has to provide a properly initialized blacs context.
.TP
.RI \fB"timings"\fP:
Choose whether time measurements should be done in the ELPA routines.
......
.TH "elpa_tests" 1 "Thur Mar 17 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
elpa_tests \- Provide all tests for the ELPA library\&.
.SH SYNOPSIS
.br
elpa_tests [--help] [datatype={real|complex}] [na=number] [nev=number] [nblk=size of block cyclic distribution] [--output_eigenvalues] [--output_eigenvectors] [--real-kernel=name_of_kernel] [--complex-kernel=name_of_kernel] [--use-gpu={0|1}] [--use-qr={0,1}] [--tests={all|solve-tridi|1stage|2stage|cholesky|invert-triangular|transpose-mulitply}]
.br
.SH "Description"
.PP
Provide in a configurable way all tests for the ELPA library.
.br
It is possible to run all tests which have been implemented in ELPA.
If no options are set, default options are chosen (and printed).
.SH "Options"
.PP
.br
.RI "--help print help information"
.br
.RI "datatype={real|complex} choose whether real or complex test cases are run. If not given, the default is real case."
.br
.RI "na=integer number choose the size of the matrix. If not given, the default na=4000 is set."
.br
.RI "nev=integer number choose number of eigenvalues/eigenvectors to compute. If not given, the default nev=1500 is set."
.br
.RI "nblk=integer number set the size of the block cyclic distribution. If not given, the default nblk=16 is set."
.br
.RI "--output-eigenvalues if set, the computed eigenvalues will be stored in files. (default no)."
.br
.RI "--output-eigenvectors if set, the computed eigenvectors will be stored in files. (default no)."
.br
.RI "--real-kernel=string if given, use only this kernel for the real 2stage step. Available kernels can be queried with elpa2_print_kernels."
.br
.RI "--complex-kernel=string if given, use only this kernel for the complex 2stage step. Available kernels can be queried with elpa2_print_kernels."
.br
.RI "--use-gpu={0|1} switch on GPU usage. Fails, if GPUs are not available. Default no"
.br
.RI "--use-qr={0,1} use QR-decomposition in real 2stage step. Default no"
.br
.RI "--tests={all|solve-tridi|1stage|2stage| if given, run only specified tests. Default is all"
.RI " cholesky|invert-triangular|"
.RI " transpose-mulitply}"
.SH "Author"
A. Marek, MPCDF
.SH "Reporting bugs"
Report bugs to the ELPA mail elpa-library@mpcdf.mpg.de
.SH "SEE ALSO"
\fBelpa2_print_kernels\fP(1) \fBelpa_get_communicators\fP(3) \fBelpa_solve_evp_real_double\fP(3) \fBelpa_solve_evp_real_single\fP(3) \fBelpa_solve_evp_complex_double\fP(3) \fBelpa_solve_evp_complex_single\fP(3) \fBelpa_solve_evp_real_1stage_double\fP(3) \fBelpa_solve_evp_real_1stage_single\fP(3) \fBelpa_solve_evp_real_2stage_double\fP(3) \fBelpa_solve_evp_real_2stage_single\fP(3) \fBelpa_solve_evp_complex_2stage_double\fP(3) \fBelpa_solve_evp_complex_2stage_single\fP(3)
......@@ -159,9 +159,9 @@
#endif
integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=c_intptr_t) :: lc_start, lc_end
integer(kind=c_intptr_t) :: lc_start, lc_end
#if COMPLEXCASE == 1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
#endif
integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols
......@@ -185,11 +185,27 @@
&_&
&MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
#endif
endif
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -206,18 +222,18 @@
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
! na_rows in used nowhere; only na_cols
! na_rows in used nowhere; only na_cols
if (useGPU) then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
......@@ -268,11 +284,6 @@
#if REALCASE == 1
if (useQR) then
if (useGPU) then
print *,"qr decomposition at the moment not supported with GPU"
stop 1
endif
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
......@@ -533,6 +544,10 @@
! Reduce current block to lower triangular form
#if REALCASE == 1
if (useQR) then
if (useGPU) then
! vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
endif
if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
......@@ -637,7 +652,7 @@
#endif /* WITH_MPI */
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
......@@ -815,7 +830,7 @@
#endif /* WITH_OPENMP */
enddo ! lc
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
......@@ -841,7 +856,7 @@
vav = 0
call obj%timer%start("blas")
if (useGPU) then
if (useGPU_reduction_lower_block_to_tridiagonal) then
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
......@@ -853,7 +868,7 @@
vmrCUDA, cur_l_rows, &
ZERO, vav, ubound(vav,dim=1))
else ! useGPU
else ! useGPU_reduction_to_tridiagonal
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
......@@ -892,6 +907,33 @@
#if REALCASE == 1
endif !useQR
#endif
#if REALCASE == 1
if (useGPU .and. useQR) then
! copy the data for further usage
! qr worked on *CPU arrays
!vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop 1
endif
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half)
if (useGPU)