Commit d836eea2 authored by Andreas Marek's avatar Andreas Marek

Enable 64bit lapack support

ELPA can now link agains a 64bit integer verion of BLAS/LAPACK.
Currently this only works if ELPA is compiled with MPI=OFF!

The 64bit support is not available in the legacy interface
parent 88095c9e
......@@ -79,6 +79,28 @@ distcheck-no-autotune:
# gnu-gnu-ilp64-nompi-noomp
gnu-gnu-nompi-noopenmp-ilp64:
tags:
- avx
artifacts:
when: on_success
expire_in: 2 month
script:
- ./ci_test_scripts/run_ci_tests.sh -c "CC=\"gcc\" CFLAGS=\"-O3 -xAVX\" FC=\"gfortram\" FCFLAGS=\"-O3 -xAVX\" SCALAPACK_LDFLAGS=\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64 \" SCALAPACK_FCFLAGS=\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MOMPI_NOOMP_ILP64 \" --enable-option-checking=fatal --with-mpi=no --disable-openmp --disable-gpu --enable-avx --enable-64bit-integer-support || { cat config.log; exit 1; }" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM
# gnu-gnu-ilp64-nompi-openmp
gnu-gnu-nompi-openmp-ilp64:
tags:
- avx
artifacts:
when: on_success
expire_in: 2 month
script:
- ./ci_test_scripts/run_ci_tests.sh -c "CC=\"gcc\" CFLAGS=\"-O3 -xAVX\" FC=\"gfortram\" FCFLAGS=\"-O3 -xAVX\" SCALAPACK_LDFLAGS=\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64 \" SCALAPACK_FCFLAGS=\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MOMPI_OMP_ILP64 \" --enable-option-checking=fatal --with-mpi=no --enable-openmp --disable-gpu --enable-avx --enable-64bit-integer-support || { cat config.log; exit 1; }" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE -s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM
# python tests
python-intel-intel-mpi-openmp:
tags:
......
......@@ -591,6 +591,8 @@ noinst_LTLIBRARIES += libelpatest@SUFFIX@.la
libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags)
libelpatest@SUFFIX@_la_SOURCES = \
test/shared/tests_variable_definitions.F90 \
test/shared/mod_tests_scalapack_interfaces.F90 \
test/shared/mod_tests_blas_interfaces.F90 \
test/shared/test_util.F90 \
test/shared/test_read_input_parameters.F90 \
test/shared/test_check_correctness.F90 \
......@@ -766,6 +768,8 @@ EXTRA_DIST = \
manual_cpp \
nvcc_wrap \
remove_xcompiler \
src/helpers/fortran_blas_interfaces.F90 \
src/helpers/fortran_scalapack_interfaces.F90 \
src/GPU/cuUtils_template.cu \
src/elpa_api_math_template.F90 \
src/elpa_impl_math_template.F90 \
......
......@@ -332,8 +332,47 @@ print(" - ./ci_test_scripts/run_distcheck_tests.sh -c \" FC=mpiifort FCFLAGS
print("\n\n")
#add two tests for ilp64 mkl interface
ilp64_no_omp_tests = [
"# gnu-gnu-ilp64-nompi-noomp",
"gnu-gnu-nompi-noopenmp-ilp64:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"gcc\\" CFLAGS=\\"-O3 -xAVX\\" '
'FC=\\"gfortram\\" FCFLAGS=\\"-O3 -xAVX\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_NOOMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MOMPI_NOOMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --disable-openmp '
'--disable-gpu --enable-avx --enable-64bit-integer-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
"# gnu-gnu-ilp64-nompi-openmp",
"gnu-gnu-nompi-openmp-ilp64:",
" tags:",
" - avx",
" artifacts:",
" when: on_success",
" expire_in: 2 month",
" script:",
' - ./ci_test_scripts/run_ci_tests.sh -c "'
'CC=\\"gcc\\" CFLAGS=\\"-O3 -xAVX\\" '
'FC=\\"gfortram\\" FCFLAGS=\\"-O3 -xAVX\\" '
'SCALAPACK_LDFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_LDFLAGS_NOMPI_OMP_ILP64 \\" '
'SCALAPACK_FCFLAGS=\\"$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MOMPI_OMP_ILP64 \\" '
'--enable-option-checking=fatal --with-mpi=no --enable-openmp '
'--disable-gpu --enable-avx --enable-64bit-integer-support || { cat config.log; exit 1; }'
'" -j 8 -t $MPI_TASKS -m $MATRIX_SIZE -n $NUMBER_OF_EIGENVECTORS -b $BLOCK_SIZE '
'-s $SKIP_STEP -i $INTERACTIVE_RUN -S $SLURM',
"\n",
]
print("\n".join(ilp64_no_omp_tests))
# add python tests
python_ci_tests = [
......
......@@ -208,7 +208,12 @@ if test x"${enable_heterogenous_cluster_support}" = x"yes"; then
fi
AM_CONDITIONAL([HAVE_HETEROGENOUS_CLUSTER_SUPPORT],[test x"$enable_heterogenous_cluster_support" = x"yes"])
dnl 64bit integer support for BLACS/LAPACK/SCALAPACK
dnl 64bit integer support for BLACS/LAPACK/SCALAPACK support
dnl first long int
AC_CHECK_SIZEOF([long int])
size_of_long_int="${ac_cv_sizeof_long_int}"
dnl then 64bit
AC_MSG_CHECKING(whether 64bit integers should be used for math libraries (BLAS/LAPACK/SCALAPACK))
AC_ARG_ENABLE([64bit-integer-support],
AS_HELP_STRING([--64bit-integer-support],
......@@ -223,15 +228,21 @@ AC_ARG_ENABLE([64bit-integer-support],
[enable_64bit_integer_support="no"])
AC_MSG_RESULT([$enable_64bit_integer_support])
if test x"${enable_64bit_integer_support}" = x"yes"; then
if test x"${enable_legacy}" = x"yes"; then
AC_MSG_ERROR([You cannot both enable 64bit integer support and the legacy interface!])
fi
dnl check whether long int is the correct data-type in C
if test x"${size_of_long_int}" = x"8"; then
echo "Found C data-type \"long int\" with 8 bytes"
else
AC_MSG_ERROR([The C data-type "long int" is only ${size_of_long_int} bytes; Needed is 8 bytes])
fi
AC_DEFINE([HAVE_64BIT_INTEGER_SUPPORT], [1], [allow to link against the 64bit integer versions of math libraries])
fi
AM_CONDITIONAL([HAVE_64BIT_INTEGER_SUPPORT],[test x"$enable_64bit_integer_support" = x"yes"])
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main(int argc, char **argv) {
......
......@@ -99,12 +99,14 @@
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols, mpierr
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=BLAS_KIND) :: idxBLAS(NA)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
......@@ -230,7 +232,9 @@
rho = 2.0_rk*beta
! Calculate index for merging both systems by ascending eigenvalues
call obj%timer%start("blas")
call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Calculate the allowable deflation tolerance
......@@ -391,8 +395,8 @@
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2
call obj%timer%start("blas")
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call PRECISION_LAED5(1_BLAS_KIND, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2_BLAS_KIND, d1, z1, qtrans(1,2), rho, d(2))
call obj%timer%stop("blas")
call transform_columns_&
&PRECISION&
......@@ -404,7 +408,9 @@
! Calculate arrangement of all eigenvalues in output
call obj%timer%start("blas")
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call PRECISION_LAMRG( int(na1,kind=BLAS_KIND), int(na-na1,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Rearrange eigenvalues
......@@ -437,16 +443,19 @@
ddiff(1:na1) = 0
info = 0
infoBLAS = int(info,kind=BLAS_KIND)
#ifdef WITH_OPENMP
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
my_thread = omp_get_thread_num()
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call obj%timer%start("blas")
call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
call PRECISION_LAED4(int(na1,kind=BLAS_KIND), int(i,kind=BLAS_KIND), d1, z1, delta, &
rho, s, infoBLAS) ! s is not used!
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
......@@ -548,7 +557,9 @@
call obj%timer%start("blas")
! Calculate arrangement of all eigenvalues in output
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call PRECISION_LAMRG(int(na1,kind=BLAS_KIND), int(na-na1,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Rearrange eigenvalues
tmp = d
......@@ -775,10 +786,11 @@
else
call obj%timer%start("blas")
call obj%timer%start("gemm")
call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
1.0_rk, qtmp1, ubound(qtmp1,dim=1), &
ev, ubound(ev,dim=1), &
1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
call PRECISION_GEMM('N', 'N', int(l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), &
int(nnzu,kind=BLAS_KIND), &
1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=BLAS_KIND), &
ev, int(ubound(ev,dim=1),kind=BLAS_KIND), &
1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND))
call obj%timer%stop("gemm")
call obj%timer%stop("blas")
endif ! useGPU
......@@ -833,10 +845,11 @@
else
call obj%timer%start("blas")
call obj%timer%start("gemm")
call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), &
ev, ubound(ev,dim=1), &
1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), &
int(nnzl,kind=BLAS_KIND), &
1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=BLAS_KIND), &
ev, int(ubound(ev,dim=1),kind=BLAS_KIND), &
1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND))
call obj%timer%stop("gemm")
call obj%timer%stop("blas")
endif ! useGPU
......
......@@ -599,6 +599,7 @@ subroutine solve_tridi_&
real(kind=REAL_DATATYPE) :: dtmp
integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:)
logical, intent(in) :: wantDebug
......@@ -630,7 +631,10 @@ subroutine solve_tridi_&
stop 1
endif
call obj%timer%start("blas")
call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
call PRECISION_STEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info /= 0) then
......@@ -642,7 +646,8 @@ subroutine solve_tridi_&
d(:) = ds(:)
e(:) = es(:)
call obj%timer%start("blas")
call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
call PRECISION_STEQR('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), work, infoBLAS )
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
! If DSTEQR fails also, we don't know what to do further ...
......
......@@ -301,7 +301,8 @@
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C', &
#endif
nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
int(nstor,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, &
hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), ZERO, tmat, int(max_stored_rows,kind=BLAS_KIND))
call obj%timer%stop("blas")
nc = 0
do n = 1, nstor-1
......@@ -323,7 +324,8 @@
tmat(1,1) = tau(ice-nstor+1)
do n = 1, nstor-1
call obj%timer%start("blas")
call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', int(n,kind=BLAS_KIND), tmat, &
int(max_stored_rows,kind=BLAS_KIND), h2(nc+1), 1_BLAS_KIND)
call obj%timer%stop("blas")
tmat(n+1,1:n) = &
......@@ -369,8 +371,9 @@
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
q_mat, ldq, ZERO, tmp1, nstor)
int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), &
int(l_rows,kind=BLAS_KIND), ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), &
q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, int(nstor,kind=BLAS_KIND))
call obj%timer%stop("blas")
endif ! useGPU
......@@ -422,19 +425,21 @@
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
ONE, tmat, max_stored_rows, tmp2, nstor)
call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), &
ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND))
!q_mat = q_mat - hvm*tmp2
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(nstor,kind=BLAS_KIND), &
-ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND), &
ONE, q_mat, int(ldq,kind=BLAS_KIND))
call obj%timer%stop("blas")
#else /* WITH_MPI */
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
ONE, tmat, max_stored_rows, tmp1, nstor)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), &
ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp1, int(nstor,kind=BLAS_KIND))
call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), &
int(nstor,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), &
tmp1, int(nstor,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND))
call obj%timer%stop("blas")
#endif /* WITH_MPI */
endif ! useGPU
......
......@@ -385,16 +385,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
call PRECISION_GEMV('N', &
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
#if REALCASE == 1
uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
uv_stored_cols(l_cols+1,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
#endif
#if COMPLEXCASE == 1
aux, 1, &
aux, 1_BLAS_KIND, &
#endif
ONE, v_row, 1)
ONE, v_row, 1_BLAS_KIND)
if (wantDebug) call obj%timer%stop("blas")
endif
......@@ -527,15 +527,15 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (mod(n_iter,n_threads) == my_thread) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_row(l_row_beg), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1, &
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1)
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
if (wantDebug) call obj%timer%stop("blas")
endif
......@@ -548,17 +548,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (.not. useGPU) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg, l_col_beg), lda, &
v_row(l_row_beg), 1, &
ONE, u_col(l_col_beg), 1)
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg, l_col_beg), int(lda,kind=BLAS_KIND), &
v_row(l_row_beg), 1_BLAS_KIND, &
ONE, u_col(l_col_beg), 1_BLAS_KIND)
if (i/=j) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
v_col(l_col_beg), 1, &
ONE, u_row(l_row_beg), 1)
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, &
ONE, u_row(l_row_beg), 1_BLAS_KIND)
endif
if (wantDebug) call obj%timer%stop("blas")
endif ! not useGPU
......@@ -663,13 +663,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#if COMPLEXCASE == 1
call PRECISION_GEMV('C', &
#endif
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
v_row, 1, ZERO, aux, 1)
int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
v_row, 1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)
call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
aux, 1, ONE, u_col, 1)
call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
aux, 1_BLAS_KIND, ONE, u_col, 1_BLAS_KIND)
if (wantDebug) call obj%timer%stop("blas")
endif
......@@ -797,10 +797,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
else !useGPU
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
ONE, a_mat(l_row_beg,l_col_beg), lda)
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
int(2*n_stored_vecs,kind=BLAS_KIND), &
ONE, vu_stored_rows(l_row_beg,1), int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
uv_stored_cols(l_col_beg,1), int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND))
if (wantDebug) call obj%timer%stop("blas")
endif !useGPU
enddo
......
......@@ -63,6 +63,7 @@
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: lcs, lce, lrs, lre
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
......@@ -192,7 +193,9 @@
if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then
call obj%timer%start("blas")
call PRECISION_POTRF('U', na-n+1, a(l_row1,l_col1), lda, info)
call PRECISION_POTRF('U', int(na-n+1,kind=BLAS_KIND), a(l_row1,l_col1), &
int(lda,kind=BLAS_KIND), infoBLAS )
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info/=0) then
......@@ -223,7 +226,9 @@
! Cholesky-Factorization of this block
call obj%timer%start("blas")
call PRECISION_POTRF('U', nblk, a(l_row1,l_col1), lda, info)
call PRECISION_POTRF('U', int(nblk,kind=BLAS_KIND), a(l_row1,l_col1), &
int(lda,kind=BLAS_KIND) , infoBLAS )
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info/=0) then
......@@ -270,8 +275,9 @@
call obj%timer%start("blas")
if (l_cols-l_colx+1>0) &
call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', nblk, l_cols-l_colx+1, ONE, tmp2, &
ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', int(nblk,kind=BLAS_KIND), &
int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, tmp2, &
int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif
......@@ -310,9 +316,11 @@
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, lre-lrs+1, lce-lcs+1, nblk, -ONE, &
tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
ONE, a(lrs,lcs), lda)
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(lre-lrs+1,kind=BLAS_KIND), int(lce-lcs+1,kind=BLAS_KIND), &
int(nblk,kind=BLAS_KIND), -ONE, &
tmatr(lrs,1), int(ubound(tmatr,dim=1),kind=BLAS_KIND), tmatc(lcs,1), &
int(ubound(tmatc,dim=1),kind=BLAS_KIND), &
ONE, a(lrs,lcs), int(lda,kind=BLAS_KIND))
call obj%timer%stop("blas")
enddo
......
......@@ -72,6 +72,7 @@
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
integer(kind=ik) :: n, nc, i, info, ns, nb
integer(kind=BLAS_KIND) :: infoBLAS
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)
logical :: wantDebug
logical :: success
......@@ -178,8 +179,9 @@
if (my_pcol==pcol(n, nblk, np_cols)) then
call obj%timer%start("blas")
call PRECISION_TRTRI('U', 'N', nb, a(l_row1,l_col1), lda, info)
call PRECISION_TRTRI('U', 'N', int(nb,kind=BLAS_KIND), a(l_row1,l_col1), int(lda,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
if (info/=0) then
......@@ -222,8 +224,8 @@
call obj%timer%start("blas")
if (l_cols-l_colx+1>0) &
call PRECISION_TRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, ONE, &
tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
call PRECISION_TRMM('L', 'U', 'N', 'N', int(nb,kind=BLAS_KIND), int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, &
tmp2, int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND))
call obj%timer%stop("blas")
if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0
......@@ -257,9 +259,11 @@
call obj%timer%start("blas")
if (l_row1>1 .and. l_cols-l_col1+1>0) &
call PRECISION_GEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -ONE, &
tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), ONE, &
a(1,l_col1), lda)
call PRECISION_GEMM('N', 'N', int(l_row1-1,kind=BLAS_KIND), int(l_cols-l_col1+1,kind=BLAS_KIND), &
int(nb,kind=BLAS_KIND), -ONE, &
tmat1, int(ubound(tmat1,dim=1),kind=BLAS_KIND), tmat2(1,l_col1), &
int(ubound(tmat2,dim=1),kind=BLAS_KIND), ONE, &
a(1,l_col1), int(lda,kind=BLAS_KIND) )
call obj%timer%stop("blas")
......
......@@ -58,6 +58,7 @@
use elpa_mpi
use precision
use elpa_abstract_impl
use elpa_blas_interfaces
implicit none
#include "../../src/general/precision_kinds.F90"
......@@ -275,8 +276,11 @@
if (lrs<=lre) then
call obj%timer%start("blas")
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, lre-lrs+1, ONE, &
aux_mat(lrs,1), ubound(aux_mat,dim=1), b(lrs,lcs), ldb,ZERO, tmp1, nstor)
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(lre-lrs+1,kind=BLAS_KIND), &
ONE, aux_mat(lrs,1), int(ubound(aux_mat,dim=1),kind=BLAS_KIND), &
b(lrs,lcs), int(ldb,kind=BLAS_KIND), ZERO, tmp1, &
int(nstor,kind=BLAS_KIND))
call obj%timer%stop("blas")
else
tmp1 = 0
......
This diff is collapsed.
......@@ -117,6 +117,7 @@
real(kind=rk) :: work(nb*nb2), work2(nb2*nb2)
integer(kind=ik) :: lwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: istep, i, n, dest
integer(kind=ik) :: n_off, na_s
......@@ -234,7 +235,10 @@
! Transform first block column of remaining matrix
call obj%timer%start("blas")
call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
call PRECISION_GEQRF(int(n,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(1+nb2,na_s-n_off), &
int(2*nb-1,kind=BLAs_KIND), tau, work, int(lwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
do i=1,nb2
......@@ -334,7 +338,10 @@
&PRECISION&
&(obj,nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
call obj%timer%start("blas")
call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
call PRECISION_GEQRF(int(nr,kind=BLAS_KIND), int(nb2,kind=BLAS_KIND), ab(nb+1,ns), &
int(2*nb-1,kind=BLAS_KIND), tau_new, work, int(lwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)