diff --git a/configure.ac b/configure.ac
index 5e51fbf5e39b705c62980bd7c62058ee43ba603d..e95672c6fc7f12e2fefe6e2598ff03966297dba1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -46,7 +46,7 @@ AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, w
 AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20200417], [Current ELPA autotune version])
 AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20200417, "Current ELPA autotune version")
 AC_DEFINE_UNQUOTED([ELPA_BUILDTIME], [$ELPA_BUILDTIME], ["Time of build"])
-
+AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1604905771],[old_elpa_version=yes],[old_elpa_version=no])
 
 AX_CHECK_GNU_MAKE()
 if test x$_cv_gnu_make_command = x ; then
@@ -1776,6 +1776,9 @@ else
   echo "build config should be compiled into the library: no"
 fi
 
+if test x"$have_loop_blocking" = x"yes"; then
+  AC_DEFINE([LOOP_BLOCKING],[1],[use blocking in loops])
+fi
 
 AC_SUBST([SUFFIX])
 AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
@@ -1986,4 +1989,10 @@ else
   make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
 fi
-
+if test x"$old_elpa_version" = x"yes"; then
+  echo " "
+  echo " It is possible that your current version of ELPA is not the latest one."
+  echo " You might want to check at https://elpa.mpcdf.mpg.de whether a more recent"
+  echo " version has been released already."
+  echo " "
+fi
diff --git a/m4/ax_check_gcc_version.m4 b/m4/ax_check_gcc_version.m4
new file mode 100644
index 0000000000000000000000000000000000000000..388bce5796dedd760a0f569208b6b7097ed1676d
--- /dev/null
+++ b/m4/ax_check_gcc_version.m4
@@ -0,0 +1,20 @@
+AC_DEFUN([AX_GCC_VERSION], [
+  GCC_VERSION=""
+  AX_CHECK_COMPILE_FLAG([-dumpversion],
+    [ax_gcc_version_option=yes],
+    [ax_gcc_version_option=no])
+
+  AS_IF([test "x$GCC" = "xyes"],[
+    AS_IF([test "x$ax_gcc_version_option" != "xno"],[
+      AC_CACHE_CHECK([gcc version],[ax_cv_gcc_version],[
+        ax_cv_gcc_version="`$CC -dumpversion`"
+        AS_IF([test "x$ax_cv_gcc_version" = "x"],[
+          ax_cv_gcc_version=""
+        ])
+      ])
+      GCC_VERSION=$ax_cv_gcc_version
+    ])
+  ])
+  AC_SUBST([GCC_VERSION])
+])
+
diff --git a/m4/ax_compare_version.m4 b/m4/ax_compare_version.m4
new file mode 100644
index 0000000000000000000000000000000000000000..fe541a1875e78a3c52ec63301c04dbb315fb8b18
--- /dev/null
+++ b/m4/ax_compare_version.m4
@@ -0,0 +1,174 @@
+# ===========================================================================
+#  https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+#
+# DESCRIPTION
+#
+#   This macro compares two version strings. Due to the various number of
+#   minor-version numbers that can exist, and the fact that string
+#   comparisons are not compatible with numeric comparisons, this is not
+#   necessarily trivial to do in an autoconf script. This macro makes doing
+#   these comparisons easy.
+#
+#   The six basic comparisons are available, as well as checking equality
+#   limited to a certain number of minor-version levels.
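+#   (A "minor-version level" here means one dot-separated component of the
+#   version string: 3.15.7 has three levels. The eqN/neN operators described
+#   below compare only the first N of them.)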
+# +# The operator OP determines what type of comparison to do, and can be one +# of: +# +# eq - equal (test A == B) +# ne - not equal (test A != B) +# le - less than or equal (test A <= B) +# ge - greater than or equal (test A >= B) +# lt - less than (test A < B) +# gt - greater than (test A > B) +# +# Additionally, the eq and ne operator can have a number after it to limit +# the test to that number of minor versions. +# +# eq0 - equal up to the length of the shorter version +# ne0 - not equal up to the length of the shorter version +# eqN - equal up to N sub-version levels +# neN - not equal up to N sub-version levels +# +# When the condition is true, shell commands ACTION-IF-TRUE are run, +# otherwise shell commands ACTION-IF-FALSE are run. The environment +# variable 'ax_compare_version' is always set to either 'true' or 'false' +# as well. +# +# Examples: +# +# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8]) +# AX_COMPARE_VERSION([3.15],[lt],[3.15.8]) +# +# would both be true. +# +# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8]) +# AX_COMPARE_VERSION([3.15],[gt],[3.15.8]) +# +# would both be false. +# +# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8]) +# +# would be true because it is only comparing two minor versions. +# +# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15]) +# +# would be true because it is only comparing the lesser number of minor +# versions of the two values. +# +# Note: The characters that separate the version numbers do not matter. An +# empty string is the same as version 0. OP is evaluated by autoconf, not +# configure, so must be a string, not a variable. +# +# The author would like to acknowledge Guido Draheim whose advice about +# the m4_case and m4_ifvaln functions make this macro only include the +# portions necessary to perform the specific comparison specified by the +# OP argument in the final configure script. +# +# LICENSE +# +# Copyright (c) 2008 Tim Toolan +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. +#serial 13 + +dnl ######################################################################### +AC_DEFUN([AX_COMPARE_VERSION], [ + AC_REQUIRE([AC_PROG_AWK]) + # Used to indicate true or false condition + ax_compare_version=false + # Convert the two version strings to be compared into a format that + # allows a simple string comparison. The end result is that a version + # string of the form 1.12.5-r617 will be converted to the form + # 0001001200050617. In other words, each number is zero padded to four + # digits, and non digits are removed. + AS_VAR_PUSHDEF([A],[ax_compare_version_A]) + A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \ + -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \ + -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \ + -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \ + -e 's/[[^0-9]]//g'` + + AS_VAR_PUSHDEF([B],[ax_compare_version_B]) + B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \ + -e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \ + -e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \ + -e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \ + -e 's/[[^0-9]]//g'` + + dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary + dnl # then the first line is used to determine if the condition is true. + dnl # The sed right after the echo is to remove any indented white space. 
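  dnl # Worked illustration of the normalization above, using the example
  dnl # from the header comment: the version string "1.12.5-r617" becomes
  dnl # "0001001200050617", i.e. every number is zero-padded to four digits
  dnl # and all non-digit separators are dropped, so a plain lexicographic
  dnl # sort of A and B orders the versions correctly.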
+ m4_case(m4_tolower($2), + [lt],[ + ax_compare_version=`echo "x$A +x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"` + ], + [gt],[ + ax_compare_version=`echo "x$A +x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"` + ], + [le],[ + ax_compare_version=`echo "x$A +x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"` + ], + [ge],[ + ax_compare_version=`echo "x$A +x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"` + ],[ + dnl Split the operator from the subversion count if present. + m4_bmatch(m4_substr($2,2), + [0],[ + # A count of zero means use the length of the shorter version. + # Determine the number of characters in A and B. + ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'` + ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'` + + # Set A to no more than B's length and B to no more than A's length. + A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"` + B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"` + ], + [[0-9]+],[ + # A count greater than zero means use only that many subversions + A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"` + B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"` + ], + [.+],[ + AC_WARNING( + [invalid OP numeric parameter: $2]) + ],[]) + + # Pad zeros at end of numbers to make same length. + ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`" + B="$B`echo $A | sed 's/./0/g'`" + A="$ax_compare_version_tmp_A" + # Check for equality or inequality as necessary. + m4_case(m4_tolower(m4_substr($2,0,2)), + [eq],[ + test "x$A" = "x$B" && ax_compare_version=true + ], + [ne],[ + test "x$A" != "x$B" && ax_compare_version=true + ],[ + AC_WARNING([invalid OP parameter: $2]) + ]) + ]) + + AS_VAR_POPDEF([A])dnl + AS_VAR_POPDEF([B])dnl + + dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE. 
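  dnl # Note: this copy deviates from the stock autoconf-archive macro in one
  dnl # ELPA-specific point: have_loop_blocking=yes is additionally set below
  dnl # whenever the comparison is true; configure.ac (above) uses that to
  dnl # decide the LOOP_BLOCKING define. A typical call, as in configure.ac:
  dnl #   AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1604905771],
  dnl #                      [old_elpa_version=yes], [old_elpa_version=no])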
+ if test "$ax_compare_version" = "true" ; then + have_loop_blocking=yes + m4_ifvaln([$4],[$4],[:])dnl + m4_ifvaln([$5],[else $5])dnl + fi +]) dnl AX_COMPARE_VERSION diff --git a/src/elpa1/elpa1_merge_systems_real_template.F90 b/src/elpa1/elpa1_merge_systems_real_template.F90 index 0a44602858c669611640785f857f5bf5637245be..f60d5903a1e7072485e2dc7836b507f87529b779 100644 --- a/src/elpa1/elpa1_merge_systems_real_template.F90 +++ b/src/elpa1/elpa1_merge_systems_real_template.F90 @@ -54,1180 +54,1180 @@ #include "../general/sanity.F90" - subroutine merge_systems_& - &PRECISION & - (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & - l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads) - use cuda_functions - use iso_c_binding - use precision - use elpa_abstract_impl - use elpa_blas_interfaces +subroutine merge_systems_& +&PRECISION & + (obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & + l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads) + use cuda_functions + use iso_c_binding + use precision + use elpa_abstract_impl + use elpa_blas_interfaces #ifdef WITH_OPENMP - use omp_lib + use omp_lib #endif - implicit none + implicit none #include "../general/precision_kinds.F90" - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, & - mpi_comm_cols, npc_0, npc_n - integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na) - real(kind=REAL_DATATYPE), intent(inout) :: d(na), e + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, npc_0, npc_n + integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na) + real(kind=REAL_DATATYPE), intent(inout) :: d(na), e #ifdef USE_ASSUMED_SIZE - real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*) + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*) #else - real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols) + real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols) #endif - logical, intent(in) :: useGPU, wantDebug - logical, intent(out) :: success - - ! TODO: play with max_strip. If it was larger, matrices being multiplied - ! might be larger as well! - integer(kind=ik), parameter :: max_strip=128 - - - real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, & - qtrans(2,2), dmax, zmax, d1new, d2new - real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), & - dbase(na), ddiff(na), ev_scale(na), tmp(na) - real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na) - real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:) + logical, intent(in) :: useGPU, wantDebug + logical, intent(out) :: success + + ! TODO: play with max_strip. If it was larger, matrices being multiplied + ! might be larger as well! 
+ integer(kind=ik), parameter :: max_strip=128 + + + real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, & + qtrans(2,2), dmax, zmax, d1new, d2new + real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), & + dbase(na), ddiff(na), ev_scale(na), tmp(na) + real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na) + real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:) #ifdef WITH_OPENMP - real(kind=REAL_DATATYPE), allocatable :: z_p(:,:) + real(kind=REAL_DATATYPE), allocatable :: z_p(:,:) #endif - integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, & - l_rqm, ns, info - integer(kind=BLAS_KIND) :: infoBLAS - integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, & - l_cols_qreorg, np, l_idx, nqcols1, nqcols2 - integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, & - np_cols - integer(kind=MPI_KIND) :: mpierr - integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI - integer(kind=ik) :: np_next, np_prev, np_rem - integer(kind=ik) :: idx(na), idx1(na), idx2(na) - integer(kind=BLAS_KIND) :: idxBLAS(NA) - integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na) - - integer(kind=ik) :: istat - character(200) :: errorMessage - integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m - - integer(kind=c_intptr_t) :: num - integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev - logical :: successCUDA - integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& - &PRECISION& - &_real - integer(kind=ik), intent(in) :: max_threads + integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, & + l_rqm, ns, info + integer(kind=BLAS_KIND) :: infoBLAS + integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, & + l_cols_qreorg, np, l_idx, nqcols1, nqcols2 + integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, & + np_cols + integer(kind=MPI_KIND) :: mpierr + integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI + integer(kind=ik) :: np_next, np_prev, np_rem + integer(kind=ik) :: idx(na), idx1(na), idx2(na) + integer(kind=BLAS_KIND) :: idxBLAS(NA) + integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na) + + integer(kind=ik) :: istat + character(200) :: errorMessage + integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m + + integer(kind=c_intptr_t) :: num + integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev + logical :: successCUDA + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_real + integer(kind=ik), intent(in) :: max_threads #ifdef WITH_OPENMP - integer(kind=ik) :: my_thread + integer(kind=ik) :: my_thread - allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage) - check_allocate("merge_systems: z_p",istat, errorMessage) + allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage) + check_allocate("merge_systems: z_p",istat, errorMessage) #endif - call obj%timer%start("merge_systems" // PRECISION_SUFFIX) - success = .true. - call obj%timer%start("mpi_communication") - call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) - call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) - call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) - call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) - - my_prow = int(my_prowMPI,kind=c_int) - np_rows = int(np_rowsMPI,kind=c_int) - my_pcol = int(my_pcolMPI,kind=c_int) - np_cols = int(np_colsMPI,kind=c_int) - - call obj%timer%stop("mpi_communication") - - ! 
If my processor column isn't in the requested set, do nothing
-
-      if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
-        call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
-        return
-      endif
-      ! Determine number of "next" and "prev" column for ring sends
-
-      if (my_pcol == npc_0+npc_n-1) then
-        np_next = npc_0
-      else
-        np_next = my_pcol + 1
-      endif
+  call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
+  success = .true.
+  call obj%timer%start("mpi_communication")
+  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
+  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
+  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
+  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)
+
+  my_prow = int(my_prowMPI,kind=c_int)
+  np_rows = int(np_rowsMPI,kind=c_int)
+  my_pcol = int(my_pcolMPI,kind=c_int)
+  np_cols = int(np_colsMPI,kind=c_int)
+
+  call obj%timer%stop("mpi_communication")
+
+  ! If my processor column isn't in the requested set, do nothing
+
+  if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
+    call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
+    return
+  endif
+  ! Determine number of "next" and "prev" column for ring sends
+
+  if (my_pcol == npc_0+npc_n-1) then
+    np_next = npc_0
+  else
+    np_next = my_pcol + 1
+  endif
+
+  if (my_pcol == npc_0) then
+    np_prev = npc_0+npc_n-1
+  else
+    np_prev = my_pcol - 1
+  endif
+  call check_monotony_&
+  &PRECISION&
+  &(obj, nm,d,'Input1',wantDebug, success)
+  if (.not.(success)) then
+    call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
+    return
+  endif
+  call check_monotony_&
+  &PRECISION&
+  &(obj,na-nm,d(nm+1),'Input2',wantDebug, success)
+  if (.not.(success)) then
+    call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
+    return
+  endif
+  ! Get global number of processors and my processor number.
+  ! Please note that my_proc does not need to match any real processor number,
+  ! it is just used for load balancing some loops.
+
+  n_procs = np_rows*npc_n
+  my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
+
+
+  ! Local limits of the rows of Q
+
+  l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
+  l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
+  l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
+
+  l_rnm  = l_rqm-l_rqs+1 ! Number of local rows <= nm
+  l_rows = l_rqe-l_rqs+1 ! Total number of local rows
+
+
+  ! My number of local columns
+
+  l_cols = COUNT(p_col(1:na)==my_pcol)
+
+  ! Get max number of local columns
+
+  max_local_cols = 0
+  do np = npc_0, npc_0+npc_n-1
+    max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np))
+  enddo
+
+  ! Calculations start here
+
+  beta = abs(e)
+  sig  = sign(1.0_rk,e)
+
+  ! Calculate rank-1 modifier z
+
+  z(:) = 0
+
+  if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then
+    ! nm is local on my row
+    do i = 1, na
+      if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
+    enddo
+  endif
+
+  if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then
+    ! nm+1 is local on my row
+    do i = 1, na
+      if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
+    enddo
+  endif
+
+  call global_gather_&
+  &PRECISION&
+  &(obj, z, na)
+  ! Normalize z so that norm(z) = 1. Since z is the concatenation of
+  ! two normalized vectors, norm2(z) = sqrt(2).
+  z = z/sqrt(2.0_rk)
+  rho = 2.0_rk*beta
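+  ! For reference: after this scaling the merged problem is the standard
+  ! rank-1 modified diagonal eigenproblem of the divide-and-conquer method,
+  !
+  !    ( D + rho * z * z**T ) * x = lambda * x ,
+  !
+  ! where D holds the eigenvalues of the two half systems; scaling z by
+  ! 1/sqrt(2) is compensated by setting rho = 2*beta.
+  ! 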
Calculate index for merging both systems by ascending eigenvalues + call obj%timer%start("blas") + call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, & + 1_BLAS_KIND, 1_BLAS_KIND, idxBLAS ) + idx(:) = int(idxBLAS(:),kind=ik) + call obj%timer%stop("blas") + + ! Calculate the allowable deflation tolerance + + zmax = maxval(abs(z)) + dmax = maxval(abs(d)) + EPS = PRECISION_LAMCH( 'E' ) ! return epsilon + TOL = 8.0_rk*EPS*MAX(dmax,zmax) + + ! If the rank-1 modifier is small enough, no more needs to be done + ! except to reorganize D and Q + + IF ( RHO*zmax <= TOL ) THEN + + ! Rearrange eigenvalues + + tmp = d + do i=1,na + d(i) = tmp(idx(i)) + enddo + + ! Rearrange eigenvectors + call resort_ev_& + &PRECISION & + (obj, idx, na) - if (my_pcol == npc_0) then - np_prev = npc_0+npc_n-1 - else - np_prev = my_pcol - 1 - endif - call check_monotony_& - &PRECISION& - &(obj, nm,d,'Input1',wantDebug, success) - if (.not.(success)) then - call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) - return - endif - call check_monotony_& - &PRECISION& - &(obj,na-nm,d(nm+1),'Input2',wantDebug, success) - if (.not.(success)) then - call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) - return - endif - ! Get global number of processors and my processor number. - ! Please note that my_proc does not need to match any real processor number, - ! it is just used for load balancing some loops. + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) - n_procs = np_rows*npc_n - my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major + return + ENDIF + ! Merge and deflate system - ! Local limits of the rows of Q + na1 = 0 + na2 = 0 - l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q - l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm - l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q + ! COLTYP: + ! 1 : non-zero in the upper half only; + ! 2 : dense; + ! 3 : non-zero in the lower half only; + ! 4 : deflated. - l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm - l_rows = l_rqe-l_rqs+1 ! Total number of local rows + coltyp(1:nm) = 1 + coltyp(nm+1:na) = 3 + do i=1,na - ! My number of local columns + if (rho*abs(z(idx(i))) <= tol) then - l_cols = COUNT(p_col(1:na)==my_pcol) + ! Deflate due to small z component. - ! Get max number of local columns + na2 = na2+1 + d2(na2) = d(idx(i)) + idx2(na2) = idx(i) + coltyp(idx(i)) = 4 - max_local_cols = 0 - do np = npc_0, npc_0+npc_n-1 - max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) - enddo + else if (na1>0) then - ! Calculations start here + ! Check if eigenvalues are close enough to allow deflation. - beta = abs(e) - sig = sign(1.0_rk,e) + S = Z(idx(i)) + C = Z1(na1) - ! Calculate rank-1 modifier z + ! Find sqrt(a**2+b**2) without overflow or + ! destructive underflow. + TAU = PRECISION_LAPY2( C, S ) + T = D1(na1) - D(idx(i)) + C = C / TAU + S = -S / TAU + IF ( ABS( T*C*S ) <= TOL ) THEN - z(:) = 0 + ! Deflation is possible. - if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then - ! nm is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) - enddo - endif + na2 = na2+1 - if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then - ! nm+1 is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) - enddo - endif + Z1(na1) = TAU - call global_gather_& - &PRECISION& - &(obj, z, na) - ! Normalize z so that norm(z) = 1. Since z is the concatenation of - ! two normalized vectors, norm2(z) = sqrt(2). 
- z = z/sqrt(2.0_rk) - rho = 2.0_rk*beta - ! Calculate index for merging both systems by ascending eigenvalues - call obj%timer%start("blas") - call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, & - 1_BLAS_KIND, 1_BLAS_KIND, idxBLAS ) - idx(:) = int(idxBLAS(:),kind=ik) - call obj%timer%stop("blas") + d2new = D(idx(i))*C**2 + D1(na1)*S**2 + d1new = D(idx(i))*S**2 + D1(na1)*C**2 - ! Calculate the allowable deflation tolerance + ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 + ! This means that after the above transformation it must be + ! D1(na1) <= d1new <= D(idx(i)) + ! D1(na1) <= d2new <= D(idx(i)) + ! + ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) + ! so there is no problem with sorting here. + ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) + ! which makes a check (and possibly a resort) necessary. + ! + ! The above relations may not hold exactly due to numeric differences + ! so they have to be enforced in order not to get troubles with sorting. - zmax = maxval(abs(z)) - dmax = maxval(abs(d)) - EPS = PRECISION_LAMCH( 'E' ) ! return epsilon - TOL = 8.0_rk*EPS*MAX(dmax,zmax) - ! If the rank-1 modifier is small enough, no more needs to be done - ! except to reorganize D and Q + if (d1newD(idx(i))) d1new = D(idx(i)) - IF ( RHO*zmax <= TOL ) THEN + if (d2newD(idx(i))) d2new = D(idx(i)) - ! Rearrange eigenvalues + D1(na1) = d1new - tmp = d - do i=1,na - d(i) = tmp(idx(i)) + do j=na2-1,1,-1 + if (d2new0) then - - ! Check if eigenvalues are close enough to allow deflation. - - S = Z(idx(i)) - C = Z1(na1) - - ! Find sqrt(a**2+b**2) without overflow or - ! destructive underflow. - TAU = PRECISION_LAPY2( C, S ) - T = D1(na1) - D(idx(i)) - C = C / TAU - S = -S / TAU - IF ( ABS( T*C*S ) <= TOL ) THEN - - ! Deflation is possible. - - na2 = na2+1 - - Z1(na1) = TAU - - d2new = D(idx(i))*C**2 + D1(na1)*S**2 - d1new = D(idx(i))*S**2 + D1(na1)*C**2 - - ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 - ! This means that after the above transformation it must be - ! D1(na1) <= d1new <= D(idx(i)) - ! D1(na1) <= d2new <= D(idx(i)) - ! - ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) - ! so there is no problem with sorting here. - ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) - ! which makes a check (and possibly a resort) necessary. - ! - ! The above relations may not hold exactly due to numeric differences - ! so they have to be enforced in order not to get troubles with sorting. - - - if (d1newD(idx(i))) d1new = D(idx(i)) - - if (d2newD(idx(i))) d2new = D(idx(i)) - - D1(na1) = d1new - - do j=na2-1,1,-1 - if (d2new2) then - if (na1==1 .or. na1==2) then - ! if(my_proc==0) print *,'--- Remark solve_tridi: na1==',na1,' proc==',myid - - if (na1==1) then - d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation - else ! na1==2 - call obj%timer%start("blas") - call PRECISION_LAED5(1_BLAS_KIND, d1, z1, qtrans(1,1), rho, d(1)) - call PRECISION_LAED5(2_BLAS_KIND, d1, z1, qtrans(1,2), rho, d(2)) - call obj%timer%stop("blas") - call transform_columns_& - &PRECISION& - &(obj, idx1(1), idx1(2)) - endif - - ! Add the deflated eigenvalues - d(na1+1:na) = d2(1:na2) - - ! Calculate arrangement of all eigenvalues in output - call obj%timer%start("blas") - call PRECISION_LAMRG( int(na1,kind=BLAS_KIND), int(na-na1,kind=BLAS_KIND), d, & - 1_BLAS_KIND, 1_BLAS_KIND, idxBLAS ) - idx(:) = int(idxBLAS(:),kind=ik) - call obj%timer%stop("blas") - ! 
Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - - ! Rearrange eigenvectors + ! Solve secular equation - do i=1,na - if (idx(i)<=na1) then - idxq1(i) = idx1(idx(i)) - else - idxq1(i) = idx2(idx(i)-na1) - endif - enddo - call resort_ev_& - &PRECISION& - &(obj, idxq1, na) - else if (na1>2) then - - ! Solve secular equation - - z(1:na1) = 1 + z(1:na1) = 1 #ifdef WITH_OPENMP - z_p(1:na1,:) = 1 + z_p(1:na1,:) = 1 #endif - dbase(1:na1) = 0 - ddiff(1:na1) = 0 + dbase(1:na1) = 0 + ddiff(1:na1) = 0 - info = 0 - infoBLAS = int(info,kind=BLAS_KIND) + info = 0 + infoBLAS = int(info,kind=BLAS_KIND) !#ifdef WITH_OPENMP ! -! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) +! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX) !!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j) -! my_thread = omp_get_thread_num() +! my_thread = omp_get_thread_num() !!$OMP DO !#endif - DO i = my_proc+1, na1, n_procs ! work distributed over all processors - call obj%timer%start("blas") - call PRECISION_LAED4(int(na1,kind=BLAS_KIND), int(i,kind=BLAS_KIND), d1, z1, delta, & - rho, s, infoBLAS) ! s is not used! - info = int(infoBLAS,kind=ik) - call obj%timer%stop("blas") - if (info/=0) then - ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) - ! use the more stable bisection algorithm in solve_secular_equation - ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' - call solve_secular_equation_& - &PRECISION& - &(obj, na1, i, d1, z1, delta, rho, s) - endif + DO i = my_proc+1, na1, n_procs ! work distributed over all processors + call obj%timer%start("blas") + call PRECISION_LAED4(int(na1,kind=BLAS_KIND), int(i,kind=BLAS_KIND), d1, z1, delta, & + rho, s, infoBLAS) ! s is not used! + info = int(infoBLAS,kind=ik) + call obj%timer%stop("blas") + if (info/=0) then + ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) + ! use the more stable bisection algorithm in solve_secular_equation + ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' + call solve_secular_equation_& + &PRECISION& + &(obj, na1, i, d1, z1, delta, rho, s) + endif - ! Compute updated z + ! Compute updated z !#ifdef WITH_OPENMP -! do j=1,na1 -! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) -! enddo -! z_p(i,my_thread) = z_p(i,my_thread)*delta(i) +! do j=1,na1 +! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) +! enddo +! z_p(i,my_thread) = z_p(i,my_thread)*delta(i) !#else - do j=1,na1 - if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) - enddo - z(i) = z(i)*delta(i) + do j=1,na1 + if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) + enddo + z(i) = z(i)*delta(i) !#endif - ! store dbase/ddiff - - if (i1) then + + if (np_rem==npc_0) then + np_rem = npc_0+npc_n-1 + else + np_rem = np_rem-1 + endif +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=MPI_KIND), MPI_REAL_PRECISION, & + int(np_next,kind=MPI_KIND), 1111_MPI_KIND, int(np_prev,kind=MPI_KIND), & + 1111_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + endif + + if (useGPU) then + successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), & + gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA) + endif + + ! Gather the parts in d1 and z which are fitting to qtmp1. + ! 
This also delivers nnzu/nnzl for proc np_rem + + nnzu = 0 + nnzl = 0 + do i=1,na1 + if (p_col(idx1(i))==np_rem) then + if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then + nnzu = nnzu+1 + d1u(nnzu) = d1(i) + zu (nnzu) = z (i) + endif + if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then + nnzl = nnzl+1 + d1l(nnzl) = d1(i) + zl (nnzl) = z (i) + endif + endif + enddo + + ! Set the deflated eigenvectors in Q (comming from proc np_rem) + + ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix + do i = 1, na + j = idx(i) + if (j>na1) then + if (p_col(idx2(j-na1))==np_rem) then + ndef = ndef+1 + if (p_col_out(i)==my_pcol) & + q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) + endif + endif + enddo + + do ns = 0, nqcols1-1, max_strip ! strimining loop + + ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip + + ! Get partial result from (output) Q + + do i = 1, ncnt + qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) + enddo - num = (gemm_dim_k * gemm_dim_m) * size_of_datatype - successCUDA = cuda_host_register(int(loc(qtmp2),kind=c_intptr_t),num,& - cudaHostRegisterDefault) - check_host_register_cuda("merge_systems: qtmp2", successCUDA) + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with upper half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! + tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) + call v_add_s_& + &PRECISION& + &(obj,tmp,nnzu,ddiff(j)) + ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) + enddo - successCUDA = cuda_malloc(qtmp2_dev, num) - check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA) - endif + if(useGPU) then + !TODO: it should be enough to copy l_rows x ncnt + successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), & + gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) + + !TODO the previous loop could be possible to do on device and thus + !copy less + successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & + gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: ev_dev", successCUDA) + endif + + ! Multiply old Q with eigenvectors (upper half) + + if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, & + 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), & + ev_dev, ubound(ev,dim=1), & + 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1)) + call obj%timer%stop("cublas") + else + call obj%timer%start("blas") + call obj%timer%start("gemm") + call PRECISION_GEMM('N', 'N', int(l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & + int(nnzu,kind=BLAS_KIND), & + 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & + ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & + 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) + call obj%timer%stop("gemm") + call obj%timer%stop("blas") + endif ! useGPU + endif + + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with lower half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! + tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) + call v_add_s_& + &PRECISION& + &(obj,tmp,nnzl,ddiff(j)) + ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) + enddo - ! Gather nonzero upper/lower components of old matrix Q - ! 
which are needed for multiplication with new eigenvectors - - nnzu = 0 - nnzl = 0 - do i = 1, na1 - l_idx = l_col(idx1(i)) - if (p_col(idx1(i))==my_pcol) then - if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then - nnzu = nnzu+1 - qtmp1(1:l_rnm,nnzu) = q(l_rqs:l_rqm,l_idx) - endif - if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then - nnzl = nnzl+1 - qtmp1(l_rnm+1:l_rows,nnzl) = q(l_rqm+1:l_rqe,l_idx) - endif - endif - enddo + if(useGPU) then + !TODO the previous loop could be possible to do on device and thus + !copy less + successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & + gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("merge_systems: ev_dev", successCUDA) + endif + + ! Multiply old Q with eigenvectors (lower half) + + if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, & + 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), & + ev_dev, ubound(ev,dim=1), & + 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1)) + call obj%timer%stop("cublas") + else + call obj%timer%start("blas") + call obj%timer%start("gemm") + call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & + int(nnzl,kind=BLAS_KIND), & + 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & + ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & + 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) + call obj%timer%stop("gemm") + call obj%timer%stop("blas") + endif ! useGPU + endif + + if(useGPU) then + !TODO either copy only half of the matrix here, and get rid of the + !previous copy or copy whole array here + successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, & + gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) + endif + + ! Put partial result into (output) Q + + do i = 1, ncnt + q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) + enddo - ! Gather deflated eigenvalues behind nonzero components + enddo !ns = 0, nqcols1-1, max_strip ! strimining loop + enddo !do np = 1, npc_n - ndef = max(nnzu,nnzl) - do i = 1, na2 - l_idx = l_col(idx2(i)) - if (p_col(idx2(i))==my_pcol) then - ndef = ndef+1 - qtmp1(1:l_rows,ndef) = q(l_rqs:l_rqe,l_idx) - endif - enddo + if(useGPU) then + successCUDA = cuda_host_unregister(int(loc(qtmp1),kind=c_intptr_t)) + check_host_unregister_cuda("merge_systems: qtmp1", successCUDA) - l_cols_qreorg = ndef ! Number of columns in reorganized matrix + successCUDA = cuda_free(qtmp1_dev) + check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA) + + successCUDA = cuda_host_unregister(int(loc(qtmp2),kind=c_intptr_t)) + check_host_unregister_cuda("merge_systems: qtmp2", successCUDA) - ! Set (output) Q to 0, it will sum up new Q + successCUDA = cuda_free(qtmp2_dev) + check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA) - DO i = 1, na - if(p_col_out(i)==my_pcol) q(l_rqs:l_rqe,l_col_out(i)) = 0 - enddo + successCUDA = cuda_host_unregister(int(loc(ev),kind=c_intptr_t)) + check_host_unregister_cuda("merge_systems: ev", successCUDA) - np_rem = my_pcol + successCUDA = cuda_free(ev_dev) + check_dealloc_cuda("merge_systems: ev_dev", successCUDA) + endif - do np = 1, npc_n - ! 
Do a ring send of qtmp1 + deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) + check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errorMessage) + endif !very outer test (na1==1 .or. na1==2) +#ifdef WITH_OPENMP + deallocate(z_p, stat=istat, errmsg=errorMessage) + check_deallocate("merge_systems: z_p",istat, errorMessage) +#endif - if (np>1) then + call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) - if (np_rem==npc_0) then - np_rem = npc_0+npc_n-1 - else - np_rem = np_rem-1 - endif -#ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call MPI_Sendrecv_replace(qtmp1, int(l_rows*max_local_cols,kind=MPI_KIND), MPI_REAL_PRECISION, & - int(np_next,kind=MPI_KIND), 1111_MPI_KIND, int(np_prev,kind=MPI_KIND), & - 1111_MPI_KIND, int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") -#endif /* WITH_MPI */ - endif + return - if (useGPU) then - successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), & - gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA) - endif - - ! Gather the parts in d1 and z which are fitting to qtmp1. - ! This also delivers nnzu/nnzl for proc np_rem - - nnzu = 0 - nnzl = 0 - do i=1,na1 - if (p_col(idx1(i))==np_rem) then - if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then - nnzu = nnzu+1 - d1u(nnzu) = d1(i) - zu (nnzu) = z (i) - endif - if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then - nnzl = nnzl+1 - d1l(nnzl) = d1(i) - zl (nnzl) = z (i) - endif - endif - enddo + contains + subroutine add_tmp_& + &PRECISION& + &(obj, d1, dbase, ddiff, z, ev_scale_value, na1,i) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na1, i - ! Set the deflated eigenvectors in Q (comming from proc np_rem) - - ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix - do i = 1, na - j = idx(i) - if (j>na1) then - if (p_col(idx2(j-na1))==np_rem) then - ndef = ndef+1 - if (p_col_out(i)==my_pcol) & - q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) - endif - endif - enddo + real(kind=REAL_DATATYPE), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) + real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value + real(kind=REAL_DATATYPE) :: tmp(1:na1) - do ns = 0, nqcols1-1, max_strip ! strimining loop - - ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip - - ! Get partial result from (output) Q - - do i = 1, ncnt - qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) - enddo - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with upper half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! 
- tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) - call v_add_s_& - &PRECISION& - &(obj,tmp,nnzu,ddiff(j)) - ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) - enddo - - if(useGPU) then - !TODO: it should be enough to copy l_rows x ncnt - successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), & - gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) - - !TODO the previous loop could be possible to do on device and thus - !copy less - successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & - gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("merge_systems: ev_dev", successCUDA) - endif - - ! Multiply old Q with eigenvectors (upper half) - - if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) then - if (useGPU) then - call obj%timer%start("cublas") - call cublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, & - 1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), & - ev_dev, ubound(ev,dim=1), & - 1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1)) - call obj%timer%stop("cublas") - else - call obj%timer%start("blas") - call obj%timer%start("gemm") - call PRECISION_GEMM('N', 'N', int(l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & - int(nnzu,kind=BLAS_KIND), & - 1.0_rk, qtmp1, int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & - ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & - 1.0_rk, qtmp2(1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) - call obj%timer%stop("gemm") - call obj%timer%stop("blas") - endif ! useGPU - endif - - ! Compute eigenvectors of the rank-1 modified matrix. - ! Parts for multiplying with lower half of Q: - - do i = 1, ncnt - j = idx(idxq1(i+ns)) - ! Calculate the j-th eigenvector of the deflated system - ! See above why we are doing it this way! - tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) - call v_add_s_& - &PRECISION& - &(obj,tmp,nnzl,ddiff(j)) - ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) - enddo - - if(useGPU) then - !TODO the previous loop could be possible to do on device and thus - !copy less - successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), & - gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("merge_systems: ev_dev", successCUDA) - endif - - ! Multiply old Q with eigenvectors (lower half) - - if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) then - if (useGPU) then - call obj%timer%start("cublas") - call cublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, & - 1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), & - ev_dev, ubound(ev,dim=1), & - 1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1)) - call obj%timer%stop("cublas") - else - call obj%timer%start("blas") - call obj%timer%start("gemm") - call PRECISION_GEMM('N', 'N', int(l_rows-l_rnm,kind=BLAS_KIND), int(ncnt,kind=BLAS_KIND), & - int(nnzl,kind=BLAS_KIND), & - 1.0_rk, qtmp1(l_rnm+1,1), int(ubound(qtmp1,dim=1),kind=BLAS_KIND), & - ev, int(ubound(ev,dim=1),kind=BLAS_KIND), & - 1.0_rk, qtmp2(l_rnm+1,1), int(ubound(qtmp2,dim=1),kind=BLAS_KIND)) - call obj%timer%stop("gemm") - call obj%timer%stop("blas") - endif ! useGPU - endif - - if(useGPU) then - !TODO either copy only half of the matrix here, and get rid of the - !previous copy or copy whole array here - successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, & - gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost) - check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA) - endif - - ! 
Put partial result into (output) Q - - do i = 1, ncnt - q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) - enddo - - enddo !ns = 0, nqcols1-1, max_strip ! strimining loop - enddo !do np = 1, npc_n - - if(useGPU) then - successCUDA = cuda_host_unregister(int(loc(qtmp1),kind=c_intptr_t)) - check_host_unregister_cuda("merge_systems: qtmp1", successCUDA) - - successCUDA = cuda_free(qtmp1_dev) - check_dealloc_cuda("merge_systems: qtmp1_dev", successCUDA) - - successCUDA = cuda_host_unregister(int(loc(qtmp2),kind=c_intptr_t)) - check_host_unregister_cuda("merge_systems: qtmp2", successCUDA) - - successCUDA = cuda_free(qtmp2_dev) - check_dealloc_cuda("merge_systems: qtmp2_dev", successCUDA) - - successCUDA = cuda_host_unregister(int(loc(ev),kind=c_intptr_t)) - check_host_unregister_cuda("merge_systems: ev", successCUDA) - - successCUDA = cuda_free(ev_dev) - check_dealloc_cuda("merge_systems: ev_dev", successCUDA) - endif + ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code + ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results - deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) - check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errorMessage) - endif !very outer test (na1==1 .or. na1==2) -#ifdef WITH_OPENMP - deallocate(z_p, stat=istat, errmsg=errorMessage) - check_deallocate("merge_systems: z_p",istat, errorMessage) -#endif + ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) + ! in exactly this order, but we want to prevent compiler optimization - call obj%timer%stop("merge_systems" // PRECISION_SUFFIX) + tmp(1:na1) = d1(1:na1) -dbase(i) + call v_add_s_& + &PRECISION& + &(obj, tmp(1:na1),na1,ddiff(i)) + tmp(1:na1) = z(1:na1) / tmp(1:na1) + ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) - return + end subroutine add_tmp_& + &PRECISION - contains - subroutine add_tmp_& - &PRECISION& - &(obj, d1, dbase, ddiff, z, ev_scale_value, na1,i) - use precision - use elpa_abstract_impl - implicit none - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik), intent(in) :: na1, i - - real(kind=REAL_DATATYPE), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) - real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value - real(kind=REAL_DATATYPE) :: tmp(1:na1) - - ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code - ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results - - ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) - ! 
in exactly this order, but we want to prevent compiler optimization - - tmp(1:na1) = d1(1:na1) -dbase(i) - call v_add_s_& - &PRECISION& - &(obj, tmp(1:na1),na1,ddiff(i)) - tmp(1:na1) = z(1:na1) / tmp(1:na1) - ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) - - end subroutine add_tmp_& - &PRECISION - - subroutine resort_ev_& - &PRECISION& - &(obj, idx_ev, nLength) - use precision - use elpa_abstract_impl - implicit none - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik), intent(in) :: nLength - integer(kind=ik) :: idx_ev(nLength) - integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out + subroutine resort_ev_& + &PRECISION& + &(obj, idx_ev, nLength) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: nLength + integer(kind=ik) :: idx_ev(nLength) + integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out - real(kind=REAL_DATATYPE), allocatable :: qtmp(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage + real(kind=REAL_DATATYPE), allocatable :: qtmp(:,:) + integer(kind=ik) :: istat + character(200) :: errorMessage - if (l_rows==0) return ! My processor column has no work to do + if (l_rows==0) return ! My processor column has no work to do - ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) + ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) - l_cols_out = COUNT(p_col_out(1:na)==my_pcol) - allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) - check_allocate("resort_ev: qtmp",istat, errorMessage) + l_cols_out = COUNT(p_col_out(1:na)==my_pcol) + allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) + check_allocate("resort_ev: qtmp",istat, errorMessage) - nc = 0 + nc = 0 - do i=1,na + do i=1,na - pc1 = p_col(idx_ev(i)) - lc1 = l_col(idx_ev(i)) - pc2 = p_col_out(i) + pc1 = p_col(idx_ev(i)) + lc1 = l_col(idx_ev(i)) + pc2 = p_col_out(i) - if (pc2<0) cycle ! This column is not needed in output + if (pc2<0) cycle ! This column is not needed in output - if (pc2==my_pcol) nc = nc+1 ! Counter for output columns + if (pc2==my_pcol) nc = nc+1 ! Counter for output columns - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! send and recieve column are local - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) - else + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! 
send and recieve column are local + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) + else #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_send(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096),kind=MPI_KIND), & - int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_send(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, int(mod(i,4096),kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ - endif - else if (pc2==my_pcol) then + endif + else if (pc2==my_pcol) then #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_recv(qtmp(1,nc), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096),kind=MPI_KIND), & - int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_recv(qtmp(1,nc), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, int(mod(i,4096),kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) #endif /* WITH_MPI */ - endif - enddo - - ! Insert qtmp into (output) q - - nc = 0 - - do i=1,na - - pc2 = p_col_out(i) - lc2 = l_col_out(i) - - if (pc2==my_pcol) then - nc = nc+1 - q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) - endif - enddo - - deallocate(qtmp, stat=istat, errmsg=errorMessage) - check_deallocate("resort_ev: qtmp",istat, errorMessage) - end subroutine resort_ev_& - &PRECISION - - subroutine transform_columns_& - &PRECISION& - &(obj, col1, col2) - use precision - use elpa_abstract_impl - implicit none - class(elpa_abstract_impl_t), intent(inout) :: obj - - integer(kind=ik) :: col1, col2 - integer(kind=ik) :: pc1, pc2, lc1, lc2 - - if (l_rows==0) return ! My processor column has no work to do - - pc1 = p_col(col1) - lc1 = l_col(col1) - pc2 = p_col(col2) - lc2 = l_col(col2) - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! both columns are local - tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) - q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) - else + endif + enddo + + ! Insert qtmp into (output) q + + nc = 0 + + do i=1,na + + pc2 = p_col_out(i) + lc2 = l_col_out(i) + + if (pc2==my_pcol) then + nc = nc+1 + q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) + endif + enddo + + deallocate(qtmp, stat=istat, errmsg=errorMessage) + check_deallocate("resort_ev: qtmp",istat, errorMessage) + end subroutine resort_ev_& + &PRECISION + + subroutine transform_columns_& + &PRECISION& + &(obj, col1, col2) + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + + integer(kind=ik) :: col1, col2 + integer(kind=ik) :: pc1, pc2, lc1, lc2 + + if (l_rows==0) return ! My processor column has no work to do + + pc1 = p_col(col1) + lc1 = l_col(col1) + pc2 = p_col(col2) + lc2 = l_col(col2) + + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! 
both columns are local + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) + q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) + else #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_sendrecv(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & - tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_sendrecv(q(l_rqs,lc1), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & + tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc2, 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) #endif /* WITH_MPI */ - q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) - endif - else if (pc2==my_pcol) then + q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) + endif + else if (pc2==my_pcol) then #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_sendrecv(q(l_rqs,lc2), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & - tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_sendrecv(q(l_rqs,lc2), int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & + tmp, int(l_rows,kind=MPI_KIND), MPI_REAL_PRECISION, pc1, 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) + tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) #endif /* WITH_MPI */ - q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) - endif - end subroutine transform_columns_& - &PRECISION + q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + endif + end subroutine transform_columns_& + &PRECISION - subroutine global_gather_& - &PRECISION& - &(obj, z, n) - ! This routine sums up z over all processors. - ! It should only be used for gathering distributed results, - ! i.e. z(i) should be nonzero on exactly 1 processor column, - ! otherways the results may be numerically different on different columns - use precision - use elpa_abstract_impl - implicit none - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: n - real(kind=REAL_DATATYPE) :: z(n) - real(kind=REAL_DATATYPE) :: tmp(n) - - if (npc_n==1 .and. np_rows==1) return ! nothing to do - - ! Do an mpi_allreduce over processor rows + subroutine global_gather_& + &PRECISION& + &(obj, z, n) + ! This routine sums up z over all processors. + ! It should only be used for gathering distributed results, + ! i.e. z(i) should be nonzero on exactly 1 processor column, + ! otherways the results may be numerically different on different columns + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: z(n) + real(kind=REAL_DATATYPE) :: tmp(n) + + if (npc_n==1 .and. np_rows==1) return ! nothing to do + + ! 
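+    ! For reference, the gathering happens in two stages: first a sum over
+    ! all process rows, then a combination across the npc_n process columns
+    ! involved (a single allreduce if all columns take part, otherwise a
+    ! ring exchange over just the participating columns, see below).
+    ! 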
Do an mpi_allreduce over processor rows #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp = z + tmp = z #endif /* WITH_MPI */ - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif + ! If only 1 processor column, we are done + if (npc_n==1) then + z(:) = tmp(:) + return + endif - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then + ! If all processor columns are involved, we can use mpi_allreduce + if (npc_n==np_cols) then #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp = z + tmp = z #endif /* WITH_MPI */ - return - endif + return + endif - ! Do a ring send over processor columns - z(:) = 0 - do np = 1, npc_n - z(:) = z(:) + tmp(:) + ! Do a ring send over processor columns + z(:) = 0 + do np = 1, npc_n + z(:) = z(:) + tmp(:) #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call MPI_Sendrecv_replace(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next,kind=MPI_KIND), 1111_MPI_KIND, & - int(np_prev,kind=MPI_KIND), 1111_MPI_KIND, & + call obj%timer%start("mpi_communication") + call MPI_Sendrecv_replace(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np_next,kind=MPI_KIND), 1111_MPI_KIND, & + int(np_prev,kind=MPI_KIND), 1111_MPI_KIND, & int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ - enddo - end subroutine global_gather_& - &PRECISION + enddo + end subroutine global_gather_& + &PRECISION - subroutine global_product_& + subroutine global_product_& &PRECISION& &(obj, z, n) - ! This routine calculates the global product of z. - use precision - use elpa_abstract_impl - implicit none - class(elpa_abstract_impl_t), intent(inout) :: obj + ! This routine calculates the global product of z. + use precision + use elpa_abstract_impl + implicit none + class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: n - real(kind=REAL_DATATYPE) :: z(n) + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: z(n) - real(kind=REAL_DATATYPE) :: tmp(n) + real(kind=REAL_DATATYPE) :: tmp(n) - if (npc_n==1 .and. np_rows==1) return ! nothing to do + if (npc_n==1 .and. np_rows==1) return ! nothing to do - ! Do an mpi_allreduce over processor rows + ! 
Do an mpi_allreduce over processor rows #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_allreduce(z, tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp = z + tmp = z #endif /* WITH_MPI */ - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif + ! If only 1 processor column, we are done + if (npc_n==1) then + z(:) = tmp(:) + return + endif - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then + ! If all processor columns are involved, we can use mpi_allreduce + if (npc_n==np_cols) then #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp, z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_PROD, int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - z = tmp + z = tmp #endif /* WITH_MPI */ - return - endif + return + endif - ! We send all vectors to the first proc, do the product there - ! and redistribute the result. + ! We send all vectors to the first proc, do the product there + ! and redistribute the result. - if (my_pcol == npc_0) then - z(1:n) = tmp(1:n) - do np = npc_0+1, npc_0+npc_n-1 + if (my_pcol == npc_0) then + z(1:n) = tmp(1:n) + do np = npc_0+1, npc_0+npc_n-1 #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_recv(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call mpi_recv(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - tmp(1:n) = z(1:n) -#endif /* WITH_MPI */ - z(1:n) = z(1:n)*tmp(1:n) - enddo - do np = npc_0+1, npc_0+npc_n-1 -#ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call mpi_send(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + tmp(1:n) = z(1:n) #endif /* WITH_MPI */ - enddo - else + z(1:n) = z(1:n)*tmp(1:n) + enddo + do np = npc_0+1, npc_0+npc_n-1 #ifdef WITH_MPI call obj%timer%start("mpi_communication") - call mpi_send(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, & + call mpi_send(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), 1111_MPI_KIND, & int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call mpi_recv(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ + enddo + else +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + call mpi_send(tmp, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 
1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call mpi_recv(z, int(n,kind=MPI_KIND), MPI_REAL_PRECISION, int(npc_0,kind=MPI_KIND), 1111_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), MPI_STATUS_IGNORE, mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - z(1:n) = tmp(1:n) + z(1:n) = tmp(1:n) #endif /* WITH_MPI */ - endif - end subroutine global_product_& - &PRECISION + endif + end subroutine global_product_& + &PRECISION - subroutine check_monotony_& - &PRECISION& - &(obj, n,d,text, wantDebug, success) - ! This is a test routine for checking if the eigenvalues are monotonically increasing. - ! It is for debug purposes only, an error should never be triggered! - use precision - use elpa_abstract_impl - implicit none - - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: n - real(kind=REAL_DATATYPE) :: d(n) - character*(*) :: text - - integer(kind=ik) :: i - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - do i=1,n-1 - if (d(i+1)1) then + nev1 = l_cols ! all eigenvectors are needed + else + nev1 = MIN(nev,l_cols) + endif + call solve_tridi_col_& + &PRECISION_AND_SUFFIX & + (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & + matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads) + if (.not.(success)) then + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + ! If there is only 1 processor column, we are done + + if (np_cols==1) then + deallocate(limits, stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi: limits", istat, errorMessage) + + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + + ! Set index arrays for Q columns + + ! Dense distribution scheme: + + allocate(l_col(na), stat=istat, errmsg=errorMessage) + check_allocate("solve_tridi: l_col", istat, errorMessage) + + allocate(p_col(na), stat=istat, errmsg=errorMessage) + check_allocate("solve_tridi: p_col", istat, errorMessage) + + n = 0 + do np=0,np_cols-1 + nc = local_index(na, np, np_cols, nblk, -1) + do i=1,nc + n = n+1 + l_col(n) = i + p_col(n) = np + enddo + enddo + + ! Block cyclic distribution scheme, only nev columns are set: + + allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) + check_allocate("solve_tridi: l_col_bc", istat, errorMessage) + + allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) + check_allocate("solve_tridi: p_col_bc", istat, errorMessage) + + p_col_bc(:) = -1 + l_col_bc(:) = -1 + + do i = 0, na-1, nblk*np_cols + do j = 0, np_cols-1 + do n = 1, nblk + if (i+j*nblk+n <= MIN(nev,na)) then + p_col_bc(i+j*nblk+n) = j + l_col_bc(i+j*nblk+n) = i/np_cols + n + endif + enddo + enddo + enddo + + ! 
Recursively merge sub problems + call merge_recursive_& + &PRECISION & + (obj, 0, np_cols, useGPU, wantDebug, success) + if (.not.(success)) then + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + endif + + deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errorMessage) + + call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + return + + contains + recursive subroutine merge_recursive_& + &PRECISION & + (obj, np_off, nprocs, useGPU, wantDebug, success) + use precision + use elpa_abstract_impl + implicit none - my_prow = int(my_prowMPI,kind=c_int) - np_rows = int(np_rowsMPI,kind=c_int) - my_pcol = int(my_pcolMPI,kind=c_int) - np_cols = int(np_colsMPI,kind=c_int) + ! noff is always a multiple of nblk_ev + ! nlen-noff is always > nblk_ev - call obj%timer%stop("mpi_communication") + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: np_off, nprocs + integer(kind=ik) :: np1, np2, noff, nlen, nmid, n + logical, intent(in) :: useGPU, wantDebug + logical, intent(out) :: success success = .true. - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - ! Set Q to 0 - q(1:l_rows, 1:l_cols) = 0.0_rk - - ! Get the limits of the subdivisons, each subdivison has as many cols - ! as fit on the respective processor column - - allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi: limits", istat, errorMessage) - - limits(0) = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np - - ! Check for the case that a column has have zero width. - ! This is not supported! - ! Scalapack supports it but delivers no results for these columns, - ! which is rather annoying - if (nc==0) then - call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX) - if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' - success = .false. - return - endif - limits(np+1) = limits(np) + nc - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,np_cols-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - ! Solve sub problems on processsor columns - - nc = limits(my_pcol) ! column after which my problem starts - - if (np_cols>1) then - nev1 = l_cols ! all eigenvectors are needed - else - nev1 = MIN(nev,l_cols) - endif - call solve_tridi_col_& - &PRECISION_AND_SUFFIX & - (obj, l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & - matrixCols, mpi_comm_rows, useGPU, wantDebug, success, max_threads) - if (.not.(success)) then - call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) - return - endif - ! If there is only 1 processor column, we are done - - if (np_cols==1) then - deallocate(limits, stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi: limits", istat, errorMessage) - - call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) + if (nprocs<=1) then + ! Safety check only + if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs + success = .false. return endif + ! Split problem into 2 subproblems of size np1 / np2 - ! Set index arrays for Q columns + np1 = nprocs/2 + np2 = nprocs-np1 - ! 
Dense distribution scheme: + if (np1 > 1) call merge_recursive_& + &PRECISION & + (obj, np_off, np1, useGPU, wantDebug, success) + if (.not.(success)) return + if (np2 > 1) call merge_recursive_& + &PRECISION & + (obj, np_off+np1, np2, useGPU, wantDebug, success) + if (.not.(success)) return - allocate(l_col(na), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi: l_col", istat, errorMessage) + noff = limits(np_off) + nmid = limits(np_off+np1) - noff + nlen = limits(np_off+nprocs) - noff - allocate(p_col(na), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi: p_col", istat, errorMessage) - - n = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) - do i=1,nc - n = n+1 - l_col(n) = i - p_col(n) = np +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + if (my_pcol==np_off) then + do n=np_off+np1,np_off+nprocs-1 + call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) enddo - enddo - - ! Block cyclic distribution scheme, only nev columns are set: - - allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi: l_col_bc", istat, errorMessage) - - allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi: p_col_bc", istat, errorMessage) - - p_col_bc(:) = -1 - l_col_bc(:) = -1 - - do i = 0, na-1, nblk*np_cols - do j = 0, np_cols-1 - do n = 1, nblk - if (i+j*nblk+n <= MIN(nev,na)) then - p_col_bc(i+j*nblk+n) = j - l_col_bc(i+j*nblk+n) = i/np_cols + n - endif - enddo - enddo - enddo - - ! Recursively merge sub problems - call merge_recursive_& - &PRECISION & - (obj, 0, np_cols, useGPU, wantDebug, success) - if (.not.(success)) then - call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) - return endif - - deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi: limits, l_col, p_col, l_col_bc, p_col_bc", istat, errorMessage) - - call obj%timer%stop("solve_tridi" // PRECISION_SUFFIX // gpuString) - return - - contains - recursive subroutine merge_recursive_& - &PRECISION & - (obj, np_off, nprocs, useGPU, wantDebug, success) - use precision - use elpa_abstract_impl - implicit none - - ! noff is always a multiple of nblk_ev - ! nlen-noff is always > nblk_ev - - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: np_off, nprocs - integer(kind=ik) :: np1, np2, noff, nlen, nmid, n - logical, intent(in) :: useGPU, wantDebug - logical, intent(out) :: success - - success = .true. - - if (nprocs<=1) then - ! Safety check only - if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs - success = .false. - return - endif - ! 
Split problem into 2 subproblems of size np1 / np2 - - np1 = nprocs/2 - np2 = nprocs-np1 - - if (np1 > 1) call merge_recursive_& - &PRECISION & - (obj, np_off, np1, useGPU, wantDebug, success) - if (.not.(success)) return - if (np2 > 1) call merge_recursive_& - &PRECISION & - (obj, np_off+np1, np2, useGPU, wantDebug, success) - if (.not.(success)) return - - noff = limits(np_off) - nmid = limits(np_off+np1) - noff - nlen = limits(np_off+nprocs) - noff - -#ifdef WITH_MPI - call obj%timer%start("mpi_communication") - if (my_pcol==np_off) then - do n=np_off+np1,np_off+nprocs-1 - call mpi_send(d(noff+1), int(nmid,kind=MPI_KIND), MPI_REAL_PRECISION, int(n,kind=MPI_KIND), 1_MPI_KIND, & - int(mpi_comm_cols,kind=MPI_KIND), mpierr) - enddo - endif - call obj%timer%stop("mpi_communication") + call obj%timer%stop("mpi_communication") #endif /* WITH_MPI */ - if (my_pcol>=np_off+np1 .and. my_pcol=np_off+np1 .and. my_pcol=np_off .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) - n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries - ndiv = ndiv*2 - enddo + n = na + ndiv = 1 + do while(2*ndiv<=np_rows .and. n>2*min_submatrix_size) + n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries + ndiv = ndiv*2 + enddo - ! If there is only 1 processor row and not all eigenvectors are needed - ! and the matrix size is big enough, then use 2 subdivisions - ! so that merge_systems is called once and only the needed - ! eigenvectors are calculated for the final problem. + ! If there is only 1 processor row and not all eigenvectors are needed + ! and the matrix size is big enough, then use 2 subdivisions + ! so that merge_systems is called once and only the needed + ! eigenvectors are calculated for the final problem. - if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 + if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 - allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi_col: limits", istat, errorMessage) + allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi_col: limits", istat, errorMessage) - limits(0) = 0 - limits(ndiv) = na + limits(0) = 0 + limits(ndiv) = na - n = ndiv - do while(n>1) - n = n/2 ! n is always a power of 2 - do i=0,ndiv-1,2*n - ! We want to have even boundaries (for cache line alignments) - limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 - enddo - enddo + n = ndiv + do while(n>1) + n = n/2 ! n is always a power of 2 + do i=0,ndiv-1,2*n + ! We want to have even boundaries (for cache line alignments) + limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 + enddo + enddo - ! Calculate the maximum size of a subproblem + ! Calculate the maximum size of a subproblem - max_size = 0 - do i=1,ndiv - max_size = MAX(max_size,limits(i)-limits(i-1)) - enddo + max_size = 0 + do i=1,ndiv + max_size = MAX(max_size,limits(i)-limits(i-1)) + enddo - ! Subdivide matrix by subtracting rank 1 modifications + ! Subdivide matrix by subtracting rank 1 modifications - do i=1,ndiv-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo + do i=1,ndiv-1 + n = limits(i) + d(n) = d(n)-abs(e(n)) + d(n+1) = d(n+1)-abs(e(n)) + enddo - if (np_rows==1) then + if (np_rows==1) then - ! For 1 processor row there may be 1 or 2 subdivisions - do n=0,ndiv-1 - noff = limits(n) ! Start of subproblem - nlen = limits(n+1)-noff ! Size of subproblem + ! For 1 processor row there may be 1 or 2 subdivisions + do n=0,ndiv-1 + noff = limits(n) ! 
Start of subproblem + nlen = limits(n+1)-noff ! Size of subproblem - call solve_tridi_single_problem_& - &PRECISION_AND_SUFFIX & - (obj, nlen,d(noff+1),e(noff+1), & - q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) + call solve_tridi_single_problem_& + &PRECISION_AND_SUFFIX & + (obj, nlen,d(noff+1),e(noff+1), & + q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) - if (.not.(success)) return - enddo + if (.not.(success)) return + enddo - else + else - ! Solve sub problems in parallel with solve_tridi_single - ! There is at maximum 1 subproblem per processor + ! Solve sub problems in parallel with solve_tridi_single + ! There is at maximum 1 subproblem per processor - allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi_col: qmat1", istat, errorMessage) + allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi_col: qmat1", istat, errorMessage) - allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi_col: qmat2", istat, errorMessage) + allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi_col: qmat2", istat, errorMessage) - qmat1 = 0 ! Make sure that all elements are defined + qmat1 = 0 ! Make sure that all elements are defined - if (my_prow < ndiv) then + if (my_prow < ndiv) then - noff = limits(my_prow) ! Start of subproblem - nlen = limits(my_prow+1)-noff ! Size of subproblem - call solve_tridi_single_problem_& - &PRECISION_AND_SUFFIX & - (obj, nlen,d(noff+1),e(noff+1),qmat1, & - ubound(qmat1,dim=1), wantDebug, success) + noff = limits(my_prow) ! Start of subproblem + nlen = limits(my_prow+1)-noff ! Size of subproblem + call solve_tridi_single_problem_& + &PRECISION_AND_SUFFIX & + (obj, nlen,d(noff+1),e(noff+1),qmat1, & + ubound(qmat1,dim=1), wantDebug, success) - if (.not.(success)) return - endif + if (.not.(success)) return + endif - ! Fill eigenvectors in qmat1 into global matrix q + ! Fill eigenvectors in qmat1 into global matrix q - do np = 0, ndiv-1 + do np = 0, ndiv-1 - noff = limits(np) - nlen = limits(np+1)-noff + noff = limits(np) + nlen = limits(np+1)-noff #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - call MPI_Bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & - int(mpi_comm_rows,kind=MPI_KIND), mpierr) - qmat2 = qmat1 - call MPI_Bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & - int(mpi_comm_rows,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + call MPI_Bcast(d(noff+1), int(nlen,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + qmat2 = qmat1 + call MPI_Bcast(qmat2, int(max_size*max_size,kind=MPI_KIND), MPI_REAL_PRECISION, int(np,kind=MPI_KIND), & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ -! qmat2 = qmat1 ! is this correct +! qmat2 = qmat1 ! 
is this correct #endif /* WITH_MPI */ - do i=1,nlen + do i=1,nlen #ifdef WITH_MPI - call distribute_global_column_& - &PRECISION & - (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) + call distribute_global_column_& + &PRECISION & + (obj, qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) #else /* WITH_MPI */ - call distribute_global_column_& - &PRECISION & - (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) + call distribute_global_column_& + &PRECISION & + (obj, qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) #endif /* WITH_MPI */ - enddo + enddo - enddo + enddo - deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errorMessage) + deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi_col: qmat1, qmat2", istat, errorMessage) - endif + endif - ! Allocate and set index arrays l_col and p_col + ! Allocate and set index arrays l_col and p_col - allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) - check_deallocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errorMessage) + allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) + check_deallocate("solve_tridi_col: l_col, p_col_i, p_col_o", istat, errorMessage) - do i=1,na - l_col(i) = i - p_col_i(i) = 0 - p_col_o(i) = 0 - enddo + do i=1,na + l_col(i) = i + p_col_i(i) = 0 + p_col_o(i) = 0 + enddo - ! Merge subproblems + ! Merge subproblems - n = 1 - do while(n 1e-14_rk8) then + if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk8) then #else - if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then + if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then #endif - write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) - else - write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) - write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' - write(error_unit,'(a)') 'In this extent, this is completely harmless.' - write(error_unit,'(a)') 'Still, we keep this info message just in case.' - end if - allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) - check_allocate("solve_tridi_single: qtmp", istat, errorMessage) - - dtmp = d(i+1) - qtmp(1:nlen) = q(1:nlen,i+1) - do j=i,1,-1 - if (dtmp 0 and d(i+1) > d(i) - ! - ! but this routine will not terminate with error if these are not satisfied - ! (it will normally converge to a pole in this case). - ! - ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases - ! N=1 and N=2 which is not compatible with DLAED4. - ! Thus this routine shouldn't be used for these cases as a simple replacement - ! of DLAED4. - ! - ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): - ! - ! - ! N (input) INTEGER - ! The length of all arrays. - ! - ! I (input) INTEGER - ! The index of the eigenvalue to be computed. 1 <= I <= N. - ! - ! D (input) DOUBLE PRECISION array, dimension (N) - ! The original eigenvalues. It is assumed that they are in - ! order, D(I) < D(J) for I < J. - ! - ! Z (input) DOUBLE PRECISION array, dimension (N) - ! The components of the updating Vector. - ! - ! DELTA (output) DOUBLE PRECISION array, dimension (N) - ! DELTA contains (D(j) - lambda_I) in its j-th component. - ! See remark above about DLAED4 compatibility! - ! - ! RHO (input) DOUBLE PRECISION - ! 
The scalar in the symmetric updating formula. - ! - ! DLAM (output) DOUBLE PRECISION - ! The computed lambda_I, the I-th updated eigenvalue. - !------------------------------------------------------------------------------- - - use precision - use elpa_abstract_impl - implicit none +subroutine solve_secular_equation_& +&PRECISION& +&(obj, n, i, d, z, delta, rho, dlam) +!------------------------------------------------------------------------------- +! This routine solves the secular equation of a symmetric rank 1 modified +! diagonal matrix: +! +! 1. + rho*SUM(z(:)**2/(d(:)-x)) = 0 +! +! It does the same as the LAPACK routine DLAED4 but it uses a bisection technique +! which is more robust (it always yields a solution) but also slower +! than the algorithm used in DLAED4. +! +! The same restictions than in DLAED4 hold, namely: +! +! rho > 0 and d(i+1) > d(i) +! +! but this routine will not terminate with error if these are not satisfied +! (it will normally converge to a pole in this case). +! +! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases +! N=1 and N=2 which is not compatible with DLAED4. +! Thus this routine shouldn't be used for these cases as a simple replacement +! of DLAED4. +! +! The arguments are the same as in DLAED4 (with the exception of the INFO argument): +! +! +! N (input) INTEGER +! The length of all arrays. +! +! I (input) INTEGER +! The index of the eigenvalue to be computed. 1 <= I <= N. +! +! D (input) DOUBLE PRECISION array, dimension (N) +! The original eigenvalues. It is assumed that they are in +! order, D(I) < D(J) for I < J. +! +! Z (input) DOUBLE PRECISION array, dimension (N) +! The components of the updating Vector. +! +! DELTA (output) DOUBLE PRECISION array, dimension (N) +! DELTA contains (D(j) - lambda_I) in its j-th component. +! See remark above about DLAED4 compatibility! +! +! RHO (input) DOUBLE PRECISION +! The scalar in the symmetric updating formula. +! +! DLAM (output) DOUBLE PRECISION +! The computed lambda_I, the I-th updated eigenvalue. +!------------------------------------------------------------------------------- + + use precision + use elpa_abstract_impl + implicit none #include "../../src/general/precision_kinds.F90" - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik) :: n, i - real(kind=rk) :: d(n), z(n), delta(n), rho, dlam + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik) :: n, i + real(kind=rk) :: d(n), z(n), delta(n), rho, dlam - integer(kind=ik) :: iter - real(kind=rk) :: a, b, x, y, dshift + integer(kind=ik) :: iter + real(kind=rk) :: a, b, x, y, dshift - ! In order to obtain sufficient numerical accuracy we have to shift the problem - ! either by d(i) or d(i+1), whichever is closer to the solution + ! In order to obtain sufficient numerical accuracy we have to shift the problem + ! either by d(i) or d(i+1), whichever is closer to the solution - ! Upper and lower bound of the shifted solution interval are a and b + ! Upper and lower bound of the shifted solution interval are a and b - call obj%timer%start("solve_secular_equation" // PRECISION_SUFFIX) - if (i==n) then + call obj%timer%start("solve_secular_equation" // PRECISION_SUFFIX) + if (i==n) then - ! Special case: Last eigenvalue - ! We shift always by d(n), lower bound is d(n), - ! upper bound is determined by a guess: + ! Special case: Last eigenvalue + ! We shift always by d(n), lower bound is d(n), + ! 
upper bound is determined by a guess: - dshift = d(n) - delta(:) = d(:) - dshift + dshift = d(n) + delta(:) = d(:) - dshift - a = 0.0_rk ! delta(n) - b = rho*SUM(z(:)**2) + 1.0_rk ! rho*SUM(z(:)**2) is the lower bound for the guess - else + a = 0.0_rk ! delta(n) + b = rho*SUM(z(:)**2) + 1.0_rk ! rho*SUM(z(:)**2) is the lower bound for the guess + else - ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) - ! We check the sign of the function in the midpoint of the interval - ! in order to determine if eigenvalue is more close to d(i) or d(i+1) - x = 0.5_rk*(d(i)+d(i+1)) - y = 1.0_rk + rho*SUM(z(:)**2/(d(:)-x)) - if (y>0) then - ! solution is next to d(i) - dshift = d(i) - else - ! solution is next to d(i+1) - dshift = d(i+1) - endif + ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) + ! We check the sign of the function in the midpoint of the interval + ! in order to determine if eigenvalue is more close to d(i) or d(i+1) + x = 0.5_rk*(d(i)+d(i+1)) + y = 1.0_rk + rho*SUM(z(:)**2/(d(:)-x)) + if (y>0) then + ! solution is next to d(i) + dshift = d(i) + else + ! solution is next to d(i+1) + dshift = d(i+1) + endif - delta(:) = d(:) - dshift - a = delta(i) - b = delta(i+1) + delta(:) = d(:) - dshift + a = delta(i) + b = delta(i+1) - endif + endif - ! Bisection: + ! Bisection: - do iter=1,200 + do iter=1,200 - ! Interval subdivision - x = 0.5_rk*(a+b) - if (x==a .or. x==b) exit ! No further interval subdivisions possible + ! Interval subdivision + x = 0.5_rk*(a+b) + if (x==a .or. x==b) exit ! No further interval subdivisions possible #ifdef DOUBLE_PRECISION_REAL - if (abs(x) < 1.e-200_rk8) exit ! x next to pole + if (abs(x) < 1.e-200_rk8) exit ! x next to pole #else - if (abs(x) < 1.e-20_rk4) exit ! x next to pole + if (abs(x) < 1.e-20_rk4) exit ! x next to pole #endif - ! evaluate value at x + ! evaluate value at x - y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) + y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) - if (y==0) then - ! found exact solution - exit - elseif (y>0) then - b = x - else - a = x - endif + if (y==0) then + ! found exact solution + exit + elseif (y>0) then + b = x + else + a = x + endif - enddo + enddo - ! Solution: + ! Solution: - dlam = x + dshift - delta(:) = delta(:) - x - call obj%timer%stop("solve_secular_equation" // PRECISION_SUFFIX) + dlam = x + dshift + delta(:) = delta(:) - x + call obj%timer%stop("solve_secular_equation" // PRECISION_SUFFIX) - end subroutine solve_secular_equation_& - &PRECISION - !------------------------------------------------------------------------------- +end subroutine solve_secular_equation_& +&PRECISION +!------------------------------------------------------------------------------- #endif #if REALCASE == 1 - subroutine hh_transform_real_& +subroutine hh_transform_real_& #endif #if COMPLEXCASE == 1 - subroutine hh_transform_complex_& +subroutine hh_transform_complex_& #endif - &PRECISION & - (obj, alpha, xnorm_sq, xf, tau, wantDebug) +&PRECISION & +(obj, alpha, xnorm_sq, xf, tau, wantDebug) #if REALCASE == 1 - ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) + ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) #endif #if COMPLEXCASE == 1 - ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) + ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) #endif - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. 
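As an aside on solve_secular_equation above: the shift-then-bisect scheme is easy to isolate. Below is a minimal, self-contained sketch of the interior-eigenvalue case only (it assumes 1 <= i < n; the i == n branch and the DLAED4 compatibility caveats described above are omitted). All names are hypothetical demo choices; only the shifting and bisection logic mirrors the routine above.

```fortran
program secular_bisection_demo
  ! Sketch: solve 1 + rho*SUM(z(:)**2/(d(:)-x)) = 0 for the root
  ! between d(i) and d(i+1), shifting by the closer pole first.
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  real(kind=rk) :: d(3), z(3), delta(3), dlam
  d = [1.0_rk, 2.0_rk, 4.0_rk]
  z = [0.5_rk, 0.5_rk, 0.5_rk]
  call secular_bisect(3, 1, d, z, delta, 1.0_rk, dlam)
  print '(a,f14.10)', 'lambda_1 = ', dlam
contains
  subroutine secular_bisect(n, i, d, z, delta, rho, dlam)
    integer,       intent(in)  :: n, i
    real(kind=rk), intent(in)  :: d(n), z(n), rho
    real(kind=rk), intent(out) :: delta(n), dlam
    real(kind=rk) :: a, b, x, y, dshift
    integer       :: iter
    ! Shift by whichever pole is closer to the root: the secular
    ! function is increasing on (d(i),d(i+1)) for rho > 0, so its
    ! sign at the midpoint tells us which side the root lies on.
    x = 0.5_rk*(d(i)+d(i+1))
    y = 1.0_rk + rho*sum(z(:)**2/(d(:)-x))
    if (y > 0.0_rk) then
      dshift = d(i)
    else
      dshift = d(i+1)
    endif
    delta(:) = d(:) - dshift
    a = delta(i)
    b = delta(i+1)
    ! Plain bisection on the shifted function
    do iter = 1, 200
      x = 0.5_rk*(a+b)
      if (x == a .or. x == b) exit   ! interval can no longer be halved
      y = 1.0_rk + rho*sum(z(:)**2/(delta(:)-x))
      if (y == 0.0_rk) exit          ! exact root found
      if (y > 0.0_rk) then
        b = x
      else
        a = x
      endif
    enddo
    dlam     = x + dshift            ! un-shift the converged root
    delta(:) = delta(:) - x          ! d(j) - lambda_i, as in the routine above
  end subroutine
end program
```

Bisection halves the interval once per iteration, so it is slower than DLAED4's rational interpolation but cannot lose the bracket on the root, which is exactly the robustness trade-off the comment above describes.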
- use precision - use elpa_abstract_impl - implicit none + ! and returns the factor xf by which x has to be scaled. + ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 + ! since this would be expensive for the parallel implementation. + use precision + use elpa_abstract_impl + implicit none #include "../general/precision_kinds.F90" - class(elpa_abstract_impl_t), intent(inout) :: obj - logical, intent(in) :: wantDebug + class(elpa_abstract_impl_t), intent(inout) :: obj + logical, intent(in) :: wantDebug #if REALCASE == 1 - real(kind=rk), intent(inout) :: alpha + real(kind=rk), intent(inout) :: alpha #endif #if COMPLEXCASE == 1 - complex(kind=ck), intent(inout) :: alpha + complex(kind=ck), intent(inout) :: alpha #endif - real(kind=rk), intent(in) :: xnorm_sq + real(kind=rk), intent(in) :: xnorm_sq #if REALCASE == 1 - real(kind=rk), intent(out) :: xf, tau + real(kind=rk), intent(out) :: xf, tau #endif #if COMPLEXCASE == 1 - complex(kind=ck), intent(out) :: xf, tau - real(kind=rk) :: ALPHR, ALPHI + complex(kind=ck), intent(out) :: xf, tau + real(kind=rk) :: ALPHR, ALPHI #endif - real(kind=rk) :: BETA + real(kind=rk) :: BETA - if (wantDebug) call obj%timer%start("hh_transform_& - &MATH_DATATYPE& - &" // & - &PRECISION_SUFFIX ) + if (wantDebug) call obj%timer%start("hh_transform_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX ) #if COMPLEXCASE == 1 - ALPHR = real( ALPHA, kind=rk ) - ALPHI = PRECISION_IMAG( ALPHA ) + ALPHR = real( ALPHA, kind=rk ) + ALPHI = PRECISION_IMAG( ALPHA ) #endif #if REALCASE == 1 - if ( XNORM_SQ==0.0_rk ) then + if ( XNORM_SQ==0.0_rk ) then #endif #if COMPLEXCASE == 1 - if ( XNORM_SQ==0.0_rk .AND. ALPHI==0.0_rk ) then + if ( XNORM_SQ==0.0_rk .AND. ALPHI==0.0_rk ) then #endif #if REALCASE == 1 - if ( ALPHA>=0.0_rk ) then + if ( ALPHA>=0.0_rk ) then #endif #if COMPLEXCASE == 1 - if ( ALPHR>=0.0_rk ) then + if ( ALPHR>=0.0_rk ) then #endif - TAU = 0.0_rk - else - TAU = 2.0_rk - ALPHA = -ALPHA - endif - XF = 0.0_rk + TAU = 0.0_rk + else + TAU = 2.0_rk + ALPHA = -ALPHA + endif + XF = 0.0_rk - else + else #if REALCASE == 1 - BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) + BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) #endif #if COMPLEXCASE == 1 - BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) + BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) #endif - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE + ALPHA = ALPHA + BETA + IF ( BETA<0 ) THEN + BETA = -BETA + TAU = -ALPHA / BETA + ELSE #if REALCASE == 1 - ALPHA = XNORM_SQ / ALPHA + ALPHA = XNORM_SQ / ALPHA #endif #if COMPLEXCASE == 1 - ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk)) - ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk ) + ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk)) + ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk ) #endif #if REALCASE == 1 - TAU = ALPHA / BETA - ALPHA = -ALPHA + TAU = ALPHA / BETA + ALPHA = -ALPHA #endif #if COMPLEXCASE == 1 - TAU = PRECISION_CMPLX( ALPHR/BETA, -ALPHI/BETA ) - ALPHA = PRECISION_CMPLX( -ALPHR, ALPHI ) + TAU = PRECISION_CMPLX( ALPHR/BETA, -ALPHI/BETA ) + ALPHA = PRECISION_CMPLX( -ALPHR, ALPHI ) #endif - END IF - XF = 1.0_rk/ALPHA - ALPHA = BETA - endif + END IF + XF = 1.0_rk/ALPHA + ALPHA = BETA + endif - if (wantDebug) call obj%timer%stop("hh_transform_& - &MATH_DATATYPE& - &" // & - &PRECISION_SUFFIX ) + if (wantDebug) call obj%timer%stop("hh_transform_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX ) #if REALCASE == 1 - end subroutine hh_transform_real_& +end subroutine 
hh_transform_real_& #endif #if COMPLEXCASE == 1 end subroutine hh_transform_complex_& diff --git a/src/elpa1/elpa1_trans_ev_template.F90 b/src/elpa1/elpa1_trans_ev_template.F90 index c6dd3b9cbe4aa2348afa948971fea6ed4348d75b..2d16afec090e7c92f5416ab0a2933f8744856e08 100644 --- a/src/elpa1/elpa1_trans_ev_template.F90 +++ b/src/elpa1/elpa1_trans_ev_template.F90 @@ -88,472 +88,472 @@ !> \param useGPU If true, GPU version of the subroutine will be used !> - subroutine trans_ev_& - &MATH_DATATYPE& - &_& - &PRECISION & - (obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) - use cuda_functions - use iso_c_binding - use precision - use elpa_abstract_impl - use elpa_blas_interfaces - - implicit none +subroutine trans_ev_& +&MATH_DATATYPE& +&_& +&PRECISION & +(obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) + use cuda_functions + use iso_c_binding + use precision + use elpa_abstract_impl + use elpa_blas_interfaces + + implicit none #include "../general/precision_kinds.F90" - class(elpa_abstract_impl_t), intent(inout) :: obj - integer(kind=ik), intent(in) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - MATH_DATATYPE(kind=rck), intent(in) :: tau(na) + class(elpa_abstract_impl_t), intent(inout) :: obj + integer(kind=ik), intent(in) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + MATH_DATATYPE(kind=rck), intent(in) :: tau(na) #ifdef USE_ASSUMED_SIZE - MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*) - MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*) + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*) + MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*) #else - MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols) - MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols) + MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols) + MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols) #endif - logical, intent(in) :: useGPU - integer(kind=ik) :: max_stored_rows, max_stored_rows_fac - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols - integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol - integer(kind=ik) :: hvn_ubnd, hvm_ubnd - - MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:) - MATH_DATATYPE(kind=rck), pointer :: tmp1(:), tmp2(:) - MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:) - MATH_DATATYPE(kind=rck), pointer :: tmat(:,:) - MATH_DATATYPE(kind=rck), pointer :: hvm1(:) - type(c_ptr) :: tmp1_host, tmp2_host - type(c_ptr) :: hvm1_host, tmat_host - - integer(kind=ik) :: istat - character(200) :: errorMessage - character(20) :: gpuString - - integer(kind=c_intptr_t) :: num - integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev - integer(kind=ik) :: blockStep - logical :: successCUDA - integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& - &PRECISION& - &_& - &MATH_DATATYPE - integer(kind=ik) :: error - if(useGPU) then - gpuString = "_gpu" - else - gpuString = "" - endif - - call obj%timer%start("trans_ev_& - &MATH_DATATYPE& - &" // & - &PRECISION_SUFFIX //& - gpuString) - - call obj%timer%start("mpi_communication") - call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) - call 
mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) - call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) - call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) - - my_prow = int(my_prowMPI, kind=c_int) - np_rows = int(np_rowsMPI, kind=c_int) - my_pcol = int(my_pcolMPI, kind=c_int) - np_cols = int(np_colsMPI, kind=c_int) - call obj%timer%stop("mpi_communication") - - call obj%get("max_stored_rows",max_stored_rows_fac, error) + logical, intent(in) :: useGPU + integer(kind=ik) :: max_stored_rows, max_stored_rows_fac + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols + integer(kind=MPI_KIND) :: mpierr, my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI + integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols + integer(kind=ik) :: l_cols, l_rows, l_colh, nstor + integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol + integer(kind=ik) :: hvn_ubnd, hvm_ubnd + + MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:) + MATH_DATATYPE(kind=rck), pointer :: tmp1(:), tmp2(:) + MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:) + MATH_DATATYPE(kind=rck), pointer :: tmat(:,:) + MATH_DATATYPE(kind=rck), pointer :: hvm1(:) + type(c_ptr) :: tmp1_host, tmp2_host + type(c_ptr) :: hvm1_host, tmat_host + + integer(kind=ik) :: istat + character(200) :: errorMessage + character(20) :: gpuString + + integer(kind=c_intptr_t) :: num + integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev + integer(kind=ik) :: blockStep + logical :: successCUDA + integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& + &PRECISION& + &_& + &MATH_DATATYPE + integer(kind=ik) :: error + if(useGPU) then + gpuString = "_gpu" + else + gpuString = "" + endif + + call obj%timer%start("trans_ev_& + &MATH_DATATYPE& + &" // & + &PRECISION_SUFFIX //& + gpuString) - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat! + call obj%timer%start("mpi_communication") + call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr) + call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr) + call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr) + call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr) - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk + my_prow = int(my_prowMPI, kind=c_int) + np_rows = int(np_rowsMPI, kind=c_int) + my_pcol = int(my_pcolMPI, kind=c_int) + np_cols = int(np_colsMPI, kind=c_int) + call obj%timer%stop("mpi_communication") - max_stored_rows = (max_stored_rows_fac/nblk+1)*nblk + call obj%get("max_stored_rows",max_stored_rows_fac, error) - if (.not.(useGPU)) then - allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - call check_alloc("trans_ev_& - &MATH_DATATYPE& - &", "tmat", istat, errorMessage) + totalblocks = (na-1)/nblk + 1 + max_blocks_row = (totalblocks-1)/np_rows + 1 + max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat! 
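A note on the bookkeeping above: totalblocks, max_blocks_row and max_blocks_col only bound the local storage; the exact per-process counts come from local_index. For orientation, here is a standalone sketch of the usual 1-D block-cyclic counting rule (the same rule as ScaLAPACK's NUMROC with source process 0). numroc_like is a hypothetical helper for this demo, not ELPA's local_index, whose signature and options differ.

```fortran
program blockcyclic_demo
  ! How many of na rows land on each of np processes when rows are
  ! dealt out round-robin in blocks of nblk.
  implicit none
  integer, parameter :: na = 1000, nblk = 16, np = 4
  integer :: me
  do me = 0, np-1
    print '(a,i0,a,i0)', 'rank ', me, ' stores ', numroc_like(na, nblk, me, np)
  enddo   ! prints 256, 256, 248, 240 -- summing to na
contains
  integer function numroc_like(na, nblk, me, np) result(nloc)
    integer, intent(in) :: na, nblk, me, np
    integer :: nblocks, extra
    nblocks = na / nblk               ! complete blocks in this dimension
    nloc    = (nblocks / np) * nblk   ! full rounds every process receives
    extra   = mod(nblocks, np)        ! complete blocks left over
    if (me < extra) then
      nloc = nloc + nblk              ! one extra complete block
    else if (me == extra) then
      nloc = nloc + mod(na, nblk)     ! the trailing partial block
    end if
  end function
end program
```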
- allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - call check_alloc("trans_ev_& - &MATH_DATATYPE& - &", "tmp1", istat, errorMessage) + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk - allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - call check_alloc("trans_ev_& - &MATH_DATATYPE& - &", "tmp2", istat, errorMessage) - endif + max_stored_rows = (max_stored_rows_fac/nblk+1)*nblk - allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + if (.not.(useGPU)) then + allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) call check_alloc("trans_ev_& &MATH_DATATYPE& - &", "h1", istat, errorMessage) + &", "tmat", istat, errorMessage) - allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) call check_alloc("trans_ev_& &MATH_DATATYPE& - &", "h2", istat, errorMessage) + &", "tmp1", istat, errorMessage) - allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) + allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) call check_alloc("trans_ev_& &MATH_DATATYPE& - &", "hvn", istat, errorMessage) + &", "tmp2", istat, errorMessage) + endif - allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - call check_alloc("trans_ev_& - &MATH_DATATYPE& - &", "hvm", istat, errorMessage) + allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "h1", istat, errorMessage) - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - blockStep = nblk + allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "h2", istat, errorMessage) + + allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "hvn", istat, errorMessage) + + allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) + call check_alloc("trans_ev_& + &MATH_DATATYPE& + &", "hvm", istat, errorMessage) - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat + hvm = 0 ! Must be set to 0 !!! + hvb = 0 ! Safety only + blockStep = nblk - nstor = 0 - if (useGPU) then - hvn_ubnd = 0 - endif + l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat + + nstor = 0 + if (useGPU) then + hvn_ubnd = 0 + endif #if COMPLEXCASE == 1 - ! In the complex case tau(2) /= 0 - if (my_prow == prow(1, nblk, np_rows)) then - q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2)) - endif + ! In the complex case tau(2) /= 0 + if (my_prow == prow(1, nblk, np_rows)) then + q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2)) + endif #endif - if (useGPU) then - ! todo: this is used only for copying hmv to device.. 
it should be possible to go without it - !allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - !call check_alloc("trans_ev_& - !&MATH_DATATYPE& - !&", "hvm1", istat, errorMessage) - num = (max_local_rows*max_stored_rows) * size_of_datatype - successCUDA = cuda_malloc_host(hvm1_host,num) - check_alloc_cuda("trans_ev: hvm1_host", successCUDA) - call c_f_pointer(hvm1_host,hvm1,(/(max_local_rows*max_stored_rows)/)) - - num = (max_stored_rows*max_stored_rows) * size_of_datatype - successCUDA = cuda_malloc_host(tmat_host,num) - check_alloc_cuda("trans_ev: tmat_host", successCUDA) - call c_f_pointer(tmat_host,tmat,(/max_stored_rows,max_stored_rows/)) - - num = (max_local_cols*max_stored_rows) * size_of_datatype - successCUDA = cuda_malloc_host(tmp1_host,num) - check_alloc_cuda("trans_ev: tmp1_host", successCUDA) - call c_f_pointer(tmp1_host,tmp1,(/(max_local_cols*max_stored_rows)/)) - - num = (max_local_cols*max_stored_rows) * size_of_datatype - successCUDA = cuda_malloc_host(tmp2_host,num) - check_alloc_cuda("trans_ev: tmp2_host", successCUDA) - call c_f_pointer(tmp2_host,tmp2,(/(max_local_cols*max_stored_rows)/)) - - successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype) - check_alloc_cuda("trans_ev", successCUDA) - - successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype) - check_alloc_cuda("trans_ev", successCUDA) - - successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype) - check_alloc_cuda("trans_ev", successCUDA) - - num = ldq * matrixCols * size_of_datatype - successCUDA = cuda_malloc(q_dev, num) - check_alloc_cuda("trans_ev", successCUDA) - - successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),num,& - cudaHostRegisterDefault) - check_host_register_cuda("trans_ev: q_mat", successCUDA) - - successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), & - num, cudaMemcpyHostToDevice) - check_memcpy_cuda("trans_ev", successCUDA) - endif ! useGPU - - do istep = 1, na, blockStep - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice0) & - call MPI_Bcast(hvb, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION , int(cur_pcol,kind=MPI_KIND), & - int(mpi_comm_cols,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") -#endif /* WITH_MPI */ + num = (max_local_cols*max_stored_rows) * size_of_datatype + successCUDA = cuda_malloc_host(tmp1_host,num) + check_alloc_cuda("trans_ev: tmp1_host", successCUDA) + call c_f_pointer(tmp1_host,tmp1,(/(max_local_cols*max_stored_rows)/)) - nb = 0 - do ic = ics, ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! 
# rows of Householder Vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - if (useGPU) then - hvm_ubnd = l_rows + num = (max_local_cols*max_stored_rows) * size_of_datatype + successCUDA = cuda_malloc_host(tmp2_host,num) + check_alloc_cuda("trans_ev: tmp2_host", successCUDA) + call c_f_pointer(tmp2_host,tmp2,(/(max_local_cols*max_stored_rows)/)) + + successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype) + check_alloc_cuda("trans_ev", successCUDA) + + num = ldq * matrixCols * size_of_datatype + successCUDA = cuda_malloc(q_dev, num) + check_alloc_cuda("trans_ev", successCUDA) + + successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),num,& + cudaHostRegisterDefault) + check_host_register_cuda("trans_ev: q_mat", successCUDA) + + successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), & + num, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif ! useGPU + + do istep = 1, na, blockStep + ics = MAX(istep,3) + ice = MIN(istep+nblk-1,na) + if (ice max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then + nb = nb+l_rows + enddo - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use dsyrk or zherk +#ifdef WITH_MPI + call obj%timer%start("mpi_communication") + if (nb>0) & + call MPI_Bcast(hvb, int(nb,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION , int(cur_pcol,kind=MPI_KIND), & + int(mpi_comm_cols,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") +#endif /* WITH_MPI */ - tmat = 0 - call obj%timer%start("blas") - if (l_rows>0) & + nb = 0 + do ic = ics, ice + l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder Vector + hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) + if (useGPU) then + hvm_ubnd = l_rows + endif + nstor = nstor+1 + nb = nb+l_rows + enddo + + ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! + if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then + + ! Calculate scalar products of stored vectors. + ! 
This can be done in different ways, we use dsyrk or zherk + + tmat = 0 + call obj%timer%start("blas") + if (l_rows>0) & #if REALCASE == 1 - call PRECISION_SYRK('U', 'T', & + call PRECISION_SYRK('U', 'T', & #endif #if COMPLEXCASE == 1 - call PRECISION_HERK('U', 'C', & + call PRECISION_HERK('U', 'C', & #endif - int(nstor,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, & - hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), ZERO, tmat, int(max_stored_rows,kind=BLAS_KIND)) - call obj%timer%stop("blas") - nc = 0 - do n = 1, nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo + int(nstor,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, & + hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), ZERO, tmat, int(max_stored_rows,kind=BLAS_KIND)) + call obj%timer%stop("blas") + nc = 0 + do n = 1, nstor-1 + h1(nc+1:nc+n) = tmat(1:n,n+1) + nc = nc+n + enddo #ifdef WITH_MPI - call obj%timer%start("mpi_communication") - if (nc>0) call mpi_allreduce( h1, h2, int(nc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & - int(mpi_comm_rows,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") + call obj%timer%start("mpi_communication") + if (nc>0) call mpi_allreduce( h1, h2, int(nc,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") #else /* WITH_MPI */ - if (nc > 0) h2 = h1 + if (nc > 0) h2 = h1 #endif /* WITH_MPI */ - ! Calculate triangular matrix T + ! Calculate triangular matrix T - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n = 1, nstor-1 - call obj%timer%start("blas") - call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', int(n,kind=BLAS_KIND), tmat, & - int(max_stored_rows,kind=BLAS_KIND), h2(nc+1), 1_BLAS_KIND) - call obj%timer%stop("blas") + nc = 0 + tmat(1,1) = tau(ice-nstor+1) + do n = 1, nstor-1 + call obj%timer%start("blas") + call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', int(n,kind=BLAS_KIND), tmat, & + int(max_stored_rows,kind=BLAS_KIND), h2(nc+1), 1_BLAS_KIND) + call obj%timer%stop("blas") - tmat(n+1,1:n) = & + tmat(n+1,1:n) = & #if REALCASE == 1 - -h2(nc+1:nc+n) & + -h2(nc+1:nc+n) & #endif #if COMPLEXCASE == 1 - -conjg(h2(nc+1:nc+n)) & + -conjg(h2(nc+1:nc+n)) & #endif - *tau(ice-nstor+n+1) + *tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo + tmat(n+1,n+1) = tau(ice-nstor+n+1) + nc = nc+n + enddo - if (useGPU) then - ! todo: is this reshape really neccessary? - hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /)) + if (useGPU) then + ! todo: is this reshape really neccessary? + hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /)) - !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor) - successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), & - hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice) + !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor) + successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), & + hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("trans_ev", successCUDA) - - !tmat_dev = tmat - successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), & - max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("trans_ev", successCUDA) - endif + check_memcpy_cuda("trans_ev", successCUDA) - ! 
Q = Q - V * T * V**T * Q + !tmat_dev = tmat + successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), & + max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif - if (l_rows>0) then - if (useGPU) then - call obj%timer%start("cublas") - call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & - nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, & - q_dev, ldq, ZERO, tmp_dev, nstor) - call obj%timer%stop("cublas") + ! Q = Q - V * T * V**T * Q - else ! useGPU + if (l_rows>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, & + q_dev, ldq, ZERO, tmp_dev, nstor) + call obj%timer%stop("cublas") - call obj%timer%start("blas") - call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & - int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & - int(l_rows,kind=BLAS_KIND), ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & - q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, int(nstor,kind=BLAS_KIND)) - call obj%timer%stop("blas") - endif ! useGPU + else ! useGPU - else !l_rows>0 + call obj%timer%start("blas") + call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & + int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(l_rows,kind=BLAS_KIND), ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, int(nstor,kind=BLAS_KIND)) + call obj%timer%stop("blas") + endif ! useGPU - if (useGPU) then - successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype) - check_memcpy_cuda("trans_ev", successCUDA) - else - tmp1(1:l_cols*nstor) = 0 - endif - endif !l_rows>0 + else !l_rows>0 -#ifdef WITH_MPI - ! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI - ! todo: does it need to be copied whole? Wouldn't be a part sufficient? if (useGPU) then - successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, & - max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost) + successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype) check_memcpy_cuda("trans_ev", successCUDA) + else + tmp1(1:l_cols*nstor) = 0 endif - call obj%timer%start("mpi_communication") - call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & - int(mpi_comm_rows,kind=MPI_KIND), mpierr) - call obj%timer%stop("mpi_communication") - ! copy back tmp2 - after reduction... - if (useGPU) then - successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), & - max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) - check_memcpy_cuda("trans_ev", successCUDA) - endif ! useGPU + endif !l_rows>0 + +#ifdef WITH_MPI + ! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI + ! todo: does it need to be copied whole? Wouldn't be a part sufficient? + if (useGPU) then + successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, & + max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost) + check_memcpy_cuda("trans_ev", successCUDA) + endif + call obj%timer%start("mpi_communication") + call mpi_allreduce(tmp1, tmp2, int(nstor*l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, MPI_SUM, & + int(mpi_comm_rows,kind=MPI_KIND), mpierr) + call obj%timer%stop("mpi_communication") + ! copy back tmp2 - after reduction... 
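For orientation, the update being assembled here is the blocked (compact WY) form Q <- Q - V * T * V**T * Q: one GEMM for tmp = V**T * Q, a TRMM with the lower-triangular factor T, and a closing GEMM; with MPI, tmp is additionally allreduced over the process rows between the first two steps, as above. A serial, real-case sketch of just the algebra (hypothetical sizes, matmul instead of BLAS, no MPI or GPU paths):

```fortran
program compact_wy_demo
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  integer, parameter :: n = 6, ncols = 4, nstor = 2
  real(kind=rk) :: q(n,ncols), v(n,nstor), t(nstor,nstor), tmp(nstor,ncols)
  call random_number(q)
  call random_number(v)
  ! Lower-triangular T accumulated from the Householder tau factors
  t = reshape([0.7_rk, -0.2_rk, 0.0_rk, 0.5_rk], [nstor, nstor])
  tmp = matmul(transpose(v), q)  ! GEMM: tmp = V**T * Q   (allreduce here with MPI)
  tmp = matmul(t, tmp)           ! TRMM: tmp = T * tmp
  q   = q - matmul(v, tmp)       ! GEMM: Q   = Q - V * tmp
  print '(a)', 'blocked Householder update applied'
end program
```

Batching nstor reflectors into one triple product is what turns a sequence of memory-bound rank-1 updates into the BLAS-3 calls used above.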
+ if (useGPU) then + successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), & + max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice) + check_memcpy_cuda("trans_ev", successCUDA) + endif ! useGPU #else /* WITH_MPI */ -! tmp2 = tmp1 +! tmp2 = tmp1 #endif /* WITH_MPI */ - if (l_rows>0) then - if (useGPU) then - call obj%timer%start("cublas") - call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', & - nstor, l_cols, ONE, tmat_dev, max_stored_rows, & - tmp_dev, nstor) - - call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, & - -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, & - ONE, q_dev, ldq) - call obj%timer%stop("cublas") - else !useGPU + if (l_rows>0) then + if (useGPU) then + call obj%timer%start("cublas") + call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', & + nstor, l_cols, ONE, tmat_dev, max_stored_rows, & + tmp_dev, nstor) + + call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, & + -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, & + ONE, q_dev, ldq) + call obj%timer%stop("cublas") + else !useGPU #ifdef WITH_MPI - ! tmp2 = tmat * tmp2 - call obj%timer%start("blas") - call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & - ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND)) - !q_mat = q_mat - hvm*tmp2 - call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(nstor,kind=BLAS_KIND), & - -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND), & - ONE, q_mat, int(ldq,kind=BLAS_KIND)) - call obj%timer%stop("blas") + ! tmp2 = tmat * tmp2 + call obj%timer%start("blas") + call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND)) + !q_mat = q_mat - hvm*tmp2 + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(nstor,kind=BLAS_KIND), & + -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(nstor,kind=BLAS_KIND), & + ONE, q_mat, int(ldq,kind=BLAS_KIND)) + call obj%timer%stop("blas") #else /* WITH_MPI */ - call obj%timer%start("blas") - - call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & - ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp1, int(nstor,kind=BLAS_KIND)) - call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & - int(nstor,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & - tmp1, int(nstor,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) - call obj%timer%stop("blas") + call obj%timer%start("blas") + + call PRECISION_TRMM('L', 'L', 'N', 'N', int(nstor,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + ONE, tmat, int(max_stored_rows,kind=BLAS_KIND), tmp1, int(nstor,kind=BLAS_KIND)) + call PRECISION_GEMM('N', 'N', int(l_rows,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), & + int(nstor,kind=BLAS_KIND), -ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), & + tmp1, int(nstor,kind=BLAS_KIND), ONE, q_mat, int(ldq,kind=BLAS_KIND)) + call obj%timer%stop("blas") #endif /* WITH_MPI */ - endif ! useGPU - endif ! l_rows>0 - nstor = 0 - endif ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) + endif ! useGPU + endif ! l_rows>0 + nstor = 0 + endif ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) - enddo ! istep + enddo ! 
-    deallocate(h1, h2, hvb, hvm, stat=istat, errmsg=errorMessage)
+  deallocate(h1, h2, hvb, hvm, stat=istat, errmsg=errorMessage)
+  check_deallocate("trans_ev_&
+  &MATH_DATATYPE&
+  &: h1, h2, hvb, hvm", istat, errorMessage)
+
+  if (useGPU) then
+    !q_mat = q_dev
+    successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
+                  q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
+    check_memcpy_cuda("trans_ev", successCUDA)
+
+    successCUDA = cuda_host_unregister(int(loc(q_mat),kind=c_intptr_t))
+    check_host_unregister_cuda("trans_ev: q_mat", successCUDA)
+
+    successCUDA = cuda_free_host(hvm1_host)
+    check_host_dealloc_cuda("trans_ev: hvm1_host", successCUDA)
+    nullify(hvm1)
+
+    successCUDA = cuda_free_host(tmat_host)
+    check_host_dealloc_cuda("trans_ev: tmat_host", successCUDA)
+    nullify(tmat)
+
+    successCUDA = cuda_free_host(tmp1_host)
+    check_host_dealloc_cuda("trans_ev: tmp1_host", successCUDA)
+    nullify(tmp1)
+
+    successCUDA = cuda_free_host(tmp2_host)
+    check_host_dealloc_cuda("trans_ev: tmp2_host", successCUDA)
+    nullify(tmp2)
+
+    !deallocate(hvm1, stat=istat, errmsg=errorMessage)
+    !if (istat .ne. 0) then
+    !  print *,"trans_ev_&
+    !  &MATH_DATATYPE&
+    !  &: error when deallocating hvm1 "//errorMessage
+    !  stop 1
+    !endif
+
+    !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
+    successCUDA = cuda_free(q_dev)
+    check_dealloc_cuda("trans_ev", successCUDA)
+
+    successCUDA = cuda_free(tmp_dev)
+    check_dealloc_cuda("trans_ev", successCUDA)
+
+    successCUDA = cuda_free(hvm_dev)
+    check_dealloc_cuda("trans_ev", successCUDA)
+
+    successCUDA = cuda_free(tmat_dev)
+    check_dealloc_cuda("trans_ev", successCUDA)
+  else
+    deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
     check_deallocate("trans_ev_&
-    &MATH_DATATYPE&
-    &: h1, h2, hvb, hvm", istat, errorMessage)
-
-    if (useGPU) then
-      !q_mat = q_dev
-      successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
-                    q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
-      check_memcpy_cuda("trans_ev", successCUDA)
-
-      successCUDA = cuda_host_unregister(int(loc(q_mat),kind=c_intptr_t))
-      check_host_unregister_cuda("trans_ev: q_mat", successCUDA)
-
-      successCUDA = cuda_free_host(hvm1_host)
-      check_host_dealloc_cuda("trans_ev: hvm1_host", successCUDA)
-      nullify(hvm1)
-
-      successCUDA = cuda_free_host(tmat_host)
-      check_host_dealloc_cuda("trans_ev: tmat_host", successCUDA)
-      nullify(tmat)
-
-      successCUDA = cuda_free_host(tmp1_host)
-      check_host_dealloc_cuda("trans_ev: tmp1_host", successCUDA)
-      nullify(tmp1)
-
-      successCUDA = cuda_free_host(tmp2_host)
-      check_host_dealloc_cuda("trans_ev: tmp2_host", successCUDA)
-      nullify(tmp2)
-
-      !deallocate(hvm1, stat=istat, errmsg=errorMessage)
-      !if (istat .ne. 0) then
-      !  print *,"trans_ev_&
-      !  &MATH_DATATYPE&
-      !  &: error when deallocating hvm1 "//errorMessage
-      !  stop 1
-      !endif
-
-      !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
-      successCUDA = cuda_free(q_dev)
-      check_dealloc_cuda("trans_ev", successCUDA)
-
-      successCUDA = cuda_free(tmp_dev)
-      check_dealloc_cuda("trans_ev", successCUDA)
-
-      successCUDA = cuda_free(hvm_dev)
-      check_dealloc_cuda("trans_ev", successCUDA)
-
-      successCUDA = cuda_free(tmat_dev)
-      check_dealloc_cuda("trans_ev", successCUDA)
-    else
-      deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
-      check_deallocate("trans_ev_&
-      &MATH_DATATYPE&
-      &: tmat, tmp1, tmp2", istat, errorMessage)
-    endif
-
-
-    call obj%timer%stop("trans_ev_&
     &MATH_DATATYPE&
-    &" // &
-    &PRECISION_SUFFIX // &
-    gpuString )
+    &: tmat, tmp1, tmp2", istat, errorMessage)
+  endif

-    end subroutine trans_ev_&
-    &MATH_DATATYPE&
-    &_&
-    &PRECISION
+
+  call obj%timer%stop("trans_ev_&
+  &MATH_DATATYPE&
+  &" // &
+  &PRECISION_SUFFIX // &
+  gpuString )
+
+end subroutine trans_ev_&
+&MATH_DATATYPE&
+&_&
+&PRECISION
diff --git a/src/elpa1/elpa1_tridiag_template.F90 b/src/elpa1/elpa1_tridiag_template.F90
index d4cccce68787d6c39b1a70ed14fad7a9f3295bcf..b728ed6f77b8d774dae49bffdbfc9b034cffbaf4 100644
--- a/src/elpa1/elpa1_tridiag_template.F90
+++ b/src/elpa1/elpa1_tridiag_template.F90
@@ -745,407 +745,405 @@ subroutine tridiag_&
       do i=0,max_threads-1
         u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
         u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
-      enddo
+      enddo
 #endif /* WITH_OPENMP */

-          ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
-          if (n_stored_vecs > 0) then
-            if (wantDebug) call obj%timer%start("blas")
+      ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
+      if (n_stored_vecs > 0) then
+        if (wantDebug) call obj%timer%start("blas")
 #if REALCASE == 1
-            call PRECISION_GEMV('T', &
+        call PRECISION_GEMV('T', &
 #endif
 #if COMPLEXCASE == 1
-            call PRECISION_GEMV('C', &
+        call PRECISION_GEMV('C', &
 #endif
-                                int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
-                                ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
-                                v_row, 1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)
+                            int(l_rows,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
+                            ONE, vu_stored_rows, int(ubound(vu_stored_rows,dim=1),kind=BLAS_KIND), &
+                            v_row, 1_BLAS_KIND, ZERO, aux, 1_BLAS_KIND)

-            call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
-                                ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
-                                aux, 1_BLAS_KIND, ONE, u_col, 1_BLAS_KIND)
-            if (wantDebug) call obj%timer%stop("blas")
-          endif
+        call PRECISION_GEMV('N', int(l_cols,kind=BLAS_KIND), int(2*n_stored_vecs,kind=BLAS_KIND), &
+                            ONE, uv_stored_cols, int(ubound(uv_stored_cols,dim=1),kind=BLAS_KIND), &
+                            aux, 1_BLAS_KIND, ONE, u_col, 1_BLAS_KIND)
+        if (wantDebug) call obj%timer%stop("blas")
+      endif

-        endif ! (l_rows>0 .and. l_cols>0)
+      endif ! (l_rows>0 .and. l_cols>0)

-      ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
-      ! on the processors containing the diagonal
-      ! This is only necessary if u_row has been calculated, i.e. if the
-      ! global tile size is smaller than the global remaining matrix
+      ! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
+      ! on the processors containing the diagonal
+      ! This is only necessary if u_row has been calculated, i.e. if the
+      ! global tile size is smaller than the global remaining matrix
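The two GEMVs above apply the deferred corrections without ever forming VU**T + UV**T explicitly: the 2*n_stored_vecs stored columns are multiplied once from the left and once from the right. A hedged, self-contained sketch of that trick for the real case (illustrative names, plain BLAS):

```fortran
! Sketch: apply the implicitly stored rank-2k correction (V U**T + U V**T) * v
! with two GEMVs against the combined column storage, as in the hunk above.
! Illustrative only; the complex case would use 'C' instead of 'T'.
subroutine apply_stored_correction(l_rows, l_cols, nvec, vu_rows, ldr, uv_cols, ldc, v, u, aux)
  implicit none
  integer, intent(in)             :: l_rows, l_cols, nvec, ldr, ldc
  double precision, intent(in)    :: vu_rows(ldr,*), uv_cols(ldc,*), v(l_rows)
  double precision, intent(inout) :: u(l_cols)
  double precision, intent(out)   :: aux(2*nvec)
  ! aux = [V U]**T * v  -- one pass over the 2*nvec stored columns
  call dgemv('T', l_rows, 2*nvec, 1.0d0, vu_rows, ldr, v, 1, 0.0d0, aux, 1)
  ! u = u + [U V] * aux -- completes (V U**T + U V**T) * v
  call dgemv('N', l_cols, 2*nvec, 1.0d0, uv_cols, ldc, aux, 1, 1.0d0, u, 1)
end subroutine
```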

-          if (tile_size < istep-1) then
+      if (tile_size < istep-1) then

-            call elpa_reduce_add_vectors_&
-            &MATH_DATATYPE&
-            &_&
-            &PRECISION &
-            (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
-             mpi_comm_cols, istep-1, 1, nblk, max_threads)
+        call elpa_reduce_add_vectors_&
+        &MATH_DATATYPE&
+        &_&
+        &PRECISION &
+        (obj, u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
+         mpi_comm_cols, istep-1, 1, nblk, max_threads)

-          endif
+      endif

-          ! Sum up all the u_col(:) parts, transpose u_col -> u_row
+      ! Sum up all the u_col(:) parts, transpose u_col -> u_row

-          if (l_cols>0) then
-            tmp(1:l_cols) = u_col(1:l_cols)
+      if (l_cols>0) then
+        tmp(1:l_cols) = u_col(1:l_cols)
 #ifdef WITH_MPI
-            if (wantDebug) call obj%timer%start("mpi_communication")
-            call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
-                               MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-            if (wantDebug) call obj%timer%stop("mpi_communication")
+        if (wantDebug) call obj%timer%start("mpi_communication")
+        call mpi_allreduce(tmp, u_col, int(l_cols,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
+                           MPI_SUM, int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+        if (wantDebug) call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */
-            u_col = tmp
+        u_col = tmp
 #endif /* WITH_MPI */
-          endif
-          if (isSkewsymmetric) then
-            call elpa_transpose_vectors_ss_&
-            &MATH_DATATYPE&
-            &_&
-            &PRECISION &
-            (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
-             mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
-          else
-            call elpa_transpose_vectors_&
-            &MATH_DATATYPE&
-            &_&
-            &PRECISION &
-            (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
-             mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
-          endif
-
-          ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
-          x = 0
-          if (l_cols>0) &
-            x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
+      endif
+      if (isSkewsymmetric) then
+        call elpa_transpose_vectors_ss_&
+        &MATH_DATATYPE&
+        &_&
+        &PRECISION &
+        (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
+         mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
+      else
+        call elpa_transpose_vectors_&
+        &MATH_DATATYPE&
+        &_&
+        &PRECISION &
+        (obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
+         mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
+      endif
+
+      ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
+      x = 0
+      if (l_cols>0) &
+        x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
 #ifdef WITH_MPI
-          if (wantDebug) call obj%timer%start("mpi_communication")
-          call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
-          if (wantDebug) call obj%timer%stop("mpi_communication")
+      if (wantDebug) call obj%timer%start("mpi_communication")
+      call mpi_allreduce(x, vav, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, int(mpi_comm_cols,kind=MPI_KIND), mpierr)
+      if (wantDebug) call obj%timer%stop("mpi_communication")
 #else /* WITH_MPI */
-          vav = x
+      vav = x
 #endif /* WITH_MPI */

-          ! store u and v in the matrices U and V
-          ! these matrices are stored combined in one here
+      ! store u and v in the matrices U and V
+      ! these matrices are stored combined in one here
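For reference (not part of the patch), the algebra behind the two columns that the loops below store per Householder vector: with $u = A v$ and $vav = v^T u$, the symmetric Householder update is

$$(I - \tau v v^T)\, A\, (I - \tau v v^T) = A - v w^T - w v^T, \qquad w = \tau u - \tfrac{1}{2}\tau^{2}\,(v^{T}u)\, v,$$

which is why the code pairs tau(istep)*v with 0.5*tau(istep)*vav*v - u: the sum of their outer products reproduces exactly $-(v w^T + w v^T)$. In the complex case tau(istep) is replaced by conjg(tau(istep)), matching the COMPLEXCASE branches below.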

-          do j=1,l_rows
+      do j=1,l_rows
 #if REALCASE == 1
-            vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
-            vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
+        vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
+        vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
 #endif
 #if COMPLEXCASE == 1
-            vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
-            vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
+        vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
+        vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
 #endif
-          enddo
-          do j=1,l_cols
+      enddo
+      do j=1,l_cols
 #if REALCASE == 1
-            uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
-            uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
+        uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
+        uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
 #endif
 #if COMPLEXCASE == 1
-            uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
-            uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
+        uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
+        uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
 #endif
-          enddo
+      enddo

-          ! We have calculated another Hauseholder Vector, number of implicitly stored increased
-          n_stored_vecs = n_stored_vecs+1
+      ! We have calculated another Householder vector; the number of implicitly stored vectors increases
+      n_stored_vecs = n_stored_vecs+1

-          ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
-          if (n_stored_vecs == max_stored_uv .or. istep == 3) then
+      ! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
+      if (n_stored_vecs == max_stored_uv .or. istep == 3) then

-            if (useGPU) then
-              successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
-                            max_local_rows * 2 * max_stored_uv * &
-                            size_of_datatype, cudaMemcpyHostToDevice)
-              check_memcpy_cuda("tridiag: uv_stored_rows_dev", successCUDA)
-
-              successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
-                            max_local_cols * 2 * max_stored_uv * &
-                            size_of_datatype, cudaMemcpyHostToDevice)
-              check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
-            endif
-
-            do i = 0, (istep-2)/tile_size
-              ! go over tiles above (or on) the diagonal
-              l_col_beg = i*l_cols_per_tile+1
-              l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
-              l_row_beg = 1
-              l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
-              if (l_col_end 0) then
-              a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
-                  + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
-            end if
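Part of the tile loop above was lost in this copy of the diff (the text jumps from "if (l_col_end" to the diagonal fix-up). The surviving context indicates the usual deferred rank-2k application: each tile of a_mat that is owned locally is updated by a single GEMM against the stored columns. A hedged sketch of one such tile update, under that assumption (plain BLAS, names taken from the surrounding code for readability only):

```fortran
! Sketch: one tile of the deferred rank-2k update
!   A(tile) <- A(tile) + VU(rows, 1:2n) * UV(cols, 1:2n)**T
! Illustrative stand-in for the (here partly garbled) tile loop; the template
! dispatches an equivalent GEMM on host or device.
subroutine update_tile(rb, re, cb, ce, n2, vu, ldvu, uv, lduv, a, lda)
  implicit none
  integer, intent(in)             :: rb, re, cb, ce, n2, ldvu, lduv, lda
  double precision, intent(in)    :: vu(ldvu,*), uv(lduv,*)
  double precision, intent(inout) :: a(lda,*)
  if (re < rb .or. ce < cb) return   ! empty tile, nothing to do
  call dgemm('N', 'T', re-rb+1, ce-cb+1, n2, 1.0d0, &
             vu(rb,1), ldvu, uv(cb,1), lduv, 1.0d0, a(rb,cb), lda)
end subroutine
```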
+        if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
+          if (useGPU) then
+            !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
+            a_offset = ((l_rows - 1) + matrixRows * (l_cols - 1)) * size_of_datatype
+
+            successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
+                          1 * size_of_datatype, cudaMemcpyDeviceToHost)
+            check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
+
+          endif
+          if (n_stored_vecs > 0) then
+            a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
+                + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
+          end if
 #if REALCASE == 1
-              if (isSkewsymmetric) then
-                d_vec(istep-1) = 0.0_rk
-              else
-                d_vec(istep-1) = a_mat(l_rows,l_cols)
-              endif
+          if (isSkewsymmetric) then
+            d_vec(istep-1) = 0.0_rk
+          else
+            d_vec(istep-1) = a_mat(l_rows,l_cols)
+          endif
 #endif
 #if COMPLEXCASE == 1
-              d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
+          d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
 #endif

-              if (useGPU) then
-                !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
-                !successCUDA = cuda_threadsynchronize()
-                !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
+          if (useGPU) then
+            !a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
+            !successCUDA = cuda_threadsynchronize()
+            !check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)

-                successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
-                              int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
-                check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
-              endif
-            endif
+            successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), &
+                          int(1 * size_of_datatype, kind=c_intptr_t), cudaMemcpyHostToDevice)
+            check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
+          endif
+        endif

-          enddo ! main cycle over istep=na,3,-1
+      enddo ! main cycle over istep=na,3,-1

 #if COMPLEXCASE == 1
-          ! Store e_vec(1) and d_vec(1)
-
-          if (my_pcol==pcol(2, nblk, np_cols)) then
-            if (my_prow==prow(1, nblk, np_rows)) then
-              ! We use last l_cols value of loop above
-              if(useGPU) then
-                successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
-                              1 * size_of_datatype, cudaMemcpyDeviceToHost)
-                check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
-                vrl = aux3(1)
-              else !useGPU
-                vrl = a_mat(1,l_cols)
-              endif !useGPU
-              call hh_transform_complex_&
-              &PRECISION &
-              (obj, vrl, 0.0_rk, xf, tau(2), wantDebug)
+      ! Store e_vec(1) and d_vec(1)
+
+      if (my_pcol==pcol(2, nblk, np_cols)) then
+        if (my_prow==prow(1, nblk, np_rows)) then
+          ! We use last l_cols value of loop above
+          if (useGPU) then
+            successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
+                          1 * size_of_datatype, cudaMemcpyDeviceToHost)
+            check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
+            vrl = aux3(1)
+          else !useGPU
+            vrl = a_mat(1,l_cols)
+          endif !useGPU
+          call hh_transform_complex_&
+          &PRECISION &
+          (obj, vrl, 0.0_rk, xf, tau(2), wantDebug)
 #if REALCASE == 1
-              e_vec(1) = vrl
+          e_vec(1) = vrl
 #endif
 #if COMPLEXCASE == 1
-              e_vec(1) = real(vrl,kind=rk)
+          e_vec(1) = real(vrl,kind=rk)
 #endif
-
-
-              a_mat(1,l_cols) = 1. ! for consistency only
-            endif
+          a_mat(1,l_cols) = 1. ! for consistency only
+        endif
 #ifdef WITH_MPI
-              if (wantDebug) call obj%timer%start("mpi_communication")
-              call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(prow(1, nblk, np_rows),kind=MPI_KIND), &
-                             int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-              if (wantDebug) call obj%timer%stop("mpi_communication")
+        if (wantDebug) call obj%timer%start("mpi_communication")
+        call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(prow(1, nblk, np_rows),kind=MPI_KIND), &
+                       int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+        if (wantDebug) call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-            endif
+      endif

 #ifdef WITH_MPI
-            if (wantDebug) call obj%timer%start("mpi_communication")
-            call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(pcol(2, nblk, np_cols),kind=MPI_KIND), &
-                           int(mpi_comm_cols,kind=MPI_KIND), mpierr)
-            if (wantDebug) call obj%timer%stop("mpi_communication")
+      if (wantDebug) call obj%timer%start("mpi_communication")
+      call mpi_bcast(tau(2), 1_MPI_KIND, MPI_COMPLEX_PRECISION, int(pcol(2, nblk, np_cols),kind=MPI_KIND), &
+                     int(mpi_comm_cols,kind=MPI_KIND), mpierr)
+      if (wantDebug) call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */

-            if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
-              if(useGPU) then
-                successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
-                              1 * size_of_datatype, cudaMemcpyDeviceToHost)
-                check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
-                d_vec(1) = PRECISION_REAL(aux3(1))
-              else !useGPU
-                d_vec(1) = PRECISION_REAL(a_mat(1,1))
-              endif !useGPU
-            endif
+      if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
+        if (useGPU) then
+          successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
+                        1 * size_of_datatype, cudaMemcpyDeviceToHost)
+          check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
+          d_vec(1) = PRECISION_REAL(aux3(1))
+        else !useGPU
+          d_vec(1) = PRECISION_REAL(a_mat(1,1))
+        endif !useGPU
+      endif

 #endif /* COMPLEXCASE == 1 */

 #if REALCASE == 1
-          ! Store e_vec(1)
-
-          if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
-            if(useGPU) then
-              successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
-                            1 * size_of_datatype, cudaMemcpyDeviceToHost)
-              check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
-            else !useGPU
-              e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
-            endif !useGPU
-          endif
+      ! Store e_vec(1)
+
+      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
+        if (useGPU) then
+          successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (matrixRows * (l_cols - 1)) * size_of_datatype, &
+                        1 * size_of_datatype, cudaMemcpyDeviceToHost)
+          check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
+        else !useGPU
+          e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
+        endif !useGPU
+      endif
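The two mpi_bcast stages above follow a standard 2-D grid idiom: tau(2) lives on exactly one process, so it is first spread along the owner's process column (over mpi_comm_rows) and then across all process columns (over mpi_comm_cols). A minimal model of the pattern, with illustrative names and real MPI calls:

```fortran
! Sketch: replicate a scalar from its single owner to the whole 2-D grid.
! Stage 1: broadcast inside the owner's process column (row communicator);
! Stage 2: broadcast across process columns (column communicator).
subroutine bcast_from_owner(val, owner_prow, owner_pcol, my_pcol, comm_rows, comm_cols)
  use mpi
  implicit none
  double precision, intent(inout) :: val
  integer, intent(in)             :: owner_prow, owner_pcol, my_pcol, comm_rows, comm_cols
  integer :: mpierr
  if (my_pcol == owner_pcol) then
    call mpi_bcast(val, 1, MPI_DOUBLE_PRECISION, owner_prow, comm_rows, mpierr)
  endif
  call mpi_bcast(val, 1, MPI_DOUBLE_PRECISION, owner_pcol, comm_cols, mpierr)
end subroutine
```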
-          ! Store d_vec(1)
-          if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
-            if(useGPU) then
-              successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
-              check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
-            else !useGPU
-              if (isSkewsymmetric) then
-                d_vec(1) = 0.0_rk
-              else
-                d_vec(1) = a_mat(1,1)
-              endif
-            endif !useGPU
+      ! Store d_vec(1)
+      if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
+        if (useGPU) then
+          successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
+          check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
+        else !useGPU
+          if (isSkewsymmetric) then
+            d_vec(1) = 0.0_rk
+          else
+            d_vec(1) = a_mat(1,1)
           endif
+        endif !useGPU
+      endif
 #endif

-      deallocate(tmp, stat=istat, errmsg=errorMessage)
-      check_deallocate("tridiag: tmp", istat, errorMessage)
+    deallocate(tmp, stat=istat, errmsg=errorMessage)
+    check_deallocate("tridiag: tmp", istat, errorMessage)

-      if (useGPU) then
-        ! todo: should we leave a_mat on the device for further use?
-        successCUDA = cuda_free(a_dev)
-        check_dealloc_cuda("tridiag: a_dev 9", successCUDA)
+    if (useGPU) then
+      ! todo: should we leave a_mat on the device for further use?
+      successCUDA = cuda_free(a_dev)
+      check_dealloc_cuda("tridiag: a_dev 9", successCUDA)

-        successCUDA = cuda_free(v_row_dev)
-        check_dealloc_cuda("tridiag: v_row_dev", successCUDA)
+      successCUDA = cuda_free(v_row_dev)
+      check_dealloc_cuda("tridiag: v_row_dev", successCUDA)

-        successCUDA = cuda_free(u_row_dev)
-        check_dealloc_cuda("tridiag: (u_row_dev", successCUDA)
+      successCUDA = cuda_free(u_row_dev)
+      check_dealloc_cuda("tridiag: u_row_dev", successCUDA)

-        successCUDA = cuda_free(v_col_dev)
-        check_dealloc_cuda("tridiag: v_col_dev", successCUDA)
+      successCUDA = cuda_free(v_col_dev)
+      check_dealloc_cuda("tridiag: v_col_dev", successCUDA)

-        successCUDA = cuda_free(u_col_dev)
-        check_dealloc_cuda("tridiag: u_col_dev ", successCUDA)
+      successCUDA = cuda_free(u_col_dev)
+      check_dealloc_cuda("tridiag: u_col_dev", successCUDA)

-        successCUDA = cuda_free(vu_stored_rows_dev)
-        check_dealloc_cuda("tridiag: vu_stored_rows_dev ", successCUDA)
+      successCUDA = cuda_free(vu_stored_rows_dev)
+      check_dealloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)

-        successCUDA = cuda_free(uv_stored_cols_dev)
-        check_dealloc_cuda("tridiag:uv_stored_cols_dev ", successCUDA)
-      endif
+      successCUDA = cuda_free(uv_stored_cols_dev)
+      check_dealloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
+    endif
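The single-element device reads above (a_dev 8 and the earlier a_dev 3/5/6/7 transfers) all rely on the same arithmetic as a_offset: the byte offset of a 1-based element (i, j) inside a column-major matrix with leading dimension matrixRows. A small illustrative helper showing just that computation (hypothetical name, not from the patch):

```fortran
! Sketch: byte offset of element (i, j), 1-based, in a column-major matrix
! with leading dimension lda -- the formula behind a_offset in the patch.
pure function elem_offset(i, j, lda, elem_bytes) result(off)
  use iso_c_binding, only: c_intptr_t
  implicit none
  integer, intent(in) :: i, j, lda, elem_bytes
  integer(c_intptr_t) :: off
  off = (int(i-1, c_intptr_t) + int(lda, c_intptr_t) * int(j-1, c_intptr_t)) &
        * int(elem_bytes, c_intptr_t)
end function
```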
-      ! distribute the arrays d_vec and e_vec to all processors

-      allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
-      check_allocate("tridiag: tmp_real", istat, errorMessage)
+    allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
+    check_allocate("tridiag: tmp_real", istat, errorMessage)

 #ifdef WITH_MPI
-      if (wantDebug) call obj%timer%start("mpi_communication")
-      tmp_real = d_vec
-      call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
-                         int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-      tmp_real = d_vec
-      call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
-                         int(mpi_comm_cols,kind=MPI_KIND), mpierr)
-      tmp_real = e_vec
-      call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
-                         int(mpi_comm_rows,kind=MPI_KIND), mpierr)
-      tmp_real = e_vec
-      call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
-                         int(mpi_comm_cols,kind=MPI_KIND), mpierr)
-      if (wantDebug) call obj%timer%stop("mpi_communication")
+    if (wantDebug) call obj%timer%start("mpi_communication")
+    tmp_real = d_vec
+    call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
+                       int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+    tmp_real = d_vec
+    call mpi_allreduce(tmp_real, d_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
+                       int(mpi_comm_cols,kind=MPI_KIND), mpierr)
+    tmp_real = e_vec
+    call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
+                       int(mpi_comm_rows,kind=MPI_KIND), mpierr)
+    tmp_real = e_vec
+    call mpi_allreduce(tmp_real, e_vec, int(na,kind=MPI_KIND), MPI_REAL_PRECISION, MPI_SUM, &
+                       int(mpi_comm_cols,kind=MPI_KIND), mpierr)
+    if (wantDebug) call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */

-      deallocate(tmp_real, stat=istat, errmsg=errorMessage)
-      check_deallocate("tridiag: tmp_real", istat, errorMessage)
+    deallocate(tmp_real, stat=istat, errmsg=errorMessage)
+    check_deallocate("tridiag: tmp_real", istat, errorMessage)

-      if (useGPU) then
-        successCUDA = cuda_host_unregister(int(loc(a_mat),kind=c_intptr_t))
-        check_host_unregister_cuda("tridiag: a_mat", successCUDA)
+    if (useGPU) then
+      successCUDA = cuda_host_unregister(int(loc(a_mat),kind=c_intptr_t))
+      check_host_unregister_cuda("tridiag: a_mat", successCUDA)

-        successCUDA = cuda_free_host(v_row_host)
-        check_host_dealloc_cuda("tridiag: v_row_host", successCUDA)
-        nullify(v_row)
+      successCUDA = cuda_free_host(v_row_host)
+      check_host_dealloc_cuda("tridiag: v_row_host", successCUDA)
+      nullify(v_row)

-        successCUDA = cuda_free_host(v_col_host)
-        check_host_dealloc_cuda("tridiag: v_col_host", successCUDA)
-        nullify(v_col)
+      successCUDA = cuda_free_host(v_col_host)
+      check_host_dealloc_cuda("tridiag: v_col_host", successCUDA)
+      nullify(v_col)

-        successCUDA = cuda_free_host(u_col_host)
-        check_host_dealloc_cuda("tridiag: u_col_host", successCUDA)
-        nullify(u_col)
+      successCUDA = cuda_free_host(u_col_host)
+      check_host_dealloc_cuda("tridiag: u_col_host", successCUDA)
+      nullify(u_col)

-        successCUDA = cuda_free_host(u_row_host)
-        check_host_dealloc_cuda("tridiag: u_row_host", successCUDA)
-        nullify(u_row)
+      successCUDA = cuda_free_host(u_row_host)
+      check_host_dealloc_cuda("tridiag: u_row_host", successCUDA)
+      nullify(u_row)

-        successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
-        check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA)
+      successCUDA = cuda_host_unregister(int(loc(uv_stored_cols),kind=c_intptr_t))
+      check_host_unregister_cuda("tridiag: uv_stored_cols", successCUDA)
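A note on the d_vec/e_vec allreduce sequence above: each entry of those vectors is set by exactly one grid process and is zero everywhere else, so two summing allreduces, first over the row communicator and then over the column communicator, leave the complete vectors replicated on every process. The copy into tmp_real is needed because MPI forbids aliasing the send and receive buffers. A minimal model of the idiom (illustrative names):

```fortran
! Sketch: replicate a vector whose entries live on disjoint owners.
! Every process contributes its owned entries (zeros elsewhere); two
! MPI_SUM allreduces -- rows, then columns -- replicate the full vector.
subroutine replicate_vector(d, tmp, n, comm_rows, comm_cols)
  use mpi
  implicit none
  integer, intent(in)             :: n, comm_rows, comm_cols
  double precision, intent(inout) :: d(n)
  double precision, intent(out)   :: tmp(n)   ! scratch, avoids buffer aliasing
  integer :: mpierr
  tmp = d
  call mpi_allreduce(tmp, d, n, MPI_DOUBLE_PRECISION, MPI_SUM, comm_rows, mpierr)
  tmp = d
  call mpi_allreduce(tmp, d, n, MPI_DOUBLE_PRECISION, MPI_SUM, comm_cols, mpierr)
end subroutine
```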

-        successCUDA = cuda_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
-        check_host_unregister_cuda("tridiag: vu_stored_rows", successCUDA)
+      successCUDA = cuda_host_unregister(int(loc(vu_stored_rows),kind=c_intptr_t))
+      check_host_unregister_cuda("tridiag: vu_stored_rows", successCUDA)

-        successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t))
-        check_host_unregister_cuda("tridiag: e_vec", successCUDA)
+      successCUDA = cuda_host_unregister(int(loc(e_vec),kind=c_intptr_t))
+      check_host_unregister_cuda("tridiag: e_vec", successCUDA)

-        successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t))
-        check_host_unregister_cuda("tridiag: d_vec", successCUDA)
-      else
-        deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
-        check_deallocate("tridiag: v_row, v_col, u_row, u_col", istat, errorMessage)
-      endif
+      successCUDA = cuda_host_unregister(int(loc(d_vec),kind=c_intptr_t))
+      check_host_unregister_cuda("tridiag: d_vec", successCUDA)
+    else
+      deallocate(v_row, v_col, u_row, u_col, stat=istat, errmsg=errorMessage)
+      check_deallocate("tridiag: v_row, v_col, u_row, u_col", istat, errorMessage)
+    endif

-      deallocate(vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
-      check_deallocate("tridiag: vu_stored_rows, uv_stored_cols", istat, errorMessage)
+    deallocate(vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
+    check_deallocate("tridiag: vu_stored_rows, uv_stored_cols", istat, errorMessage)

-      call obj%timer%stop("tridiag_&
-      &MATH_DATATYPE&
-      &" // &
-      PRECISION_SUFFIX // &
-      gpuString )
+    call obj%timer%stop("tridiag_&
+    &MATH_DATATYPE&
+    &" // &
+    PRECISION_SUFFIX // &
+    gpuString )

-      end subroutine tridiag_&
-      &MATH_DATATYPE&
-      &_&
-      &PRECISION
+end subroutine tridiag_&
+&MATH_DATATYPE&
+&_&
+&PRECISION